From 3bf3a6bfbda2ff2216ffeff01f96890d46898e0f Mon Sep 17 00:00:00 2001
From: Ral
Date: Sat, 15 Jul 2023 15:30:35 +0200
Subject: [PATCH] Initial checkin

---
 .gitignore            |  7 ++++
 do_archive.py         | 44 +++++++++++++++++++++++++
 readme.md             | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++
 sitemap_to_urllist.py | 19 +++++++++++
 4 files changed, 157 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 do_archive.py
 create mode 100644 readme.md
 create mode 100644 sitemap_to_urllist.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9f9b83c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+*~
+
+*.xml
+*.txt
+
+__pycache__/
+venv/
diff --git a/do_archive.py b/do_archive.py
new file mode 100644
index 0000000..c4f36fb
--- /dev/null
+++ b/do_archive.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+
+import sys
+import time
+import savepagenow
+
+
+urlfile = sys.argv[1]
+
+print(f"Reading urls from: {urlfile}")
+
+is_url = lambda s: s.startswith("http")
+
+with open(urlfile, encoding='utf_8') as f:
+    lines = f.readlines()
+    urllist = list(filter(is_url, lines))
+    urllist = list(map(lambda s: s.strip(), urllist))
+
+
+n = len(urllist)
+print(f"Archiving {n} urls")
+
+
+failed_urls = []
+
+for i, url in enumerate(urllist):
+    print(f"Archiving url [{i+1}/{n}]: {url}")
+
+    try:
+        archived_url = savepagenow.capture(url)
+        print("  DONE")
+        print(f"  -> {archived_url}")
+    except savepagenow.exceptions.CachedPage:
+        print("  CACHED")
+    except Exception:
+        failed_urls.append(url)
+        print("  FAILED")
+
+    # Avoid http error 429: too many requests
+    time.sleep(5)
+
+
+with open("failed_urls.txt", 'w', encoding='utf_8') as f:
+    f.write('\n'.join(failed_urls))
diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..50e42b1
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,87 @@
+Web Archive CLI
+===============
+
+Simple Python CLI to archive whole websites to the Internet Archive's Wayback Machine via a `sitemap.xml` file.
+
+
+Installation
+------------
+
+Create a fresh Python virtual env:
+
+```
+python3 -m venv venv
+```
+
+and activate it:
+
+```
+. venv/bin/activate
+```
+
+and install the dependencies:
+
+```
+pip install -r requirements.txt
+```
+
+
+Usage
+-----
+
+Activate the Python virtual env:
+
+```
+. venv/bin/activate
+```
+
+Convert a `sitemap.xml` file to a plain list of URLs:
+
+```
+python sitemap_to_urllist.py sitemap_example.org_2023-01-01.xml
+```
+
+Push all URLs to the Wayback Machine:
+
+```
+python do_archive.py urls_example.org_2023-01-01.txt
+```
+
+
+Dependencies
+------------
+
+The archive script is based on the `savepagenow` Python package:
+
+* https://pypi.org/project/savepagenow/
+* https://github.com/palewire/savepagenow
+
+
+To archive a single URL only, the `savepagenow` CLI can be used directly:
+
+* https://palewi.re/docs/savepagenow/cli.html
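+
+The same `capture()` call that `do_archive.py` uses can also be called directly from Python; a minimal sketch (the URL is just a placeholder):
+
+```
+import savepagenow
+
+try:
+    archived_url = savepagenow.capture("https://example.org/")
+    print(archived_url)
+except savepagenow.exceptions.CachedPage:
+    # Treated as CACHED by do_archive.py: a recent capture was returned instead of a new one.
+    print("Already captured")
+```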
+
+
+Links
+-----
+
+Wayback API:
+
+* https://archive.org/help/wayback_api.php
+
+Manually save a single page (the Save Page Now form):
+
+* https://web.archive.org/save/
diff --git a/sitemap_to_urllist.py b/sitemap_to_urllist.py
new file mode 100644
index 0000000..1728604
--- /dev/null
+++ b/sitemap_to_urllist.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python3
+
+import sys
+import xml.etree.ElementTree as ET
+
+sitemapfilename = sys.argv[1]
+
+domain, timetag = sitemapfilename.removesuffix(".xml").split("_")[1:3]
+
+tree = ET.parse(sitemapfilename)
+root = tree.getroot()
+xmltag = '{http://www.sitemaps.org/schemas/sitemap/0.9}loc'
+
+urllist = [loc.text for loc in root.iter(xmltag)]
+
+urlfilename = f"urls_{domain}_{timetag}.txt"
+
+with open(urlfilename, 'w', encoding='utf_8') as f:
+    f.write('\n'.join(urllist))