Mirror of https://codeberg.org/ral/web_archive_cli.git, synced 2024-08-16 09:59:49 +02:00
Initial checkin
Commit 3bf3a6bfbd
4 changed files with 144 additions and 0 deletions
.gitignore (vendored): new file, 7 lines
@@ -0,0 +1,7 @@
*~

*.xml
*.txt

__pycache__/
venv/
do_archive.py: new file, 44 lines
@@ -0,0 +1,44 @@
#!/bin/python3

import sys
import time
import savepagenow


urlfile = sys.argv[1]

print(f"Reading urls from: {urlfile}")

is_url = lambda s: s.startswith("http")

# Keep only the lines that look like URLs and strip trailing whitespace.
with open(urlfile, encoding='utf_8') as f:
    lines = f.readlines()
    urllist = list(filter(is_url, lines))
    urllist = list(map(lambda s: s.strip(), urllist))


n = len(urllist)
print(f"Archiving {n} urls")


failed_urls = []

for i, url in enumerate(urllist):
    print(f"Archiving url [{i+1}/{n}]: {url}")

    try:
        archived_url = savepagenow.capture(url)
        print(" DONE")
        print(f" -> {archived_url}")
    except savepagenow.exceptions.CachedPage:
        # The Wayback Machine already has a recent snapshot of this URL.
        print(" CACHED")
    except Exception:
        # Any other failure: remember the URL and carry on with the list.
        failed_urls.append(url)
        print(" FAILED")

    # Avoid http error 429: too many requests
    time.sleep(5)


# Write the URLs that could not be archived, for a later retry.
with open("failed_urls.txt", 'w', encoding='utf_8') as f:
    f.write('\n'.join(failed_urls))
readme.md: new file, 74 lines
@@ -0,0 +1,74 @@
Web Archive CLI
===============

A simple Python CLI to archive whole websites to the Web Archive via a `sitemap.xml` file.


Installation
------------

Create a fresh Python virtual env:

```
python3 -m venv venv
```

and activate it:

```
. venv/bin/activate
```

and install the dependencies:

```
pip install -r requirements.txt
```


Usage
-----

Activate the Python virtual env:

```
. venv/bin/activate
```

Convert a `sitemap.xml` file to a plain list of URLs:

```
python sitemap_to_urllist.py sitemap_example.org_2023-01-01.xml
```

Push all URLs to the web archive:

```
python do_archive.py urls_example.org_2023-01-01.txt
```


Dependencies
------------

The archive script is based on the `savepagenow` Python package (a minimal usage sketch follows the links):

* https://pypi.org/project/savepagenow/
* https://github.com/palewire/savepagenow
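
For orientation, a hedged sketch of the package call the archive script builds on. It assumes savepagenow's `capture_or_cache` helper behaves as documented (returning the snapshot URL plus a flag for whether the capture is fresh); verify against the links above:

```
import savepagenow

# Capture one URL: returns the snapshot URL and True for a fresh capture,
# or False when the Wayback Machine served a recent cached snapshot instead.
archive_url, fresh = savepagenow.capture_or_cache("https://example.org/")
print(archive_url, fresh)
```

`do_archive.py` itself calls `savepagenow.capture()` and handles the `CachedPage` exception explicitly.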

To archive a single URL only, the `savepagenow` CLI can be used directly (see the example after the link):

* https://palewi.re/docs/savepagenow/cli.html
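
For example, from within the activated virtual env (the command is installed together with the package; the URL is a placeholder), something like:

```
savepagenow https://example.org/
```

should trigger a capture and print the snapshot URL on success.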


Links
-----

Wayback API (an availability-check sketch follows the link):

* https://archive.org/help/wayback_api.php
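
For instance, the availability endpoint of that API can be used to check whether a URL already has a snapshot before re-submitting it. A minimal standard-library sketch; the endpoint and JSON shape follow the linked documentation, so double-check the field names there:

```
import json
import urllib.parse
import urllib.request

url = "https://example.org/"
query = urllib.parse.urlencode({"url": url})
with urllib.request.urlopen(f"https://archive.org/wayback/available?{query}") as resp:
    data = json.load(resp)

# "closest" is present when the Wayback Machine already has a snapshot.
snapshot = data.get("archived_snapshots", {}).get("closest")
if snapshot:
    print("Already archived:", snapshot["url"], snapshot["timestamp"])
else:
    print("No snapshot yet:", url)
```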

Manual page feed:

* https://web.archive.org/save/
sitemap_to_urllist.py: new file, 19 lines
@@ -0,0 +1,19 @@
#!/bin/python3

import sys
import xml.etree.ElementTree as ET

sitemapfilename = sys.argv[1]

# Expect a filename like "sitemap_<domain>_<timetag>.xml".
# removesuffix (Python 3.9+) drops only the ".xml" ending, unlike rstrip,
# which strips any trailing run of the characters '.', 'x', 'm', 'l'.
domain, timetag = sitemapfilename.removesuffix(".xml").split("_")[1:3]

tree = ET.parse(sitemapfilename)
root = tree.getroot()
xmltag = '{http://www.sitemaps.org/schemas/sitemap/0.9}loc'

# Collect the text of every <loc> element in the sitemap.
urllist = [loc.text for loc in root.iter(xmltag)]

urlfilename = f"urls_{domain}_{timetag}.txt"

with open(urlfilename, 'w', encoding='utf_8') as f:
    f.write('\n'.join(urllist))