1
0
Fork 0
mirror of https://codeberg.org/ral/web_archive_cli.git synced 2024-08-16 09:59:49 +02:00

Initial checkin

This commit is contained in:
Ral 2023-07-15 15:30:35 +02:00
commit 3bf3a6bfbd
4 changed files with 144 additions and 0 deletions

7
.gitignore vendored Normal file
View file

@ -0,0 +1,7 @@
*~
*.xml
*.txt
__pycache__/
venv/

44
do_archive.py Normal file
View file

@ -0,0 +1,44 @@
#!/bin/python3
"""Archive a list of URLs to the Internet Archive's Wayback Machine.

Usage: do_archive.py URLFILE

URLFILE is a plain-text file; only lines starting with "http" are treated
as URLs. URLs that fail to archive are written to failed_urls.txt in the
current directory.
"""
import sys
import time

import savepagenow

urlfile = sys.argv[1]
print(f"Reading urls from: {urlfile}")

with open(urlfile, encoding='utf_8') as f:
    # Keep only URL-looking lines, stripped of surrounding whitespace.
    urllist = [line.strip() for line in f if line.startswith("http")]

n = len(urllist)
print(f"Archiving {n} urls")

failed_urls = []
for i, url in enumerate(urllist):
    print(f"Archiving url [{i+1}/{n}]: {url}")
    try:
        archived_url = savepagenow.capture(url)
        print(" DONE")
        # BUG FIX: was a plain string literal missing the f prefix, so the
        # text "{archived_url}" was printed instead of the archived URL.
        print(f" -> {archived_url}")
    except savepagenow.exceptions.CachedPage:
        # The Wayback Machine already holds a recent snapshot of this URL.
        print(" CACHED")
    except Exception:
        # BUG FIX: narrowed from a bare `except:` so KeyboardInterrupt and
        # SystemExit still abort the run instead of being logged as FAILED.
        failed_urls.append(url)
        print(" FAILED")
    # Avoid http error 429: too many requests
    time.sleep(5)

with open("failed_urls.txt", 'w', encoding='utf_8') as f:
    f.write('\n'.join(failed_urls))

74
readme.md Normal file
View file

@ -0,0 +1,74 @@
Web Archive CLI
===============
Simple Python CLI to archive whole websites to the Web Archive via a `sitemap.xml` file.
Installation
------------
Create a fresh Python virtual env:
```
python3 -m venv venv
```
and activate it:
```
. venv/bin/activate
```
and install the dependencies:
```
pip install -r requirements.txt
```
Usage
-----
Activate the Python virtual env:
```
. venv/bin/activate
```
Convert a `sitemap.xml` file to a plain list of URLs:
```
python sitemap_to_urllist.py sitemap_example.org_2023-01-01.xml
```
Push all URLs to the web archive:
```
python do_archive.py urls_example.org_2023-01-01.txt
```
Dependencies
------------
The archive script is based on the `savepagenow` Python package:
* https://pypi.org/project/savepagenow/
* https://github.com/palewire/savepagenow
To archive a single URL only, the `savepagenow` CLI can be used directly:
* https://palewi.re/docs/savepagenow/cli.html
Links
-----
Wayback API:
* https://archive.org/help/wayback_api.php
Manual "Save Page Now" form:
* https://web.archive.org/save/

19
sitemap_to_urllist.py Normal file
View file

@ -0,0 +1,19 @@
#!/bin/python3
"""Convert a sitemap.xml file into a plain-text list of URLs.

Usage: sitemap_to_urllist.py sitemap_DOMAIN_TIMETAG.xml

Writes urls_DOMAIN_TIMETAG.txt with one URL per line, extracted from the
sitemap's <loc> elements.
"""
import sys
import xml.etree.ElementTree as ET

sitemapfilename = sys.argv[1]
# BUG FIX: str.rstrip(".xml") strips any trailing run of the characters
# '.', 'x', 'm', 'l' — a timetag ending in one of those would be mangled
# (e.g. "sitemap_a_final.xml" -> "sitemap_a_fina"). removesuffix() removes
# exactly the ".xml" extension and nothing else.
domain, timetag = sitemapfilename.removesuffix(".xml").split("_")[1:3]

tree = ET.parse(sitemapfilename)
root = tree.getroot()
# <loc> elements are namespaced under the sitemaps.org 0.9 schema.
xmltag = '{http://www.sitemaps.org/schemas/sitemap/0.9}loc'
urllist = [loc.text for loc in root.iter(xmltag)]

urlfilename = f"urls_{domain}_{timetag}.txt"
with open(urlfilename, 'w', encoding='utf_8') as f:
    f.write('\n'.join(urllist))