1
0
Fork 0
mirror of https://codeberg.org/ral/web_archive_cli.git synced 2024-08-16 09:59:49 +02:00

Initial checkin

This commit is contained in:
Ral 2023-07-15 15:30:35 +02:00
commit 3bf3a6bfbd
4 changed files with 144 additions and 0 deletions

7
.gitignore vendored Normal file
View file

@ -0,0 +1,7 @@
*~
*.xml
*.txt
__pycache__/
venv/

44
do_archive.py Normal file
View file

@ -0,0 +1,44 @@
#!/bin/python3
"""Archive a list of URLs to the Internet Archive's Wayback Machine.

Usage: do_archive.py URLFILE

URLFILE is a plain-text file; only lines starting with "http" are treated
as URLs. URLs that fail to archive are written to failed_urls.txt in the
current directory.
"""
import sys
import time

import savepagenow

urlfile = sys.argv[1]
print(f"Reading urls from: {urlfile}")

with open(urlfile, encoding='utf_8') as f:
    # Keep only URL-looking lines, stripped of surrounding whitespace.
    urllist = [line.strip() for line in f if line.startswith("http")]

n = len(urllist)
print(f"Archiving {n} urls")

failed_urls = []
for i, url in enumerate(urllist):
    print(f"Archiving url [{i+1}/{n}]: {url}")
    try:
        archived_url = savepagenow.capture(url)
        print(" DONE")
        # BUG FIX: was a plain string literal missing the f prefix, so the
        # text "{archived_url}" was printed instead of the archived URL.
        print(f" -> {archived_url}")
    except savepagenow.exceptions.CachedPage:
        # The Wayback Machine already holds a recent snapshot of this URL.
        print(" CACHED")
    except Exception:
        # BUG FIX: narrowed from a bare `except:` so KeyboardInterrupt and
        # SystemExit still abort the run instead of being logged as FAILED.
        failed_urls.append(url)
        print(" FAILED")
    # Avoid http error 429: too many requests
    time.sleep(5)

with open("failed_urls.txt", 'w', encoding='utf_8') as f:
    f.write('\n'.join(failed_urls))

74
readme.md Normal file
View file

@ -0,0 +1,74 @@
Web Archive CLI
===============
Simple Python CLI to archive whole websites to the Web Archive via a `sitemap.xml` file.
Installation
------------
Create a fresh Python virtual env:
```
python3 -m venv venv
```
and activate it:
```
. venv/bin/activate
```
and install the dependencies:
```
pip install -r requirements.txt
```
Usage
-----
Activate the Python virtual env:
```
. venv/bin/activate
```
Convert a `sitemap.xml` file to a plain list of URLs:
```
python sitemap_to_urllist.py sitemap_example.org_2023-01-01.xml
```
Push all URLs to the web archive:
```
python do_archive.py urls_example.org_2023-01-01.txt
```
Dependencies
------------
The archive script is based on the `savepagenow` Python package:
* https://pypi.org/project/savepagenow/
* https://github.com/palewire/savepagenow
To archive a single URL only, the `savepagenow` CLI can be used directly:
* https://palewi.re/docs/savepagenow/cli.html
Links
-----
Wayback API:
* https://archive.org/help/wayback_api.php
Manual "Save Page Now" form:
* https://web.archive.org/save/

19
sitemap_to_urllist.py Normal file
View file

@ -0,0 +1,19 @@
#!/bin/python3
"""Convert a sitemap.xml file into a plain-text list of URLs.

Usage: sitemap_to_urllist.py sitemap_DOMAIN_TIMETAG.xml

Writes urls_DOMAIN_TIMETAG.txt with one URL per line, extracted from the
sitemap's <loc> elements.
"""
import sys
import xml.etree.ElementTree as ET

sitemapfilename = sys.argv[1]
# BUG FIX: str.rstrip(".xml") strips any trailing run of the characters
# '.', 'x', 'm', 'l' — a timetag ending in one of those would be mangled
# (e.g. "sitemap_a_final.xml" -> "sitemap_a_fina"). removesuffix() removes
# exactly the ".xml" extension and nothing else.
domain, timetag = sitemapfilename.removesuffix(".xml").split("_")[1:3]

tree = ET.parse(sitemapfilename)
root = tree.getroot()
# <loc> elements are namespaced under the sitemaps.org 0.9 schema.
xmltag = '{http://www.sitemaps.org/schemas/sitemap/0.9}loc'
urllist = [loc.text for loc in root.iter(xmltag)]

urlfilename = f"urls_{domain}_{timetag}.txt"
with open(urlfilename, 'w', encoding='utf_8') as f:
    f.write('\n'.join(urllist))