Mirror of https://codeberg.org/ral/web_archive_cli.git (synced 2024-08-16 09:59:49 +02:00)
Initial checkin
Commit: 3bf3a6bfbd
4 changed files with 144 additions and 0 deletions
.gitignore (vendored, new file, 7 additions)
@@ -0,0 +1,7 @@
*~

*.xml
*.txt

__pycache__/
venv/
do_archive.py (new file, 44 additions)
@@ -0,0 +1,44 @@
#!/bin/python3

import sys
import time
import savepagenow


urlfile = sys.argv[1]

print(f"Reading urls from: {urlfile}")

# A line counts as a URL if it starts with "http"
is_url = lambda s: s.startswith("http")

with open(urlfile, encoding='utf_8') as f:
    lines = f.readlines()
urllist = list(filter(is_url, lines))
urllist = list(map(lambda s: s.strip(), urllist))


n = len(urllist)
print(f"Archiving {n} urls")


failed_urls = []

for i, url in enumerate(urllist):
    print(f"Archiving url [{i+1}/{n}]: {url}")

    try:
        archived_url = savepagenow.capture(url)
        print(" DONE")
        print(f" -> {archived_url}")
    except savepagenow.exceptions.CachedPage:
        print(" CACHED")
    except:
        failed_urls.append(url)
        print(" FAILED")

    # Avoid http error 429: too many requests
    time.sleep(5)


with open("failed_urls.txt", 'w', encoding='utf_8') as f:
    f.write('\n'.join(failed_urls))
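As an aside (not part of this commit), `savepagenow` also exposes a `capture_or_cache` helper that reports a cached snapshot through its return value rather than through the `CachedPage` exception handled above. Assuming that helper is available in the installed version, the capture step could be sketched as:

```
# Sketch only, with an illustrative URL: capture_or_cache returns the
# archive URL plus a flag that is False when a cached snapshot was served.
import savepagenow

url = "https://example.org/"
archived_url, captured = savepagenow.capture_or_cache(url)
print(" DONE" if captured else " CACHED")
print(f" -> {archived_url}")
```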
readme.md (new file, 74 additions)
@@ -0,0 +1,74 @@
Web Archive CLI
===============

Simple Python CLI to archive whole websites to the Web Archive via a `sitemap.xml` file.


Installation
------------

Create a fresh Python virtual env:

```
python3 -m venv venv
```

and activate it:

```
. venv/bin/activate
```

and install the dependencies:

```
pip install -r requirements.txt
```
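This initial commit does not include a `requirements.txt`; judging from the Dependencies section below, it would presumably contain a single line:

```
savepagenow
```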

Usage
-----

Activate the Python virtual env:

```
. venv/bin/activate
```

Convert a `sitemap.xml` file to a plain list of URLs:

```
python sitemap_to_urllist.py sitemap_example.org_2023-01-01.xml
```
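A note on naming: `sitemap_to_urllist.py` derives the output filename by splitting the input name on underscores, so it expects the pattern shown above, `sitemap_<domain>_<date>.xml`, and writes the URL list to `urls_<domain>_<date>.txt`. For example:

```
sitemap_example.org_2023-01-01.xml  ->  urls_example.org_2023-01-01.txt
```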

Push all URLs to the web archive:

```
python do_archive.py urls_example.org_2023-01-01.txt
```
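`do_archive.py` waits five seconds between requests to avoid HTTP 429 responses and writes every URL it could not capture to `failed_urls.txt`. With an illustrative two-entry URL file, the console output would look roughly like this (the archived URL is a placeholder):

```
Reading urls from: urls_example.org_2023-01-01.txt
Archiving 2 urls
Archiving url [1/2]: https://example.org/
 DONE
 -> https://web.archive.org/web/.../https://example.org/
Archiving url [2/2]: https://example.org/about
 CACHED
```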

Dependencies
------------

The archive script is based on the `savepagenow` Python package:

* https://pypi.org/project/savepagenow/
* https://github.com/palewire/savepagenow

To archive a single URL only, the `savepagenow` CLI can be used directly:

* https://palewi.re/docs/savepagenow/cli.html
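A minimal single-URL invocation following that CLI documentation would look something like this (illustrative URL):

```
savepagenow https://example.org/some-page.html
```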

Links
-----

Wayback API:

* https://archive.org/help/wayback_api.php
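As a small illustration (assuming the availability endpoint described on the linked help page), a lookup from Python could be sketched as:

```
# Illustrative sketch of the Wayback availability endpoint; example.org is
# a placeholder. The response lists the closest archived snapshot, if any.
import json
import urllib.request

with urllib.request.urlopen("https://archive.org/wayback/available?url=example.org") as resp:
    data = json.load(resp)

print(data.get("archived_snapshots", {}).get("closest", {}).get("url"))
```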

Manual page submission ("Save Page Now"):

* https://web.archive.org/save/
sitemap_to_urllist.py (new file, 19 additions)
@@ -0,0 +1,19 @@
#!/bin/python3

import sys
import xml.etree.ElementTree as ET

sitemapfilename = sys.argv[1]

# Expects a filename of the form sitemap_<domain>_<date>.xml
domain, timetag = sitemapfilename.removesuffix(".xml").split("_")[1:3]

tree = ET.parse(sitemapfilename)
root = tree.getroot()
xmltag = '{http://www.sitemaps.org/schemas/sitemap/0.9}loc'

urllist = [loc.text for loc in root.iter(xmltag)]

urlfilename = f"urls_{domain}_{timetag}.txt"

with open(urlfilename, 'w', encoding='utf_8') as f:
    f.write('\n'.join(urllist))
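To make the expected input concrete, here is a small self-contained sketch (not part of the commit; the URLs are illustrative) that parses a minimal `sitemap.xml` of the shape the script above handles: a `urlset` in the sitemaps.org 0.9 namespace containing `<loc>` entries.

```
# Illustrative only: a minimal sitemap of the expected shape, parsed with
# the same namespace-qualified tag used in sitemap_to_urllist.py.
import xml.etree.ElementTree as ET

sitemap = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.org/</loc></url>
  <url><loc>https://example.org/about</loc></url>
</urlset>"""

root = ET.fromstring(sitemap)
xmltag = '{http://www.sitemaps.org/schemas/sitemap/0.9}loc'
print([loc.text for loc in root.iter(xmltag)])
# -> ['https://example.org/', 'https://example.org/about']
```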