Mirror of https://codeberg.org/ral/web_archive_cli.git, synced 2024-08-16 09:59:49 +02:00
Initial checkin
Commit 3bf3a6bfbd
4 changed files with 144 additions and 0 deletions
.gitignore (vendored): new file, 7 lines
@@ -0,0 +1,7 @@
*~

*.xml
*.txt

__pycache__/
venv/
do_archive.py: new file, 44 lines
@@ -0,0 +1,44 @@
#!/bin/python3

import sys
import time
import savepagenow


urlfile = sys.argv[1]

print(f"Reading urls from: {urlfile}")

is_url = lambda s: s.startswith("http")

# Keep only the lines that look like URLs and strip trailing whitespace.
with open(urlfile, encoding='utf_8') as f:
    lines = f.readlines()
    urllist = list(filter(is_url, lines))
    urllist = list(map(lambda s: s.strip(), urllist))


n = len(urllist)
print(f"Archiving {n} urls")


failed_urls = []

for i, url in enumerate(urllist):
    print(f"Archiving url [{i+1}/{n}]: {url}")

    try:
        archived_url = savepagenow.capture(url)
        print(" DONE")
        print(f" -> {archived_url}")
    except savepagenow.exceptions.CachedPage:
        # The Wayback Machine already has a recent snapshot of this URL.
        print(" CACHED")
    except Exception:
        # Any other failure: remember the URL and carry on with the list.
        failed_urls.append(url)
        print(" FAILED")

    # Avoid http error 429: too many requests
    time.sleep(5)


# Write the URLs that could not be archived, for a later retry.
with open("failed_urls.txt", 'w', encoding='utf_8') as f:
    f.write('\n'.join(failed_urls))
readme.md: new file, 74 lines
@@ -0,0 +1,74 @@
Web Archive CLI
===============

A simple Python CLI to archive whole websites to the Web Archive via a `sitemap.xml` file.


Installation
------------

Create a fresh Python virtual env:

```
python3 -m venv venv
```

and activate it:

```
. venv/bin/activate
```

and install the dependencies:

```
pip install -r requirements.txt
```


Usage
-----

Activate the Python virtual env:

```
. venv/bin/activate
```

Convert a `sitemap.xml` file to a plain list of URLs:

```
python sitemap_to_urllist.py sitemap_example.org_2023-01-01.xml
```

Push all URLs to the web archive:

```
python do_archive.py urls_example.org_2023-01-01.txt
```


Dependencies
------------

The archive script is based on the `savepagenow` Python package (a minimal usage sketch follows the links):

* https://pypi.org/project/savepagenow/
* https://github.com/palewire/savepagenow
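
For orientation, a hedged sketch of the package call the archive script builds on. It assumes savepagenow's `capture_or_cache` helper behaves as documented (returning the snapshot URL plus a flag for whether the capture is fresh); verify against the links above:

```
import savepagenow

# Capture one URL: returns the snapshot URL and True for a fresh capture,
# or False when the Wayback Machine served a recent cached snapshot instead.
archive_url, fresh = savepagenow.capture_or_cache("https://example.org/")
print(archive_url, fresh)
```

`do_archive.py` itself calls `savepagenow.capture()` and handles the `CachedPage` exception explicitly.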

To archive a single URL only, the `savepagenow` CLI can be used directly (see the example after the link):

* https://palewi.re/docs/savepagenow/cli.html
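
For example, from within the activated virtual env (the command is installed together with the package; the URL is a placeholder), something like:

```
savepagenow https://example.org/
```

should trigger a capture and print the snapshot URL on success.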


Links
-----

Wayback API (an availability-check sketch follows the link):

* https://archive.org/help/wayback_api.php
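
For instance, the availability endpoint of that API can be used to check whether a URL already has a snapshot before re-submitting it. A minimal standard-library sketch; the endpoint and JSON shape follow the linked documentation, so double-check the field names there:

```
import json
import urllib.parse
import urllib.request

url = "https://example.org/"
query = urllib.parse.urlencode({"url": url})
with urllib.request.urlopen(f"https://archive.org/wayback/available?{query}") as resp:
    data = json.load(resp)

# "closest" is present when the Wayback Machine already has a snapshot.
snapshot = data.get("archived_snapshots", {}).get("closest")
if snapshot:
    print("Already archived:", snapshot["url"], snapshot["timestamp"])
else:
    print("No snapshot yet:", url)
```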

Manual page feed:

* https://web.archive.org/save/
sitemap_to_urllist.py: new file, 19 lines
@@ -0,0 +1,19 @@
#!/bin/python3

import sys
import xml.etree.ElementTree as ET

sitemapfilename = sys.argv[1]

# Expect a filename like "sitemap_<domain>_<timetag>.xml".
# removesuffix (Python 3.9+) drops only the ".xml" ending, unlike rstrip,
# which strips any trailing run of the characters '.', 'x', 'm', 'l'.
domain, timetag = sitemapfilename.removesuffix(".xml").split("_")[1:3]

tree = ET.parse(sitemapfilename)
root = tree.getroot()
xmltag = '{http://www.sitemaps.org/schemas/sitemap/0.9}loc'

# Collect the text of every <loc> element in the sitemap.
urllist = [loc.text for loc in root.iter(xmltag)]

urlfilename = f"urls_{domain}_{timetag}.txt"

with open(urlfilename, 'w', encoding='utf_8') as f:
    f.write('\n'.join(urllist))