1
0
Fork 0
mirror of https://codeberg.org/ral/web_archive_cli.git synced 2024-08-16 09:59:49 +02:00
web_archive_cli/do_archive.py

45 lines
904 B
Python
Raw Normal View History

2023-07-15 15:30:35 +02:00
#!/bin/python3
import sys
import time
import savepagenow
# Archive every URL listed in a text file via savepagenow.
#
# Usage: do_archive.py URLFILE
#
# URLFILE is read as UTF-8. Only lines that begin with "http" are treated as
# URLs; everything else (blank lines, comments, notes) is ignored. URLs whose
# capture raised an error are written to "failed_urls.txt" in the current
# working directory, one per line.

if len(sys.argv) < 2:
    # Exit with a readable message instead of an IndexError traceback.
    sys.exit(f"Usage: {sys.argv[0]} URLFILE")

urlfile = sys.argv[1]
print(f"Reading urls from: {urlfile}")

with open(urlfile, encoding='utf_8') as f:
    # The prefix test runs on the raw line (before stripping), so a URL that
    # is preceded by whitespace is skipped, exactly as before.
    urllist = [line.strip() for line in f if line.startswith("http")]

n = len(urllist)
print(f"Archiving {n} urls")

failed_urls = []
for position, url in enumerate(urllist, start=1):
    print(f"Archiving url [{position}/{n}]: {url}")
    try:
        archived_url = savepagenow.capture(url)
        print(" DONE")
        # This must be an f-string; otherwise the literal text
        # "{archived_url}" is printed instead of the archived URL.
        print(f" -> {archived_url}")
    except savepagenow.exceptions.CachedPage:
        # A cached page is reported separately and not counted as a failure.
        print(" CACHED")
    except Exception:
        # Catch Exception rather than using a bare except so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit still abort the run instead of
        # being recorded as a failed URL.
        failed_urls.append(url)
        print(" FAILED")

    # Avoid http error 429: too many requests
    time.sleep(5)

# The file is always (re)written, so an empty file means nothing failed.
with open("failed_urls.txt", 'w', encoding='utf_8') as f:
    f.write('\n'.join(failed_urls))