Mirror of https://codeberg.org/ral/web_archive_cli.git
synced 2024-08-16 09:59:49 +02:00
45 lines · 904 B · Python
#!/usr/bin/env python3

import sys
import time

import savepagenow
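# Note: savepagenow is a third-party client for the Internet Archive's
# "Save Page Now" service; it must be installed first, e.g. with:
#     pip install savepagenow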
# The file listing the URLs to archive is given as the first argument.
urlfile = sys.argv[1]

print(f"Reading urls from: {urlfile}")

# A line counts as a URL if it starts with "http" (this covers https too).
is_url = lambda s: s.startswith("http")
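# For illustration, an input file might look like this (placeholder URLs);
# blank lines and notes that don't start with "http" are simply skipped:
#
#     bookmarks to preserve (this note is ignored)
#     https://example.org/page-one
#     https://example.org/a/deeper/page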
with open(urlfile, encoding='utf_8') as f:
    lines = f.readlines()

# Keep only the URL lines, stripped of surrounding whitespace.
urllist = [line.strip() for line in lines if is_url(line)]
n = len(urllist)
print(f"Archiving {n} urls")

# URLs whose capture fails are collected here and written out at the end.
failed_urls = []
for i, url in enumerate(urllist):
    print(f"Archiving url [{i+1}/{n}]: {url}")

    try:
        archived_url = savepagenow.capture(url)
        print("  DONE")
        print(f"  -> {archived_url}")
    except savepagenow.exceptions.CachedPage:
        # The Wayback Machine already holds a recent snapshot of this URL.
        print("  CACHED")
    except Exception:
        # Any other error: remember the URL so it can be retried later.
        failed_urls.append(url)
        print("  FAILED")

    # Avoid http error 429: too many requests
    time.sleep(5)
# Write the failed URLs, one per line, so they can be retried.
with open("failed_urls.txt", 'w', encoding='utf_8') as f:
    f.write('\n'.join(failed_urls))
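# A minimal end-to-end run, assuming this script is saved as
# web_archive.py (the filename here is an assumption):
#
#     python3 web_archive.py urls.txt
#
# Because failed_urls.txt is written one URL per line, it can be passed
# straight back to the script to retry only the failed captures:
#
#     python3 web_archive.py failed_urls.txt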