#!/usr/bin/env python3
"""Archive a list of URLs with the Wayback Machine via savepagenow.

Usage: pass a text file of URLs (one per line) as the first argument.
"""
import sys
import time

import savepagenow

urlfile = sys.argv[1]
print(f"Reading urls from: {urlfile}")

# Keep only lines that look like URLs and strip trailing whitespace/newlines.
is_url = lambda s: s.startswith("http")

with open(urlfile, encoding='utf_8') as f:
    lines = f.readlines()

urllist = [line.strip() for line in lines if is_url(line)]

n = len(urllist)
print(f"Archiving {n} urls")

failed_urls = []
for i, url in enumerate(urllist):
    print(f"Archiving url [{i+1}/{n}]: {url}")
    try:
        archived_url = savepagenow.capture(url)
        print(" DONE")
        print(f" -> {archived_url}")
    except savepagenow.exceptions.CachedPage:
        # The Wayback Machine already has a recent snapshot of this URL.
        print(" CACHED")
    except Exception:
        failed_urls.append(url)
        print(" FAILED")
    # Avoid http error 429: too many requests
    time.sleep(5)

# Record any URLs that could not be archived so they can be retried later.
with open("failed_urls.txt", 'w', encoding='utf_8') as f:
    f.write('\n'.join(failed_urls))