1
0
Fork 0
mirror of https://codeberg.org/ral/web_archive_cli.git synced 2024-08-16 09:59:49 +02:00
web_archive_cli/sitemap_to_urllist.py
2023-07-15 15:30:35 +02:00

19 lines
457 B
Python

#!/bin/python3
import sys
import xml.etree.ElementTree as ET
# Fully-qualified tag of the <loc> element in the sitemap 0.9 XML namespace.
SITEMAP_LOC_TAG = '{http://www.sitemaps.org/schemas/sitemap/0.9}loc'


def split_sitemap_name(sitemapfilename):
    """Return ``(domain, timetag)`` parsed from a sitemap file name.

    The name is expected to look like ``<prefix>_<domain>_<timetag>.xml``;
    the second and third underscore-separated fields are returned.

    Raises:
        ValueError: if the name has fewer than three underscore-separated
            fields.
    """
    # Function-local import so this block does not depend on a new
    # file-level import.
    import os

    # Only the file name carries the fields; underscores in directory
    # components must not shift them.
    stem = os.path.basename(sitemapfilename)
    # str.rstrip(".xml") would strip any trailing run of the characters
    # '.', 'x', 'm', 'l' (turning "..._full.xml" into "..._fu"), so remove
    # the literal suffix instead.
    if stem.endswith(".xml"):
        stem = stem[:-len(".xml")]
    parts = stem.split("_")
    if len(parts) < 3:
        raise ValueError(
            f"cannot parse domain and timetag from {sitemapfilename!r}; "
            "expected a name like sitemap_<domain>_<timetag>.xml"
        )
    return parts[1], parts[2]


def collect_urls(root):
    """Return the text of every sitemap ``<loc>`` element below *root*.

    Surrounding whitespace is stripped, and elements without any text are
    skipped so the result can always be joined into a URL list.
    """
    urls = []
    for loc in root.iter(SITEMAP_LOC_TAG):
        text = (loc.text or "").strip()
        if text:
            urls.append(text)
    return urls


def main(argv=None):
    """Convert the sitemap named on the command line into a URL list file.

    Reads the sitemap path from ``argv[1]`` (``sys.argv`` when *argv* is
    None), writes one URL per line to ``urls_<domain>_<timetag>.txt`` in the
    current working directory, and returns that file name.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        raise SystemExit(
            "usage: sitemap_to_urllist.py sitemap_<domain>_<timetag>.xml"
        )
    sitemapfilename = args[1]
    domain, timetag = split_sitemap_name(sitemapfilename)
    root = ET.parse(sitemapfilename).getroot()
    urllist = collect_urls(root)
    urlfilename = f"urls_{domain}_{timetag}.txt"
    with open(urlfilename, 'w', encoding='utf_8') as f:
        # Newline-separated with no trailing newline.
        f.write('\n'.join(urllist))
    return urlfilename


if __name__ == "__main__":
    main()