mirror of
https://codeberg.org/ral/web_archive_cli.git
synced 2024-08-16 09:59:49 +02:00
20 lines
457 B
Python
20 lines
457 B
Python
|
#!/bin/python3
"""Extract the URLs listed in a sitemap XML file into a plain-text file.

Usage:
    <script> sitemap_<domain>_<timetag>.xml

The input file name is split on ``_``; the second and third fields are taken
as the domain and the time tag.  Every ``<loc>`` entry of the sitemap is
written, one per line, to ``urls_<domain>_<timetag>.txt`` in the current
working directory.
"""

import os
import sys
import xml.etree.ElementTree as ET

# Fully qualified name of the <loc> element in the sitemap 0.9 namespace.
LOC_TAG = '{http://www.sitemaps.org/schemas/sitemap/0.9}loc'

XML_SUFFIX = ".xml"


def parse_sitemap_name(sitemapfilename):
    """Return ``(domain, timetag)`` taken from a sitemap file name.

    Only the final path component is inspected, so underscores in directory
    names do not shift the fields.  The ``.xml`` extension is removed as a
    literal suffix; ``str.rstrip(".xml")`` would instead strip any trailing
    run of the characters ``.``, ``x``, ``m`` and ``l`` and could eat the end
    of the time tag (e.g. ``full`` -> ``fu``).

    Raises:
        ValueError: if the name has fewer than three ``_``-separated fields.
    """
    basename = os.path.basename(sitemapfilename)
    if basename.endswith(XML_SUFFIX):
        basename = basename[:-len(XML_SUFFIX)]

    fields = basename.split("_")
    if len(fields) < 3:
        raise ValueError(
            f"expected a name like <prefix>_<domain>_<timetag>.xml, "
            f"got {sitemapfilename!r}"
        )
    # Fields beyond the third are ignored.
    return fields[1], fields[2]


def extract_urls(root):
    """Return the text of every ``<loc>`` element below *root*.

    Surrounding whitespace is stripped so each URL fits on one output line,
    and empty ``<loc>`` elements (whose ``text`` is ``None``) are skipped
    instead of breaking the later ``str.join``.
    """
    urls = []
    for loc in root.iter(LOC_TAG):
        text = (loc.text or "").strip()
        if text:
            urls.append(text)
    return urls


def main(argv=None):
    """Convert the sitemap named by the first argument into a URL list file.

    Args:
        argv: command-line arguments without the program name; defaults to
            ``sys.argv[1:]``.  Arguments after the first are ignored.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        sys.exit(f"usage: {sys.argv[0]} sitemap_<domain>_<timetag>.xml")

    sitemapfilename = args[0]
    domain, timetag = parse_sitemap_name(sitemapfilename)

    root = ET.parse(sitemapfilename).getroot()
    urllist = extract_urls(root)

    urlfilename = f"urls_{domain}_{timetag}.txt"
    with open(urlfilename, 'w', encoding='utf_8') as f:
        f.write('\n'.join(urllist))


if __name__ == "__main__":
    main()
|