# Setup (run in your shell, not in Python): pip install beautifulsoup4 requests
import requests
from bs4 import BeautifulSoup
import csv
import sys
def get_sitemap_urls(sitemap_url, visited=None):
    """
    Recursively fetch URLs from a sitemap, handling sitemap indexes.

    Parameters
    ----------
    sitemap_url : str
        URL of a sitemap.xml (or sitemap index) to fetch.
    visited : set[str] | None
        Sitemap URLs already fetched, used to break cycles between
        sitemap indexes that reference each other. Callers normally
        omit this; the recursion threads it through.

    Returns
    -------
    set[str]
        All unique page URLs found in this sitemap and any nested
        sub-sitemaps. Fetch/parse errors are printed and contribute
        an empty set rather than raising.
    """
    if visited is None:
        visited = set()
    if sitemap_url in visited:
        # Already processed -- avoid infinite recursion on cyclic indexes.
        return set()
    visited.add(sitemap_url)

    urls = set()
    print(f"Fetching: {sitemap_url}")
    try:
        # Some servers block requests without a User-Agent
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        response = requests.get(sitemap_url, headers=headers, timeout=10)
        response.raise_for_status()

        # Parse the XML.
        # NOTE(review): the 'xml' feature requires lxml to be installed --
        # confirm it is available, or switch to 'html.parser' as a fallback.
        soup = BeautifulSoup(response.content, 'xml')

        # 1. Sitemap index (points to other sitemaps): recurse into each.
        sitemaps = soup.find_all('sitemap')
        if sitemaps:
            print(f" -> Found sitemap index with {len(sitemaps)} sub-sitemaps.")
            for sitemap in sitemaps:
                loc = sitemap.find('loc')
                if loc and loc.text:
                    # Recursively fetch the sub-sitemap
                    urls.update(get_sitemap_urls(loc.text.strip(), visited))

        # 2. Regular sitemap: collect the page <url><loc> entries.
        url_tags = soup.find_all('url')
        if url_tags:
            print(f" -> Found {len(url_tags)} URLs in this sitemap.")
            for url_tag in url_tags:
                loc = url_tag.find('loc')
                if loc and loc.text:
                    urls.add(loc.text.strip())
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {sitemap_url}: {e}")
    except Exception as e:
        print(f"Error parsing {sitemap_url}: {e}")
    return urls
def save_to_csv(urls, filename="sitemap_urls.csv"):
    """
    Save a set/list of URLs to a single-column CSV file.

    Parameters
    ----------
    urls : Iterable[str]
        URLs to write. If empty/falsy, nothing is written and a
        message is printed instead.
    filename : str
        Path of the CSV file to (over)write. Defaults to
        "sitemap_urls.csv".
    """
    if not urls:
        print("No URLs to save.")
        return

    # Sort URLs alphabetically for easier reading
    sorted_urls = sorted(urls)

    with open(filename, mode='w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['url'])  # Header
        for url in sorted_urls:
            writer.writerow([url])

    # Bug fix: the success message previously printed the literal text
    # "(unknown)" instead of interpolating the output filename.
    print(f"\nSuccess! Saved {len(sorted_urls)} unique URLs to {filename}")
# Module-level configuration: the sitemap to crawl and the output file.
TARGET_SITEMAP = "https://ai.maryland.gov/sitemap.xml"
OUTPUT_CSV = "sitemap_urls.csv"

if __name__ == "__main__":
    # Guarded entry point: run the crawl only when executed as a script,
    # so importing this module does not trigger network requests.
    print(f"Starting sitemap extraction for: {TARGET_SITEMAP}")
    extracted_urls = get_sitemap_urls(TARGET_SITEMAP)
    save_to_csv(extracted_urls, OUTPUT_CSV)