# Setup (run in your shell, not in Python): pip install beautifulsoup4 requests
import requests
from bs4 import BeautifulSoup
import csv
import sys
def get_sitemap_urls(sitemap_url, visited=None):
    """
    Recursively fetch URLs from a sitemap, handling sitemap indexes.

    Parameters
    ----------
    sitemap_url : str
        URL of a sitemap.xml (or sitemap index) to fetch.
    visited : set[str] | None
        Sitemap URLs already fetched, used to break cycles between
        sitemap indexes that reference each other. Callers normally
        omit this; the recursion threads it through.

    Returns
    -------
    set[str]
        All unique page URLs found in this sitemap and any nested
        sub-sitemaps. Fetch/parse errors are printed and contribute
        an empty set rather than raising.
    """
    if visited is None:
        visited = set()
    if sitemap_url in visited:
        # Already processed -- avoid infinite recursion on cyclic indexes.
        return set()
    visited.add(sitemap_url)

    urls = set()
    print(f"Fetching: {sitemap_url}")
    try:
        # Some servers block requests without a User-Agent
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        response = requests.get(sitemap_url, headers=headers, timeout=10)
        response.raise_for_status()

        # Parse the XML.
        # NOTE(review): the 'xml' feature requires lxml to be installed --
        # confirm it is available, or switch to 'html.parser' as a fallback.
        soup = BeautifulSoup(response.content, 'xml')

        # 1. Sitemap index (points to other sitemaps): recurse into each.
        sitemaps = soup.find_all('sitemap')
        if sitemaps:
            print(f" -> Found sitemap index with {len(sitemaps)} sub-sitemaps.")
            for sitemap in sitemaps:
                loc = sitemap.find('loc')
                if loc and loc.text:
                    # Recursively fetch the sub-sitemap
                    urls.update(get_sitemap_urls(loc.text.strip(), visited))

        # 2. Regular sitemap: collect the page <url><loc> entries.
        url_tags = soup.find_all('url')
        if url_tags:
            print(f" -> Found {len(url_tags)} URLs in this sitemap.")
            for url_tag in url_tags:
                loc = url_tag.find('loc')
                if loc and loc.text:
                    urls.add(loc.text.strip())
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {sitemap_url}: {e}")
    except Exception as e:
        print(f"Error parsing {sitemap_url}: {e}")
    return urls
def save_to_csv(urls, filename="sitemap_urls.csv"):
    """
    Save a set/list of URLs to a single-column CSV file.

    Parameters
    ----------
    urls : Iterable[str]
        URLs to write. If empty/falsy, nothing is written and a
        message is printed instead.
    filename : str
        Path of the CSV file to (over)write. Defaults to
        "sitemap_urls.csv".
    """
    if not urls:
        print("No URLs to save.")
        return

    # Sort URLs alphabetically for easier reading
    sorted_urls = sorted(urls)

    with open(filename, mode='w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['url'])  # Header
        for url in sorted_urls:
            writer.writerow([url])

    # Bug fix: the success message previously printed the literal text
    # "(unknown)" instead of interpolating the output filename.
    print(f"\nSuccess! Saved {len(sorted_urls)} unique URLs to {filename}")
# Module-level configuration: the sitemap to crawl and the output file.
TARGET_SITEMAP = "https://ai.maryland.gov/sitemap.xml"
OUTPUT_CSV = "sitemap_urls.csv"

if __name__ == "__main__":
    # Guarded entry point: run the crawl only when executed as a script,
    # so importing this module does not trigger network requests.
    print(f"Starting sitemap extraction for: {TARGET_SITEMAP}")
    extracted_urls = get_sitemap_urls(TARGET_SITEMAP)
    save_to_csv(extracted_urls, OUTPUT_CSV)