https://opendata.maryland.gov/
Scrape data
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "beautifulsoup4",
# "pandas",
# "requests",
# "tqdm",
# ]
# ///
# uv pip install beautifulsoup4 pandas requests tqdm
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
url = "https://opendata.maryland.gov/browse"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
modal = soup.find("div", {"data-modal-facet": "Category"})
categories = [item.text.strip().replace(" ", "+") for item in modal.find_all("a")]
category = "Energy+and+Environment"
base_url = f"https://opendata.maryland.gov/browse?category={category}&page="
datasets = []
page = 1
with tqdm(desc="Scraping pages", unit="page") as progress_bar:
    while True:
        url = f"{base_url}{page}"
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")

        # Each search result is a div.browse2-result; an empty page means we're done.
        entries = soup.find_all("div", class_="browse2-result")
        if not entries:
            print("\nNo more datasets found. Stopping.")
            break

        for entry in entries:
            try:
                # Title and link live in the result heading.
                title = (
                    entry.find("h2", class_="browse2-result-name")
                    .find("a")
                    .text.strip()
                )
                link = entry.find("h2", class_="browse2-result-name").find("a")["href"]

                description_div = entry.find("div", class_="browse2-result-description")
                description = (
                    description_div.text.strip()
                    if description_div
                    else "No description"
                )

                # The last-updated time is a Unix timestamp stored in a data attribute.
                timestamp_span = entry.find("span", class_="dateLocalize")
                raw_timestamp = (
                    timestamp_span["data-rawdatetime"] if timestamp_span else None
                )
                if raw_timestamp:
                    updated = pd.to_datetime(
                        int(raw_timestamp), unit="s", utc=True
                    ).tz_convert("US/Eastern")
                else:
                    updated = "No update info"

                views_div = entry.find("div", class_="browse2-result-view-count-value")
                views = views_div.text.strip().replace(",", "") if views_div else "0"

                tags_div = entry.find("div", class_="browse2-result-topics")
                if tags_div:
                    tags = ", ".join(
                        tag.text.strip()
                        for tag in tags_div.find_all("a", class_="browse2-result-topic")
                    )
                else:
                    tags = "No tags"

                datasets.append(
                    {
                        "title": title,
                        "url": link,
                        "description": description,
                        "updated": updated,
                        "views": int(views) if views.isdigit() else 0,
                        "tags": tags,
                    }
                )
            except AttributeError as e:
                print(f"\nError processing an entry: {e}")
                continue

        page += 1
        progress_bar.update(1)
df = pd.DataFrame(datasets)
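Save the scraped catalog for later use (a minimal sketch; the output filename is an assumption, not part of the original script):
# Hypothetical output path; the scrape collects dataset metadata, not the datasets themselves.
df.to_csv("maryland_energy_datasets.csv", index=False)
print(f"Saved {len(df)} dataset listings")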