https://opendata.maryland.gov/
Scrape data
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "beautifulsoup4",
# "pandas",
# "requests",
# "tqdm",
# ]
# ///
# uv pip install beautifulsoup4 pandas requests tqdm
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
url = "https://opendata.maryland.gov/browse"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
modal = soup.find("div", {"data-modal-facet": "Category"})
categories = [item.text.strip().replace(" ", "+") for item in modal.find_all("a")]
category = "Energy+and+Environment"
base_url = f"https://opendata.maryland.gov/browse?category={category}&page="
datasets = []
page = 1
with tqdm(desc="Scraping pages", unit="page") as progress_bar:
    while True:
        url = f"{base_url}{page}"
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")

        # Each search result is a div.browse2-result; an empty page means we're done.
        entries = soup.find_all("div", class_="browse2-result")
        if not entries:
            print("\nNo more datasets found. Stopping.")
            break

        for entry in entries:
            try:
                # Title and link live in the result heading.
                title = (
                    entry.find("h2", class_="browse2-result-name")
                    .find("a")
                    .text.strip()
                )
                link = entry.find("h2", class_="browse2-result-name").find("a")["href"]

                description_div = entry.find("div", class_="browse2-result-description")
                description = (
                    description_div.text.strip()
                    if description_div
                    else "No description"
                )

                # The last-updated time is a Unix timestamp stored in a data attribute.
                timestamp_span = entry.find("span", class_="dateLocalize")
                raw_timestamp = (
                    timestamp_span["data-rawdatetime"] if timestamp_span else None
                )
                if raw_timestamp:
                    updated = pd.to_datetime(
                        int(raw_timestamp), unit="s", utc=True
                    ).tz_convert("US/Eastern")
                else:
                    updated = "No update info"

                views_div = entry.find("div", class_="browse2-result-view-count-value")
                views = views_div.text.strip().replace(",", "") if views_div else "0"

                tags_div = entry.find("div", class_="browse2-result-topics")
                if tags_div:
                    tags = ", ".join(
                        tag.text.strip()
                        for tag in tags_div.find_all("a", class_="browse2-result-topic")
                    )
                else:
                    tags = "No tags"

                datasets.append(
                    {
                        "title": title,
                        "url": link,
                        "description": description,
                        "updated": updated,
                        "views": int(views) if views.isdigit() else 0,
                        "tags": tags,
                    }
                )
            except AttributeError as e:
                print(f"\nError processing an entry: {e}")
                continue

        page += 1
        progress_bar.update(1)
df = pd.DataFrame(datasets)
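Save the scraped catalog for later use (a minimal sketch; the output filename is an assumption, not part of the original script):
# Hypothetical output path; the scrape collects dataset metadata, not the datasets themselves.
df.to_csv("maryland_energy_datasets.csv", index=False)
print(f"Saved {len(df)} dataset listings")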