The script below will:
Conduct a search for "e coli" (found in the "URL" variable in the "main" function).
You can change this to be whatever search you want it to be. We suggest playing around with the browser UI to dial in the query you want, copying it using the "copy API button" and then pasting into the script.
Parse the search results and extract the target project IDs, along with the JSON path where each one was found.
You can change these to be whatever values you want. The "_id" field may be of particular interest.
You can add additional keys of interest to the TARGET_KEYS set.
#!/usr/bin/env python3
import json
from collections.abc import Mapping, Sequence
from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse
import requests
# JSON keys whose values should be collected wherever they appear in the
# API response; extend this set to extract additional fields.
TARGET_KEYS = {
"analysis_project_id",
"sequencing_project_id",
"pmo_project_id",
}
def ensure_json_url(url: str) -> str:
    """Return *url* with a ``format=json`` query parameter guaranteed present.

    An existing ``format`` value is left untouched; only when the parameter
    is missing is ``format=json`` appended. This avoids HTML rendering
    wrappers around the API response.
    """
    parts = urlparse(url)
    params = dict(parse_qsl(parts.query, keep_blank_values=True))
    if "format" not in params:
        params["format"] = "json"
    rebuilt_query = urlencode(params, doseq=True)
    return urlunparse(parts._replace(query=rebuilt_query))
def walk(obj, path=""):
    """Depth-first traversal of nested dict/list structures.

    Yields ``(path, key, value)`` for every mapping entry whose key is in
    TARGET_KEYS. ``path`` is a dotted/indexed locator such as
    ``organisms[0].analysis_project_id``.
    """
    if isinstance(obj, Mapping):
        for name, child in obj.items():
            child_path = f"{path}.{name}" if path else name
            if name in TARGET_KEYS:
                yield child_path, name, child
            yield from walk(child, child_path)
        return
    # Strings are Sequences too, but recursing into them character by
    # character would be meaningless — treat them as leaves.
    if isinstance(obj, (str, bytes, bytearray)):
        return
    if isinstance(obj, Sequence):
        for idx, child in enumerate(obj):
            yield from walk(child, f"{path}[{idx}]")
def fetch_json(url: str, timeout: int = 60) -> dict:
    """GET *url* (forced to JSON output) and return the decoded payload.

    Raises ``requests.HTTPError`` on a non-2xx response and whatever
    ``Response.json`` raises when the body is not valid JSON.
    """
    request_headers = {
        "Accept": "application/json",
        "User-Agent": "python-requests/jgi-id-extractor",
    }
    response = requests.get(
        ensure_json_url(url),
        headers=request_headers,
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()
def extract_project_ids(data: dict) -> dict:
    """Collect every TARGET_KEYS match found anywhere in *data*.

    Returns a dict with:
      * ``"unique_ids"`` — for each target key, a sorted list of the distinct
        values found for it.
      * ``"matches"`` — one ``{"path", "key", "value"}`` record per
        occurrence, in traversal order.
    """
    found = {key: set() for key in TARGET_KEYS}
    detailed = []
    for path, key, value in walk(data):
        detailed.append({
            "path": path,
            "key": key,
            "value": value,
        })
        try:
            found[key].add(value)
        except TypeError:
            # The value was unhashable (e.g. a list or dict under a target
            # key). Record a canonical JSON rendering instead of crashing.
            found[key].add(json.dumps(value, sort_keys=True))
    return {
        # Sort by (type name, value) so mixed int/str ID sets do not raise
        # TypeError; homogeneous sets sort exactly as plain sorted() would.
        "unique_ids": {
            k: sorted(v, key=lambda x: (x.__class__.__name__, x))
            for k, v in found.items()
        },
        "matches": detailed,
    }
def main():
    """Run the sample e. coli search and print the extracted project IDs."""
    search_url = "https://files.jgi.doe.gov/search/?q=e+coli"
    payload = fetch_json(search_url)
    summary = extract_project_ids(payload)
    print("Unique project IDs found:")
    print(json.dumps(summary["unique_ids"], indent=2, sort_keys=True))
    print("\nDetailed matches:")
    print(json.dumps(summary["matches"], indent=2, sort_keys=False))


if __name__ == "__main__":
    main()