https://developers.cloudflare.com/browser-rendering/
Token verify
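A minimal Python sketch against Cloudflare's documented token verification endpoint (GET /user/tokens/verify); the env var name here assumes the browser-rendering token used in the examples below:

import os

import requests

# Verify the API token itself (this endpoint is user-scoped, not account-scoped)
token = os.getenv('CLOUDFLARE_BROWSER_RENDERING_API_KEY')
response = requests.get(
    "https://api.cloudflare.com/client/v4/user/tokens/verify",
    headers={"Authorization": f"Bearer {token}"},
)
response.raise_for_status()
# A valid token reports "status": "active" under "result"
print(response.json())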
Fetch HTML REST API
curl -X 'POST' "https://api.cloudflare.com/client/v4/accounts/${CLOUDFLARE_ACCOUNT_NUMBER}/browser-rendering/content" \
-H 'Content-Type: application/json' \
-H "Authorization: Bearer ${CLOUDFLARE_API_KEY}" \
-d '{"url": "https://example.com"}'
URL to markdown REST API
curl -X 'POST' "https://api.cloudflare.com/client/v4/accounts/${CLOUDFLARE_ACCOUNT_NUMBER}/browser-rendering/markdown" \
-H 'Content-Type: application/json' \
-H "Authorization: Bearer ${CLOUDFLARE_BROWSER_RENDERING_API_KEY}" \
-d '{"url": "https://services.mde.maryland.gov/Application/SearchPermitTypes"}'
URL to markdown Python
import os

import requests

account_number = os.getenv('CLOUDFLARE_ACCOUNT_NUMBER')
api_key = os.getenv('CLOUDFLARE_BROWSER_RENDERING_API_KEY')
endpoint_url = f"https://api.cloudflare.com/client/v4/accounts/{account_number}/browser-rendering/markdown"
headers = {
    "Authorization": f"Bearer {api_key}",
    "Content-Type": "application/json"
}
data = {
    "url": "https://services.mde.maryland.gov/Application/SearchPermitTypes"
}
try:
    # Render the page and return its content converted to Markdown
    response = requests.post(endpoint_url, headers=headers, json=data)
    response.raise_for_status()
    markdown = response.json()["result"]
    print(markdown)
except requests.exceptions.HTTPError as err:
    print(f"HTTP error occurred: {err}")
except Exception as err:
    print(f"An error occurred: {err}")
AI task
curl https://api.cloudflare.com/client/v4/accounts/${CLOUDFLARE_ACCOUNT_NUMBER}/ai/run/@cf/meta/llama-3-8b-instruct \
-H "Authorization: Bearer ${CLOUDFLARE_WORKERS_AI_API_KEY}" \
-d '{"messages":[{"role":"user","content":"test"}]}'
Local PDF to markdown
https://developers.cloudflare.com/workers-ai/features/markdown-conversion/
curl "https://api.cloudflare.com/client/v4/accounts/${CLOUDFLARE_ACCOUNT_NUMBER}/ai/tomarkdown" \
-X POST \
-H "Authorization: Bearer ${CLOUDFLARE_WORKERS_AI_API_KEY}" \
-F "files=@sb0818E.pdf"
Crawl REST API (the POST returns a job id; substitute it for {{ ID }} in the GET below)
curl -X POST "https://api.cloudflare.com/client/v4/accounts/${CLOUDFLARE_ACCOUNT_NUMBER}/browser-rendering/crawl" \
-H "Authorization: Bearer ${CLOUDFLARE_BROWSER_RENDERING_API_KEY}" \
-H 'Content-Type: application/json' \
-d '{"url": "https://ai.maryland.gov/"}'
curl -X GET "https://api.cloudflare.com/client/v4/accounts/${CLOUDFLARE_ACCOUNT_NUMBER}/browser-rendering/crawl/{{ ID }}" \
-H "Authorization: Bearer ${CLOUDFLARE_BROWSER_RENDERING_API_KEY}"
Crawl Python for URLs
import csv
import os
import sys
import time

import pandas as pd
import requests

# Ensure environment variables are set
account_number = os.environ.get('CLOUDFLARE_ACCOUNT_NUMBER')
api_key = os.environ.get('CLOUDFLARE_BROWSER_RENDERING_API_KEY')
if not account_number or not api_key:
    print("Error: CLOUDFLARE_ACCOUNT_NUMBER and CLOUDFLARE_BROWSER_RENDERING_API_KEY must be set.")
    sys.exit(1)

# 1. Start the crawl job
endpoint_url = f"https://api.cloudflare.com/client/v4/accounts/{account_number}/browser-rendering/crawl"
headers = {
    "Authorization": f"Bearer {api_key}",
    "Content-Type": "application/json"
}
payload = {
    "url": "https://ai.maryland.gov/",
    "limit": 100,
    "options": {
        "includePatterns": [
            "https://ai.maryland.gov/**"
        ]
    }
}
print("Starting crawl job...")
response = requests.post(endpoint_url, headers=headers, json=payload)
response.raise_for_status()

# Handle cases where 'result' is the string ID or a dictionary containing 'id'
result = response.json().get("result")
if isinstance(result, dict):
    job_id = result.get("id")
else:
    job_id = result
if not job_id:
    print("Failed to get a Job ID from the API response.")
    print("Raw response:", response.json())
    sys.exit(1)
print(f"Job ID: {job_id}")

# 2. Poll until the job is complete
get_url = f"https://api.cloudflare.com/client/v4/accounts/{account_number}/browser-rendering/crawl/{job_id}"
get_headers = {"Authorization": f"Bearer {api_key}"}
print("Polling for job completion...")
while True:
    poll_url = f"{get_url}?limit=1"  # keep the polling payload small
    get_response = requests.get(poll_url, headers=get_headers)
    get_response.raise_for_status()
    result_data = get_response.json().get("result", {})
    status = result_data.get("status")
    if status in ["completed", "cancelled_due_to_limits", "cancelled_due_to_timeout", "errored"]:
        print(f"\nCrawl finished with status: {status}")
        break
    print(f"Crawl status: {status}... waiting 5 seconds.", end="\r", flush=True)
    time.sleep(5)

# 3. Paginate through results and write to CSV incrementally
csv_file = "urls.csv"
crawled_urls = set()
cursor = None
page_number = 1
print(f"\nFetching all records and writing to {csv_file}...")
# Initialize the CSV file with a header, then stream rows in as pages arrive
with open(csv_file, mode='w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['url'])
    while True:
        paginated_url = f"{get_url}?cursor={cursor}" if cursor else get_url
        page_response = requests.get(paginated_url, headers=get_headers)
        page_response.raise_for_status()
        page_result = page_response.json().get("result", {})
        records = page_result.get("records", [])
        print(f"Fetched page {page_number}: found {len(records)} records.")
        for record in records:
            rec_status = record.get("status")
            url = record.get("url")
            if rec_status in ["completed", "success"] and url not in crawled_urls:
                crawled_urls.add(url)
                writer.writerow([url])
                f.flush()  # Ensure it writes to disk immediately
        cursor = page_result.get("cursor")
        if not cursor:
            break
        page_number += 1

print("\n--- Final Results ---")
print(f"Total unique completed URLs saved to {csv_file}: {len(crawled_urls)}")

# 4. Create and display the final DataFrame
if crawled_urls:
    sorted_urls = sorted(crawled_urls)
    crawled_df = pd.DataFrame(sorted_urls, columns=['url'])
    print("\nFinal DataFrame (first 10 rows):")
    print(crawled_df.head(10))
else:
    print("\nNo URLs were successfully crawled.")