# Reference: https://github.com/madmaze/pytesseract
import re
from PIL import Image
import pytesseract
# Screenshot to run OCR on.
image_path = "/Users/raybell/Desktop/Screenshot 2024-11-17 at 9.44.37 AM.png"

# Open the screenshot once and share it between both OCR passes; the context
# manager closes the underlying file handle as soon as OCR is finished.
with Image.open(image_path) as screenshot:
    # Plain-text OCR output, used for the regex extraction below.
    extracted_text = pytesseract.image_to_string(screenshot)
    # Per-element OCR output; per pytesseract's docs, output_type='data.frame'
    # requests the result as a pandas DataFrame.
    df = pytesseract.image_to_data(screenshot, output_type='data.frame')

# Use regex to find standalone four-digit numbers of the form 20xx (e.g. years).
years = re.findall(r'\b(20\d{2})\b', extracted_text)
# Dollar amounts suffixed with "B" (presumably billions), e.g. "$12B" or "$3.5B".
spending = re.findall(r'\$\d+\.?\d*B', extracted_text)