A classic exercise in data analysis is analyzing IMDb data. But I have noticed that what data scientists regularly do is analyze the current Top 250 ratings. Since I am more interested in somewhat subjective questions, such as how the perception of a good movie changes over time, what I need is the history of the ratings, and all the related data, over time.
Fortunately, the official IMDb site publishes the history of the Top 250 list here, but since the data is presented in static pages or in controls built on the fly, it is not an easy task to scrape/crawl such public data.
There is another source of this data here that is easier to scrape.
In this post I share the (Python 3) code used to crawl it, along with a preliminary analysis (in Mathematica) that only aims to show dynamically how the rating of each movie changes over time. More interesting analyses will be introduced in future posts.
First, the Python code for scraping:
import requests
import bs4 as bs
from string import ascii_uppercase
import re
import csv
linkTableMovies = []   # full-history URL for each movie
allMoviesNames = []    # movie titles, in the same order
# Collect and parse every index page of movie names, one initial letter at a time
for c in ascii_uppercase:
    page = requests.get('http://top250.info/movies/?' + c)
    soup = bs.BeautifulSoup(page.text, 'html.parser')
    # Pull the div with class 'layout' that wraps the movie list
    movies_name_list = soup.find(class_='layout')
    # Pull all instances of the <a> tag (one per movie) within that div
    movies_name_list_items = movies_name_list.find_all('a')
    # Code for debugging: print all the movies' names
    # for movie_name in movies_name_list_items:
    #     print(movie_name.prettify())
    #
    # for movie_name in movies_name_list_items:
    #     movie = movie_name.contents[0]
    #     print(movie)
    # Capture, in alphabetical order, the link to each movie's full history page
    for movie_name in movies_name_list_items:
        link = 'http://top250.info' + movie_name['href'] + '/full'
        linkTableMovies.append(link)
        allMoviesNames.append(movie_name.text)
# A (kind of) unit test: append only one link to see how the rest works
# link = 'http://top250.info/movie/?0257360/full'
# linkTableMovies.append(link)
# Here is where the data is saved as a CSV file
# (newline='' avoids spurious blank rows in the output)
f = csv.writer(open('/Users/beto/Documents/IMDbData.csv', 'w', newline=''))
counter = -1
limit = len(allMoviesNames)
for url in linkTableMovies:
    res = requests.get(url)
    counter = counter + 1
    movName = allMoviesNames[counter]
    # For watching the progress of the scraping
    print('Processing ' + str(counter) + ' of ' + str(limit) + ' (' + movName + ')')
    soup = bs.BeautifulSoup(res.content, 'lxml')
    # Find all the data rows of the table: the rows come in several kinds,
    # but all their class names start with "row_"
    tb = soup.find_all('tr', class_=re.compile("row_"))
    # Saving the data: date, position, rating, number of votes
    for row in tb:
        tds = row.find_all('td')
        if len(tds) > 0:
            date = tds[0].text
            pos = tds[1].text
            rate = tds[3].text
            votes = tds[4].text
            f.writerow([movName, date, pos, rate, votes])
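The preliminary analysis in this post was done in Mathematica, but to give an idea of what it looks like, here is a minimal Python sketch of the same thing, assuming the CSV layout written by the writerow() call above. pandas and matplotlib are my choice here, not part of the original scraper, and 'The Godfather' is only a hypothetical example title; it must match one of the names scraped into allMoviesNames.

import pandas as pd
import matplotlib.pyplot as plt

# Column names follow the writerow() call above
cols = ['movie', 'date', 'position', 'rating', 'votes']
df = pd.read_csv('/Users/beto/Documents/IMDbData.csv', names=cols)

# Coerce dates and ratings; rows that fail to parse are dropped
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
df = df.dropna(subset=['date', 'rating'])

# Plot the rating history of a single movie
# ('The Godfather' is only an example name)
one = df[df['movie'] == 'The Godfather'].sort_values('date')
plt.plot(one['date'], one['rating'])
plt.xlabel('date')
plt.ylabel('IMDb rating')
plt.title('Rating over time')
plt.show()

Looping over df['movie'].unique() instead of a single title gives the full "dynamic" view of how every movie's rating curve evolves, which is what the Mathematica version shows.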