#3 - Scripts‎ > ‎

FIFA Scraping

The FIFA rankings pages display monthly rankings for each country from 1993 to 2010. However, FIFA provides no API or XML feed for this data. To get the information, we therefore wrote a script that scrapes the HTML of the rankings pages for each month. After retrieving each month's rankings, we calculated a yearly average ranking for each country and wrote out all of the average rankings per year per country.

Here is the script:

1. fifa.pl  (see the file-->)
   1: # scrapes FIFA site to get men's football rankings for every month and export average ranking by year
   2: # Travis August, Kevin Champion, Lei Shi - 02/26/10
   3:  
   4:  
   5: BEGIN { 
   6:     my $base_module_dir = (-d '/home/si601/perl' ? '/home/si601/perl' : ( getpwuid($>) )[7] . '/perl/'); 
   7:     unshift @INC, map { $base_module_dir . $_ } @INC; 
   8: } 
   9:  
  10:  
  11: # automate submission to a php script
  12: use WWW::Mechanize;
  13:  
  14: # parse the resulting html
  15: use HTML::TokeParser;
  16:  
  17: # module for simple web retrieval
  18: use LWP::Simple;
  19:  
  20: # if you need to look at any of the data structures
  21: use Data::Dumper;
  22:  
  23: # Open a file for output, assigning it the filehandle OUT
  24: open(STDOUT,">fifa_raw_mens_year_averages.txt")||die "couldn't open file\n";
  25:  
  26: # Print headings to the output file
  27: print STDOUT "year\tcountry\trank\tpoints\n";
  28:  
  29: my $year = 1993;
  30: my %yearpoints, %avepoints, %all = ();
  31:  
  32: # men's 1993 - 2008 = index 2 - 176 (nearly monthly)
  33: # women's 2003 - 2008 = index 500 - 524 (quarterly)
  34: foreach ($index=2; $index <= 177; $index++){#176; $index++){
  35:     
  36:     if ($index == 104 | $index == 153){ next; }
  37:     
  38:     $thisyear;
  39:         
  40:     for ($page=1; $page <= 5; $page++){
  41:  
  42:         # fifa uses AHAH to generate their rankings pages, need retrieved HTML instead of display page
  43:         
  44:         $rankingurl = "http://www.fifa.com/worldfootball/ranking/rank=$index/confederation=0/page=$page/_index.html";
  45:         
  46:         # VERY IMPORTANT!!!! slow down crawl so that you don't make fifa mad
  47:         sleep(1);
  48:         
  49:         $rankingspage = get $rankingurl;
  50:         
  51:         if ($page == 1){
  52:         
  53:             # instantiate a TokeParser object
  54:             my $p = HTML::TokeParser->new(\$rankingspage) || die "parse error parsing $rankingspage\n";
  55:         
  56:             while ( $tag = $p->get_tag("div") ) {
  57:                 if ($tag->[1]{class} and $tag->[1]{class} eq 'infoRanking'){
  58:                     $p->get_tag('div');
  59:                     $temp = $p->get_text("/div");
  60:                     $temp =~ /(\d+)$/;
  61:                     $thisyear = $1;
  62:                 }
  63:             }
  64:         }
  65:         
  66:         # instantiate a TokeParser object
  67:         my $p = HTML::TokeParser->new(\$rankingspage) || die "parse error parsing $rankingspage\n";
  68:         
  69:         $p->get_tag('table') || die "Not enough table tags!";
  70:         
  71:         $p->get_tag('tbody');
  72:         
  73:         while($tr = $p->get_tag('tr')){
  74:             
  75:             $p->get_tag('td');
  76:             $rank = $p->get_text("/td");
  77:             
  78:             $p->get_tag('td');
  79:             $p->get_tag('a');
  80:             $p->get_tag('a');
  81:             $country = $p->get_text("/a");
  82:             
  83:             $p->get_tag('td');
  84:             $points = $p->get_text("/td");
  85:             push(@{$yearpoints{$country}},$points);                        
  86:         }
  87:     }
  88:     #print "$thisyear\n";
  89:     if ($thisyear != $year){
  90:         #print "$thisyear - $year\n";
  91:         for $c (keys %yearpoints){
  92:             $total = 0;
  93:             foreach (@{$yearpoints{$c}}){
  94:                 $total += $_;
  95:                 #print "$_, ";
  96:             }
  97:             $avepoints{$c} = ($total/scalar(@{$yearpoints{$c}}));
  98:             $all{$year}{$c} = $avepoints{$c};
  99:             #print "$c - $avepoints{$c}\n";
 100:         }
 101:         #print STDOUT "$year\t";
 102:         #$all{$year} = \%avepoints;
 103:         #print $all{$year}{'Brazil'}."\n";
 104:         $year = $thisyear;
 105:         %yearpoints = ();
 106:         %avepoints = ();
 107:     }
 108: }
 109:  
 110: #print STDOUT "\n\n";
 111:  
 112: for my $k1 (sort keys %all ) {
 113:     #print "k1: $k1\n";
 114:     $k = 0;
 115:     for my $k2 (sort {$all{$k1}{$b} <=> $all{$k1}{$a};} keys %{$all{$k1}} ) {
 116:         $k++;
 117:         print STDOUT "$k1\t$k2\t$k\t$all{$k1}{$k2}\n";
 118:     }
 119: }
 120:  
 121: close STDOUT;