rupa-blogs-links-perl-program

#!/usr/bin/perl

use strict;

use LWP::UserAgent;

use HTTP::Response;

use URI::Heuristic;

use HTML::Entities;

use Text::Unidecode;


# Download a page and return its body.
#
# Arguments:
#   $raw_url - the address to fetch; it is passed through
#              URI::Heuristic::uf_urlstr() first, so an abbreviated URL is
#              expanded to a full URL string before the request is made.
#
# Returns:
#   The response body as given by HTTP::Response::content().
#
# Failure:
#   Any response that is not a success terminates the whole program with
#   exit().  The exit status is deliberately left at its default: the paging
#   loop at the bottom of this script has no stop condition of its own and
#   relies on this exit to finish once it runs past the last listing page.
#   The URL and HTTP status line are written to STDERR first so that a real
#   failure can be told apart from the normal end of the crawl.
sub get_html {
    my ($raw_url) = @_;

    my $url = URI::Heuristic::uf_urlstr($raw_url);
    my $ua  = LWP::UserAgent->new();

    # bogus referrer to perplex the log analyzers
    my $response = $ua->get($url, Referer => "http://wizard.yellowbrick.oz");

    # is_success() rather than !is_error(): is_error() only covers 4xx/5xx,
    # so other non-success responses would otherwise be handed back to the
    # caller as if they were the requested page.
    unless ($response->is_success()) {
        print STDERR "GET $url failed: ", $response->status_line(), "\n";
        exit;
    }

    return $response->content();
}


# Print the links found in the body of a single blog post.
#
# Arguments:
#   $url - address of the post; it is downloaded with get_html().
#
# Output (STDOUT):
#   - a "Published Time: ..." line, taken from the page's
#     "tracking.publishTime = ...;" assignment;
#   - one <li> per <a href="...">text</a> anchor found between the
#     "By Rupa Subramanya" byline and the next "Rupa Subramanya", wrapped in
#     a single <ul>...</ul> pair.  Nothing is printed for the list when the
#     post contains no such anchor.
#
# Dies when the byline-delimited post body cannot be located in the page.
sub get_links_in_post {
    my ($url) = @_;

    my $page = get_html($url);

    my ($published_at) = $page =~ m{tracking\.publishTime = (.*?)\;}s;
    printf ("Published Time: %s\n", $published_at);

    my ($body) = $page =~ m{By\s+Rupa\s+Subramanya(.*?)Rupa\s+Subramanya}s
        or die "Couldn't find post";

    # remove unnecessary links to past / relevant posts
    $body =~ s/insetContent(.*?)div>//s;

    # Gather the list items first so the <ul> wrapper is only written when
    # there is at least one link to show.
    my @items;
    for my $line (split /\n/, $body) {
        while ($line =~ m{
                 <a\ href="
                 ([^\"]+)   # link to doc or external source
                 \">
                 ([^<]+)    # link text in the post
                 </a>
                }gx) {
            push @items,
                sprintf("<li>%s -> <a href=\"%s\">%s</a></li>\n", $2, $1, $1);
        }
    }

    if (@items) {
        print "<ul>\n";
        print $_ for @items;
        print "</ul>\n";
    }
}


# Main crawl: walk the numbered listing pages of the "Economics Journal" tag
# (page/1/, page/2/, ...).  For every listing line that carries an
# "Economics Journal:" post title, print the title and post URL as HTML and
# then list the links inside that post via get_links_in_post().
my $base_url = "http://blogs.wsj.com/indiarealtime/tag/Economics-Journal/page/";

my $counter = 0;

my $html;

# There is no stop condition here: the loop ends when get_html() exits the
# program on the first listing page that cannot be fetched.
while (1) {
    $counter++;

    my $raw_url = sprintf "%s%i/", $base_url, $counter;
    $html = get_html($raw_url);

    foreach my $line (split /\n/, $html) {
        next unless $line =~ /href.*?Economics Journal:/;

        $line =~ s/Economics Journal: //;    # I don't know why every post has this prefix

        # Capture into lexicals and skip the line when the anchor is not in
        # the expected shape.  Relying on $1/$2 after an unchecked match
        # would reuse stale (or undefined) captures and send a bogus URL to
        # get_links_in_post().
        my ($post_url, $title) = $line =~ m{
              <a\ href="
              ([^\"]+)  # link to blog = everything to next quote
              \">
              ([^<]+)   # blog title = everything up to </a>
              </a>}x
            or next;

        # print "=============================\n";
        print "<hr/>\n";

        # printf("%s -> %s  ", unidecode(decode_entities($title)), $post_url);
        printf("%s -> <a href=\"%s\">%s</a>\n<br/>\n",
               unidecode(decode_entities($title)), $post_url, $post_url);

        get_links_in_post($post_url);
    }
}


Comments