rupa-blogs-links-perl-program

#!/usr/bin/perl

use strict;

use LWP::UserAgent;

use HTTP::Response;

use URI::Heuristic;

use HTML::Entities;

use Text::Unidecode;

# Fetch a page and return its body.
#
# Argument: a URL or URL-like string. It is run through
#           URI::Heuristic::uf_urlstr() before the request is made.
# Returns:  whatever $response->content() yields for the response.
#
# If the response is an HTTP error this sub does not return: it reports the
# URL and status line on STDERR and then ends the whole program with a plain
# exit. The top-level paging loop in this script is a while (1) with no other
# stop condition, so running off the last archive page ends the run here.
sub get_html {
    my $raw_url = shift;

    my $url = URI::Heuristic::uf_urlstr($raw_url);
    my $ua  = LWP::UserAgent->new();

    # bogus referrer to perplex the log analyzers
    my $response = $ua->get($url, Referer => "http://wizard.yellowbrick.oz");

    if ($response->is_error()) {
        # Say why the run is stopping instead of vanishing silently; STDERR
        # keeps the diagnostic out of the HTML written to STDOUT.
        print STDERR "get_html: stopping at $url: ",
            $response->status_line(), "\n";
        exit;
    }

    return $response->content();
}

# Print the publish time and the list of links found in one blog post.
#
# Argument: the URL of the post; the page is fetched with get_html().
# Output (STDOUT):
#   - a "Published Time: ..." line taken from the page's
#     "tracking.publishTime = ...;" assignment (blank if that is absent),
#   - a <ul> list with one <li> per <a href="...">text</a> link found in the
#     post body; nothing at all is printed for the list when there are no links.
# Dies with "Couldn't find post" when the byline-delimited body is not found.
sub get_links_in_post {
    my $url  = shift;
    my $html = get_html($url);

    my ($publishTime) = $html =~ m{tracking\.publishTime = (.*?)\;}s;
    printf("Published Time: %s\n", $publishTime);

    # The post body is whatever sits between the two author bylines.
    my ($post) = $html =~ m{By\s+Rupa\s+Subramanya(.*?)Rupa\s+Subramanya}s;
    die "Couldn't find post" unless defined $post;

    # remove unnecessary links to past / relevant posts
    # (only the first insetContent ... div> span is stripped)
    $post =~ s/insetContent(.*?)div>//s;

    my $link_re = qr{
        <a\ href="
        ([^\"]+)    # link to doc or external source
        \">
        ([^<]+)     # link text in the post
        </a>
    }x;

    # Gather the list items first so the <ul> wrapper is emitted exactly once,
    # and only when at least one link was found.
    my @items;
    for my $line (split /\n/, $post) {
        while ($line =~ /$link_re/g) {
            push @items,
                sprintf("<li>%s -> <a href=\"%s\">%s</a></li>\n", $2, $1, $1);
        }
    }

    print "<ul>\n", @items, "</ul>\n" if @items;
}

# Main driver: walk the "Economics Journal" tag archive one page at a time
# (page/1/, page/2/, ...). For every line of a page that mentions an
# "Economics Journal:" post, print the post title and URL as HTML and then
# the links found inside that post via get_links_in_post().
#
# The loop has no stop condition of its own: the run ends when get_html()
# exits on an HTTP error response, or when get_links_in_post() dies.
my $base_url = "http://blogs.wsj.com/indiarealtime/tag/Economics-Journal/page/";
my $counter  = 0;
my $html;

while (1) {
    $counter++;

    my $raw_url = sprintf "%s%i/", $base_url, $counter;
    $html = get_html($raw_url);

    foreach my $line (split /\n/, $html) {
        next unless $line =~ m{href.*?Economics Journal:};

        $line =~ s/Economics Journal: //;    # I don't know why every post has this prefix

        my ($post_url, $post_title) = $line =~ m{
            <a\ href="
            ([^\"]+)    # link to blog = everything to next quote
            \">
            ([^<]+)     # blog title = everything up to </a>
            </a>
        }x;

        # A line can mention the prefix without containing a plain
        # <a href="...">title</a> anchor. Skip it instead of carrying on with
        # capture values that do not belong to this anchor.
        next unless defined $post_url;

        print "<hr/>\n";
        printf("%s -> <a href=\"%s\">%s</a>\n<br/>\n",
            unidecode(decode_entities($post_title)), $post_url, $post_url);

        get_links_in_post($post_url);
    }
}