rupa-blogs-links-perl-program
#!/usr/bin/perl
use strict;
use LWP::UserAgent;
use HTTP::Response;
use URI::Heuristic;
use HTML::Entities;
use Text::Unidecode;
# Fetch a URL and return the body of the response.
#
# Argument: a raw (possibly abbreviated) URL string; it is expanded with
#           URI::Heuristic::uf_urlstr() before the request is made.
# Returns:  the body as given by HTTP::Response::content().
# Exits:    if the request does not succeed, the whole program exits.  The
#           top-level paging loop has no end condition of its own and relies
#           on this to stop.  A diagnostic with the URL and HTTP status line
#           is written to STDERR first so the exit is not silent.
sub get_html {
    my $raw_url = shift;
    my $url     = URI::Heuristic::uf_urlstr($raw_url);
    my $ua      = LWP::UserAgent->new();

    # bogus referrer to perplex the log analyzers
    my $response = $ua->get($url, Referer => "http://wizard.yellowbrick.oz");

    # Anything other than 2xx (including an unfollowed redirect) carries no
    # usable page, so treat it the same way as a 4xx/5xx error.
    unless ($response->is_success()) {
        warn sprintf("get_html: giving up on %s: %s\n",
            $url, $response->status_line());
        exit;
    }

    return $response->content();
}
# Print the publish time of one blog post and an HTML list of its links.
#
# Argument: the URL of the post; the page is downloaded through get_html().
# Output (STDOUT): a "Published Time:" line, then one <li> for every
#           <a href="...">text</a> found in the post body.  The items are
#           wrapped in a single <ul>...</ul>, which is printed only when at
#           least one link was found.
# Dies:     with "Couldn't find post" when the body markers are not present.
sub get_links_in_post {
    my ($post_url) = @_;
    my $page = get_html($post_url);

    my ($published_at) = $page =~ m{tracking\.publishTime = (.*?)\;}s;
    printf("Published Time: %s\n", $published_at);

    # The post body is the text between the byline and the next occurrence
    # of the author's name.
    my ($body) = $page =~ m{By\s+Rupa\s+Subramanya(.*?)Rupa\s+Subramanya}s
        or die "Couldn't find post";
    $body =~ s/insetContent(.*?)div>//s; # remove unnecessary links to past / relevant posts

    my $anchor = qr{
        <a\ href="
        ([^\"]+) # link to doc or external source
        \">
        ([^<]+) # link text in the post
        </a>
    }x;

    my $list_open = 0; # true once "<ul>" has been printed
    for my $line (split /\n/, $body) {
        while ($line =~ /$anchor/g) {
            my ($target, $label) = ($1, $2);
            if (!$list_open) {
                print "<ul>\n";
                $list_open = 1;
            }
            printf("<li>%s -> <a href=\"%s\">%s</a></li>\n", $label, $target, $target);
        }
    }
    # Close the list only if it was opened, i.e. at least one link was printed.
    print "</ul>\n" if $list_open;
}
# Main driver: walk the numbered listing pages under $base_url and, for every
# listing line that links to an "Economics Journal:" post, print the post's
# title and URL as HTML followed by the links found inside the post.
my $base_url = "http://blogs.wsj.com/indiarealtime/tag/Economics-Journal/page/";
my $counter = 0;
my $html;
# The loop has no exit condition of its own: it ends when get_html() exits
# the program on a failed fetch (or when get_links_in_post() dies).
while (1) {
    $counter++;
    my $raw_url = sprintf "%s%i/", $base_url, $counter;
    $html = get_html($raw_url);
    for my $line (split /\n/, $html) {
        next unless $line =~ /href.*?Economics Journal:/;
        $line =~ s/Economics Journal: //; # I don't know why every post has this prefix

        # Capture into named lexicals and skip the line when the anchor
        # pattern does not match.  Reading $1/$2 after an unchecked match
        # would silently reuse the captures of an earlier successful match.
        my ($post_url, $title) = $line =~ m{
            <a\ href="
            ([^\"]+) # link to blog = everything to next quote
            \">
            ([^<]+) # blog title = everything up to the closing tag
            </a>}x
            or next;

        print "<hr/>\n";
        printf("%s -> <a href=\"%s\">%s</a>\n<br/>\n",
            unidecode(decode_entities($title)), $post_url, $post_url);
        get_links_in_post($post_url);
    }
}