#!/usr/bin/perl -w
# Bryan Bishop (kanzure@gmail.com) 2008-03-23/24 nighter
# AutoScholar 1.0.0
# Fetches science. Enough said.
# You may want to modify the scholar.google.com line and the usernames and passwords.
use strict;
use WWW::Mechanize;
use HTML::TokeParser;
use HTTP::Cookies;
use HTML::Strip;
use Crypt::SSLeay;
use LWP::UserAgent;
use HTML::LinkExtractor;
# ---------------------------------------------------------------------------
# Script entry point: log in through the UT EZProxy, then fetch the paper
# whose title was given on the command line.
# ---------------------------------------------------------------------------

# Accept the title either as one quoted argument or as several bare words,
# so `autoscholar.pl some paper title` is not silently truncated to "some".
my $title = join( ' ', @ARGV );
if ( $title !~ /\S/ ) {
    die "Usage: $0 \"paper title\"\n";
}

my $agent = WWW::Mechanize->new( agent => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080201 Firefox/2.0.0.12' );
$agent->cookie_jar( HTTP::Cookies->new );

# Requesting Scholar through the proxy is expected to land on the EZProxy
# login page; the session cookie obtained here is reused by fetchPaper().
my $response = $agent->get("http://scholar.google.com.ezproxy.lib.utexas.edu/scholar?q=");
#print $response->decoded_content();
if ( !$response->is_success ) {
    die "Could not reach the EZProxy login page: " . $response->status_line . "\n";
}

# form_number() returns false on older WWW::Mechanize releases and dies on
# newer ones (autocheck) when the form is missing; eval covers both.
my $login_form = eval { $agent->form_number(1) };
if ( !$login_form ) {
    die "The proxy page has no login form; cannot authenticate.\n";
}

# Edit these two values for your own account.  Single quotes keep characters
# such as '@' and '$' in a real password from being interpolated.
my $username = '*****';
my $password = '@@@@@@';
$agent->field( "user", $username );
$agent->field( "pass", $password );

my $login_response = $agent->submit();
if ( !$login_response || !$login_response->is_success ) {
    die "EZProxy login request failed"
      . ( $login_response ? ": " . $login_response->status_line : "" ) . "\n";
}

fetchPaper($title);
# fetchPaper($name)
#
# Searches Google Scholar (through the UT EZProxy session held by the
# file-level $agent) for the paper titled $name and tries to save it as a PDF
# in the current directory.
#
# Strategy:
#   1. If the result page carries a "[PDF]" marker, download the first link
#      that follows it with wget; the file is named after that link's text.
#   2. Otherwise, if a "Get this article" link exists, follow it to the
#      library's SFX resolver, read the service_id / request_id form fields,
#      request the resolver target page, and download the link whose *text*
#      (not URL) mentions "PDF".  The file is named "$name.pdf".
#   3. Otherwise report that nothing could be found.
#
# Parameters:
#   $name - paper title to search for (extra arguments are ignored).
#
# Returns 1 when a file was saved, and nothing (empty list / undef) otherwise.
# Uses the file-level $agent, $username and $password.
sub fetchPaper {
    my ($name) = @_;

    if ( !defined $name || $name !~ /\S/ ) {
        warn "No paper name given; nothing to fetch.\n";
        return;
    }
    print "Fetching paper with name $name... please wait.\n\n";

    # Now fetch the actual paper. Let's try something simple.
    # The title is percent-encoded so that '&', '#', '+' etc. cannot corrupt
    # the query string.
    my $query    = _uri_escape($name);
    my $response = $agent->get("http://scholar.google.com.ezproxy.lib.utexas.edu/scholar?q=$query");
    if ( !$response->is_success ) {
        warn "Search request failed: " . $response->status_line . "\n";
        return;
    }
    my $searchresults = $response->decoded_content();
    $searchresults = '' unless defined $searchresults;
    #print $searchresults;

    # Return the first link *AFTER* any occurrence of [PDF]
    if ( $searchresults =~ m{\[PDF\]</b>(.*)}s ) {
        my $after_marker = $1;
        print "... found a PDF on Google Scholar directly.!\n\n";

        # Walk the anchors that follow the marker with a real HTML parser;
        # it decodes entities in the href and strips markup from the text.
        my $parser = HTML::TokeParser->new( \$after_marker );
        my ( $url, $link_text );
        while ( my $anchor = $parser->get_tag('a') ) {
            my $href = $anchor->[1]{href};
            next unless defined $href && $href =~ /\S/;
            $url       = $href;
            $link_text = $parser->get_trimmed_text('/a');
            last;
        }
        if ( !defined $url ) {
            warn "Saw a [PDF] marker but no link after it.\n";
            return;
        }
        print "URL is $url.\n\n";

        # Name the file after the link text, falling back to the query.
        my $file = _pdf_filename(
            defined $link_text && $link_text =~ /\S/ ? $link_text : $name );

        # List-form system() bypasses the shell, so nothing scraped from the
        # page can be interpreted as shell syntax; '--' stops a hostile URL
        # from being read as a wget option.
        my @wget = (
            'wget',
            "--proxy-user=$username",
            "--proxy-password=$password",
            "--output-document=$file",
            '--', $url,
        );
        if ( system(@wget) != 0 ) {
            warn "wget did not complete successfully (status $?).\n";
            return;
        }
        return 1;
    }
    elsif ( $searchresults =~ /Get this article/ ) {
        print "Did not find a PDF on Google Scholar ... now activating university services.\n\n";

        # Google Scholar does not know of a PDF, so go through the library:
        # follow the "Get this article" link to the UT page, read the hidden
        # service_id and request_id fields of its first form, then GET the
        # SFX resolver with those values.
        # The first matching link on the page is used.
        my $resolver_link = $agent->find_link( text_regex => qr/Get this article/ );
        if ( !$resolver_link ) {
            warn "Could not locate the \"Get this article\" link.\n";
            return;
        }
        my $link = $resolver_link->url_abs;

        my $resolver_page = $agent->get($link);
        if ( !$resolver_page->is_success ) {
            warn "Could not open the university page: " . $resolver_page->status_line . "\n";
            return;
        }
        print "... viewing university website & extracting service links.\n\n";

        # eval: depending on the WWW::Mechanize version a missing form or
        # field either returns undef or dies.
        my $form       = eval { $agent->form_number(1) };
        my $service_id = $form ? eval { $agent->value("service_id") } : undef;
        my $request_id = $form ? eval { $agent->value("request_id") } : undef;
        if ( !defined $service_id || !defined $request_id ) {
            warn "University page is missing the service_id/request_id form fields.\n";
            return;
        }

        my $response3 = $agent->get(
            "http://p9003-www.lib.utexas.edu.ezproxy.lib.utexas.edu/sfx_local/cgi/core/sfxresolver.cgi?basic1&tmp_ctx_svc_id=1&tmp_ctx_obj_id=1&service_id="
              . _uri_escape($service_id)
              . "&request_id="
              . _uri_escape($request_id) );
        if ( !$response3->is_success ) {
            warn "SFX resolver request failed: " . $response3->status_line . "\n";
            return;
        }
        print "--> Getting to the last link in the trail.\n\n";
        my $r3 = $response3->decoded_content();
        $r3 = '' unless defined $r3;
        #print $r3;

        if ( $r3 !~ /PDF/ ) {
            warn "The journal page does not mention a PDF.\n";
            return;
        }

        # On a specific journal website. Find the link with "PDF" in the
        # *name* of the link, not URL.  As before, the last such link wins.
        my $LX = HTML::LinkExtractor->new();
        $LX->parse( \$r3 );
        my $pdf_link;
        foreach my $x ( @{ $LX->links } ) {
            next unless defined $$x{_TEXT} && defined $$x{href};
            my $hs         = HTML::Strip->new();
            my $clean_text = $hs->parse( $$x{_TEXT} );
            $hs->eof;
            $pdf_link = $$x{href}
              if defined $clean_text && $clean_text =~ /PDF/;
        }
        if ( !defined $pdf_link ) {
            # Previously the resolver URL was silently re-fetched and saved
            # as a ".pdf"; stop instead of writing a bogus file.
            warn "No link labelled PDF was found on the journal page.\n";
            return;
        }
        print "The paper's link is $pdf_link.\n\n";

        # WWW::Mechanize::get() resolves relative links against the current
        # page's base URL, so no manual host/scheme splicing is needed.
        print "The link is still $pdf_link\n\n";
        my $response4 = $agent->get($pdf_link);
        if ( !$response4->is_success ) {
            warn "Downloading the PDF failed: " . $response4->status_line . "\n";
            return;
        }

        # Undo any Content-Encoding but never apply a charset decode: the
        # payload is binary.
        my $pdf_bytes = $response4->decoded_content( charset => 'none' );
        $pdf_bytes = $response4->content unless defined $pdf_bytes;

        # Truncate rather than append: appending to an earlier download
        # yields a corrupt PDF.
        my $file = _pdf_filename($name);
        my $out;
        if ( !open( $out, '>', $file ) ) {
            warn "Cannot write $file: $!\n";
            return;
        }
        binmode $out;
        print {$out} $pdf_bytes;
        if ( !close($out) ) {
            warn "Error while writing $file: $!\n";
            return;
        }
        print "Done saving file.\n\n";
        return 1;
    }
    else {
        print "No PDFs. No \"Get this article\". Try manually? (Please program in a routine to fix in this scenario).\n\n";
    }

    # Done.
    return;
}

# Percent-encodes everything except RFC 3986 unreserved characters so the
# value can be placed safely inside a URL query string.
sub _uri_escape {
    my ($text) = @_;
    $text = '' unless defined $text;
    utf8::encode($text) if utf8::is_utf8($text);    # escape bytes, not code points
    $text =~ s/([^A-Za-z0-9\-_.~])/sprintf( '%%%02X', ord($1) )/ge;
    return $text;
}

# Turns a paper title into a file name in the current directory: path
# separators and control characters become "_", surrounding whitespace and
# leading dots are dropped, and ".pdf" is appended.
sub _pdf_filename {
    my ($base) = @_;
    $base = '' unless defined $base;
    $base =~ s{[/\\\x00-\x1f]+}{_}g;
    $base =~ s/\A\s+//;
    $base =~ s/\s+\z//;
    $base =~ s/\A\.+//;
    $base = 'paper' if $base eq '';
    return "$base.pdf";
}