#!/usr/bin/perl
# Bryan Bishop (kanzure@gmail.com) 2008-03-23/24 nighter
# AutoScholar 1.0.0
# Fetches science. Enough said.
#
# Usage: pass the paper title as the first command-line argument.
#
# You may want to modify the scholar.google.com line and the usernames and
# passwords. The credentials may also be supplied through the environment
# variables AUTOSCHOLAR_USER and AUTOSCHOLAR_PASS, which take precedence over
# the placeholders below.
use strict;
use warnings;

use WWW::Mechanize;
use HTML::TokeParser;
use HTTP::Cookies;
use HTML::Strip;
use Crypt::SSLeay;
use LWP::UserAgent;
use HTML::LinkExtractor;
use URI;    # prerequisite of LWP; used for query building and link resolution

# Google Scholar as seen through the university EZProxy.
my $SCHOLAR_URL = 'http://scholar.google.com.ezproxy.lib.utexas.edu/scholar';

# University SFX link resolver (also behind the EZProxy).
my $SFX_RESOLVER_URL =
    'http://p9003-www.lib.utexas.edu.ezproxy.lib.utexas.edu'
  . '/sfx_local/cgi/core/sfxresolver.cgi';

my ($title) = @ARGV;
die "Usage: $0 \"paper title\"\n"
    unless defined $title && length $title;

# Single-quoted so that '@' and '$' in a real password are never interpolated.
my $username = defined $ENV{AUTOSCHOLAR_USER} ? $ENV{AUTOSCHOLAR_USER} : '*****';
my $password = defined $ENV{AUTOSCHOLAR_PASS} ? $ENV{AUTOSCHOLAR_PASS} : '@@@@@@';

# autocheck makes every failed HTTP request die with a diagnostic instead of
# letting the script continue on an error page.
my $agent = WWW::Mechanize->new(
    agent => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080201 Firefox/2.0.0.12',
    autocheck => 1,
);
$agent->cookie_jar(HTTP::Cookies->new);

# Requesting Scholar through the proxy is expected to yield the proxy login
# form as the first form on the page; fill it in and submit it.
$agent->get("$SCHOLAR_URL?q=");
$agent->form_number(1)
    or die "No login form found at $SCHOLAR_URL\n";
$agent->field('user', $username);
$agent->field('pass', $password);
$agent->submit();

fetchPaper($title);

# fetchPaper($name)
#
# Searches Google Scholar (through the proxy session held by $agent) for
# $name and tries to save the paper as a PDF in the current directory.
#
#   * If the result page shows a "[PDF]" marker, the first link after the
#     marker is downloaded with wget.
#   * Otherwise, if the page offers "Get this article", the university link
#     resolver is followed and the PDF link found there is saved as
#     "<name>.pdf".
#   * Otherwise a message is printed and nothing is downloaded.
#
# Dies on HTTP failures (autocheck) and on file-system errors. Returns nothing
# useful.
sub fetchPaper {
    my ($name) = @_;

    print "Fetching paper with name $name... please wait.\n\n";

    # Build the query with URI so that characters such as '&', '#' and '+'
    # in the title are escaped instead of corrupting the query string.
    my $search_uri = URI->new($SCHOLAR_URL);
    $search_uri->query_form(q => $name);

    my $response      = $agent->get($search_uri);
    my $searchresults = $response->decoded_content();
    die "Could not decode the search results page.\n"
        unless defined $searchresults;

    if ($searchresults =~ m{\[PDF\]</b>}) {
        print "... found a PDF on Google Scholar directly.!\n\n";
        _fetch_direct_pdf($searchresults, $name);
    }
    elsif ($searchresults =~ /Get this article/) {
        print "Did not find a PDF on Google Scholar ... now activating university services.\n\n";
        _fetch_via_university($name);
    }
    else {
        print "No PDFs. No \"Get this article\". Try manually? (Please program in a routine to fix in this scenario).\n\n";
    }

    return;
}

# Downloads the first link that follows the "[PDF]" marker in $html.
# The file is named after the link text, or after $fallback_name when the
# link has no usable text.
sub _fetch_direct_pdf {
    my ($html, $fallback_name) = @_;

    # NOTE(review): assumes the result markup is "[PDF]</b> ... <a href=...>
    # title</a>"; the original pattern was lost, so confirm against a live
    # result page.
    my $found = $html =~ m{
        \[PDF\]</b> .*?
        <a\s [^>]*? \bhref \s*=\s* ["'] ([^"']+) ["'] [^>]* >
        (.*?)
        </a>
    }xis;
    if (!$found) {
        print "Saw [PDF] but could not locate the link that follows it.\n\n";
        return;
    }
    my ($url, $link_html) = ($1, $2);

    # Attribute values are HTML-escaped in the page source; undo the common
    # ampersand escape and resolve the link against the current page.
    $url =~ s/&amp;/&/g;
    $url = URI->new_abs($url, $agent->base)->as_string;
    print "URL is $url.\n\n";

    my $clean_text = _strip_html($link_html);
    my $basename   = length $clean_text ? $clean_text : $fallback_name;
    my $filename   = _safe_filename($basename) . '.pdf';

    # List-form system() bypasses the shell, so neither the page-supplied URL
    # and title nor the password can be interpreted as shell syntax.
    my @wget = (
        'wget',
        "--proxy-user=$username",
        "--proxy-password=$password",
        "--output-document=$filename",
        $url,
    );
    system(@wget) == 0
        or warn "wget failed (status $?) while fetching $url\n";

    return;
}

# Follows "Get this article" to the university link resolver, locates a link
# whose text mentions PDF on the target page, and saves it as "$name.pdf".
sub _fetch_via_university {
    my ($name) = @_;

    # Google Scholar does not know of a PDF, so use the library's resolver:
    # open the "Get this article" page, read the hidden service_id and
    # request_id fields of its first form, and pass them to the SFX resolver.
    # NOTE(review): assumes "Get this article" is the text of an <a> element.
    my $article_link = $agent->find_link(text_regex => qr/Get this article/);
    if (!$article_link) {
        print "Could not find the \"Get this article\" link.\n\n";
        return;
    }
    $agent->get($article_link->url_abs);
    print "... viewing university website & extracting service links.\n\n";

    $agent->form_number(1)
        or die "No form found on the university page.\n";
    my $service_id = $agent->value('service_id');
    my $request_id = $agent->value('request_id');
    die "University page is missing service_id/request_id.\n"
        unless defined $service_id && defined $request_id;

    my $resolver_response = $agent->get(
        "$SFX_RESOLVER_URL?basic1&tmp_ctx_svc_id=1&tmp_ctx_obj_id=1"
      . "&service_id=$service_id&request_id=$request_id"
    );
    print "--> Getting to the last link in the trail.\n\n";

    my $page = $resolver_response->decoded_content();
    if (!defined $page || $page !~ /PDF/) {
        print "The publisher page does not mention a PDF. Try manually?\n\n";
        return;
    }

    # On a specific journal website: pick the link with "PDF" in the *text*
    # of the link, not in its URL. The last such link wins.
    my $extractor = HTML::LinkExtractor->new();
    $extractor->parse(\$page);
    my $pdf_href;
    for my $candidate (@{ $extractor->links }) {
        next unless defined $candidate->{href};
        next unless _strip_html($candidate->{_TEXT}) =~ /PDF/;
        $pdf_href = $candidate->{href};
    }
    if (!defined $pdf_href) {
        print "No link labelled PDF was found on the publisher page.\n\n";
        return;
    }

    # Resolve relative links against the page's real base URL rather than
    # guessing the host from the current address.
    my $pdf_uri = URI->new_abs($pdf_href, $agent->base);
    print "The paper's link is $pdf_uri.\n\n";

    my $pdf_response = $agent->get($pdf_uri);

    # charset => 'none' undoes any transfer compression but leaves the bytes
    # of the document untouched.
    my $bytes = $pdf_response->decoded_content(charset => 'none');
    $bytes = $pdf_response->content unless defined $bytes;

    # Truncate rather than append so a re-run cannot corrupt an earlier
    # download, and write in binary mode because the payload is a PDF.
    my $filename = _safe_filename($name) . '.pdf';
    open my $out, '>', $filename
        or die "Cannot open $filename for writing: $!\n";
    binmode $out;
    print {$out} $bytes
        or die "Cannot write to $filename: $!\n";
    close $out
        or die "Cannot close $filename: $!\n";
    print "Done saving file.\n\n";

    return;
}

# Returns $html with markup removed and surrounding whitespace trimmed;
# returns the empty string for undefined input.
sub _strip_html {
    my ($html) = @_;
    return '' unless defined $html;

    my $stripper = HTML::Strip->new();
    my $text     = $stripper->parse($html);
    $stripper->eof;

    $text = '' unless defined $text;
    $text =~ s/\A\s+//;
    $text =~ s/\s+\z//;
    return $text;
}

# Replaces path separators and control characters so that a title can be used
# as a single file name in the current directory.
sub _safe_filename {
    my ($name) = @_;
    $name =~ s{[/\x00-\x1f]}{_}g;
    return $name;
}