[Bioperl-l] Code for retrieving PDF file using a Pubmed link.

yguo at vbi.vt.edu yguo at vbi.vt.edu
Fri Jan 21 12:39:27 EST 2005


[Seems that attachment is not supported. Here I re-send it...]

Hi,

Here I have attached the code mentioned earlier. I do not know if the mailing
list system supports attachments, so I have also pasted the code at the end of
this email.

I have put detailed instructions in the comment section. If you run into any
usage problems, please contact me.

The module will do its best to find the PDF link, but it can fail at some
publisher sites. You can have the module write its processing results to a
log file. The flag "NOT_FOUND_OR_ALLOWED" means that it failed to
download the PDF file: either the PDF location is too complicated for the
parser, or your institution does not have the right to view the full text.

Of the roughly 360 publications (with full-text links) required in our project,
the module retrieved the PDF for around 330. As our project goes on,
I will update this module to make it more robust.

I hope the module can be a part of Bioperl ultimately. But before that,
you guys can help me to test.


Good weekend,

Yongjian Guo
at
Virginia Bioinformatics Institute


-----------------------------------------------------------------------

# $Id: PDFDownloader.pm   2005/1/20$
# Version 0.1
#
# Cared for by Yongjian Guo <yguo at vbi.vt.edu>
# For copyright and disclaimer see below.

# POD documentation - main docs before the code

=head1 NAME

PDFDownloader - Download full text PDF file using a Pubmed entry.

=head1 SYNOPSIS

              use PDFDownloader;
              #build the object,
              $worker = new PDFDownloader({logFile=>$logFile,
                                           link=>$link,
                                           dir=>$dirName,
                                           fileName=>$fileName});
              #start to download.
              $worker->start();

              The log information can be saved in the log file or shown on
screen.
              The following information will be given:

              DONE : Successfully finish downloading.
              NOT_OPEN_MED : Cannot open the Medline page.
              NOT_OPEN_PUB : Can not open the publisher site,
              NO_LINK : The given link does not have full text link out.
              NOT_FOUND_OR_ALLOWED : PDF entry can not be found or user
does not have right to view full text.


=head1 DESCRIPTION

This module will download the full text PDF file from the publisher website
using a Pubmed entry, if there is full text available.

=head1 Attributes

              link:  The pubmed link for an article.
              logFile: The assigned log file name. If it is the empty, the
information will be shown on screen.
              dir:   The directory of the pdf file to be saved.
              fileName: The name prefix of the target PDF file to be
saved. The downloaded file has the name
                        of fileName.pdf



=head1 FEEDBACK

=head2 Reporting Bugs

Report bugs to yguo at vbi.vt.edu.


=head1 AUTHORS

Yongjian Guo @ Virginia Bioinformatics Institute.

=head1 COPYRIGHT

Copyright (c) 2004 Virginia Bioinformatics Institute. All Rights Reserved.

This module is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=head1 DISCLAIMER

This software is provided "as is" without warranty of any kind.

=cut



package PDFDownloader;


use strict;
use LWP::UserAgent;
use HTTP::Cookies;

#Function to create the PDFDownloader object.
#the parameter is a hash and its required entry
#is "link", which is a Pubmed article entry, like:
#http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=pubmed&dopt=Abstract&list_uids=8931319


# Constructor.  Takes a hash ref of options; the only required entry is
# "link" -- a Pubmed article URL such as:
# http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=pubmed&dopt=Abstract&list_uids=8931319
# Optional entries: logFile (default "" => messages go to the screen),
# dir (default "."), fileName (default a random number; ".pdf" is appended
# when the file is saved).
sub new{
    my ($class, $para) = @_;
    # Options for the LWP user agent: keep connections alive, present a
    # browser-like agent string (some publisher sites refuse obvious robots),
    # give up after 20 seconds, and keep cookies so session state survives
    # hops between publisher pages.
    my %ua_options = (
        keep_alive => 1,
        agent      => "Mozilla/5.0",
        timeout    => 20,
        cookie_jar => HTTP::Cookies->new(file => "cookies.txt"),
    );
    my $self = bless {
        logFile  => $para->{logFile} || "",
        link     => $para->{link},           # the Pubmed entry URL
        dir      => $para->{dir} || ".",
        fileName => $para->{fileName} || rand(),
        base     => "",                      # base URL of the publisher site
        fp       => LWP::UserAgent->new(%ua_options),
    }, $class;
    # BUG FIX: the original checked "!defined $class" after bless(), but
    # bless() never returns undef, so that branch was dead code.
    return $self;
}


#function to start searching and download process.

#function to start the searching and download process.
# Fetches the Pubmed page for $self->{link}, looks for a full-text
# link-out, and hands it to _parsePubSite().  Logs NOT_OPEN_MED when the
# Pubmed page cannot be fetched, and NO_LINK when there is no full-text
# link on the page.
sub start{
    my $self = shift;
    my $data = $self->_getLinkContent($self->{link});
    if(length($data) == 0){
        $self->_log("NOT_OPEN_MED\t" . $self->{link});
        # BUG FIX: without this return an unfetchable page fell through to
        # the pattern tests below and was logged a second time as NO_LINK.
        return;
    }
    #ok, we got the page content; now look for the link-out.
    if($data =~ /href=\".*?db=pubmed\&url=(.*?)\"\s+/){
        #direct link-out to the publisher site.
        $self->_parsePubSite($1);
    }elsif($data =~ /href=\"(.*?articlerender.*?)\"\s+/){
        #direct deposit (e.g. PubMed Central) rather than a publisher link.
        $self->_parsePubSite($1);
    }else{
        $self->_log("NO_LINK\t" . $self->{link});
    }
}


#function to parse the first page on the publisher site.

#function to parse the first page on the publisher site.
# Tries, in order:
#   1. a quoted ".pdf" URL already present in the page,
#   2. the href nearest before a "pdf"/"PDF" marker in the text, and if
#      that fails, every link on the page that href points to,
#   3. every link on the original page itself (it may be a frameset).
# Logs NOT_OPEN_PUB when the page cannot be fetched, and
# NOT_FOUND_OR_ALLOWED when every strategy fails.
sub _parsePubSite{
    my($self, $link)=@_;
    my $data=$self->_getLinkContent($link);
    my ($pos, $pos2, $tmpString, $result, @array);
    $result=-1; #initial is negative: nothing downloaded yet.
    if(length($data)==0){
	$self->_log("NOT_OPEN_PUB\t".$link);
	return;
    }
    #flatten the page so the regex matches are not broken by line ends,
    $data=~s/\n//g;
    $data=~s/&nbsp;/ /g;
    #first, try whether the page already carries a direct ".pdf" link,
    $tmpString=$self->_getPDFLink("", $data);
    if(length($tmpString)!=0){
	#found the link,
	$result=$self->_tryGetPDF($tmpString);
	if($result==1){
	    return;
	}
    }

    #NOTE(review): inside a character class "|" is a literal, so [\s|(|>]
    #also matches a pipe character -- presumably unintended, but harmless
    #on typical pages; left as-is here.
    if($data=~/([\s|(|>]pdf[\)|\s|<])/ ||
$data=~/([\s|(|>]PDF[\)|\s|<|"])/ ){
       #two possiblities,
       # 1. a link,
       # 2. a javascript.
       $pos=index($data, $1);
       $pos2=rindex(substr($data, 0, $pos), "href="); #find the nearest preceding href.
       $tmpString=substr($data, $pos2, $pos-$pos2);
       #pull the href value out of its double quotes...
       if($tmpString=~/\"(.*?)\"/){
         $tmpString=$1;
       }
       #...and, if a javascript call wraps it, out of single quotes too.
       if($tmpString=~/\'(.*?)\'/){
      	 $tmpString=$1;
       }
       #ok, here we got the $tmpString for a next link,
       #use a try mechanism,
       $result=$self->_tryGetPDF($tmpString);
       if($result==-1){
	 #no success: follow that link and try every link it contains,
	 @array=$self->_getSubLinks($tmpString);
	 foreach my $entry (@array){
             if($self->_tryGetPDF($self->_getPDFLink($entry))==1){
       	     $result=1; #success,
	     last;
	   }
	 }
	}
    }
    #further try,
    if($result!=1){
       #it is possible that the direct link is a frame page,
       @array=$self->_getSubLinks($link);
       foreach my $entry (@array){
	 if($self->_tryGetPDF($self->_getPDFLink($entry))==1){
	   $result=1; #success,
	   last;
	 }
       }
       if($result!=1){
	 $self->_log("NOT_FOUND_OR_ALLOWED\t".$self->{link});
       }
     }
}


#Attempt a single PDF download; on success, log DONE for the original
#Pubmed link.  Passes the download status (1 or -1) through to the caller.
sub _tryGetPDF{
    my ($self, $candidate) = @_;
    my $status = $self->_getPDFFile($candidate);
    $self->_log("DONE\t" . $self->{link}) if $status == 1;
    return $status;
}

#given a web page, use this one to get all of the links in that page.

#given a web page, use this one to get all of the links in that page.
# Scans up to 50 double-quoted strings in the page and collects those that
# look like URLs (absolute "http..." or root-relative "/...").  Returns an
# empty list when the page yields too many candidates (probably not a
# simple frame/menu page) or none at all.
sub _getSubLinks{
    my ($self, $link) = @_;
    my $data = $self->_getLinkContent($link);
    my @links = ();
    my $pos = 0;
    my $count = 0;
    while(1){
        $pos = index($data, "\"", $pos);
        #the page may have no quoted strings at all; the counter bounds
        #the scan.  (BUG FIX: this comment had been word-wrapped onto a
        #bare code line, which was a syntax error.)
        if($pos == -1 || $count > 50){
            last;
        }
        my $pos2 = index($data, "\"", $pos + 1);
        last if $pos2 == -1;   #unterminated quote -- stop scanning.
        my $candidate = substr($data, $pos + 1, $pos2 - $pos - 1);
        #BUG FIX: the original pattern /^(http)|\//  meant "starts with
        #http, OR contains a slash anywhere" because | binds looser than ^.
        #The intended test is "starts with http or with /":
        if($candidate =~ /^(?:http|\/)/){
            push(@links, $candidate);
        }
        $pos = $pos2 + 1;
        $count++;
    }
    if($count >= 50){
        @links = ();
    }
    return @links;
}


#function to return a pdf file link from a webpage.

#function to return a pdf file link from a webpage.
# If $link is non-empty the page is fetched first; otherwise the supplied
# $data is scanned directly.  Returns the last quoted string of at least
# five characters ending in ".pdf" (or ".PDF"), or "" when none is found.
sub _getPDFLink{
    my($self, $link, $data)=@_;
    if(length($link)!=0){
	$data=$self->_getLinkContent($link);
	$data=~s/\n//g;
    }
    #BUG FIX: the original classes [\"|\'] also treated "|" as a quote
    #delimiter, because | is a literal inside a character class.
    if($data=~/.*["'](.{5,}\.pdf)["']/ ||
       $data=~/.*["'](.{5,}\.PDF)["']/ ){
	#ok, there is a pdf file.
	return $1;
    }
    return "";  #not found.
}




#function to get the homepage. redirection is taken cared.

#function to get the content of a page; "Refresh" redirection is taken care of.
# Normalizes $link into a full URL, fetches it, and recursively follows a
# Refresh header when one is present.  On a successful plain fetch it
# records the response base URL in $self->{base} (used later by _buildURL)
# and returns the page body.  Returns "" on any failure.
sub _getLinkContent{
    my ($self, $link)=@_;
    if($link!~/http:\/\//){  #some links have the broken form "http:/www...";
                             #keep only the path from the first "/" on, and
                             #let the rebuild below resolve it.
	my $pos=index($link, "/");
	$link=substr($link, $pos);
    }
    if($link!~/^http/){
	$link=$self->_buildURL($link);  #relative link: resolve against base.
    }

    $link=~s/&amp\;/&/g;  #undo HTML entity escaping inside the URL.
    my $response=$self->{fp}->get($link);
    my $rHeader="";
    #ok, we need to analyse the header to see if there is a refresh;
    #if yes, we follow the refresh target instead of this page.
    if($response->is_success()){
	$rHeader=$response->header("Refresh");
	#NOTE(review): header() returns undef when the header is absent, so
	#length($rHeader) warns under "use warnings"; also this recursion
	#has no depth limit, so a Refresh loop would never terminate --
	#consider a hop counter.
	if(length($rHeader)>0){
	    if($rHeader=~/URL\=(.*)/){
		return $self->_getLinkContent($1);
	    }
	}else{
	    #update the base url for later relative-link resolution.
	    $self->{base}=$response->base();
	    return $response->content;
	}
    }
    return "";

}

#get the real pdf file. redirection is taken cared.

#get the real pdf file. redirection is taken care of.
# Downloads $link into dir/fileName.pdf, following Refresh-header
# redirection, then verifies the saved file really is a PDF (some line
# starts with "%PDF").  Returns 1 on success; otherwise deletes the bogus
# file and returns -1.
sub _getPDFFile{
    my($self, $link) = @_;
    if($link !~ /^http/){
        $link = $self->_buildURL($link);
    }
    $link =~ s/&amp\;/&/g;
    my $done = 0;
    my $fileName = $self->{dir} . "/" . $self->{fileName} . ".pdf";
    #first probe the link to see whether a Refresh header redirects us.
    my $response = $self->{fp}->get($link);
    if($response->is_success()){
        my $rHeader = $response->header("Refresh");
        #BUG FIX: header() returns undef when absent; guard with defined()
        #instead of length() to avoid an uninitialized-value warning.
        if(defined $rHeader && $rHeader =~ /URL\=(.*)/){
            return $self->_getPDFFile($1);
        }
    }

    $self->{fp}->get($link, ":content_file" => $fileName);
    #verify the download: a genuine PDF file begins with "%PDF".
    #BUG FIX: three-arg open with a lexical handle instead of a global
    #bareword; an unopenable file (download failed entirely) is treated as
    #a normal failure rather than silently iterating an unopened handle.
    if(open my $pdf_fh, '<', $fileName){
        while(my $line = <$pdf_fh>){
            if($line =~ /^%PDF/){
                $done = 1;
                last;
            }
        }
        close $pdf_fh;
    }
    if($done == 0){
        unlink $fileName;   #remove the non-PDF (e.g. an HTML error page).
        return -1;
    }
    return 1; #everything ok,
}

#function to record the log

#function to record the log
# Writes one line either to the configured log file (appending) or to
# STDOUT when no log file was given.
sub _log{
    my($self, $data) = @_;
    if(length($self->{logFile}) == 0){
        print $data, "\n";
        return;
    }
    #BUG FIX: three-arg open with a lexical handle instead of a global
    #bareword filehandle; the die message also had a stray embedded newline
    #(an email line-wrap artifact) which is now removed.
    open my $log_fh, '>>', $self->{logFile}
        or die "can not open the log file to write\n";
    print {$log_fh} $data, "\n";
    close $log_fh;
    return;
}

#function to build the full url.

#function to build the full url.
# Resolves $target against the stored base URL: a root-relative target
# ("/...") is joined to the scheme+host portion of the base, anything else
# to the base's directory portion (everything up to its last slash).
# Falls back to returning $target unchanged when the base does not look
# like an http URL.
sub _buildURL{
    my ($self, $target) = @_;
    my $base = $self->{base};
    if ($target =~ m{^/}) {
        # root-relative: keep only "http://host" from the base.
        return $1 . $target if $base =~ m{(http://.*?)/};
    } else {
        # document-relative: keep the base up to its last slash.
        return $1 . "/" . $target if $base =~ m{(http://.*)/};
    }
    return $target;
}


1;




More information about the Bioperl-l mailing list