Bioperl: Parsing Medline Docs
David J. States
states@gpc.wustl.edu
Mon, 27 Sep 1999 17:50:10 -0500
On Monday, September 27, 1999 4:44 PM, Benjamin Stapley
[SMTP:ben@sophocles.gws.uky.edu] wrote:
> Hi
> Does any one know of perl scripts/modules that parse Medline documents
> in Medlars format? Surely someone has attempted this before.
> Ben
>
>
> =========== Bioperl Project Mailing List Message Footer =======
> Project URL: http://bio.perl.org/
> For info about how to (un)subscribe, where messages are archived, etc:
> http://www.techfak.uni-bielefeld.de/bcd/Perl/Bio/vsns-bcd-perl.html
> ====================================================================
The script below reads the traditional medline format (Elhill) and does a
greedy clustering by most highly published authors. Useful for going
through large collections of citations and grouping them by laboratory.
Note: this is old code (perl 3.0) and has not been actively maintained for
almost a decade (I am serious). It seems to work with perl 5 and modern
medline files from Entrez, but there may be some subtle formatting changes
that are not properly handled. Sorry if the command line options are not
documented, but I just never got around to it :).
Note
----
David J. States, M.D., Ph.D.
Associate Professor and Director
Institute for Biomedical Computing
Washington University in St. Louis
700 S. Euclid Ave.
St. Louis, MO 63110
tel: 314 362 2134
fax: 314 362 0234
email: states@ibc.wustl.edu
#!/usr/local/bin/perl
# perl script to read medline/Elhill output
# pick up authors, and cluster by most common author
$max = 78;
$ab=1;
$ad=1;
$ti=1;
$au=1;
$so=1;
$bya=1;
$byj=0;
$bib=0;
$nref = 1;
foreach $arg (@ARGV) {
if ($arg eq "-byj") {
$byj=1;
$bya=0;
}
if ($arg eq "-bya") {
$bya=1;
$byj=0;
}
if ($arg eq "-ab") {
$ab=0;
}
if ($arg eq "+ab") {
$ab=1;
}
if ($arg eq "-au") {
$au=1;
}
if ($arg eq "+au") {
$au=1;
}
if ($arg eq "-ti") {
$ti=1;
}
if ($arg eq "+ti") {
$ti=1;
}
if ($arg eq "-so") {
$so=1;
}
if ($arg eq "+so") {
$so=1;
}
if ($arg eq "-ad") {
$ad=1;
}
if ($arg eq "+ad") {
$ad=1;
}
if ($arg eq "-bib") {
$bib=1;
}
}
while(<>) {
chop;
s/[\0-\31]//g;
$field = substr($_,0,3);
$dat = substr($_,6).' ';
if ($blank && $field) { $nref++; }
$blank = length($_)==0;
if ($field ne " ") {
$mode="JUNK";
}
if ($field eq "TI ") {
$mode="TIT";
}
if ($field eq "AU ") {
$mode="AUT";
}
if ($field eq "SO ") {
$mode="JOU";
}
if ($field eq "YE ") {
$mode="YEA";
}
if ($field eq "AD ") {
$mode="ADD";
}
if ($field eq "AB ") {
$mode="ABS";
}
if ($field eq "MH ") {
$mode="MES";
}
if ($field eq "UID") {
$mode="UNI";
}
if ($mode eq "TIT") {
$title{$nref} .= $dat;
}
if ($mode eq "AUT") {
$dat =~ s/ *; */;/g;
$dat =~ s/^ *//;
$dat =~ s/ *$//;
@t = split(";",$dat);
if ($field eq " " && $danglingAU) {
$a = $author{$nref,$nau{$nref}}.' '.@t[0];
$author{$nref,$nau{$nref}} = $a;
$aref{$a} .= $nref.' ';
@t = splice(@t,0,1);
}
$danglingAU = substr($dat,length($dat)-1,1) ne ";";
foreach $a (@t) {
if ($a eq 'et al') { next; }
if ($a ne ' ') {
$nau{$nref}++;
$author{$nref,$nau{$nref}} = $a;
$aref{$a} .= $nref.' ';
} else {
$danglingAU = 0;
}
}
}
if ($mode eq "JOU") {
($jl) = split(' *[0-9]',$dat);
$jref{$jl} .= $nref.' ';
@tok = split(' ',$dat);
$found = 0;
for ($i=0; $i<@tok && !$found; $i++) {
$tok = @tok[$i];
if (length($tok)==4 && substr($tok,0,2) eq "19") {
$found = $i;
}
}
if ($found) {
$year{$nref} = int($tok);
$date = "";
$found2=0;
for ($i=$found+1; $i<@tok && !$found2; $i++) {
if (@tok[$i] =~ /;/) {
$found2 = $i;
} else {
$date .= "@tok[$i] ";
}
}
$day = "";
if ($found2) {
($day,$citation) = split(';',@tok[$found2]);
$date .= "$day ";
@tok[$found2] = $citation;
splice(@tok,$found,$found2-$found);
} else {
splice(@tok,$found,1);
}
}
$journal{$nref} = join(' ',@tok);
}
if ($mode eq "YEA") {
$year{$nref} .= $dat;
}
if ($mode eq "ADD") {
$address{$nref} .= $dat;
}
if ($mode eq "ABS") {
$abstract{$nref} .= $dat;
}
if ($mode eq "MES") {
$nmesh{$nref}++;
$mesh{$nref,$nmes{$nref}} .= $dat;
}
if ($mode eq "UNI") {
$uid{$nref} .= $dat;
}
}
sub byaref {
length($aref{$b}) - length($aref{$a});
}
sub byjref {
length($jref{$b}) - length($jref{$a});
}
sub byyear {
$year{$b} - $year{$a};
}
if ($bya) {
foreach $a (sort byaref keys %aref) {
if (!$bib) {
printf("\n--------------------------------------\n");
printf("Key author: %s\n",$a);
}
foreach $r (sort byyear split(' ',$aref{$a})) {
do PrintEntry(int($r));
}
}
}
if ($byj) {
foreach $a (sort byjref keys %jref) {
if (!$bib) {
printf("\n--------------------------------------\n");
printf("Journal: %s\n",$a);
}
foreach $r (sort byyear split(' ',$jref{$a})) {
do PrintEntry(int($r));
}
}
}
sub ProcessAuthor {
local ($str) = @_;
local ($i,$t,$initials,@tok,@t2);
@tok = split(' ',$str);
$initials = 0;
for ($i=1; $i<@tok && !$initials; $i++) {
$t = (@tok[$i] =~ /^[A-Z]*$/);
if ($t) { $initials = $i; }
}
if ($initials) {
@tok[$initials-1] .= ",";
@t2 = split('',@tok[$initials]);
for($i=0; $i<@t2; $i++) {
@t2[$i] .= ".";
}
@tok[$initials] = join(' ',@t2);
}
return(join(' ',@tok));
}
sub PrintEntry {
local($i)=@_;
local($j,$string);
if (!$hit{$journal{$i}}) {
$hit{$journal{$i}} = 1;
printf("\n");
if ($bib) {
@out=0;
for($j=1; $j<=$nau{$i}; $j++) {
@out[$j-1] = do
ProcessAuthor($author{$i,$j});
}
if (@out>1) { splice(@out,$#out,0,"and"); }
$string = join(", ",@out);
$string =~ s/ and, / and /;
if ($year{$i}) { $string .= " ($year{$i})";
}
do PrintString($string,1);
do PrintString($title{$i},1);
$string = $journal{$i};
$string .= ".";
do PrintString($string,1);
} else {
if ($au) {
@out=0;
for($j=1; $j<=$nau{$i}; $j++) {
@out[$j-1] = $author{$i,$j};
}
$string = "Author: ".join(", ",@out);
if ($year{$i}) { $string .= " ($year{$i})";
}
do PrintString($string);
}
if ($ti) {
do PrintString("Title: ".$title{$i});
}
if ($so) {
do PrintString("Journal: ".$journal{$i});
}
if($ad && $address{$i}) {
do PrintString("Address: ".$address{$i});
}
if ($ab && $abstract{$i}) {
do PrintString("Abstract: ".$abstract{$i});
}
}
}
}
sub PrintString {
local($string,$long) = @_;
local($end,$l,$w,@t);
if ($long) {
printf("%s\n",$string);
} else {
$l=0;
foreach $w (split(' ',$string)) {
if (length($w)<$max && length($w)+1+$l>$max) {
printf("\n ");
$l=2;
}
printf("%s ",$w);
$l += length($w)+1;
}
printf("\n");
}
}
=========== Bioperl Project Mailing List Message Footer =======
Project URL: http://bio.perl.org/
For info about how to (un)subscribe, where messages are archived, etc:
http://www.techfak.uni-bielefeld.de/bcd/Perl/Bio/vsns-bcd-perl.html
====================================================================