Bioperl: Parsing Medline Docs

David J. States states@gpc.wustl.edu
Mon, 27 Sep 1999 17:50:10 -0500


On Monday, September 27, 1999 4:44 PM, Benjamin Stapley
[SMTP:ben@sophocles.gws.uky.edu] wrote:
> Hi
> Does any one know of perl scripts/modules that parse Medline documents
> in Medlars format?  Surely someone has attempted this before.
> Ben
> 
> 
> =========== Bioperl Project Mailing List Message Footer =======
> Project URL: http://bio.perl.org/
> For info about how to (un)subscribe, where messages are archived, etc:
> http://www.techfak.uni-bielefeld.de/bcd/Perl/Bio/vsns-bcd-perl.html
> ====================================================================

The script below reads the traditional medline format (Elhill) and does a
greedy clustering by most highly published authors.  Useful for going
through large collections of citations and grouping them by laboratory.

Note: this is old code (perl  3.0) and has not been actively maintained for
almost a decade (I am serious).  It seems to work with perl 5 and modern
medline files from Entrez, but there may be some subtle formatting changes
that are not properly handled.   Sorry if the command line options are not
documented, but I just never got around to it :).

Note

----
David J. States, M.D., Ph.D.
Associate Professor and Director
Institute for Biomedical Computing
Washington University in St. Louis
700 S. Euclid Ave.
St. Louis, MO   63110

tel: 314 362 2134
fax: 314 362 0234
email: states@ibc.wustl.edu


#!/usr/local/bin/perl
# perl script to read medline/Elhill output
# pick up authors, and cluster by most common author

$max = 78;

$ab=1;
$ad=1;
$ti=1;
$au=1;
$so=1;
$bya=1;
$byj=0;
$bib=0;
$nref = 1;

foreach $arg (@ARGV) {
	if ($arg eq "-byj") {
		$byj=1;
		$bya=0;
	} 
	if ($arg eq "-bya") {
		$bya=1;
		$byj=0;
	} 
	if ($arg eq "-ab") {
		$ab=0;
	} 
	if ($arg eq "+ab") {
		$ab=1;
	} 
	if ($arg eq "-au") {
		$au=1;
	} 
	if ($arg eq "+au") {
		$au=1;
	} 
	if ($arg eq "-ti") {
		$ti=1;
	} 
	if ($arg eq "+ti") {
		$ti=1;
	} 
	if ($arg eq "-so") {
		$so=1;
	} 
	if ($arg eq "+so") {
		$so=1;
	} 
	if ($arg eq "-ad") {
		$ad=1;
	} 
	if ($arg eq "+ad") {
		$ad=1;
	}
	if ($arg eq "-bib") {
		$bib=1;
	}
}

while(<>) {
	chop; 
	s/[\0-\31]//g;
	$field = substr($_,0,3); 
	$dat = substr($_,6).' ';

	if ($blank && $field) { $nref++; }
	$blank = length($_)==0;

	if ($field ne "   ") {
		$mode="JUNK";
	}

	if ($field eq "TI ") {
		$mode="TIT";
	}
	if ($field eq "AU ") {
		$mode="AUT";
	}
	if ($field eq "SO ") {
		$mode="JOU";
	}
	if ($field eq "YE ") {
		$mode="YEA";
	}
	if ($field eq "AD ") {
		$mode="ADD";
	}
	if ($field eq "AB ") {
		$mode="ABS";
	}
	if ($field eq "MH ") {
		$mode="MES";
	}
	if ($field eq "UID") {
		$mode="UNI";
	}

	if ($mode eq "TIT") {
		$title{$nref} .= $dat;
	}

	if ($mode eq "AUT") {
		$dat =~ s/ *; */;/g;
		$dat =~ s/^ *//;
		$dat =~ s/ *$//;
		@t = split(";",$dat);
		if ($field eq "   " && $danglingAU) {
			$a = $author{$nref,$nau{$nref}}.' '.@t[0];
			$author{$nref,$nau{$nref}} = $a;
			$aref{$a} .= $nref.' ';
			@t = splice(@t,0,1);
		}
		$danglingAU = substr($dat,length($dat)-1,1) ne ";";
		foreach $a (@t) {
			if ($a eq 'et al') { next; }
			if ($a ne ' ') {
				$nau{$nref}++;
				$author{$nref,$nau{$nref}} = $a;
				$aref{$a} .= $nref.' ';
			} else {
				$danglingAU = 0;
			}
		}
	}

	if ($mode eq "JOU") {
		($jl) = split(' *[0-9]',$dat);
		$jref{$jl} .= $nref.' ';
		@tok = split(' ',$dat);
		$found = 0;
		for ($i=0; $i<@tok && !$found; $i++) {
			$tok = @tok[$i];
			if (length($tok)==4 && substr($tok,0,2) eq "19") {
				$found = $i;
			}
		}
		if ($found) {
			$year{$nref} = int($tok);
			$date = "";
			$found2=0;
			for ($i=$found+1; $i<@tok && !$found2; $i++) {
				if (@tok[$i] =~ /;/) {
					$found2 = $i; 
				} else {
					$date .= "@tok[$i] "; 
				}
			}
			$day = "";
			if ($found2) {
				($day,$citation) = split(';',@tok[$found2]);
				$date .= "$day ";
				@tok[$found2] = $citation;
				splice(@tok,$found,$found2-$found);
			} else {
				splice(@tok,$found,1);
			}
		}
		$journal{$nref} = join(' ',@tok);
	}

	if ($mode eq "YEA") {
		$year{$nref} .= $dat;
	}

	if ($mode eq "ADD") {
		$address{$nref} .= $dat;
	}
	if ($mode eq "ABS") {
		$abstract{$nref} .= $dat;
	}

	if ($mode eq "MES") {
		$nmesh{$nref}++;
		$mesh{$nref,$nmes{$nref}} .= $dat;
	}

	if ($mode eq "UNI") {
		$uid{$nref} .= $dat;
	}
}

sub byaref {
	length($aref{$b}) - length($aref{$a});
}

sub byjref {
	length($jref{$b}) - length($jref{$a});
}

sub byyear {
	$year{$b} - $year{$a};
}

if ($bya) {
	foreach $a (sort byaref keys %aref) {
		if (!$bib) {
	
printf("\n--------------------------------------\n");
			printf("Key author: %s\n",$a);
		}
		foreach $r (sort byyear split(' ',$aref{$a})) {
			do PrintEntry(int($r));
		}
	}
}
if ($byj) {
	foreach $a (sort byjref keys %jref) {
		if (!$bib) {
	
printf("\n--------------------------------------\n");
			printf("Journal: %s\n",$a);
		}
		foreach $r (sort byyear split(' ',$jref{$a})) {
			do PrintEntry(int($r));
		}
	}
}

sub ProcessAuthor {
	local ($str) = @_;
	local ($i,$t,$initials,@tok,@t2);

	@tok = split(' ',$str);
	$initials = 0;
	for ($i=1; $i<@tok && !$initials; $i++) {
		$t = (@tok[$i] =~ /^[A-Z]*$/);
		if ($t) { $initials = $i; }
	}

	if ($initials) {
		@tok[$initials-1] .= ",";
		@t2 = split('',@tok[$initials]); 
		for($i=0; $i<@t2; $i++) {
			@t2[$i] .= ".";
		}
		@tok[$initials] = join(' ',@t2);
	}
	return(join(' ',@tok));
}

sub PrintEntry {

	local($i)=@_;
	local($j,$string);

	if (!$hit{$journal{$i}}) {
		$hit{$journal{$i}} = 1;
		printf("\n");
		if ($bib) {
				@out=0;
				for($j=1; $j<=$nau{$i}; $j++) {
					@out[$j-1] = do
ProcessAuthor($author{$i,$j});
				}
				if (@out>1) { splice(@out,$#out,0,"and"); }
				$string = join(", ",@out);
				$string =~ s/ and, / and /;
				if ($year{$i}) { $string .= " ($year{$i})";
}
				do PrintString($string,1);
				do PrintString($title{$i},1);
				$string = $journal{$i};
				$string .= ".";
				do PrintString($string,1);
		} else {
			if ($au) {
				@out=0;
				for($j=1; $j<=$nau{$i}; $j++) {
					@out[$j-1] = $author{$i,$j};
				}
				$string = "Author: ".join(", ",@out);
				if ($year{$i}) { $string .= " ($year{$i})";
}
				do PrintString($string);
			}
			if ($ti) {
				do PrintString("Title: ".$title{$i});
			}
			if ($so) {
				do PrintString("Journal: ".$journal{$i});
			}
			if($ad && $address{$i}) {
				do PrintString("Address: ".$address{$i});
			}
			if ($ab && $abstract{$i}) {
				do PrintString("Abstract: ".$abstract{$i});
			}
		}
	}
}

sub PrintString {

	local($string,$long) = @_;
	local($end,$l,$w,@t);

	if ($long) {
		printf("%s\n",$string);
	} else {
		$l=0;
		foreach $w (split(' ',$string)) {
			if (length($w)<$max && length($w)+1+$l>$max) {
				printf("\n  ");
				$l=2;
			}
			printf("%s ",$w);
			$l += length($w)+1;
		}
		printf("\n");
	}
}


=========== Bioperl Project Mailing List Message Footer =======
Project URL: http://bio.perl.org/
For info about how to (un)subscribe, where messages are archived, etc:
http://www.techfak.uni-bielefeld.de/bcd/Perl/Bio/vsns-bcd-perl.html
====================================================================