#!/usr/bin/perl
# 2008-07-08 Bryan Bishop kanzure@gmail.com
#
# Scan the numbered humancortex XML files (1.xml .. 190.xml) in the current
# directory for Entrez gene IDs, translate each ID into a gene symbol using
# geneinfo/hg.csv, and fetch the per-gene XML document (e.g. OXT.xml) from
# the Allen Institute servers with wget.
#
# Files written (all in the current directory):
#   gene_list.txt    every Entrez gene ID encountered, one per line, with
#                    repeats kept
#   gene_list2.txt   the symbol of each distinct Entrez gene ID that was
#                    found in the CSV, written the first time the ID is seen
#   <SYMBOL>.xml     whatever wget downloads for each gene symbol
#   wget.error.log   wget's log output

use strict;
use warnings;

use Tie::Handle::CSV;

use constant {
    FIRST_FILE        => 1,
    LAST_FILE         => 190,
    GENE_INFO_CSV     => 'geneinfo/hg.csv',
    ENTREZ_ID_COLUMN  => 1,     # zero-based CSV column holding the Entrez ID
    SYMBOL_COLUMN     => 10,    # zero-based CSV column holding the symbol
    GENE_XML_BASE_URL => 'http://humancortex.alleninstitute.org/has/human/brain',
    WGET_LOG          => 'wget.error.log',
};

# NOTE(review): the element name in this pattern was lost from the copy of
# the script under review -- only the `(\d+)` capture survived.  The name
# `entrezgeneid` is an assumption; confirm it against one of the <n>.xml
# input files before relying on the output.
my $ENTREZ_ID_RE = qr{<entrezgeneid>(\d+)};

# Entrez gene ID => symbol for the whole CSV.  Read once up front instead of
# re-opening and re-scanning the CSV for every new gene ID.
my $symbol_for = load_gene_symbols(GENE_INFO_CSV);

my %brainhash;    # Entrez gene ID => symbol (undef if unknown), once seen
my %filehash;     # "<SYMBOL>.xml" => 1 once a download has been attempted

open my $id_out, '>', 'gene_list.txt'
    or die "Cannot write gene_list.txt: $!";
open my $symbol_out, '>', 'gene_list2.txt'
    or die "Cannot write gene_list2.txt: $!";

FILE:
for my $i (FIRST_FILE .. LAST_FILE) {
    my $xml_name = "$i.xml";
    print "Iterating to file $xml_name.\n";

    # A missing input file is reported and skipped so that one gap in the
    # numbered series does not abort the whole run.
    my $xml_fh;
    if (!open $xml_fh, '<', $xml_name) {
        warn "Cannot read $xml_name: $!\n";
        next FILE;
    }

    LINE:
    while (my $line = <$xml_fh>) {
        my ($num) = $line =~ $ENTREZ_ID_RE
            or next LINE;

        print {$id_out} "$num\n";

        if (!exists $brainhash{$num}) {
            # New/unique Entrez gene ID: look up its official symbol.
            print "New Entrez gene ID to be added to the hash: $num\n";
            $brainhash{$num} = $symbol_for->{$num};

            if (defined $brainhash{$num}) {
                print {$symbol_out} "$brainhash{$num}\n";
                print "Gene symbol: $brainhash{$num}\n";
            }
            else {
                warn "No gene symbol for Entrez gene ID $num in "
                    . GENE_INFO_CSV . "; skipping it.\n";
            }
        }

        # Without a symbol there is no document name to ask the server for
        # (the URL would end in "/.xml"), so do not attempt a fetch.
        my $raw_symbol = $brainhash{$num};
        next LINE if !defined $raw_symbol || $raw_symbol eq '';

        my $symbol = uc $raw_symbol;
        print "$symbol\n";

        fetch_gene_xml($symbol, \%filehash);

        # Alternative fetches that were already disabled in the original
        # script, kept here for reference (<id> is the Entrez gene ID):
        #   wget -nc http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?id=<id>&db=gene&retmode=xml
        #   wget -nc http://humancortex.alleninstitute.org/has/human/specimen/summary/<id>.xml
        #   wget -nc http://humancortex.alleninstitute.org/ImageWeb/GetImage?imageId=<id>&zoom=3&size=max
    }

    print "Closing file $i.\n";
    close $xml_fh;
}

# Buffered write errors only surface at close, so check both output files.
close $id_out     or die "Cannot close gene_list.txt: $!";
close $symbol_out or die "Cannot close gene_list2.txt: $!";

# load_gene_symbols($csv_path)
#
# Read the header-less gene-info CSV at $csv_path and return a hash
# reference mapping the value in ENTREZ_ID_COLUMN to the value in
# SYMBOL_COLUMN.  When an ID appears on several rows the first row wins,
# which matches a top-to-bottom scan that stops at the first match.
sub load_gene_symbols {
    my ($csv_path) = @_;

    my $csv_fh = Tie::Handle::CSV->new(
        $csv_path,
        header   => 0,
        key_case => 'lower',
        sep_char => ',',
    );
    defined $csv_fh
        or die "Cannot open $csv_path\n";

    my %table;

    # Test definedness explicitly: only end-of-file should stop the loop.
    while (defined(my $row = <$csv_fh>)) {
        my $entrez_id = $row->[ENTREZ_ID_COLUMN];
        next if !defined $entrez_id;
        next if exists $table{$entrez_id};
        $table{$entrez_id} = $row->[SYMBOL_COLUMN];
    }

    close $csv_fh;

    return \%table;
}

# fetch_gene_xml($symbol, $attempted)
#
# Download "<$symbol>.xml" from GENE_XML_BASE_URL into the current directory
# unless that file already exists or a download was already attempted during
# this run.  $attempted is a hash reference keyed by file name; the name is
# recorded even when the download fails, so a document the server does not
# have is only requested once.
sub fetch_gene_xml {
    my ($symbol, $attempted) = @_;

    my $file_name = "$symbol.xml";
    return if -e $file_name;
    return if $attempted->{$file_name};

    $attempted->{$file_name} = 1;

    # List-form system() bypasses the shell, so characters in a symbol read
    # from the CSV can never be interpreted as shell syntax.
    # NOTE(review): --output-file overwrites the log on every call, so only
    # the most recent fetch is kept; --append-output would keep them all.
    my $status = system 'wget', '-nc',
        GENE_XML_BASE_URL . "/$file_name",
        '--output-file=' . WGET_LOG;

    # A non-zero exit from wget (e.g. document not on the server) is
    # expected and tolerated; only a failure to launch wget is reported.
    warn "Could not run wget: $!\n" if $status == -1;

    return;
}