#!/usr/bin/perl
# 2008-07-08 Bryan Bishop kanzure@gmail.com
# Extract information from the humancortex XML files for specimen ID numbers.
# Actually -- we're just parsing the gene information
# Eventually to steal the OXT.xml files from the Allen Institute servers.

use Tie::Handle::CSV;

use strict;
use warnings;

# Scan imageseries/1.xml .. imageseries/190.xml for Entrez gene IDs, map each
# ID to its gene symbol using geneinfo/hg.csv, and download the matching
# <SYMBOL>.xml from the Allen Institute human cortex server with wget.
#
# Files written (all in the current directory):
#   gene_list.txt   - every Entrez gene ID encountered, one per line (repeats kept)
#   gene_list2.txt  - the symbol of each gene ID the first time it is resolved
#   <SYMBOL>.xml    - one document per gene symbol, fetched by wget
#   wget.error.log  - wget's log output (passed via --output-file)

my $FIRST_SERIES      = 1;
my $LAST_SERIES       = 190;
my $GENE_INFO_CSV     = 'geneinfo/hg.csv';
my $GENE_XML_BASE_URL = 'http://humancortex.alleninstitute.org/has/human/brain';

my %brainhash = ();  # Entrez gene ID => gene symbol ('' when the CSV has none)
my %filehash  = ();  # "<SYMBOL>.xml" => 1 once a download has been attempted
my $symbol_for;      # Entrez gene ID => symbol table; loaded on first use

# Read the gene-info CSV once and return a hashref mapping Entrez gene ID
# (column index 1) to gene symbol (column index 10). When an ID appears on
# several rows, the first row wins, matching the old "stop at first match" scan.
sub load_gene_symbols {
	my ($csv_path) = @_;
	my %table;
	my $csv_fh = Tie::Handle::CSV->new($csv_path, header => 0, key_case => 'lower', sep_char => ',');
	while (my $csv_line = <$csv_fh>) {
		my $id = $csv_line->[1];
		next if !defined $id || exists $table{$id};
		my $gene_symbol = $csv_line->[10];
		$table{$id} = defined $gene_symbol ? $gene_symbol : '';
	}
	close($csv_fh);
	return \%table;
}

open(my $id_out, '>', 'gene_list.txt')
	or die "Cannot open gene_list.txt for writing: $!";
open(my $symbol_out, '>', 'gene_list2.txt')
	or die "Cannot open gene_list2.txt for writing: $!";

for my $i ($FIRST_SERIES .. $LAST_SERIES) {
	print "Iterating to file $i.xml.\n";
	my $series_path = "imageseries/$i.xml";
	open(my $series_fh, '<', $series_path)
		or die "Cannot open $series_path for reading: $!";

	while (my $line = <$series_fh>) {
		# Only the first gene ID on a line is considered.
		next unless $line =~ m{<entrezgeneid type="integer">(\d+)};
		my $num = $1;  # Copy right away; $1 is clobbered by the next match.
		print {$id_out} "$num\n";

		if (!exists $brainhash{$num}) {
			# New/unique Entrez gene ID: resolve its official symbol from the
			# CSV and remember the answer (including "not found") so the
			# lookup is never repeated for this ID.
			print "New Entrez gene ID to be added to the hash: $num\n";
			$symbol_for ||= load_gene_symbols($GENE_INFO_CSV);
			if (exists $symbol_for->{$num}) {
				$brainhash{$num} = $symbol_for->{$num};
				print {$symbol_out} $brainhash{$num}, "\n";
				print "Gene symbol: ", $brainhash{$num}, "\n";
			}
			else {
				$brainhash{$num} = '';
				warn "No gene symbol found in $GENE_INFO_CSV for Entrez gene ID $num.\n";
			}
		}

		my $symbol = uc($brainhash{$num});
		# Without a symbol there is nothing meaningful to fetch; asking the
		# server for ".xml" would only waste a request.
		next if $symbol eq '';
		print $symbol, "\n";

		# Fetch the XML document corresponding to this gene symbol, unless it
		# is already on disk or a download was already attempted in this run.
		my $xml_name = "$symbol.xml";
		if (!(-e $xml_name) && !$filehash{$xml_name}) {
			# List-form system() bypasses the shell, so characters in the
			# symbol can never be interpreted as shell syntax.
			my $status = system('wget', '-nc', "$GENE_XML_BASE_URL/$xml_name",
				'--output-file=wget.error.log');
			warn "wget did not succeed for $xml_name (see wget.error.log).\n"
				if $status != 0;
			# Record the attempt even on failure, so nonexistent files on
			# the server are requested at most once per run.
			$filehash{$xml_name} = 1;
		}
	}

	print "Closing file $i.\n";
	close($series_fh);
}

# Buffered write errors only surface at close, so check them.
close($id_out)     or die "Error closing gene_list.txt: $!";
close($symbol_out) or die "Error closing gene_list2.txt: $!";