attachment:query_pubmed.pl of Data/RoadsNotTaken - Bader Lab @ The University of Toronto

Attachment 'query_pubmed.pl'

   1 #!/usr/local/bin/perl -w
   2 # ===========================================================================
   3 # Generic tool that, given a list of genes (and excluded terms), searches Entrez
   4 #  for all aliases of each gene and then queries PubMed to see how the
   5 #  number of publications has changed over time.
   6 # ===========================================================================
   7 use LWP::Simple;
   8 use XML::Parser;
   9 use XML::Simple;	# module to parse XML data which is the response to the query
  10 use Data::Dumper;	# module for visualization of parsed XML data; not obligatory for function but was used for development
  11 
  12 use Getopt::Long; # to parse the command line options.
  13 
  14 my $query_ending  = "[sym] AND (\"mus musculus\"[Organism] OR \"homo sapiens\"[Organism])";
  15 my $report = "xml";
  16 my $email = "ruth.isserlin\@utoronto.ca";
  17 my $tool = "publication_distribution";
  18 
  19 #url for the ncbi e-utils
  20 my $utils = "http://www.ncbi.nlm.nih.gov/entrez/eutils";
  21 
  22 #the different queries
  23 my $esearch_pubmed = "$utils/esearch.fcgi?" .
  24               "db=Pubmed&usehistory=n&email=$email&tool=$tool&term=";
  25 
  26 my $esearch_gene = "$utils/esearch.fcgi?" .
  27               "db=Gene&usehistory=y&email=$email&tool=$tool&term=";
  28 
  29 
  30 
  31 #if you want to limit the pubmed search to anything (for example [Title]) add it to constraints
  32 my $constraints = "";
  33 
  34 #variables to hold command line arguments
  35 my (
  36 	$genes_filename,
  37 	$exclusions_filename,
  38 	$output_dirname,
  39 	$output_filename,
  40 	$restrict,
  41 );
  42 
  43 GetOptions (
  44 	"genes|g=s"		=> \$genes_filename,
  45 	"exclusions|e=s"	=> \$exclusions_filename,
  46 	"outputdir|o=s"	=> \$output_dirname,
  47       "outputfile|f=s"  => \$output_filename,
  48 	"restrict|r=s"	=> \$restrict,
  49 );
  50 
  51 #open the output file
  52 open(PUBS, ">$output_dirname/$output_filename") or die "Error opening $output_dirname/$output_filename : $!\n";
  53 
  54 #print Header
  55 print(PUBS "Initial Gene Query\tPrimary Name\tAliases\t1950-1979\t1980-1984\t1985-1989\t1990-1994\t1995-2000\t2001-2005\t2006-2010\t1980\t1981\t1982\t1983\t1984\t1985\t1986\t1987\t1988\t1989\t1990\t1991\t1992\t1993\t1994\t1995\t1996\t1997\t1998\t1999\t2000\t2001\t2002\t2003\t2004\t2005\t2006\t2007\t2008\t2009\t2010\tPubmed Query\n");
  56 	
  57 
  58 #get all the names we want to exclude
  59 # There are aliases that can bring back loads of pubmed hits, for example gene name "MR", that you might
  60 # want to exclude from pubmed search
  61 open(IGNORE_IN, "$exclusions_filename") or die "error opening $exclusions_filename : $!\n";
  62 my @ignore_list = ();
  63 
  64 while(<IGNORE_IN>){
  65 	chomp;
  66 	my @cur = split(/\n/);
  67 	
  68 	push @ignore_list, $cur[0];	
  69 }
  70 
  71 #open the file that stores all the gene symbols for nuclear receptors
  72 #get the gene Names to do each pubmed search
  73 open(Names_IN, "$genes_filename") or die "Error opening $genes_filename : $!\n";
  74 
  75 
  76 #go through each name (in the genes_file) 
  77 # search entrez for aliases 
  78 # make sure none of the aliases are in the exclusion list
  79 # make sure that each individual search term returns results when it is searched in pubmed.  Because if no
  80 #     results are returned by pubmed for a particular term pubmed expands it to try and get hits.
  81 #	for example the term "orphan nuclear hormone receptor 1" is not found as a quoted term so pubmed
  82 #	translates it to:("child, orphaned"[MeSH Terms] OR ("child"[All Fields] AND "orphaned"[All Fields]) 
  83 # 		OR "orphaned child"[All Fields] OR "orphan"[All Fields]) AND ("receptors, cytoplasmic and 
  84 #		nuclear"[MeSH Terms] OR ("receptors"[All Fields] AND "cytoplasmic"[All Fields] AND 
  85 #		"nuclear"[All Fields]) OR "cytoplasmic and nuclear receptors"[All Fields] OR ("nuclear"[All Fields] 
  86 #		AND "hormone"[All Fields] AND "receptor"[All Fields]) OR "nuclear hormone receptor"[All Fields]) 
  87 #		AND 1[All Fields]
  88 # construct the base pubmed query
  89 # and then search pubmed for the number of publications
  90 while (<Names_IN>){
  91 	chomp;
  92 	my @name  = split(/\n/);
  93 	print "$name[0]\n";
  94 
  95 	#search entrez for aliases.
  96 	# Get the entrez Entry in XML format and parse out all the aliases and alternate names		
  97 	my $query = $name[0].$query_ending; 
  98 
  99 	my $esearch_result = get($esearch_gene . $query);
 100 
 101 	$esearch_result =~ 
 102 	m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 103 
 104 	my $Count    = $1;
 105 	my $QueryKey = $2;
 106 	my $WebEnv   = $3;
 107 
 108 
 109 	my $retstart;
 110 	my $retmax=1;
 111 	
 112 	my $pubmed_query = "";
 113 	@aliases = ();
 114 	@alternate_names = ();
 115 
 116 	#go through all records returned by Entrez, there could be one mouse and one human
 117 	for($retstart = 0; $retstart < $Count; $retstart += $retmax) {
 118   
 119   		my $efetch = "$utils/efetch.fcgi?" .
 120                		"rettype=$report&retmode=text&retstart=$retstart&retmax=$retmax&" .
 121                		"db=Gene&query_key=$QueryKey&WebEnv=$WebEnv";
 122 	
 123   		my $efetch_result = get($efetch);
 124  
 125   		$tag = "";
 126   		$primaryName = "";
 127   		
 128   		$p3 = new XML::Parser(ErrorContext => 2);
 129 
 130   		$p3->setHandlers(Start => \&startElement,
 131                            End => \&endElement,
 132                            Char => \&characterData,Default => \&default);
 133 		
 134 		#use XML parse function to parse the results returned by Entrez.
 135   		eval{$p3->parse($efetch_result);};
 136   		if($@){
 137 			my $error = $@;
 138 			print "$error\n";
 139 			next;
 140   		}
 141   		
 142 	}
 143 	
 144 	#start constructing the pubmed query
 145 	$pubmed_query = "((\"" . $primaryName ."\"";
 146 	
 147 	#go through each of the aliases in the list.  Only add it to the query if the individual
 148 	# query returns somethings and it is not in the exclusion list.
 149 	foreach $aliases (@aliases){
 150 						
 151 		#before doing a search check to see if the name is in the exclude list.
 152 		my $ignore = "no";
 153 		foreach $a (@ignore_list){
 154 			#one of the novel
 155 			if($a eq $aliases){
 156 				$ignore = "yes";
 157 			}
 158 
 159 		}
 160 			
 161 		if($ignore eq "no"){
 162 			#for each gene name do a separate pubmed query and get the counts
 163 			#if the individual query comes back with one of the below warnings then don't 
 164 			#add it to the search.
 165 			
 166 			my $notFound = "";
 167 
 168 			$pubmed_name = "\"" . $aliases . "\"" . $constraints;
 169 			$esearch_result = get($esearch_pubmed . $pubmed_name);
 170  			
 171 			#tag within the pubmed search results that indicate the quoted phrase was not found
 172 			# and that pubmed will try and expand it.
 173 			$esearch_result =~ 
 174   				m|<QuotedPhraseNotFound>(.*)</QuotedPhraseNotFound>|s;
 175 
 176 			$notFound = $1;
 177 
 178 			if($notFound ne $pubmed_name){
 179 						
 180 				$pubmed_query = $pubmed_query . " OR \"" . $aliases . "\"" . $constraints;
 181 					
 182 
 183 			}
 184 		}
 185   	}
 186 		
 187 		
 188 	#finished adding all the gene names, now add the alternate names
 189 	# For more restrictive search change change to AND here.
 190 	$pubmed_query = $pubmed_query . ") OR ( ";
 191 	
 192 
 193 	foreach $alternate_names (@alternate_names){
 194 			
 195 		#before doing a search check to see if the name is in the exclude list.
 196 		$ignore = "no";
 197 		foreach $a (@ignore_list){
 198 			#one of the novel
 199 			if($a eq $alternate_names){
 200 				$ignore = "yes";
 201 			}
 202 
 203 		}
 204 			
 205 		if($ignore eq "no"){
 206 			#for each gene name do a separate pubmed query and get the counts
 207 			my $notFound = "";
 208 			$pubmed_name = "\"" . $alternate_names . "\"" . $constraints;
 209 			$esearch_result = get($esearch_pubmed . $pubmed_name);
 210  			
 211 			#tag within the pubmed search results that indicate the quoted phrase was not found
 212 			# and that pubmed will try and expand it.										
 213 			$esearch_result =~ 
 214   				m|<QuotedPhraseNotFound>(.*)</QuotedPhraseNotFound>|s;
 215 
 216 			$notFound = $1;
 217 		
 218 			if($notFound ne $pubmed_name){										
 219 				$pubmed_query = $pubmed_query . " \"" . $alternate_names . "\"" . $constraints ." OR ";
 220 			}
 221 		}
 222   	}
 223 
 224 
 225 	#add an extra name to the RXRs
 226 	if($name[0] eq "NR2B1"){
 227 		$pubmed_query = $pubmed_query . " \"RXR alpha\"" . $constraints." OR \"RXRalpha\"". $constraints;
 228 	}
 229 	#add an extra name to the RXRs
 230 	if($name[0] eq "NR2B2"){
 231 		$pubmed_query = $pubmed_query . " \"RXR beta\"" . $constraints." OR \"RXRbeta\"" . $constraints;
 232 	}
 233 	#add an extra name to the RXRs
 234 	if($name[0] eq "NR2B3"){
 235 		$pubmed_query = $pubmed_query . " \"RXR gamma\"" . $constraints." OR \"RXRgamma\"". $constraints;
 236 	}
 237 
 238       #add the generic name "thyroid hormone receptor" to one of the receptors.
 239 	if($name[0] eq "NR1A2"){
 240 		$pubmed_query = $pubmed_query . " (\"thyroid hormone receptor\" AND \"beta\")". $constraints;
 241 	}
 242 	if($name[0] eq "NR1A1"){
 243 		$pubmed_query = $pubmed_query . " (\"thyroid hormone receptor\" AND \"alpha\")". $constraints;
 244 	}
 245 
 246 
 247 	$pubmed_query = $pubmed_query . ")) and \"". $restrict . "\"" . $constraints;
 248 		
 249 	#create a file to output all the pmids to 
 250 	#do the search without any year restriction
 251 
 252 	$esearch_result = get($esearch_pubmed . $pubmed_query);
 253 	$esearch_result =~ 
 254   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 255 
 256 	my $Count_all    = $1;
 257 	my $Querykey = $2;
 258 	my $WebKey = $3;
 259 		
 260 
 261 	my $efetch_pmids = "$utils/efetch.fcgi?" .
 262              		"db=pubmed&usehistory=y&rettype=uilist&retmode=text&WebEnv=$WebKey&query_key=$QueryKey";
 263 	
 264 		
 265 	my $efetch_pmids_results = get($efetch_pmids);
 266 		
 267 	#create a directory in the output directory called PMIDs to contain the PMIDs
 268 	mkdir "$output_dirname/PMIDS";
 269 
 270 	#open a file the name of the main search
 271 	open(PMIDS, ">$output_dirname/PMIDS/$name[0].txt");
 272 		
 273 	my @pmids  = split(/\n/, $efetch_pmids_results);
 274 	foreach $pmids (@pmids){
 275 			print PMIDS "PMID(".$pmids.") OR \n";
 276 	}
 277 	close PMIDS;
 278 
 279 
 280 	##########################################
 281 	# Do pubmed searches for years 1950-2010 in 5 year subsets
 282 	# and individually for year 1980-2010
 283 	#########################################
 284 
 285 	#search 1950-1979
 286 	$pubmed_query_year = $pubmed_query . " and 1950:1979[dp]";
 287 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 288  
 289 	$esearch_result =~ 
 290   	m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 291 
 292 	my $Count_195079    = $1;
 293 
 294 	#search 1980-1984
 295 	$pubmed_query_year = $pubmed_query . " and 1980:1984[dp]";
 296 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 297  
 298 	$esearch_result =~ 
 299   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 300 
 301 	my $Count_198084    = $1;
 302 
 303 	#search 1985-1989
 304 	$pubmed_query_year = $pubmed_query . " and 1985:1989[dp]";
 305 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 306  
 307 	$esearch_result =~ 
 308   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 309 
 310 	my $Count_198589    = $1;
 311 
 312 	#search 1990-1994
 313 	$pubmed_query_year = $pubmed_query . " and 1990:1994[dp]";
 314 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 315  
 316 	$esearch_result =~ 
 317   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 318 
 319 	my $Count_199094    = $1;
 320 	
 321 	#1995-2000
 322 	$pubmed_query_year = $pubmed_query . " and 1995:2000[dp]";
 323 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 324 	$esearch_result =~ 
 325   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 326 
 327 	my $Count_19952000    = $1;
 328 
 329 	#and then search by each year individually
 330 	# 1980
 331 	$pubmed_query_year = $pubmed_query . " and 1980[dp]";
 332 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 333  	$esearch_result =~ 
 334   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 335 
 336 	my $Count_1980    = $1;
 337 
 338 	# 1981
 339 	$pubmed_query_year = $pubmed_query . " and 1981[dp]";
 340 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 341 	$esearch_result =~ 
 342   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 343 
 344 	my $Count_1981    = $1;
 345 
 346 	# 1982
 347 	$pubmed_query_year = $pubmed_query . " and 1982[dp]";
 348 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 349 	$esearch_result =~ 
 350   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 351 
 352 	my $Count_1982    = $1;
 353 
 354 	# 1983
 355 	$pubmed_query_year = $pubmed_query . " and 1983[dp]";
 356 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 357  	$esearch_result =~ 
 358   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 359 
 360 	my $Count_1983    = $1;	
 361 		
 362 	# 1984
 363 	$pubmed_query_year = $pubmed_query . " and 1984[dp]";
 364 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 365  	$esearch_result =~ 
 366   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 367 
 368 	my $Count_1984    = $1;
 369 
 370 	# 1985
 371 	$pubmed_query_year = $pubmed_query . " and 1985[dp]";
 372 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 373  	$esearch_result =~ 
 374   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 375 
 376 	my $Count_1985    = $1;
 377 
 378 	# 1986
 379 	$pubmed_query_year = $pubmed_query . " and 1986[dp]";
 380 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 381  	$esearch_result =~ 
 382   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 383 
 384 	my $Count_1986    = $1;
 385 
 386 	# 1987
 387 	$pubmed_query_year = $pubmed_query . " and 1987[dp]";
 388 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 389  	$esearch_result =~ 
 390   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 391 
 392 	my $Count_1987    = $1;	
 393 
 394 	# 1988
 395 	$pubmed_query_year = $pubmed_query . " and 1988[dp]";
 396 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 397  	$esearch_result =~ 
 398   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 399 
 400 	my $Count_1988    = $1;
 401 
 402 	# 1989
 403 	$pubmed_query_year = $pubmed_query . " and 1989[dp]";
 404 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 405  	$esearch_result =~ 
 406   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 407 
 408 	my $Count_1989    = $1;		
 409 
 410 	# 1990
 411 	$pubmed_query_year = $pubmed_query . " and 1990[dp]";
 412 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 413 	$esearch_result =~ 
 414   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 415 
 416 	my $Count_1990    = $1;
 417 
 418 	# 1991
 419 	$pubmed_query_year = $pubmed_query . " and 1991[dp]";
 420 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 421  	$esearch_result =~ 
 422   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 423 
 424 	my $Count_1991    = $1;
 425 
 426 	# 1992
 427 	$pubmed_query_year = $pubmed_query . " and 1992[dp]";
 428 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 429  	$esearch_result =~ 
 430   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 431 
 432 	my $Count_1992    = $1;
 433 
 434 	# 1993
 435 	$pubmed_query_year = $pubmed_query . " and 1993[dp]";
 436 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 437  	$esearch_result =~ 
 438   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 439 
 440 	my $Count_1993    = $1;	
 441 
 442 	# 1994
 443 	$pubmed_query_year = $pubmed_query . " and 1994[dp]";
 444 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 445  	$esearch_result =~ 
 446   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 447 
 448 	my $Count_1994    = $1;
 449 
 450 
 451 	# 1995
 452 	$pubmed_query_year = $pubmed_query . " and 1995[dp]";
 453 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 454  	$esearch_result =~ 
 455   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 456 
 457 	my $Count_1995    = $1;
 458 
 459 
 460 	# 1996
 461 	$pubmed_query_year = $pubmed_query . " and 1996[dp]";
 462 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 463  	$esearch_result =~ 
 464   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 465 
 466 	my $Count_1996    = $1;
 467 
 468 	# 1997
 469 	$pubmed_query_year = $pubmed_query . " and 1997[dp]";
 470 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 471  	$esearch_result =~ 
 472   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 473 
 474 	my $Count_1997    = $1;	
 475 
 476 	# 1998
 477 	$pubmed_query_year = $pubmed_query . " and 1998[dp]";
 478 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 479  	$esearch_result =~ 
 480   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 481 
 482 	my $Count_1998    = $1;
 483 
 484 	# 1999
 485 	$pubmed_query_year = $pubmed_query . " and 1999[dp]";
 486 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 487  	$esearch_result =~ 
 488   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 489 
 490 	my $Count_1999    = $1;
 491 
 492 	# 2000
 493 	$pubmed_query_year = $pubmed_query . " and 2000[dp]";
 494 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 495  	$esearch_result =~ 
 496   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 497 
 498 	my $Count_2000    = $1;
 499 
 500 	# 2001
 501 	$pubmed_query_year = $pubmed_query . " and 2001[dp]";
 502 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 503  	$esearch_result =~ 
 504   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 505 
 506 	my $Count_2001    = $1;
 507 
 508 	# 2002
 509 	$pubmed_query_year = $pubmed_query . " and 2002[dp]";
 510 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 511  	$esearch_result =~ 
 512   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 513 
 514 	my $Count_2002    = $1;
 515 	
 516 	# 2003
 517 	$pubmed_query_year = $pubmed_query . " and 2003[dp]";
 518 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 519  	$esearch_result =~ 
 520   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 521 
 522 	my $Count_2003    = $1;
 523 	# 2004
 524 	$pubmed_query_year = $pubmed_query . " and 2004[dp]";
 525 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 526  	$esearch_result =~ 
 527   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 528 
 529 	my $Count_2004    = $1;
 530 
 531 	# 2005
 532 	$pubmed_query_year = $pubmed_query . " and 2005[dp]";
 533 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 534  	$esearch_result =~ 
 535   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 536 
 537 	my $Count_2005    = $1;
 538 
 539 	# 2006
 540 	$pubmed_query_year = $pubmed_query . " and 2006[dp]";
 541 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 542  	$esearch_result =~ 
 543   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 544 
 545 	my $Count_2006    = $1;
 546 
 547 	# 2007
 548 	$pubmed_query_year = $pubmed_query . " and 2007[dp]";
 549 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 550  	$esearch_result =~ 
 551   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 552 
 553 	my $Count_2007    = $1;
 554 
 555 	# 2008
 556 	$pubmed_query_year = $pubmed_query . " and 2008[dp]";
 557 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 558  	$esearch_result =~ 
 559   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 560 
 561 	my $Count_2008    = $1;
 562 
 563 	# 2009
 564 	$pubmed_query_year = $pubmed_query . " and 2009[dp]";
 565 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 566  	$esearch_result =~ 
 567   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 568 
 569 	my $Count_2009    = $1;
 570 
 571 	# 2010
 572 	$pubmed_query_year = $pubmed_query . " and 2010[dp]";
 573 	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 574  	$esearch_result =~ 
 575   		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;
 576 
 577 	my $Count_2010    = $1;
 578 
 579 	my $Count_20002005 = ($Count_2001+$Count_2002+$Count_2003+$Count_2004+$Count_2005);
 580 	my $count_20062010 = ($Count_2006+$Count_2007+$Count_2008+$Count_2009+$Count_2010);
 581 
 582 	print(PUBS "$name[0]\t$primaryName\t@aliases\t$Count_195079\t$Count_198084\t$Count_198589\t$Count_199094\t$Count_19952000\t$Count_20002005\t$count_20062010\t$Count_1980\t$Count_1981\t$Count_1982\t$Count_1983\t$Count_1984\t$Count_1985\t$Count_1986\t$Count_1987\t$Count_1988\t$Count_1989\t$Count_1990\t$Count_1991\t$Count_1992\t$Count_1993\t$Count_1994\t$Count_1995\t$Count_1996\t$Count_1997\t$Count_1998\t$Count_1999\t$Count_2000\t$Count_2001\t$Count_2002\t$Count_2003\t$Count_2004\t$Count_2005\t$Count_2006\t$Count_2007\t$Count_2008\t$Count_2009\t$Count_2010\t$pubmed_query\n");
 583 	
 584 
 585 }
 586 
 587 close(PUBS);
 588 
 589 #Methods used by XML parser to get the fields of interest.
 590 # Gene-ref_locus = primary name (is assigned to primary name variable)  Each record should only contain one primary name
 591 # Gene-ref_syn_E = aliases (is assigned to an array of aliases)
 592 # Prot-ref_name_E - alternate names ( is assigned to an array of alternate names) 
 593 sub startElement {
 594        my( $parseinst, $element, %attrs ) = @_;
 595        SWITCH: {
 596               if ($element eq "Gene-ref_locus") {
 597                      $tag = "Gene-ref_locus";
 598                      last SWITCH;
 599               }
 600 	      if ($element eq "Gene-ref_syn_E") {
 601                      $tag = "Gene-ref_syn_E";
 602                      last SWITCH;
 603               }
 604 	     if ($element eq "Prot-ref_name_E") {
 605                      $tag = "Prot-ref_name_E";
 606                      last SWITCH;
 607               }
 608 		                 
 609              
 610        }
 611 }
 612 sub endElement {
 613        my( $parseinst, $element ) = @_;
 614        if ($element eq "Gene-ref_locus") {
 615               #print "\n";
 616        } elsif ($element eq "Gene-ref_syn_E") {
 617               #print "\n";
 618        }elsif ($element eq "Prot-ref_name_E") {
 619               #print "\n";
 620        }  
 621 }
 622 sub characterData {
 623        my( $parseinst, $data ) = @_;
 624        if (($tag eq "Gene-ref_locus")) {
 625               $data =~ s/\n|\t//g;
 626 	      $primaryName=$data;
 627 	      $tag = "";
 628        }
 629        if (($tag eq "Gene-ref_syn_E")) {
 630               $data =~ s/\n|\t//g;
 631 	      push(@aliases,$data) ;
 632 	      $tag = "";
 633        }
 634 	if (($tag eq "Prot-ref_name_E")) {
 635               $data =~ s/\n|\t//g;
 636 	      push(@alternate_names,$data) ;
 637 	      $tag = "";
 638        }
 639 }
 640 
 641 sub default {
 642        my( $parseinst, $data ) = @_;
 643        # you could do something here
 644 }
 645 
 646 
 647 sub usage {
 648     print <<EOF
 649 USAGE:
 650     ./query_entrez.pl --exclusions|-e exc.txt --genes|-g genes.txt --outputdir|-o outputdir --outputfile|-f outputfile
 651 
 652 DESCRIPTION:
 653     Given a list of genes (and exclusion terms) query entrez gene to get all the aliases
 654     of the given gene.  Construct a pubmed query with the gene name and all aliases (excluding
 655     any term in the exclusion list) and query pubmed for the years 1980-2010.
 656 
 657 OPTIONS:
 658     --genes, -g 
 659 	  The path to a tab-delimited file, where
 660 		1st column = gene name
 661     --exclusions, -e
 662         The path to a tab-delimited file, where
 663             1st column = term      # corresponding to a term/name/alias to excluded from the pubmed search
 664             
 665     --outputdir, -o
 666 	  The path to output the results to.  If doesn't exist a new
 667         file will be created. 
 668 
 669 	--outputfile, -f
 670 	  The name fo the output file.  If doesn't exist a new
 671         file will be created. 
 672 
 673 
 674 EOF
 675 }
Attached Files

To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.
[get | view] (2011-04-28 15:00:28, 35.4 KB) [[attachment:IonChannels.xlsx]]
[get | view] (2011-04-28 15:00:28, 64.9 KB) [[attachment:Kinases.xlsx]]
[get | view] (2011-04-28 15:00:28, 37.5 KB) [[attachment:MethylTransferases.xls]]
[get | view] (2011-04-28 15:00:28, 25.2 KB) [[attachment:NuclearHormoneReceptors.xlsx]]
[get | view] (2011-04-28 15:00:28, 5269.2 KB) [[attachment:SupplementaryFigure1_highqualityProbes.pdf]]
[get | view] (2011-04-28 15:00:28, 2785.3 KB) [[attachment:SupplementaryFigure2_unpublishedProbes.pdf]]
[get | view] (2011-04-28 15:00:28, 4291.4 KB) [[attachment:SupplementaryFigure3_noProbes.pdf]]
[get | view] (2011-05-16 18:28:24, 0.0 KB) [[attachment:exclusionlist.txt]]
[get | view] (2011-04-28 15:00:28, 192.4 KB) [[attachment:figure1_NR_19502009.pdf]]
[get | view] (2011-04-28 15:00:28, 186.9 KB) [[attachment:figure2_kinases_19502009.pdf]]
[get | view] (2011-04-28 15:00:28, 194.9 KB) [[attachment:figure3_NR_19501995_vs2009.pdf]]
[get | view] (2011-04-28 15:00:28, 249.5 KB) [[attachment:figure4_NR_2009_probes.pdf]]
[get | view] (2011-04-28 15:00:28, 202.3 KB) [[attachment:figure5_Ionchannels_19502009_vs2009.pdf]]
[get | view] (2011-05-16 18:27:34, 0.3 KB) [[attachment:geneids.txt]]
[get | view] (2011-05-16 18:23:07, 22.3 KB) [[attachment:query_pubmed.pl]]
All files | Selected Files: delete move to page
You are not allowed to attach a file to this page.