Attachment 'query_pubmed.pl'
Download 1 #!/usr/local/bin/perl -w
2 # ===========================================================================
3 # Generic tool that, given a list of genes (and excluded terms), searches
4 # Entrez Gene for all aliases of each gene and then queries PubMed to see how
5 # the number of publications has changed over time.
6 # ===========================================================================
7 use LWP::Simple;
8 use XML::Parser;
9 use XML::Simple; # module to parse XML data which is the response to the query
10 use Data::Dumper; # module for visualization of parsed XML data; not obligatory for function but was used for development
11
12 use Getopt::Long; # to parse the command line options.
13
# ---------------------------------------------------------------------------
# Configuration for the NCBI E-utilities requests and command line parsing.
# ---------------------------------------------------------------------------

# Suffix appended to every gene symbol when searching Entrez Gene: restrict the
# match to the symbol field and to mouse or human records.
my $query_ending = "[sym] AND (\"mus musculus\"[Organism] OR \"homo sapiens\"[Organism])";

# Format requested from efetch for Gene records.
my $report = "xml";

# Contact details sent with every request.
my $email = "ruth.isserlin\@utoronto.ca";
my $tool = "publication_distribution";

# Base url for the ncbi e-utils.  NCBI only serves the E-utilities over HTTPS,
# so the former plain-http address no longer works.
my $utils = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils";

# The different queries; the search term is appended to each of these.
my $esearch_pubmed = "$utils/esearch.fcgi?" .
    "db=Pubmed&usehistory=n&email=$email&tool=$tool&term=";

my $esearch_gene = "$utils/esearch.fcgi?" .
    "db=Gene&usehistory=y&email=$email&tool=$tool&term=";

# If you want to limit the pubmed search to anything (for example [Title])
# add it to constraints.
my $constraints = "";

# Variables to hold command line arguments.
my (
    $genes_filename,
    $exclusions_filename,
    $output_dirname,
    $output_filename,
    $restrict,
);

# Unknown or malformed options make GetOptions return false; show the help
# text instead of carrying on with half-parsed arguments.
GetOptions(
    "genes|g=s"      => \$genes_filename,
    "exclusions|e=s" => \$exclusions_filename,
    "outputdir|o=s"  => \$output_dirname,
    "outputfile|f=s" => \$output_filename,
    "restrict|r=s"   => \$restrict,
) or do { usage(); exit 1; };

# Everything except --restrict is needed to open the input and output files.
{
    my %value_of = (
        genes      => $genes_filename,
        exclusions => $exclusions_filename,
        outputdir  => $output_dirname,
        outputfile => $output_filename,
    );
    my @missing = grep { !defined $value_of{$_} || !length $value_of{$_} }
                  sort keys %value_of;
    if (@missing) {
        print STDERR "Missing required option(s): ",
            join(", ", map { "--$_" } @missing), "\n";
        usage();
        exit 1;
    }
}
50
use strict;
use warnings;

# ===========================================================================
# Main flow
#   1. read the exclusion list and the gene list (first tab-delimited column)
#   2. for every gene, look up its Entrez Gene records and collect the primary
#      name, the aliases and the alternate (protein) names through the
#      XML::Parser handler subs defined further down in this file
#   3. keep only the names that are not excluded and that PubMed recognises as
#      a quoted phrase, and build one PubMed query from them
#   4. store the matching PMIDs and count publications per period / per year
# ===========================================================================

# Package globals shared with the XML::Parser handler subs.
our ($tag, $primaryName, @aliases, @alternate_names);

# Pause before every request; NCBI asks clients without an API key to stay
# at or below three requests per second.
my $REQUEST_DELAY = 0.34;

# Number of PMIDs requested per efetch call.
my $PMID_PAGE_SIZE = 5000;

# Multi-year periods that are queried directly ...
my @range_spans = ('1950:1979', '1980:1984', '1985:1989', '1990:1994', '1995:2000');
# ... and the individual years.  2001-2005 and 2006-2010 are reported as the
# sum of their single-year counts.
my @single_years = (1980 .. 2010);

# Extra search terms for genes whose common names are not listed in Entrez.
my %extra_terms_for = (
    NR2B1 => [ "\"RXR alpha\"", "\"RXRalpha\"" ],
    NR2B2 => [ "\"RXR beta\"",  "\"RXRbeta\"" ],
    NR2B3 => [ "\"RXR gamma\"", "\"RXRgamma\"" ],
    NR1A2 => [ "(\"thyroid hormone receptor\" AND \"beta\")" ],
    NR1A1 => [ "(\"thyroid hormone receptor\" AND \"alpha\")" ],
);

# PubMed search that stores its result on the history server, so that the
# PMIDs can be fetched with the query key / WebEnv of that very search.
my $esearch_pubmed_history = "$utils/esearch.fcgi?" .
    "db=Pubmed&usehistory=y&email=$email&tool=$tool&term=";

# open the output file
my $output_path = "$output_dirname/$output_filename";
open(my $pubs_fh, '>', $output_path)
    or die "Error opening $output_path : $!\n";

# print Header
print {$pubs_fh} join("\t",
    'Initial Gene Query', 'Primary Name', 'Aliases',
    (map { (my $label = $_) =~ tr/:/-/; $label } @range_spans),
    '2001-2005', '2006-2010',
    @single_years,
    'Pubmed Query',
), "\n";

# Names to exclude.  There are aliases that can bring back loads of pubmed
# hits, for example gene name "MR", that you might want to keep out of the
# pubmed search.
my %is_excluded = map { $_ => 1 } _read_first_column($exclusions_filename);

# The gene symbols to process.
my @genes = _read_first_column($genes_filename);

# Directory inside the output directory that receives one PMID file per gene.
my $pmid_dir = "$output_dirname/PMIDS";
if (!-d $pmid_dir) {
    mkdir $pmid_dir or die "Error creating $pmid_dir : $!\n";
}

for my $gene (@genes) {
    print "$gene\n";

    # ---- search entrez for aliases ---------------------------------------
    my ($hit_count, $query_key, $web_env) = (0, undef, undef);
    my $gene_search = _fetch($esearch_gene . _escape_term($gene . $query_ending));
    if (defined $gene_search
        && $gene_search =~ m|<Count>(\d+)</Count>.*?<QueryKey>(\d+)</QueryKey>.*?<WebEnv>([^<\s]+)</WebEnv>|s)
    {
        ($hit_count, $query_key, $web_env) = ($1, $2, $3);
    }

    @aliases         = ();
    @alternate_names = ();
    my $primary_name = "";

    # go through all records returned by Entrez one at a time, there could be
    # one mouse and one human
    for (my $start = 0; $start < $hit_count; $start++) {
        my $record = _fetch("$utils/efetch.fcgi?" .
            "retmode=$report&retstart=$start&retmax=1&" .
            "db=Gene&query_key=$query_key&WebEnv=$web_env");
        next unless defined $record;

        $tag         = "";
        $primaryName = "";

        my $parser = XML::Parser->new(ErrorContext => 2);
        $parser->setHandlers(
            Start   => \&startElement,
            End     => \&endElement,
            Char    => \&characterData,
            Default => \&default,
        );

        if (!eval { $parser->parse($record); 1 }) {
            warn "Could not parse Entrez record $start for $gene: $@\n";
            next;
        }

        # The last record that carries a primary name wins.
        $primary_name = $primaryName
            if defined $primaryName && length $primaryName;
    }

    if (!length $primary_name) {
        warn "No primary name found for $gene; searching for the symbol itself\n";
        $primary_name = $gene;
    }

    # ---- construct the pubmed query --------------------------------------
    # A name is only added when it is not in the exclusion list and PubMed
    # knows it as a quoted phrase; otherwise PubMed would expand the phrase
    # into unrelated terms and inflate the counts.
    my %seen         = ($primary_name => 1);
    my @symbol_terms = ("\"$primary_name\"$constraints");
    my @name_terms;

    for my $alias (@aliases) {
        next if !length $alias || $seen{$alias}++ || $is_excluded{$alias};
        my $phrase = "\"$alias\"$constraints";
        push @symbol_terms, $phrase
            if _phrase_is_known($esearch_pubmed, $phrase);
    }

    for my $alternate (@alternate_names) {
        next if !length $alternate || $seen{$alternate}++ || $is_excluded{$alternate};
        my $phrase = "\"$alternate\"$constraints";
        push @name_terms, $phrase
            if _phrase_is_known($esearch_pubmed, $phrase);
    }

    push @name_terms, map { $_ . $constraints } @{ $extra_terms_for{$gene} || [] };

    # Gene names and alternate names form two groups.  For a more restrictive
    # search change the ' OR ' between the groups to ' AND '.
    my @groups = map { '(' . join(' OR ', @$_) . ')' }
                 grep { @$_ } (\@symbol_terms, \@name_terms);
    my $pubmed_query = '(' . join(' OR ', @groups) . ')';

    if (defined $restrict && length $restrict) {
        $pubmed_query .= " AND \"$restrict\"$constraints";
    }

    # ---- PMIDs of the search without any year restriction ----------------
    my @pmids;
    my $all_years = _fetch($esearch_pubmed_history . _escape_term($pubmed_query));
    if (defined $all_years
        && $all_years =~ m|<Count>(\d+)</Count>.*?<QueryKey>(\d+)</QueryKey>.*?<WebEnv>([^<\s]+)</WebEnv>|s)
    {
        my ($total, $pubmed_key, $pubmed_env) = ($1, $2, $3);
        for (my $start = 0; $start < $total; $start += $PMID_PAGE_SIZE) {
            my $page = _fetch("$utils/efetch.fcgi?" .
                "db=pubmed&rettype=uilist&retmode=text" .
                "&retstart=$start&retmax=$PMID_PAGE_SIZE" .
                "&WebEnv=$pubmed_env&query_key=$pubmed_key");
            last unless defined $page;
            push @pmids, grep { length } map { s/\s+//g; $_ } split(/\n/, $page);
        }
    }
    else {
        warn "Could not retrieve the PMID list for $gene\n";
    }

    # one file per gene, named after the gene of the main search
    (my $file_stem = $gene) =~ s{[/\\]}{_}g;    # keep the file inside PMIDS
    my $pmid_path = "$pmid_dir/$file_stem.txt";
    if (open(my $pmid_fh, '>', $pmid_path)) {
        print {$pmid_fh} "PMID($_) OR \n" for @pmids;
        close($pmid_fh) or warn "Error closing $pmid_path : $!\n";
    }
    else {
        warn "Error opening $pmid_path : $!\n";
    }

    ##########################################
    # Do pubmed searches for the multi-year periods
    # and individually for year 1980-2010
    ##########################################
    my @range_counts;
    for my $span (@range_spans) {
        push @range_counts,
            scalar _pubmed_count($esearch_pubmed, $pubmed_query . " AND " . $span . "[dp]");
    }

    my %count_of;
    for my $year (@single_years) {
        $count_of{$year} =
            _pubmed_count($esearch_pubmed, $pubmed_query . " AND " . $year . "[dp]");
    }

    my $count_2001_2005 = _sum_counts(@count_of{2001 .. 2005});
    my $count_2006_2010 = _sum_counts(@count_of{2006 .. 2010});

    print {$pubs_fh} join("\t",
        $gene, $primary_name, join(' ', @aliases),
        (map { defined $_ ? $_ : 'NA' }
            @range_counts, $count_2001_2005, $count_2006_2010,
            @count_of{@single_years}),
        $pubmed_query,
    ), "\n";
}

close($pubs_fh) or die "Error closing $output_path : $!\n";

# ---------------------------------------------------------------------------
# Helpers (private to the main flow above)
# ---------------------------------------------------------------------------

# Return the first tab-delimited column of every non-empty line of $path.
# Dies when the file cannot be opened.
sub _read_first_column {
    my ($path) = @_;
    open(my $in_fh, '<', $path) or die "Error opening $path : $!\n";
    my @values;
    while (my $line = <$in_fh>) {
        $line =~ s/[\r\n]+\z//;
        my ($first) = split(/\t/, $line);
        next unless defined $first;
        $first =~ s/^\s+|\s+$//g;
        push @values, $first if length $first;
    }
    close($in_fh);
    return @values;
}

# Percent-encode $term so it can be appended to a url as a query value.
sub _escape_term {
    my ($term) = @_;
    utf8::encode($term) if utf8::is_utf8($term);    # encode bytes, not code points
    $term =~ s/([^A-Za-z0-9\-_.~])/sprintf('%%%02X', ord($1))/ge;
    return $term;
}

# Replace the five predefined XML entities in $text by their characters.
sub _decode_entities {
    my ($text) = @_;
    my %char_for = (quot => '"', apos => "'", lt => '<', gt => '>', amp => '&');
    $text =~ s/&(quot|apos|lt|gt|amp);/$char_for{$1}/g;
    return $text;
}

# GET $url after a short pause.  Returns the body, or undef (with a warning)
# when the request failed.
sub _fetch {
    my ($url) = @_;
    select(undef, undef, undef, $REQUEST_DELAY);
    my $body = get($url);
    warn "No response from $url\n" unless defined $body;
    return $body;
}

# Run $term against the esearch url $base_url and return the number of hits,
# or nothing when the request failed or the answer carried no <Count>.
sub _pubmed_count {
    my ($base_url, $term) = @_;
    my $answer = _fetch($base_url . _escape_term($term));
    return unless defined $answer && $answer =~ m|<Count>(\d+)</Count>|;
    return $1;
}

# True unless PubMed reports $phrase in a <QuotedPhraseNotFound> element, i.e.
# unless it would expand the phrase instead of searching for it literally.
# A failed request counts as "known" so that a network hiccup does not drop
# a name from the query.
sub _phrase_is_known {
    my ($base_url, $phrase) = @_;
    my $answer = _fetch($base_url . _escape_term($phrase));
    return 1 unless defined $answer;
    while ($answer =~ m|<QuotedPhraseNotFound>(.*?)</QuotedPhraseNotFound>|sg) {
        my $reported = $1;
        return 0 if _decode_entities($reported) eq $phrase;
    }
    return 1;
}

# Sum of the given counts, or nothing when any of them is missing.
sub _sum_counts {
    my @counts = @_;
    return if grep { !defined } @counts;
    my $total = 0;
    $total += $_ for @counts;
    return $total;
}
588
# Methods used by XML parser to get the fields of interest.
#   Gene-ref_locus  = primary name (assigned to $primaryName); each record
#                     should only contain one primary name
#   Gene-ref_syn_E  = aliases (pushed onto @aliases)
#   Prot-ref_name_E = alternate names (pushed onto @alternate_names)
#
# XML::Parser may hand the text of one element to the Char handler in several
# pieces, so the text is collected in $text_buffer between the start and the
# end tag and only stored once the end tag has been seen.

# Start handler: remember which element of interest has been opened and start
# with an empty text buffer.  Other elements leave the state untouched.
sub startElement {
    my ($parseinst, $element, %attrs) = @_;
    our ($tag, $text_buffer);

    if (   $element eq "Gene-ref_locus"
        || $element eq "Gene-ref_syn_E"
        || $element eq "Prot-ref_name_E")
    {
        $tag         = $element;
        $text_buffer = "";
    }
    return;
}

# End handler: when the element being collected is closed, strip newlines,
# tabs and surrounding blanks from its text and store a non-empty result in
# the matching global.
sub endElement {
    my ($parseinst, $element) = @_;
    our ($tag, $text_buffer, $primaryName, @aliases, @alternate_names);

    return unless defined $tag && length $tag && $tag eq $element;

    my $text = defined $text_buffer ? $text_buffer : "";
    $text =~ s/\n|\t//g;
    $text =~ s/^ +| +$//g;

    if (length $text) {
        if ($element eq "Gene-ref_locus") {
            $primaryName = $text;
        }
        elsif ($element eq "Gene-ref_syn_E") {
            push(@aliases, $text);
        }
        elsif ($element eq "Prot-ref_name_E") {
            push(@alternate_names, $text);
        }
    }

    $tag         = "";
    $text_buffer = "";
    return;
}

# Char handler: append the text to the buffer while an element of interest is
# open, ignore it otherwise.
sub characterData {
    my ($parseinst, $data) = @_;
    our ($tag, $text_buffer);

    return unless defined $tag && length $tag;
    $text_buffer = "" unless defined $text_buffer;
    $text_buffer .= $data;
    return;
}

# Default handler: everything the other handlers do not cover is ignored.
sub default {
    my ($parseinst, $data) = @_;
    # you could do something here
    return;
}
645
646
# Print the command line help to STDOUT.  Takes no arguments and does not
# exit; the caller decides what happens next.
sub usage {
    print <<EOF;
USAGE:
    ./query_pubmed.pl --genes|-g genes.txt --exclusions|-e exc.txt
                      --outputdir|-o outputdir --outputfile|-f outputfile
                      [--restrict|-r term]

DESCRIPTION:
    Given a list of genes (and exclusion terms) query entrez gene to get all the aliases
    of the given gene. Construct a pubmed query with the gene name and all aliases (excluding
    any term in the exclusion list) and query pubmed for the years 1950-2010.

OPTIONS:
    --genes, -g
        The path to a tab-delimited file, where
            1st column = gene name

    --exclusions, -e
        The path to a tab-delimited file, where
            1st column = term/name/alias to be excluded from the pubmed search

    --outputdir, -o
        The directory to write the results to. A sub-directory called PMIDS
        holding one PMID list per gene is created inside it.

    --outputfile, -f
        The name of the output file inside the output directory. If it doesn't
        exist a new file will be created.

    --restrict, -r
        A term that is added, as a quoted phrase combined with AND, to every
        pubmed query.

EOF
    return;
}
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.You are not allowed to attach a file to this page.