import java.io.IOException; import java.io.InputStreamReader; import java.io.Reader; import java.net.URL; import java.net.URLEncoder; import java.text.Collator; import java.util.HashMap; import java.util.Locale; import java.util.TreeMap; import java.util.TreeSet; import java.util.Vector; import javax.xml.stream.XMLEventReader; import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamException; import javax.xml.stream.events.XMLEvent; /** * NCBIMap * @author Pierre Lindenbaum plindenbaum@yahoo.fr * */ public class NCBIMap { static final String Countries[][]= { {"ac","Ascension Island"}, {"ad","Andorra"}, {"ae","United Arab Emirates"}, {"af","Afghanistan"}, {"ag","Antigua and Barbuda"}, {"ai","Anguilla"}, {"al","Albania"}, {"am","Armenia"}, {"an","Netherlands Antilles"}, {"ao","Angola"}, {"aq","Antarctica"}, {"ar","Argentina"}, {"as","American Samoa"}, {"at","Austria"}, {"au","Australia"}, {"aw","Aruba"}, {"ax","Aland Islands"}, {"az","Azerbaijan"}, {"ba","Bosnia and Herzegovina"}, {"bb","Barbados"}, {"bd","Bangladesh"}, {"be","Belgium"}, {"bf","Burkina Faso"}, {"bg","Bulgaria"}, {"bh","Bahrain"}, {"bi","Burundi"}, {"bj","Benin"}, {"bm","Bermuda"}, {"bn","Brunei Darussalam"}, {"bo","Bolivia"}, {"br","Brazil"}, {"bs","Bahamas"}, {"bt","Bhutan"}, {"bv","Bouvet Island"}, {"bw","Botswana"}, {"by","Belarus"}, {"bz","Belize"}, {"ca","Canada"}, {"cc","Cocos (Keeling) Islands"}, {"cd","Congo, The Democratic Republic of the"}, {"cf","Central African Republic"}, {"cg","Congo, Republic of"}, {"ch","Switzerland"}, {"ci","Cote d'Ivoire"}, {"ck","Cook Islands"}, {"cl","Chile"}, {"cm","Cameroon"}, {"cn","China"}, {"co","Colombia"}, {"cr","Costa Rica"}, {"cu","Cuba"}, {"cv","Cape Verde"}, {"cx","Christmas Island"}, {"cy","Cyprus"}, {"cz","Czech Republic"}, {"de","Germany"}, {"dj","Djibouti"}, {"dk","Denmark"}, {"dm","Dominica"}, {"do","Dominican Republic"}, {"dz","Algeria"}, {"ec","Ecuador"}, {"ee","Estonia"}, {"eg","Egypt"}, {"eh","Western Sahara"}, {"er","Eritrea"}, {"es","Spain"}, {"et","Ethiopia"}, {"eu","European Union"}, {"fi","Finland"}, {"fj","Fiji"}, {"fk","Falkland Islands (Malvinas)"}, {"fm","Micronesia, Federated States of"}, {"fo","Faroe Islands"}, {"fr","France"}, {"ga","Gabon"}, {"gb","United Kingdom"}, {"gd","Grenada"}, {"gf","French Guiana"}, {"gg","Guernsey"}, {"gh","Ghana"}, {"gi","Gibraltar"}, {"gl","Greenland"}, {"gm","Gambia"}, {"gn","Guinea"}, {"gp","Guadeloupe"}, {"gq","Equatorial Guinea"}, {"gr","Greece"}, {"gs","South Georgia and the South Sandwich Islands"}, {"gt","Guatemala"}, {"gu","Guam"}, {"gw","Guinea-Bissau"}, {"gy","Guyana"}, {"hk","Hong Kong"}, {"hm","Heard and McDonald Islands"}, {"hn","Honduras"}, {"hr","Croatia/Hrvatska"}, {"ht","Haiti"}, {"hu","Hungary"}, {"id","Indonesia"}, {"ie","Ireland"}, {"il","Israel"}, {"im","Isle of Man"}, {"in","India"}, {"io","British Indian Ocean Territory"}, {"iq","Iraq"}, {"ir","Iran, Islamic Republic of"}, {"ir","Iran"}, {"is","Iceland"}, {"it","Italy"}, {"je","Jersey"}, {"jm","Jamaica"}, {"jo","Jordan"}, {"jp","Japan"}, {"ke","Kenya"}, {"kg","Kyrgyzstan"}, {"kh","Cambodia"}, {"ki","Kiribati"}, {"km","Comoros"}, {"kn","Saint Kitts and Nevis"}, {"kp","Korea, Democratic People's Republic"}, {"kr","Korea, Republic of"}, {"kr","South Korea"}, {"kw","Kuwait"}, {"ky","Cayman Islands"}, {"kz","Kazakhstan"}, {"la","Lao People's Democratic Republic"}, {"lb","Lebanon"}, {"lc","Saint Lucia"}, {"li","Liechtenstein"}, {"lk","Sri Lanka"}, {"lr","Liberia"}, {"ls","Lesotho"}, {"lt","Lithuania"}, {"lu","Luxembourg"}, {"lv","Latvia"}, {"ly","Libyan Arab Jamahiriya"}, {"ly","Libya"}, {"ma","Morocco"}, {"mc","Monaco"}, {"md","Moldova, Republic of"}, {"me","Montenegro"}, {"mg","Madagascar"}, {"mh","Marshall Islands"}, {"mk","Macedonia, The Former Yugoslav Republic of"}, {"ml","Mali"}, {"mm","Myanmar"}, {"mn","Mongolia"}, {"mo","Macao"}, {"mp","Northern Mariana Islands"}, {"mq","Martinique"}, {"mr","Mauritania"}, {"ms","Montserrat"}, {"mt","Malta"}, {"mu","Mauritius"}, {"mv","Maldives"}, {"mw","Malawi"}, {"mx","Mexico"}, {"mx","méxico"}, {"my","Malaysia"}, {"mz","Mozambique"}, {"na","Namibia"}, {"nc","New Caledonia"}, {"ne","Niger"}, {"nf","Norfolk Island"}, {"ng","Nigeria"}, {"ni","Nicaragua"}, {"nl","Netherlands"}, {"no","Norway"}, {"np","Nepal"}, {"nr","Nauru"}, {"nu","Niue"}, {"nz","New Zealand"}, {"om","Oman"}, {"pa","Panama"}, {"pe","Peru"}, {"pf","French Polynesia"}, {"pg","Papua New Guinea"}, {"ph","Philippines"}, {"pk","Pakistan"}, {"pl","Poland"}, {"pm","Saint Pierre and Miquelon"}, {"pn","Pitcairn Island"}, {"pr","Puerto Rico"}, {"ps","Palestinian Territory, Occupied"}, {"pt","Portugal"}, {"pw","Palau"}, {"py","Paraguay"}, {"qa","Qatar"}, {"re","Reunion Island"}, {"ro","Romania"}, {"rs","Serbia"}, {"ru","Russian Federation"}, {"rw","Rwanda"}, {"sa","Saudi Arabia"}, {"sb","Solomon Islands"}, {"sc","Seychelles"}, {"sd","Sudan"}, {"se","Sweden"}, {"sg","Singapore"}, {"sh","Saint Helena"}, {"si","Slovenia"}, {"sj","Svalbard and Jan Mayen Islands"}, {"sk","Slovak Republic"}, {"sl","Sierra Leone"}, {"sm","San Marino"}, {"sn","Senegal"}, {"so","Somalia"}, {"sr","Suriname"}, {"st","Sao Tome and Principe"}, {"su","Soviet Union (being phased out)"}, {"sv","El Salvador"}, {"sy","Syrian Arab Republic"}, {"sz","Swaziland"}, {"tc","Turks and Caicos Islands"}, {"td","Chad"}, {"tf","French Southern Territories"}, {"tg","Togo"}, {"th","Thailand"}, {"tj","Tajikistan"}, {"tk","Tokelau"}, {"tl","Timor-Leste"}, {"tm","Turkmenistan"}, {"tn","Tunisia"}, {"to","Tonga"}, {"tp","East Timor"}, {"tr","Turkey"}, {"tt","Trinidad and Tobago"}, {"tv","Tuvalu"}, {"tw","Taiwan"}, {"tz","Tanzania"}, {"ua","Ukraine"}, {"ug","Uganda"}, {"uk","United Kingdom"}, {"um","United States Minor Outlying Islands"}, {"us","United States"}, {"gov","United States"}, {"uy","Uruguay"}, {"uz","Uzbekistan"}, {"va","Holy See (Vatican City State)"}, {"va","Vatican"}, {"vc","Saint Vincent and the Grenadines"}, {"ve","Venezuela"}, {"vg","Virgin Islands, British"}, {"vi","Virgin Islands, U.S."}, {"vn","Vietnam"}, {"vu","Vanuatu"}, {"wf","Wallis and Futuna Islands"}, {"ws","Samoa"}, {"ye","Yemen"}, {"yt","Mayotte"}, {"yu","Yugoslavia"}, {"za","South Africa"}, {"zm","Zambia"}, {"zw","Zimbabwe"}, {"ge","Georgia"}, {"uk"," UK."}, {"uk"," UK,"}, {"uk"," UK,"}, {"us"," england."}, {"us"," u.s.a."}, {"us"," usa,"}, {"us"," usa."}, {"us",",usa,"}, {"us"," new york"}, {"ru","Russia"}, {"br","Brasil"}, {"es","españa"}, {"us","stanford"}, {"us","cornell"}, {"us","san-francisco"}, {"us","san francisco"}, {"us","calfornia"}, {"us","boston"},{"us","atlanta"},{"us","chicago"}, {"fr","cedex"}, {"kr","Korea"} }; private TreeMap> country2year2count= new TreeMap>(); private TreeSet allYears= new TreeSet(); /** max number or pubmed entries to return */ private int max_return=500; /** are we debugging */ private boolean debugging=false; /** * got some problem with the DTD of the NCBI. This reader ignores the second line * of the returned XML * @author pierre * */ private class IgnoreLine2 extends Reader { Reader delegate; boolean found=false; IgnoreLine2(Reader delegate) { this.delegate=delegate; } @Override public int read() throws IOException { int c= this.delegate.read(); if(c==-1) return c; if(c=='\n' && !found) { while((c=this.delegate.read())!=-1) { if(c=='\n') break; } found=true; return this.read(); } return c; } @Override public int read(char[] cbuf, int off, int len) throws IOException { if(found) return this.delegate.read(cbuf,off,len); int i=0; while(i year2count=this.country2year2count.get(prefix); if(year2count==null) { year2count= new HashMap(); this.country2year2count.put(prefix,year2count); } Integer count= year2count.get(year); if(count==null) count=0; year2count.put(year,count+1); this.allYears.add(year); } /** do our stuff */ private void run(String term) throws IOException,XMLStreamException { URL url= new URL( "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term="+ URLEncoder.encode(term, "UTF-8")+ "&retstart=0&retmax="+max_return+"&usehistory=y&retmode=xml&email=plindenbaum_at_yahoo.fr&tool=gis"); XMLEventReader reader= newReader(url); XMLEvent evt; String QueryKey=null; String WebEnv=null; int countId=0; while(!(evt=reader.nextEvent()).isEndDocument()) { if(!evt.isStartElement()) continue; String tag= evt.asStartElement().getName().getLocalPart(); if(tag.equals("QueryKey")) { QueryKey= reader.getElementText().trim(); } else if(tag.equals("WebEnv")) { WebEnv= reader.getElementText().trim(); } else if(tag.equals("Id")) { countId++; } } reader.close(); debug(countId); if(countId==0) return; url= new URL("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&WebEnv="+ URLEncoder.encode(WebEnv,"UTF-8")+ "&query_key="+URLEncoder.encode(QueryKey,"UTF-8")+ "&retmode=xml&retmax="+max_return+"&email=plindenbaum_at_yahoo.fr&tool=gis"); reader= newReader(url); String Affiliation=null; Integer year=null; this.allYears.clear(); this.country2year2count.clear(); countId=0; while(!(evt=reader.nextEvent()).isEndDocument()) { if(evt.isStartElement()) { String tag= evt.asStartElement().getName().getLocalPart(); if(tag.equals("Affiliation")) { Affiliation=reader.getElementText().trim().toLowerCase(); } else if(tag.contains("Year")) { year= Integer.parseInt(reader.getElementText().trim()); } } else if(evt.isEndElement()) { String tag= evt.asEndElement().getName().getLocalPart(); if(tag.equals("PubmedArticle")) { if( Affiliation!=null) { Affiliation=Affiliation.replaceAll("\\([ ]*at[ ]*\\)", "@"); boolean found=false; for(String mail: Affiliation.split("[ \t\\:\\<,\\>\\(\\)]")) { mail.replaceAll("\\{\\}", ""); if(mail.endsWith(".")) mail= mail.substring(0,mail.length()-1); int index=mail.indexOf('@'); if(index==-1) continue; int i= mail.lastIndexOf('.'); if(i==-1) continue; String suffix= mail.substring(i+1); for(i=0;i< Countries.length;++i) { if(suffix.equals(Countries[i][0])) { found=true; addRecord(Countries[i][0],year); //System.out.println(mail+"\t"+year+"\t"+Countries[i][0]+"\t"+Countries[i][1]); break; } } } if(!found) { for(int i=0;i< Countries.length;++i) { if(Affiliation.contains(Countries[i][1].toLowerCase())) { found=true; addRecord(Countries[i][0],year); //System.out.println(Affiliation+"\t"+year+"\t"+Countries[i][0]+"\t"+Countries[i][1]); break; } } } if(!found) { debug(Affiliation); } } else { debug("No Aff "+Affiliation); } year=null; Affiliation=null; } } } reader.close(); System.out.print("Country"); for(Integer y:allYears) System.out.print("\t"+y); System.out.println("\tTotal"); int total_of_total=0; for(String prefix:this.country2year2count.keySet()) { int total=0; HashMap year2count= this.country2year2count.get(prefix); System.out.print(prefix.toUpperCase()); for(Integer y: allYears) { Integer c= year2count.get(y); if(c==null) c=0; System.out.print("\t"+c); total+=c; } System.out.println("\t"+total); total_of_total+=total; } debug(total_of_total); } private void debug(Object o) { if(!debugging) return; System.err.println(o); } /** * @param args */ public static void main(String[] args) { NCBIMap prg= new NCBIMap(); //args=new String[]{"-n","1000","-d","Bioinformatics[Journal]"}; int optind=0; while(optind