Revision as of 20:35, 24 July 2011 edit Boghog (talk \| contribs) Autopatrolled, Extended confirmed users, IP block exemptions, New page reviewers, Pending changes reviewers, Rollbackers, Template editors 142,840 edits updated to include code to generate the {{drugs.com}} template ← Previous edit		Revision as of 09:04, 28 July 2011 edit undo Boghog (talk \| contribs) Autopatrolled, Extended confirmed users, IP block exemptions, New page reviewers, Pending changes reviewers, Rollbackers, Template editors 142,840 edits updated script to fix bugs for special cases Next edit →
Line 7: # \| Drugs.com = <!-- link to Drugs.com monograph, e.g., "lisinopril" that links to "http://www.drugs.com/monograph/lisinopril.html" --> # \| MedlinePlus = <!-- MedlinePlus drug accession number, e.g., "a692051" that links to "http://www.nlm.nih.gov/medlineplus/druginfo/meds/a692051.html" --> # In addition, populate KEGG_Drug_ID, ChemSpider_ID, and PubChem_Compound_ID fields if missing with data from http://www.drugbank.ca/system/downloads/current/drugbank.txt.zip # The tradnames were obtained from http://www.merckmanuals.com/home/drugnames-index/trade/a.html # Finally the script sorts the fields in the order they are currently rendered by the drugbox template # (in the order of clinical, pharmacokinetic, identifiers, and chemical data) """{{Drugbox \| Watchedfields \| verifiedrevid = 408577806 \| IUPAC_name = \| OtherNames = \| image = \| width = Line 124 ⟶ 130: }}""" ~~import re~~ ~~import string~~ ~~import wikipedia~~ from collections import defaultdict import ~~urllib~~codecs import csv import re import string import urllib import wikipedia user = "BogBot" Line 137 ⟶ 143: regexp_ab = re.compile(r'\{\{(nobots\|bots\\|(allow=none\|deny=.?' + user + r'.?\|optout=all\|deny=all))\}\}') regexp_drug_infobox = re.compile(r"\{\{\s(Drugbox\|drugbox)\s(?P<PARAMS>.+)\s\}\}\s", re.DOTALL) regexp_param = re.compile(r"^\s?\\|\s?(?P<PARAM>\S+)\s?=\s?(?P<VALUE>.+)\s?($\|\\|)") regexp_nested_template = re.compile(r"\{\{(?P<PARAMS>.+)\}\}") # ATC_supplemental = {{ATC\|B01\|AC06}}, {{ATC\|N02\|BA01}} regexp_ATC_supplemental = re.compile(r"\\|\s?ATC_supplemental\s?=\s?(?P<TEMPLATE>.?)\s?($\|\\|)") # CASNo_Ref = {{cascite\|correct\|CAS}} regexp_CASNo_Ref = re.compile(r"\\|\s??CASNo_Ref\s?=\s?(?P<TEMPLATE>\{\{s(Cascite\|cascite).+?\}\})") # ChEMBL_Ref = {{ebicite\|correct\|EBI}} regexp_ChEMBL_Ref = re.compile(r"\\|\s??ChEMBL_Ref\s?=\s?(?P<TEMPLATE>\{\{s(Ebicite\|ebicite).+?\}\})") # ChemSpiderID_Ref = {{chemspidercite\|correct\|chemspider}} regexp_ChemSpiderID_Ref = re.compile(r"\\|\s??ChemSpiderID_Ref\s?=\s?(?P<TEMPLATE>\{\{s(Chemspidercite\|chemspidercite).+?\}\})") # Drugs.com = {{drugs.com\|monograph\|lisinopril}} regexp_Drugs_com = re.compile(r"\\|\s??Drugs\.com\s?=\s?(?P<TEMPLATE>\{\{s(Drugs\.com\|drugs\.com).+?\}\})") # KEGG_Ref = {{keggcite\|correct\|kegg}} regexp_KEGG_Ref = re.compile(r"\\|\s??KEGG_Ref\s?=\s?(?P<TEMPLATE>\{\{s(Keggcite\|keggcite).+?\}\})") # StdInChI_Ref = {{stdinchicite\|correct\|chemspider}} regexp_StdInChI_Ref = re.compile(r"\\|\s??StdInChI_Ref\s?=\s?(?P<TEMPLATE>\{\{s(Stdinchicite\|stdinchicite).+?\}\})") # StdInChIKey_Ref = {{stdinchicite\|correct\|chemspider}} regexp_StdInChIKey_Ref = re.compile(r"\\|\s??StdInChIKey_Ref\s?=\s?(?P<TEMPLATE>\{\{s(Stdinchicite\|stdinchicite).+?\}\})") # UNII_Ref = {{fdacite\|changed\|FDA}} regexp_UNII_Ref = re.compile(r"\\|\s??UNII_Ref\s?=\s?(?P<TEMPLATE>\{\{s(Fdacite\|fdacite).+?\}\})") # Included for bot exclusion compliance (see http://en.wikipedia.org/wiki/Template:Bots) Line 147 ⟶ 175: return True # list of articles to work on is generated by: "python pagegenerators.py -namespace:0 -transcludes:Drugbox > drugbox_titles.txt" ~~# articles = open('/Users/BogBot/progs/pywikipedia/drugbox/drugbox_titles.txt', 'r')~~ # articles = codecs.open('/Users/BogHog/progs/pywikipedia/drugbox/drugbox_titles.txt', mode = 'r', encoding='utf-8') # drugbank data obtained from http://www.drugbank.ca/system/downloads/current/drugbank.txt.zip drugbank_data = {} Line 154 ⟶ 184: # Name Trade_Names Drug_Type MedlinePlus Drugs.com_link KEGG_Drug_ID KEGG_Compound_ID ChemSpider_ID PubChem_Compound_ID drug_data = csv.reader(open('/Users/~~BogBot~~BogHog/progs/pywikipedia/drugbox/drug_links_agumented.csv', 'rU'), dialect='excel') # drugs.com root links: Line 170 ⟶ 200: # main loop new_drugbox = "" for article in articles: # article = article.rstrip('\n') ~~# INN = article~~ INN = ~~"Lisinopril"~~article # INN = "Asprin" # print article Line 195 ⟶ 228: result_drug_infobox = regexp_drug_infobox.search(text) if result_drug_infobox: # print "found it!" parameters = result_drug_infobox.group('PARAMS') current_parameters = {} # first extract and assign the nested templates result_ATC_supplemental = regexp_ATC_supplemental.search(parameters) if result_ATC_supplemental: template = result_ATC_supplemental.group('TEMPLATE') current_parameters['ATC_supplemental'] = template # print "found result_ATC_supplemental! ", template parameters = re.sub(regexp_ATC_supplemental, "\|", parameters) result_ChEMBL_Ref = regexp_ChEMBL_Ref.search(parameters) if result_ChEMBL_Ref: template = result_ChEMBL_Ref.group('TEMPLATE') current_parameters['ChEMBL_Ref'] = template # print "found result_ChEMBL_Ref! ", template parameters = re.sub(regexp_ChEMBL_Ref, "", parameters) result_ChemSpiderID_Ref = regexp_ChemSpiderID_Ref.search(parameters) if result_ChemSpiderID_Ref: template = result_ChemSpiderID_Ref.group('TEMPLATE') current_parameters['ChemSpiderID_Ref'] = template # print "found ChemSpiderID_Ref! ", template parameters = re.sub(regexp_ChemSpiderID_Ref, "", parameters) # print parameters result_Drugs_com = regexp_Drugs_com.search(parameters) if result_Drugs_com: template = result_Drugs_com.group('TEMPLATE') current_parameters['Drugs.com'] = template # print "found result_Drugs_com! ", template parameters = re.sub(regexp_Drugs_com, "", parameters) result_KEGG_Ref = regexp_KEGG_Ref.search(parameters) if result_KEGG_Ref: template = result_KEGG_Ref.group('TEMPLATE') current_parameters['KEGG_Ref'] = template # print "found KEGG_Ref! ", template parameters = re.sub(regexp_KEGG_Ref, "", parameters) result_StdInChI_Ref = regexp_StdInChI_Ref.search(parameters) if result_StdInChI_Ref: template = result_StdInChI_Ref.group('TEMPLATE') current_parameters['StdInChI_Ref'] = template # print "found StdInChI_Ref! ", template parameters = re.sub(regexp_StdInChI_Ref, "", parameters) result_StdInChIKey_Ref = regexp_StdInChIKey_Ref.search(parameters) if result_StdInChIKey_Ref: template = result_StdInChIKey_Ref.group('TEMPLATE') current_parameters['StdInChIKey_Ref'] = template # print "found StdInChIKey_Ref! ", template parameters = re.sub(regexp_StdInChIKey_Ref, "", parameters) result_UNII_Ref = regexp_UNII_Ref.search(parameters) if result_UNII_Ref: template = result_UNII_Ref.group('TEMPLATE') current_parameters['UNII_Ref'] = template # print "found UNII_Ref! ", template parameters = re.sub(regexp_UNII_Ref, "", parameters) # print "parameters:" # print parameters lines = parameters.splitlines() for line in lines: # print "index: ", line.count('\|') # if (line.count('\|') > 1 and line.count('[[') < 1 ): ~~result_drug_param = regexp_param.search(line)~~ if ~~result_drug_param~~(line.count('\|') > 1): # print "line1: ", line ~~parameter = result_drug_param.group('PARAM')~~ ~~value~~ sub_strings = ~~result_drug_param~~line.~~group~~split(~~'VALUE'~~"\|") ~~current_parameters[parameter] = value~~ # make sure we split only on the pipes that represent ends of the infobox entry, not the pipes used in links # taken from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/ new_list = [sub_strings[0]] for sub_string in sub_strings[1:]: if (']]' in sub_string) and ((not '[[' in sub_string) or sub_string.find(']]') < sub_string.find('[[')): new_list[-1] = new_list[-1] + '\|' + sub_string else: new_list.append(sub_string) # print "new_list: ", new_list sub_strings = new_list for sub_string in sub_strings: # print "sub_string: ", sub_string if (sub_string.count("=") > 0): parts = sub_string.split("=") # print "parts: ", parts parameter = str(parts[0].encode()).strip() value = str(parts[1].encode()).strip() # print "parameter, value: ", parameter, " ", value current_parameters[parameter] = value else: result_drug_param = regexp_param.search(line) # print line if result_drug_param: # print "made it!" # print "line2: ", line parameter = result_drug_param.group('PARAM').strip() value = result_drug_param.group('VALUE').strip() current_parameters[parameter] = value # print current_parameters # print "INN: ", INN if INN in drugbank_data: data = drugbank_data[INN] else: data = [] # merge tradenames new_tradenames = [] if data: merck_tradenames = sorted(set(data[0].split(";")))[1:] for index, object in enumerate(merck_tradenames): merck_tradenames[index] = string.capitalize(string.strip(object.encode())) else: merck_tradenames = [] if current_parameters.has_key("tradename"): current_tradenames = sorted(set(current_parameters['tradename'].split(", "))) Line 223 ⟶ 356: for index, object in enumerate(current_tradenames): current_tradenames[index] = string.capitalize(string.strip(object.encode())) merged_tradenames = ~~sorted(merck_tradenames + current_tradenames)~~[] if merck_tradenames: merged_tradenames = merck_tradenames ~~new_tradenames = sorted(set(merged_tradenames))~~ if current_tradenames: merged_tradenames.append(current_tradenames) ~~current_parameters['tradename'] = ", ".join(new_tradenames)~~ if merged_tradenames: ~~# print "merck tradenames: ", merck_tradenames~~ new_tradenames = sorted(set(merged_tradenames)) ~~# print "current tradenames: ", current_tradenames~~ # ~~print~~if ~~"new tradenames~~new_tradenames: ", current_parameters['tradename'] = ", ".join(new_tradenames) # print "merck tradenames: ", merck_tradenames # print "current tradenames: ", current_tradenames # print "new tradenames: ", current_parameters['tradename'] # test web page, returns "200" if OK: Line 238 ⟶ 374: # \| MedlinePlus = <!-- MedlinePlus drug accession number, e.g., "a692051" that links to "http://www.nlm.nih.gov/medlineplus/druginfo/meds/a692051.html" --> if data: link = "http://www.nlm.nih.gov/medlineplus/druginfo/meds/" + data[2] + ".html" if urllib.urlopen(link).getcode() == 200: # test link status to make sure it is good before assigning parameter current_parameters['MedlinePlus'] = data[2] # print "MedlinePlus: ", current_parameters['MedlinePlus'] else: link = "" # add Drugs.com parameter # \| Drugs.com = <!-- link to Drugs.com monograph, e.g., "lisinopril" that links to "http://www.drugs.com/monograph/lisinopril.html" --> ~~stems = []~~ # create alternate candidate drugs.com links ~~drugnames = []~~ # alternateive roots include the monograph, CDI, CONS, MTM, and "parent" sectios of drgus.com web site ~~drugnames.append(INN)~~ # alternative stems inlclude the INN, trade names, and the name extracted from the drugbank link stems = [] drugnames = [] drugnames.append(INN) if new_tradenames: for tradename in new_tradenames: drugnames.append(tradename) for drugname in drugnames: drugname = string.lower(drugname) if (string.find(~~tradename~~drugname, " ") > 0-1): stems.append(string.replace(drugname, " ", "_")) stems.append(string.replace(drugname, " ", "-")) else: stems.append(drugname) if ~~try~~data: if ~~for root in roots~~data[3]: if (string.find(data[3], "http://www.drugs.com/") > -1): ~~for stem in stems:~~ ~~link~~temp = ~~root~~string.replace(data[13] ~~+ stem +~~, "http://www.~~html~~drugs.com/", "") # temp = ~~print~~string.replace(temp, ~~"attempted~~ ~~Drugs.com~~ ~~link:~~ ".html", ~~link~~"") drugnames.append(temp) ~~if urllib.urlopen(link).getcode() == 200: # test link status to make sure it is good before assigning parameter~~ # print "drugnames: ", drugnames ~~current_parameters['Drugs.com'] = "{{drugs.com\|" + root[0] + "\|" + stem + "}}"~~ ~~raise StopIteration()~~ try: ~~# print "Drugs.com: ", current_parameters['Drugs.com']~~ ~~except~~for ~~StopIteration~~root in roots: ~~pass~~for stem in stems: link = root[1] + stem + ".html" # print "attempted Drugs.com link: ", link if urllib.urlopen(link).getcode() == 200: # test link status to make sure it is good before assigning parameter current_parameters['Drugs.com'] = "{{drugs.com\|" + root[0] + "\|" + stem + "}}" raise StopIteration() # print "Drugs.com: ", current_parameters['Drugs.com'] except StopIteration: pass Line 277 ⟶ 430: # while we are at it, populate KEGG_Drug_ID, ChemSpider_ID, and PubChem_Compound_ID fields if missing if data: if data[4] and not current_parameters.has_key("KEGG"): current_parameters['KEGG'] = data[4] if data[6] and not current_parameters.has_key("ChemSpiderID"): current_parameters['ChemSpiderID'] = data[6] if data[7] and not current_parameters.has_key("PubChem"): current_parameters['PubChem'] = data[7] # build new drugbox template new_drugbox = "{{Drugbox\n" if current_parameters.has_key("Watchedfields"): new_drugbox += "\| Watchedfields = " + current_parameters['Watchedfields'] + "\n" if current_parameters.has_key("Verifiedfields"): new_drugbox += "\| Verifiedfields = " + current_parameters['Verifiedfields'] + "\n" if current_parameters.has_key("verifiedrevid"): new_drugbox += "\| verifiedrevid = " + current_parameters['verifiedrevid'] + "\n" if current_parameters.has_key("IUPAC_name"): new_drugbox += "\| IUPAC_name = " + current_parameters['IUPAC_name'] + "\n" if current_parameters.has_key("OtherNames"): new_drugbox += "\| OtherNames = " + current_parameters['OtherNames'] + "\n" if current_parameters.has_key("image"): new_drugbox += "\| image = " + current_parameters['image'] + "\n" if current_parameters.has_key("width"): new_drugbox += "\| width = " + current_parameters['width'] + "\n" Line 426 ⟶ 584: # replace original drugbox with new drugbox if new_drugbox: new_text = re.sub(regexp_drug_infobox, new_drugbox, text) # print new_text page.put(new_text, comment='populated clinical fields in drugbox per [[Wikipedia:Bots/Requests_for_approval/BogBot_2\|bot approval]]', watchArticle = None, minorEdit = True) print ", page updated" else: print ", page not updated" ~~page.put(new_text, comment='populated clinical fields in drugbox', watchArticle = None, minorEdit = True)~~ ~~print ", page updated"~~ wikipedia.stopme()

User:BogBot/Source code/Task 03: Difference between revisions