User:BogBot/Source code/Task 03: Difference between revisions

Browse history interactively

← Previous edit

Content deleted Content added

VisualWikitext

Revision as of 07:06, 23 July 2011 edit Boghog (talk \| contribs) Autopatrolled, Extended confirmed users, IP block exemptions, New page reviewers, Pending changes reviewers, Rollbackers, Template editors 142,840 edits removed duplicate code ← Previous edit		Latest revision as of 14:47, 8 May 2022 edit undo Qwerfjkl (bot) (talk \| contribs) Bots, Mass message senders 4,093,883 edits m →top: Replaced deprecated <source> tags with <syntaxhighlight> Tag: AWB
(10 intermediate revisions by one other user not shown)
Line 1: <~~source~~syntaxhighlight lang=python> #!/usr/bin/python # -- coding: UTF-8 -- # Bot Script to populate new clinical fields in Drugbox templates in Wikipedia drug articles. Line 7 ⟶ 8: # \| Drugs.com = <!-- link to Drugs.com monograph, e.g., "lisinopril" that links to "http://www.drugs.com/monograph/lisinopril.html" --> # \| MedlinePlus = <!-- MedlinePlus drug accession number, e.g., "a692051" that links to "http://www.nlm.nih.gov/medlineplus/druginfo/meds/a692051.html" --> # In addition, populate KEGG_Drug_ID, ChemSpider_ID, and PubChem_Compound_ID fields if missing with data from http://www.drugbank.ca/system/downloads/current/drugbank.txt.zip # The tradnames were obtained from http://www.merckmanuals.com/home/drugnames-index/trade/a.html # Finally the script sorts the fields in the order they are currently rendered by the drugbox template # (in the order of clinical, pharmacokinetic, identifiers, and chemical data) """{{Drugbox \| Watchedfields \| verifiedrevid = 408577806 \| IUPAC_name = \| OtherNames = \| image = \| width = Line 124 ⟶ 131: }}""" from collections import defaultdict import codecs import csv import re import string import sys import urllib import urlparse import wikipedia ~~from collections import defaultdict~~ ~~import urllib~~ ~~import csv~~ ~~import string~~ ~~# Included for bot exclusion compliance (see http://en.wikipedia.org/wiki/Template:Bots)~~ # compiled regular expression user = "BogBot" regexp_ab = re.compile(r'\{\{(nobots\|bots\\|(allow=none\|deny=.?' + user + r'.?\|optout=all\|deny=all))\}\}') # adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/ ~~# compiled regular expression~~ # Build a regex to locate the drugbox exp = r'\{\{' # the opening brackets for the infobox exp = exp + r'\s' # any amount of whitespace exp = exp + r'[Dd]rugbox' # the word "drugbox", capitalized or not exp = exp + r'.\}\}' # any amount of anything, followed by the end of the drugbox regexp_drug_infobox = re.compile(exp, re.DOTALL) ~~regexp_ab = re.compile(r'\{\{(nobots\|bots\\|(allow=none\|deny=.?' + user + r'.?\|optout=all\|deny=all))\}\}')~~ ~~regexp_drug_infobox~~regexp_param = re.compile(r"^\{s?\{\|\s?(~~Drugbox\|drugbox~~?P<PARAM>\S+)\s?=\s?(?P<~~PARAMS~~VALUE>.+)\s?($\|\}\}\s\|)"~~, re.DOTALL~~) ~~regexp_param~~regexp_nested_template = re.compile(r"^\s?{\~~\|\s?~~{(?P<~~PARAM~~PARAMS>\S.+)\~~s?=~~}\~~s?(?P<VALUE>.+)$~~}") regexp_open_square_bracket = re.compile(r"\[", re.DOTALL) regexp_close_square_bracket = re.compile(r"\]", re.DOTALL) regexp_open_curly_bracket = re.compile(r"}", re.DOTALL) regexp_close_curly_bracket = re.compile(r"{", re.DOTALL) regexp_double_open_square_bracket = re.compile(r"\[\[", re.DOTALL) regexp_double_close_square_bracket = re.compile(r"\[\[", re.DOTALL) regexp_double_open_curly_bracket = re.compile(r"}}", re.DOTALL) regexp_double_close_curly_bracket = re.compile(r"{{", re.DOTALL) # ATC_supplemental = {{ATC\|B01\|AC06}}, {{ATC\|N02\|BA01}} regexp_ATC_supplemental = re.compile(r"\\|\s?ATC_supplemental\s?=\s?(?P<TEMPLATE>.\{\{s(ATC).+?\}\})\s?($\|\\|)") # CASNo_Ref = {{cascite\|correct\|CAS}} regexp_CASNo_Ref = re.compile(r"\\|\s?CASNo_Ref\s?=\s?(?P<TEMPLATE>\{\{(Cascite\|cascite).?\}\})") # CAS_supplemental = {{CAS\|405-41-4}} regexp_CAS_supplemental = re.compile(r"\\|\s?CAS_supplemental\s?=\s?(?P<TEMPLATE>\{\{CAS.?\}\})") # ChEMBL_Ref = {{ebicite\|correct\|EBI}} regexp_ChEMBL_Ref = re.compile(r"\\|\s?ChEMBL_Ref\s?=\s?(?P<TEMPLATE>\{\{(Ebicite\|ebicite).?\}\})") # ChemSpiderID_Ref = {{chemspidercite\|correct\|chemspider}} regexp_ChemSpiderID_Ref = re.compile(r"\\|\s?ChemSpiderID_Ref\s?=\s?(?P<TEMPLATE>\{\{(Chemspidercite\|chemspidercite).?\}\})") # Drugs.com = {{drugs.com\|monograph\|lisinopril}} regexp_Drugs_com = re.compile(r"\\|\s?Drugs\.com\s?=\s?(?P<TEMPLATE>\{\{(Drugs\.com\|drugs\.com).?\}\})") # KEGG_Ref = {{keggcite\|correct\|kegg}} regexp_KEGG_Ref = re.compile(r"\\|\s?KEGG_Ref\s?=\s?(?P<TEMPLATE>\{\{(Keggcite\|keggcite).?\}\})") # StdInChI_Ref = {{stdinchicite\|correct\|chemspider}} regexp_StdInChI_Ref = re.compile(r"\\|\s?StdInChI_Ref\s?=\s?(?P<TEMPLATE>\{\{(Stdinchicite\|stdinchicite).?\}\})") # StdInChIKey_Ref = {{stdinchicite\|correct\|chemspider}} regexp_StdInChIKey_Ref = re.compile(r"\\|\s?StdInChIKey_Ref\s?=\s?(?P<TEMPLATE>\{\{(Stdinchicite\|stdinchicite).\}\})") # UNII_Ref = {{fdacite\|changed\|FDA}} regexp_UNII_Ref = re.compile(r"\\|\s?UNII_Ref\s?=\s?(?P<TEMPLATE>\{\{(Fdacite\|fdacite).?\}\})") # named ref tag = <ref name="fattinger2000"> but not <ref name="fattinger2000" /> regexp_ref_tag_begin = re.compile(r"<ref>\|<ref name.?[^/]>") regexp_ref_tag_end = re.compile(r"</ref>") regexp_citation_template = re.compile(r"\{\{[C\|c]ite\s?(?P<TEMPLATE>.?)\}\}") # href='/monograph/maprotiline-hydrochloride.html' regexp_monograph_url = re.compile("href='/monograph/(?P<STEM>.?)\.html'", re.DOTALL) # http://www.nlm.nih.gov/medlineplus/druginfo/meds/a604021.html regexp_medlineplus_url = re.compile("www.nlm.nih.gov/medlineplus/druginfo/meds/(?P<ACNO>.?)\.html", re.DOTALL) def Allowbots(text): if (regexp_ab.search(text)): Line 151 ⟶ 204: return True def urlEncodeNonAscii(b): ~~# articles = open('/Users/BogBot/progs/pywikipedia/drugbox/drugbox_titles.txt', 'r')~~ return re.sub('[\x80-\xFF]', lambda c: '%%%02x' % ord(c.group(0)), b) def iriToUri(iri): ~~drugbank_data = {}~~ parts= urlparse.urlparse(iri) return urlparse.urlunparse( part.encode('idna') if parti==1 else urlEncodeNonAscii(part.encode('utf-8')) for parti, part in enumerate(parts) ) def find_drugbox_from_text(article_text): ~~# 0 1 2 3 4 5 6 7~~ # adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/ ~~# Name Trade_Names Drug_Type MedlinePlus Drugs.com_link KEGG_Drug_ID KEGG_Compound_ID ChemSpider_ID PubChem_Compound_ID~~ search_result = regexp_drug_infobox.search(article_text) if search_result: result_text = search_result.group(0) # returns the entire matching sequence begin, end = search_result.span() else: return None # the regex isn't perfect, so look for the closing brackets of the infobox count = 0 last_ind = None for ind, c in enumerate(result_text): if c == '}': count = count - 1 elif c == '{': count = count + 1 if count == 0 and not ind == 0: last_ind = ind break offset = result_text.find('\|') ___location = (begin+offset, begin+last_ind-1, begin, begin+last_ind+1) return ___location def drugbank(): ~~drug_data = csv.reader(open('/Users/BogBot/progs/pywikipedia/drugbox/drug_links_agumented.csv', 'rU'), dialect='excel')~~ # drugbank data obtained from http://www.drugbank.ca/system/downloads/current/drugbank.txt.zip drugbank_data = {} ~~for row in drug_data:~~ ~~drugbank_data[row[0]] = row[1:]~~ # 0 1 2 3 4 5 6 7 8 # Name Trade_Names Drug_Type MedlinePlus Drugs.com_link KEGG_Drug_ID KEGG_Compound_ID ChemSpider_ID PubChem_Compound_ID DrugBank_ID drug_data = csv.reader(open('/Users/BogBot/progs/pywikipedia/drugbox/drug_links_agumented.csv', 'rU'), dialect='excel') ~~# articles = []~~ for row in drug_data: ~~articles = ["User:Boghog/Sandbox2"]~~ drugbank_data[row[0]] = row[1:] ~~# articles = ["Template:Drugbox/Lisinopril"]~~ return drugbank_data def assign_nested_templates(parameters, current_parameters): ~~# main loop~~ # extract and assign nested templates commonly used in drugbox templates result_ATC_supplemental = regexp_ATC_supplemental.search(parameters) ~~for article in articles:~~ if result_ATC_supplemental: template = result_ATC_supplemental.group('TEMPLATE') current_parameters['ATC_supplemental'] = template # print "found result_ATC_supplemental! ", template parameters = re.sub(regexp_ATC_supplemental, "\|", parameters) result_ChEMBL_Ref = regexp_ChEMBL_Ref.search(parameters) ~~article = article.rstrip('\n')~~ if result_ChEMBL_Ref: ~~# INN = article~~ template = result_ChEMBL_Ref.group('TEMPLATE') ~~INN = "Lisinopril"~~ current_parameters['ChEMBL_Ref'] = template ~~# print article~~ # print "found result_ChEMBL_Ref! ", template parameters = re.sub(regexp_ChEMBL_Ref, "", parameters) result_CASNo_Ref = regexp_CASNo_Ref.search(parameters) ~~log_string = " [[" + article + "]]"~~ if result_CASNo_Ref: ~~print log_string,~~ template = result_CASNo_Ref.group('TEMPLATE') current_parameters['CASNo_Ref'] = template # print "found result_CASNo_Ref! ", template parameters = re.sub(regexp_CASNo_Ref, "", parameters) result_CAS_supplemental = regexp_CAS_supplemental.search(parameters) ~~site = wikipedia.getSite()~~ if result_CAS_supplemental: ~~page = wikipedia.Page(site, article)~~ template = result_CAS_supplemental.group('TEMPLATE') ~~text = page.get(get_redirect = True)~~ current_parameters['CAS_supplemental'] = template # print "found result_CAS_supplemental! ", template parameters = re.sub(regexp_CAS_supplemental, "", parameters) result_ChemSpiderID_Ref = regexp_ChemSpiderID_Ref.search(parameters) ~~if not Allowbots(text):~~ if result_ChemSpiderID_Ref: ~~break~~ template = result_ChemSpiderID_Ref.group('TEMPLATE') current_parameters['ChemSpiderID_Ref'] = template # print "found ChemSpiderID_Ref! ", template parameters = re.sub(regexp_ChemSpiderID_Ref, "", parameters) result_Drugs_com = regexp_Drugs_com.search(parameters) ~~log_string = ", article: " + article~~ if result_Drugs_com: ~~print log_string~~ template = result_Drugs_com.group('TEMPLATE') current_parameters['Drugs.com'] = template # print "found result_Drugs_com! ", template parameters = re.sub(regexp_Drugs_com, "", parameters) result_KEGG_Ref = regexp_KEGG_Ref.search(parameters) ~~# print text~~ if result_KEGG_Ref: template = result_KEGG_Ref.group('TEMPLATE') current_parameters['KEGG_Ref'] = template # print "found KEGG_Ref! ", template parameters = re.sub(regexp_KEGG_Ref, "", parameters) result_StdInChI_Ref = regexp_StdInChI_Ref.search(parameters) if result_StdInChI_Ref: template = result_StdInChI_Ref.group('TEMPLATE') current_parameters['StdInChI_Ref'] = template # print "found StdInChI_Ref! ", template parameters = re.sub(regexp_StdInChI_Ref, "", parameters) result_UNII_Ref = regexp_UNII_Ref.search(parameters) if result_UNII_Ref: template = result_UNII_Ref.group('TEMPLATE') current_parameters['UNII_Ref'] = template # print "found UNII_Ref! ", template parameters = re.sub(regexp_UNII_Ref, "", parameters) return parameters def rejoin(begin, end, sub_strings, type): # make sure we split only on the pipes that represent ends of the infobox entry, not the pipes used in links, nested templates, citations, etc. # adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/ new_list = [sub_strings[0]] for sub_string in sub_strings[1:]: if (end in sub_string) and ((not begin in sub_string) or sub_string.find(end) < sub_string.find(begin)): if type == "line": new_list[-1] = new_list[-1] + sub_string if type == "parameter": new_list[-1] = new_list[-1] + '\|' + sub_string else: new_list.append(sub_string) sub_strings = new_list return sub_strings def test_disjoint(begin,end,sub_strings): disjoint = False for sub_string in sub_strings: if (end in sub_string) and ((not begin in sub_string) or sub_string.find(end) < sub_string.find(begin)): disjoint = True break return disjoint def regex_rejoin(regexp_begin, regexp_end, sub_strings, type): # make sure we split only on the pipes that represent ends of the infobox entry, not the pipes used in links, nested templates, citations, etc. # adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/ new_list = [sub_strings[0]] for sub_string in sub_strings[1:]: begin = False; end = False; n_begin = 0; n_end = 0 if regexp_begin.search(sub_string): begin = True match = regexp_begin.findall(sub_string) n_begin = len(match) if regexp_end.search(sub_string): end = True match = regexp_end.findall(sub_string) n_end = len(match) if ((end and not begin) or n_end < n_begin): if type == "line": new_list[-1] = new_list[-1] + sub_string if type == "parameter": new_list[-1] = new_list[-1] + '\|' + sub_string else: new_list.append(sub_string) sub_strings = new_list return sub_strings def regex_test_disjoint(regexp_begin, regexp_end, sub_strings): disjoint = False begin = False; end = False; n_begin = 0; n_end = 0 for sub_string in sub_strings: if regexp_begin.search(sub_string): begin = True match = regexp_begin.findall(sub_string) n_begin = len(match) if regexp_end.search(sub_string): end = True match = regexp_end.findall(sub_string) n_end = len(match) if ((end and not begin) or n_end < n_begin): disjoint = True break return disjoint def pad_parameters(text): matches = regexp_citation_template.findall(text) for match in matches: sub_strings = match.split("\|") new_strings = " " + sub_strings[0].strip() for item in sub_strings[1:]: item = " \| " + item.strip() new_strings += item sub_strings = new_strings.split("=") new_strings = " " + sub_strings[0].strip() for item in sub_strings[1:]: item = " = " + item.strip() new_strings += item text = text.replace(match,new_strings) return text def parse_line(line, current_parameters): # print "index: ", line.count('\|') # if (line.count('\|') > 1 and line.count('[[') < 1 ): if (line.count('\|') > 1): # print "line1: ", line sub_strings = line.split("\|") # make sure we split only on the pipes that represent ends of the infobox entry, not the pipes used in links sub_strings = rejoin('[[',']]',sub_strings, 'parameter') # do the same for nested templates forever = True while forever: if test_disjoint('{{','}}',sub_strings): forever = True sub_strings = rejoin('{{','}}',sub_strings, 'parameter') else: forever = False # do the same for citations: forever = True while forever: if regex_test_disjoint(regexp_ref_tag_begin, regexp_ref_tag_end, sub_strings): forever = True sub_strings = regex_rejoin(regexp_ref_tag_begin, regexp_ref_tag_end, sub_strings, 'parameter') else: forever = False # now assign the parameters for sub_string in sub_strings: # print "sub_string: ", sub_string if (sub_string.count("=") > 0): parts = sub_string.split("=", 1) # print "parts: ", parts parameter = str(parts[0].encode("utf-8")).strip() value = str(parts[1].encode("utf-8")).strip() value = pad_parameters(value) # print "parameter, value: ", parameter, " ", value if not value: value = " " current_parameters[parameter] = value else: result_drug_param = regexp_param.search(line) # print line if result_drug_param: # print "made it!" # print "line2: ", line parameter = (result_drug_param.group('PARAM').encode("utf-8")).strip() value = (result_drug_param.group('VALUE').encode("utf-8")).strip() value = pad_parameters(value) if not value: value = " " current_parameters[parameter] = value return def remove_embedded_carriage_returns(parameters): # remove embedded carriage returns from templates: ~~result_drug_infobox = regexp_drug_infobox.search(text)~~ ~~if result_drug_infobox:~~ ~~print "found it!"~~ ~~parameters = result_drug_infobox.group('PARAMS')~~ ~~current_parameters = {}~~ ~~# print parameters~~ lines = parameters.splitlines() ~~for line in lines:~~ ~~# print line~~ ~~result_drug_param = regexp_param.search(line)~~ ~~if result_drug_param:~~ ~~parameter = result_drug_param.group('PARAM')~~ ~~value = result_drug_param.group('VALUE')~~ ~~current_parameters[parameter] = value~~ ~~print~~forever ~~"INN:~~= ~~", INN~~True while forever: ~~if INN in drugbank_data:~~ if test_disjoint('{{', '}}', lines): ~~data = drugbank_data[INN]~~ forever = True lines = rejoin('{{', '}}', lines, 'line') ~~# merge tradenames~~ else: forever = False ~~merck_tradenames = sorted(set(data[0].split(";")))[1:]~~ ~~for index, object in enumerate(merck_tradenames):~~ ~~merck_tradenames[index] = string.capitalize(string.strip(object.encode()))~~ ~~if current_parameters.has_key("tradename"):~~ ~~current_tradenames = sorted(set(current_parameters['tradename'].split(", ")))~~ ~~else:~~ ~~current_tradenames = []~~ ~~for index, object in enumerate(current_tradenames):~~ ~~current_tradenames[index] = string.capitalize(string.strip(object.encode()))~~ ~~merged_tradenames = sorted(merck_tradenames + current_tradenames)~~ ~~new_tradenames = sorted(set(merged_tradenames))~~ ~~current_parameters['tradename'] = ", ".join(new_tradenames)~~ ~~# print "merck tradenames: ", merck_tradenames~~ ~~# print "current tradenames: ", current_tradenames~~ ~~print "new tradenames: ", current_parameters['tradename']~~ ~~# test web page, returns "200" if OK:~~ ~~# if urllib.urlopen(link).getcode() == 200:~~ ~~# 200: ('OK', 'Request fulfilled, document follows')~~ ~~# 404: (page not found)~~ forever = True ~~# add MedlinePlus parameter~~ while forever: ~~# \| MedlinePlus = <!-- MedlinePlus drug accession number, e.g., "a692051" that links to "http://www.nlm.nih.gov/medlineplus/druginfo/meds/a692051.html" -->~~ if regex_test_disjoint(regexp_ref_tag_begin, regexp_ref_tag_end, lines): forever = True lines = regex_rejoin(regexp_ref_tag_end, regexp_ref_tag_begin, lines, 'line') else: forever = False parameters = string.join(lines, "\n") ~~link = "http://www.nlm.nih.gov/medlineplus/druginfo/meds/" + data[2] + ".html"~~ ~~if urllib.urlopen(link).getcode() == 200: # test link status to make sure it is good before assigning parameter~~ ~~current_parameters['MedlinePlus'] = data[2]~~ ~~# print "MedlinePlus: ", current_parameters['MedlinePlus']~~ ~~# add Drugs.com parameter~~ ~~# \| Drugs.com = <!-- link to Drugs.com monograph, e.g., "lisinopril" that links to "http://www.drugs.com/monograph/lisinopril.html" -->~~ return parameters ~~INN_html = string.lower(string.replace(INN, " ", "_"))~~ ~~link = "http://www.drugs.com/monograph/" + INN_html + ".html"~~ ~~if urllib.urlopen(link).getcode() == 200: # test link status to make sure it is good before assigning parameter~~ ~~current_parameters['Drugs.com'] = INN_html~~ ~~# print "Drugs.com: ", current_parameters['Drugs.com']~~ ~~# for parameter, value in~~def build_new_drugbox(current_parameters~~.iteritems(~~): # build new drugbox template ~~# print parameter, ":", value~~ # make sure that all values in the current_parameters dictionary are properly encoded ~~# while we are at it, populate KEGG_Drug_ID, ChemSpider_ID, and PubChem_Compound_ID fields if missing~~ encoding = 'utf-8' ~~if data[4] and not current_parameters.has_key("KEGG"):~~ for k, v in current_parameters.iteritems(): ~~current_parameters['KEGG'] = data[4]~~ if isinstance(v, basestring): if not isinstance(v, unicode): v = unicode(v, encoding) current_parameters[k] = v # if type parameter is missing, check subordinate parameters that infer type, and if found, assign type ~~if data[6] and not current_parameters.has_key("ChemSpiderID"):~~ ~~current_parameters['ChemSpiderID'] = data[6]~~ if ~~data[7] and~~ not current_parameters.has_key("~~PubChem~~type"): if ("component1" in current_parameters~~['PubChem']~~ =or ~~data[7]~~ "class1" in current_parameters or "component2" in current_parameters or "class2" in current_parameters or "component3" in current_parameters or "class3" in current_parameters or "component4" in current_parameters or "class4" in current_parameters): current_parameters['type'] = "combo" elif ("mab_type") in current_parameters: current_parameters['type'] = "mab" elif ("vaccine_type") in current_parameters: current_parameters['type'] = "vaccine" # if not previously assigned, add the following "empty" parameters ~~# build new drugbox template~~ if not current_parameters.has_key("tradename"): current_parameters["tradename"] = " " new_drugbox = unicode( "{{Drugbox\n", "utf-8" ) if current_parameters.has_key("Verifiedfields"): new_drugbox += "\| Verifiedfields = " + current_parameters['Verifiedfields'] + "\n" if current_parameters.has_key("Watchedfields"): new_drugbox += "\| Watchedfields = " + current_parameters['Watchedfields'] + "\n" if current_parameters.has_key("verifiedrevid"): new_drugbox += "\| verifiedrevid = " + current_parameters['verifiedrevid'] + "\n" if current_parameters.has_key("IUPAC_name"): new_drugbox += "\| IUPAC_name = " + current_parameters['IUPAC_name'] + "\n" if current_parameters.has_key("OtherNames"): new_drugbox += "\| OtherNames = " + current_parameters['OtherNames'] + "\n" if current_parameters.has_key("image"): new_drugbox += "\| image = " + current_parameters['image'] + "\n" if current_parameters.has_key("width"): new_drugbox += "\| width = " + current_parameters['width'] + "\n" Line 307 ⟶ 565: if current_parameters.has_key("vaccine_type"): new_drugbox += "\| vaccine_type = " + current_parameters['vaccine_type'] + "\n" if ("tradename" in current_parameters or "Drugs.com" in current_parameters or "MedlinePlus" in current_parameters or "licence_EU" in current_parameters or ~~new_drugbox += "\n<!--Clinical data-->\n"~~ "licence_US" in current_parameters or "DailyMedID" in current_parameters or "pregnancy_AU" in current_parameters or "pregnancy_US" in current_parameters or ~~if current_parameters.has_key("tradename"): new_drugbox += "\| tradename = " + current_parameters['tradename'] + "\n"~~ "pregnancy_category" in current_parameters or "legal_AU" in current_parameters or "legal_CA" in current_parameters or "legal_UK" in current_parameters or ~~if current_parameters.has_key("Drugs.com"): new_drugbox += "\| Drugs.com = " + current_parameters['Drugs.com'] + "\n"~~ "legal_US" in current_parameters or "legal_status" in current_parameters or "dependency_liability" or "routes_of_administration" in current_parameters): ~~if current_parameters.has_key("MedlinePlus"): new_drugbox += "\| MedlinePlus = " + current_parameters['MedlinePlus'] + "\n"~~ ~~if current_parameters.has_key("licence_EU"): new_drugbox += "\| licence_EU = " + current_parameters['licence_EU'] + "\n"~~ ~~if current_parameters.has_key("licence_US"): new_drugbox += "\| licence_US = " + current_parameters['licence_US'] + "\n"~~ ~~if current_parameters.has_key("DailyMedID"): new_drugbox += "\| DailyMedID = " + current_parameters['DailyMedID'] + "\n"~~ ~~if current_parameters.has_key("pregnancy_AU"): new_drugbox += "\| pregnancy_AU = " + current_parameters['pregnancy_AU'] + "\n"~~ ~~if current_parameters.has_key("pregnancy_US"): new_drugbox += "\| pregnancy_US = " + current_parameters['pregnancy_US'] + "\n"~~ ~~if current_parameters.has_key("pregnancy_category"): new_drugbox += "\| pregnancy_category = " + current_parameters['pregnancy_category'] + "\n"~~ ~~if current_parameters.has_key("legal_AU"): new_drugbox += "\| legal_AU = " + current_parameters['legal_AU'] + "\n"~~ ~~if current_parameters.has_key("legal_CA"): new_drugbox += "\| legal_CA = " + current_parameters['legal_CA'] + "\n"~~ ~~if current_parameters.has_key("legal_UK"): new_drugbox += "\| legal_UK = " + current_parameters['legal_UK'] + "\n"~~ ~~if current_parameters.has_key("legal_US"): new_drugbox += "\| legal_US = " + current_parameters['legal_US'] + "\n"~~ ~~if current_parameters.has_key("legal_status"): new_drugbox += "\| legal_status = " + current_parameters['legal_status'] + "\n"~~ ~~if current_parameters.has_key("dependency_liability"): new_drugbox += "\| dependency_liability = " + current_parameters['dependency_liability'] + "\n"~~ ~~if current_parameters.has_key("routes_of_administration"): new_drugbox += "\| routes_of_administration = " + current_parameters['routes_of_administration'] + "\n"~~ new_drugbox += "\n<!--~~Pharmacokinetic~~Clinical data-->\n" if current_parameters.has_key("~~bioavailability~~tradename"): new_drugbox += "\| ~~bioavailability~~tradename = " + current_parameters['~~bioavailability~~tradename'] + "\n" if current_parameters.has_key("~~protein_bound~~Drugs.com"): new_drugbox += "\| ~~protein_bound~~Drugs.com = " + current_parameters['~~protein_bound~~Drugs.com'] + "\n" if current_parameters.has_key("~~metabolism~~MedlinePlus"): new_drugbox += "\| ~~metabolism~~MedlinePlus = " + current_parameters['~~metabolism~~MedlinePlus'] + "\n" if current_parameters.has_key("~~elimination_half-life~~licence_EU"): new_drugbox += "\| ~~elimination_half-life~~licence_EU = " + current_parameters['~~elimination_half-life~~licence_EU'] + "\n" if current_parameters.has_key("~~excretion~~licence_US"): new_drugbox += "\| ~~excretion~~licence_US = " + current_parameters['~~excretion~~licence_US'] + "\n" if current_parameters.has_key("DailyMedID"): new_drugbox += "\| DailyMedID = " + current_parameters['DailyMedID'] + "\n" if current_parameters.has_key("pregnancy_AU"): new_drugbox += "\| pregnancy_AU = " + current_parameters['pregnancy_AU'] + "\n" if current_parameters.has_key("pregnancy_US"): new_drugbox += "\| pregnancy_US = " + current_parameters['pregnancy_US'] + "\n" if current_parameters.has_key("pregnancy_category"): new_drugbox += "\| pregnancy_category = " + current_parameters['pregnancy_category'] + "\n" if current_parameters.has_key("legal_AU"): new_drugbox += "\| legal_AU = " + current_parameters['legal_AU'] + "\n" if current_parameters.has_key("legal_CA"): new_drugbox += "\| legal_CA = " + current_parameters['legal_CA'] + "\n" if current_parameters.has_key("legal_UK"): new_drugbox += "\| legal_UK = " + current_parameters['legal_UK'] + "\n" if current_parameters.has_key("legal_US"): new_drugbox += "\| legal_US = " + current_parameters['legal_US'] + "\n" if current_parameters.has_key("legal_status"): new_drugbox += "\| legal_status = " + current_parameters['legal_status'] + "\n" if current_parameters.has_key("dependency_liability"): new_drugbox += "\| dependency_liability = " + current_parameters['dependency_liability'] + "\n" if current_parameters.has_key("routes_of_administration"): new_drugbox += "\| routes_of_administration = " + current_parameters['routes_of_administration'] + "\n" if ("bioavailability" in current_parameters or "protein_bound metabolism" in current_parameters or "elimination_half-life" in current_parameters or "excretion" in current_parameters): new_drugbox += "\n<!--Pharmacokinetic data-->\n" if current_parameters.has_key("bioavailability"): new_drugbox += "\| bioavailability = " + current_parameters['bioavailability'] + "\n" if current_parameters.has_key("protein_bound"): new_drugbox += "\| protein_bound = " + current_parameters['protein_bound'] + "\n" if current_parameters.has_key("metabolism"): new_drugbox += "\| metabolism = " + current_parameters['metabolism'] + "\n" if current_parameters.has_key("elimination_half-life"): new_drugbox += "\| elimination_half-life = " + current_parameters['elimination_half-life'] + "\n" if current_parameters.has_key("excretion"): new_drugbox += "\| excretion = " + current_parameters['excretion'] + "\n" if ("CAS_number" in current_parameters or "CAS_supplemental" in current_parameters or "ATCvet" in current_parameters or "ATC_prefix" in current_parameters or ~~new_drugbox += "\n<!--Identifiers-->\n"~~ "ATC_suffix" in current_parameters or "ATC_supplemental" in current_parameters or "PubChem" in current_parameters or "PubChemSubstance" in current_parameters or ~~if current_parameters.has_key("CAS_number"): new_drugbox += "\| CAS_number = " + current_parameters['CAS_number'] + "\n"~~ "IUPHAR_ligand" in current_parameters or "DrugBank" in current_parameters or "ChemSpiderID" in current_parameters or "UNII" in current_parameters or ~~if current_parameters.has_key("CAS_supplemental"): new_drugbox += "\| CAS_supplemental = " + current_parameters['CAS_supplemental'] + "\n"~~ "KEGG" in current_parameters or "ChEBI" in current_parameters or "ChEMBL" in current_parameters): ~~if current_parameters.has_key("ATCvet"): new_drugbox += "\| ATCvet = " + current_parameters['ATCvet'] + "\n"~~ new_drugbox += "\n<!--Identifiers-->\n" ~~if current_parameters.has_key("ATC_prefix"): new_drugbox += "\| ATC_prefix = " + current_parameters['ATC_prefix'] + "\n"~~ if current_parameters.has_key("~~ATC_suffix~~CAS_number_Ref"): new_drugbox += "\| ~~ATC_suffix~~CAS_number_Ref = " + current_parameters['~~ATC_suffix~~CAS_number_Ref'] + "\n" if current_parameters.has_key("~~ATC_supplemental~~CASNo_Ref"): new_drugbox += "\| ~~ATC_supplemental~~CASNo_Ref = " + current_parameters['~~ATC_supplemental~~CASNo_Ref'] + "\n" if current_parameters.has_key("~~PubChem~~CAS_number"): new_drugbox += "\| ~~PubChem~~CAS_number = " + current_parameters['~~PubChem~~CAS_number'] + "\n" if current_parameters.has_key("~~PubChemSubstance~~CAS_supplemental"): new_drugbox += "\| ~~PubChemSubstance~~CAS_supplemental = " + current_parameters['~~PubChemSubstance~~CAS_supplemental'] + "\n" if current_parameters.has_key("~~IUPHAR_ligand~~ATCvet"): new_drugbox += "\| ~~IUPHAR_ligand~~ATCvet = " + current_parameters['~~IUPHAR_ligand~~ATCvet'] + "\n" if current_parameters.has_key("~~DrugBank~~ATC_prefix"): new_drugbox += "\| ~~DrugBank~~ATC_prefix = " + current_parameters['~~DrugBank~~ATC_prefix'] + "\n" if current_parameters.has_key("~~ChemSpiderID_Ref~~ATC_suffix"): new_drugbox += "\| ~~ChemSpiderID_Ref~~ATC_suffix = " + current_parameters['~~ChemSpiderID_Ref~~ATC_suffix'] + "\n" if current_parameters.has_key("~~ChemSpiderID~~ATC_supplemental"): new_drugbox += "\| ~~ChemSpiderID~~ATC_supplemental = " + current_parameters['~~ChemSpiderID~~ATC_supplemental'] + "\n" if current_parameters.has_key("~~UNII_Ref~~PubChem"): new_drugbox += "\| ~~UNII_Ref~~PubChem = " + current_parameters['~~UNII_Ref~~PubChem'] + "\n" if current_parameters.has_key("~~UNII~~PubChemSubstance"): new_drugbox += "\| ~~UNII~~PubChemSubstance = " + current_parameters['~~UNII~~PubChemSubstance'] + "\n" if current_parameters.has_key("~~KEGG_Ref~~IUPHAR_ligand"): new_drugbox += "\| ~~KEGG_Ref~~IUPHAR_ligand = " + current_parameters['~~KEGG_Ref~~IUPHAR_ligand'] + "\n" if current_parameters.has_key("~~KEGG~~DrugBank_Ref"): new_drugbox += "\| ~~KEGG~~DrugBank_Ref = " + current_parameters['~~KEGG~~DrugBank_Ref'] + "\n" if current_parameters.has_key("~~ChEBI_Ref~~DrugBank"): new_drugbox += "\| ~~ChEBI_Ref~~DrugBank = " + current_parameters['~~ChEBI_Ref~~DrugBank'] + "\n" if current_parameters.has_key("~~ChEBI~~ChemSpiderID_Ref"): new_drugbox += "\| ~~ChEBI~~ChemSpiderID_Ref = " + current_parameters['~~ChEBI~~ChemSpiderID_Ref'] + "\n" if current_parameters.has_key("~~ChEMBL_Ref~~ChemSpiderID"): new_drugbox += "\| ~~ChEMBL_Ref~~ChemSpiderID = " + current_parameters['~~ChEMBL_Ref~~ChemSpiderID'] + "\n" if current_parameters.has_key("~~ChEMBL~~UNII_Ref"): new_drugbox += "\| ~~ChEMBL~~UNII_Ref = " + current_parameters['~~ChEMBL~~UNII_Ref'] + "\n" if current_parameters.has_key("UNII"): new_drugbox += "\| UNII = " + current_parameters['UNII'] + "\n" if current_parameters.has_key("KEGG_Ref"): new_drugbox += "\| KEGG_Ref = " + current_parameters['KEGG_Ref'] + "\n" if current_parameters.has_key("KEGG"): new_drugbox += "\| KEGG = " + current_parameters['KEGG'] + "\n" if current_parameters.has_key("ChEBI_Ref"): new_drugbox += "\| ChEBI_Ref = " + current_parameters['ChEBI_Ref'] + "\n" if current_parameters.has_key("ChEBI"): new_drugbox += "\| ChEBI = " + current_parameters['ChEBI'] + "\n" if current_parameters.has_key("ChEMBL_Ref"): new_drugbox += "\| ChEMBL_Ref = " + current_parameters['ChEMBL_Ref'] + "\n" if current_parameters.has_key("ChEMBL"): new_drugbox += "\| ChEMBL = " + current_parameters['ChEMBL'] + "\n" new_drugbox += "\n<!--Chemical data-->\n" if ~~current_parameters.has_key~~("chemical_formula"): in current_parameters or "C" in current_parameters ~~new_drugbox +=~~or "~~\| chemical_formula =~~ H" in current_parameters or "Ag" in current_parameters or "As" +in current_parameters~~['chemical_formula']~~ or ~~+ "\n"~~ "Au" in current_parameters or "B" in current_parameters or "Bi" in current_parameters or "Br" in current_parameters or "Cl" in current_parameters or "Co" in current_parameters or ~~if current_parameters.has_key("C"): new_drugbox += "\| C=" + current_parameters['C'] + " "~~ "F" in current_parameters or "Fe" in current_parameters or "Gd" in current_parameters or "I" in current_parameters or "K" in current_parameters or "Mn" in current_parameters or ~~if current_parameters.has_key("H"): new_drugbox += "\| H=" + current_parameters['H'] + " "~~ "N" in current_parameters or "Na" in current_parameters or "O" in current_parameters or "P" in current_parameters or "Pt" in current_parameters or "S" in current_parameters or ~~if current_parameters.has_key("Ag"): new_drugbox += "\| Ag=" + current_parameters['Ag'] + " "~~ "Sb" in current_parameters or "Se" in current_parameters or "Sr" in current_parameters or "Tc" in current_parameters or "charge" in current_parameters): ~~if current_parameters.has_key("As"): new_drugbox += "\| As=" + current_parameters['As'] + " "~~ if current_parameters.has_key("Auchemical_formula"): new_drugbox += "\| Auchemical_formula = " + current_parameters['Auchemical_formula'] + " \n" # new_drugbox += " " if current_parameters.has_key("BC"): new_drugbox += "\| BC=" + current_parameters['BC'] + " " if current_parameters.has_key("BiH"): new_drugbox += "\| BiH=" + current_parameters['BiH'] + " " if current_parameters.has_key("BrAg"): new_drugbox += "\| BrAg=" + current_parameters['BrAg'] + " " if current_parameters.has_key("ClAs"): new_drugbox += "\| ClAs=" + current_parameters['ClAs'] + " " if current_parameters.has_key("CoAu"): new_drugbox += "\| CoAu=" + current_parameters['CoAu'] + " " if current_parameters.has_key("FB"): new_drugbox += "\| FB=" + current_parameters['FB'] + " " if current_parameters.has_key("FeBi"): new_drugbox += "\| FeBi=" + current_parameters['FeBi'] + " " if current_parameters.has_key("GdBr"): new_drugbox += "\| GdBr=" + current_parameters['GdBr'] + " " if current_parameters.has_key("ICl"): new_drugbox += "\| ICl=" + current_parameters['ICl'] + " " if current_parameters.has_key("KCo"): new_drugbox += "\| KCo=" + current_parameters['KCo'] + " " if current_parameters.has_key("MnF"): new_drugbox += "\| MnF=" + current_parameters['MnF'] + " " if current_parameters.has_key("NFe"): new_drugbox += "\| NFe=" + current_parameters['NFe'] + " " if current_parameters.has_key("NaGd"): new_drugbox += "\| NaGd=" + current_parameters['NaGd'] + " " if current_parameters.has_key("OI"): new_drugbox += "\| OI=" + current_parameters['OI'] + " " if current_parameters.has_key("PK"): new_drugbox += "\| PK=" + current_parameters['PK'] + " " if current_parameters.has_key("PtMn"): new_drugbox += "\| PtMn=" + current_parameters['PtMn'] + " " if current_parameters.has_key("SN"): new_drugbox += "\| SN=" + current_parameters['SN'] + " " if current_parameters.has_key("SbNa"): new_drugbox += "\| CNa=" + current_parameters['SbNa'] + " " if current_parameters.has_key("SeO"): new_drugbox += "\| SeO=" + current_parameters['SeO'] + " " if current_parameters.has_key("SrP"): new_drugbox += "\| SrP=" + current_parameters['SrP'] + " " if current_parameters.has_key("TcPt"): new_drugbox += "\| TcPt=" + current_parameters['TcPt'] + " " if current_parameters.has_key("~~charge~~S"): new_drugbox += "\| ~~charge~~ S= " + current_parameters['~~charge~~S'] + " " if current_parameters.has_key("Sb"): new_drugbox += "\| C=" + current_parameters['Sb'] + " " ~~new_drugbox += "\n"~~ if current_parameters.has_key("Se"): new_drugbox += "\| Se=" + current_parameters['Se'] + " " if current_parameters.has_key("Sr"): new_drugbox += "\| Sr=" + current_parameters['Sr'] + " " if current_parameters.has_key("Tc"): new_drugbox += "\| Tc=" + current_parameters['Tc'] + " " if current_parameters.has_key("charge"): new_drugbox += "\| charge = " + current_parameters['charge'] + " " new_drugbox += "\n" if current_parameters.has_key("molecular_weight"): new_drugbox += "\| molecular_weight = " + current_parameters['molecular_weight'] + "\n" if current_parameters.has_key("smiles"): new_drugbox += "\| smiles = " + current_parameters['smiles'] + "\n" if current_parameters.has_key("InChI_Ref"): new_drugbox += "\| InChI_Ref = " + current_parameters['InChI_Ref'] + "\n" if current_parameters.has_key("InChI"): new_drugbox += "\| InChI = " + current_parameters['InChI'] + "\n" if current_parameters.has_key("InChIKey"): new_drugbox += "\| InChIKey = " + current_parameters['InChIKey'] + "\n" if current_parameters.has_key("StdInChI_Ref"): new_drugbox += "\| StdInChI_Ref = " + current_parameters['StdInChI_Ref'] + "\n" if current_parameters.has_key("StdInChI"): new_drugbox += "\| StdInChI = " + current_parameters['StdInChI'] + "\n" Line 396 ⟶ 676: if current_parameters.has_key("density"): new_drugbox += "\| density = " + current_parameters['density'] + "\n" if current_parameters.has_key("melting_point"): new_drugbox += "\| melting_point = " + current_parameters['melting_point'] + "\n" if current_parameters.has_key("melting_high"): new_drugbox += "\| melting_high = " + current_parameters['melting_high'] + "\n" if current_parameters.has_key("melting_notes"): new_drugbox += "\| melting_notes = " + current_parameters['melting_notes'] + "\n" if current_parameters.has_key("boiling_point"): new_drugbox += "\| boiling_point = " + current_parameters['boiling_point'] + "\n" if current_parameters.has_key("boiling_notes"): new_drugbox += "\| boiling_notes = " + current_parameters['boiling_notes'] + "\n" Line 402 ⟶ 684: if current_parameters.has_key("sec_combustion"): new_drugbox += "\| sec_combustion = " + current_parameters['sec_combustion'] + "\n" new_drugbox += "}}\n" ~~# print new_drugbox~~ # print new_drugbox ~~# replace original drugbox with new drugbox~~ ~~new_text = re.sub(regexp_drug_infobox, new_drugbox, text)~~ ~~print new_text~~ return new_drugbox ~~page.put(new_text, comment='populated clinical fields in drugbox', watchArticle = None, minorEdit = True)~~ ~~print ", page updated"~~ def merged_tradenames(merck_tradename, current_tradename): ~~wikipedia.stopme()~~ # merge tradenames ~~</source>~~ new_tradenames = [] if merck_tradename: merck_tradenames = sorted(set(merck_tradename.split(";")))[1:] for index, object in enumerate(merck_tradenames): merck_tradenames[index] = string.capitalize(string.strip(object.encode("utf-8"))) else: merck_tradenames = [] if current_tradename: current_tradenames = sorted(set(current_tradename.split(", "))) for index, object in enumerate(current_tradenames): current_tradenames[index] = string.capitalize(string.strip(object.encode("utf-8"))) else: current_tradenames = [] merged_tradenames = [] if merck_tradenames: merged_tradenames = merck_tradenames if current_tradenames: for name in current_tradenames: merged_tradenames.append(name) if merged_tradenames: new_tradenames = sorted(set(merged_tradenames)) new_tradename = ", ".join(new_tradenames) return new_tradename else: new_tradename = "" return new_tradename # print "merck tradenames: ", merck_tradenames # print "current tradenames: ", current_tradenames def test_MedlinePlus(accession_number): # add MedlinePlus parameter # \| MedlinePlus = <!-- MedlinePlus drug accession number, e.g., "a692051" that links to "http://www.nlm.nih.gov/medlineplus/druginfo/meds/a692051.html" --> # test web page, returns "200" if OK: # if urllib.urlopen(link).getcode() == 200: # 200: ('OK', 'Request fulfilled, document follows') # 404: (page not found) if accession_number: link = "http://www.nlm.nih.gov/medlineplus/druginfo/meds/" + accession_number + ".html" if urllib.urlopen(link).getcode() == 200: # test link status to make sure it is good before assigning parameter return True else: link = "" return False def test_Drugs_com(INN, tradename, drugbank_drugs_com): # add Drugs.com parameter # \| Drugs.com = <!-- link to Drugs.com monograph, e.g., "lisinopril" that links to "http://www.drugs.com/monograph/lisinopril.html" --> # create alternate candidate drugs.com links # alternateive roots include the monograph, CDI, CONS, MTM, and "parent" sectios of drgus.com web site # alternative stems inlclude the INN, trade names, and the name extracted from the drugbank link tradenames = tradename.split(", ") # drugs.com root links: roots = [("monograph","http://www.drugs.com/monograph/"), ("CDI","http://www.drugs.com/cdi/"), ("CONS","http://www.drugs.com/cons/"), ("MTM","http://www.drugs.com/mtm/"), ("pro","http://www.drugs.com/pro/"), ("international","http://www.drugs.com/international/"), ("parent","http://www.drugs.com/")] stems = [] drugnames = [] drugnames.append(INN) link = False if tradenames: for tradename in tradenames: drugnames.append(tradename) for drugname in drugnames: drugname = string.lower(drugname) # print "drugnames: ", drugnames if (drugname != " " and string.find(drugname, " ") > -1): stems.append(string.replace(drugname, " ", "_")) stems.append(string.replace(drugname, " ", "-")) elif (drugname != " "): stems.append(drugname) # also try common salts stems.append(drugname + "-hydrochloride") stems.append(drugname + "-sulfate") # stems.append(drugname + "-chloride") # stems.append(drugname + "-sodium") # stems.append(drugname + "-bromide") # stems.append(drugname + "-maleate") # stems.append(drugname + "-citrate") if drugbank_drugs_com: if (string.find(drugbank_drugs_com, "http://www.drugs.com/") > -1): temp = string.replace(drugbank_drugs_com, "http://www.drugs.com/", "") temp = string.replace(temp, ".html", "") drugnames.append(temp) try: for root in roots: for stem in stems: if stem: link = iriToUri(root[1] + stem + ".html") # print "attempted Drugs.com link: ", link if urllib.urlopen(link).getcode() == 200: # test link status to make sure it is good before assigning parameter # print "passed link: ", link if root[0] == "monograph": link = "{{drugs.com\|" + root[0] + "\|" + stem + "}}" raise StopIteration() else: opener = urllib.FancyURLopener({}) f = opener.open(link) text = f.read() result = regexp_monograph_url.search(text) if result: stem = result.group('STEM') link = "{{drugs.com\|" + roots[0][0] + "\|" + stem + "}}" raise StopIteration() else: link = "{{drugs.com\|" + root[0] + "\|" + stem + "}}" # print "link: ", link raise StopIteration() else: link = "" except StopIteration: pass return link def unbalanced(text): # test for unmatched square or curly brackets n_open_square_bracket = len(regexp_open_square_bracket.findall(text)) n_close_square_bracket = len(regexp_close_square_bracket.findall(text)) n_open_curly_bracket = len(regexp_open_curly_bracket.findall(text)) n_close_curly_bracket = len(regexp_close_curly_bracket.findall(text)) n_double_open_square_bracket = len(regexp_double_open_square_bracket.findall(text)) n_double_close_square_bracket = len(regexp_double_close_square_bracket.findall(text)) n_double_open_curly_bracket = len(regexp_double_open_curly_bracket.findall(text)) n_double_close_curly_bracket = len(regexp_double_close_curly_bracket.findall(text)) if (n_open_square_bracket != n_close_square_bracket or n_open_curly_bracket != n_close_curly_bracket or n_double_open_square_bracket != n_double_close_square_bracket or n_double_open_curly_bracket != n_double_close_curly_bracket): return True else: return False def savepage(page, text, summary = '', minor = False, log_string = ""): """Save text to a page and log exceptions.""" if summary != '': wikipedia.setAction(summary) try: page.put(text, minorEdit = minor) wikipedia.output('%s \03{green}saving %s' % (log_string, page.title()) ) return '' except wikipedia.LockedPage: wikipedia.output('%s \03{red}cannot save %s because it is locked\03{default}' % (log_string, page.title()) ) return '# %s: page was locked\n' % page.aslink() except wikipedia.EditConflict: wikipedia.output('%s \03{red}cannot save %s because of edit conflict\03{default}' % (log_string, page.title()) ) return '# %s: edit conflict occurred\n' % page.aslink() except wikipedia.SpamfilterError, error: wikipedia.output('%s \03{red}cannot save %s because of spam blacklist entry %s\03{default}' % ((log_string, page.title(), error.url)) ) return '# %s: spam blacklist entry\n' % page.aslink() except: wikipedia.output('%s \03{red}unknown error on saving %s\03{default}' % (log_string, page.title()) ) return '# %s: unknown error occurred\n' % page.aslink() def run(): drugbank_data = drugbank() DrugBank_ID_INN = {} for k, v in drugbank_data.iteritems(): DrugBank_ID_INN[v[8]]= k # list of articles to work on is generated by: "python pagegenerators.py -namespace:0 -transcludes:Drugbox > drugbox_titles.txt" articles = [] articles = codecs.open('/Users/BogBot/progs/pywikipedia/drugbox_titles.txt', mode = 'r', encoding='utf-8') # articles = ['Progesterone'] for article in articles: article = article.rstrip('\n') encoding = 'utf-8' if isinstance(article, basestring): if not isinstance(article, unicode): article = unicode(article, encoding) new_drugbox = "" log_string = "* [[" + article + "]], " site = wikipedia.getSite() page = wikipedia.Page(site, article) text = page.get(get_redirect = True) if not Allowbots(text): continue begin, end, begin2, end2 = find_drugbox_from_text(text) if end: parameters = text[begin:end] else: log_string = log_string + "drugbox not found!" print log_string continue # make sure that there are no unmatched square or curly brackets # if found, abbort, since these may indicate a error in the wiki markup # and may trigger an infinite loop else where in this script if unbalanced(parameters): log_string = log_string + "unmatched brackets found, article skipped!" print log_string continue # print text[begin:end] current_parameters = {} # first extract and assign nested templates commonly used in drugbox templates parameters = assign_nested_templates(parameters, current_parameters) # remove any embedded carriage returns from remaining templates: parameters = remove_embedded_carriage_returns(parameters) # next, parse each line for parameters lines = parameters.splitlines() for line in lines: parse_line(line, current_parameters) INN = article # INN = "Acetylsalicylic acid" if INN in drugbank_data: db_data = drugbank_data[INN] elif "DrugBank" in current_parameters and current_parameters['DrugBank'] in DrugBank_ID_INN: log_string = str(log_string + "INN reset from " + INN,) INN = DrugBank_ID_INN[current_parameters['DrugBank']] log_string = log_string + "to " + INN + ", " db_data = drugbank_data[INN] if not "drug_name" in current_parameters: current_parameters['drug_name'] = INN else: db_data = [] if "DrugBank" in current_parameters and current_parameters['DrugBank'] in DrugBank_ID_INN: if DrugBank_ID_INN[current_parameters['DrugBank']] == INN: log_string = log_string + "DrugBankID/INN OK!, " else: log_string = log_string + "DrugBankID/INN NOT OK!, " else: if db_data: if db_data[8]: if not "DrugBank" in current_parameters: current_parameters['DrugBank'] = db_data[8] # while we are at it, populate KEGG_Drug_ID, ChemSpider_ID, and PubChem_Compound_ID fields if missing if db_data: if db_data[4] and not "KEGG" in current_parameters: current_parameters['KEGG'] = db_data[4] if db_data[6] and not "ChemSpiderID" in current_parameters: current_parameters['ChemSpiderID'] = db_data[6] if db_data[7] and not "PubChem" in current_parameters: current_parameters['PubChem'] = db_data[7] # augument current tradename list with the ones supplied by the Merck Manual if db_data: if db_data[0]: merck_tradename = db_data[0] else: merck_tradename = "" else: merck_tradename = "" if 'tradename' in current_parameters: current_tradename = current_parameters['tradename'] else: current_tradename = "" new_tradename = merged_tradenames(merck_tradename, current_tradename) if new_tradename: current_parameters['tradename'] = new_tradename # add MedlinePlus parameter if db_data: if db_data[2]: if test_MedlinePlus(db_data[2]): current_parameters['MedlinePlus'] = db_data[2] # add Drugs.com link if 'tradename' in current_parameters: tradename = current_parameters['tradename'] else: tradename = "" if db_data: if db_data[3]: drugbank_drugs_com = db_data[3] else: drugbank_drugs_com = "" else: drugbank_drugs_com = "" result = test_Drugs_com(INN, tradename, drugbank_drugs_com) if result: current_parameters['Drugs.com'] = result # add MedlinePlus parameter if db_data: if db_data[2]: if test_MedlinePlus(db_data[2]): current_parameters['MedlinePlus'] = db_data[2] if not 'MedlinePlus' in current_parameters: opener = urllib.FancyURLopener({}) stem = string.replace(article, " ", "+") link = "http://vsearch.nlm.nih.gov/vivisimo/cgi-bin/query-meta?&v:project=medlineplus&query=" + stem # print "MedlinePlus link:", link f = opener.open(link) text2 = f.read() result = regexp_medlineplus_url.search(text2) if result: current_parameters['MedlinePlus'] = result.group('ACNO') new_text = text[:begin2] + build_new_drugbox(current_parameters) + text[end2:] # print build_new_drugbox(current_parameters) # print new_text # print current_parameters # print new_text if current_parameters: comment='populated new fields in {{drugbox}} and reordered per [[Wikipedia:Bots/Requests_for_approval/BogBot_2\|bot approval]]. Report errors and suggestions to [[User_talk:BogBot]]' status = savepage(page, new_text, comment, False, log_string) else: print ", page not updated" wikipedia.stopme() run() </syntaxhighlight>