User:BogBot/Source code/Task 03: Difference between revisions

Content deleted Content added
removed duplicate code
m top: Replaced deprecated <source> tags with <syntaxhighlight>
 
(10 intermediate revisions by one other user not shown)
Line 1:
<syntaxhighlight lang=python>
#!/usr/bin/python
# -*- coding: UTF-8 -*-
 
# Bot Script to populate new clinical fields in Drugbox templates in Wikipedia drug articles.
Line 7 ⟶ 8:
# | Drugs.com = <!-- link to Drugs.com monograph, e.g., "lisinopril" that links to "http://www.drugs.com/monograph/lisinopril.html" -->
# | MedlinePlus = <!-- MedlinePlus drug accession number, e.g., "a692051" that links to "http://www.nlm.nih.gov/medlineplus/druginfo/meds/a692051.html" -->
# In addition, populate KEGG_Drug_ID, ChemSpider_ID, and PubChem_Compound_ID fields if missing with data from http://www.drugbank.ca/system/downloads/current/drugbank.txt.zip
# The tradenames were obtained from http://www.merckmanuals.com/home/drugnames-index/trade/a.html
# Finally the script sorts the fields in the order they are currently rendered by the drugbox template
# (in the order of clinical, pharmacokinetic, identifiers, and chemical data)
 
"""{{Drugbox
| Watchedfields
| verifiedrevid = 408577806
| IUPAC_name =
| OtherNames =
| image =
| width =
Line 124 ⟶ 131:
}}"""
 
from collections import defaultdict
import codecs
import csv
import re
import string
import sys
import urllib
import urlparse
import wikipedia
from collections import defaultdict
import urllib
import csv
import string
 
 
# Included for bot exclusion compliance (see http://en.wikipedia.org/wiki/Template:Bots)
# Matches {{nobots}} and the {{bots|...}} forms that exclude this bot.
user = "BogBot"
regexp_ab = re.compile(r'\{\{(nobots|bots\|(allow=none|deny=.*?' + user + r'.*?|optout=all|deny=all))\}\}')

# adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/
# Build a regex to locate the drugbox
exp = r'\{\{'              # the opening brackets for the infobox
exp = exp + r'\s*'         # any amount of whitespace
exp = exp + r'[Dd]rugbox'  # the word "drugbox", capitalized or not
exp = exp + r'.*\}\}'      # any amount of anything, followed by the end of the drugbox
regexp_drug_infobox = re.compile(exp, re.DOTALL)

# One "| name = value" infobox entry; parse_line() reads the PARAM and VALUE groups.
regexp_param = re.compile(r"^\s*?\|\s*?(?P<PARAM>\S+)\s*?=\s*?(?P<VALUE>.+)\s*?($|\|)")
# NOTE(review): not referenced in the visible part of the file; presumably the
# body of a {{...}} template -- confirm against its callers.
regexp_nested_template = re.compile(r"\{\{(?P<PARAMS>.+)\}\}")

# Single brackets; each name describes the character it matches.
regexp_open_square_bracket = re.compile(r"\[", re.DOTALL)
regexp_close_square_bracket = re.compile(r"\]", re.DOTALL)
regexp_open_curly_bracket = re.compile(r"\{", re.DOTALL)
regexp_close_curly_bracket = re.compile(r"\}", re.DOTALL)

# Doubled brackets (wiki links and templates).
regexp_double_open_square_bracket = re.compile(r"\[\[", re.DOTALL)
regexp_double_close_square_bracket = re.compile(r"\]\]", re.DOTALL)
regexp_double_open_curly_bracket = re.compile(r"\{\{", re.DOTALL)
regexp_double_close_curly_bracket = re.compile(r"\}\}", re.DOTALL)

# Each pattern below captures the nested template of one drugbox field in the
# TEMPLATE group.

# ATC_supplemental = {{ATC|B01|AC06}}, {{ATC|N02|BA01}}
regexp_ATC_supplemental = re.compile(r"\|\s*?ATC_supplemental\s*?=\s*?(?P<TEMPLATE>.*\{\{\s*(ATC).+?\}\})\s*?($|\|)")
# CASNo_Ref = {{cascite|correct|CAS}}
regexp_CASNo_Ref = re.compile(r"\|\s*?CASNo_Ref\s*?=\s*?(?P<TEMPLATE>\{\{(Cascite|cascite).*?\}\})")
# CAS_supplemental = {{CAS|405-41-4}}
regexp_CAS_supplemental = re.compile(r"\|\s*?CAS_supplemental\s*?=\s*?(?P<TEMPLATE>\{\{CAS.*?\}\})")
# ChEMBL_Ref = {{ebicite|correct|EBI}}
regexp_ChEMBL_Ref = re.compile(r"\|\s*?ChEMBL_Ref\s*?=\s*?(?P<TEMPLATE>\{\{(Ebicite|ebicite).*?\}\})")
# ChemSpiderID_Ref = {{chemspidercite|correct|chemspider}}
regexp_ChemSpiderID_Ref = re.compile(r"\|\s*?ChemSpiderID_Ref\s*?=\s*?(?P<TEMPLATE>\{\{(Chemspidercite|chemspidercite).*?\}\})")
# Drugs.com = {{drugs.com|monograph|lisinopril}}
regexp_Drugs_com = re.compile(r"\|\s*?Drugs\.com\s*?=\s*?(?P<TEMPLATE>\{\{(Drugs\.com|drugs\.com).*?\}\})")
# KEGG_Ref = {{keggcite|correct|kegg}}
regexp_KEGG_Ref = re.compile(r"\|\s*?KEGG_Ref\s*?=\s*?(?P<TEMPLATE>\{\{(Keggcite|keggcite).*?\}\})")
# StdInChI_Ref = {{stdinchicite|correct|chemspider}}
regexp_StdInChI_Ref = re.compile(r"\|\s*?StdInChI_Ref\s*?=\s*?(?P<TEMPLATE>\{\{(Stdinchicite|stdinchicite).*?\}\})")
# StdInChIKey_Ref = {{stdinchicite|correct|chemspider}}
regexp_StdInChIKey_Ref = re.compile(r"\|\s*?StdInChIKey_Ref\s*?=\s*?(?P<TEMPLATE>\{\{(Stdinchicite|stdinchicite).*?\}\})")
# UNII_Ref = {{fdacite|changed|FDA}}
regexp_UNII_Ref = re.compile(r"\|\s*?UNII_Ref\s*?=\s*?(?P<TEMPLATE>\{\{(Fdacite|fdacite).*?\}\})")

# named ref tag = <ref name="fattinger2000"> but not <ref name="fattinger2000" />
# [^>] keeps the match inside one tag, so a self-closing tag cannot match by
# running on to a later ">" in the same string.
regexp_ref_tag_begin = re.compile(r"<ref>|<ref name[^>]*?[^/>]>")
regexp_ref_tag_end = re.compile(r"</ref>")
regexp_citation_template = re.compile(r"\{\{[Cc]ite\s*?(?P<TEMPLATE>.*?)\}\}")

# href='/monograph/maprotiline-hydrochloride.html'
regexp_monograph_url = re.compile(r"href='/monograph/(?P<STEM>.*?)\.html'", re.DOTALL)

# http://www.nlm.nih.gov/medlineplus/druginfo/meds/a604021.html
regexp_medlineplus_url = re.compile(r"www\.nlm\.nih\.gov/medlineplus/druginfo/meds/(?P<ACNO>.*?)\.html", re.DOTALL)
 
def Allowbots(text):
if (regexp_ab.search(text)):
Line 151 ⟶ 204:
return True
 
def urlEncodeNonAscii(b):
    """Percent-encode every character of b in the range 0x80-0xFF.

    Characters below 0x80 are returned unchanged.  Each matching character
    is replaced by "%" followed by its two-digit lowercase hex code.
    """
    def _percent_encode(match):
        return '%%%02x' % ord(match.group(0))

    return re.sub('[\x80-\xFF]', _percent_encode, b)
 
def iriToUri(iri):
    """Convert an IRI into a URI made only of ASCII characters.

    The IRI is split with urlparse.urlparse(); component 1 (the network
    ___location) is IDNA-encoded, every other component is UTF-8 encoded and
    then percent-encoded by urlEncodeNonAscii().  The pieces are reassembled
    with urlparse.urlunparse().
    """
    parts = urlparse.urlparse(iri)
    encoded_parts = [
        part.encode('idna') if parti == 1 else urlEncodeNonAscii(part.encode('utf-8'))
        for parti, part in enumerate(parts)
    ]
    return urlparse.urlunparse(encoded_parts)
 
def find_drugbox_from_text(article_text):
    """Locate the drugbox template inside article_text.

    adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/

    Returns None when regexp_drug_infobox finds nothing or when the curly
    brackets of the candidate never balance.  Otherwise returns a 4-tuple of
    indices into article_text:

        (index of the first "|" after the template start,
         index of the first brace of the closing "}}",
         index of the opening "{{",
         index just past the closing "}}")

    NOTE(review): if the template contains no "|" at all, str.find() yields
    -1 and the first element becomes begin - 1 -- confirm callers cope.
    """
    search_result = regexp_drug_infobox.search(article_text)
    if not search_result:
        return None
    result_text = search_result.group(0)  # the entire matching sequence
    begin = search_result.start()

    # the regex isn't perfect (it is greedy), so walk the text and stop at
    # the bracket that actually closes the infobox
    count = 0
    last_ind = None
    for ind, c in enumerate(result_text):
        if c == '}':
            count = count - 1
        elif c == '{':
            count = count + 1
        if count == 0 and not ind == 0:
            last_ind = ind
            break
    if last_ind is None:
        # unbalanced brackets: there is no reliable end position to report
        return None

    offset = result_text.find('|')
    ___location = (begin + offset, begin + last_ind - 1, begin, begin + last_ind + 1)
    return ___location
 
def drugbank():
    """Load the augmented drug-links CSV into a dictionary.

    drugbank data obtained from http://www.drugbank.ca/system/downloads/current/drugbank.txt.zip

    The first CSV column (Name) is the key; the value is the list of the
    remaining columns:

        0 Trade_Names       1 Drug_Type          2 MedlinePlus
        3 Drugs.com_link    4 KEGG_Drug_ID       5 KEGG_Compound_ID
        6 ChemSpider_ID     7 PubChem_Compound_ID
        8 DrugBank_ID
    """
    drugbank_data = {}
    # "with" closes the file even if a malformed row raises part-way through
    with open('/Users/BogBot/progs/pywikipedia/drugbox/drug_links_agumented.csv', 'rU') as csv_file:
        for row in csv.reader(csv_file, dialect='excel'):
            if not row:
                # csv.reader yields [] for blank lines; they carry no key
                continue
            drugbank_data[row[0]] = row[1:]
    return drugbank_data
 
def assign_nested_templates(parameters, current_parameters):
    """Extract the nested templates commonly used in drugbox templates.

    For each known field whose value is itself a template (for example
    "| CASNo_Ref = {{cascite|correct|CAS}}") the matched template text is
    stored in current_parameters under the field name and the matched span
    is removed from parameters.

    parameters         -- drugbox parameter text to scan
    current_parameters -- dict updated in place with the fields found

    Returns the parameter text with the extracted fields removed.
    """
    # (field name, pattern, replacement for the matched span), in the order
    # the fields are processed.
    extraction_plan = (
        # this pattern may consume the "|" that starts the next entry, so a
        # pipe is put back in its place
        ('ATC_supplemental', regexp_ATC_supplemental, "|"),
        ('ChEMBL_Ref', regexp_ChEMBL_Ref, ""),
        ('CASNo_Ref', regexp_CASNo_Ref, ""),
        ('CAS_supplemental', regexp_CAS_supplemental, ""),
        ('ChemSpiderID_Ref', regexp_ChemSpiderID_Ref, ""),
        ('Drugs.com', regexp_Drugs_com, ""),
        ('KEGG_Ref', regexp_KEGG_Ref, ""),
        ('StdInChI_Ref', regexp_StdInChI_Ref, ""),
        ('UNII_Ref', regexp_UNII_Ref, ""),
    )

    for field, pattern, replacement in extraction_plan:
        found = pattern.search(parameters)
        if found:
            current_parameters[field] = found.group('TEMPLATE')
            parameters = pattern.sub(replacement, parameters)

    return parameters
 
def rejoin(begin, end, sub_strings, type):
# make sure we split only on the pipes that represent ends of the infobox entry, not the pipes used in links, nested templates, citations, etc.
# adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/
new_list = [sub_strings[0]]
for sub_string in sub_strings[1:]:
if (end in sub_string) and ((not begin in sub_string) or sub_string.find(end) < sub_string.find(begin)):
if type == "line":
new_list[-1] = new_list[-1] + sub_string
if type == "parameter":
new_list[-1] = new_list[-1] + '|' + sub_string
else:
new_list.append(sub_string)
sub_strings = new_list
return sub_strings
def test_disjoint(begin,end,sub_strings):
disjoint = False
for sub_string in sub_strings:
if (end in sub_string) and ((not begin in sub_string) or sub_string.find(end) < sub_string.find(begin)):
disjoint = True
break
return disjoint
 
def _regex_closes_before_opening(regexp_begin, regexp_end, sub_string):
    """Return True when sub_string has a closing match with no opening match before it."""
    end_match = regexp_end.search(sub_string)
    if end_match is None:
        return False
    begin_match = regexp_begin.search(sub_string)
    return begin_match is None or end_match.start() < begin_match.start()


def regex_rejoin(regexp_begin, regexp_end, sub_strings, type):
    """Regex counterpart of rejoin(): glue back pieces split inside a tag pair.

    make sure we split only on the pipes that represent ends of the infobox
    entry, not the pipes used in links, nested templates, citations, etc.
    adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/

    regexp_begin, regexp_end -- compiled patterns for the opening and the
                                closing tag
    sub_strings              -- list of pieces produced by an earlier split
    type                     -- "line": append the piece to its predecessor
                                as is; "parameter": append it with the '|'
                                that the split removed

    A piece is appended to its predecessor when it contains a closing match
    that is not preceded by an opening match.  A piece that only opens a tag
    is left alone, so the piece that closes it is glued onto it and not onto
    an unrelated earlier entry.  Returns a new list; an empty input gives an
    empty list.

    NOTE(review): with any other value of `type` such a piece is dropped.
    """
    if not sub_strings:
        return []
    new_list = [sub_strings[0]]
    for sub_string in sub_strings[1:]:
        if _regex_closes_before_opening(regexp_begin, regexp_end, sub_string):
            if type == "line":
                new_list[-1] = new_list[-1] + sub_string
            if type == "parameter":
                new_list[-1] = new_list[-1] + '|' + sub_string
        else:
            new_list.append(sub_string)
    return new_list


def regex_test_disjoint(regexp_begin, regexp_end, sub_strings):
    """Return True when regex_rejoin() with the same patterns still has work to do.

    Each piece is judged on its own, with the same rule regex_rejoin()
    applies, and the first piece is skipped because it can never be merged.
    Every True result therefore corresponds to a merge that shortens the
    list, so a caller looping on this test always terminates.
    """
    return any(
        _regex_closes_before_opening(regexp_begin, regexp_end, sub_string)
        for sub_string in sub_strings[1:]
    )
 
def pad_parameters(text):
    """Normalise the spacing inside every citation template found in text.

    For each match of regexp_citation_template the TEMPLATE part is split on
    "|"; every piece is stripped, "name=value" pieces become "name = value",
    and the pieces are joined with " | " behind a single leading space.
    E.g. "{{cite web|url=x|title=T}}" -> "{{cite web | url = x | title = T}}".

    Only the first "=" of a piece is padded, so a value that itself contains
    "=" (such as a URL with a query string) is left intact.  A template with
    an empty body is returned unchanged.
    """
    def _pad(match):
        template = match.group('TEMPLATE')
        if not template.strip():
            # nothing to pad; replacing an empty string would touch the
            # whole text
            return match.group(0)

        padded = []
        for piece in template.split("|"):
            name, separator, value = piece.partition("=")
            if separator:
                # rstrip() so that an empty value gives "name =" and not
                # "name = " followed by the " | " joiner
                piece = (name.strip() + " = " + value.strip()).rstrip()
            padded.append(piece.strip())

        # rebuild the match around the TEMPLATE group so the "{{cite"
        # prefix and the closing "}}" are kept exactly as written
        whole = match.group(0)
        shift = match.start()
        head = whole[:match.start('TEMPLATE') - shift]
        tail = whole[match.end('TEMPLATE') - shift:]
        return head + " " + " | ".join(padded) + tail

    return regexp_citation_template.sub(_pad, text)
 
def parse_line(line, current_parameters):
    """Parse one drugbox line and record its parameter(s) in current_parameters.

    A line holding more than one "|" is split on the pipes; pieces that
    belong to a wiki link, a nested template or a citation are glued back
    together, and every resulting piece containing "=" is stored as
    name -> value.  Any other line is matched against regexp_param and, on
    success, its PARAM/VALUE groups are stored.

    Names and values are UTF-8 encoded and stripped; values are run through
    pad_parameters(), and an empty value is stored as a single space.
    Returns None; the result is the mutation of current_parameters.
    """
    def _store(name, value):
        # shared tail of both branches: pad citations, never store ""
        value = pad_parameters(value)
        if not value:
            value = " "
        current_parameters[name] = value

    if line.count('|') > 1:
        pieces = line.split("|")

        # keep only the pipes that end an infobox entry: first undo the
        # splits made inside wiki links ...
        pieces = rejoin('[[', ']]', pieces, 'parameter')
        # ... then inside nested templates, until nothing is left to merge
        while test_disjoint('{{', '}}', pieces):
            pieces = rejoin('{{', '}}', pieces, 'parameter')
        # ... and finally inside citations
        while regex_test_disjoint(regexp_ref_tag_begin, regexp_ref_tag_end, pieces):
            pieces = regex_rejoin(regexp_ref_tag_begin, regexp_ref_tag_end, pieces, 'parameter')

        # now assign the parameters
        for piece in pieces:
            if "=" not in piece:
                continue
            parts = piece.split("=", 1)
            parameter = str(parts[0].encode("utf-8")).strip()
            value = str(parts[1].encode("utf-8")).strip()
            _store(parameter, value)
    else:
        result_drug_param = regexp_param.search(line)
        if result_drug_param:
            parameter = (result_drug_param.group('PARAM').encode("utf-8")).strip()
            value = (result_drug_param.group('VALUE').encode("utf-8")).strip()
            _store(parameter, value)
    return
def remove_embedded_carriage_returns(parameters):
    """Remove the line breaks embedded in templates and citations.

    parameters is split into lines.  A line that closes a "{{...}}" template
    or a <ref>...</ref> citation opened on an earlier line is appended
    (without a separator) to the line before it, repeatedly, until no such
    line is left.  The remaining lines are joined with "\\n" and returned.
    """
    lines = parameters.splitlines()

    # nested templates spread over several lines
    while test_disjoint('{{', '}}', lines):
        lines = rejoin('{{', '}}', lines, 'line')

    # citations spread over several lines; the patterns are passed in the
    # same (begin, end) order to the test and to the merge so that both
    # apply the same rule
    while regex_test_disjoint(regexp_ref_tag_begin, regexp_ref_tag_end, lines):
        lines = regex_rejoin(regexp_ref_tag_begin, regexp_ref_tag_end, lines, 'line')

    parameters = "\n".join(lines)
    return parameters
INN_html = string.lower(string.replace(INN, " ", "_"))
link = "http://www.drugs.com/monograph/" + INN_html + ".html"
if urllib.urlopen(link).getcode() == 200: # test link status to make sure it is good before assigning parameter
current_parameters['Drugs.com'] = INN_html
# print "Drugs.com: ", current_parameters['Drugs.com']
 
# for parameter, value indef build_new_drugbox(current_parameters.iteritems():
# build new drugbox template
# print parameter, ":", value
 
# make sure that all values in the current_parameters dictionary are properly encoded
# while we are at it, populate KEGG_Drug_ID, ChemSpider_ID, and PubChem_Compound_ID fields if missing
 
encoding = 'utf-8'
if data[4] and not current_parameters.has_key("KEGG"):
for k, v in current_parameters.iteritems():
current_parameters['KEGG'] = data[4]
if isinstance(v, basestring):
if not isinstance(v, unicode):
v = unicode(v, encoding)
current_parameters[k] = v
 
# if type parameter is missing, check subordinate parameters that infer type, and if found, assign type
if data[6] and not current_parameters.has_key("ChemSpiderID"):
current_parameters['ChemSpiderID'] = data[6]
 
if data[7] and not current_parameters.has_key("PubChemtype"):
if ("component1" in current_parameters['PubChem'] =or data[7]
"class1" in current_parameters or
"component2" in current_parameters or
"class2" in current_parameters or
"component3" in current_parameters or
"class3" in current_parameters or
"component4" in current_parameters
or "class4" in current_parameters):
current_parameters['type'] = "combo"
elif ("mab_type") in current_parameters:
current_parameters['type'] = "mab"
elif ("vaccine_type") in current_parameters:
current_parameters['type'] = "vaccine"
 
# if not previously assigned, add the following "empty" parameters
# build new drugbox template
if not current_parameters.has_key("tradename"):
current_parameters["tradename"] = " "
 
new_drugbox = unicode( "{{Drugbox\n", "utf-8" )
if current_parameters.has_key("Verifiedfields"): new_drugbox += "| Verifiedfields = " + current_parameters['Verifiedfields'] + "\n"
if current_parameters.has_key("Watchedfields"): new_drugbox += "| Watchedfields = " + current_parameters['Watchedfields'] + "\n"
if current_parameters.has_key("verifiedrevid"): new_drugbox += "| verifiedrevid = " + current_parameters['verifiedrevid'] + "\n"
if current_parameters.has_key("IUPAC_name"): new_drugbox += "| IUPAC_name = " + current_parameters['IUPAC_name'] + "\n"
if current_parameters.has_key("OtherNames"): new_drugbox += "| OtherNames = " + current_parameters['OtherNames'] + "\n"
if current_parameters.has_key("image"): new_drugbox += "| image = " + current_parameters['image'] + "\n"
if current_parameters.has_key("width"): new_drugbox += "| width = " + current_parameters['width'] + "\n"
Line 307 ⟶ 565:
if current_parameters.has_key("vaccine_type"): new_drugbox += "| vaccine_type = " + current_parameters['vaccine_type'] + "\n"
 
if ("tradename" in current_parameters or "Drugs.com" in current_parameters or "MedlinePlus" in current_parameters or "licence_EU" in current_parameters or
new_drugbox += "\n<!--Clinical data-->\n"
"licence_US" in current_parameters or "DailyMedID" in current_parameters or "pregnancy_AU" in current_parameters or "pregnancy_US" in current_parameters or
if current_parameters.has_key("tradename"): new_drugbox += "| tradename = " + current_parameters['tradename'] + "\n"
"pregnancy_category" in current_parameters or "legal_AU" in current_parameters or "legal_CA" in current_parameters or "legal_UK" in current_parameters or
if current_parameters.has_key("Drugs.com"): new_drugbox += "| Drugs.com = " + current_parameters['Drugs.com'] + "\n"
"legal_US" in current_parameters or "legal_status" in current_parameters or "dependency_liability" or "routes_of_administration" in current_parameters):
if current_parameters.has_key("MedlinePlus"): new_drugbox += "| MedlinePlus = " + current_parameters['MedlinePlus'] + "\n"
if current_parameters.has_key("licence_EU"): new_drugbox += "| licence_EU = " + current_parameters['licence_EU'] + "\n"
if current_parameters.has_key("licence_US"): new_drugbox += "| licence_US = " + current_parameters['licence_US'] + "\n"
if current_parameters.has_key("DailyMedID"): new_drugbox += "| DailyMedID = " + current_parameters['DailyMedID'] + "\n"
if current_parameters.has_key("pregnancy_AU"): new_drugbox += "| pregnancy_AU = " + current_parameters['pregnancy_AU'] + "\n"
if current_parameters.has_key("pregnancy_US"): new_drugbox += "| pregnancy_US = " + current_parameters['pregnancy_US'] + "\n"
if current_parameters.has_key("pregnancy_category"): new_drugbox += "| pregnancy_category = " + current_parameters['pregnancy_category'] + "\n"
if current_parameters.has_key("legal_AU"): new_drugbox += "| legal_AU = " + current_parameters['legal_AU'] + "\n"
if current_parameters.has_key("legal_CA"): new_drugbox += "| legal_CA = " + current_parameters['legal_CA'] + "\n"
if current_parameters.has_key("legal_UK"): new_drugbox += "| legal_UK = " + current_parameters['legal_UK'] + "\n"
if current_parameters.has_key("legal_US"): new_drugbox += "| legal_US = " + current_parameters['legal_US'] + "\n"
if current_parameters.has_key("legal_status"): new_drugbox += "| legal_status = " + current_parameters['legal_status'] + "\n"
if current_parameters.has_key("dependency_liability"): new_drugbox += "| dependency_liability = " + current_parameters['dependency_liability'] + "\n"
if current_parameters.has_key("routes_of_administration"): new_drugbox += "| routes_of_administration = " + current_parameters['routes_of_administration'] + "\n"
 
new_drugbox += "\n<!--PharmacokineticClinical data-->\n"
if current_parameters.has_key("bioavailabilitytradename"): new_drugbox += "| bioavailabilitytradename = " + current_parameters['bioavailabilitytradename'] + "\n"
if current_parameters.has_key("protein_boundDrugs.com"): new_drugbox += "| protein_boundDrugs.com = " + current_parameters['protein_boundDrugs.com'] + "\n"
if current_parameters.has_key("metabolismMedlinePlus"): new_drugbox += "| metabolismMedlinePlus = " + current_parameters['metabolismMedlinePlus'] + "\n"
if current_parameters.has_key("elimination_half-lifelicence_EU"): new_drugbox += "| elimination_half-lifelicence_EU = " + current_parameters['elimination_half-lifelicence_EU'] + "\n"
if current_parameters.has_key("excretionlicence_US"): new_drugbox += "| excretionlicence_US = " + current_parameters['excretionlicence_US'] + "\n"
if current_parameters.has_key("DailyMedID"): new_drugbox += "| DailyMedID = " + current_parameters['DailyMedID'] + "\n"
if current_parameters.has_key("pregnancy_AU"): new_drugbox += "| pregnancy_AU = " + current_parameters['pregnancy_AU'] + "\n"
if current_parameters.has_key("pregnancy_US"): new_drugbox += "| pregnancy_US = " + current_parameters['pregnancy_US'] + "\n"
if current_parameters.has_key("pregnancy_category"): new_drugbox += "| pregnancy_category = " + current_parameters['pregnancy_category'] + "\n"
if current_parameters.has_key("legal_AU"): new_drugbox += "| legal_AU = " + current_parameters['legal_AU'] + "\n"
if current_parameters.has_key("legal_CA"): new_drugbox += "| legal_CA = " + current_parameters['legal_CA'] + "\n"
if current_parameters.has_key("legal_UK"): new_drugbox += "| legal_UK = " + current_parameters['legal_UK'] + "\n"
if current_parameters.has_key("legal_US"): new_drugbox += "| legal_US = " + current_parameters['legal_US'] + "\n"
if current_parameters.has_key("legal_status"): new_drugbox += "| legal_status = " + current_parameters['legal_status'] + "\n"
if current_parameters.has_key("dependency_liability"): new_drugbox += "| dependency_liability = " + current_parameters['dependency_liability'] + "\n"
if current_parameters.has_key("routes_of_administration"): new_drugbox += "| routes_of_administration = " + current_parameters['routes_of_administration'] + "\n"
 
if ("bioavailability" in current_parameters or "protein_bound metabolism" in current_parameters or "elimination_half-life" in current_parameters or "excretion" in current_parameters):
 
new_drugbox += "\n<!--Pharmacokinetic data-->\n"
if current_parameters.has_key("bioavailability"): new_drugbox += "| bioavailability = " + current_parameters['bioavailability'] + "\n"
if current_parameters.has_key("protein_bound"): new_drugbox += "| protein_bound = " + current_parameters['protein_bound'] + "\n"
if current_parameters.has_key("metabolism"): new_drugbox += "| metabolism = " + current_parameters['metabolism'] + "\n"
if current_parameters.has_key("elimination_half-life"): new_drugbox += "| elimination_half-life = " + current_parameters['elimination_half-life'] + "\n"
if current_parameters.has_key("excretion"): new_drugbox += "| excretion = " + current_parameters['excretion'] + "\n"
if ("CAS_number" in current_parameters or "CAS_supplemental" in current_parameters or "ATCvet" in current_parameters or "ATC_prefix" in current_parameters or
new_drugbox += "\n<!--Identifiers-->\n"
"ATC_suffix" in current_parameters or "ATC_supplemental" in current_parameters or "PubChem" in current_parameters or "PubChemSubstance" in current_parameters or
if current_parameters.has_key("CAS_number"): new_drugbox += "| CAS_number = " + current_parameters['CAS_number'] + "\n"
"IUPHAR_ligand" in current_parameters or "DrugBank" in current_parameters or "ChemSpiderID" in current_parameters or "UNII" in current_parameters or
if current_parameters.has_key("CAS_supplemental"): new_drugbox += "| CAS_supplemental = " + current_parameters['CAS_supplemental'] + "\n"
"KEGG" in current_parameters or "ChEBI" in current_parameters or "ChEMBL" in current_parameters):
if current_parameters.has_key("ATCvet"): new_drugbox += "| ATCvet = " + current_parameters['ATCvet'] + "\n"
new_drugbox += "\n<!--Identifiers-->\n"
if current_parameters.has_key("ATC_prefix"): new_drugbox += "| ATC_prefix = " + current_parameters['ATC_prefix'] + "\n"
if current_parameters.has_key("ATC_suffixCAS_number_Ref"): new_drugbox += "| ATC_suffixCAS_number_Ref = " + current_parameters['ATC_suffixCAS_number_Ref'] + "\n"
if current_parameters.has_key("ATC_supplementalCASNo_Ref"): new_drugbox += "| ATC_supplementalCASNo_Ref = " + current_parameters['ATC_supplementalCASNo_Ref'] + "\n"
if current_parameters.has_key("PubChemCAS_number"): new_drugbox += "| PubChemCAS_number = " + current_parameters['PubChemCAS_number'] + "\n"
if current_parameters.has_key("PubChemSubstanceCAS_supplemental"): new_drugbox += "| PubChemSubstanceCAS_supplemental = " + current_parameters['PubChemSubstanceCAS_supplemental'] + "\n"
if current_parameters.has_key("IUPHAR_ligandATCvet"): new_drugbox += "| IUPHAR_ligandATCvet = " + current_parameters['IUPHAR_ligandATCvet'] + "\n"
if current_parameters.has_key("DrugBankATC_prefix"): new_drugbox += "| DrugBankATC_prefix = " + current_parameters['DrugBankATC_prefix'] + "\n"
if current_parameters.has_key("ChemSpiderID_RefATC_suffix"): new_drugbox += "| ChemSpiderID_RefATC_suffix = " + current_parameters['ChemSpiderID_RefATC_suffix'] + "\n"
if current_parameters.has_key("ChemSpiderIDATC_supplemental"): new_drugbox += "| ChemSpiderIDATC_supplemental = " + current_parameters['ChemSpiderIDATC_supplemental'] + "\n"
if current_parameters.has_key("UNII_RefPubChem"): new_drugbox += "| UNII_RefPubChem = " + current_parameters['UNII_RefPubChem'] + "\n"
if current_parameters.has_key("UNIIPubChemSubstance"): new_drugbox += "| UNIIPubChemSubstance = " + current_parameters['UNIIPubChemSubstance'] + "\n"
if current_parameters.has_key("KEGG_RefIUPHAR_ligand"): new_drugbox += "| KEGG_RefIUPHAR_ligand = " + current_parameters['KEGG_RefIUPHAR_ligand'] + "\n"
if current_parameters.has_key("KEGGDrugBank_Ref"): new_drugbox += "| KEGGDrugBank_Ref = " + current_parameters['KEGGDrugBank_Ref'] + "\n"
if current_parameters.has_key("ChEBI_RefDrugBank"): new_drugbox += "| ChEBI_RefDrugBank = " + current_parameters['ChEBI_RefDrugBank'] + "\n"
if current_parameters.has_key("ChEBIChemSpiderID_Ref"): new_drugbox += "| ChEBIChemSpiderID_Ref = " + current_parameters['ChEBIChemSpiderID_Ref'] + "\n"
if current_parameters.has_key("ChEMBL_RefChemSpiderID"): new_drugbox += "| ChEMBL_RefChemSpiderID = " + current_parameters['ChEMBL_RefChemSpiderID'] + "\n"
if current_parameters.has_key("ChEMBLUNII_Ref"): new_drugbox += "| ChEMBLUNII_Ref = " + current_parameters['ChEMBLUNII_Ref'] + "\n"
if current_parameters.has_key("UNII"): new_drugbox += "| UNII = " + current_parameters['UNII'] + "\n"
if current_parameters.has_key("KEGG_Ref"): new_drugbox += "| KEGG_Ref = " + current_parameters['KEGG_Ref'] + "\n"
if current_parameters.has_key("KEGG"): new_drugbox += "| KEGG = " + current_parameters['KEGG'] + "\n"
if current_parameters.has_key("ChEBI_Ref"): new_drugbox += "| ChEBI_Ref = " + current_parameters['ChEBI_Ref'] + "\n"
if current_parameters.has_key("ChEBI"): new_drugbox += "| ChEBI = " + current_parameters['ChEBI'] + "\n"
if current_parameters.has_key("ChEMBL_Ref"): new_drugbox += "| ChEMBL_Ref = " + current_parameters['ChEMBL_Ref'] + "\n"
if current_parameters.has_key("ChEMBL"): new_drugbox += "| ChEMBL = " + current_parameters['ChEMBL'] + "\n"
 
new_drugbox += "\n<!--Chemical data-->\n"
if current_parameters.has_key("chemical_formula"): in current_parameters or "C" in current_parameters new_drugbox +=or "| chemical_formula = H" in current_parameters or "Ag" in current_parameters or "As" +in current_parameters['chemical_formula'] or + "\n"
"Au" in current_parameters or "B" in current_parameters or "Bi" in current_parameters or "Br" in current_parameters or "Cl" in current_parameters or "Co" in current_parameters or
if current_parameters.has_key("C"): new_drugbox += "| C=" + current_parameters['C'] + " "
"F" in current_parameters or "Fe" in current_parameters or "Gd" in current_parameters or "I" in current_parameters or "K" in current_parameters or "Mn" in current_parameters or
if current_parameters.has_key("H"): new_drugbox += "| H=" + current_parameters['H'] + " "
"N" in current_parameters or "Na" in current_parameters or "O" in current_parameters or "P" in current_parameters or "Pt" in current_parameters or "S" in current_parameters or
if current_parameters.has_key("Ag"): new_drugbox += "| Ag=" + current_parameters['Ag'] + " "
"Sb" in current_parameters or "Se" in current_parameters or "Sr" in current_parameters or "Tc" in current_parameters or "charge" in current_parameters):
if current_parameters.has_key("As"): new_drugbox += "| As=" + current_parameters['As'] + " "
if current_parameters.has_key("Auchemical_formula"): new_drugbox += "| Auchemical_formula = " + current_parameters['Auchemical_formula'] + " \n"
# new_drugbox += " "
if current_parameters.has_key("BC"): new_drugbox += "| BC=" + current_parameters['BC'] + " "
if current_parameters.has_key("BiH"): new_drugbox += "| BiH=" + current_parameters['BiH'] + " "
if current_parameters.has_key("BrAg"): new_drugbox += "| BrAg=" + current_parameters['BrAg'] + " "
if current_parameters.has_key("ClAs"): new_drugbox += "| ClAs=" + current_parameters['ClAs'] + " "
if current_parameters.has_key("CoAu"): new_drugbox += "| CoAu=" + current_parameters['CoAu'] + " "
if current_parameters.has_key("FB"): new_drugbox += "| FB=" + current_parameters['FB'] + " "
if current_parameters.has_key("FeBi"): new_drugbox += "| FeBi=" + current_parameters['FeBi'] + " "
if current_parameters.has_key("GdBr"): new_drugbox += "| GdBr=" + current_parameters['GdBr'] + " "
if current_parameters.has_key("ICl"): new_drugbox += "| ICl=" + current_parameters['ICl'] + " "
if current_parameters.has_key("KCo"): new_drugbox += "| KCo=" + current_parameters['KCo'] + " "
if current_parameters.has_key("MnF"): new_drugbox += "| MnF=" + current_parameters['MnF'] + " "
if current_parameters.has_key("NFe"): new_drugbox += "| NFe=" + current_parameters['NFe'] + " "
if current_parameters.has_key("NaGd"): new_drugbox += "| NaGd=" + current_parameters['NaGd'] + " "
if current_parameters.has_key("OI"): new_drugbox += "| OI=" + current_parameters['OI'] + " "
if current_parameters.has_key("PK"): new_drugbox += "| PK=" + current_parameters['PK'] + " "
if current_parameters.has_key("PtMn"): new_drugbox += "| PtMn=" + current_parameters['PtMn'] + " "
if current_parameters.has_key("SN"): new_drugbox += "| SN=" + current_parameters['SN'] + " "
if current_parameters.has_key("SbNa"): new_drugbox += "| CNa=" + current_parameters['SbNa'] + " "
if current_parameters.has_key("SeO"): new_drugbox += "| SeO=" + current_parameters['SeO'] + " "
if current_parameters.has_key("SrP"): new_drugbox += "| SrP=" + current_parameters['SrP'] + " "
if current_parameters.has_key("TcPt"): new_drugbox += "| TcPt=" + current_parameters['TcPt'] + " "
if current_parameters.has_key("chargeS"): new_drugbox += "| charge S= " + current_parameters['chargeS'] + " "
if current_parameters.has_key("Sb"): new_drugbox += "| C=" + current_parameters['Sb'] + " "
new_drugbox += "\n"
if current_parameters.has_key("Se"): new_drugbox += "| Se=" + current_parameters['Se'] + " "
if current_parameters.has_key("Sr"): new_drugbox += "| Sr=" + current_parameters['Sr'] + " "
if current_parameters.has_key("Tc"): new_drugbox += "| Tc=" + current_parameters['Tc'] + " "
if current_parameters.has_key("charge"): new_drugbox += "| charge = " + current_parameters['charge'] + " "
new_drugbox += "\n"
if current_parameters.has_key("molecular_weight"): new_drugbox += "| molecular_weight = " + current_parameters['molecular_weight'] + "\n"
if current_parameters.has_key("smiles"): new_drugbox += "| smiles = " + current_parameters['smiles'] + "\n"
if current_parameters.has_key("InChI_Ref"): new_drugbox += "| InChI_Ref = " + current_parameters['InChI_Ref'] + "\n"
if current_parameters.has_key("InChI"): new_drugbox += "| InChI = " + current_parameters['InChI'] + "\n"
if current_parameters.has_key("InChIKey"): new_drugbox += "| InChIKey = " + current_parameters['InChIKey'] + "\n"
if current_parameters.has_key("StdInChI_Ref"): new_drugbox += "| StdInChI_Ref = " + current_parameters['StdInChI_Ref'] + "\n"
if current_parameters.has_key("StdInChI"): new_drugbox += "| StdInChI = " + current_parameters['StdInChI'] + "\n"
Line 396 ⟶ 676:
if current_parameters.has_key("density"): new_drugbox += "| density = " + current_parameters['density'] + "\n"
if current_parameters.has_key("melting_point"): new_drugbox += "| melting_point = " + current_parameters['melting_point'] + "\n"
if current_parameters.has_key("melting_high"): new_drugbox += "| melting_high = " + current_parameters['melting_high'] + "\n"
if current_parameters.has_key("melting_notes"): new_drugbox += "| melting_notes = " + current_parameters['melting_notes'] + "\n"
if current_parameters.has_key("boiling_point"): new_drugbox += "| boiling_point = " + current_parameters['boiling_point'] + "\n"
if current_parameters.has_key("boiling_notes"): new_drugbox += "| boiling_notes = " + current_parameters['boiling_notes'] + "\n"
Line 402 ⟶ 684:
if current_parameters.has_key("sec_combustion"): new_drugbox += "| sec_combustion = " + current_parameters['sec_combustion'] + "\n"
 
new_drugbox += "}}\n"
# print new_drugbox
 
# print new_drugbox
# replace original drugbox with new drugbox
new_text = re.sub(regexp_drug_infobox, new_drugbox, text)
print new_text
 
return new_drugbox
page.put(new_text, comment='populated clinical fields in drugbox', watchArticle = None, minorEdit = True)
print ", page updated"
def merged_tradenames(merck_tradename, current_tradename):
    """Merge Merck Manual trade names with those already in the drugbox.

    merck_tradename   -- ";"-separated trade names (may be empty or None)
    current_tradename -- ", "-separated trade names currently in the
                         drugbox (may be empty or None)

    Each name is UTF-8 encoded, stripped of surrounding whitespace and
    capitalized.  Returns the de-duplicated names, sorted and joined with
    ", ", or "" when there are none.
    """
    name_lists = []

    if merck_tradename:
        # NOTE(review): the first entry of the sorted, de-duplicated Merck
        # list is discarded -- presumably an empty string produced by a
        # leading ";" in the data; confirm against the data file, because a
        # real name is dropped when no such empty entry exists.
        name_lists.append(sorted(set(merck_tradename.split(";")))[1:])

    if current_tradename:
        name_lists.append(sorted(set(current_tradename.split(", "))))

    cleaned = []
    for raw_names in name_lists:
        for raw_name in raw_names:
            cleaned.append(raw_name.encode("utf-8").strip().capitalize())

    if not cleaned:
        return ""
    return ", ".join(sorted(set(cleaned)))
# print "merck tradenames: ", merck_tradenames
# print "current tradenames: ", current_tradenames
 
 
def test_MedlinePlus(accession_number):
    """Check that a MedlinePlus drug page exists for an accession number.

    | MedlinePlus = <!-- MedlinePlus drug accession number, e.g., "a692051"
    that links to
    "http://www.nlm.nih.gov/medlineplus/druginfo/meds/a692051.html" -->

    accession_number -- MedlinePlus accession number (may be empty or None)

    Returns True when the page answers with HTTP status 200
    ('OK', 'Request fulfilled, document follows'), and False otherwise,
    including when no accession number is given.  Errors raised by
    urllib.urlopen propagate to the caller.
    """
    if not accession_number:
        return False

    link = "http://www.nlm.nih.gov/medlineplus/druginfo/meds/" + accession_number + ".html"

    # test link status to make sure it is good before the parameter is assigned
    response = urllib.urlopen(link)
    try:
        return response.getcode() == 200
    finally:
        response.close()
 
def test_Drugs_com(INN, tradename, drugbank_drugs_com):
    """Find a working Drugs.com page and return it as a {{drugs.com}} call.

    | Drugs.com = <!-- link to Drugs.com monograph, e.g., "lisinopril" that
    links to "http://www.drugs.com/monograph/lisinopril.html" -->

    INN                -- drug name used as the first candidate
    tradename          -- ", "-separated trade names ("" when there are none)
    drugbank_drugs_com -- Drugs.com link taken from DrugBank (see NOTE below)

    Candidate URLs are built from every root (the monograph, CDI, CONS, MTM,
    pro, international and "parent" sections of the drugs.com web site)
    combined with every stem derived from the INN and the trade names.

    Returns the template call for the first candidate that answers with
    HTTP 200, "" when no candidate does, and False when there is no
    candidate stem at all.
    """
    # drugs.com root links, tried in this order; the label becomes the first
    # argument of the {{drugs.com}} template call
    roots = [("monograph", "http://www.drugs.com/monograph/"),
             ("CDI", "http://www.drugs.com/cdi/"),
             ("CONS", "http://www.drugs.com/cons/"),
             ("MTM", "http://www.drugs.com/mtm/"),
             ("pro", "http://www.drugs.com/pro/"),
             ("international", "http://www.drugs.com/international/"),
             ("parent", "http://www.drugs.com/")]

    stems = []
    for drugname in [INN] + tradename.split(", "):
        # Blank names are skipped: an empty trade name used to yield the
        # bogus stems "-hydrochloride" and "-sulfate", which were then probed.
        drugname = drugname.strip().lower()
        if not drugname:
            continue
        if " " in drugname:
            stems.append(drugname.replace(" ", "_"))
            stems.append(drugname.replace(" ", "-"))
        else:
            stems.append(drugname)
            # also try common salts
            stems.append(drugname + "-hydrochloride")
            stems.append(drugname + "-sulfate")

    # NOTE(review): drugbank_drugs_com is accepted but not probed.  The
    # original code stripped "http://www.drugs.com/" and ".html" from it and
    # appended the remainder to the list of drug names *after* the stems had
    # been built, so it never influenced the result.  Decide whether it
    # should become a stem before enabling it.

    if not stems:
        return False
    return _first_drugs_com_link(roots, stems)


def _first_drugs_com_link(roots, stems):
    """Return a {{drugs.com}} call for the first root/stem URL answering 200, else ""."""
    for root_name, root_url in roots:
        for stem in stems:
            url = iriToUri(root_url + stem + ".html")
            # test link status to make sure it is good before using it
            if urllib.urlopen(url).getcode() != 200:
                continue
            if root_name != "monograph":
                # A non-monograph page was found: search its text with
                # regexp_monograph_url (defined elsewhere in this file) and,
                # on a match, link to the monograph section instead.
                page_text = urllib.FancyURLopener({}).open(url).read()
                match = regexp_monograph_url.search(page_text)
                if match:
                    return "{{drugs.com|" + roots[0][0] + "|" + match.group('STEM') + "}}"
            return "{{drugs.com|" + root_name + "|" + stem + "}}"
    return ""
 
def unbalanced(text):
    """Test text for unmatched square or curly brackets.

    Counts the matches of each module-level opening-bracket pattern
    (single and double, square and curly) and of its closing counterpart.
    Returns True as soon as one pair of counts differs, False when all
    four pairs agree.
    """
    bracket_patterns = (
        (regexp_open_square_bracket, regexp_close_square_bracket),
        (regexp_open_curly_bracket, regexp_close_curly_bracket),
        (regexp_double_open_square_bracket, regexp_double_close_square_bracket),
        (regexp_double_open_curly_bracket, regexp_double_close_curly_bracket),
    )
    for opening, closing in bracket_patterns:
        if len(opening.findall(text)) != len(closing.findall(text)):
            return True
    return False
 
def savepage(page, text, summary='', minor=False, log_string=""):
    """Save text to a page and log exceptions.

    page       -- wikipedia.Page to write to
    text       -- new wiki text of the page
    summary    -- edit summary; when not '' it is set via wikipedia.setAction
    minor      -- passed to page.put as minorEdit
    log_string -- prefix for the message written with wikipedia.output

    Returns '' on success, otherwise a one-line wiki-list entry
    ('# <page link>: <reason>\\n') describing why the save failed.
    """
    if summary != '':
        wikipedia.setAction(summary)
    try:
        page.put(text, minorEdit=minor)
        wikipedia.output('%s \03{green}saving %s' % (log_string, page.title()))
        return ''
    except wikipedia.LockedPage:
        wikipedia.output('%s \03{red}cannot save %s because it is locked\03{default}' % (log_string, page.title()))
        return '# %s: page was locked\n' % page.aslink()
    except wikipedia.EditConflict:
        wikipedia.output('%s \03{red}cannot save %s because of edit conflict\03{default}' % (log_string, page.title()))
        return '# %s: edit conflict occurred\n' % page.aslink()
    except wikipedia.SpamfilterError as error:
        wikipedia.output('%s \03{red}cannot save %s because of spam blacklist entry %s\03{default}' % (log_string, page.title(), error.url))
        return '# %s: spam blacklist entry\n' % page.aslink()
    except Exception:
        # Exception rather than a bare except, so that KeyboardInterrupt and
        # SystemExit still stop the bot instead of being logged as a failed save.
        wikipedia.output('%s \03{red}unknown error on saving %s\03{default}' % (log_string, page.title()))
        return '# %s: unknown error occurred\n' % page.aslink()
 
def run():
    """Update the {{drugbox}} of every article listed in drugbox_titles.txt.

    For each title the page text is fetched, the drugbox is located and
    parsed into current_parameters, missing fields are filled from the
    DrugBank data returned by drugbank() (DrugBank, KEGG, ChemSpiderID,
    PubChem, tradename, MedlinePlus, Drugs.com), the drugbox is rebuilt with
    build_new_drugbox() and the page is saved with savepage().

    Pages that opt out of bots, have no drugbox, or have unmatched brackets
    inside the drugbox are skipped.  wikipedia.stopme() is always called
    when the function ends, including after an error.
    """
    try:
        drugbank_data = drugbank()

        # reverse lookup: value stored in column 8 of each DrugBank row
        # (used below as the DrugBank parameter) -> INN
        DrugBank_ID_INN = {}
        for inn, row in drugbank_data.iteritems():
            DrugBank_ID_INN[row[8]] = inn

        # list of articles to work on is generated by:
        # "python pagegenerators.py -namespace:0 -transcludes:Drugbox > drugbox_titles.txt"
        titles_path = '/Users/BogBot/progs/pywikipedia/drugbox_titles.txt'
        with codecs.open(titles_path, mode='r', encoding='utf-8') as title_file:
            articles = [title.rstrip('\n') for title in title_file]

        site = wikipedia.getSite()

        for article in articles:

            if not article:
                # blank line in the title file
                continue

            if isinstance(article, basestring) and not isinstance(article, unicode):
                article = unicode(article, 'utf-8')

            log_string = "* [[" + article + "]], "

            page = wikipedia.Page(site, article)
            text = page.get(get_redirect=True)

            if not Allowbots(text):
                continue

            begin, end, begin2, end2 = find_drugbox_from_text(text)
            if not end:
                print(log_string + "drugbox not found!")
                continue
            parameters = text[begin:end]

            # make sure that there are no unmatched square or curly brackets;
            # if found, abort, since these may indicate an error in the wiki
            # markup and may trigger an infinite loop elsewhere in this script
            if unbalanced(parameters):
                print(log_string + "unmatched brackets found, article skipped!")
                continue

            current_parameters = {}
            # first extract and assign nested templates commonly used in drugbox templates
            parameters = assign_nested_templates(parameters, current_parameters)
            # remove any embedded carriage returns from remaining templates
            parameters = remove_embedded_carriage_returns(parameters)
            # next, parse each line for parameters
            for line in parameters.splitlines():
                parse_line(line, current_parameters)

            INN = article

            # True when the drugbox already carries a DrugBank ID that is
            # present in the DrugBank data
            drugbank_id_known = ("DrugBank" in current_parameters and
                                 current_parameters['DrugBank'] in DrugBank_ID_INN)

            if INN in drugbank_data:
                db_data = drugbank_data[INN]
            elif drugbank_id_known:
                # article title is not a DrugBank INN: fall back on the INN
                # that belongs to the DrugBank ID found in the drugbox
                log_string = log_string + "INN reset from " + INN
                INN = DrugBank_ID_INN[current_parameters['DrugBank']]
                log_string = log_string + " to " + INN + ", "
                db_data = drugbank_data[INN]
                if "drug_name" not in current_parameters:
                    current_parameters['drug_name'] = INN
            else:
                db_data = []

            if drugbank_id_known:
                if DrugBank_ID_INN[current_parameters['DrugBank']] == INN:
                    log_string = log_string + "DrugBankID/INN OK!, "
                else:
                    log_string = log_string + "DrugBankID/INN NOT OK!, "
            elif db_data and db_data[8] and "DrugBank" not in current_parameters:
                current_parameters['DrugBank'] = db_data[8]

            # while we are at it, populate KEGG_Drug_ID, ChemSpider_ID, and
            # PubChem_Compound_ID fields if missing
            if db_data:
                for column, field in ((4, 'KEGG'), (6, 'ChemSpiderID'), (7, 'PubChem')):
                    if db_data[column] and field not in current_parameters:
                        current_parameters[field] = db_data[column]

            # augment current tradename list with the ones supplied by the Merck Manual
            merck_tradename = db_data[0] if db_data and db_data[0] else ""
            current_tradename = current_parameters.get('tradename', "")
            new_tradename = merged_tradenames(merck_tradename, current_tradename)
            if new_tradename:
                current_parameters['tradename'] = new_tradename

            # add MedlinePlus parameter (checked once; the original repeated
            # this network check a second time after the Drugs.com lookup)
            if db_data and db_data[2] and test_MedlinePlus(db_data[2]):
                current_parameters['MedlinePlus'] = db_data[2]

            # add Drugs.com link
            tradename = current_parameters.get('tradename', "")
            drugbank_drugs_com = db_data[3] if db_data and db_data[3] else ""
            result = test_Drugs_com(INN, tradename, drugbank_drugs_com)
            if result:
                current_parameters['Drugs.com'] = result

            # still no MedlinePlus accession number: search MedlinePlus for
            # the article title and take the match of regexp_medlineplus_url
            if 'MedlinePlus' not in current_parameters:
                opener = urllib.FancyURLopener({})
                stem = article.replace(" ", "+")
                link = "http://vsearch.nlm.nih.gov/vivisimo/cgi-bin/query-meta?&v:project=medlineplus&query=" + stem
                text2 = opener.open(link).read()
                result = regexp_medlineplus_url.search(text2)
                if result:
                    current_parameters['MedlinePlus'] = result.group('ACNO')

            new_text = text[:begin2] + build_new_drugbox(current_parameters) + text[end2:]

            if current_parameters:
                comment = 'populated new fields in {{drugbox}} and reordered per [[Wikipedia:Bots/Requests_for_approval/BogBot_2|bot approval]]. Report errors and suggestions to [[User_talk:BogBot]]'
                savepage(page, new_text, comment, False, log_string)
            else:
                print(", page not updated")
    finally:
        wikipedia.stopme()
run()
 
</syntaxhighlight>