User:BogBot/Source code/Task 03: Difference between revisions

Content deleted Content added
updated script to fix bugs for special cases
substantial rewrite using function calls
Line 137:
import re
import urllib
import wikipedia

# bot account name, used in the bot-exclusion pattern below
user = "BogBot"

# matches {{nobots}} / {{bots|...}} templates that forbid edits by this bot
# (see http://en.wikipedia.org/wiki/Template:Bots)
# BUG FIX: the original compiled this exact pattern twice and contained a
# diff-garbled line fusing two regex definitions; both removed
regexp_ab = re.compile(r'\{\{(nobots|bots\|(allow=none|deny=.*?' + user + r'.*?|optout=all|deny=all))\}\}')

# adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/
# Build a regex to locate the drugbox
exp = r'\{\{'               # the opening brackets for the infobox
exp = exp + r'\s*'          # any amount of whitespace
exp = exp + r'[Dd]rugbox'   # the word "drugbox", capitalized or not
exp = exp + r'.*\}\}'       # any amount of anything, followed by the end of the infobox

regexp_drug_infobox = re.compile(exp, re.DOTALL)

# one "| name = value" parameter line inside the drugbox
regexp_param = re.compile(r"^\s*?\|\s*?(?P<PARAM>\S+)\s*?=\s*?(?P<VALUE>.+)\s*?($|\|)")
# a {{...}} template nested inside a parameter value
regexp_nested_template = re.compile(r"\{\{(?P<PARAMS>.+)\}\}")


# ATC_supplemental = {{ATC|B01|AC06}}, {{ATC|N02|BA01}}

# UNII_Ref = {{fdacite|changed|FDA}}
# BUG FIX: the original pattern had a literal "s*" after the opening braces
# where whitespace matching (\s*) was clearly intended
regexp_UNII_Ref = re.compile(r"\|\s??UNII_Ref\s?=\s?(?P<TEMPLATE>\{\{\s*(Fdacite|fdacite).+?\}\})")
 
 
# Included for bot exclusion compliance (see http://en.wikipedia.org/wiki/Template:Bots)

def Allowbots(text):
# NOTE(review): the wiki diff elided this function's body (the "Line 175 ⟶ 178:"
# marker below stands in for the omitted lines). Presumably the omitted code
# searches `text` with regexp_ab and returns False when the page opts out of
# edits by this bot — confirm against the full revision; as shown here the
# function is incomplete and always permissive.
Line 175 ⟶ 178:
# fall-through: the page does not exclude this bot
return True
 
def find_drugbox_from_text(article_text):
    """Locate the {{Drugbox ...}} template in article wikitext.

    Returns a (begin, end) tuple of offsets into article_text delimiting the
    drugbox parameter text (from the first '|' of the template up to just
    before its closing braces), or None when no drugbox is found.
    """
    # adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/
    search_result = regexp_drug_infobox.search(article_text)
    if not search_result:
        return None
    result_text = search_result.group(0)  # the entire matching sequence
    begin, end = search_result.span()
    # the regex is greedy and not brace-aware, so scan for the closing
    # brackets that balance the template's opening '{{'
    count = 0
    last_ind = None
    for ind, c in enumerate(result_text):
        if c == '}':
            count = count - 1
        elif c == '{':
            count = count + 1
        if count == 0 and not ind == 0:
            last_ind = ind
            break
    # BUG FIX: the original fell through with last_ind = None on unbalanced
    # braces and then crashed on None arithmetic below
    if last_ind is None:
        return None
    offset = result_text.find('|')
    # BUG FIX: a drugbox with no '|' made the original return a span starting
    # at begin-1; report "nothing to parse" instead
    if offset == -1:
        return None
    ___location = (begin + offset, begin + last_ind - 1)
    return ___location
 
def drugbank(path='/Users/BogHog/progs/pywikipedia/drugbox/drug_links_agumented.csv'):
    """Load the DrugBank-derived CSV into a dict keyed by drug name.

    path -- CSV file ___location; defaults to the original hard-coded file so
            existing callers (run()) keep working.

    Returns {Name: [Trade_Names, Drug_Type, MedlinePlus, Drugs.com_link,
    KEGG_Drug_ID, KEGG_Compound_ID, ChemSpider_ID, PubChem_Compound_ID]}.
    Column layout per the original header comment:
    #        0           1         2           3              4            5                6             7
    # Name   Trade_Names Drug_Type MedlinePlus Drugs.com_link KEGG_Drug_ID KEGG_Compound_ID ChemSpider_ID PubChem_Compound_ID

    drugbank data obtained from
    http://www.drugbank.ca/system/downloads/current/drugbank.txt.zip
    """
    # BUG FIX: the original initialised the dict twice and never closed the
    # file handle; 'rU' mode is also gone from modern Python ('r' is
    # universal-newline by default on Python 3)
    drugbank_data = {}
    f = open(path, 'r')
    try:
        for row in csv.reader(f, dialect='excel'):
            drugbank_data[row[0]] = row[1:]
    finally:
        f.close()
    return drugbank_data
 
def assign_nested_templates(parameters, current_parameters):
    """Extract nested templates commonly used in drugbox templates.

    parameters -- the raw drugbox parameter wikitext
    current_parameters -- dict updated in place with the extracted templates

    Returns the parameter text with the nested templates stripped out, so
    that the subsequent line-by-line parameter parsing is not confused by
    the extra '|' and '=' characters inside them.

    NOTE(review): the wiki diff garbled this function's body (it contained
    the previous revision's main loop and never used its parameters). This
    body is reconstructed from the per-template search/assign/sub logic
    visible in the diff and from the call site in run(); confirm against the
    full revision.
    """
    # (drugbox field name, compiled pattern, replacement text); the original
    # replaced the ATC_supplemental match with "|" rather than "" —
    # presumably to preserve a field separator; kept as-is
    nested_templates = [
        ('ATC_supplemental',  regexp_ATC_supplemental,  "|"),
        ('ChEMBL_Ref',        regexp_ChEMBL_Ref,        ""),
        ('ChemSpiderID_Ref',  regexp_ChemSpiderID_Ref,  ""),
        ('Drugs.com',         regexp_Drugs_com,         ""),
        ('KEGG_Ref',          regexp_KEGG_Ref,          ""),
        ('StdInChI_Ref',      regexp_StdInChI_Ref,      ""),
        ('StdInChIKey_Ref',   regexp_StdInChIKey_Ref,   ""),
        ('UNII_Ref',          regexp_UNII_Ref,          ""),
    ]
    for field, regexp, replacement in nested_templates:
        result = regexp.search(parameters)
        if result:
            # stash the whole nested template so it can be re-emitted verbatim
            current_parameters[field] = result.group('TEMPLATE')
            # strip it from the parameter text before line parsing
            parameters = re.sub(regexp, replacement, parameters)
    return parameters
def parse_line(line, current_parameters):
    """Parse one drugbox line into current_parameters (updated in place).

    A line holding several fields ("| a = 1 | b = 2") is split on '|', with
    pipes inside [[wiki links]] and {{nested templates}} re-joined first; a
    simple single-field line is handled by regexp_param.

    NOTE(review): the diff garbled this body (it fused two lines into
    "sub_stringswhile = new_listforever:" and interleaved the previous
    revision's splitlines loop); reconstructed from the visible logic and
    the call site in run() — confirm against the full revision.
    """
    if line.count('|') > 1:
        sub_strings = line.split("|")
        # make sure we split only on the pipes that separate infobox entries,
        # not the pipes used inside [[target|label]] links
        # adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/
        new_list = [sub_strings[0]]
        for sub_string in sub_strings[1:]:
            if (']]' in sub_string) and ((not '[[' in sub_string) or sub_string.find(']]') < sub_string.find('[[')):
                new_list[-1] = new_list[-1] + '|' + sub_string
            else:
                new_list.append(sub_string)
        sub_strings = new_list
        # do the same thing for {{nested|templates}}, repeating until no more
        # fragments get re-joined (templates can nest pipes several deep)
        forever = True
        while forever:
            forever = False
            new_list = [sub_strings[0]]
            for sub_string in sub_strings[1:]:
                if ('}}' in sub_string) and ((not '{{' in sub_string) or sub_string.find('}}') < sub_string.find('{{')):
                    new_list[-1] = new_list[-1] + '|' + sub_string
                    forever = True
                else:
                    new_list.append(sub_string)
            sub_strings = new_list
        # now assign the parameters
        for sub_string in sub_strings:
            if sub_string.count("=") > 0:
                parts = sub_string.split("=")
                # note: as in the original, anything after a second '=' in
                # the value is discarded
                parameter = parts[0].strip()
                value = parts[1].strip()
                current_parameters[parameter] = value
    else:
        # simple "| name = value" line
        result_drug_param = regexp_param.search(line)
        if result_drug_param:
            parameter = result_drug_param.group('PARAM').strip()
            value = result_drug_param.group('VALUE').strip()
            current_parameters[parameter] = value
    return
 
# print current_parameters
def build_new_drugbox(current_parameters):
# NOTE(review): this function's body is garbled by the wiki diff — it still
# contains lines from the previous revision's main loop, reads the globals
# INN, drugbank_data and roots (none of which are parameters), and its final
# line fuses two diff lines. Confirm every line against the full revision.
# print "INN: ", INN
if INN in drugbank_data:
data = drugbank_data[INN]
else:
data = []
# merge tradenames
new_tradenames = []
if data:
# data[0] holds the semicolon-separated Merck tradename list; [1:] drops
# the first sorted element — presumably the drug's own name, TODO confirm
merck_tradenames = sorted(set(data[0].split(";")))[1:]
for index, object in enumerate(merck_tradenames):
merck_tradenames[index] = string.capitalize(string.strip(object.encode()))
else:
merck_tradenames = []
if current_parameters.has_key("tradename"):
current_tradenames = sorted(set(current_parameters['tradename'].split(", ")))
else:
current_tradenames = []
for index, object in enumerate(current_tradenames):
current_tradenames[index] = string.capitalize(string.strip(object.encode()))
merged_tradenames = []
if merck_tradenames: merged_tradenames = merck_tradenames
# NOTE(review): append() adds the whole list as a single element here,
# which would make sorted(set(...)) below raise on the unhashable list —
# the standalone merged_tradenames() function iterates instead
if current_tradenames: merged_tradenames.append(current_tradenames)
if merged_tradenames:
new_tradenames = sorted(set(merged_tradenames))
if new_tradenames: current_parameters['tradename'] = ", ".join(new_tradenames)
# print "merck tradenames: ", merck_tradenames
# print "current tradenames: ", current_tradenames
# print "new tradenames: ", current_parameters['tradename']
# test web page, returns "200" if OK:
# if urllib.urlopen(link).getcode() == 200:
# 200: ('OK', 'Request fulfilled, document follows')
# 404: (page not found)

# add MedlinePlus parameter
# | MedlinePlus = <!-- MedlinePlus drug accession number, e.g., "a692051" that links to "http://www.nlm.nih.gov/medlineplus/druginfo/meds/a692051.html" -->

if data:
link = "http://www.nlm.nih.gov/medlineplus/druginfo/meds/" + data[2] + ".html"
if urllib.urlopen(link).getcode() == 200: # test link status to make sure it is good before assigning parameter
current_parameters['MedlinePlus'] = data[2]
# print "MedlinePlus: ", current_parameters['MedlinePlus']
else:
link = ""
# add Drugs.com parameter
# | Drugs.com = <!-- link to Drugs.com monograph, e.g., "lisinopril" that links to "http://www.drugs.com/monograph/lisinopril.html" -->


# create alternate candidate drugs.com links
# alternative roots include the monograph, CDI, CONS, MTM, and "parent" sections of the drugs.com web site
# alternative stems include the INN, trade names, and the name extracted from the drugbank link

stems = []
drugnames = []
drugnames.append(INN)
if new_tradenames:
for tradename in new_tradenames:
drugnames.append(tradename)
for drugname in drugnames:
drugname = string.lower(drugname)
if (string.find(drugname, " ") > -1):
stems.append(string.replace(drugname, " ", "_"))
stems.append(string.replace(drugname, " ", "-"))
else:
stems.append(drugname)
if data:
if data[3]:
if (string.find(data[3], "http://www.drugs.com/") > -1):
temp = string.replace(data[3], "http://www.drugs.com/", "")
temp = string.replace(temp, ".html", "")
# NOTE(review): appended after stems were already built, so this
# candidate name is never actually tried
drugnames.append(temp)
# print "drugnames: ", drugnames

try:
# NOTE(review): `roots` is not defined in this function — in the diff it
# appears inside the garbled assign_nested_templates block above
for root in roots:
for stem in stems:
link = root[1] + stem + ".html"
# print "attempted Drugs.com link: ", link
if urllib.urlopen(link).getcode() == 200: # test link status to make sure it is good before assigning parameter
current_parameters['Drugs.com'] = "{{drugs.com|" + root[0] + "|" + stem + "}}"
raise StopIteration()
# print "Drugs.com: ", current_parameters['Drugs.com']
except StopIteration:
pass


# for parameter, value in current_parameters.iteritems():
# print parameter, ":", value

# while we are at it, populate KEGG_Drug_ID, ChemSpider_ID, and PubChem_Compound_ID fields if missing

if data:

if data[4] and not current_parameters.has_key("KEGG"):
current_parameters['KEGG'] = data[4]

if data[6] and not current_parameters.has_key("ChemSpiderID"):
current_parameters['ChemSpiderID'] = data[6]

if data[7] and not current_parameters.has_key("PubChem"):
current_parameters['PubChem'] = data[7]

# build new drugbox template

new_drugbox = "{{Drugbox\n"
if current_parameters.has_key("Watchedfields"): new_drugbox += "| Watchedfields = " + current_parameters['Watchedfields'] + "\n"
if current_parameters.has_key("Verifiedfields"): new_drugbox += "| Verifiedfields = " + current_parameters['Verifiedfields'] + "\n"
# NOTE(review): the diff marker below stands in for several hundred elided
# lines, one `if has_key(...)` emitter per drugbox field, in template order
Line 580 ⟶ 475:
if current_parameters.has_key("sec_combustion"): new_drugbox += "| sec_combustion = " + current_parameters['sec_combustion'] + "\n"

# NOTE(review): two diff lines fused together — presumably
# 'new_drugbox += "}}\n"' followed by 'return new_drugbox'
return new_drugbox += "}}\n"
# print new_drugbox
def merged_tradenames(merck_tradename, current_tradename):
    """Merge the Merck Manual and current drugbox tradename lists.

    merck_tradename -- semicolon-separated names from the DrugBank CSV, "" if none
    current_tradename -- comma-space-separated names from the drugbox, "" if none

    Returns the merged, de-duplicated, capitalized, sorted names joined with
    ", ", or "" when there is nothing to merge.

    NOTE(review): the diff interleaved this function with the previous
    revision's page-update code; reconstructed from the clean half of the
    interleave. The py2-only string.* module calls were replaced with the
    equivalent str methods.
    """
    if merck_tradename:
        # [1:] drops the first sorted element, as in the original —
        # presumably the drug's own name; TODO confirm
        merck_list = sorted(set(merck_tradename.split(";")))[1:]
        merck_list = [name.strip().capitalize() for name in merck_list]
    else:
        merck_list = []

    if current_tradename:
        current_list = sorted(set(current_tradename.split(", ")))
        current_list = [name.strip().capitalize() for name in current_list]
    else:
        current_list = []

    merged = []
    if merck_list:
        merged = merck_list
    for name in current_list:
        merged.append(name)

    if merged:
        return ", ".join(sorted(set(merged)))
    return ""
 
 
def test_MedlinePlus(accession_number):
    """Return True when the MedlinePlus page for accession_number exists.

    An accession number such as "a692051" maps to
    http://www.nlm.nih.gov/medlineplus/druginfo/meds/a692051.html.
    The candidate link is fetched (Python 2 urllib) and must answer
    HTTP 200 ('OK', request fulfilled); anything else — e.g. 404, page
    not found — is a miss. An empty accession number is a miss too.
    """
    if accession_number:
        candidate = "http://www.nlm.nih.gov/medlineplus/druginfo/meds/" + accession_number + ".html"
        # probe the page before the caller assigns the parameter
        if urllib.urlopen(candidate).getcode() == 200:
            return True
    return False
 
def test_Drugs_com(INN, tradename, drugbank_drugs_com):
    """Find a working drugs.com link for the drug.

    INN -- the article title / International Nonproprietary Name
    tradename -- comma-space-separated tradenames ("A, B"), may be ""
    drugbank_drugs_com -- drugs.com URL from the DrugBank data, may be ""

    Candidate links are built from every (root section) x (name stem) pair;
    the first one answering HTTP 200 is returned as a "{{drugs.com|...}}"
    template string. Returns False when no candidate link works.
    """
    # drugs.com root sections, in order of preference
    roots = [("monograph", "http://www.drugs.com/monograph/"), ("CDI", "http://www.drugs.com/cdi/"), ("CONS", "http://www.drugs.com/cons/"), ("MTM", "http://www.drugs.com/mtm/"), ("parent", "http://www.drugs.com/")]

    # candidate drug names: the INN, the tradenames, and the name embedded in
    # the DrugBank-supplied drugs.com link
    # BUG FIX: the original called wikipedia.stopme() here, shutting down the
    # bot framework in the middle of a run; it also appended empty tradename
    # fragments, producing junk URLs like ".../monograph/.html"
    drugnames = [INN]
    for name in tradename.split(", "):
        if name:
            drugnames.append(name)
    # BUG FIX: the original appended this name only after the stems were
    # already built, so it was never actually tried — contradicting the
    # stated intent of using the name extracted from the drugbank link
    if drugbank_drugs_com and drugbank_drugs_com.find("http://www.drugs.com/") > -1:
        temp = drugbank_drugs_com.replace("http://www.drugs.com/", "")
        temp = temp.replace(".html", "")
        drugnames.append(temp)

    # lower-case stems; multi-word names appear on drugs.com with either
    # underscore or hyphen separators, so try both
    stems = []
    for drugname in drugnames:
        drugname = drugname.lower()
        if drugname.find(" ") > -1:
            stems.append(drugname.replace(" ", "_"))
            stems.append(drugname.replace(" ", "-"))
        else:
            stems.append(drugname)

    link = False
    try:
        for root in roots:
            for stem in stems:
                candidate = root[1] + stem + ".html"
                # test link status to make sure it is good before assigning
                if urllib.urlopen(candidate).getcode() == 200:
                    link = "{{drugs.com|" + root[0] + "|" + stem + "}}"
                    raise StopIteration()
    except StopIteration:
        pass

    return link
 
def run():
    """Main driver: for each article, locate its drugbox, parse the
    parameters, augment them with DrugBank / Merck / MedlinePlus / Drugs.com
    data, and emit the rebuilt page text."""
    drugbank_data = drugbank()

    # list of articles to work on is generated by:
    # "python pagegenerators.py -namespace:0 -transcludes:Drugbox > drugbox_titles.txt"
    articles = ["Template:Drugbox/Lisinopril"]

    for article in articles:
        log_string = "* [[" + article + "]]"
        print(log_string)

        site = wikipedia.getSite()
        page = wikipedia.Page(site, article)
        text = page.get(get_redirect = True)

        # honor {{bots}}/{{nobots}} exclusion templates
        if not Allowbots(text):
            break

        # BUG FIX: the original unpacked the return value unconditionally
        # (TypeError on the None "not found" return), tested `if begin:`
        # (mis-treating a drugbox at offset 0 as missing), and `break`-ed out
        # of the whole article loop instead of skipping the one bad article
        ___location = find_drugbox_from_text(text)
        if ___location is None:
            print(", article: " + article + " drugbox not found!")
            continue
        begin, end = ___location
        print(", article: " + article)

        current_parameters = {}
        # first extract and assign nested templates commonly used in drugbox templates
        parameters = assign_nested_templates(text[begin:end], current_parameters)

        # next, parse each line for parameters
        for line in parameters.splitlines():
            parse_line(line, current_parameters)

        # the article title doubles as the International Nonproprietary Name
        INN = article
        if INN in drugbank_data:
            db_data = drugbank_data[INN]
        else:
            db_data = []

        # BUG FIX: everything that indexes db_data is now guarded — the
        # original reached db_data[3] even when no DrugBank row existed
        if db_data:
            # while we are at it, populate KEGG_Drug_ID, ChemSpider_ID, and
            # PubChem_Compound_ID fields if missing
            if db_data[4] and not current_parameters.has_key("KEGG"):
                current_parameters['KEGG'] = db_data[4]
            if db_data[6] and not current_parameters.has_key("ChemSpiderID"):
                current_parameters['ChemSpiderID'] = db_data[6]
            if db_data[7] and not current_parameters.has_key("PubChem"):
                current_parameters['PubChem'] = db_data[7]

            # augment the current tradename list with the Merck Manual ones
            merck_tradename = db_data[0] or ""
            if current_parameters.has_key('tradename'):
                current_tradename = current_parameters['tradename']
            else:
                current_tradename = ""
            new_tradename = merged_tradenames(merck_tradename, current_tradename)
            if new_tradename:
                current_parameters['tradename'] = new_tradename

            # add MedlinePlus parameter when the page exists
            if db_data[2] and test_MedlinePlus(db_data[2]):
                current_parameters['MedlinePlus'] = db_data[2]

            # add Drugs.com link
            # BUG FIX: the original read current_parameters['tradename']
            # unconditionally — a KeyError for drugs with no tradenames
            if current_parameters.has_key('tradename'):
                tradename = current_parameters['tradename']
            else:
                tradename = ""
            result = test_Drugs_com(INN, tradename, db_data[3])
            if result:
                current_parameters['Drugs.com'] = result

        # splice the rebuilt drugbox into the page text
        # NOTE(review): the begin-1/end offsets mirror the original code;
        # combined with build_new_drugbox's "{{Drugbox\n"/"}}\n" framing they
        # look like they would duplicate the braces — verify on a live run
        new_text = text[:begin-1] + build_new_drugbox(current_parameters) + text[end:]
        print(new_text)
        if current_parameters:
            # page.put(new_text, comment='populated clinical fields in drugbox per [[Wikipedia:Bots/Requests_for_approval/BogBot_2|bot approval]]', watchArticle = None, minorEdit = True)
            print(", page updated")
        else:
            print(", page not updated")

    # shut down the pywikipedia framework when all articles are processed
    # NOTE(review): in the diff this call appears at module level just before
    # run(); calling it after the loop matches pywikipedia convention — confirm
    wikipedia.stopme()

run()
</source>