Content deleted Content added
updated to include code to generate the {{drugs.com}} template |
updated script to fix bugs for special cases |
||
Line 7:
# | Drugs.com = <!-- link to Drugs.com monograph, e.g., "lisinopril" that links to "http://www.drugs.com/monograph/lisinopril.html" -->
# | MedlinePlus = <!-- MedlinePlus drug accession number, e.g., "a692051" that links to "http://www.nlm.nih.gov/medlineplus/druginfo/meds/a692051.html" -->
# In addition, populate KEGG_Drug_ID, ChemSpider_ID, and PubChem_Compound_ID fields if missing with data from http://www.drugbank.ca/system/downloads/current/drugbank.txt.zip
# The trade names were obtained from http://www.merckmanuals.com/home/drugnames-index/trade/a.html
# Finally the script sorts the fields in the order they are currently rendered by the drugbox template
# (in the order of clinical, pharmacokinetic, identifiers, and chemical data)
"""{{Drugbox
| Watchedfields
| verifiedrevid = 408577806
| IUPAC_name =
| OtherNames =
| image =
| width =
Line 124 ⟶ 130:
}}"""
from collections import defaultdict
import
import csv
import re
import string
import urllib
import wikipedia
user = "BogBot"
Line 137 ⟶ 143:
# Matches the bot-exclusion templates that apply to this bot: {{nobots}}, or
# {{bots|...}} with allow=none, deny=all, optout=all, or a deny list naming `user`.
regexp_ab = re.compile(r'\{\{(nobots|bots\|(allow=none|deny=.*?' + user + r'.*?|optout=all|deny=all))\}\}')
# Matches a {{Drugbox ...}} / {{drugbox ...}} template and captures its body as PARAMS.
# NOTE(review): this line is cut off in this view (no closing quote/parenthesis, and any
# flags argument is not visible) -- confirm against the full file.
regexp_drug_infobox = re.compile(r"\{\{\s*(Drugbox|drugbox)\s*(?P<PARAMS>.+)\s*\}\}
# Matches a single "| name = value" infobox entry, capturing PARAM and VALUE.
regexp_param = re.compile(r"^\s*?\|\s*?(?P<PARAM>\S+)\s*?=\s*?(?P<VALUE>.+)\s*?($|\|)")
# Matches a {{...}} template and captures everything between the braces as PARAMS.
regexp_nested_template = re.compile(r"\{\{(?P<PARAMS>.+)\}\}")
# Patterns for infobox fields whose value is itself a template.  Each one
# captures the field's value in the named group TEMPLATE.

# ATC_supplemental = {{ATC|B01|AC06}}, {{ATC|N02|BA01}}
# The value may hold several templates, so a complete {{...}} is consumed as
# one unit; otherwise the pipes inside the template would be mistaken for the
# pipe that starts the next field and the capture would stop at "{{ATC".
# The pattern still consumes the pipe (or end of string) that follows the value.
regexp_ATC_supplemental = re.compile(r"\|\s*?ATC_supplemental\s*?=\s*?(?P<TEMPLATE>(?:\{\{.*?\}\}|[^|\n])*?)\s*?($|\|)")
# In the patterns below, "\s*" after the opening braces allows optional
# whitespace before the template name (it used to read "s*", which matched
# literal "s" characters instead).
# CASNo_Ref = {{cascite|correct|CAS}}
regexp_CASNo_Ref = re.compile(r"\|\s??CASNo_Ref\s?=\s?(?P<TEMPLATE>\{\{\s*(Cascite|cascite).+?\}\})")
# ChEMBL_Ref = {{ebicite|correct|EBI}}
regexp_ChEMBL_Ref = re.compile(r"\|\s??ChEMBL_Ref\s?=\s?(?P<TEMPLATE>\{\{\s*(Ebicite|ebicite).+?\}\})")
# ChemSpiderID_Ref = {{chemspidercite|correct|chemspider}}
regexp_ChemSpiderID_Ref = re.compile(r"\|\s??ChemSpiderID_Ref\s?=\s?(?P<TEMPLATE>\{\{\s*(Chemspidercite|chemspidercite).+?\}\})")
# Drugs.com = {{drugs.com|monograph|lisinopril}}
regexp_Drugs_com = re.compile(r"\|\s??Drugs\.com\s?=\s?(?P<TEMPLATE>\{\{\s*(Drugs\.com|drugs\.com).+?\}\})")
# KEGG_Ref = {{keggcite|correct|kegg}}
regexp_KEGG_Ref = re.compile(r"\|\s??KEGG_Ref\s?=\s?(?P<TEMPLATE>\{\{\s*(Keggcite|keggcite).+?\}\})")
# StdInChI_Ref = {{stdinchicite|correct|chemspider}}
regexp_StdInChI_Ref = re.compile(r"\|\s??StdInChI_Ref\s?=\s?(?P<TEMPLATE>\{\{\s*(Stdinchicite|stdinchicite).+?\}\})")
# StdInChIKey_Ref = {{stdinchicite|correct|chemspider}}
regexp_StdInChIKey_Ref = re.compile(r"\|\s??StdInChIKey_Ref\s?=\s?(?P<TEMPLATE>\{\{\s*(Stdinchicite|stdinchicite).+?\}\})")
# UNII_Ref = {{fdacite|changed|FDA}}
regexp_UNII_Ref = re.compile(r"\|\s??UNII_Ref\s?=\s?(?P<TEMPLATE>\{\{\s*(Fdacite|fdacite).+?\}\})")
# Included for bot exclusion compliance (see http://en.wikipedia.org/wiki/Template:Bots)
Line 147 ⟶ 175:
return True
# list of articles to work on is generated by: "python pagegenerators.py -namespace:0 -transcludes:Drugbox > drugbox_titles.txt"
# articles = codecs.open('/Users/BogHog/progs/pywikipedia/drugbox/drugbox_titles.txt', mode = 'r', encoding='utf-8')
# drugbank data obtained from http://www.drugbank.ca/system/downloads/current/drugbank.txt.zip
drugbank_data = {}
Line 154 ⟶ 184:
# Name Trade_Names Drug_Type MedlinePlus Drugs.com_link KEGG_Drug_ID KEGG_Compound_ID ChemSpider_ID PubChem_Compound_ID
drug_data = csv.reader(open('/Users/
# drugs.com root links:
Line 170 ⟶ 200:
# main loop
new_drugbox = ""
for article in articles:
# article = article.rstrip('\n')
INN =
# INN = "Asprin"
# print article
Line 195 ⟶ 228:
result_drug_infobox = regexp_drug_infobox.search(text)
if result_drug_infobox:
# print "found it!"
parameters = result_drug_infobox.group('PARAMS')
current_parameters = {}
# First lift out the fields whose value is itself a template, storing the
# captured TEMPLATE text and deleting the field from `parameters`.
# Each entry is (drugbox field name, compiled pattern, replacement text).
# ATC_supplemental is replaced by "|" rather than "" because its pattern also
# consumes the pipe that follows the value.
nested_template_fields = (
    ('ATC_supplemental', regexp_ATC_supplemental, "|"),
    ('ChEMBL_Ref', regexp_ChEMBL_Ref, ""),
    ('ChemSpiderID_Ref', regexp_ChemSpiderID_Ref, ""),
    ('Drugs.com', regexp_Drugs_com, ""),
    ('KEGG_Ref', regexp_KEGG_Ref, ""),
    ('StdInChI_Ref', regexp_StdInChI_Ref, ""),
    ('StdInChIKey_Ref', regexp_StdInChIKey_Ref, ""),
    ('UNII_Ref', regexp_UNII_Ref, ""),
)
for field_name, field_regexp, replacement in nested_template_fields:
    field_match = field_regexp.search(parameters)
    if field_match:
        current_parameters[field_name] = field_match.group('TEMPLATE')
        # strip every occurrence of the field, exactly as re.sub(pattern, ...) did
        parameters = field_regexp.sub(replacement, parameters)
# print "parameters:"
# print parameters
lines = parameters.splitlines()
for line in lines:
#
# if (line.count('|') > 1 and line.count('[[') < 1 ):
if
# print "line1: ", line
# make sure we split only on the pipes that represent ends of the infobox entry, not the pipes used in links
# taken from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/
# Re-join pieces that were split on a pipe belonging to a wiki link: a piece
# that closes a link ("]]") without first opening one ("[[") is the tail of
# the previous piece.
new_list = [sub_strings[0]]
for sub_string in sub_strings[1:]:
    closes_link_first = ('[[' not in sub_string) or sub_string.find(']]') < sub_string.find('[[')
    if (']]' in sub_string) and closes_link_first:
        new_list[-1] = new_list[-1] + '|' + sub_string
    else:
        new_list.append(sub_string)
sub_strings = new_list
# Record every "name = value" piece in current_parameters.
for sub_string in sub_strings:
    if "=" in sub_string:
        # Split on the first "=" only, so a value that itself contains "="
        # (for example a URL with a query string) is kept whole.
        raw_parameter, raw_value = sub_string.split("=", 1)
        parameter = str(raw_parameter.encode()).strip()
        value = str(raw_value.encode()).strip()
        current_parameters[parameter] = value
else:
result_drug_param = regexp_param.search(line)
# print line
if result_drug_param:
# print "made it!"
# print "line2: ", line
parameter = result_drug_param.group('PARAM').strip()
value = result_drug_param.group('VALUE').strip()
current_parameters[parameter] = value
# print current_parameters
# print "INN: ", INN
# Fetch the drugbank_data row for this INN; an empty list means "no data".
data = drugbank_data.get(INN, [])
new_tradenames = []
merck_tradenames = []
if data:
    # data[0] is a ";"-separated list (presumably the Trade_Names column --
    # confirm against the code that fills drugbank_data).
    # NOTE(review): after de-duplicating and sorting, the first entry is
    # discarded -- presumably an empty string; confirm.
    unique_tradenames = sorted(set(data[0].split(";")))
    merck_tradenames = [
        string.capitalize(string.strip(raw_tradename.encode()))
        for raw_tradename in unique_tradenames[1:]
    ]
if current_parameters.has_key("tradename"):
current_tradenames = sorted(set(current_parameters['tradename'].split(", ")))
Line 223 ⟶ 356:
for index, object in enumerate(current_tradenames):
current_tradenames[index] = string.capitalize(string.strip(object.encode()))
merged_tradenames =
# Combine the Merck names with the names already in the article.
# NOTE(review): assumes merged_tradenames was initialised to a list on the
# preceding line, which is cut off in this view -- confirm.
# list(...) copies so that merck_tradenames itself is not modified, and
# extend() adds the individual names; append() would have nested the whole
# list as one element, making set() below fail on an unhashable list.
if merck_tradenames: merged_tradenames = list(merck_tradenames)
if current_tradenames: merged_tradenames.extend(current_tradenames)
if merged_tradenames:
    new_tradenames = sorted(set(merged_tradenames))
# print "merck tradenames: ", merck_tradenames
# print "current tradenames: ", current_tradenames
# print "new tradenames: ", current_parameters['tradename']
# test web page, returns "200" if OK:
Line 238 ⟶ 374:
# | MedlinePlus = <!-- MedlinePlus drug accession number, e.g., "a692051" that links to "http://www.nlm.nih.gov/medlineplus/druginfo/meds/a692051.html" -->
# Add the MedlinePlus parameter, but only when a data row exists for this drug.
if data:
# data[2] is used as the MedlinePlus accession number in the candidate URL
link = "http://www.nlm.nih.gov/medlineplus/druginfo/meds/" + data[2] + ".html"
if urllib.urlopen(link).getcode() == 200: # test link status to make sure it is good before assigning parameter
current_parameters['MedlinePlus'] = data[2]
# print "MedlinePlus: ", current_parameters['MedlinePlus']
# NOTE(review): indentation is lost in this view, so it is unclear which `if`
# this `else` pairs with -- confirm against the full file.
else:
link = ""
# add Drugs.com parameter
# | Drugs.com = <!-- link to Drugs.com monograph, e.g., "lisinopril" that links to "http://www.drugs.com/monograph/lisinopril.html" -->
# create alternate candidate drugs.com links
# alternative roots include the monograph, CDI, CONS, MTM, and "parent" sections of the drugs.com web site
# alternative stems include the INN, trade names, and the name extracted from the drugbank link
stems = []
drugnames = []
drugnames.append(INN)
if new_tradenames:
for tradename in new_tradenames:
drugnames.append(tradename)
if
if
if (string.find(data[3], "http://www.drugs.com/") > -1):
drugnames.append(temp)
# print "drugnames: ", drugnames
try:
link = root[1] + stem + ".html"
# print "attempted Drugs.com link: ", link
if urllib.urlopen(link).getcode() == 200: # test link status to make sure it is good before assigning parameter
current_parameters['Drugs.com'] = "{{drugs.com|" + root[0] + "|" + stem + "}}"
raise StopIteration()
# print "Drugs.com: ", current_parameters['Drugs.com']
except StopIteration:
pass
Line 277 ⟶ 430:
# while we are at it, populate KEGG_Drug_ID, ChemSpider_ID, and PubChem_Compound_ID fields if missing
# Fill identifier fields from the data row, never overwriting a value the
# article already has.  Each pair is (index into data, drugbox field name).
if data:
    for column, field_name in ((4, 'KEGG'), (6, 'ChemSpiderID'), (7, 'PubChem')):
        if data[column] and field_name not in current_parameters:
            current_parameters[field_name] = data[column]
# build new drugbox template
# Start the rebuilt template and emit the leading fields in display order;
# a field is written only when the article already had it.
new_drugbox = "{{Drugbox\n"
leading_fields = (
    "Watchedfields",
    "Verifiedfields",
    "verifiedrevid",
    "IUPAC_name",
    "OtherNames",
    "image",
    "width",
)
for field_name in leading_fields:
    if field_name in current_parameters:
        new_drugbox += "| " + field_name + " = " + current_parameters[field_name] + "\n"
Line 426 ⟶ 584:
# replace original drugbox with new drugbox
# Swap the rebuilt drugbox into the page text and save it; report the outcome.
if new_drugbox:
    # A callable replacement makes re insert new_drugbox verbatim.  Passed as
    # a plain replacement string, any backslash in the drugbox (for example in
    # a SMILES value) would be interpreted as an escape or group reference.
    new_text = regexp_drug_infobox.sub(lambda match: new_drugbox, text)
    page.put(new_text, comment='populated clinical fields in drugbox per [[Wikipedia:Bots/Requests_for_approval/BogBot_2|bot approval]]', watchArticle = None, minorEdit = True)
    print(", page updated")
else:
    print(", page not updated")
wikipedia.stopme()
|