#!/usr/bin/python
# Bot Script to populate new clinical fields in Drugbox templates in Wikipedia drug articles.
# The new fields are:
# | tradename = <!-- comma separated list of tradenames -->
# | Drugs.com = <!-- link to Drugs.com monograph, e.g., "lisinopril" that links to "http://www.drugs.com/monograph/lisinopril.html" -->
# | MedlinePlus = <!-- MedlinePlus drug accession number, e.g., "a692051" that links to "http://www.nlm.nih.gov/medlineplus/druginfo/meds/a692051.html" -->
"""{{Drugbox
| verifiedrevid = 408577806
| IUPAC_name =
| image =
| width =
| alt =
| image2 =
| width2 =
| alt2 =
| imagename = <!-- else may use drug_name -->
| drug_name = <!-- else may use imagename -->
| caption =
<!--Clinical data-->
| tradename =
| Drugs.com =
| MedlinePlus =
| licence_EU = <!-- EMA requires brand name -->
| licence_US = <!-- FDA may use generic name -->
| DailyMedID = <!-- preference to licence_US -->
| pregnancy_AU = <!-- A / B1 / B2 / B3 / C / D / X -->
| pregnancy_US = <!-- A / B / C / D / X -->
| pregnancy_category=
| legal_AU = <!-- S2, S3, S4, S5, S6, S7, S8, S9 or Unscheduled-->
| legal_CA = <!-- OTC, Rx-only, Schedule I, II, III, IV, V, VI, VII, VIII -->
| legal_UK = <!-- GSL, P, POM, CD, CD Lic, CD POM, CD No Reg POM, CD (Benz) POM, CD (Anab) POM or CD Inv POM -->
| legal_US = <!-- OTC / Rx-only / Schedule I, II, III, IV, V -->
| legal_status =
| dependency_liability =
| routes_of_administration =
<!--Pharmacokinetic data-->
| bioavailability =
| protein_bound =
| metabolism =
| elimination_half-life =
| excretion =
<!--Identifiers-->
| CAS_number =
| CAS_supplemental =
| ATCvet =
| ATC_prefix = <!-- 'none' if uncategorised -->
| ATC_suffix =
| ATC_supplemental =
| PubChem =
| PubChemSubstance =
| IUPHAR_ligand =
| DrugBank =
| ChemSpiderID =
| UNII =
| KEGG =
| ChEBI =
| ChEMBL =
<!--Chemical data-->
| chemical_formula =
| C= | H= | Ag= | As= | Au= | B= | Bi= | Br= | Cl= | Co= | F= | Fe= | Gd= | I=
| K= | Mn= | N= | Na= | O= | P= | Pt= | S= | Sb= | Se= | Sr= | Tc= | Zn= | charge=
| molecular_weight =
| smiles =
| StdInChI =
| StdInChI_comment =
| StdInChIKey =
| synonyms =
| density =
| melting_point =
| melting_high =
| melting_notes =
| boiling_point =
| boiling_notes =
| solubility =
| specific_rotation =
| sec_combustion =
<!--Combo data-->
| type = combo
| drug_name =
| component1 = <!-- Drugname, automatically linked -->
| class1 = <!-- Group, manual link using [[..|..]] -->
| component2 = <!-- Drugname, automatically linked -->
| class2 = <!-- Group, manual link using [[..|..]] -->
| component3 = <!-- Drugname, automatically linked -->
| class3 = <!-- Group, manual link using [[..|..]] -->
| component4 = <!-- Drugname, automatically linked -->
| class4 = <!-- Group, manual link using [[..|..]] -->
<!--Monoclonal antibody data-->
| type = mab
| image =
| width =
| alt =
| image2 =
| width2 =
| alt2 =
| imagename = <!-- else may use drug_name -->
| drug_name = <!-- else may use imagename -->
| mab_type = <!-- mab, Fab, F(ab')2, Fab', scFv, di-scFv, 3funct, clFab, BiTE -->
| source = <!-- a, e, i, o, u, xi/a, zu/a, xizu/a, axo, ... -->
| target = <!-- antigen -->
<!--Vaccine data-->
| type = vaccine
| image =
| alt =
| width =
| image2 =
| alt2 =
| width2 =
| imagename = <!-- else may use drug_name -->
| drug_name = <!-- else may use imagename -->
| target = <!-- the antigen/bacteria/toxin/virus to protect against -->
| vaccine_type = <!-- killed/attenuated/live/toxoid/protein subunit/subunit/conjugate/recombinant/DNA -->
}}"""
import re
import string
import wikipedia
from collections import defaultdict
import urllib
import csv
import string
# Included for bot exclusion compliance (see http://en.wikipedia.org/wiki/Template:Bots)
# Account name the bot edits under; it is looked for inside {{bots|deny=...}} lists.
user = "BogBot"
# Matches {{nobots}} and the {{bots|...}} variants that exclude this bot
# (allow=none, deny=<list containing user>, optout=all, deny=all).
# The name is escaped so that a bot name containing regex metacharacters is
# still matched literally; for "BogBot" the resulting pattern is unchanged.
regexp_ab = re.compile(r'\{\{(nobots|bots\|(allow=none|deny=.*?' + re.escape(user) + r'.*?|optout=all|deny=all))\}\}')
# Matches a {{Drugbox ...}} transclusion and captures its body as PARAMS.
# NOTE: ".+" is greedy and DOTALL is set, so the match extends to the LAST
# "}}" in the searched text, not to the brace that closes the drugbox.
regexp_drug_infobox = re.compile(r"\{\{\s*(Drugbox|drugbox)\s*(?P<PARAMS>.+)\s*\}\}\s*", re.DOTALL)
# Matches one "| name = value" line.
#  * any amount of whitespace is accepted around "|" and "="
#  * PARAM stops at the first "=", so "|smiles=CC(=O)O" yields PARAM "smiles"
#    rather than splitting at the last "=" of the value
#  * VALUE must start with a non-blank character, so empty or whitespace-only
#    parameters do not match at all
regexp_param = re.compile(r"^\s*\|\s*(?P<PARAM>[^\s=|]+)\s*=\s*(?P<VALUE>\S.*)$")
def Allowbots(text):
    """Tell whether the bot is permitted to edit a page.

    text -- wikitext of the page to inspect.

    Returns False when the bot-exclusion pattern (regexp_ab) is found
    anywhere in the text, True otherwise.
    """
    exclusion_found = regexp_ab.search(text) is not None
    return not exclusion_found
# articles = open('/Users/BogBot/progs/pywikipedia/drugbox/drugbox_titles.txt', 'r')
# Lookup table: drug name (CSV column "Name") -> list of the remaining columns.
drugbank_data = {}
# Index into drugbank_data[name] (the Name column itself is the key):
#   0 Trade_Names   1 Drug_Type          2 MedlinePlus     3 Drugs.com_link
#   4 KEGG_Drug_ID  5 KEGG_Compound_ID   6 ChemSpider_ID   7 PubChem_Compound_ID
# The file is opened in a "with" block so the handle is closed once loaded.
with open('/Users/BogBot/progs/pywikipedia/drugbox/drug_links_agumented.csv', 'rU') as drug_file:
    drug_data = csv.reader(drug_file, dialect='excel')
    for row in drug_data:
        # Blank lines come back as empty rows; rows without a name cannot be keyed.
        if not row or not row[0]:
            continue
        fields = row[1:]
        # Pad short rows so that indexes 0-7 are always safe to read.
        fields.extend([""] * (8 - len(fields)))
        drugbank_data[row[0]] = fields
# articles = []
articles = ["User:Boghog/Sandbox2"]
# articles = ["Template:Drugbox/Lisinopril"]
# main loop
#
# For every article: fetch the wikitext, locate the {{Drugbox}}, merge in the
# data loaded into drugbank_data, rebuild the drugbox in canonical parameter
# order and save the page.  The layout of the rebuilt box is table driven;
# parameters that are not listed in any table are kept and appended at the
# end of the box instead of being discarded.

# Parameters emitted first, before any section comment.
_HEADER_FIELDS = (
    "verifiedrevid", "IUPAC_name", "image", "width", "alt",
    "image2", "width2", "alt2", "imagename", "drug_name", "caption",
)

# Sections that are only emitted when "type" has the given value:
# type value -> (section comment, parameters following "| type = ...").
_TYPE_SECTIONS = {
    "combo": ("Combo data", (
        "component1", "class1", "component2", "class2",
        "component3", "class3", "component4", "class4",
    )),
    "mab": ("Monoclonal antibody data", ("mab_type", "source", "target")),
    "vaccine": ("Vaccine data", ("target", "vaccine_type")),
}

# Sections that are always emitted, in this order.
_PLAIN_SECTIONS = (
    ("Clinical data", (
        "tradename", "Drugs.com", "MedlinePlus", "licence_EU", "licence_US",
        "DailyMedID", "pregnancy_AU", "pregnancy_US", "pregnancy_category",
        "legal_AU", "legal_CA", "legal_UK", "legal_US", "legal_status",
        "dependency_liability", "routes_of_administration",
    )),
    ("Pharmacokinetic data", (
        "bioavailability", "protein_bound", "metabolism",
        "elimination_half-life", "excretion",
    )),
    ("Identifiers", (
        "CAS_number", "CAS_supplemental", "ATCvet", "ATC_prefix",
        "ATC_suffix", "ATC_supplemental", "PubChem", "PubChemSubstance",
        "IUPHAR_ligand", "DrugBank", "ChemSpiderID_Ref", "ChemSpiderID",
        "UNII_Ref", "UNII", "KEGG_Ref", "KEGG", "ChEBI_Ref", "ChEBI",
        "ChEMBL_Ref", "ChEMBL",
    )),
)

# Element counts share one line of the "Chemical data" section.
_ELEMENT_FIELDS = (
    "C", "H", "Ag", "As", "Au", "B", "Bi", "Br", "Cl", "Co", "F", "Fe", "Gd",
    "I", "K", "Mn", "N", "Na", "O", "P", "Pt", "S", "Sb", "Se", "Sr", "Tc",
    "Zn",
)

# Remaining "Chemical data" parameters, emitted after the element line.
_CHEMICAL_TAIL_FIELDS = (
    "molecular_weight", "smiles", "InChI_Ref", "InChI", "StdInChI_Ref",
    "StdInChI", "StdInChI_comment", "StdInChIKey_Ref", "StdInChIKey",
    "synonyms", "density", "melting_point", "melting_high", "melting_notes",
    "boiling_point", "boiling_notes", "solubility", "specific_rotation",
    "sec_combustion",
)

_BRACE_TOKEN = re.compile(r"\{\{|\}\}")


def _template_end(text, start):
    """Return the index of the "}}" closing the template opened at start.

    Nested templates are skipped by counting "{{" / "}}" pairs.  Returns
    None when the braces never balance.
    """
    depth = 0
    for token in _BRACE_TOKEN.finditer(text, start):
        if token.group() == "{{":
            depth += 1
        else:
            depth -= 1
            if depth <= 0:
                return token.start()
    return None


def _is_comment_line(line):
    """Return True when the stripped line consists of one HTML comment."""
    return (line.startswith("<!--") and line.endswith("-->")
            and line.count("<!--") == 1)


def _parse_parameters(params_text):
    """Split the body of a drugbox into its parameters.

    Returns (values, order): a dict of parameter name -> value and the list
    of names in order of first appearance.  Parameters without a value are
    omitted.  Section comment lines are skipped.  A line that does not start
    a new parameter, or that lies inside a still-open nested template, is
    appended to the value of the preceding parameter.
    """
    values = {}
    order = []
    current = None
    depth = 0  # number of nested "{{" still open inside the current value
    for raw_line in params_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if depth <= 0:
            if _is_comment_line(line):
                continue
            if line.startswith("|"):
                # Split at the FIRST "=" so values containing "=" stay whole.
                name, separator, value = line[1:].partition("=")
                name = name.strip()
                value = value.strip()
                if separator and name:
                    current = name
                    if name not in values:
                        order.append(name)
                        values[name] = value
                    elif value:
                        values[name] = value
                    depth = value.count("{{") - value.count("}}")
                    continue
        if current is not None:
            if values[current]:
                values[current] += "\n" + line
            else:
                values[current] = line
            depth += line.count("{{") - line.count("}}")
    order = [name for name in order if values[name]]
    values = dict((name, values[name]) for name in order)
    return values, order


def _merge_tradenames(csv_names, current_names):
    """Merge ";"-separated CSV tradenames with the ","-separated current ones.

    Names are stripped and capitalized, duplicates and empty entries are
    removed, and the result is returned as a sorted ", "-joined string.
    """
    # NOTE(review): the previous code dropped the first sorted CSV entry
    # unconditionally, presumably to discard an empty string; that loses a
    # real tradename whenever no empty entry exists, so empties are filtered
    # explicitly instead.  Confirm against the CSV contents.
    candidates = csv_names.split(";") + current_names.split(",")
    cleaned = set(name.strip().capitalize() for name in candidates)
    cleaned.discard("")
    return ", ".join(sorted(cleaned))


def _link_is_live(url):
    """Return True when fetching url answers with HTTP status 200."""
    try:
        response = urllib.urlopen(url)
    except IOError:
        # Network failure: treat the link as unverified instead of aborting.
        return False
    try:
        return response.getcode() == 200
    finally:
        response.close()


def _build_drugbox(params, order):
    """Render params as a {{Drugbox}} in canonical section order.

    params -- dict of parameter name -> value
    order  -- names in their original order; used for parameters that are
              not part of the canonical layout, which are appended last.
    """
    emitted = set()
    chunks = ["{{Drugbox\n"]

    def emit(name, layout="| %s = %s\n"):
        if name in params:
            chunks.append(layout % (name, params[name]))
            emitted.add(name)

    for name in _HEADER_FIELDS:
        emit(name)

    drug_type = params.get("type")
    if drug_type in _TYPE_SECTIONS:
        title, fields = _TYPE_SECTIONS[drug_type]
        chunks.append("\n<!--" + title + "-->\n")
        emit("type")
        for name in fields:
            emit(name)

    for title, fields in _PLAIN_SECTIONS:
        chunks.append("\n<!--" + title + "-->\n")
        for name in fields:
            emit(name)

    chunks.append("\n<!--Chemical data-->\n")
    emit("chemical_formula")
    mark = len(chunks)
    for name in _ELEMENT_FIELDS:
        emit(name, "| %s=%s ")
    emit("charge", "| %s = %s ")
    if len(chunks) > mark:
        chunks.append("\n")  # terminate the shared element line
    for name in _CHEMICAL_TAIL_FIELDS:
        emit(name)

    # Keep every parameter the layout does not know about.
    for name in order:
        if name not in emitted:
            emit(name)

    chunks.append("}}\n")
    return "".join(chunks)


try:
    site = wikipedia.getSite()
    for article in articles:
        article = article.rstrip('\n')
        # INN = article
        # NOTE(review): INN is hard-coded while the article list points at a
        # sandbox page; switch to the line above for a real run.
        INN = "Lisinopril"
        print("* [[" + article + "]] , article: " + article)

        page = wikipedia.Page(site, article)
        text = page.get(get_redirect = True)
        if not Allowbots(text):
            # Exclusion applies to this page only; keep processing the rest.
            print("bot excluded from this page, skipped")
            continue

        result_drug_infobox = regexp_drug_infobox.search(text)
        if not result_drug_infobox:
            print("no drugbox found, skipped")
            continue
        # regexp_drug_infobox is greedy and can run to the last "}}" on the
        # page, so only the start of its match is used; the real end of the
        # template is found by balancing braces.
        box_start = result_drug_infobox.start()
        close_at = _template_end(text, box_start)
        if close_at is None:
            print("drugbox braces are unbalanced, skipped")
            continue
        print("found it!")
        current_parameters, parameter_order = _parse_parameters(
            text[result_drug_infobox.start('PARAMS'):close_at])

        print("INN:  " + INN)
        if INN not in drugbank_data:
            print("no data for this drug, skipped")
            continue
        data = list(drugbank_data[INN])
        data.extend([""] * (8 - len(data)))  # tolerate short CSV rows

        # merge tradenames
        tradenames = _merge_tradenames(data[0], current_parameters.get('tradename', ""))
        if tradenames:
            current_parameters['tradename'] = tradenames
            print("new tradenames:  " + tradenames)

        # add MedlinePlus parameter
        # | MedlinePlus = <!-- MedlinePlus drug accession number, e.g., "a692051" that links to "http://www.nlm.nih.gov/medlineplus/druginfo/meds/a692051.html" -->
        if data[2]:
            link = "http://www.nlm.nih.gov/medlineplus/druginfo/meds/" + data[2] + ".html"
            if _link_is_live(link):  # only assign the parameter when the link resolves
                current_parameters['MedlinePlus'] = data[2]

        # add Drugs.com parameter
        # | Drugs.com = <!-- link to Drugs.com monograph, e.g., "lisinopril" that links to "http://www.drugs.com/monograph/lisinopril.html" -->
        INN_html = INN.replace(" ", "_").lower()
        link = "http://www.drugs.com/monograph/" + INN_html + ".html"
        if _link_is_live(link):  # only assign the parameter when the link resolves
            current_parameters['Drugs.com'] = INN_html

        # while we are at it, populate KEGG_Drug_ID, ChemSpider_ID, and PubChem_Compound_ID fields if missing
        for parameter, column in (("KEGG", 4), ("ChemSpiderID", 6), ("PubChem", 7)):
            if data[column] and parameter not in current_parameters:
                current_parameters[parameter] = data[column]

        # build new drugbox template
        new_drugbox = _build_drugbox(current_parameters, parameter_order)

        # replace original drugbox with new drugbox; whitespace following the
        # closing braces is absorbed because the new box ends with a newline.
        # Slicing is used instead of re.sub so that backslashes in values
        # (e.g. in SMILES strings) are not interpreted as escapes.
        box_end = close_at + 2
        while box_end < len(text) and text[box_end].isspace():
            box_end += 1
        new_text = text[:box_start] + new_drugbox + text[box_end:]
        if new_text == text:
            print("nothing to change, skipped")
            continue
        print(new_text)
        page.put(new_text, comment='populated clinical fields in drugbox', watchArticle = None, minorEdit = True)
        print(", page updated")
finally:
    # Always release the pywikipedia throttle, even when a page fails.
    wikipedia.stopme()