User:BogBot/Source code/Task 03

#!/usr/bin/python

# Bot Script to populate new clinical fields in Drugbox templates in Wikipedia drug articles.
# The new fields are:
# | tradename   =  <!-- comma separated list of tradenames --> 
# | Drugs.com   =  <!--  link to Drugs.com monograph, e.g., "lisinopril" that links to "http://www.drugs.com/monograph/lisinopril.html" -->
# | MedlinePlus =  <!-- MedlinePlus drug accession number, e.g.,  "a692051" that links to "http://www.nlm.nih.gov/medlineplus/druginfo/meds/a692051.html" --> 
# In addition, populate KEGG_Drug_ID, ChemSpider_ID, and PubChem_Compound_ID fields if missing with data from http://www.drugbank.ca/system/downloads/current/drugbank.txt.zip
# The tradenames were obtained from http://www.merckmanuals.com/home/drugnames-index/trade/a.html
# Finally the script sorts the fields in the order they are currently rendered by the drugbox template
# (in the order of clinical, pharmacokinetic, identifiers, and chemical data)

"""{{Drugbox
| Watchedfields
| verifiedrevid = 408577806
| IUPAC_name        = 
| OtherNames        = 
| image             = 
| width             = 
| alt               = 
| image2            = 
| width2            = 
| alt2              = 
| imagename         = <!-- else may use drug_name -->
| drug_name         = <!-- else may use imagename -->
| caption           = 

<!--Clinical data-->
| tradename         =  
| Drugs.com         =
| MedlinePlus       =
| licence_EU        = <!-- EMA requires brand name -->
| licence_US        = <!-- FDA may use generic name -->
| DailyMedID        = <!-- preference to licence_US -->
| pregnancy_AU      = <!-- A / B1 / B2 / B3 / C / D / X -->
| pregnancy_US      = <!-- A / B            / C / D / X -->
| pregnancy_category= 
| legal_AU = <!-- S2, S3, S4, S5, S6, S7, S8, S9 or Unscheduled-->
| legal_CA = <!-- OTC, Rx-only, Schedule I, II, III, IV, V, VI, VII, VIII -->
| legal_UK = <!-- GSL, P, POM, CD, CD Lic, CD POM, CD No Reg POM, CD (Benz) POM, CD (Anab) POM or CD Inv POM -->
| legal_US = <!-- OTC / Rx-only / Schedule I, II, III, IV, V -->
| legal_status      = 
| dependency_liability = 
| routes_of_administration = 

<!--Pharmacokinetic data-->
| bioavailability   = 
| protein_bound     = 
| metabolism        = 
| elimination_half-life = 
| excretion         = 

<!--Identifiers-->
| CAS_number        = 
| CAS_supplemental  = 
| ATCvet            = 
| ATC_prefix        = <!-- 'none' if uncategorised -->
| ATC_suffix        = 
| ATC_supplemental  = 
| PubChem           = 
| PubChemSubstance  = 
| IUPHAR_ligand     = 
| DrugBank          = 
| ChemSpiderID      = 
| UNII              =
| KEGG              =
| ChEBI             =
| ChEMBL            =

<!--Chemical data-->
| chemical_formula  = 
| C= | H= | Ag= | As= | Au= | B= | Bi= | Br= | Cl= | Co= | F= | Fe= | Gd= | I=
| K= | Mn= | N= | Na= | O= | P= | Pt= | S= | Sb= | Se= | Sr= | Tc= | Zn= | charge=
| molecular_weight  = 
| smiles            = 
| StdInChI          =
| StdInChI_comment  =
| StdInChIKey       =
| synonyms          = 
| density           = 
| melting_point     = 
| melting_high      = 
| melting_notes     = 
| boiling_point     = 
| boiling_notes     = 
| solubility        = 
| specific_rotation = 
| sec_combustion    = 

<!--Combo data-->
| type              = combo
| drug_name         = 
| component1        = <!-- Drugname, automatically linked -->
| class1            = <!-- Group, manual link using [[..|..]] -->
| component2        = <!-- Drugname, automatically linked -->
| class2            = <!-- Group, manual link using [[..|..]] -->
| component3        = <!-- Drugname, automatically linked -->
| class3            = <!-- Group, manual link using [[..|..]] -->
| component4        = <!-- Drugname, automatically linked -->
| class4            = <!-- Group, manual link using [[..|..]] -->

<!--Monoclonal antibody data-->
| type              = mab
| image             = 
| width             = 
| alt               = 
| image2            = 
| width2            = 
| alt2              = 
| imagename         = <!-- else may use drug_name -->
| drug_name         = <!-- else may use imagename -->
| mab_type          = <!-- mab, Fab, F(ab')2, Fab', scFv, di-scFv, 3funct, clFab, BiTE -->
| source            = <!-- a, e, i, o, u, xi/a, zu/a, xizu/a, axo, ... -->
| target            = <!-- antigen -->

<!--Vacine data-->
| type              = vaccine
| image             = 
| alt               = 
| width             = 
| image2            = 
| alt2              = 
| width2            = 
| imagename         = <!-- else may use drug_name -->
| drug_name         = <!-- else may use imagename -->
| target            = <!-- the antigen/bacteria/toxin/virus to protect against -->
| vaccine_type      = <!-- killed/attenuated/live/toxoid/protein subunit/subunit/conjugate/recombinant/DNA -->

}}"""

from collections import defaultdict
import codecs
import csv
import re
import string
import urllib
import wikipedia

# compiled regular expressions

user =  "BogBot"
# Matches the {{nobots}} / {{bots|...}} exclusion templates that apply to this bot.
# The bot name is escaped so that it is always matched literally.
regexp_ab              = re.compile(r'\{\{(nobots|bots\|(allow=none|deny=.*?' + re.escape(user) + r'.*?|optout=all|deny=all))\}\}')

# adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/
# Build a regex to locate the drugbox
exp =       r'\{\{'       # the opening brackets for the drugbox
exp = exp + r'\s*'        # any amount of whitespace
exp = exp + r'[Dd]rugbox' # the word "drugbox", capitalized or not
exp = exp + r'.*\}\}'     # any amount of anything (greedy), followed by the last "}}" in the text

regexp_drug_infobox    = re.compile(exp, re.DOTALL)
# A single "| name = value" line; VALUE still carries surrounding whitespace.
regexp_param           = re.compile(r"^\s*?\|\s*?(?P<PARAM>\S+)\s*?=\s*?(?P<VALUE>.+)\s*?($|\|)")
regexp_nested_template = re.compile(r"\{\{(?P<PARAMS>.+)\}\}")

# ATC_supplemental =  {{ATC|B01|AC06}}, {{ATC|N02|BA01}}
# TEMPLATE consumes whole {{...}} templates (including their inner pipes) and any
# other non-pipe characters on the line, so the value is no longer cut off at the
# first pipe inside {{ATC|...}}.  The match still ends on the pipe that starts the
# next parameter (or at the end of the text).
regexp_ATC_supplemental = re.compile(r"\|\s*?ATC_supplemental\s*?=\s*?(?P<TEMPLATE>(?:\{\{.*?\}\}|[^|\n])*?)\s*?($|\|)")

# In the patterns below, "\{\{\s*" allows optional whitespace after the opening
# braces (the previous "\{\{s*" only allowed literal "s" characters there).
# CASNo_Ref = {{cascite|correct|CAS}}
regexp_CASNo_Ref = re.compile(r"\|\s??CASNo_Ref\s?=\s?(?P<TEMPLATE>\{\{\s*(Cascite|cascite).+?\}\})")
# ChEMBL_Ref = {{ebicite|correct|EBI}}
regexp_ChEMBL_Ref = re.compile(r"\|\s??ChEMBL_Ref\s?=\s?(?P<TEMPLATE>\{\{\s*(Ebicite|ebicite).+?\}\})")
# ChemSpiderID_Ref = {{chemspidercite|correct|chemspider}}
regexp_ChemSpiderID_Ref = re.compile(r"\|\s??ChemSpiderID_Ref\s?=\s?(?P<TEMPLATE>\{\{\s*(Chemspidercite|chemspidercite).+?\}\})")
# Drugs.com = {{drugs.com|monograph|lisinopril}}
regexp_Drugs_com = re.compile(r"\|\s??Drugs\.com\s?=\s?(?P<TEMPLATE>\{\{\s*(Drugs\.com|drugs\.com).+?\}\})")
# KEGG_Ref = {{keggcite|correct|kegg}}
regexp_KEGG_Ref = re.compile(r"\|\s??KEGG_Ref\s?=\s?(?P<TEMPLATE>\{\{\s*(Keggcite|keggcite).+?\}\})")
# StdInChI_Ref = {{stdinchicite|correct|chemspider}}
regexp_StdInChI_Ref = re.compile(r"\|\s??StdInChI_Ref\s?=\s?(?P<TEMPLATE>\{\{\s*(Stdinchicite|stdinchicite).+?\}\})")
# StdInChIKey_Ref = {{stdinchicite|correct|chemspider}}
regexp_StdInChIKey_Ref = re.compile(r"\|\s??StdInChIKey_Ref\s?=\s?(?P<TEMPLATE>\{\{\s*(Stdinchicite|stdinchicite).+?\}\})")
# UNII_Ref = {{fdacite|changed|FDA}}
regexp_UNII_Ref = re.compile(r"\|\s??UNII_Ref\s?=\s?(?P<TEMPLATE>\{\{\s*(Fdacite|fdacite).+?\}\})")

def Allowbots(text):
    """Return True unless *text* contains a bot-exclusion template for this bot.

    The check is delegated to the module-level ``regexp_ab`` pattern; any
    match means the page opts out, so the function returns False.
    """
    return regexp_ab.search(text) is None

def find_drugbox_from_text(article_text):
    """Locate the parameter section of the first drugbox in *article_text*.

    adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/

    Returns a ``(start, end)`` tuple of offsets into *article_text*, where
    ``start`` is the position of the first pipe of the drugbox and ``end`` is
    the position of the first brace of its closing "}}".  Returns None when
    no drugbox is found, when its braces never balance, or when no pipe
    occurs before the closing braces (previously the last two cases raised a
    TypeError or produced a meaningless span).
    """
    search_result = regexp_drug_infobox.search(article_text)
    if not search_result:
        return None

    result_text = search_result.group(0)  # the entire matching sequence
    begin = search_result.start()

    # The regex is greedy and runs to the last "}}" in the text, so walk the
    # braces to find where the drugbox itself is closed.
    depth = 0
    last_ind = None
    for ind, c in enumerate(result_text):
        if c == '}':
            depth -= 1
        elif c == '{':
            depth += 1
        if depth == 0 and ind != 0:
            last_ind = ind
            break
    if last_ind is None:
        # unbalanced braces: there is no reliable end of the template
        return None

    offset = result_text.find('|')
    if offset == -1 or offset > last_ind - 1:
        # no parameter pipe inside the drugbox itself
        return None

    ___location = (begin + offset, begin + last_ind - 1)
    return ___location

def drugbank():
    """Load the augmented DrugBank link table into a dictionary.

    drugbank data obtained from http://www.drugbank.ca/system/downloads/current/drugbank.txt.zip

    Returns a dict that maps the first column of every CSV row to the list of
    the remaining columns.  According to the original column note, the first
    column is the drug name and the remaining columns are (presumably in this
    order -- verify against the CSV file):

        0 Trade_Names, 1 Drug_Type, 2 MedlinePlus, 3 Drugs.com_link,
        4 KEGG_Drug_ID, 5 KEGG_Compound_ID, 6 ChemSpider_ID,
        7 PubChem_Compound_ID

    Raises IOError if the CSV file cannot be opened.
    """
    drugbank_data = {}

    # "with" closes the file even if reading fails part-way through
    with open('/Users/BogHog/progs/pywikipedia/drugbox/drug_links_agumented.csv', 'rU') as csv_file:
        for row in csv.reader(csv_file, dialect='excel'):
            if not row:
                # csv.reader yields [] for blank lines; there is no key to store
                continue
            drugbank_data[row[0]] = row[1:]

    return drugbank_data

def assign_nested_templates(parameters, current_parameters):
    """Pull commonly used nested templates out of the drugbox text.

    Each pattern is tried in turn against the (progressively shortened) text.
    When it matches, the TEMPLATE group of the first match is stored in
    *current_parameters* under the listed key and every occurrence of the
    pattern is replaced in the text.  The stripped text is returned.
    """
    # (pattern, key stored in current_parameters, replacement text)
    extraction_table = (
        (regexp_ATC_supplemental, 'ATC_supplemental', "|"),
        (regexp_ChEMBL_Ref,       'ChEMBL_Ref',       ""),
        (regexp_ChemSpiderID_Ref, 'ChemSpiderID_Ref', ""),
        (regexp_Drugs_com,        'Drugs.com',        ""),
        (regexp_KEGG_Ref,         'KEGG_Ref',         ""),
        (regexp_StdInChI_Ref,     'StdInChI_Ref',     ""),
        (regexp_StdInChIKey_Ref,  'StdInChIKey_Ref',  ""),
        (regexp_UNII_Ref,         'UNII_Ref',         ""),
    )

    remaining = parameters
    for pattern, key, replacement in extraction_table:
        found = pattern.search(remaining)
        if found is None:
            continue
        current_parameters[key] = found.group('TEMPLATE')
        remaining = pattern.sub(replacement, remaining)

    return remaining

def _rejoin_split_chunks(chunks, opener, closer):
    """Undo pipe splits that fell inside an opener/closer pair such as [[ ]].

    A chunk whose first *closer* appears before any *opener* must be the tail
    of a construct opened in an earlier chunk, so it is glued back onto the
    preceding chunk with the pipe restored.  Passes are repeated on a fresh
    list until nothing more is merged, so constructs containing several pipes
    are fully reassembled.  Every merge shortens the list, which guarantees
    termination.
    adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/
    """
    while True:
        rejoined = chunks[:1]
        for chunk in chunks[1:]:
            close_at = chunk.find(closer)
            open_at = chunk.find(opener)
            if close_at != -1 and (open_at == -1 or close_at < open_at):
                rejoined[-1] = rejoined[-1] + '|' + chunk
            else:
                rejoined.append(chunk)
        if len(rejoined) == len(chunks):
            return rejoined
        chunks = rejoined


def parse_line(line, current_parameters):
    """Parse one drugbox line and store its parameters in *current_parameters*.

    Lines holding more than one pipe may carry several "name = value" pairs
    (or pipes inside links/templates); they are split on the pipes that
    separate parameters only.  Any other line is matched against the
    module-level ``regexp_param`` pattern.  Names and values are stored
    stripped of surrounding whitespace.  Returns None.
    """
    if line.count('|') > 1:
        chunks = line.split("|")
        # split only on the pipes that end an infobox entry, not on the pipes
        # used inside wiki links or nested templates
        chunks = _rejoin_split_chunks(chunks, '[[', ']]')
        chunks = _rejoin_split_chunks(chunks, '{{', '}}')

        # now assign the parameters
        for chunk in chunks:
            if "=" in chunk:
                # split on the first "=" only: values such as SMILES strings
                # or URLs legitimately contain further "=" characters
                parameter, _, value = chunk.partition("=")
                current_parameters[parameter.strip()] = value.strip()
    else:
        result_drug_param = regexp_param.search(line)
        if result_drug_param:
            parameter = result_drug_param.group('PARAM').strip()
            value     = result_drug_param.group('VALUE').strip()
            current_parameters[parameter] = value
        
# Field names in the order in which build_new_drugbox() emits them.
_DRUGBOX_HEADER_FIELDS = (
    "Watchedfields", "Verifiedfields", "verifiedrevid", "IUPAC_name",
    "OtherNames", "image", "width", "alt", "image2", "width2", "alt2",
    "imagename", "drug_name", "caption",
)

# (value of the "type" parameter, section heading, fields of that section)
_DRUGBOX_TYPE_SECTIONS = (
    ("combo", "\n<!--Combo data-->\n",
     ("type", "component1", "class1", "component2", "class2",
      "component3", "class3", "component4", "class4")),
    ("mab", "\n<!--Monoclonal antibody data-->\n",
     ("type", "mab_type", "source", "target")),
    ("vaccine", "\n<!--Vacine data-->\n",
     ("type", "target", "vaccine_type")),
)

_DRUGBOX_CLINICAL_FIELDS = (
    "tradename", "Drugs.com", "MedlinePlus", "licence_EU", "licence_US",
    "DailyMedID", "pregnancy_AU", "pregnancy_US", "pregnancy_category",
    "legal_AU", "legal_CA", "legal_UK", "legal_US", "legal_status",
    "dependency_liability", "routes_of_administration",
)

_DRUGBOX_PHARMACOKINETIC_FIELDS = (
    "bioavailability", "protein_bound", "metabolism",
    "elimination_half-life", "excretion",
)

_DRUGBOX_IDENTIFIER_FIELDS = (
    "CAS_number", "CAS_supplemental", "ATCvet", "ATC_prefix", "ATC_suffix",
    "ATC_supplemental", "PubChem", "PubChemSubstance", "IUPHAR_ligand",
    "DrugBank", "ChemSpiderID_Ref", "ChemSpiderID", "UNII_Ref", "UNII",
    "KEGG_Ref", "KEGG", "ChEBI_Ref", "ChEBI", "ChEMBL_Ref", "ChEMBL",
)

# Element counts are written side by side on a single line.
_DRUGBOX_ELEMENT_FIELDS = (
    "C", "H", "Ag", "As", "Au", "B", "Bi", "Br", "Cl", "Co", "F", "Fe", "Gd",
    "I", "K", "Mn", "N", "Na", "O", "P", "Pt", "S", "Sb", "Se", "Sr", "Tc",
    "Zn",
)

_DRUGBOX_PROPERTY_FIELDS = (
    "molecular_weight", "smiles", "InChI_Ref", "InChI", "StdInChI_Ref",
    "StdInChI", "StdInChI_comment", "StdInChIKey_Ref", "StdInChIKey",
    "synonyms", "density", "melting_point", "melting_high", "melting_notes",
    "boiling_point", "boiling_notes", "solubility", "specific_rotation",
    "sec_combustion",
)


def build_new_drugbox(current_parameters):
    """Build the parameter text of a drugbox with its fields in rendering order.

    *current_parameters* maps parameter names to string values.  Only the
    parameters that are present are written, grouped as: general fields, the
    optional type-specific section (combo / mab / vaccine), clinical data,
    pharmacokinetic data, identifiers and chemical data.  The four section
    headings after the type section are always written.  Returns the text as
    one string.

    Compared with the previous version: the antimony count is written as
    "Sb=" (it used to be written as "C="), and the documented fields alt2,
    Zn, melting_high and melting_notes are no longer dropped.

    NOTE(review): parameters that appear in none of the field lists are not
    emitted at all -- confirm that callers do not rely on them surviving.
    """
    pieces = []

    def emit_lines(names):
        # one "| name = value" line per parameter that is present
        for name in names:
            if name in current_parameters:
                pieces.append("| " + name + " = " + current_parameters[name] + "\n")

    emit_lines(_DRUGBOX_HEADER_FIELDS)

    if "type" in current_parameters:
        for type_value, heading, names in _DRUGBOX_TYPE_SECTIONS:
            if current_parameters["type"] == type_value:
                pieces.append(heading)
                emit_lines(names)

    pieces.append("\n<!--Clinical data-->\n")
    emit_lines(_DRUGBOX_CLINICAL_FIELDS)

    pieces.append("\n<!--Pharmacokinetic data-->\n")
    emit_lines(_DRUGBOX_PHARMACOKINETIC_FIELDS)

    pieces.append("\n<!--Identifiers-->\n")
    emit_lines(_DRUGBOX_IDENTIFIER_FIELDS)

    pieces.append("\n<!--Chemical data-->\n")
    emit_lines(("chemical_formula",))
    for element in _DRUGBOX_ELEMENT_FIELDS:
        if element in current_parameters:
            pieces.append("| " + element + "=" + current_parameters[element] + " ")
    if "charge" in current_parameters:
        pieces.append("| charge = " + current_parameters["charge"] + " ")
    pieces.append("\n")  # ends the element line, even when it is empty
    emit_lines(_DRUGBOX_PROPERTY_FIELDS)

    return "".join(pieces)
    
def merged_tradenames(merck_tradename, current_tradename):
    """Merge the Merck Manual tradenames with those already in the drugbox.

    merck_tradename   -- ";"-separated tradenames (may be "" or None)
    current_tradename -- ","-separated tradenames taken from the existing
                         drugbox "tradename" field (may be "" or None)

    Every name is stripped of surrounding whitespace and capitalized
    (first letter upper case, the rest lower case), so spellings that
    differ only in case collapse into one entry.  Empty entries, e.g.
    those produced by a leading or trailing separator, are discarded.

    Returns the unique names sorted alphabetically and joined with ", ",
    or "" when there are no names at all.
    """

    def normalized(raw, separator):
        # Split one raw field into a set of cleaned-up names.  Blank pieces
        # are filtered explicitly instead of relying on them sorting first
        # and slicing them off, which dropped a real name whenever the
        # field had no blank piece.
        names = set()
        for piece in (raw or "").split(separator):
            piece = piece.strip()
            if piece:
                names.add(piece.capitalize())
        return names

    # no .encode() here: the names stay in whatever string type was passed in,
    # so non-ASCII tradenames no longer raise a UnicodeError
    merged = normalized(merck_tradename, ";") | normalized(current_tradename, ",")
    return ", ".join(sorted(merged))
        
#       print "merck tradenames: ", merck_tradenames
#       print "current tradenames: ", current_tradenames


def test_MedlinePlus(accession_number):
    """Check that a MedlinePlus drug page exists for an accession number.

    accession_number -- MedlinePlus drug accession number, e.g. "a692051",
                        which corresponds to the page
                        "http://www.nlm.nih.gov/medlineplus/druginfo/meds/a692051.html"

    Returns True only when the page answers with HTTP status 200.
    Returns False when the accession number is empty, when the server
    answers with any other status, or when the server cannot be reached,
    so that an unverified link is never written into the drugbox.
    """
    if not accession_number:
        return False

    link = "http://www.nlm.nih.gov/medlineplus/druginfo/meds/" + accession_number + ".html"

    try:
        response = urllib.urlopen(link)
    except IOError:
        # connection problem: the link could not be verified
        return False

    try:
        # 200: ('OK', 'Request fulfilled, document follows'); 404: page not found
        return response.getcode() == 200
    finally:
        response.close()

def _drugs_com_stems(INN, tradename, drugbank_drugs_com):
    # Build the ordered, duplicate-free list of candidate Drugs.com page names
    # from the INN, the tradenames and the Drugs.com link stored in DrugBank.
    drugnames = [INN]
    if tradename:
        drugnames.extend(tradename.split(","))

    # The name extracted from the DrugBank link has to be collected before the
    # stems are generated, otherwise it is never tried.  Only the last path
    # component is kept so that it can be combined with every root.
    prefix = "http://www.drugs.com/"
    if drugbank_drugs_com and drugbank_drugs_com.find(prefix) > -1:
        path = drugbank_drugs_com.replace(prefix, "").replace(".html", "")
        drugnames.append(path.strip("/").rsplit("/", 1)[-1])

    stems = []
    for drugname in drugnames:
        drugname = (drugname or "").strip().lower()
        if not drugname:
            continue                      # an empty name would only produce bogus links
        if " " in drugname:
            candidates = [drugname.replace(" ", "_"), drugname.replace(" ", "-")]
        else:
            candidates = [drugname]
        for candidate in candidates:
            if candidate not in stems:    # avoid requesting the same page twice
                stems.append(candidate)
    return stems


def test_Drugs_com(INN, tradename, drugbank_drugs_com):
    """Search the Drugs.com web site for a page about a drug.

    INN                -- name of the drug (the caller passes the article title)
    tradename          -- comma separated list of tradenames (may be "" or None)
    drugbank_drugs_com -- Drugs.com URL recorded in DrugBank (may be "" or None)

    Candidate links are built from every root (the monograph, CDI, CONS, MTM
    and "parent" sections of the Drugs.com web site) combined with every stem
    (the INN, the tradenames, and the name extracted from the DrugBank link).
    Names containing spaces are tried with "_" and with "-" as replacement.

    Returns the template call "{{drugs.com|<root>|<stem>}}" for the first
    candidate that answers with HTTP status 200, e.g.
    "{{drugs.com|monograph|lisinopril}}" for
    "http://www.drugs.com/monograph/lisinopril.html".
    Returns False when no candidate could be verified.
    """
    roots = [("monograph", "http://www.drugs.com/monograph/"),
             ("CDI",       "http://www.drugs.com/cdi/"),
             ("CONS",      "http://www.drugs.com/cons/"),
             ("MTM",       "http://www.drugs.com/mtm/"),
             ("parent",    "http://www.drugs.com/")]

    stems = _drugs_com_stems(INN, tradename, drugbank_drugs_com)

    for root_name, root_url in roots:
        for stem in stems:
            link = root_url + stem + ".html"
            try:
                response = urllib.urlopen(link)
            except (IOError, UnicodeError):
                # unreachable server or a name that cannot be put into a URL:
                # this candidate cannot be verified, try the next one
                continue
            try:
                status = response.getcode()
            finally:
                response.close()
            if status == 200:   # link is good, use it
                return "{{drugs.com|" + root_name + "|" + stem + "}}"

    # nothing verified: never hand back an untested raw URL
    return False

def run():

    drugbank_data = drugbank()

#   list of articles to work on is generated by: "python pagegenerators.py -namespace:0 -transcludes:Drugbox > drugbox_titles.txt"
#   articles = []
#   articles = codecs.open('/Users/BogHog/progs/pywikipedia/drugbox/drugbox_titles.txt', mode = 'r', encoding='utf-8')

    articles = ["Template:Drugbox/Lisinopril"]

    for article in articles:

#       article = article.rstrip('\n')

        new_drugbox = ""

        log_string = "* [[" + article + "]]" 
        print log_string,

        site = wikipedia.getSite()
        page = wikipedia.Page(site, article)
        text = page.get(get_redirect = True)

        if not Allowbots(text):
            break

        
        begin, end = find_drugbox_from_text(text)
        if begin:
            parameters = text[begin:end]
            log_string = ", article: " + article
            print log_string
        else:
            log_string = ", article: " + article + "drugbox not found!"
            print log_string
            break

#        print text[begin:end]

        current_parameters = {}
    
# first extract and assign nested templates commonly used in drugbox templates
        parameters = assign_nested_templates(parameters, current_parameters)

# next, parse each line for parameters
        lines = parameters.splitlines()
        for line in lines:
             parse_line(line, current_parameters)

        INN = article
        if INN in drugbank_data:
            db_data = drugbank_data[INN]
        else:
            db_data = []

# while we are at it, populate KEGG_Drug_ID, ChemSpider_ID, and PubChem_Compound_ID fields if missing

        if db_data:

            if db_data[4] and not current_parameters.has_key("KEGG"):
                current_parameters['KEGG'] = db_data[4]

            if db_data[6] and not current_parameters.has_key("ChemSpiderID"):
                current_parameters['ChemSpiderID'] = db_data[6]

            if db_data[7] and not current_parameters.has_key("PubChem"):
                current_parameters['PubChem'] = db_data[7]

# augument current tradename list with the ones supplied by the Merck Manual

        if db_data:
            if db_data[0]:
                merck_tradename = db_data[0]
        else:
            merck_tradename = ""
        if current_parameters.has_key('tradename'):
            current_tradename = current_parameters['tradename']
        else:
            current_tradename = ""
        
        new_tradename = merged_tradenames(merck_tradename, current_tradename)
        if new_tradename: current_parameters['tradename'] = new_tradename
        
# add MedlinePlus parameter
        if db_data:
            if db_data[2]:
                if test_MedlinePlus(db_data[2]):
                    current_parameters['MedlinePlus'] = db_data[2]
                    
# add Drugs.com link
        result = test_Drugs_com(INN, current_parameters['tradename'], db_data[3])
        if result: current_parameters['Drugs.com'] = result

        new_text = text[:begin-1] + build_new_drugbox(current_parameters) + text[end:]
        
        print new_text
        
        if current_parameters:
#           page.put(new_text, comment='populated clinical fields in drugbox per [[Wikipedia:Bots/Requests_for_approval/BogBot_2|bot approval]]', watchArticle = None, minorEdit = True)
            print ", page updated"
        else:
            print ", page not updated"

        wikipedia.stopme()
        
# start the bot only when this file is executed as a script, so that
# importing it as a module does not trigger a bot run
if __name__ == "__main__":
    run()