Content deleted Content added
additional bug corrections |
m →top: Replaced deprecated <source> tags with <syntaxhighlight> |
||
(5 intermediate revisions by one other user not shown) | |||
Line 1:
<
#!/usr/bin/python
# -*- coding: UTF-8 -*-
Line 150:
# Pattern for a whole {{Drugbox ...}} template, assembled from its pieces.
# The final `.*` is greedy and compiled with DOTALL, so the match spans
# newlines and runs up to the last `}}` found in the searched text.
exp = ''.join([
    r'\{\{',        # the opening brackets for the infobox
    r'\s*',         # any amount of whitespace
    r'[Dd]rugbox',  # the template name, "Drugbox" or "drugbox"
    r'.*\}\}',      # any amount of anything, followed by closing brackets
])
regexp_drug_infobox = re.compile(exp, flags=re.DOTALL)

# A "| name = value" entry; PARAM is the name, VALUE is everything after "=".
regexp_param = re.compile(
    r"^\s*?\|\s*?(?P<PARAM>\S+)\s*?=\s*?(?P<VALUE>.+)\s*?($|\|)"
)

# The text between "{{" and the last "}}" on the same line, captured as PARAMS.
regexp_nested_template = re.compile(r"\{\{(?P<PARAMS>.+)\}\}")
# Bracket patterns used to count opening and closing delimiters in wiki
# markup.  Each name now matches the character(s) it describes: previously
# the curly "open"/"close" patterns were swapped and the double closing
# square bracket pattern was a copy of the opening one, so "[[" was being
# compared against itself.
regexp_open_square_bracket = re.compile(r"\[", re.DOTALL)
regexp_close_square_bracket = re.compile(r"\]", re.DOTALL)
regexp_open_curly_bracket = re.compile(r"\{", re.DOTALL)
regexp_close_curly_bracket = re.compile(r"\}", re.DOTALL)
regexp_double_open_square_bracket = re.compile(r"\[\[", re.DOTALL)
regexp_double_close_square_bracket = re.compile(r"\]\]", re.DOTALL)
regexp_double_open_curly_bracket = re.compile(r"\{\{", re.DOTALL)
regexp_double_close_curly_bracket = re.compile(r"\}\}", re.DOTALL)
# ATC_supplemental = {{ATC|B01|AC06}}, {{ATC|N02|BA01}}
Line 177 ⟶ 187:
# UNII_Ref = {{fdacite|changed|FDA}}
regexp_UNII_Ref = re.compile(r"\|\s*?UNII_Ref\s*?=\s*?(?P<TEMPLATE>\{\{(Fdacite|fdacite).*?\}\})")
# named ref tag = <ref name="fattinger2000"> but not <ref name="fattinger2000" />
regexp_ref_tag_begin = re.compile(r"<ref>|<ref name.*?[^/]>")
regexp_ref_tag_end = re.compile(r"</ref>")
# {{cite ...}} / {{Cite ...}}; TEMPLATE captures everything after "cite".
# The character class is [Cc]: inside [...] a "|" is a literal pipe, not an
# alternation, so the former [C|c] also accepted "{{|ite".
regexp_citation_template = re.compile(r"\{\{[Cc]ite\s*?(?P<TEMPLATE>.*?)\}\}")
# href='/monograph/maprotiline-hydrochloride.html'
# (raw string so that "\." is a regex escape rather than a string escape)
regexp_monograph_url = re.compile(r"href='/monograph/(?P<STEM>.*?)\.html'", re.DOTALL)
# http://www.nlm.nih.gov/medlineplus/druginfo/meds/a604021.html
# (dots in the host name are escaped so they only match a literal ".")
regexp_medlineplus_url = re.compile(r"www\.nlm\.nih\.gov/medlineplus/druginfo/meds/(?P<ACNO>.*?)\.html", re.DOTALL)
def Allowbots(text):
Line 198 ⟶ 219:
if search_result:
result_text = search_result.group(0) # returns the entire matching sequence
begin, end
else:
return None
Line 213 ⟶ 234:
break
offset = result_text.find('|')
___location = (begin+offset, begin+last_ind-1, begin, begin+last_ind+1)
return ___location
Line 221 ⟶ 242:
drugbank_data = {}
# 0 1 2 3 4 5 6 7 8
# Name Trade_Names Drug_Type MedlinePlus Drugs.com_link KEGG_Drug_ID KEGG_Compound_ID ChemSpider_ID PubChem_Compound_ID DrugBank_ID
drug_data = csv.reader(open('/Users/BogBot/progs/pywikipedia/drugbox/drug_links_agumented.csv', 'rU'), dialect='excel')
Line 299 ⟶ 320:
return parameters
def rejoin(begin, end, sub_strings, type):
    """Re-attach fragments that were split inside a begin/end delimited span.

    Makes sure we split only on the pipes (or newlines) that separate
    infobox entries, not on those used inside links, nested templates,
    citations, etc.
    Adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/

    A fragment is treated as the continuation of the previous one when it
    contains ``end`` and either has no ``begin`` or its first ``end`` comes
    before its first ``begin``.

    begin, end  -- literal delimiter strings, e.g. '[[' and ']]'
    sub_strings -- list of fragments produced by an earlier split
    type        -- "parameter" re-inserts the '|' the split removed; any
                   other value (callers pass "line") concatenates directly

    Returns a new list; an empty input yields an empty list.
    """
    if not sub_strings:
        return []
    # Each continuation is appended exactly once; previously "parameter"
    # mode appended the fragment both bare and again after a '|'.
    separator = '|' if type == "parameter" else ''
    new_list = [sub_strings[0]]
    for sub_string in sub_strings[1:]:
        end_pos = sub_string.find(end)
        begin_pos = sub_string.find(begin)
        closes_previous = end_pos != -1 and (begin_pos == -1 or end_pos < begin_pos)
        if closes_previous:
            new_list[-1] = new_list[-1] + separator + sub_string
        else:
            new_list.append(sub_string)
    return new_list
def test_disjoint(begin,end,sub_strings):
disjoint = False
Line 319 ⟶ 344:
break
return disjoint
def regex_rejoin(regexp_begin, regexp_end, sub_strings, type):
    """Regex-driven variant of rejoin(): merge fragments split inside a span.

    Makes sure we split only on the pipes (or newlines) that separate
    infobox entries, not on those used inside links, nested templates,
    citations, etc.
    Adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/

    A fragment is merged into the previous one when it contains matches of
    ``regexp_end`` but none of ``regexp_begin``, or when it contains fewer
    end matches than begin matches.

    regexp_begin, regexp_end -- compiled patterns for the delimiters
    sub_strings              -- list of fragments produced by an earlier split
    type                     -- "parameter" re-inserts the '|' the split
                                removed; any other value (callers pass
                                "line") concatenates directly, so a fragment
                                is never silently dropped

    Returns a new list; an empty input yields an empty list.
    """
    if not sub_strings:
        return []
    separator = '|' if type == "parameter" else ''
    new_list = [sub_strings[0]]
    for sub_string in sub_strings[1:]:
        # One findall per pattern gives both "is present" and "how many".
        n_begin = len(regexp_begin.findall(sub_string))
        n_end = len(regexp_end.findall(sub_string))
        if (n_end and not n_begin) or n_end < n_begin:
            new_list[-1] = new_list[-1] + separator + sub_string
        else:
            new_list.append(sub_string)
    return new_list
def regex_test_disjoint(regexp_begin, regexp_end, sub_strings):
    """Report whether any fragment holds an unbalanced begin/end span.

    A fragment counts as disjoint when it contains matches of ``regexp_end``
    but none of ``regexp_begin``, or when it contains fewer end matches than
    begin matches.  This is the same per-fragment condition regex_rejoin()
    uses to decide what to merge.

    The counts are computed afresh for every fragment.  Previously they were
    initialised once before the loop, so values left over from an earlier
    fragment could hide an unbalanced later one.

    regexp_begin, regexp_end -- compiled patterns for the delimiters
    sub_strings              -- iterable of text fragments

    Returns True as soon as one disjoint fragment is found, else False.
    """
    for sub_string in sub_strings:
        n_begin = len(regexp_begin.findall(sub_string))
        n_end = len(regexp_end.findall(sub_string))
        if (n_end and not n_begin) or n_end < n_begin:
            return True
    return False
def pad_parameters(text):
    """Normalise spacing around '|' and '=' inside citation templates.

    For every body captured by regexp_citation_template, the fields are
    separated by " | " and each "name=value" field is rewritten as
    "name = value".  The padded body then replaces the original body in
    ``text`` (via str.replace, so identical bodies are all replaced).

    Only the first '=' of a field is padded, so values that themselves
    contain '=' (for example URLs with a query string) are left intact.

    Returns the updated text.
    """
    for match in regexp_citation_template.findall(text):
        if not match:
            # An empty body (e.g. "{{cite}}") must be skipped: replacing ""
            # would insert the padding between every character of the text.
            continue
        fields = match.split("|")
        padded = [" " + fields[0].strip()]
        for field in fields[1:]:
            name, equals, value = field.partition("=")
            if equals:
                # rstrip keeps "name =" tidy when the value is empty
                padded.append((name.strip() + " = " + value.strip()).rstrip())
            else:
                padded.append(field.strip())
        text = text.replace(match, " | ".join(padded))
    return text
def parse_line(line, current_parameters):
Line 329 ⟶ 417:
# make sure we split only on the pipes that represent ends of the infobox entry, not the pipes used in links
sub_strings = rejoin('[[',']]',sub_strings, 'parameter')
# do the same for nested templates
Line 336 ⟶ 424:
if test_disjoint('{{','}}',sub_strings):
forever = True
sub_strings = rejoin('{{','}}',sub_strings, 'parameter')
else:
forever = False
# do the same for citations:
forever = True
while forever:
if
forever = True
sub_strings =
else:
forever = False
Line 357 ⟶ 445:
parameter = str(parts[0].encode("utf-8")).strip()
value = str(parts[1].encode("utf-8")).strip()
value = pad_parameters(value)
# print "parameter, value: ", parameter, " ", value
if not value:
value = " "
current_parameters[parameter] = value
else:
Line 367 ⟶ 458:
parameter = (result_drug_param.group('PARAM').encode("utf-8")).strip()
value = (result_drug_param.group('VALUE').encode("utf-8")).strip()
value = pad_parameters(value)
if not value:
value = " "
current_parameters[parameter] = value
return
def remove_embedded_carriage_returns(parameters):
    """Join lines that were broken inside a template or a <ref> element.

    The text is split into lines; lines are then merged until no line holds
    an unbalanced '{{' ... '}}' pair, and afterwards until no line holds an
    unbalanced <ref> ... </ref> pair.  The surviving lines are joined with
    newlines and returned.

    Each loop stops as soon as a pass merges nothing: the input of the next
    pass would be identical, so continuing could never terminate.
    """
    lines = parameters.splitlines()

    # remove embedded carriage returns from templates:
    while test_disjoint('{{', '}}', lines):
        merged = rejoin('{{', '}}', lines, 'line')
        if len(merged) == len(lines):
            break  # no progress; markup cannot be repaired by merging
        lines = merged

    # do the same for <ref> ... </ref> elements:
    while regex_test_disjoint(regexp_ref_tag_begin, regexp_ref_tag_end, lines):
        # NOTE(review): the end pattern is passed as regex_rejoin's "begin"
        # argument and vice versa, unlike the test above.  The order is kept
        # as found -- confirm whether it is intentional.
        merged = regex_rejoin(regexp_ref_tag_end, regexp_ref_tag_begin, lines, 'line')
        if len(merged) == len(lines):
            break  # no progress; avoid looping forever
        lines = merged

    return "\n".join(lines)
def build_new_drugbox(current_parameters):
# build new drugbox template
Line 400 ⟶ 519:
current_parameters['type'] = "vaccine"
# if not previously assigned, add the following "empty" parameters
if not current_parameters.has_key("
current_parameters["tradename"] = " "
new_drugbox = unicode( "{{Drugbox\n", "utf-8" )
if current_parameters.has_key("Verifiedfields"): new_drugbox += "| Verifiedfields = " + current_parameters['Verifiedfields'] + "\n"
if current_parameters.has_key("Watchedfields"): new_drugbox += "| Watchedfields = " + current_parameters['Watchedfields'] + "\n"
if current_parameters.has_key("verifiedrevid"): new_drugbox += "| verifiedrevid = " + current_parameters['verifiedrevid'] + "\n"
if current_parameters.has_key("IUPAC_name"): new_drugbox += "| IUPAC_name = " + current_parameters['IUPAC_name'] + "\n"
Line 479 ⟶ 602:
"KEGG" in current_parameters or "ChEBI" in current_parameters or "ChEMBL" in current_parameters):
new_drugbox += "\n<!--Identifiers-->\n"
if current_parameters.has_key("
if current_parameters.has_key("CASNo_Ref"): new_drugbox += "| CASNo_Ref = " + current_parameters['CASNo_Ref'] + "\n"
if current_parameters.has_key("CAS_number"): new_drugbox += "| CAS_number = " + current_parameters['CAS_number'] + "\n"
if current_parameters.has_key("CAS_supplemental"): new_drugbox += "| CAS_supplemental = " + current_parameters['CAS_supplemental'] + "\n"
Line 489 ⟶ 613:
if current_parameters.has_key("PubChemSubstance"): new_drugbox += "| PubChemSubstance = " + current_parameters['PubChemSubstance'] + "\n"
if current_parameters.has_key("IUPHAR_ligand"): new_drugbox += "| IUPHAR_ligand = " + current_parameters['IUPHAR_ligand'] + "\n"
if current_parameters.has_key("DrugBank_Ref"): new_drugbox += "| DrugBank_Ref = " + current_parameters['DrugBank_Ref'] + "\n"
if current_parameters.has_key("DrugBank"): new_drugbox += "| DrugBank = " + current_parameters['DrugBank'] + "\n"
if current_parameters.has_key("ChemSpiderID_Ref"): new_drugbox += "| ChemSpiderID_Ref = " + current_parameters['ChemSpiderID_Ref'] + "\n"
Line 508 ⟶ 633:
"Sb" in current_parameters or "Se" in current_parameters or "Sr" in current_parameters or "Tc" in current_parameters or "charge" in current_parameters):
if current_parameters.has_key("chemical_formula"): new_drugbox += "| chemical_formula = " + current_parameters['chemical_formula'] + "\n"
# new_drugbox += " "
if current_parameters.has_key("C"): new_drugbox += "| C=" + current_parameters['C'] + " "
if current_parameters.has_key("H"): new_drugbox += "| H=" + current_parameters['H'] + " "
Line 541 ⟶ 667:
if current_parameters.has_key("InChI_Ref"): new_drugbox += "| InChI_Ref = " + current_parameters['InChI_Ref'] + "\n"
if current_parameters.has_key("InChI"): new_drugbox += "| InChI = " + current_parameters['InChI'] + "\n"
if current_parameters.has_key("InChIKey"): new_drugbox += "| InChIKey = " + current_parameters['InChIKey'] + "\n"
if current_parameters.has_key("StdInChI_Ref"): new_drugbox += "| StdInChI_Ref = " + current_parameters['StdInChI_Ref'] + "\n"
if current_parameters.has_key("StdInChI"): new_drugbox += "| StdInChI = " + current_parameters['StdInChI'] + "\n"
Line 549 ⟶ 676:
if current_parameters.has_key("density"): new_drugbox += "| density = " + current_parameters['density'] + "\n"
if current_parameters.has_key("melting_point"): new_drugbox += "| melting_point = " + current_parameters['melting_point'] + "\n"
if current_parameters.has_key("melting_high"): new_drugbox += "| melting_high = " + current_parameters['melting_high'] + "\n"
if current_parameters.has_key("melting_notes"): new_drugbox += "| melting_notes = " + current_parameters['melting_notes'] + "\n"
if current_parameters.has_key("boiling_point"): new_drugbox += "| boiling_point = " + current_parameters['boiling_point'] + "\n"
if current_parameters.has_key("boiling_notes"): new_drugbox += "| boiling_notes = " + current_parameters['boiling_notes'] + "\n"
Line 555 ⟶ 684:
if current_parameters.has_key("sec_combustion"): new_drugbox += "| sec_combustion = " + current_parameters['sec_combustion'] + "\n"
# print new_drugbox
return new_drugbox
Line 624 ⟶ 755:
# drugs.com root links:
roots = [("monograph","http://www.drugs.com/monograph/"), ("CDI","http://www.drugs.com/cdi/"), ("CONS","http://www.drugs.com/cons/"), ("MTM","http://www.drugs.com/mtm/"), ("pro","http://www.drugs.com/pro/"), ("international","http://www.drugs.com/international/"), ("parent","http://www.drugs.com/")]
stems = []
Line 637 ⟶ 768:
drugname = string.lower(drugname)
# print "drugnames: ", drugnames
if (drugname != " " and string.find(drugname, " ") > -1):
stems.append(string.replace(drugname, " ", "_"))
stems.append(string.replace(drugname, " ", "-"))
stems.append(drugname)
# also try common salts
stems.append(drugname + "-hydrochloride")
stems.append(drugname + "-sulfate")
# stems.append(drugname + "-chloride")
# stems.append(drugname + "-sodium")
# stems.append(drugname + "-bromide")
# stems.append(drugname + "-maleate")
# stems.append(drugname + "-citrate")
if drugbank_drugs_com:
if (string.find(drugbank_drugs_com, "http://www.drugs.com/") > -1):
Line 657 ⟶ 796:
if urllib.urlopen(link).getcode() == 200: # test link status to make sure it is good before assigning parameter
# print "passed link: ", link
raise StopIteration()
else:
opener = urllib.FancyURLopener({})
f = opener.open(link)
text = f.read()
result = regexp_monograph_url.search(text)
if result:
stem = result.group('STEM')
link = "{{drugs.com|" + roots[0][0] + "|" + stem + "}}"
raise StopIteration()
else:
link = "{{drugs.com|" + root[0] + "|" + stem + "}}"
# print "link: ", link
raise StopIteration()
else:
link = ""
Line 665 ⟶ 818:
return link
def unbalanced(text):
    """Return True when square or curly brackets in ``text`` do not pair up.

    Compares the number of opening and closing delimiters for the single
    brackets '[' / ']' and '{' / '}' and for the doubled forms '[[' / ']]'
    and '{{' / '}}'.  Only the counts are compared, not the nesting order.

    The counts are taken directly from the text with str.count, which counts
    non-overlapping occurrences just as a findall on the literal would, so
    the result does not depend on how the module-level bracket patterns are
    defined.
    """
    delimiter_pairs = (
        ("[", "]"),
        ("{", "}"),
        ("[[", "]]"),
        ("{{", "}}"),
    )
    return any(text.count(opening) != text.count(closing)
               for opening, closing in delimiter_pairs)
def savepage(page, text, summary = '', minor = False, log_string = ""):
    """Save text to a page and log exceptions.

    page       -- page object; put(), title() and aslink() are called on it
    text       -- new page text
    summary    -- edit summary; when non-empty it is set via wikipedia.setAction
    minor      -- passed to page.put() as minorEdit
    log_string -- prefix for the console message

    Returns '' on success, otherwise a one-line "# <link>: <reason>" string
    describing why the page was not saved.
    """
    if summary != '':
        wikipedia.setAction(summary)
    try:
        page.put(text, minorEdit = minor)
        wikipedia.output('%s \03{green}saving %s' % (log_string, page.title()) )
        return ''
    except wikipedia.LockedPage:
        wikipedia.output('%s \03{red}cannot save %s because it is locked\03{default}' % (log_string, page.title()) )
        return '# %s: page was locked\n' % page.aslink()
    except wikipedia.EditConflict:
        wikipedia.output('%s \03{red}cannot save %s because of edit conflict\03{default}' % (log_string, page.title()) )
        return '# %s: edit conflict occurred\n' % page.aslink()
    except wikipedia.SpamfilterError as error:
        wikipedia.output('%s \03{red}cannot save %s because of spam blacklist entry %s\03{default}' % ((log_string, page.title(), error.url)) )
        return '# %s: spam blacklist entry\n' % page.aslink()
    except Exception:
        # Catch-all for save failures.  Deliberately not a bare "except:",
        # so KeyboardInterrupt and SystemExit still stop the bot.
        wikipedia.output('%s \03{red}unknown error on saving %s\03{default}' % (log_string, page.title()) )
        return '# %s: unknown error occurred\n' % page.aslink()
def run():
drugbank_data = drugbank()
DrugBank_ID_INN = {}
for k, v in drugbank_data.iteritems():
DrugBank_ID_INN[v[8]]= k
# list of articles to work on is generated by: "python pagegenerators.py -namespace:0 -transcludes:Drugbox > drugbox_titles.txt"
# articles = ['
for article in articles:
encoding = 'utf-8'
Line 687 ⟶ 888:
new_drugbox = ""
log_string = "* [[" + article + "]], "
site = wikipedia.getSite()
Line 695 ⟶ 895:
if not Allowbots(text):
begin, end, begin2, end2 = find_drugbox_from_text(text)
if
parameters = text[begin:end]
else:
log_string =
print log_string
# make sure that there are no unmatched square or curly brackets
# if found, abort, since these may indicate an error in the wiki markup
# and may trigger an infinite loop elsewhere in this script
if unbalanced(parameters):
log_string = log_string + "unmatched brackets found, article skipped!"
print log_string
continue
# print text[begin:end]
Line 714 ⟶ 920:
# first extract and assign nested templates commonly used in drugbox templates
parameters = assign_nested_templates(parameters, current_parameters)
# remove any embedded carriage returns from remaining templates:
parameters = remove_embedded_carriage_returns(parameters)
# next, parse each line for parameters
Line 720 ⟶ 929:
parse_line(line, current_parameters)
# INN = "Acetylsalicylic acid"
if INN in drugbank_data:
db_data = drugbank_data[INN]
elif "DrugBank" in current_parameters and current_parameters['DrugBank'] in DrugBank_ID_INN:
log_string = str(log_string + "INN reset from " + INN,)
INN = DrugBank_ID_INN[current_parameters['DrugBank']]
log_string = log_string + "to " + INN + ", "
db_data = drugbank_data[INN]
if not "drug_name" in current_parameters:
current_parameters['drug_name'] = INN
else:
db_data = []
if "DrugBank" in current_parameters and current_parameters['DrugBank'] in DrugBank_ID_INN:
if DrugBank_ID_INN[current_parameters['DrugBank']] == INN:
log_string = log_string + "DrugBankID/INN OK!, "
else:
log_string = log_string + "DrugBankID/INN NOT OK!, "
else:
if db_data:
if db_data[8]:
if not "DrugBank" in current_parameters:
current_parameters['DrugBank'] = db_data[8]
# while we are at it, populate KEGG_Drug_ID, ChemSpider_ID, and PubChem_Compound_ID fields if missing
Line 735 ⟶ 960:
if db_data:
if db_data[4] and not
current_parameters['KEGG'] = db_data[4]
if db_data[6] and not
current_parameters['ChemSpiderID'] = db_data[6]
if db_data[7] and not
current_parameters['PubChem'] = db_data[7]
Line 753 ⟶ 978:
else:
merck_tradename = ""
if
current_tradename = current_parameters['tradename']
else:
Line 768 ⟶ 993:
# add Drugs.com link
if
tradename = current_parameters['tradename']
else:
Line 783 ⟶ 1,008:
if result: current_parameters['Drugs.com'] = result
# add MedlinePlus parameter
if db_data:
if db_data[2]:
if test_MedlinePlus(db_data[2]):
current_parameters['MedlinePlus'] = db_data[2]
if not 'MedlinePlus' in current_parameters:
opener = urllib.FancyURLopener({})
stem = string.replace(article, " ", "+")
link = "http://vsearch.nlm.nih.gov/vivisimo/cgi-bin/query-meta?&v:project=medlineplus&query=" + stem
# print "MedlinePlus link:", link
f = opener.open(link)
text2 = f.read()
result = regexp_medlineplus_url.search(text2)
if result:
current_parameters['MedlinePlus'] = result.group('ACNO')
new_text = text[:begin2] + build_new_drugbox(current_parameters) + text[end2:]
# print build_new_drugbox(current_parameters)
# print new_text
# print current_parameters
# print new_text
if current_parameters:
status
else:
print ", page not updated"
Line 797 ⟶ 1,045:
run()
</syntaxhighlight>
|