Content deleted Content added
+ remove_embedded_carriage_returns inside nested templates and citation markers |
updated code |
||
Line 156:
# Matches one "| name = value" parameter; PARAM is the name, VALUE the rest.
regexp_param = re.compile(r"^\s*?\|\s*?(?P<PARAM>\S+)\s*?=\s*?(?P<VALUE>.+)\s*?($|\|)")
# Matches "{{...}}"; the greedy PARAMS group runs from the first "{{" to the last "}}".
regexp_nested_template = re.compile(r"\{\{(?P<PARAMS>.+)\}\}")
# Single and double bracket delimiters. Each "open" pattern matches the
# opening delimiter and each "close" pattern the closing one, so counting
# matches of a pair and comparing the counts detects a missing delimiter.
regexp_open_square_bracket = re.compile(r"\[", re.DOTALL)
regexp_close_square_bracket = re.compile(r"\]", re.DOTALL)
regexp_open_curly_bracket = re.compile(r"\{", re.DOTALL)
regexp_close_curly_bracket = re.compile(r"\}", re.DOTALL)
regexp_double_open_square_bracket = re.compile(r"\[\[", re.DOTALL)
regexp_double_close_square_bracket = re.compile(r"\]\]", re.DOTALL)
regexp_double_open_curly_bracket = re.compile(r"\{\{", re.DOTALL)
regexp_double_close_curly_bracket = re.compile(r"\}\}", re.DOTALL)
# ATC_supplemental = {{ATC|B01|AC06}}, {{ATC|N02|BA01}}
Line 179 ⟶ 189:
# named ref tag = <ref name="fattinger2000"> but not <ref name="fattinger2000" />
regexp_ref_tag_begin = re.compile(r"
regexp_ref_tag_end = re.compile(r"</ref>")
# Matches "{{cite ...}}" / "{{Cite ...}}"; TEMPLATE captures everything between
# the word "cite" and the first following "}}" (including leading whitespace).
regexp_citation_template = re.compile(r"\{\{[Cc]ite\s*?(?P<TEMPLATE>.*?)\}\}")
def Allowbots(text):
Line 225 ⟶ 236:
drugbank_data = {}
# 0 1 2 3 4 5 6 7 8
# Name Trade_Names Drug_Type MedlinePlus Drugs.com_link KEGG_Drug_ID KEGG_Compound_ID ChemSpider_ID PubChem_Compound_ID DrugBank_ID
drug_data = csv.reader(open('/Users/
for row in drug_data:
Line 369 ⟶ 380:
break
return disjoint
def pad_parameters(text):
    """Normalise the spacing around "|" and "=" inside citation templates.

    Every match of ``regexp_citation_template`` in *text* has the part
    captured by its TEMPLATE group rewritten so that it starts with a single
    space, each "|" is rendered as " | " and each "=" as " = ".  Text outside
    the captured template bodies is returned unchanged.

    The rewrite is applied in place at each match position rather than with
    ``text.replace``: a global replace would also alter identical substrings
    outside the template, and an empty template body would make
    ``text.replace("", ...)`` insert a space between every character.
    """
    def _pad(found):
        body = found.group('TEMPLATE')
        if not body:
            # "{{cite}}" has nothing to pad; leave the template untouched.
            return found.group(0)
        piped = " | ".join(piece.strip() for piece in body.split("|"))
        padded = " " + " = ".join(piece.strip() for piece in piped.split("="))
        # Splice the padded body back between the untouched prefix
        # ("{{cite") and suffix ("}}") of this particular match.
        whole = found.group(0)
        offset = found.start()
        head = whole[:found.start('TEMPLATE') - offset]
        tail = whole[found.end('TEMPLATE') - offset:]
        return head + padded + tail

    return regexp_citation_template.sub(_pad, text)
def parse_line(line, current_parameters):
Line 407 ⟶ 439:
parameter = str(parts[0].encode("utf-8")).strip()
value = str(parts[1].encode("utf-8")).strip()
value = pad_parameters(value)
# print "parameter, value: ", parameter, " ", value
current_parameters[parameter] = value
Line 417 ⟶ 450:
parameter = (result_drug_param.group('PARAM').encode("utf-8")).strip()
value = (result_drug_param.group('VALUE').encode("utf-8")).strip()
value = pad_parameters(value)
current_parameters[parameter] = value
Line 435 ⟶ 469:
forever = True
while forever:
if regex_test_disjoint(regexp_ref_tag_begin, regexp_ref_tag_end, lines):
Line 625 ⟶ 658:
if current_parameters.has_key("density"): new_drugbox += "| density = " + current_parameters['density'] + "\n"
if current_parameters.has_key("melting_point"): new_drugbox += "| melting_point = " + current_parameters['melting_point'] + "\n"
if current_parameters.has_key("melting_high"): new_drugbox += "| melting_high = " + current_parameters['melting_high'] + "\n"
if current_parameters.has_key("melting_notes"): new_drugbox += "| melting_notes = " + current_parameters['melting_notes'] + "\n"
if current_parameters.has_key("boiling_point"): new_drugbox += "| boiling_point = " + current_parameters['boiling_point'] + "\n"
if current_parameters.has_key("boiling_notes"): new_drugbox += "| boiling_notes = " + current_parameters['boiling_notes'] + "\n"
Line 631 ⟶ 666:
if current_parameters.has_key("sec_combustion"): new_drugbox += "| sec_combustion = " + current_parameters['sec_combustion'] + "\n"
# print new_drugbox
return new_drugbox
Line 741 ⟶ 776:
return link
def unbalanced(text):
    """Return True when *text* has unmatched square or curly brackets.

    For each delimiter pair ("[" / "]", "{" / "}", "[[" / "]]", "{{" / "}}")
    the number of non-overlapping occurrences of the opening delimiter is
    compared with that of the closing delimiter; any mismatch makes the text
    unbalanced.  This is a count-based check only: it does not verify nesting
    or ordering, so "][" is reported as balanced.

    The delimiters are counted directly so that every pair really compares an
    opening delimiter with its closing counterpart.
    """
    bracket_pairs = (
        ("[", "]"),
        ("{", "}"),
        ("[[", "]]"),
        ("{{", "}}"),
    )
    return any(text.count(opening) != text.count(closing)
               for opening, closing in bracket_pairs)
def savepage(page, text, summary = '', minor = False):
    """Save text to a page and log exceptions.

    If *summary* is non-empty it is passed to ``wikipedia.setAction`` before
    saving.  *text* is written with ``page.put``; *minor* is forwarded as the
    ``minorEdit`` flag.

    Returns an empty string when the save succeeds, otherwise a one-line
    "# <page link>: <reason>" entry describing why the save failed.
    """
    if summary != '':
        wikipedia.setAction(summary)
    try:
        page.put(text, minorEdit = minor)
        wikipedia.output(' \03{green}saving %s' % (page.title()))
        return ''
    except wikipedia.LockedPage:
        wikipedia.output(' \03{red}cannot save %s because it is locked\03{default}' % page.title())
        return '# %s: page was locked\n' % page.aslink()
    except wikipedia.EditConflict:
        wikipedia.output(' \03{red}cannot save %s because of edit conflict\03{default}' % page.title())
        return '# %s: edit conflict occurred\n' % page.aslink()
    except wikipedia.SpamfilterError as error:
        wikipedia.output(' \03{red}cannot save %s because of spam blacklist entry %s\03{default}' % (page.title(), error.url))
        return '# %s: spam blacklist entry\n' % page.aslink()
    except Exception:
        # Catch Exception rather than everything, so that KeyboardInterrupt
        # and SystemExit still stop the bot instead of being logged as an
        # "unknown error" and skipped.
        wikipedia.output(' \03{red}unknown error on saving %s\03{default}' % page.title())
        return '# %s: unknown error occurred\n' % page.aslink()
def run():
drugbank_data = drugbank()
DrugBank_ID_INN = {}
for k, v in drugbank_data.iteritems():
DrugBank_ID_INN[v[8]]= k
# list of articles to work on is generated by: "python pagegenerators.py -namespace:0 -transcludes:Drugbox > drugbox_titles.txt"
# articles = ['
for article in articles:
encoding = 'utf-8'
Line 771 ⟶ 854:
if not Allowbots(text):
begin, end = find_drugbox_from_text(text)
Line 781 ⟶ 863:
else:
log_string = ", article: " + article + "drugbox not found!"
print log_string
# make sure that there are no unmatched square or curly brackets
# if found, abort, since these may indicate an error in the wiki markup
# and may trigger an infinite loop elsewhere in this script
if unbalanced(parameters):
log_string = ", article: " + article + ", unmatched brackets found, article skipped!"
print log_string
continue
# print text[begin:end]
Line 799 ⟶ 890:
parse_line(line, current_parameters)
# INN = "Acetylsalicylic acid"
if INN in drugbank_data:
db_data = drugbank_data[INN]
elif "DrugBank" in current_parameters and current_parameters['DrugBank'] in DrugBank_ID_INN:
print "INN reset from ", INN,
INN = DrugBank_ID_INN[current_parameters['DrugBank']]
print "to ", INN, ", ",
db_data = drugbank_data[INN]
if not "drug_name" in current_parameters:
current_parameters['drug_name'] = INN
else:
db_data = []
if "DrugBank" in current_parameters and current_parameters['DrugBank'] in DrugBank_ID_INN:
if DrugBank_ID_INN[current_parameters['DrugBank']] == INN:
print "DrugBankID/INN OK!, ",
else:
print "DrugBankID/INN NOT OK!, ",
else:
if db_data:
if db_data[8]:
current_parameters['DrugBank'] = db_data[8]
# while we are at it, populate KEGG_Drug_ID, ChemSpider_ID, and PubChem_Compound_ID fields if missing
Line 868 ⟶ 974:
if current_parameters:
status
print ", ", status
else:
print ", page not updated"
Line 876 ⟶ 983:
run()
</source>
|