User:BogBot/Source code/Task 03: Difference between revisions

Browse history interactively

← Previous edit

Content deleted Content added

Visual | Wikitext

Revision as of 21:42, 30 July 2011 edit Boghog (talk \| contribs) Autopatrolled, Extended confirmed users, IP block exemptions, New page reviewers, Pending changes reviewers, Rollbackers, Template editors 142,840 edits additional bug corrections ← Previous edit		Latest revision as of 14:47, 8 May 2022 edit undo Qwerfjkl (bot) (talk \| contribs) Bots, Mass message senders 4,093,883 edits m →top: Replaced deprecated <source> tags with <syntaxhighlight> Tag: AWB
(5 intermediate revisions by one other user not shown)
Line 1: <~~source~~syntaxhighlight lang=python> #!/usr/bin/python # -- coding: UTF-8 -- Line 150: exp = r'\{\{' # the opening brackets for the infobox exp = exp + r'\s' # any amount of whitespace exp = exp + r'[Dd]rugbox' # the word "~~infobox~~drugbox", capitalized or not exp = exp + r'.\}\}' # any amount of anything, followed by the end of the ~~infobox~~drugbox regexp_drug_infobox = re.compile(exp, re.DOTALL) regexp_param = re.compile(r"^\s?\\|\s?(?P<PARAM>\S+)\s?=\s?(?P<VALUE>.+)\s?($\|\\|)") regexp_nested_template = re.compile(r"\{\{(?P<PARAMS>.+)\}\}") regexp_open_square_bracket = re.compile(r"\[", re.DOTALL) regexp_close_square_bracket = re.compile(r"\]", re.DOTALL) regexp_open_curly_bracket = re.compile(r"}", re.DOTALL) regexp_close_curly_bracket = re.compile(r"{", re.DOTALL) regexp_double_open_square_bracket = re.compile(r"\[\[", re.DOTALL) regexp_double_close_square_bracket = re.compile(r"\[\[", re.DOTALL) regexp_double_open_curly_bracket = re.compile(r"}}", re.DOTALL) regexp_double_close_curly_bracket = re.compile(r"{{", re.DOTALL) # ATC_supplemental = {{ATC\|B01\|AC06}}, {{ATC\|N02\|BA01}} Line 177 ⟶ 187: # UNII_Ref = {{fdacite\|changed\|FDA}} regexp_UNII_Ref = re.compile(r"\\|\s?UNII_Ref\s?=\s?(?P<TEMPLATE>\{\{(Fdacite\|fdacite).?\}\})") # named ref tag = <ref name="fattinger2000"> but not <ref name="fattinger2000" /> regexp_ref_tag_begin = re.compile(r"<ref>\|<ref name.?[^/]>") regexp_ref_tag_end = re.compile(r"</ref>") regexp_citation_template = re.compile(r"\{\{[C\|c]ite\s?(?P<TEMPLATE>.?)\}\}") # href='/monograph/maprotiline-hydrochloride.html' regexp_monograph_url = re.compile("href='/monograph/(?P<STEM>.?)\.html'", re.DOTALL) # http://www.nlm.nih.gov/medlineplus/druginfo/meds/a604021.html regexp_medlineplus_url = re.compile("www.nlm.nih.gov/medlineplus/druginfo/meds/(?P<ACNO>.?)\.html", re.DOTALL) def Allowbots(text): Line 198 ⟶ 219: if search_result: result_text = search_result.group(0) # returns the entire matching sequence begin, end = 
search_result.span() else: return None Line 213 ⟶ 234: break offset = result_text.find('\|') ___location = (begin+offset, begin+last_ind-1, begin, begin+last_ind+1) return ___location Line 221 ⟶ 242: drugbank_data = {} # 0 1 2 3 4 5 6 7 8 # Name Trade_Names Drug_Type MedlinePlus Drugs.com_link KEGG_Drug_ID KEGG_Compound_ID ChemSpider_ID PubChem_Compound_ID DrugBank_ID drug_data = csv.reader(open('/Users/BogBot/progs/pywikipedia/drugbox/drug_links_agumented.csv', 'rU'), dialect='excel') Line 299 ⟶ 320: return parameters ~~def rejoin(begin,end,sub_strings):~~ def rejoin(begin, end, sub_strings, type): # make sure we split only on the pipes that represent ends of the infobox entry, not the pipes used in links, nested templates, citations, etc. # adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/ new_list = [sub_strings[0]] for sub_string in sub_strings[1:]: if (end in sub_string) and ((not begin in sub_string) or sub_string.find(end) < sub_string.find(begin)): ~~new_list[-1]~~if type == ~~new_list[-1] + '\|' + sub_string~~"line": new_list[-1] = new_list[-1] + sub_string if type == "parameter": new_list[-1] = new_list[-1] + '\|' + sub_string else: new_list.append(sub_string) sub_strings = new_list return sub_strings def test_disjoint(begin,end,sub_strings): disjoint = False Line 319 ⟶ 344: break return disjoint def regex_rejoin(regexp_begin, regexp_end, sub_strings, type): # make sure we split only on the pipes that represent ends of the infobox entry, not the pipes used in links, nested templates, citations, etc. 
# adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/ new_list = [sub_strings[0]] for sub_string in sub_strings[1:]: begin = False; end = False; n_begin = 0; n_end = 0 if regexp_begin.search(sub_string): begin = True match = regexp_begin.findall(sub_string) n_begin = len(match) if regexp_end.search(sub_string): end = True match = regexp_end.findall(sub_string) n_end = len(match) if ((end and not begin) or n_end < n_begin): if type == "line": new_list[-1] = new_list[-1] + sub_string if type == "parameter": new_list[-1] = new_list[-1] + '\|' + sub_string else: new_list.append(sub_string) sub_strings = new_list return sub_strings def regex_test_disjoint(regexp_begin, regexp_end, sub_strings): disjoint = False begin = False; end = False; n_begin = 0; n_end = 0 for sub_string in sub_strings: if regexp_begin.search(sub_string): begin = True match = regexp_begin.findall(sub_string) n_begin = len(match) if regexp_end.search(sub_string): end = True match = regexp_end.findall(sub_string) n_end = len(match) if ((end and not begin) or n_end < n_begin): disjoint = True break return disjoint def pad_parameters(text): matches = regexp_citation_template.findall(text) for match in matches: sub_strings = match.split("\|") new_strings = " " + sub_strings[0].strip() for item in sub_strings[1:]: item = " \| " + item.strip() new_strings += item sub_strings = new_strings.split("=") new_strings = " " + sub_strings[0].strip() for item in sub_strings[1:]: item = " = " + item.strip() new_strings += item text = text.replace(match,new_strings) return text def parse_line(line, current_parameters): Line 329 ⟶ 417: # make sure we split only on the pipes that represent ends of the infobox entry, not the pipes used in links sub_strings = rejoin('[[',']]',sub_strings, 'parameter') # do the same for nested templates Line 336 ⟶ 424: if test_disjoint('{{','}}',sub_strings): forever = True sub_strings = rejoin('{{','}}',sub_strings, 'parameter') 
else: forever = False # do the same for citations: forever = True while forever: if ~~test_disjoint~~regex_test_disjoint(~~'<ref>'~~regexp_ref_tag_begin,~~'</ref>'~~ regexp_ref_tag_end, sub_strings): forever = True sub_strings = ~~rejoin~~regex_rejoin(~~'<ref>'~~regexp_ref_tag_begin, regexp_ref_tag_end, sub_strings, '~~</ref>~~parameter'~~,sub_strings~~) else: forever = False Line 357 ⟶ 445: parameter = str(parts[0].encode("utf-8")).strip() value = str(parts[1].encode("utf-8")).strip() value = pad_parameters(value) # print "parameter, value: ", parameter, " ", value if not value: value = " " current_parameters[parameter] = value else: Line 367 ⟶ 458: parameter = (result_drug_param.group('PARAM').encode("utf-8")).strip() value = (result_drug_param.group('VALUE').encode("utf-8")).strip() value = pad_parameters(value) if not value: value = " " current_parameters[parameter] = value return def remove_embedded_carriage_returns(parameters): # remove embedded carriage returns from templates: lines = parameters.splitlines() forever = True while forever: if test_disjoint('{{', '}}', lines): forever = True lines = rejoin('{{', '}}', lines, 'line') else: forever = False forever = True while forever: if regex_test_disjoint(regexp_ref_tag_begin, regexp_ref_tag_end, lines): forever = True lines = regex_rejoin(regexp_ref_tag_end, regexp_ref_tag_begin, lines, 'line') else: forever = False parameters = string.join(lines, "\n") return parameters def build_new_drugbox(current_parameters): # build new drugbox template Line 400 ⟶ 519: current_parameters['type'] = "vaccine" # if not previously assigned, add the following "empty" parameters ~~new_drugbox = unicode( "", "utf-8" )~~ if not current_parameters.has_key("~~Watchedfields~~tradename"): ~~new_drugbox += "\| Watchedfields = " + current_parameters['Watchedfields'] + "\n"~~ current_parameters["tradename"] = " " new_drugbox = unicode( "{{Drugbox\n", "utf-8" ) if current_parameters.has_key("Verifiedfields"): new_drugbox += "\| 
Verifiedfields = " + current_parameters['Verifiedfields'] + "\n" if current_parameters.has_key("Watchedfields"): new_drugbox += "\| Watchedfields = " + current_parameters['Watchedfields'] + "\n" if current_parameters.has_key("verifiedrevid"): new_drugbox += "\| verifiedrevid = " + current_parameters['verifiedrevid'] + "\n" if current_parameters.has_key("IUPAC_name"): new_drugbox += "\| IUPAC_name = " + current_parameters['IUPAC_name'] + "\n" Line 479 ⟶ 602: "KEGG" in current_parameters or "ChEBI" in current_parameters or "ChEMBL" in current_parameters): new_drugbox += "\n<!--Identifiers-->\n" if current_parameters.has_key("~~CASNo_Ref~~CAS_number_Ref"): new_drugbox += "\| ~~CASNo_Ref~~CAS_number_Ref = " + current_parameters['~~CASNo_Ref~~CAS_number_Ref'] + "\n" if current_parameters.has_key("CASNo_Ref"): new_drugbox += "\| CASNo_Ref = " + current_parameters['CASNo_Ref'] + "\n" if current_parameters.has_key("CAS_number"): new_drugbox += "\| CAS_number = " + current_parameters['CAS_number'] + "\n" if current_parameters.has_key("CAS_supplemental"): new_drugbox += "\| CAS_supplemental = " + current_parameters['CAS_supplemental'] + "\n" Line 489 ⟶ 613: if current_parameters.has_key("PubChemSubstance"): new_drugbox += "\| PubChemSubstance = " + current_parameters['PubChemSubstance'] + "\n" if current_parameters.has_key("IUPHAR_ligand"): new_drugbox += "\| IUPHAR_ligand = " + current_parameters['IUPHAR_ligand'] + "\n" if current_parameters.has_key("DrugBank_Ref"): new_drugbox += "\| DrugBank_Ref = " + current_parameters['DrugBank_Ref'] + "\n" if current_parameters.has_key("DrugBank"): new_drugbox += "\| DrugBank = " + current_parameters['DrugBank'] + "\n" if current_parameters.has_key("ChemSpiderID_Ref"): new_drugbox += "\| ChemSpiderID_Ref = " + current_parameters['ChemSpiderID_Ref'] + "\n" Line 508 ⟶ 633: "Sb" in current_parameters or "Se" in current_parameters or "Sr" in current_parameters or "Tc" in current_parameters or "charge" in current_parameters): if 
current_parameters.has_key("chemical_formula"): new_drugbox += "\| chemical_formula = " + current_parameters['chemical_formula'] + "\n" # new_drugbox += " " if current_parameters.has_key("C"): new_drugbox += "\| C=" + current_parameters['C'] + " " if current_parameters.has_key("H"): new_drugbox += "\| H=" + current_parameters['H'] + " " Line 541 ⟶ 667: if current_parameters.has_key("InChI_Ref"): new_drugbox += "\| InChI_Ref = " + current_parameters['InChI_Ref'] + "\n" if current_parameters.has_key("InChI"): new_drugbox += "\| InChI = " + current_parameters['InChI'] + "\n" if current_parameters.has_key("InChIKey"): new_drugbox += "\| InChIKey = " + current_parameters['InChIKey'] + "\n" if current_parameters.has_key("StdInChI_Ref"): new_drugbox += "\| StdInChI_Ref = " + current_parameters['StdInChI_Ref'] + "\n" if current_parameters.has_key("StdInChI"): new_drugbox += "\| StdInChI = " + current_parameters['StdInChI'] + "\n" Line 549 ⟶ 676: if current_parameters.has_key("density"): new_drugbox += "\| density = " + current_parameters['density'] + "\n" if current_parameters.has_key("melting_point"): new_drugbox += "\| melting_point = " + current_parameters['melting_point'] + "\n" if current_parameters.has_key("melting_high"): new_drugbox += "\| melting_high = " + current_parameters['melting_high'] + "\n" if current_parameters.has_key("melting_notes"): new_drugbox += "\| melting_notes = " + current_parameters['melting_notes'] + "\n" if current_parameters.has_key("boiling_point"): new_drugbox += "\| boiling_point = " + current_parameters['boiling_point'] + "\n" if current_parameters.has_key("boiling_notes"): new_drugbox += "\| boiling_notes = " + current_parameters['boiling_notes'] + "\n" Line 555 ⟶ 684: if current_parameters.has_key("sec_combustion"): new_drugbox += "\| sec_combustion = " + current_parameters['sec_combustion'] + "\n" ~~print~~ new_drugbox += "}}" # print new_drugbox return new_drugbox Line 624 ⟶ 755: # drugs.com root links: roots = 
[("monograph","http://www.drugs.com/monograph/"), ("CDI","http://www.drugs.com/cdi/"), ("CONS","http://www.drugs.com/cons/"), ("MTM","http://www.drugs.com/mtm/"), ("pro","http://www.drugs.com/pro/"), ("international","http://www.drugs.com/international/"), ("parent","http://www.drugs.com/")] stems = [] Line 637 ⟶ 768: drugname = string.lower(drugname) # print "drugnames: ", drugnames if (drugname != " " and string.find(drugname, " ") > -1): stems.append(string.replace(drugname, " ", "_")) stems.append(string.replace(drugname, " ", "-")) ~~else~~elif (drugname != " "): stems.append(drugname) # also try common salts stems.append(drugname + "-hydrochloride") stems.append(drugname + "-sulfate") # stems.append(drugname + "-chloride") # stems.append(drugname + "-sodium") # stems.append(drugname + "-bromide") # stems.append(drugname + "-maleate") # stems.append(drugname + "-citrate") if drugbank_drugs_com: if (string.find(drugbank_drugs_com, "http://www.drugs.com/") > -1): Line 657 ⟶ 796: if urllib.urlopen(link).getcode() == 200: # test link status to make sure it is good before assigning parameter # print "passed link: ", link ~~link = "{{drugs.com\|" +~~if root[0] ~~+ "\|" + stem +~~== "}}monograph": ~~raise~~ ~~StopIteration()~~ link = "{{drugs.com\|" + root[0] + "\|" + stem + "}}" raise StopIteration() else: opener = urllib.FancyURLopener({}) f = opener.open(link) text = f.read() result = regexp_monograph_url.search(text) if result: stem = result.group('STEM') link = "{{drugs.com\|" + roots[0][0] + "\|" + stem + "}}" raise StopIteration() else: link = "{{drugs.com\|" + root[0] + "\|" + stem + "}}" # print "link: ", link raise StopIteration() else: link = "" Line 665 ⟶ 818: return link def unbalanced(text): # test for unmatched square or curly brackets n_open_square_bracket = len(regexp_open_square_bracket.findall(text)) n_close_square_bracket = len(regexp_close_square_bracket.findall(text)) n_open_curly_bracket = len(regexp_open_curly_bracket.findall(text)) 
n_close_curly_bracket = len(regexp_close_curly_bracket.findall(text)) n_double_open_square_bracket = len(regexp_double_open_square_bracket.findall(text)) n_double_close_square_bracket = len(regexp_double_close_square_bracket.findall(text)) n_double_open_curly_bracket = len(regexp_double_open_curly_bracket.findall(text)) n_double_close_curly_bracket = len(regexp_double_close_curly_bracket.findall(text)) if (n_open_square_bracket != n_close_square_bracket or n_open_curly_bracket != n_close_curly_bracket or n_double_open_square_bracket != n_double_close_square_bracket or n_double_open_curly_bracket != n_double_close_curly_bracket): return True else: return False def savepage(page, text, summary = '', minor = False, log_string = ""): """Save text to a page and log exceptions.""" if summary != '': wikipedia.setAction(summary) try: page.put(text, minorEdit = minor) wikipedia.output('%s \03{green}saving %s' % (log_string, page.title()) ) return '' except wikipedia.LockedPage: wikipedia.output('%s \03{red}cannot save %s because it is locked\03{default}' % (log_string, page.title()) ) return '# %s: page was locked\n' % page.aslink() except wikipedia.EditConflict: wikipedia.output('%s \03{red}cannot save %s because of edit conflict\03{default}' % (log_string, page.title()) ) return '# %s: edit conflict occurred\n' % page.aslink() except wikipedia.SpamfilterError, error: wikipedia.output('%s \03{red}cannot save %s because of spam blacklist entry %s\03{default}' % ((log_string, page.title(), error.url)) ) return '# %s: spam blacklist entry\n' % page.aslink() except: wikipedia.output('%s \03{red}unknown error on saving %s\03{default}' % (log_string, page.title()) ) return '# %s: unknown error occurred\n' % page.aslink() def run(): drugbank_data = drugbank() DrugBank_ID_INN = {} for k, v in drugbank_data.iteritems(): DrugBank_ID_INN[v[8]]= k # list of articles to work on is generated by: "python pagegenerators.py -namespace:0 -transcludes:Drugbox > drugbox_titles.txt" # articles 
= [] # articles = codecs.open('/Users/BogBot/progs/pywikipedia~~/drugbox~~/drugbox_titles.txt', mode = 'r', encoding='utf-8') # articles = ['~~Chloramphenicol~~Progesterone'] for article in articles: # article = article.rstrip('\n') encoding = 'utf-8' Line 687 ⟶ 888: new_drugbox = "" log_string = "* [[" + article + "]], " ~~print log_string,~~ site = wikipedia.getSite() Line 695 ⟶ 895: if not Allowbots(text): ~~break~~continue begin, end, begin2, end2 = find_drugbox_from_text(text) if ~~begin~~end: parameters = text[begin:end] ~~log_string = ", article: " + article~~ ~~print log_string,~~ else: log_string = ~~", article: " + article~~log_string + "drugbox not found!" print log_string, ~~break~~continue # make sure that there are no unmatched square or curly brackets # if found, abort, since these may indicate an error in the wiki markup # and may trigger an infinite loop elsewhere in this script if unbalanced(parameters): log_string = log_string + "unmatched brackets found, article skipped!" 
print log_string continue # print text[begin:end] Line 714 ⟶ 920: # first extract and assign nested templates commonly used in drugbox templates parameters = assign_nested_templates(parameters, current_parameters) # remove any embedded carriage returns from remaining templates: parameters = remove_embedded_carriage_returns(parameters) # next, parse each line for parameters Line 720 ⟶ 929: parse_line(line, current_parameters) ifINN ~~"drug_name"~~= ~~in current_parameters:~~article ~~INN = current_parameters['drug_name']~~ ~~else:~~ ~~INN = article~~ # INN = "Acetylsalicylic acid" if INN in drugbank_data: db_data = drugbank_data[INN] elif "DrugBank" in current_parameters and current_parameters['DrugBank'] in DrugBank_ID_INN: log_string = str(log_string + "INN reset from " + INN,) INN = DrugBank_ID_INN[current_parameters['DrugBank']] log_string = log_string + "to " + INN + ", " db_data = drugbank_data[INN] if not "drug_name" in current_parameters: current_parameters['drug_name'] = INN else: db_data = [] if "DrugBank" in current_parameters and current_parameters['DrugBank'] in DrugBank_ID_INN: if DrugBank_ID_INN[current_parameters['DrugBank']] == INN: log_string = log_string + "DrugBankID/INN OK!, " else: log_string = log_string + "DrugBankID/INN NOT OK!, " else: if db_data: if db_data[8]: if not "DrugBank" in current_parameters: current_parameters['DrugBank'] = db_data[8] # while we are at it, populate KEGG_Drug_ID, ChemSpider_ID, and PubChem_Compound_ID fields if missing Line 735 ⟶ 960: if db_data: if db_data[4] and not ~~current_parameters.has_key(~~"KEGG") in current_parameters: current_parameters['KEGG'] = db_data[4] if db_data[6] and not ~~current_parameters.has_key(~~"ChemSpiderID") in current_parameters: current_parameters['ChemSpiderID'] = db_data[6] if db_data[7] and not ~~current_parameters.has_key(~~"PubChem") in current_parameters: current_parameters['PubChem'] = db_data[7] Line 753 ⟶ 978: else: merck_tradename = "" if 
~~current_parameters.has_key(~~'tradename') in current_parameters: current_tradename = current_parameters['tradename'] else: Line 768 ⟶ 993: # add Drugs.com link if ~~current_parameters.has_key(~~'tradename') in current_parameters: tradename = current_parameters['tradename'] else: Line 783 ⟶ 1,008: if result: current_parameters['Drugs.com'] = result # add MedlinePlus parameter ~~new_text = text[:begin] + build_new_drugbox(current_parameters) + text[end:]~~ if db_data: if db_data[2]: if test_MedlinePlus(db_data[2]): current_parameters['MedlinePlus'] = db_data[2] if not 'MedlinePlus' in current_parameters: opener = urllib.FancyURLopener({}) stem = string.replace(article, " ", "+") link = "http://vsearch.nlm.nih.gov/vivisimo/cgi-bin/query-meta?&v:project=medlineplus&query=" + stem # print "MedlinePlus link:", link f = opener.open(link) text2 = f.read() result = regexp_medlineplus_url.search(text2) if result: current_parameters['MedlinePlus'] = result.group('ACNO') new_text = text[:begin2] + build_new_drugbox(current_parameters) + text[end2:] # print build_new_drugbox(current_parameters) # print new_text # print current_parameters # print new_text if current_parameters: ~~page.put(new_text,~~ comment='populated new fields in {{drugbox}} and reordered per [[Wikipedia:Bots/Requests_for_approval/BogBot_2\|bot approval]]',. ~~watchArticle~~Report =errors ~~None,~~and ~~minorEdit~~suggestions =to ~~True)~~[[User_talk:BogBot]]' status ~~print~~= "savepage(page, ~~page~~new_text, comment, False, ~~updated"~~log_string) else: print ", page not updated" Line 797 ⟶ 1,045: run() ~~</source>~~ </syntaxhighlight>