Revision as of 21:04, 29 July 2011 edit Boghog (talk \| contribs) Autopatrolled, Extended confirmed users, IP block exemptions, New page reviewers, Pending changes reviewers, Rollbackers, Template editors 142,840 edits more tweaks ← Previous edit		Revision as of 21:42, 30 July 2011 edit undo Boghog (talk \| contribs) Autopatrolled, Extended confirmed users, IP block exemptions, New page reviewers, Pending changes reviewers, Rollbackers, Template editors 142,840 edits additional bug corrections Next edit →
Line 1: <source lang=python> #!/usr/bin/python # -- coding: UTF-8 -- # Bot Script to populate new clinical fields in Drugbox templates in Wikipedia drug articles. Line 135 ⟶ 136: import re import string import sys import urllib import urlparse import wikipedia Line 157 ⟶ 160: regexp_ATC_supplemental = re.compile(r"\\|\s?ATC_supplemental\s?=\s?(?P<TEMPLATE>.\{\{s(ATC).+?\}\})\s?($\|\\|)") # CASNo_Ref = {{cascite\|correct\|CAS}} regexp_CASNo_Ref = re.compile(r"\\|\s??CASNo_Ref\s?=\s?(?P<TEMPLATE>\{\{(Cascite\|cascite).+?\}\})") # CAS_supplemental = {{CAS\|405-41-4}} regexp_CAS_supplemental = re.compile(r"\\|\s?CAS_supplemental\s?=\s?(?P<TEMPLATE>\{\{CAS.?\}\})") # ChEMBL_Ref = {{ebicite\|correct\|EBI}} regexp_ChEMBL_Ref = re.compile(r"\\|\s??ChEMBL_Ref\s?=\s?(?P<TEMPLATE>\{\{(Ebicite\|ebicite).+?\}\})") # ChemSpiderID_Ref = {{chemspidercite\|correct\|chemspider}} regexp_ChemSpiderID_Ref = re.compile(r"\\|\s??ChemSpiderID_Ref\s?=\s?(?P<TEMPLATE>\{\{(Chemspidercite\|chemspidercite).+?\}\})") # Drugs.com = {{drugs.com\|monograph\|lisinopril}} regexp_Drugs_com = re.compile(r"\\|\s??Drugs\.com\s?=\s?(?P<TEMPLATE>\{\{(Drugs\.com\|drugs\.com).+?\}\})") # KEGG_Ref = {{keggcite\|correct\|kegg}} regexp_KEGG_Ref = re.compile(r"\\|\s??KEGG_Ref\s?=\s?(?P<TEMPLATE>\{\{(Keggcite\|keggcite).+?\}\})") # StdInChI_Ref = {{stdinchicite\|correct\|chemspider}} regexp_StdInChI_Ref = re.compile(r"\\|\s??StdInChI_Ref\s?=\s?(?P<TEMPLATE>\{\{(Stdinchicite\|stdinchicite).+?\}\})") # StdInChIKey_Ref = {{stdinchicite\|correct\|chemspider}} regexp_StdInChIKey_Ref = re.compile(r"\\|\s??StdInChIKey_Ref\s?=\s?(?P<TEMPLATE>\{\{(Stdinchicite\|stdinchicite).+?\}\})") # UNII_Ref = {{fdacite\|changed\|FDA}} regexp_UNII_Ref = re.compile(r"\\|\s??UNII_Ref\s?=\s?(?P<TEMPLATE>\{\{(Fdacite\|fdacite).+?\}\})") def Allowbots(text): Line 177 ⟶ 182: return False return True def urlEncodeNonAscii(b): return re.sub('[\x80-\xFF]', lambda c: '%%%02x' % ord(c.group(0)), b) def iriToUri(iri): parts= urlparse.urlparse(iri) return urlparse.urlunparse( part.encode('idna') if parti==1 else urlEncodeNonAscii(part.encode('utf-8')) for parti, part in enumerate(parts) ) def find_drugbox_from_text(article_text): Line 185 ⟶ 200: begin, end = search_result.span() else: return None # the regex isn't perfect, so look for the closing brackets of the infobox count = 0 last_ind = None for ind, c in enumerate(result_text): if c == '}': count = count - 1 elif c == '{': count = count + 1 if count == 0 and not ind == 0: last_ind = ind break offset = result_text.find('\|') ___location = (begin+offset, begin+last_ind-1) Line 206 ⟶ 221: drugbank_data = {} # 0 1 2 3 4 5 6 7 ~~# 0 1 2 3 4 5 6 7~~ # Name Trade_Names Drug_Type MedlinePlus Drugs.com_link KEGG_Drug_ID KEGG_Compound_ID ChemSpider_ID PubChem_Compound_ID drug_data = csv.reader(open('/Users/~~bogbot~~BogBot/progs/pywikipedia/drugbox/drug_links_agumented.csv', 'rU'), dialect='excel') for row in drug_data: Line 232 ⟶ 247: # print "found result_ChEMBL_Ref! ", template parameters = re.sub(regexp_ChEMBL_Ref, "", parameters) result_CASNo_Ref = regexp_CASNo_Ref.search(parameters) if result_CASNo_Ref: template = result_CASNo_Ref.group('TEMPLATE') current_parameters['CASNo_Ref'] = template # print "found result_CASNo_Ref! ", template parameters = re.sub(regexp_CASNo_Ref, "", parameters) result_CAS_supplemental = regexp_CAS_supplemental.search(parameters) if result_CAS_supplemental: template = result_CAS_supplemental.group('TEMPLATE') current_parameters['CAS_supplemental'] = template # print "found result_CAS_supplemental! ", template parameters = re.sub(regexp_CAS_supplemental, "", parameters) result_ChemSpiderID_Ref = regexp_ChemSpiderID_Ref.search(parameters) Line 260 ⟶ 289: # print "found StdInChI_Ref! ", template parameters = re.sub(regexp_StdInChI_Ref, "", parameters) ~~result_StdInChIKey_Ref = regexp_StdInChIKey_Ref.search(parameters)~~ ~~if result_StdInChIKey_Ref:~~ ~~template = result_StdInChIKey_Ref.group('TEMPLATE')~~ ~~current_parameters['StdInChIKey_Ref'] = template~~ ~~# print "found StdInChIKey_Ref! ", template~~ ~~parameters = re.sub(regexp_StdInChIKey_Ref, "", parameters)~~ result_UNII_Ref = regexp_UNII_Ref.search(parameters) Line 276 ⟶ 298: return parameters def rejoin(begin,end,sub_strings): # make sure we split only on the pipes that represent ends of the infobox entry, not the pipes used in links, nested templates, citations, etc. # adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/ new_list = [sub_strings[0]] for sub_string in sub_strings[1:]: if (end in sub_string) and ((not begin in sub_string) or sub_string.find(end) < sub_string.find(begin)): new_list[-1] = new_list[-1] + '\|' + sub_string else: new_list.append(sub_string) sub_strings = new_list return sub_strings def test_disjoint(begin,end,sub_strings): disjoint = False for sub_string in sub_strings: if (end in sub_string) and ((not begin in sub_string) or sub_string.find(end) < sub_string.find(begin)): disjoint = True break return disjoint def parse_line(line, current_parameters): Line 286 ⟶ 329: # make sure we split only on the pipes that represent ends of the infobox entry, not the pipes used in links sub_strings = rejoin('[[',']]',sub_strings) ~~# adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/~~ ~~new_list~~ = ~~[sub_strings[0]]~~ # do the same for nested templates ~~for sub_string in sub_strings[1:]:~~ forever = True ~~if (']]' in sub_string) and ((not '[[' in sub_string) or sub_string.find(']]') < sub_string.find('[[')):~~ while forever: ~~new_list[-1] = new_list[-1] + '\|' + sub_string~~ if test_disjoint('{{','}}',sub_strings): forever = True sub_strings = rejoin('{{','}}',sub_strings) else: ~~new_list.append(sub_string)~~ forever = False ~~sub_strings = new_list~~ ~~# print "new_list: ", new_list~~ ~~# do the same thing for nested templates~~ ~~# forever = True~~ ~~# while forever:~~ ~~# print "new_list: ", new_list~~ ~~# forever = False~~ ~~# for sub_string in sub_strings[1:]:~~ ~~# print "sub_string: ", sub_string~~ ~~# if ('}}' in sub_string) and ((not '{{' in sub_string) or sub_string.find('}}') < sub_string.find('{{')):~~ ~~# new_list[-1] = new_list[-1] + '\|' + sub_string~~ ~~# forever = True~~ ~~# else:~~ ~~# new_list.append(sub_string)~~ ~~# sub_strings = new_list~~ ~~# print "new_list: ", new_list~~ # do the same for citations: forever = True while forever: if test_disjoint('<ref>','</ref>',sub_strings): forever = True sub_strings = rejoin('<ref>','</ref>',sub_strings) else: forever = False # now assign the parameters for sub_string in sub_strings: # print "sub_string: ", sub_string if (sub_string.count("=") > 0): parts = sub_string.split("=", 1) # print "parts: ", parts parameter = str(parts[0].encode("utf-8")).strip() value = str(parts[1].encode("utf-8")).strip() # print "parameter, value: ", parameter, " ", value current_parameters[parameter] = value Line 328 ⟶ 365: # print "made it!" # print "line2: ", line parameter = (result_drug_param.group('PARAM').encode("utf-8")).strip() value = (result_drug_param.group('VALUE').encode("utf-8")).strip() current_parameters[parameter] = value Line 336 ⟶ 373: def build_new_drugbox(current_parameters): # build new drugbox template # make sure that all values in the current_parameters dictionary are properly encoded encoding = 'utf-8' for k, v in current_parameters.iteritems(): if isinstance(v, basestring): if not isinstance(v, unicode): v = unicode(v, encoding) current_parameters[k] = v # if type parameter is missing, check subordinate parameters that infer type, and if found, assign type Line 354 ⟶ 400: current_parameters['type'] = "vaccine" new_drugbox = unicode( "", "utf-8" ) if current_parameters.has_key("Watchedfields"): new_drugbox += "\| Watchedfields = " + current_parameters['Watchedfields'] + "\n" if current_parameters.has_key("Verifiedfields"): new_drugbox += "\| Verifiedfields = " + current_parameters['Verifiedfields'] + "\n" Line 433 ⟶ 479: "KEGG" in current_parameters or "ChEBI" in current_parameters or "ChEMBL" in current_parameters): new_drugbox += "\n<!--Identifiers-->\n" if current_parameters.has_key("CASNo_Ref"): new_drugbox += "\| CASNo_Ref = " + current_parameters['CASNo_Ref'] + "\n" if current_parameters.has_key("CAS_number"): new_drugbox += "\| CAS_number = " + current_parameters['CAS_number'] + "\n" if current_parameters.has_key("CAS_supplemental"): new_drugbox += "\| CAS_supplemental = " + current_parameters['CAS_supplemental'] + "\n" Line 507 ⟶ 554: if current_parameters.has_key("specific_rotation"): new_drugbox += "\| specific_rotation = " + current_parameters['specific_rotation'] + "\n" if current_parameters.has_key("sec_combustion"): new_drugbox += "\| sec_combustion = " + current_parameters['sec_combustion'] + "\n" print new_drugbox return new_drugbox Line 517 ⟶ 566: merck_tradenames = sorted(set(merck_tradename.split(";")))[1:] for index, object in enumerate(merck_tradenames): merck_tradenames[index] = string.capitalize(string.strip(object.encode("utf-8"))) else: merck_tradenames = [] Line 524 ⟶ 573: current_tradenames = sorted(set(current_tradename.split(", "))) for index, object in enumerate(current_tradenames): current_tradenames[index] = string.capitalize(string.strip(object.encode("utf-8"))) else: current_tradenames = [] Line 604 ⟶ 653: for stem in stems: if stem: link = iriToUri(root[1] + stem + ".html") # print "attempted Drugs.com link: ", link if urllib.urlopen(link).getcode() == 200: # test link status to make sure it is good before assigning parameter # print "passed link: ", link link = "{{drugs.com\|" + root[0] + "\|" + stem + "}}" raise StopIteration() else: link = "" ~~raise StopIteration()~~ except StopIteration: pass Line 623 ⟶ 672: # list of articles to work on is generated by: "python pagegenerators.py -namespace:0 -transcludes:Drugbox > drugbox_titles.txt" # articles = [] # articles = codecs.open('/Users/~~bogbot~~BogBot/progs/pywikipedia/drugbox/drugbox_titles.txt', mode = 'r', encoding='utf-8') articles = [~~"Aspirin"~~'Chloramphenicol'] for article in articles: # article = article.rstrip('\n') encoding = 'utf-8' if isinstance(article, basestring): if not isinstance(article, unicode): article = unicode(article, encoding) new_drugbox = "" Line 666 ⟶ 720: parse_line(line, current_parameters) ~~INN~~if ="drug_name" ~~article~~in current_parameters: # INN = ~~"Acetylsalicylic acid"~~current_parameters['drug_name'] else: INN = article # INN = "Acetylsalicylic acid" if INN in drugbank_data: db_data = drugbank_data[INN] Line 731 ⟶ 789: if current_parameters: page.put(new_text, comment='populated ~~clinical~~new fields in drugbox and reordered per [[Wikipedia:Bots/Requests_for_approval/BogBot_2\|bot approval]]', watchArticle = None, minorEdit = True) print ", page updated" else: print ", page not updated"

User:BogBot/Source code/Task 03: Difference between revisions