User:BogBot/Source code/Task 03: Difference between revisions

Content deleted Content added
more tweaks
additional bug corrections
Line 1:
<source lang=python>
#!/usr/bin/python
# -*- coding: UTF-8 -*-
 
# Bot Script to populate new clinical fields in Drugbox templates in Wikipedia drug articles.
Line 135 ⟶ 136:
import re
import string
import sys
import urllib
import urlparse
import wikipedia
 
Line 157 ⟶ 160:
# \s* (not a literal "s") permits optional whitespace between "{{" and "ATC".
regexp_ATC_supplemental = re.compile(r"\|\s*?ATC_supplemental\s*?=\s*?(?P<TEMPLATE>.*\{\{\s*(ATC).+?\}\})\s*?($|\|)")
# The patterns below all share one shape:
#   "|" <optional whitespace> NAME <optional whitespace> "=" <optional whitespace> {{template ...}}
# The named group TEMPLATE captures the whole "{{...}}" value.  Every quantifier
# is a single lazy "*?" -- stacked quantifiers such as "?*?" or "+*?" are not
# valid regular expressions and make re.compile() raise at import time.
# CASNo_Ref = {{cascite|correct|CAS}}
regexp_CASNo_Ref = re.compile(r"\|\s*?CASNo_Ref\s*?=\s*?(?P<TEMPLATE>\{\{(Cascite|cascite).*?\}\})")
# CAS_supplemental = {{CAS|405-41-4}}
regexp_CAS_supplemental = re.compile(r"\|\s*?CAS_supplemental\s*?=\s*?(?P<TEMPLATE>\{\{CAS.*?\}\})")
# ChEMBL_Ref = {{ebicite|correct|EBI}}
regexp_ChEMBL_Ref = re.compile(r"\|\s*?ChEMBL_Ref\s*?=\s*?(?P<TEMPLATE>\{\{(Ebicite|ebicite).*?\}\})")
# ChemSpiderID_Ref = {{chemspidercite|correct|chemspider}}
regexp_ChemSpiderID_Ref = re.compile(r"\|\s*?ChemSpiderID_Ref\s*?=\s*?(?P<TEMPLATE>\{\{(Chemspidercite|chemspidercite).*?\}\})")
# Drugs.com = {{drugs.com|monograph|lisinopril}}
regexp_Drugs_com = re.compile(r"\|\s*?Drugs\.com\s*?=\s*?(?P<TEMPLATE>\{\{(Drugs\.com|drugs\.com).*?\}\})")
# KEGG_Ref = {{keggcite|correct|kegg}}
regexp_KEGG_Ref = re.compile(r"\|\s*?KEGG_Ref\s*?=\s*?(?P<TEMPLATE>\{\{(Keggcite|keggcite).*?\}\})")
# StdInChI_Ref = {{stdinchicite|correct|chemspider}}
regexp_StdInChI_Ref = re.compile(r"\|\s*?StdInChI_Ref\s*?=\s*?(?P<TEMPLATE>\{\{(Stdinchicite|stdinchicite).*?\}\})")
# StdInChIKey_Ref = {{stdinchicite|correct|chemspider}}
# (whitespace after "=" is "\s*?"; the earlier "\*s?" demanded a literal asterisk)
regexp_StdInChIKey_Ref = re.compile(r"\|\s*?StdInChIKey_Ref\s*?=\s*?(?P<TEMPLATE>\{\{(Stdinchicite|stdinchicite).*?\}\})")
# UNII_Ref = {{fdacite|changed|FDA}}
regexp_UNII_Ref = re.compile(r"\|\s*?UNII_Ref\s*?=\s*?(?P<TEMPLATE>\{\{(Fdacite|fdacite).*?\}\})")
 
def Allowbots(text):
Line 177 ⟶ 182:
return False
return True
 
def urlEncodeNonAscii(b):
    """Percent-escape the high (0x80-0xFF) characters of a string.

    Each character of ``b`` whose ordinal lies in 0x80-0xFF is replaced by
    '%' followed by its ordinal as two lowercase hex digits.  All other
    characters are returned unchanged.
    """
    def _as_percent_escape(match):
        # The pattern matches exactly one character at a time.
        code_point = ord(match.group(0))
        return '%%%02x' % code_point

    high_chars = re.compile('[\x80-\xFF]')
    return high_chars.sub(_as_percent_escape, b)
 
def iriToUri(iri):
    """Convert an IRI into a URI.

    The IRI is split with urlparse.urlparse().  Component 1 of the result
    (the network ___location) is encoded with the 'idna' codec; every other
    component is encoded as UTF-8 and then passed through
    urlEncodeNonAscii().  The converted components are reassembled with
    urlparse.urlunparse() and the result is returned.
    """
    converted = []
    for position, component in enumerate(urlparse.urlparse(iri)):
        if position == 1:
            converted.append(component.encode('idna'))
        else:
            converted.append(urlEncodeNonAscii(component.encode('utf-8')))
    return urlparse.urlunparse(converted)
 
def find_drugbox_from_text(article_text):
Line 185 ⟶ 200:
begin, end = search_result.span()
else:
return None
# the regex isn't perfect, so look for the closing brackets of the infobox
count = 0
last_ind = None
for ind, c in enumerate(result_text):
if c == '}':
count = count - 1
elif c == '{':
count = count + 1
if count == 0 and not ind == 0:
last_ind = ind
break
offset = result_text.find('|')
___location = (begin+offset, begin+last_ind-1)
Line 206 ⟶ 221:
drugbank_data = {}
 
# 0 1 2 3 4 5 6 7
# 0 1 2 3 4 5 6 7
# Name Trade_Names Drug_Type MedlinePlus Drugs.com_link KEGG_Drug_ID KEGG_Compound_ID ChemSpider_ID PubChem_Compound_ID
 
drug_data = csv.reader(open('/Users/bogbotBogBot/progs/pywikipedia/drugbox/drug_links_agumented.csv', 'rU'), dialect='excel')
 
for row in drug_data:
Line 232 ⟶ 247:
# print "found result_ChEMBL_Ref! ", template
parameters = re.sub(regexp_ChEMBL_Ref, "", parameters)
 
result_CASNo_Ref = regexp_CASNo_Ref.search(parameters)
if result_CASNo_Ref:
template = result_CASNo_Ref.group('TEMPLATE')
current_parameters['CASNo_Ref'] = template
# print "found result_CASNo_Ref! ", template
parameters = re.sub(regexp_CASNo_Ref, "", parameters)
 
result_CAS_supplemental = regexp_CAS_supplemental.search(parameters)
if result_CAS_supplemental:
template = result_CAS_supplemental.group('TEMPLATE')
current_parameters['CAS_supplemental'] = template
# print "found result_CAS_supplemental! ", template
parameters = re.sub(regexp_CAS_supplemental, "", parameters)
 
result_ChemSpiderID_Ref = regexp_ChemSpiderID_Ref.search(parameters)
Line 260 ⟶ 289:
# print "found StdInChI_Ref! ", template
parameters = re.sub(regexp_StdInChI_Ref, "", parameters)
 
result_StdInChIKey_Ref = regexp_StdInChIKey_Ref.search(parameters)
if result_StdInChIKey_Ref:
template = result_StdInChIKey_Ref.group('TEMPLATE')
current_parameters['StdInChIKey_Ref'] = template
# print "found StdInChIKey_Ref! ", template
parameters = re.sub(regexp_StdInChIKey_Ref, "", parameters)
 
result_UNII_Ref = regexp_UNII_Ref.search(parameters)
Line 276 ⟶ 298:
 
return parameters
 
def rejoin(begin, end, sub_strings):
    """Glue back together pieces that were split on a pipe inside a construct.

    Only the pipes that end an infobox entry should separate pieces, not the
    pipes used inside links, nested templates, citations, etc.
    Adapted from
    http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/

    begin       -- opening delimiter, e.g. '[['
    end         -- closing delimiter, e.g. ']]'
    sub_strings -- non-empty sequence of pieces obtained by splitting on '|'

    Makes a single pass: every piece after the first that contains ``end``
    with no ``begin`` in front of it is appended, with the '|' restored, to
    the piece kept just before it.  Returns the new list; the input is not
    modified.
    """
    merged = [sub_strings[0]]
    for piece in sub_strings[1:]:
        end_at = piece.find(end)
        begin_at = piece.find(begin)
        dangling_close = end_at != -1 and (begin_at == -1 or end_at < begin_at)
        if not dangling_close:
            merged.append(piece)
            continue
        # The closer belongs to a construct opened in the previous piece.
        merged[-1] = '|'.join((merged[-1], piece))
    return merged
 
def test_disjoint(begin, end, sub_strings):
    """Report whether another rejoin() pass over sub_strings would merge anything.

    begin       -- opening delimiter, e.g. '{{'
    end         -- closing delimiter, e.g. '}}'
    sub_strings -- sequence of pieces obtained by splitting on '|'

    Returns True when some piece after the first contains ``end`` with no
    ``begin`` in front of it, otherwise False (also for an empty sequence).

    The first piece is deliberately not examined: rejoin() has nothing to
    attach it to and always keeps it as is.  Counting it would make this
    function return True forever for input such as ['}}'], so a caller that
    loops "while test_disjoint(...): sub_strings = rejoin(...)" would never
    terminate.
    """
    for sub_string in sub_strings[1:]:
        end_at = sub_string.find(end)
        if end_at == -1:
            continue
        begin_at = sub_string.find(begin)
        if begin_at == -1 or end_at < begin_at:
            return True
    return False


# The name starts with "test_" but this is a helper, not a test case; keep
# pytest/nose style collectors from trying to run it.
test_disjoint.__test__ = False
 
def parse_line(line, current_parameters):
Line 286 ⟶ 329:
 
# make sure we split only on the pipes that represent ends of the infobox entry, not the pipes used in links
sub_strings = rejoin('[[',']]',sub_strings)
# adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/
new_list = [sub_strings[0]]
# do the same for nested templates
for sub_string in sub_strings[1:]:
forever = True
if (']]' in sub_string) and ((not '[[' in sub_string) or sub_string.find(']]') < sub_string.find('[[')):
while forever:
new_list[-1] = new_list[-1] + '|' + sub_string
if test_disjoint('{{','}}',sub_strings):
forever = True
sub_strings = rejoin('{{','}}',sub_strings)
else:
new_list.append(sub_string) forever = False
sub_strings = new_list
# print "new_list: ", new_list
# do the same thing for nested templates
# forever = True
# while forever:
# print "new_list: ", new_list
# forever = False
# for sub_string in sub_strings[1:]:
# print "sub_string: ", sub_string
# if ('}}' in sub_string) and ((not '{{' in sub_string) or sub_string.find('}}') < sub_string.find('{{')):
# new_list[-1] = new_list[-1] + '|' + sub_string
# forever = True
# else:
# new_list.append(sub_string)
# sub_strings = new_list
# print "new_list: ", new_list
 
# do the same for citations:
forever = True
while forever:
if test_disjoint('<ref>','</ref>',sub_strings):
forever = True
sub_strings = rejoin('<ref>','</ref>',sub_strings)
else:
forever = False
# now assign the parameters
for sub_string in sub_strings:
# print "sub_string: ", sub_string
if (sub_string.count("=") > 0):
parts = sub_string.split("=", 1)
# print "parts: ", parts
parameter = str(parts[0].encode("utf-8")).strip()
value = str(parts[1].encode("utf-8")).strip()
# print "parameter, value: ", parameter, " ", value
current_parameters[parameter] = value
Line 328 ⟶ 365:
# print "made it!"
# print "line2: ", line
parameter = (result_drug_param.group('PARAM').encode("utf-8")).strip()
value = (result_drug_param.group('VALUE').encode("utf-8")).strip()
current_parameters[parameter] = value
Line 336 ⟶ 373:
def build_new_drugbox(current_parameters):
# build new drugbox template
 
# make sure that all values in the current_parameters dictionary are properly encoded
 
encoding = 'utf-8'
for k, v in current_parameters.iteritems():
if isinstance(v, basestring):
if not isinstance(v, unicode):
v = unicode(v, encoding)
current_parameters[k] = v
 
# if type parameter is missing, check subordinate parameters that infer type, and if found, assign type
Line 354 ⟶ 400:
current_parameters['type'] = "vaccine"
 
new_drugbox = unicode( "", "utf-8" )
if current_parameters.has_key("Watchedfields"): new_drugbox += "| Watchedfields = " + current_parameters['Watchedfields'] + "\n"
if current_parameters.has_key("Verifiedfields"): new_drugbox += "| Verifiedfields = " + current_parameters['Verifiedfields'] + "\n"
Line 433 ⟶ 479:
"KEGG" in current_parameters or "ChEBI" in current_parameters or "ChEMBL" in current_parameters):
new_drugbox += "\n<!--Identifiers-->\n"
if current_parameters.has_key("CASNo_Ref"): new_drugbox += "| CASNo_Ref = " + current_parameters['CASNo_Ref'] + "\n"
if current_parameters.has_key("CAS_number"): new_drugbox += "| CAS_number = " + current_parameters['CAS_number'] + "\n"
if current_parameters.has_key("CAS_supplemental"): new_drugbox += "| CAS_supplemental = " + current_parameters['CAS_supplemental'] + "\n"
Line 507 ⟶ 554:
if current_parameters.has_key("specific_rotation"): new_drugbox += "| specific_rotation = " + current_parameters['specific_rotation'] + "\n"
if current_parameters.has_key("sec_combustion"): new_drugbox += "| sec_combustion = " + current_parameters['sec_combustion'] + "\n"
 
print new_drugbox
 
return new_drugbox
Line 517 ⟶ 566:
merck_tradenames = sorted(set(merck_tradename.split(";")))[1:]
for index, object in enumerate(merck_tradenames):
merck_tradenames[index] = string.capitalize(string.strip(object.encode("utf-8")))
else:
merck_tradenames = []
Line 524 ⟶ 573:
current_tradenames = sorted(set(current_tradename.split(", ")))
for index, object in enumerate(current_tradenames):
current_tradenames[index] = string.capitalize(string.strip(object.encode("utf-8")))
else:
current_tradenames = []
Line 604 ⟶ 653:
for stem in stems:
if stem:
link = iriToUri(root[1] + stem + ".html")
# print "attempted Drugs.com link: ", link
if urllib.urlopen(link).getcode() == 200: # test link status to make sure it is good before assigning parameter
# print "passed link: ", link
link = "{{drugs.com|" + root[0] + "|" + stem + "}}"
raise StopIteration()
else:
link = ""
raise StopIteration()
except StopIteration:
pass
Line 623 ⟶ 672:
# list of articles to work on is generated by: "python pagegenerators.py -namespace:0 -transcludes:Drugbox > drugbox_titles.txt"
# articles = []
# articles = codecs.open('/Users/bogbotBogBot/progs/pywikipedia/drugbox/drugbox_titles.txt', mode = 'r', encoding='utf-8')
 
articles = ["Aspirin"'Chloramphenicol']
 
for article in articles:
 
# article = article.rstrip('\n')
 
encoding = 'utf-8'
if isinstance(article, basestring):
if not isinstance(article, unicode):
article = unicode(article, encoding)
 
new_drugbox = ""
Line 666 ⟶ 720:
parse_line(line, current_parameters)
 
INNif ="drug_name" articlein current_parameters:
# INN = "Acetylsalicylic acid"current_parameters['drug_name']
else:
INN = article
# INN = "Acetylsalicylic acid"
 
if INN in drugbank_data:
db_data = drugbank_data[INN]
Line 731 ⟶ 789:
if current_parameters:
page.put(new_text, comment='populated clinicalnew fields in drugbox and reordered per [[Wikipedia:Bots/Requests_for_approval/BogBot_2|bot approval]]', watchArticle = None, minorEdit = True)
print ", page updated"
else:
print ", page not updated"