User:BogBot/Source code/Task 03: Difference between revisions

Content deleted Content added
more robust searches for drugs.com monograph external links; improved log file report; added a couple of missing drugbox parameters
m top: Replaced deprecated <source> tags with <syntaxhighlight>
 
(One intermediate revision by one other user not shown)
Line 1:
<syntaxhighlight lang=python>
#!/usr/bin/python
# -*- coding: UTF-8 -*-
Line 150:
# Pattern that locates a {{Drugbox ...}} template in article wikitext.
exp = ''.join([
    r'\{\{',        # opening braces of the template
    r'\s*',         # optional whitespace before the template name
    r'[Dd]rugbox',  # the word "drugbox", capitalized or not
    r'.*\}\}',      # greedy: everything up to the last closing "}}"
])

# re.DOTALL lets '.' match newlines, so the match can span many lines.
regexp_drug_infobox = re.compile(exp, re.DOTALL)
Line 195:
# Captures the page stem (STEM group) of a drugs.com monograph link, e.g.
# href='/monograph/maprotiline-hydrochloride.html'
# Raw string so that "\." reaches the regex engine as an escaped dot
# instead of being treated as an (invalid) string escape sequence.
regexp_monograph_url = re.compile(r"href='/monograph/(?P<STEM>.*?)\.html'", re.DOTALL)

# Captures the page identifier (ACNO group, e.g. "a604021") of a MedlinePlus URL:
# http://www.nlm.nih.gov/medlineplus/druginfo/meds/a604021.html
# The dots in the host name are escaped so they only match literal dots.
regexp_medlineplus_url = re.compile(r"www\.nlm\.nih\.gov/medlineplus/druginfo/meds/(?P<ACNO>.*?)\.html", re.DOTALL)
 
def Allowbots(text):
Line 216 ⟶ 219:
if search_result:
result_text = search_result.group(0) # returns the entire matching sequence
begin, end = search_result.span()
else:
return None
Line 231 ⟶ 234:
break
offset = result_text.find('|')
___location = (begin+offset, begin+last_ind-1, begin, begin+last_ind+1)
return ___location
 
Line 242 ⟶ 245:
# Name Trade_Names Drug_Type MedlinePlus Drugs.com_link KEGG_Drug_ID KEGG_Compound_ID ChemSpider_ID PubChem_Compound_ID DrugBank_ID
 
drug_data = csv.reader(open('/Users/BogBot/progs/pywikipedia/drugbox/drug_links_agumented.csv', 'rU'), dialect='excel')
 
for row in drug_data:
Line 444 ⟶ 447:
value = pad_parameters(value)
# print "parameter, value: ", parameter, " ", value
if not value:
value = " "
current_parameters[parameter] = value
else:
Line 454 ⟶ 459:
value = (result_drug_param.group('VALUE').encode("utf-8")).strip()
value = pad_parameters(value)
if not value:
value = " "
current_parameters[parameter] = value
Line 512 ⟶ 519:
current_parameters['type'] = "vaccine"
 
# if not previously assigned, add the following "empty" parameters
new_drugbox = unicode( "", "utf-8" )
if not current_parameters.has_key("tradename"):
current_parameters["tradename"] = " "
 
new_drugbox = unicode( "{{Drugbox\n", "utf-8" )
if current_parameters.has_key("Verifiedfields"): new_drugbox += "| Verifiedfields = " + current_parameters['Verifiedfields'] + "\n"
if current_parameters.has_key("Watchedfields"): new_drugbox += "| Watchedfields = " + current_parameters['Watchedfields'] + "\n"
if current_parameters.has_key("verifiedrevid"): new_drugbox += "| verifiedrevid = " + current_parameters['verifiedrevid'] + "\n"
if current_parameters.has_key("IUPAC_name"): new_drugbox += "| IUPAC_name = " + current_parameters['IUPAC_name'] + "\n"
Line 622 ⟶ 633:
"Sb" in current_parameters or "Se" in current_parameters or "Sr" in current_parameters or "Tc" in current_parameters or "charge" in current_parameters):
if current_parameters.has_key("chemical_formula"): new_drugbox += "| chemical_formula = " + current_parameters['chemical_formula'] + "\n"
# new_drugbox += " "
if current_parameters.has_key("C"): new_drugbox += "| C=" + current_parameters['C'] + " "
if current_parameters.has_key("H"): new_drugbox += "| H=" + current_parameters['H'] + " "
Line 671 ⟶ 683:
if current_parameters.has_key("specific_rotation"): new_drugbox += "| specific_rotation = " + current_parameters['specific_rotation'] + "\n"
if current_parameters.has_key("sec_combustion"): new_drugbox += "| sec_combustion = " + current_parameters['sec_combustion'] + "\n"
 
new_drugbox += "}}"
 
# print new_drugbox
Line 741 ⟶ 755:
 
# drugs.com root links:
roots = [("monograph","http://www.drugs.com/monograph/"), ("CDI","http://www.drugs.com/cdi/"), ("CONS","http://www.drugs.com/cons/"), ("MTM","http://www.drugs.com/mtm/"), ("pro","http://www.drugs.com/pro/"), ("international","http://www.drugs.com/international/"), ("parent","http://www.drugs.com/")]
 
stems = []
Line 754 ⟶ 768:
drugname = string.lower(drugname)
# print "drugnames: ", drugnames
if (drugname != " " and string.find(drugname, " ") > -1):
stems.append(string.replace(drugname, " ", "_"))
stems.append(string.replace(drugname, " ", "-"))
elif (drugname != " "):
stems.append(drugname)
# also try common salts
stems.append(drugname + "-hydrochloride")
stems.append(drugname + "-sulfate")
# stems.append(drugname + "-chloride")
# stems.append(drugname + "-sodium")
# stems.append(drugname + "-bromide")
# stems.append(drugname + "-maleate")
# stems.append(drugname + "-citrate")
 
if drugbank_drugs_com:
Line 793 ⟶ 807:
stem = result.group('STEM')
link = "{{drugs.com|" + roots[0][0] + "|" + stem + "}}"
raise StopIteration()
else:
link = "{{drugs.com|" + root[0] + "|" + stem + "}}"
# print "link: ", link
raise StopIteration()
else:
Line 855 ⟶ 873:
# list of articles to work on is generated by: "python pagegenerators.py -namespace:0 -transcludes:Drugbox > drugbox_titles.txt"
articles = []
articles = codecs.open('/Users/BogBot/progs/pywikipedia/drugbox_titles.txt', mode = 'r', encoding='utf-8')
 
# articles = ['Progesterone']
Line 879 ⟶ 897:
continue
begin, end, begin2, end2 = find_drugbox_from_text(text)
if end:
parameters = text[begin:end]
else:
Line 917 ⟶ 935:
db_data = drugbank_data[INN]
elif "DrugBank" in current_parameters and current_parameters['DrugBank'] in DrugBank_ID_INN:
log_string = log_string + "INN reset from " + INN
INN = DrugBank_ID_INN[current_parameters['DrugBank']]
log_string = log_string + "to " + INN + ", "
db_data = drugbank_data[INN]
if not "drug_name" in current_parameters:
Line 942 ⟶ 960:
if db_data:
 
if db_data[4] and not "KEGG" in current_parameters:
current_parameters['KEGG'] = db_data[4]
 
if db_data[6] and not "ChemSpiderID" in current_parameters:
current_parameters['ChemSpiderID'] = db_data[6]
 
if db_data[7] and not "PubChem" in current_parameters:
current_parameters['PubChem'] = db_data[7]
 
Line 960 ⟶ 978:
else:
merck_tradename = ""
if 'tradename' in current_parameters:
current_tradename = current_parameters['tradename']
else:
Line 975 ⟶ 993:
# add Drugs.com link
if 'tradename' in current_parameters:
tradename = current_parameters['tradename']
else:
Line 990 ⟶ 1,008:
if result: current_parameters['Drugs.com'] = result
 
# add MedlinePlus parameter
new_text = text[:begin] + build_new_drugbox(current_parameters) + text[end:]
if db_data:
if db_data[2]:
if test_MedlinePlus(db_data[2]):
current_parameters['MedlinePlus'] = db_data[2]
 
 
if not 'MedlinePlus' in current_parameters:
opener = urllib.FancyURLopener({})
stem = string.replace(article, " ", "+")
link = "http://vsearch.nlm.nih.gov/vivisimo/cgi-bin/query-meta?&v:project=medlineplus&query=" + stem
 
# print "MedlinePlus link:", link
f = opener.open(link)
text2 = f.read()
result = regexp_medlineplus_url.search(text2)
if result:
current_parameters['MedlinePlus'] = result.group('ACNO')
 
new_text = text[:begin2] + build_new_drugbox(current_parameters) + text[end2:]
# print build_new_drugbox(current_parameters)
 
# print new_text
 
# print current_parameters
Line 998 ⟶ 1,037:
if current_parameters:
comment='populated new fields in {{drugbox}} and reordered per [[Wikipedia:Bots/Requests_for_approval/BogBot_2|bot approval]]. Report errors and suggestions to [[User_talk:BogBot]]'
status = savepage(page, new_text, comment, False, log_string)
else:
Line 1,006 ⟶ 1,045:
run()
 
</syntaxhighlight>