Content deleted Content added
additional bug corrections |
+ more robust parsing of citation templates |
||
Line 177:
# UNII_Ref = {{fdacite|changed|FDA}}
regexp_UNII_Ref = re.compile(r"\|\s*?UNII_Ref\s*?=\s*?(?P<TEMPLATE>\{\{(Fdacite|fdacite).*?\}\})")
# named ref tag = <ref name="fattinger2000"> but not <ref name="fattinger2000" />
# Bug fix: the old pattern's second alternative was "<ref name.*?^/>"; "^"
# anchors at the start of the string (no re.MULTILINE), so that alternative
# could never match anything.  Replaced with a pattern matching the stated
# intent: a bare <ref> or an opening <ref name=...> tag whose character just
# before ">" is not "/" (i.e. not a self-closing tag).
regexp_ref_tag_begin = re.compile(r"(<ref>)|(<ref name[^>]*?[^/]>)")
# self-closing ref tag, e.g. <ref name="x" /> -- NOTE(review): despite the
# "_end" name this matches self-closing tags, not "</ref>"; confirm intent
# against the callers outside this chunk.
regexp_ref_tag_end = re.compile(r"<ref.*?/>")
def Allowbots(text):
Line 299 ⟶ 303:
return parameters
def rejoin(begin,end,sub_strings):
# make sure we split only on the pipes that represent ends of the infobox entry, not the pipes used in links, nested templates, citations, etc.
# adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/
new_list = [sub_strings[0]]
for sub_string in sub_strings[1:]:
Line 311 ⟶ 316:
sub_strings = new_list
return sub_strings
def test_disjoint(begin,end,sub_strings):
disjoint = False
for sub_string in sub_strings:
if (end in sub_string) and ((not begin in sub_string) or sub_string.find(end) < sub_string.find(begin)):
disjoint = True
break
return disjoint
def regex_rejoin(regexp_begin, regexp_end, sub_strings):
    """Re-merge pipe-split chunks whose end marker closes an earlier construct.

    ``sub_strings`` is the result of splitting an infobox body on ``"|"``.
    Pipes inside links, nested templates, citations, etc. must not act as
    field separators, so any chunk containing more ``regexp_end`` matches
    than ``regexp_begin`` matches is glued back (with the ``"|"`` restored)
    onto the previous chunk.  Returns the merged list.
    Adapted from
    http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/

    Bug fix: the comparison used to be ``n_end < n_begin``, which (a) merged
    a chunk that merely *opens* a construct backwards onto the previous
    chunk, destroying a legitimate field split, and (b) failed to merge a
    chunk holding an unmatched end when a begin was also present.  The
    correct trigger is an excess of end matches, ``n_end > n_begin``, which
    also subsumes the old ``(end and not begin)`` clause.  Also handles an
    empty input list instead of raising IndexError.
    """
    if not sub_strings:
        return sub_strings
    new_list = [sub_strings[0]]
    for sub_string in sub_strings[1:]:
        n_begin = len(regexp_begin.findall(sub_string))
        n_end = len(regexp_end.findall(sub_string))
        if n_end > n_begin:
            # this chunk closes something opened earlier: restore the pipe
            new_list[-1] = new_list[-1] + '|' + sub_string
        else:
            new_list.append(sub_string)
    return new_list
# Regex-based variant of test_disjoint(): scans the pipe-split chunks and
# reports whether any chunk closes a construct (template/link/citation)
# without a matching opening in the same chunk, i.e. whether the split on
# "|" cut through nested markup and the chunks need re-joining.
def regex_test_disjoint(regexp_begin, regexp_end, sub_strings):
disjoint = False
# NOTE(review): these four flags/counters are initialised once, OUTSIDE the
# loop, so they carry stale values between chunks -- a chunk with no
# begin-match inherits the previous chunk's begin=True / n_begin, which can
# mask an unmatched end.  regex_rejoin() resets them on every iteration;
# this looks like a bug -- confirm against the full file.
begin = False; end = False; n_begin = 0; n_end = 0
for sub_string in sub_strings:
if regexp_begin.search(sub_string):
begin = True
# count begin-marker matches in this chunk
match = regexp_begin.findall(sub_string)
n_begin = len(match)
if regexp_end.search(sub_string):
end = True
# count end-marker matches in this chunk
match = regexp_end.findall(sub_string)
n_end = len(match)
# flags disjoint on: an end with no begin, or more begins than ends
if ((end and not begin) or n_end < n_begin):
disjoint = True
break
# NOTE(review): the trailing `return disjoint` is cut off by the diff view.
Line 339 ⟶ 383:
else:
forever = False
# do the same for citations:
forever = True
while forever:
if
forever = True
sub_strings =
else:
forever = False
Line 674 ⟶ 718:
# articles = codecs.open('/Users/BogBot/progs/pywikipedia/drugbox/drugbox_titles.txt', mode = 'r', encoding='utf-8')
articles = ['
for article in articles:
|