User:BogBot/Source code/Task 03: Difference between revisions

additional bug corrections
+ more robust parsing of citation templates
Line 177:
# UNII_Ref = {{fdacite|changed|FDA}}
regexp_UNII_Ref = re.compile(r"\|\s*?UNII_Ref\s*?=\s*?(?P<TEMPLATE>\{\{(Fdacite|fdacite).*?\}\})")
 
# named ref tag = <ref name="fattinger2000"> but not <ref name="fattinger2000" />
regexp_ref_tag_begin = re.compile(r"(<ref>)|(<ref name.*?[^/]>)")
regexp_ref_tag_end = re.compile(r"</ref>")
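
# Illustrative sketch, not part of the bot's task code: what these patterns are intended to
# match.  The sample wikitext below is hypothetical (the reference name "fattinger2000" is
# taken from the comment above).
#
#   >>> regexp_UNII_Ref.search('| UNII_Ref = {{fdacite|changed|FDA}}').group('TEMPLATE')
#   '{{fdacite|changed|FDA}}'
#   >>> bool(regexp_ref_tag_begin.search('<ref name="fattinger2000">cite</ref>'))
#   True
#   >>> bool(regexp_ref_tag_begin.search('<ref name="fattinger2000" />'))
#   False
#   >>> bool(regexp_ref_tag_end.search('<ref name="fattinger2000">cite</ref>'))
#   True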
 
def Allowbots(text):
Line 299 ⟶ 303:
    return parameters
 
def rejoin(begin,end,sub_strings):
    # make sure we split only on the pipes that represent ends of the infobox entry, not the pipes used in links, nested templates, citations, etc.
    # adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/

    new_list = [sub_strings[0]]
    for sub_string in sub_strings[1:]:
Line 311 ⟶ 316:
    sub_strings = new_list
    return sub_strings
 
def test_disjoint(begin,end,sub_strings):
    disjoint = False
    for sub_string in sub_strings:
        if (end in sub_string) and ((not begin in sub_string) or sub_string.find(end) < sub_string.find(begin)):
            disjoint = True
            break
    return disjoint
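
# Illustrative sketch (hypothetical wikitext, not part of the bot's task flow): splitting an
# infobox body on '|' also cuts through the pipes inside wikilinks.  test_disjoint() flags a
# fragment that contains the end marker without (or before) a begin marker:
#
#   >>> sub_strings = ' tradename = [[Aspirin|Bayer Aspirin]] '.split('|')
#   >>> sub_strings
#   [' tradename = [[Aspirin', 'Bayer Aspirin]] ']
#   >>> test_disjoint('[[', ']]', sub_strings)
#   True
#
# rejoin() is then expected to glue 'Bayer Aspirin]] ' back onto the preceding fragment with
# the '|' it was split on, mirroring regex_rejoin() below.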
 
def regex_rejoin(regexp_begin, regexp_end, sub_strings):
    # make sure we split only on the pipes that represent ends of the infobox entry, not the pipes used in links, nested templates, citations, etc.
    # adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/

    new_list = [sub_strings[0]]
    for sub_string in sub_strings[1:]:
        begin = False; end = False; n_begin = 0; n_end = 0
        if regexp_begin.search(sub_string):
            begin = True
            match = regexp_begin.findall(sub_string)
            n_begin = len(match)
        if regexp_end.search(sub_string):
            end = True
            match = regexp_end.findall(sub_string)
            n_end = len(match)
        if ((end and not begin) or n_end < n_begin):
            new_list[-1] = new_list[-1] + '|' + sub_string
        else:
            new_list.append(sub_string)
    sub_strings = new_list
    return sub_strings
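
# Illustrative sketch (hypothetical wikitext): a pipe that falls inside a <ref>...</ref>
# citation.  The trailing fragment matches regexp_ref_tag_end but not regexp_ref_tag_begin,
# so regex_rejoin() folds it back into the preceding fragment:
#
#   >>> sub_strings = ' legal_status = Schedule II<ref>Controlled Substances Act, Schedules 1|2</ref>'.split('|')
#   >>> regex_rejoin(regexp_ref_tag_begin, regexp_ref_tag_end, sub_strings)
#   [' legal_status = Schedule II<ref>Controlled Substances Act, Schedules 1|2</ref>']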
 
def regex_test_disjoint(regexp_begin, regexp_end, sub_strings):
    disjoint = False
    for sub_string in sub_strings:
        begin = False; end = False; n_begin = 0; n_end = 0
        if regexp_begin.search(sub_string):
            begin = True
            match = regexp_begin.findall(sub_string)
            n_begin = len(match)
        if regexp_end.search(sub_string):
            end = True
            match = regexp_end.findall(sub_string)
            n_end = len(match)
        if ((end and not begin) or n_end < n_begin):
            disjoint = True
            break
Line 339 ⟶ 383:
    else:
        forever = False
 
# do the same for citations:
forever = True
while forever:
    if regex_test_disjoint(regexp_ref_tag_begin, regexp_ref_tag_end, sub_strings):
        forever = True
        sub_strings = regex_rejoin(regexp_ref_tag_begin, regexp_ref_tag_end, sub_strings)
    else:
        forever = False
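
# Sketch of what this pass repairs (hypothetical fragments, hypothetical reference name): a
# citation such as <ref name="nida2010">NIDA Notes 10|1|2010</ref> is cut into three
# fragments by the '|' split, and a single regex_rejoin() pass only folds fragments that
# themselves contain a stray closing tag, so the rejoin is repeated until
# regex_test_disjoint() no longer finds a </ref> without a matching opening <ref> in the
# same fragment.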
Line 674 ⟶ 718:
# articles = codecs.open('/Users/BogBot/progs/pywikipedia/drugbox/drugbox_titles.txt', mode = 'r', encoding='utf-8')
 
articles = ['Cocaine']
 
for article in articles: