Content deleted Content added
+ more robust parsing of citation templates |
+ remove_embedded_carriage_returns inside nested templates and citation markers |
||
Line 180:
# named ref tag = <ref name="fattinger2000"> but not <ref name="fattinger2000" />
regexp_ref_tag_begin = re.compile(r"(<ref>)|(<ref name.*?^/>)")
regexp_ref_tag_end = re.compile(r"<
def Allowbots(text):
Line 304:
def rejoin(begin, end, sub_strings, type):
# make sure we split only on the pipes that represent ends of the infobox entry, not the pipes used in links, nested templates, citations, etc.
# adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/
Line 311:
for sub_string in sub_strings[1:]:
if (end in sub_string) and ((not begin in sub_string) or sub_string.find(end) < sub_string.find(begin)):
new_list[-1] = new_list[-1] + sub_string
if type == "parameter":
new_list[-1] = new_list[-1] + '|' + sub_string
else:
new_list.append(sub_string)
sub_strings = new_list
return sub_strings
def test_disjoint(begin,end,sub_strings):
disjoint = False
Line 325 ⟶ 328:
return disjoint
def regex_rejoin(regexp_begin, regexp_end, sub_strings, type):
# make sure we split only on the pipes that represent ends of the infobox entry, not the pipes used in links, nested templates, citations, etc.
# adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/
Line 341 ⟶ 344:
n_end = len(match)
if ((end and not begin) or n_end < n_begin):
new_list[-1] = new_list[-1] + sub_string
if type == "parameter":
new_list[-1] = new_list[-1] + '|' + sub_string
else:
new_list.append(sub_string)
Line 373 ⟶ 379:
# make sure we split only on the pipes that represent ends of the infobox entry, not the pipes used in links
sub_strings = rejoin('[[',']]',sub_strings, 'parameter')
# do the same for nested templates
Line 380 ⟶ 386:
if test_disjoint('{{','}}',sub_strings):
forever = True
sub_strings = rejoin('{{','}}',sub_strings, 'parameter')
else:
forever = False
Line 389 ⟶ 395:
if regex_test_disjoint(regexp_ref_tag_begin, regexp_ref_tag_end, sub_strings):
forever = True
sub_strings = regex_rejoin(regexp_ref_tag_begin, regexp_ref_tag_end, sub_strings, 'parameter')
else:
forever = False
Line 415 ⟶ 421:
return
def remove_embedded_carriage_returns(parameters):
    # Remove embedded carriage returns from templates: rejoin lines that were
    # split inside nested {{...}} templates or <ref ...>...</ref> citation
    # markers, so that each template parameter ends up on a single line.
    #
    # parameters: str - raw template parameter text, possibly spanning many lines
    # returns:    str - the same text with newlines inside nested templates and
    #             ref tags removed (lines re-joined with "\n")
    lines = parameters.splitlines()
    # Keep merging lines until every '{{' has its matching '}}' on the same line.
    forever = True
    while forever:
        if test_disjoint('{{', '}}', lines):
            forever = True
            lines = rejoin('{{', '}}', lines, 'line')
        else:
            forever = False
    # Do the same for <ref ...> ... </ref> citation markers.
    forever = True
    while forever:
        if regex_test_disjoint(regexp_ref_tag_begin, regexp_ref_tag_end, lines):
            forever = True
            # BUG FIX: the begin/end patterns were previously passed in swapped
            # order here (end first), unlike the matching call elsewhere in the
            # file; pass them as (begin, end) so ref-tag boundaries are detected.
            lines = regex_rejoin(regexp_ref_tag_begin, regexp_ref_tag_end, lines, 'line')
        else:
            forever = False
    # str.join replaces the deprecated string.join (works in Python 2 and 3);
    # also removed a stray debug 'print' statement that was left in this loop.
    return "\n".join(lines)
def build_new_drugbox(current_parameters):
# build new drugbox template
Line 718 ⟶ 750:
# articles = codecs.open('/Users/BogBot/progs/pywikipedia/drugbox/drugbox_titles.txt', mode = 'r', encoding='utf-8')
articles = ['
for article in articles:
Line 758 ⟶ 790:
# first extract and assign nested templates commonly used in drugbox templates
parameters = assign_nested_templates(parameters, current_parameters)
# remove any embedded carriage returns from remaining templates:
parameters = remove_embedded_carriage_returns(parameters)
# next, parse each line for parameters
Line 833 ⟶ 868:
if current_parameters:
page.put(new_text, comment='populated new fields in drugbox and reordered per [[Wikipedia:Bots/Requests_for_approval/BogBot_2|bot approval]]', watchArticle = None, minorEdit =
print ", page updated"
else:
|