Content deleted Content added
+ more robust parsing of citation templates |
+ remove_embedded_carriage_returns inside nested templates and citation markers |
||
Line 180:
# named ref tag = <ref name="fattinger2000"> but not <ref name="fattinger2000" />
regexp_ref_tag_begin = re.compile(r"(<ref>)|(<ref name.*?^/>)")
regexp_ref_tag_end = re.compile(r"<
def Allowbots(text):
Line 304:
def rejoin(begin, end, sub_strings, type):
# make sure we split only on the pipes that represent ends of the infobox entry, not the pipes used in links, nested templates, citations, etc.
# adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/
Line 311:
for sub_string in sub_strings[1:]:
if (end in sub_string) and ((not begin in sub_string) or sub_string.find(end) < sub_string.find(begin)):
new_list[-1] = new_list[-1] + sub_string
if type == "parameter":
new_list[-1] = new_list[-1] + '|' + sub_string
else:
new_list.append(sub_string)
sub_strings = new_list
return sub_strings
def test_disjoint(begin,end,sub_strings):
disjoint = False
Line 325 ⟶ 328:
return disjoint
def regex_rejoin(regexp_begin, regexp_end, sub_strings, type):
# make sure we split only on the pipes that represent ends of the infobox entry, not the pipes used in links, nested templates, citations, etc.
# adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/
Line 341 ⟶ 344:
n_end = len(match)
if ((end and not begin) or n_end < n_begin):
new_list[-1] = new_list[-1] + sub_string
if type == "parameter":
new_list[-1] = new_list[-1] + '|' + sub_string
else:
new_list.append(sub_string)
Line 373 ⟶ 379:
# make sure we split only on the pipes that represent ends of the infobox entry, not the pipes used in links
sub_strings = rejoin('[[',']]',sub_strings, 'parameter')
# do the same for nested templates
Line 380 ⟶ 386:
if test_disjoint('{{','}}',sub_strings):
forever = True
sub_strings = rejoin('{{','}}',sub_strings, 'parameter')
else:
forever = False
Line 389 ⟶ 395:
if regex_test_disjoint(regexp_ref_tag_begin, regexp_ref_tag_end, sub_strings):
forever = True
sub_strings = regex_rejoin(regexp_ref_tag_begin, regexp_ref_tag_end, sub_strings, 'parameter')
else:
forever = False
Line 415 ⟶ 421:
return
def remove_embedded_carriage_returns(parameters):
    # Remove embedded carriage returns from templates: rejoin lines that were
    # split inside nested {{...}} templates or <ref ...>...</ref> citation
    # markers, so that each template parameter ends up on a single line.
    #
    # parameters: str - raw template parameter text, possibly spanning many lines
    # returns:    str - the same text with newlines inside nested templates and
    #             ref tags removed (lines re-joined with "\n")
    lines = parameters.splitlines()
    # Keep merging lines until every '{{' has its matching '}}' on the same line.
    forever = True
    while forever:
        if test_disjoint('{{', '}}', lines):
            forever = True
            lines = rejoin('{{', '}}', lines, 'line')
        else:
            forever = False
    # Do the same for <ref ...> ... </ref> citation markers.
    forever = True
    while forever:
        if regex_test_disjoint(regexp_ref_tag_begin, regexp_ref_tag_end, lines):
            forever = True
            # BUG FIX: the begin/end patterns were previously passed in swapped
            # order here (end first), unlike the matching call elsewhere in the
            # file; pass them as (begin, end) so ref-tag boundaries are detected.
            lines = regex_rejoin(regexp_ref_tag_begin, regexp_ref_tag_end, lines, 'line')
        else:
            forever = False
    # str.join replaces the deprecated string.join (works in Python 2 and 3);
    # also removed a stray debug 'print' statement that was left in this loop.
    return "\n".join(lines)
def build_new_drugbox(current_parameters):
# build new drugbox template
Line 718 ⟶ 750:
# articles = codecs.open('/Users/BogBot/progs/pywikipedia/drugbox/drugbox_titles.txt', mode = 'r', encoding='utf-8')
articles = ['
for article in articles:
Line 758 ⟶ 790:
# first extract and assign nested templates commonly used in drugbox templates
parameters = assign_nested_templates(parameters, current_parameters)
# remove any embedded carriage returns from remaining templates:
parameters = remove_embedded_carriage_returns(parameters)
# next, parse each line for parameters
Line 833 ⟶ 868:
if current_parameters:
page.put(new_text, comment='populated new fields in drugbox and reordered per [[Wikipedia:Bots/Requests_for_approval/BogBot_2|bot approval]]', watchArticle = None, minorEdit =
print ", page updated"
else:
|