User:PDFbot/pdfbot.py: Difference between revisions

Browse history interactively

← Previous edit

Content deleted Content added

VisualWikitext

Revision as of 23:42, 20 May 2007 edit Dispenser (talk \| contribs) Extended confirmed users, Pending changes reviewers 33,005 edits Major addition: Interlanguage support ← Previous edit		Latest revision as of 17:34, 8 May 2022 edit undo Qwerfjkl (bot) (talk \| contribs) Bots, Mass message senders 4,087,799 edits m →top: Replaced deprecated <source> tags with <syntaxhighlight> Tag: AWB
(2 intermediate revisions by one other user not shown)
Line 1: <syntaxhighlight lang="python"> ~~<pre><nowiki>~~ #!/usr/bin/python # -- coding: utf-8 -- """ Line 21: # import re, sys~~, httplib~~, time import wikipedia, pagegenerators~~, login, config~~, catlib ~~from~~import ~~urllib2~~httplib, ~~import~~socket, urlparse import codecs try: import commonfixes except ImportError: wikipedia.output('Unable to import commonfixes') commonfixes = None try: import reflinks def my_reflink_put_page(self, page, new): self.page = page self.new_text = new reflinks.ReferencesRobot.put_page=my_reflink_put_page except ImportError: wikipedia.output('Unable to import reflinks') reflinks = None # Download this file : ~~# Define global constants~~ # http://www.twoevils.org/files/wikipedia/404-links.txt.gz ~~readDelay = 20 # seconds~~ # ( maintained by User:Marumari ) ~~writeDelay = 60 # seconds~~ listof404pages = '404-links.txt' ~~prefix = ['bytes', '[[Kibibyte\|KiB]]', '[[Mebibyte\|MiB]]', '[[Gibibyte\|GiB]]']~~ ~~urlpattern = re.compile(r'http[s]?://[^][>< \n\|]', re.IGNORECASE)~~ ~~userAgent = 'PDFbot (http://en.wikipedia.org/wiki/User:PDFbot)'~~ # Define global constants ~~# Edit summary messages~~ readDelay = 10 # seconds ~~msg_added = {~~ writeDelay = 30 # seconds ~~'de': u'BOT: hinzufuegen der Dateigroesse markiert als {{PDFlink}}',~~ mix_prefix = ('bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB') ~~'en': u'Added file size for external links tagged with {{[[Template:PDFlink\|PDFlink]]}}',~~ SI_prefix = ('bytes', '[[Kilobyte\|kB]]', '[[Megabyte\|MB]]', '[[Gigabyte\|GB]]') } IEC_prefix = ('bytes', '[[Kibibyte\|KiB]]', '[[Mebibyte\|MiB]]', '[[Gibibyte\|GiB]]') ~~msg_updated = {~~ # following char sperate url from title: []"<>\ \n ~~'de': u'BOT: Aktualisieren der Dateigroesse mit Vorlageeinbindung',~~ # {\|} is included since we're in a template ~~'en': u'Updated file size of transcluded {{[[Template:PDFlink\|PDFlink]]}}',~~ urlpattern = re.compile(r'http[s]?://[^][<>\s"{\|}]', re.IGNORECASE) } ~~msg_fixed~~httpHeader = { 'User-Agent': 'PDFbot (http://en.wikipedia.org/wiki/User:PDFbot)', ~~'de': u'BOT: Korrigierte Benutzung der Vorlage {{PDFlink}}',~~ 'Accept': 'application/pdf,application/octet-stream,/;q=0.5', ~~'en': u'Corrected usage of {{[[Template:PDFlink\|PDFlink]]}}',~~ 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,;q=0.7', } 'Keep-Alive': '30', ~~msg_removed_cite = {~~ 'Connection': 'keep-alive', ~~'de': u'BOT: Entfernen {{PDFlink}} entsprechend der zitierten Vorlage',~~ ~~'en': u'Remove {{PDFlink}} from citation template.',~~ } def checkLink(___location, ~~redirectCounter~~useHEAD = True, counter = 5): try: while ~~(redirectCounter~~counter >= 0 and ___location ~~is not None)~~: (scheme, site, path~~, args~~, query, frag) = urlparse.~~urlparse~~urlsplit(___location) query = query and '?' + query or '' path = path or '/' if scheme == "http": conn = httplib.HTTPConnection(site) elif scheme == "https": conn = httplib.HTTPSConnection(site) else: ~~#conn.set_debuglevel(1)~~ return (___location, -1, 'Unsupported Protocol', None, None) ~~conn.putrequest('HEAD', path + args + query)~~ conn.~~putheader~~set_debuglevel(~~'User-Agent', userAgent~~0) socket.setdefaulttimeout(300) ~~conn.endheaders()~~ try: request = path.encode('ascii') + query.encode('ascii') except UnicodeEncodeError: encoding = 'utf-8' noencode = '~!^()_-=&/\|,.?;' request = unicode(urllib.quote(path.encode(encoding) + query.encode(encoding), noencode)) if useHEAD: conn.request('HEAD', request, None, httpHeader) else: conn.request('GET', request, None, httpHeader) response = conn.getresponse() Line 67 ⟶ 93: content_length = response.msg.getheader('content-length') content_type = response.msg.getheader('content-type') ~~response_code = response.status~~ conn.close() ~~redirectCounter~~counter -= 1 if( redirect ~~is not None)~~: wikipedia.output( u'STATUS: HTTP %s Moved: %s to %s' % (response.status, ___location, redirect) ) ~~if(redirect[:4] != "http"):~~ if redirect.startswith("http"): ___location = urlparse.urljoin(___location, redirect) else: ___location = redirect ~~wikipedia.output( u'STATUS: HTTP %s Moved: %s' % (response_code, ___location) )~~ else: ___location = None return [(___location, ~~response_code~~response.status, response.reason, content_length, content_type]) except httplib.error, arg: wikipedia.output(u'~~HTTP Error~~ERROR: HTTP %s %s' % (arg, ___location)) return [(___location, 752, "", None, None]) except socket.timeout: return (___location, 110, 'Connection timeout', None, None) ~~wikipedia.output(u'Error with URL: %s' % ___location)~~ except socket.error, arg: ~~return [___location, 6, None, None]~~ wikipedia.output(u'ERROR: Socket %s %s' % (arg, ___location)) return (___location, arg[0], arg[1], None, None) ~~# Convert the byte count to a human readable value~~ except KeyboardInterrupt: ~~def binary_notation(size):~~ raise except Exception, e: # catches those weird ones print u'Exception raised: %s' % e return (___location, 0, "Exception %s" % e, None, None) def binary_notation(size, base = 1024., prefix = IEC_prefix): """ Convert the byte count to a human readable value """ a = float(size) exponent = 0 while( a >= 1000.): a /= ~~1024.~~base exponent += 3 # Truncate and remove trailing dot byteSigs = str(a)[:4] if (byteSigs.endswith('.')): byteSigs = byteSigs[:3] return byteSigs + ' ' + prefix[exponent / 3] # return '%3.3g %s' % (byteSigs, prefix[exponent / 3]) def fix_broken_links(~~hypertext~~link): """ ~~#This function attempts to fix multipule broken link using its dictionary~~ Returns link replacement for known broken links """ # Moving of resources ~~hypertext~~link = relink.~~sub~~replace(r'virginiadot.org/infoservice~~/resources~~/', r'virginiadot.org/info~~/resources~~/'~~, hypertext~~) link = link.replace('virginiadot.org/comtravel/', 'virginiadot.org/info/') ~~hypertext = re.sub(r'ncdot.org/transit/aviation/ncairports/locations/pdf/', r'ncdot.org/transit/aviation/download/ncairports/', hypertext)~~ link = link.replace('ncdot.org/transit/aviation/ncairports/locations/pdf/', 'ncdot.org/transit/aviation/download/ncairports/') link = link.replace('waitangi-tribunal.govt.nz/doclibrary/researchwhanui/', 'waitangi-tribunal.govt.nz/doclibrary/public/researchwhanui/') # 301 Permanent Redirects ~~hypertext~~link = relink.~~sub~~replace(r'transportation.ky.gov/planning/', r'www.planning.kytc.ky.gov/'~~, hypertext~~) ~~hypertext~~link = relink.~~sub~~replace(r'official-documents.co.uk/', r'official-documents.gov.uk/'~~, hypertext~~) link = link.replace('http://bmj.bmjjournals.com/', 'http://www.bmj.com/') link = link.replace('http://bris.ac.uk/', 'http://www.bristol.ac.uk/') link = link.replace('http://www.shef.ac.uk/socst/', 'http://www.shef.ac.uk/socstudies/') link = link.replace('http://www.sims.berkeley.edu:8000/', 'http://www2.sims.berkeley.edu/') link = link.replace('http://www.cs.wm.edu/hpcs/', 'http://www.cse.ohio-state.edu/hpcs/') link = link.replace('http://www.pchrgaza.org/', 'http://www.pchrgaza.ps/') link = link.replace('http://www.almlondon.org.uk/', 'http://www.mlalondon.org.uk/') link = link.replace('http://www.state.ma.us/eot/', 'http://www.eot.state.ma.us/') link = link.replace('http://www.aapt.org.au/', 'http://www.ausapt.org.au/') link = link.replace('http://berlin.usembassy.gov/', 'http://germany.usembassy.gov/') return ~~hypertext~~link def update_size_paramter(template_text): m = re.search(r'(?s)\{\{(?P<tpl>[^\|])\\|(1=)?(?P<text>[^\|]).?(, (?P<size>[0-9]+) byte.)?\}\}', fix_broken_links(template_text)) ~~# following char sperate url from title: []"<>\ \n~~ link_text = m.group('text') ~~# \| is included since we're in a template~~ ___location = urlpattern.search(link_text).group(0) ~~fixed_text = fix_broken_links(template_text)~~ ~~___location = urlpattern.search(fixed_text).group(0)~~ ~~prefix_text = re.search(r'(\{\{[^\|]\\|[^\|]).\}\}', fixed_text).group(1)~~ old_size = int(m.group('size') or 0) ~~if (re.findall(r'=', template_text)):~~ ~~parameter_prefix = '\|2='~~ ~~else:~~ ~~parameter_prefix = '\|'~~ parameter_prefix = '' ~~# Parse indirect HTML character references~~ if '=' in link_text: ~~___location = re.sub(r'&\#61;', r'=', ___location)~~ parameter_prefix = '2=' ~~___location = re.sub(r'&', r'&', ___location)~~ # Convert indirect HTML character references ~~(redirect, response, content_length, content_type) = checkLink(___location)~~ ___location = wikipedia.html2unicode(___location) ~~if (content_type is not None and content_length is not None and int(content_length) > 16):~~ ~~# I should really put in 404 error handling code, but this has been working just fine.~~ (redirect, response, reason, content_length, media_type) = checkLink(___location) ~~if re.findall(r'pdf\|octet-stream', content_type):~~ try: ~~return prefix_text + parameter_prefix + binary_notation(content_length) + '<!-- ' + content_type + ', ' + str(content_length) + ' bytes -->}}'~~ content_length = int(content_length) ~~else:~~ except: ~~wikipedia.output(u'Unusual content_type: %s, code: %s' % (content_type, response))~~ content_length = None ~~return template_text~~ if media_type and content_length and content_length != old_size: # I should really put in 404 error handling code, but this has been working just fine. if 'pdf' in media_type or 'octet-stream' in media_type or 'application/download' in media_type: # This was the old format using the comment # return u'{{%s\|%s\|%s%s<!-- %s, %d bytes -->}}' % (m.group('tpl'), link_text, parameter_prefix, binary_notation(content_length), content_type, content_length ) # However, comment was filled with generally non-useful information return (not (old_size == 0) or template_text.count('\|')<2, u'{{%s\|%s\|%s%s}}' % (m.group('tpl'), link_text, parameter_prefix, binary_notation(content_length, prefix = mix_prefix))) else: wikipedia.output(u'FIXME: Bad response: code: %d, type: %s, ___location: %s' % (response, media_type, ___location)) # If anything else return template_text back if old_size: return (False, u'{{%s\|%s\|%s%s}}' % (m.group('tpl'), link_text, parameter_prefix, binary_notation(old_size, prefix = mix_prefix))) else: return (False, template_text) def process_article(page): try: deadLinks = codecs.open(listof404pages, 'r', 'latin_1').read() except IOError: wikipedia.output('You need to download http://www.twoevils.org/files/wikipedia/404-links.txt.gz and to ungzip it in the same directory') raise wikipedia.output('Getting page %s' % page.aslink()) wikitext = page.get() # Fix Casing (Reduces the number of possible expressions) wikitext = re.~~compile~~sub(r'(?i)\{\{\s(template:\|)pdf', ~~re.IGNORECASE).sub(~~r'{{PDF', wikitext) wikitext = wikitext.replace('{{PDFLink', '{{PDFlink') # State point. Count any changes as needing an update if they're after this line state0 = wikitext # [http {{PDF}}] wikitext = re.sub(r'(\[\w+://[^][<>"\s]+\s[^][\n]+?)\s(\{\{(PDFlink\|PDF)\}\})', r'\2\1', wikitext) # Convert hard coded pdf links (ex: [http link] (pdf) ) wikitext = re.sub(r'(\[~~http~~\w+://[^][]\]) \((\[\[[^\|\]]\|)?\.?(PDF\|pdf) ([Ff]ile)? ([Ff]ormat)?(\]\]\|)?\)', r'{{PDFlink\|\1}}', wikitext) # Convert from the old style to the new style (ex: [http link] {{PDF}} or {{PDF}} [http link] ) wikitext = re.sub(r'"[(~~\(\|)~~]?\{\{(PDFlink\|PDF)\}\}~~(\)\|~~[)]? ((?P<quote>')\[~~http~~\w+://[^][]\](?P=quote)'?)", r'{{\21\|\42}}', wikitext) wikitext = re.sub(r'(("\|)?\[~~http~~\w+://[^]]\]("\|)?)([^a-zA-ZZ0-9()]) [(~~\(\|)~~]?\{\{(PDFlink\|PDF) \}\}~~(\)\|~~[)]?', r'{{\63\|\1}}\42', wikitext) # ~~Experimental:~~ Convert with with tag at the end of a bullet list (ex: [http link] some text ([[PDF]]) ) if '{{PDF' in wikitext: wikitext = re.compile(r'(\n\[^\n:/])(\[http[^]]\])([^\n:/]) \((\[\[\|\{\{\|)(Portable Document Format\\|PDF\|PDFlink).?(file\|format\|datei\|)(\}\}\|\]\]\|)\)', re.IGNORECASE).sub(r'\1{{PDF\|\2}}\3', wikitext) wikitext = re.~~sub~~compile(r'(\n \+[^\n:/])(\[~~http~~\w+://[^][]\])([^\n:/]) [(](\[\[\|\{\{)?(~~PDFlink~~Portable Document Format[\|]PDF\|pdflink).?(pdf.?)?(file\|format\|datei)?(\}\}\|\]\])?[)]', re.IGNORECASE).sub(r'\1{{\4PDFlink\|\2}}\3', wikitext) wikitext = re.sub(r'(\n \+[^\n:/])(\[\w+://[^][]\])([^\n:/]) \{\{(PDFlink\|PDF)\}\}', r'\1{{\4\|\2}}\3', wikitext) # Experimental: move {{PDF}} back in <ref> tag wikitext = re.sub(r'(<ref[^][{}<>]>[^][<>=]?)("?\[\w+://[^][<>\s"]+[^]\n]\]"?)([^{}<>])\{\{(PDFlink\|PDF)\}\}', r'\1{{\4\|\2}}\3', wikitext) # State point. Correction of {{PDFlink}} template genfixState = wikitext # Remove PDFlink from citation templates # {{cite \|format={{PDF}}}} wikitext = re.sub(r'(?s)(format = )(PDF\|pdf\|)?[(~~\(\|)~~]?\{\{PDF[^{}]?\}\}~~(\)\|~~[)]?', r'\1PDF', wikitext) # {{cite.?}}{{PDF}} wikitext = re.sub(r'(?s)(\{\{~~(Cite\|cite)~~ [Cc]ite[^}])(\}\}[^~~a-zA-Z~~\w() ]) [(~~\(\|)~~]?\{\{(PDF\|PDFlink)\}\}~~(\)\|~~[)]?', r'\1 \|format=PDF\32', wikitext) # {{cite \| lang= EN {{PDF}} }} wikitext = re.sub(r'(?s)(\{\{. [Cc]ite web[^}]) (\(\|)\{\{(PDF\|PDFlink)\}\}(\)\|) ([^}]\}\})', r'\1 \|format=PDF \5', wikitext) # {{PDF\| {{template...}} }} wikitext = re.sub(r'(?s)\{\{(PDFlink\|PDF)\\|\s(\{\{[^{}]+?(\\|[^{\|}]+)?\}\})[\s\|]\}\}', r'\2', wikitext) ~~state1 = wikitext~~ # {{citation\|url={{PDFlink\|...}} }} ~~m = re.findall(r'\{\{PDF[link]{0,4}\\|[^}]\}\}', wikitext)~~ wikitext = re.sub(r'(?i)\{\{(([Cc]itation\|[Cc]ite)[^{}]+?)\{\{(PDFlink\|PDF)\\|([^{}]?)(\\|[^{\|}]+)?\}\}', r'{{\1\4', wikitext) # Sate point. Removal of {{PDFlink}} in certian instances ~~for s in m:~~ state2 = wikitext ~~if re.findall(r'http[s]?://', s):~~ cleantext = wikitext ~~replacetext = update_size_paramter(s)~~ # This is ugly, since we need the comments to check the relative filesize ~~wikitext = re.sub(re.escape(s), replacetext, wikitext)~~ for m in re.finditer(r'<!--.?-->\|<nowiki[^>]>.?</nowiki>', cleantext): if '{{PDF' in m.group(): cleantext = cleantext.replace(m.group(), '') sizechange = 0 for m in re.finditer(r'\{\{(?:PDFlink\|PDF)\\|[^{}]+?\}\}', cleantext): if 'http://' in m.group() or 'https://' in m.group(): (changed, replacetext) = update_size_paramter(m.group()) sizechange += changed and 1 or 0 # print "update page? %s"%(sizechange, ) wikitext = wikitext.replace(m.group(), replacetext) # Uncomment the bellow line to see the replacement text # wikipedia.output(u'OUTPUT: %s' % replacetext) ~~# Fix equal sign problem~~ ~~wikitext = re.sub(r'\{\{(PDF\|PDFlink)\\|(1=\|)(.{2}[^{\|}]+=[^{\|}]+)', r'{{\1\|1=\3', wikitext)~~ for s in re.findall(ur'(?ui)\{\{(?:cite[\w\s]+)\\|[^{}]+?\}\}', cleantext): ~~# Test to see if file sizes parameter was untouched~~ murl = re.search('\\|\surl\s=\s(?P<url>http[s]?://[^][<>"\s\|]+)(\\|\|}})', s) ~~if wikitext == state1:~~ if murl and 'PDF' in murl.group().upper() and (not re.search(ur'\\|\sformat\s=\s[^\s{\|}]+', s) or not re.search(ur'\\|\s(access\w+)\s=\s([^{\|}]+?)\s(?=[{\|}])', s)) and not re.search(ur'\\|\sarchiveurl\s=\s[^\s{\|}]+', s): ~~if len(wikitext) - len(state1) <= 4:~~ repl_url = fix_broken_links(murl.group('url')) ~~# 4 or more bytes removed typically indicate a embed citation removal~~ (redirect, response, reason, content_length, media_type) = checkLink(repl_url) ~~EditMsg = msg_removed_cite~~ # media_type not given if not media_type: continue # Gone/Not Found error code elif (response == 410 or (response == 404 and (u'\t%s\t' % murl.group(1) in deadLinks))) and repl_url == murl.group('url'): wikitext = wikitext.replace(s, s + time.strftime("{{dead link\|bot=PDFbot\|date=%B %Y}}")) # valid PDF # python2.6code: any(item in media_type.lower() for item in ('pdf', 'octet-stream')) elif 'pdf' in media_type.lower() or 'octet-stream' in media_type.lower(): replacetext = s replacetext = replacetext.replace(murl.group(), murl.group().replace(murl.group('url'), repl_url)) if re.search(ur'\\|\sformat\s=\s[^{\|}][\|}]', replacetext): # fill in the format= replacetext = re.sub(r'(\\|\sformat\s= ??)(\n* [{\|}])', r'\1PDF\2', replacetext) else: # add format=PDF (third last parameter) replacetext = re.sub(r'(\{\{[^{}]+?)((\s\\|\s)[^[=\]{\|}]+(\s= )[^{\|}]+)(\s\\|[^{\|}]+)\}\}', r'\1\3format\4PDF\2\5}}', replacetext) accessed = re.search(ur'\\|\s(access\w+)\s=\s[^{\|}\s]+', replacetext) # no access-anything filled in, add/fill accessdate if not accessed: # fill out accessdate if it exists replacetext = re.sub(r'(\\|\saccessdate\s= ??)(?=\n [{\|}])', time.strftime(r'\g<1>%Y-%m-%d'), replacetext) # if template doesn't contain accessdate then add it (last parameter) if not re.search(r'\\|\saccessdate\s=', replacetext): replacetext = re.sub(r'(\{\{[^{}]+?)((\s\\|\s)[^[=\]{\|}]+?(\s= )[^{\|}]+?)(\s)\}\}', time.strftime(r'\1\2\3accessdate\g<4>%Y-%m-%d\5}}'), replacetext) #replacetext = re.sub(r'(\{\{[^{}]+?)((\s\\|\s)[^[=\]{\|}]+(\s= )[^{\|}]+)(\s\\|[^{\|}]+)\}\}', time.strftime(r'\1\2\5\3accessdate\g<4>%Y-%m-%d}}'), replacetext) # put back in wikitext = wikitext.replace(s, replacetext) sizechange += 1 # Uncomment the bellow line to see the replacement text wikipedia.output(u'OUTPUT: %s' % replacetext) # remove duplicate {{dead link}} dead_templates = r'[Dd]ead[ _]link\|[Dd]l\|[Dd]l-s\|404\|[Bb]roken[ _]+link\|[Cc]leanup-link' wikitext = re.sub('(\{\{(?:%s)[^}]?\}\})+((</ref>)?\{\{(?:%s)[^}]?\}\})'%(dead_templates, dead_templates), r'\2', wikitext) # Figure out an edit message of what we did if sizechange: if state2 != state0: EditMsg = "Updating %d PDF%s and fixes" % (sizechange, sizechange>1 and 's' or '') else: EditMsg = "Updating %d PDF%s" % (sizechange, sizechange>1 and 's' or '') ~~EditMsg = msg_fixed~~ else: # state0: renamed templates ~~if len(wikitext) - len(state1) > 34:~~ # genfix: fixPDFlink ~~# Minimum of 34 bytes to add file size information~~ # state2: removePDFlink ~~EditMsg = msg_added~~ ~~else~~#wikitext: - EditMsg = ~~msg_updated~~"General fixes for PDFs" if wikitext == state0: ~~wikipedia.setAction(wikipedia.translate(page.site().language(), EditMsg))~~ pass # text stayed the same elif wikitext == genfixState: EditMsg = "Correct {{PDFlink}} syntax" elif wikitext == state2: if genfixState == state0: # no fixes EditMsg = "Remove incorrect {{PDFlink}}" else: #fixes+removal pass wikipedia.setAction(EditMsg) updateSizes = wikitext # Fix equal sign problem # moved here to avoid changing edit message wikitext = re.sub(r'\{\{(PDF\|PDFlink)\\|(1=\|)(.{2}[^{\|}]+=[^{\|}]+)', r'{{\1\|1=\3', wikitext) # altert me if the page does not contains {{pdflink\|no-link}} if re.search(r'\{\{PDF(link\|)\\|[^:]+\}\}', wikitext): wikipedia.output(u'FIXME: No link in {{PDFlink}} on %s' % page.aslink()) # If the text has changed at all since the state point, upload it if (wikitext != state0 and sizechange) or state2 != state0 or updateSizes != wikitext: wikipedia.output('PDFs updated: % 3d' % sizechange) # [[pdf]] -> [[PDF]] wikitext = re.sub(r'\[\[pdf(?=[\|\]])', '[[PDF', wikitext) # {{cite \| format = pdf }} wikitext = re.sub(r'(?s)(?:([\|]\sformat\s=\s)(?:\[\[\|)[Pp][Dd][Ff](?:\]\]\|))+(\s[{\|}])', r'\1PDF\2', wikitext) # To many to just fix when we come across, so we don't count it with the fixes # Unlink PDF in format parameters wikitext = re.sub(r'(?i)(\\|\sformat\s=\s)\[\[(adobe\|portable\|document\|file\|format\|pdf\|\.\|\s\|\(\|\)\|\\|)+\]\]', r'\1PDF', wikitext) wikitext = re.sub(r'(?i)(\\|\sformat\s=\s)(\s\.?(adobe\|portable\|document\|file\|format\|pdf\|\(\|\)))+?(?=\s[\|}])', r'\1PDF', wikitext) # Apply common fixes if avalible if commonfixes: wikitext = commonfixes.fix(page, text=wikitext) # Apply reflink if avalible if reflinks: # Hackist hook page._contents = wikitext if page.get() != wikitext: wikipedia.output("Injected text wasn't returned with page.get()") elif reflinks.linksInRef.search(wikitext): reflinksbot = reflinks.ReferencesRobot(iter([page])) reflinksbot.run() if hasattr(reflinksbot, 'new_text'): if reflinksbot.page != page:raise 'pages not the same' wikitext = reflinksbot.new_text # Reset edit summary wikipedia.setAction(EditMsg) try: wikipedia.output(u'~~Page~~WRITE: Delta ~~change~~length byof %s 3d bytes~~. Writing new version~~.' % ~~str~~(len(wikitext)-len(state0))) page.put(wikitext) except Exception, e: wikipedia.output(u'~~-------~~ERROR: Except ~~Write~~%s ~~error~~raised ~~------~~while writing.' % e) # Pause to reduce load on the servers time.sleep(writeDelay) else: wikipedia.put_throttle() time.sleep(readDelay) pass def main(): site = wikipedia.getSite() gen = None namespaces = [0] for arg in wikipedia.handleArgs(): Line 226 ⟶ 408: page = wikipedia.Page(wikipedia.getSite(), unicode(arg[6:])) gen = iter([page]) elif arg.startswith('-ns:'): namespaces.append(int(arg[11:])) elif arg.startswith('-delay:'): global readDelay, writeDelay readDelay = int(arg[7:]) writeDelay = int(arg[7:]) if not gen ~~is None~~: wikipedia.showHelp(u'pdfbot') return wikipedia.output(u'Delays are %s s for read and %s for writes' % (readDelay, writeDelay,) ) if namespaces != []: ~~wikipedia.output(u'Read delay is %s seconds.' % readDelay)~~ gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces) ~~wikipedia.output(u'Write delay is %s seconds.\n' % writeDelay)~~ ~~# Only process pages from the main namespace~~ ~~gen = pagegenerators.NamespaceFilterPageGenerator(gen, [0])~~ gen = pagegenerators.RedirectFilterPageGenerator(gen) for page in gen: if page.site().messages: wikipedia.output(u'Messages left on talk page, halting.') return process_article(page) wikipedia.output(u'~~\nOperation~~Finished ~~Complete.\n~~updating') if __name__ == "__main__": Line 248 ⟶ 438: finally: wikipedia.stopme() </syntaxhighlight> ~~</nowiki></pre>~~