User:PDFbot/pdfbot.py: Difference between revisions

Content deleted Content added
code sync, big feature: Dead external link repair
Major addition: Interlanguage support
Line 13:
-cat: Update pages from the given category.
-links: Update pages linked from a given page.
-page: Update that page.
 
"""
 
#
# (c) Dispenser, 2007
#
 
import re, sys, httplib, time
Line 20 ⟶ 25:
from urllib2 import urlparse
 
# Define global constants
readDelay = 20 # seconds
writeDelay = 60 # seconds
httpDebug = 0
prefix = ['bytes', '[[Kibibyte|KiB]]', '[[Mebibyte|MiB]]', '[[Gibibyte|GiB]]']
urlpattern = re.compile(r'http[s]?://[^][>< \n|]*', re.IGNORECASE)
userAgent = 'PDFbot (http://en.wikipedia.org/wiki/User:PDFbot)'
 
# Edit summary messages
def checkLink(___location):
msg_added = {
redirectCounter = 6
'de': u'BOT: hinzufuegen der Dateigroesse markiert als {{PDFlink}}',
'en': u'Added file size for external links tagged with {{[[Template:PDFlink|PDFlink]]}}',
}
msg_updated = {
'de': u'BOT: Aktualisieren der Dateigroesse mit Vorlageeinbindung',
'en': u'Updated file size of transcluded {{[[Template:PDFlink|PDFlink]]}}',
}
msg_fixed = {
'de': u'BOT: Korrigierte Benutzung der Vorlage {{PDFlink}}',
'en': u'Corrected usage of {{[[Template:PDFlink|PDFlink]]}}',
}
msg_removed_cite = {
'de': u'BOT: Entfernen {{PDFlink}} entsprechend der zitierten Vorlage',
'en': u'Remove {{PDFlink}} from citation template.',
}
 
def checkLink(___location, redirectCounter = 5):
try:
while (redirectCounter > 0 and ___location is not None):
(scheme, site, path, args, query, frag) = urlparse.urlparse(___location)
if scheme == "http":
path = path + args + query
conn = httplib.HTTPConnection(site)
elif scheme == "https":
conn.set_debuglevel(httpDebug)
conn = httplib.HTTPSConnection(site)
#conn.set_debuglevel(1)
conn.putrequest('HEAD', path + args + query)
conn.putheader('User-Agent', userAgent)
conn.endheaders()
Line 51 ⟶ 76:
else:
___location = redirect
wikipedia.output( u'STATUS: HTTP (%s) Moved: %s' % (response_code, ___location) )
else:
___location = None
return ( [___location, response_code, content_length, content_type] )
except httplib.error, arg:
wikipedia.output(u'HTTP Error: %s %s' % (arg, ___location))
return [___location, 7, None, None]
except:
wikipedia.output(u'Error with URL: %s' % ___location)
return ( [___location, 66, None, None] )
 
# Convert the byte count to a human readable value
Line 66 ⟶ 94:
a /= 1024.
exponent += 3
prefix = ['bytes', '[[Kibibyte|KiB]]', '[[Mebibyte|MiB]]', '[[Gibibyte|GiB]]']
 
# Truncate and remove trailing dot
Line 88 ⟶ 115:
 
def update_size_paramter(template_text):
# following chars separate url from title: []"<>\ \n
# | is included since we're in a template
fixed_text = fix_broken_links(template_text)
 
___location = urlpattern.search(fixed_text).group(0)
prefix_text = re.search(r'(\{\{[^|]*\|[^|}]*)[^}].*\}\}', fixed_text).group(1)
if (re.findall(r'=', template_text)):
Line 106 ⟶ 135:
# I should really put in 404 error handling code, but this has been working just fine.
if re.findall(r'pdf|octet-stream', content_type):
return prefix_text + parameter_prefix + binary_notation(content_length) + '<!-- ' + content_type + ', ' + str(content_length) + ' bytes -->}}'
else:
wikipedia.output(u'Unusual content_type: %s, code: %s' % (content_type, response))
Line 116 ⟶ 145:
# Fix Casing (Reduces the number of possible expressions)
wikitext = re.compile(r'\{\{\s*(Template:|template:|)(PDF|Pdf|pdf)', re.IGNORECASE).sub(r'{{PDF', wikitext)
# State point. Count any changes as needing an update if they're after this line
Line 129 ⟶ 158:
# Experimental: Convert with with tag at the end of a bullet list (ex: * [http link] some text ([[PDF]]) )
wikitext = re.compile(r'(\n\*[^\n:/]*)(\[http[^]]*\])([^\n:/]*) *\((\[\[|\{\{|)(Portable Document Format\|PDF|PDFlink).?(file|format|datei|pdf)(\}\}|\]\]|)\)', re.IGNORECASE).sub(r'\1{{PDF|\2}}\3', wikitext)
wikitext = re.sub(r'(\n\*[^\n:/]*)(\[http[^]]*\])([^\n:/]*) *\{\{(PDFlink|PDF)\}\}', r'\1{{\4|\2}}\3', wikitext)
# Remove PDFlink from citation templates
wikitext = re.sub(r'(format *= *)(PDF|pdf|)(\(|)\{\{PDF(|link)[^{}]*\}\}(\)|)', r'\1PDF', wikitext)
wikitext = re.sub(r'(\{\{(Cite|cite)[^}]*)(}}[^a-zA-Z]*)(\(|)\{\{(PDF|PDFlink)\}\}(\)|)', r'\1 |format=PDF\3', wikitext)
wikitext = re.sub(r'(\{\{.ite web[^}]*) *(\(|)\{\{(PDF|PDFlink)\}\}(\)|) *([^}]*\}\})', r'\1 |format=PDF \5', wikitext)
state1 = wikitext
m = re.findall(r'\{\{(PDF|PDFlink)\|[^}]*\}\}', wikitext)
for s in m:
if re.findall(r'http[s]?://', s):
replacetext = update_size_paramter(s)
wikitext = re.sub(re.escape(s), replacetext, wikitext)
Line 148 ⟶ 177:
# Fix equal sign problem
wikitext = re.sub(r'\{\{(PDF|PDFlink)\|(1=|)(.{2}[^{|}]+=[^{|}]+)', r'{{\1|1=\3', wikitext)
# Test to see if file sizes parameter was untouched
if wikitext == state1:
if len(wikitext) - len(state1) <= 4:
# Nothing was done with the embedded file sizes string
# 4 or more bytes removed typically indicate a embed citation removal
EditMsg = 'Corrected use of {{[[Template:PDFlink|PDFlink]]}}'
EditMsg = msg_removed_cite
else:
EditMsg = msg_fixed
else:
if len(wikitext) - len(state1) > 34:
# Minimum of 34 bytes to add file size information
EditMsg = msg_added
EditMsg = 'Added file size for external links tagged with {{[[Template:PDFlink|PDFlink]]}}'
else:
EditMsg = msg_updated
EditMsg = 'Updated file size of transcluded {{[[Template:PDFlink|PDFlink]]}}'
wikipedia.setAction(wikipedia.translate(page.site().language(), EditMsg))
# If the text has changed at all since the state point, upload it
if (wikitext != state0):
Line 176 ⟶ 209:
def main():
site = wikipedia.getSite()
gen = None
for arg in wikipedia.handleArgs():
Line 189 ⟶ 223:
pagelinks = wikipedia.Page(wikipedia.getSite(), arg[7:])
gen = pagegenerators.LinkedPageGenerator(pagelinks)
elif arg.startswith('-page:'):
else:
page = wikipedia.Page(wikipedia.getSite(), unicode(arg[6:]))
gen = iter([page])
return
 
if gen is None:
wikipedia.showHelp(u'pdfbot')
return
wikipedia.output(u'Read delay is %s seconds.' % readDelay)
Line 197 ⟶ 235:
# Only process pages from the main namespace
gen = pagegenerators.NamespaceFilterPageGenerator(gen, [0])
gen = pagegenerators.RedirectFilterPageGenerator(gen)
for page in gen: