#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This script can be used to update links transcluded using the {{PDFlink}}
template: it normalizes the various {{PDF}}/{{PDFlink}} usages in an article
and fills in the linked document's file size (fetched via an HTTP HEAD
request) as the template's second parameter.

Syntax: python pdfbot.py [-ref: TemplateName]

Command line options:

-file:  Update article pages listed in a text file.
-ref:   Update article pages transcluding from a given page.
-cat:   Update article pages from the given category.
"""

import re, sys, httplib, time
import wikipedia, pagegenerators, login, config, catlib
from urllib2 import urlparse

# Define global variables
writeDelay = 60   # seconds to sleep after a page write (server courtesy)
readDelay = 15    # seconds to sleep after a read-only pass
httpDebug = 0     # passed to httplib's set_debuglevel()
userAgent = 'PDFbot (http://en.wikipedia.org/wiki/User:PDFbot)'


def whichURL(___location):
    """Issue HTTP HEAD requests for *___location*, following up to 6 redirects.

    Returns [site, path, content_length, content_type] from the final
    response; the length/type entries are raw header strings (or None when
    the header is absent).  On any failure returns [None, None, None, None].

    NOTE(review): relative redirect Locations are not resolved against the
    previous URL — urlparse would yield an empty host.  Absolute-URL
    redirects (the common case) work fine.
    """
    redirectCounter = 6
    try:
        while redirectCounter > 0 and ___location is not None:
            (scheme, site, path, args, query, frag) = urlparse.urlparse(___location)
            path = path + args + query
            conn = httplib.HTTPConnection(site)
            conn.set_debuglevel(httpDebug)
            conn.putrequest('HEAD', path)
            conn.putheader('User-Agent', userAgent)
            conn.endheaders()
            response = conn.getresponse()
            # A Location header means we were redirected; loop again.
            ___location = response.msg.getheader('___location')
            redirectCounter -= 1
            if redirectCounter > 0 and ___location is not None:
                conn.close()
                wikipedia.output(u'Redirecting to %s' % ___location)
        content_length = response.msg.getheader('content-length')
        content_type = response.msg.getheader('content-type')
        response_code = response.status
        conn.close()
        return [site, path, content_length, content_type]
    except Exception:
        # Narrowed from a bare "except:"; any network/parse error is treated
        # as "size unknown" so the article text is left untouched.
        wikipedia.output(u'Error with URL')
        return [None, None, None, None]


def binary_notation(size):
    """Convert a byte count to a human-readable value with a wiki-linked
    binary prefix, e.g. 2048 -> '2 [[Kibibyte|KiB]]'.

    The value is truncated to at most 4 characters (trailing '.' removed).
    """
    a = float(size)
    exponent = 0
    prefix = ['bytes', '[[Kibibyte|KiB]]', '[[Mebibyte|MiB]]', '[[Gibibyte|GiB]]']
    # Clamp to the last prefix so huge sizes cannot index past the table.
    while a >= 1000. and exponent // 3 < len(prefix) - 1:
        a /= 1024.
        exponent += 3
    # Truncate and remove trailing dot
    byteSigs = str(a)[:4]
    if byteSigs.endswith('.'):
        byteSigs = byteSigs[:3]
    # Integer division (//) keeps this correct regardless of true-division.
    return byteSigs + ' ' + prefix[exponent // 3]


def update_size_paramter(template_text):
    """Given one {{PDF...|url...}} transclusion, return the template with a
    size parameter appended (plus an HTML comment recording content type and
    exact byte count).  Returns *template_text* unchanged when the size
    cannot be determined or the content type does not look like a PDF.
    """
    ___location = re.search(r'(http[^] |}]*)', template_text).group(1)
    prefix_text = re.search(r'(\{\{[^|]*\|[^|}]*)[^}]*\}\}', template_text).group(1)
    if re.findall(r'=', template_text):
        # A literal '=' forces positional parameters to be numbered.
        parameter_prefix = '|2='
    else:
        parameter_prefix = '|'

    # Fix indirect HTML character references.
    # (Was a no-op r'&' -> r'&'; the intended entity is '&amp;'.)
    ___location = re.sub(r'&#61;', r'=', ___location)
    ___location = re.sub(r'&amp;', r'&', ___location)

    (site, path, content_length, content_type) = whichURL(___location)

    if content_length is not None and int(content_length) > 16:
        # I should really put in 404 error handling code, but this has been
        # working just fine.
        if re.findall(r'pdf|octet-stream', content_type):
            return prefix_text + parameter_prefix + binary_notation(content_length) \
                + '<!-- ' + content_type + ', ' + str(content_length) + " bytes -->}}"
        else:
            wikipedia.output(u'Unusual content_type: ' + content_type)
    # If anything else return the template_text back
    return template_text


def process_article(page):
    """Normalize PDF link markup on *page* and upload if anything changed.

    Sleeps writeDelay after a write, readDelay otherwise, to limit load.
    """
    wikitext = page.get()

    # Fix Casing (Reduces the number of possible expressions)
    wikitext = re.sub(r'\{\{ *(Template:|template:|)(PDF|Pdf|pdf)', r'{{PDF', wikitext)

    # State point.  Count any changes as needing an update if they're after
    # this line.
    state0 = wikitext

    # Convert hard coded pdf links (ex: [http link] (pdf) )
    wikitext = re.sub(r'(\[http[^]]*\]) *\((\[\[[^|\]]*|)(PDF|pdf)(\]\]|)\)', r'{{PDFlink|\1}}', wikitext)

    # Convert from the old style to the new style
    # (ex: [http link] {{PDF}} or {{PDF}} [http link] )
    wikitext = re.sub(r'(\(|)\{\{(PDFlink|PDF)\}\}(\)|) *(\[http[^]]*\])', r'{{\2|\4}}', wikitext)
    wikitext = re.sub(r'("|)(\[http[^]]*\])("|)([^a-zA-Z(]*) *(\(|)\{\{(PDFlink|PDF)\}\}(\)|)', r'{{\6|\2}}\4', wikitext)

    # Remove PDFlink from citation templates
    wikitext = re.sub(r'(format *= *)(\(|)\{\{PDF(|link)\}\}(\(|)', r'\1PDF', wikitext)
    wikitext = re.sub(r'(\{\{(Cite|cite)[^}]*)(}}[^a-zA-Z]*)(\(|)\{\{(PDF|PDFlink)\}\}(\)|)', r'\1 |format=PDF\3', wikitext)
    wikitext = re.sub(r'(\{\{.ite web[^}]*) *(\(|)\{\{(PDF|PDFlink)\}\}(\)|) *([^}]*\}\})', r'\1 |format=PDF \5', wikitext)

    # Fix equal sign problem
    wikitext = re.sub(r'(PDF|PDFlink)\|(1=|)([^{|}]*=[^{|}]*)', r'\1|1=\3', wikitext)

    state1 = wikitext

    m = re.findall(r'\{\{PDF[^|}]*\|[^}]*\}\}', wikitext)
    for s in m:
        if re.findall(r'http', s):
            replacetext = update_size_paramter(s)
            # Uncomment the below line to see the replacement text
            # wikipedia.output(replacetext)
            # Literal substitution: str.replace avoids re.sub interpreting
            # backslash escapes that may appear in URLs.
            wikitext = wikitext.replace(s, replacetext)

    if wikitext == state1:
        EditMsg = 'Corrected use of {{[[Template:PDFlink|PDFlink]]}}'
    else:
        EditMsg = 'Updating filesize for external links tagged with {{[[Template:PDFlink|PDFlink]]}}'
    wikipedia.setAction(EditMsg)

    # If the text has changed at all since the state point, upload it
    if wikitext != state0:
        wikipedia.output(u'Page change by %s bytes. Writing new version.' % str(len(wikitext) - len(state0)))
        page.put(wikitext)
        # Pause to reduce load on the servers
        time.sleep(writeDelay)
    else:
        time.sleep(readDelay)


def main():
    """Parse command line options, build a page generator, and process each
    article-namespace page it yields."""
    site = wikipedia.getSite()
    gen = None
    for arg in wikipedia.handleArgs():
        if arg.startswith('-ref:'):
            referredPage = wikipedia.Page(site, arg[5:])
            gen = pagegenerators.ReferringPageGenerator(referredPage)
        elif arg.startswith('-file:'):
            gen = pagegenerators.TextfilePageGenerator(arg[6:])
        elif arg.startswith('-cat:'):
            cat = catlib.Category(site, arg[5:])
            gen = pagegenerators.CategorizedPageGenerator(cat)
        else:
            wikipedia.showHelp(u'pdfbot')
            return
    if gen is None:
        # No generator option given: avoid a NameError on the loop below.
        wikipedia.showHelp(u'pdfbot')
        return

    wikipedia.output(u'Read delay is %s seconds.' % readDelay)
    wikipedia.output(u'Write delay is %s seconds.\n' % writeDelay)

    for page in gen:
        # Skip non-article namespaces.
        if not re.findall(r'(User|Wikipedia|Image|MediaWiki|Template|Help|Category|Portal|Talk)(| talk):', page.title()):
            process_article(page)


if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()