#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This script can be used to update links transcluded using the {{PDFlink}} template.

Syntax: python pdfbot.py [-ref: TemplateName]

Command line options:

-file:  Update all pages listed in a text file.
-ref:   Update all pages transcluding from a given page.
"""

import re, sys, httplib, time
import wikipedia, pagegenerators, login, config, codecs
from urllib2 import urlparse


def whichURL(___location):
    """Issue HEAD requests for *___location*, following up to 8 redirects.

    Returns [site, path, content_length, content_type] for the final
    response (header values are strings or None), or
    [None, None, None, None] if anything goes wrong.
    """
    redirectCounter = 8
    try:
        while (redirectCounter > 0 and ___location is not None):
            (scheme, site, path, args, query, frag) = urlparse.urlparse(___location)
            # Re-assemble everything after the host so the HEAD request
            # targets the full resource, not just the bare path.
            path = path + args + query
            conn = httplib.HTTPConnection(site)
            conn.set_debuglevel(0)
            conn.putrequest('HEAD', path)
            conn.putheader('User-Agent', 'PDFbot (http://en.wikipedia.org/wiki/User:PDFbot)')
            conn.endheaders()
            response = conn.getresponse()
            ___location = response.msg.getheader('___location')
            redirectCounter -= 1
            if (___location is not None):
                conn.close()
                wikipedia.output(u'Redirecting to %s' % ___location)
        content_length = response.msg.getheader('content-length')
        content_type = response.msg.getheader('content-type')
        conn.close()
        return ([site, path, content_length, content_type])
    except Exception:
        # Network failures, malformed URLs, etc. — report and signal
        # "unknown" so the caller leaves the template untouched.
        wikipedia.output(u'Error with URL')
        return ([None, None, None, None])


# Convert the byte count to a human readable value
def binary_notation(size):
    """Return *size* (a byte count) as e.g. '123 KiB' or '1.20 MiB'."""
    a = float(size)
    exponent = 0
    # Bound the exponent so the prefix lookup below can never run off
    # the end of the table for absurdly large content lengths.
    while a > 1000. and exponent < 9:
        a /= 1024.
        exponent += 3
    prefix = ['bytes', 'KiB', 'MiB', 'GiB']
    # Truncate to 4 significant characters and remove a trailing dot
    byteSigs = str(a)[:4]
    if (byteSigs.endswith('.')):
        byteSigs = byteSigs[:3]
    return (byteSigs + ' ' + prefix[exponent / 3])


def update_size_paramter(template_text):
    """Given one {{PDFlink|...}} transclusion, fetch the linked file's
    size and return the template rewritten with a size parameter plus an
    HTML comment recording the content type and exact byte count.
    Returns *template_text* unchanged if the URL cannot be resolved.
    """
    ___location = re.search(r'\[([^ }]*) ', template_text).group(1)
    prefix_text = re.search(r'(\{\{[^|]*\|[^|}]*)[^}]*\}\}', template_text).group(1)
    if (re.findall(r'=', template_text)):
        # A '=' anywhere in the template forces the explicit |2= form,
        # otherwise MediaWiki would misparse the positional parameter.
        parameter_prefix = '|2='
    else:
        parameter_prefix = '|'

    # Fix indirect HTML character references
    ___location = re.sub(r'&#61;', r'=', ___location)
    ___location = re.sub(r'&amp;', r'&', ___location)

    if (___location.lower()[:4] == 'http'):
        (site, path, content_length, content_type) = whichURL(___location)
        if (content_length is not None):
            return prefix_text + parameter_prefix + binary_notation(content_length) \
                + '<!-- ' + content_type + ', ' + str(content_length) + " bytes -->}}"
    return template_text


def process_article(site, pageName):
    """Load *pageName*, normalise every {{PDFlink}} usage, refresh the
    recorded file sizes, and (when enabled) save the page."""
    page = wikipedia.Page(site, pageName)
    wikitext = page.get()

    # Fix Casing
    wikitext = re.sub(r'\{\{ *(PDF|Pdf|pdf)', r'{{PDF', wikitext)

    # Count any changes as needing an update if they're after this line
    startText = wikitext

    # Convert from the old style to the new style
    wikitext = re.sub(r'(\(|)\{\{(PDFlink|PDF)\}\}(\)|) *(\[http[^]]*\])', r'{{\2|\4}}', wikitext)
    wikitext = re.sub(r'(\[http[^]]*\]) *(\(|)\{\{(PDFlink|PDF)\}\}(\)|)', r'{{\3|\1}}', wikitext)

    # Remove PDFlink for citation Templates
    wikitext = re.sub(r'(format *= *)(\(|)\{\{PDF(|link)\}\}(\(|)', r'\1PDF', wikitext)

    m = re.findall(r'(\{\{PDF[^|}]*\|[^}]*\}\})', wikitext)
    # re.findall returns a (possibly empty) list, never None.
    if not m:
        wikipedia.output(u"Error: Template:PDFlink not found.")
        return

    for s in m:
        if (re.findall(r'\[http', s)):
            replacetext = update_size_paramter(s)
            wikitext = re.sub(re.escape(s), replacetext, wikitext)
            # Uncomment the below line to see the replacement text
            # print replacetext.encode('ascii', 'replace')

    sizeChange = len(wikitext) - len(startText)
    if (sizeChange > 0):
        EditMsg = 'Corrected use of {{[[Template:PDFlink|PDFlink]]}}'
    else:
        EditMsg = 'Updating filesize for external links tagged with {{[[Template:PDFlink|PDFlink]]}}'
    wikipedia.setAction(EditMsg)

    # If the text has changed at all since, upload the new version
    if (startText != wikitext):
        wikipedia.output(u'Uploading updated version. Delta byte count: %s' % str(sizeChange))
        # page.put(wikitext)


def serverlist(site, pageName):
    """Return the list of pages transcluding *pageName* as a template."""
    s = wikipedia.Page(site, unicode(pageName))
    return [page for page in s.getReferences(onlyTemplateInclusion=True)]


def main():
    site = wikipedia.getSite()
    args = wikipedia.handleArgs()
    timer = 1  # Minutes

    # Guard against being invoked with no arguments at all.
    arg = args[0] if args else ''
    if (arg.startswith('-ref:')):
        worklist = serverlist(site, arg[len('-ref:'):])
    elif (arg.startswith('-file:')):
        worklist = pagegenerators.TextfilePageGenerator(arg[len('-file:'):])
    else:
        wikipedia.showHelp(u'pdfbot')
        return

    wikipedia.output(u'Will sleep for ' + str(timer) + ' minutes between page loads.\n')
    for page in worklist:
        # Skip non-article namespaces (and their talk pages).
        if (not re.findall(r'(User|Wikipedia|Image|MediaWiki|Template|Help|Category|Portal)(|Talk| talk):', page.title())):
            process_article(site, page.title())
            # Pause to reduce load on the servers
            time.sleep(timer * 60)


if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()