#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This script can be used to update links transcluded using the {{PDFlink}} template.

Syntax: python pdfbot.py [-ref: TemplateName]

Command line options:

-file:  Update all pages listed in a text file.
-ref:   Update all pages transcluding from a given page.
"""

import re, sys, httplib, time
import wikipedia, pagegenerators, login, config, codecs
from urllib2 import urlparse


def whichURL(___location):
    """Issue HEAD requests for *___location*, following up to 8 redirects.

    Returns [site, path, content_length, content_type] for the final
    response (header values are strings or None), or
    [None, None, None, None] if anything goes wrong.
    """
    redirectCounter = 8
    try:
        while (redirectCounter > 0 and ___location is not None):
            (scheme, site, path, args, query, frag) = urlparse.urlparse(___location)
            # Re-assemble everything after the host so the HEAD request
            # targets the full resource, not just the bare path.
            path = path + args + query
            conn = httplib.HTTPConnection(site)
            conn.set_debuglevel(0)
            conn.putrequest('HEAD', path)
            conn.putheader('User-Agent', 'PDFbot (http://en.wikipedia.org/wiki/User:PDFbot)')
            conn.endheaders()
            response = conn.getresponse()
            ___location = response.msg.getheader('___location')
            redirectCounter -= 1
            if (___location is not None):
                conn.close()
                wikipedia.output(u'Redirecting to %s' % ___location)
        content_length = response.msg.getheader('content-length')
        content_type = response.msg.getheader('content-type')
        conn.close()
        return ([site, path, content_length, content_type])
    except Exception:
        # Network failures, malformed URLs, etc. — report and signal
        # "unknown" so the caller leaves the template untouched.
        wikipedia.output(u'Error with URL')
        return ([None, None, None, None])


# Convert the byte count to a human readable value
def binary_notation(size):
    """Return *size* (a byte count) as e.g. '123 KiB' or '1.20 MiB'."""
    a = float(size)
    exponent = 0
    # Bound the exponent so the prefix lookup below can never run off
    # the end of the table for absurdly large content lengths.
    while a > 1000. and exponent < 9:
        a /= 1024.
        exponent += 3
    prefix = ['bytes', 'KiB', 'MiB', 'GiB']
    # Truncate to 4 significant characters and remove a trailing dot
    byteSigs = str(a)[:4]
    if (byteSigs.endswith('.')):
        byteSigs = byteSigs[:3]
    return (byteSigs + ' ' + prefix[exponent / 3])


def update_size_paramter(template_text):
    """Given one {{PDFlink|...}} transclusion, fetch the linked file's
    size and return the template rewritten with a size parameter plus an
    HTML comment recording the content type and exact byte count.
    Returns *template_text* unchanged if the URL cannot be resolved.
    """
    ___location = re.search(r'\[([^ }]*) ', template_text).group(1)
    prefix_text = re.search(r'(\{\{[^|]*\|[^|}]*)[^}]*\}\}', template_text).group(1)
    if (re.findall(r'=', template_text)):
        # A '=' anywhere in the template forces the explicit |2= form,
        # otherwise MediaWiki would misparse the positional parameter.
        parameter_prefix = '|2='
    else:
        parameter_prefix = '|'

    # Fix indirect HTML character references
    ___location = re.sub(r'&#61;', r'=', ___location)
    ___location = re.sub(r'&amp;', r'&', ___location)

    if (___location.lower()[:4] == 'http'):
        (site, path, content_length, content_type) = whichURL(___location)
        if (content_length is not None):
            return prefix_text + parameter_prefix + binary_notation(content_length) \
                + '<!-- ' + content_type + ', ' + str(content_length) + " bytes -->}}"
    return template_text


def process_article(site, pageName):
    """Load *pageName*, normalise every {{PDFlink}} usage, refresh the
    recorded file sizes, and (when enabled) save the page."""
    page = wikipedia.Page(site, pageName)
    wikitext = page.get()

    # Fix Casing
    wikitext = re.sub(r'\{\{ *(PDF|Pdf|pdf)', r'{{PDF', wikitext)

    # Count any changes as needing an update if they're after this line
    startText = wikitext

    # Convert from the old style to the new style
    wikitext = re.sub(r'(\(|)\{\{(PDFlink|PDF)\}\}(\)|) *(\[http[^]]*\])', r'{{\2|\4}}', wikitext)
    wikitext = re.sub(r'(\[http[^]]*\]) *(\(|)\{\{(PDFlink|PDF)\}\}(\)|)', r'{{\3|\1}}', wikitext)

    # Remove PDFlink for citation Templates
    wikitext = re.sub(r'(format *= *)(\(|)\{\{PDF(|link)\}\}(\(|)', r'\1PDF', wikitext)

    m = re.findall(r'(\{\{PDF[^|}]*\|[^}]*\}\})', wikitext)
    # re.findall returns a (possibly empty) list, never None.
    if not m:
        wikipedia.output(u"Error: Template:PDFlink not found.")
        return

    for s in m:
        if (re.findall(r'\[http', s)):
            replacetext = update_size_paramter(s)
            wikitext = re.sub(re.escape(s), replacetext, wikitext)
            # Uncomment the below line to see the replacement text
            # print replacetext.encode('ascii', 'replace')

    sizeChange = len(wikitext) - len(startText)
    if (sizeChange > 0):
        EditMsg = 'Corrected use of {{[[Template:PDFlink|PDFlink]]}}'
    else:
        EditMsg = 'Updating filesize for external links tagged with {{[[Template:PDFlink|PDFlink]]}}'
    wikipedia.setAction(EditMsg)

    # If the text has changed at all since, upload the new version
    if (startText != wikitext):
        wikipedia.output(u'Uploading updated version. Delta byte count: %s' % str(sizeChange))
        # page.put(wikitext)


def serverlist(site, pageName):
    """Return the list of pages transcluding *pageName* as a template."""
    s = wikipedia.Page(site, unicode(pageName))
    return [page for page in s.getReferences(onlyTemplateInclusion=True)]


def main():
    site = wikipedia.getSite()
    args = wikipedia.handleArgs()
    timer = 1  # Minutes

    # Guard against being invoked with no arguments at all.
    arg = args[0] if args else ''
    if (arg.startswith('-ref:')):
        worklist = serverlist(site, arg[len('-ref:'):])
    elif (arg.startswith('-file:')):
        worklist = pagegenerators.TextfilePageGenerator(arg[len('-file:'):])
    else:
        wikipedia.showHelp(u'pdfbot')
        return

    wikipedia.output(u'Will sleep for ' + str(timer) + ' minutes between page loads.\n')
    for page in worklist:
        # Skip non-article namespaces (and their talk pages).
        if (not re.findall(r'(User|Wikipedia|Image|MediaWiki|Template|Help|Category|Portal)(|Talk| talk):', page.title())):
            process_article(site, page.title())
            # Pause to reduce load on the servers
            time.sleep(timer * 60)


if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()