User:PDFbot/pdfbot.py

#!/usr/bin/python
# -*- coding: utf-8  -*-
"""
This script can be used to update links transcluded using the {{PDFlink}} template.

Syntax: python pdfbot.py -file:FileName | -ref:PageName | -cat:CategoryName

Command line options:

-file:       Update article pages listed in a text file.
-ref:        Update article pages transcluding from a given page.
-cat:        Update article pages from the given category.

"""

import re, sys, httplib, time
import wikipedia, pagegenerators, login, config, catlib
from urllib2 import urlparse

# Module-level settings shared by the functions below
# Pause after a page has been saved, to reduce load on the servers
writeDelay = 60	# seconds
# Pause after a page that did not need to be saved
readDelay  = 15	# seconds
# Debug level handed to each httplib connection via set_debuglevel()
httpDebug  = 0
# Value sent in the User-Agent header of every HEAD request
userAgent  = 'PDFbot (http://en.wikipedia.org/wiki/User:PDFbot)'

def whichURL(___location):
	"""
	Issue HEAD requests for ___location, following redirects, and describe
	the last response received.

	At most six requests are made.  Any response that carries a Location
	header is treated as a redirect; relative Location values are resolved
	against the URL that produced them.

	Returns the list [site, path, content_length, content_type] where site
	is the host of the last request, path is the request target that was
	sent (path plus ';params' and '?query' when present) and the two header
	values are strings, or None when the server did not send them.

	On any failure u'Error with URL' is printed and a list of four None
	values is returned.
	"""
	if (___location is None):
		wikipedia.output(u'Error with URL')
		return ( [None, None, None, None] )

	redirectCounter = 6
	try:
		while (redirectCounter > 0 and ___location is not None):
			(scheme, site, path, args, query, frag) = urlparse.urlparse(___location)

			# Rebuild the request target.  urlparse strips the ';' and '?'
			# separators, so they have to be put back or the server is asked
			# for a different resource.
			if (not path):
				path = '/'
			if (args):
				path = path + ';' + args
			if (query):
				path = path + '?' + query

			if (scheme.lower() == 'https'):
				conn = httplib.HTTPSConnection(site)
			else:
				conn = httplib.HTTPConnection(site)

			try:
				conn.set_debuglevel(httpDebug)
				conn.putrequest('HEAD', path)
				conn.putheader('User-Agent', userAgent)
				conn.endheaders()

				response = conn.getresponse()
				redirect       = response.msg.getheader('___location')
				content_length = response.msg.getheader('content-length')
				content_type   = response.msg.getheader('content-type')
			finally:
				# Release the socket even when the request itself failed
				conn.close()

			redirectCounter -= 1
			if (redirect is not None):
				# Location may be relative to the URL just requested
				redirect = urlparse.urljoin(___location, redirect)
				if (redirectCounter > 0):
					wikipedia.output( u'Redirecting to %s' % redirect )
			___location = redirect

		return ( [site, path, content_length, content_type] )
	except Exception:
		# Best effort: a dead link must not stop the bot, but Ctrl-C
		# (KeyboardInterrupt) is no longer swallowed here.
		wikipedia.output(u'Error with URL')
		return ( [None, None, None, None] )

# Convert the byte count to a human readable value
def binary_notation(size):
	"""
	Return size (a byte count given as a number or numeric string) as wiki
	text: up to four leading characters of the scaled value, a space, and
	the unit name.

	The value is divided by 1024 until it drops below 1000 or the largest
	known unit (GiB) is reached.  Values that are still 1000 or more in GiB
	are written with all their integer digits.
	"""
	prefix = ['bytes', '[[Kibibyte|KiB]]', '[[Mebibyte|MiB]]', '[[Gibibyte|GiB]]']
	a = float(size)
	unit = 0
	# Stop at the last unit so a huge size cannot index past the list
	while (a >= 1000. and unit < len(prefix) - 1):
		a /= 1024.
		unit += 1

	if (a >= 1000.):
		# Only reachable past the largest unit; cutting to four characters
		# here would drop digits and change the magnitude.
		byteSigs = str(int(a))
	else:
		# Truncate and remove trailing dot
		byteSigs = str(a)[:4]
		if (byteSigs.endswith('.')):
			byteSigs = byteSigs[:3]
	return ( byteSigs + ' ' + prefix[unit] )

def update_size_paramter(template_text):
	"""
	Return template_text (one {{PDF...|...}} transclusion) with its size
	parameter refreshed from the linked file.

	The first http URL in the template is probed with whichURL().  When the
	server reports more than 16 bytes and a content type containing 'pdf'
	or 'octet-stream', the template is rebuilt as its name and first
	parameter followed by the formatted size and an HTML comment recording
	the content type and byte count.  The size is written as '|2=' when the
	template text contains an '=' and as a plain positional parameter
	otherwise.

	In every other case template_text is returned unchanged.
	"""
	location_match = re.search(r'(http[^] |}]*)', template_text)
	prefix_match   = re.search(r'(\{\{[^|]*\|[^|}]*)[^}]*\}\}', template_text)
	if (location_match is None or prefix_match is None):
		# Nothing recognisable to update
		return template_text
	___location    = location_match.group(1)
	prefix_text = prefix_match.group(1)

	if ('=' in template_text):
		parameter_prefix = '|2='
	else:
		parameter_prefix = '|'

	# Fix indirect HTML character references.  '&amp;' is decoded last so
	# that a literal '&amp;#61;' is not decoded twice.
	___location = ___location.replace('&#61;', '=')
	___location = ___location.replace('&amp;', '&')

	(site, path, content_length, content_type) = whichURL(___location)

	# A missing or malformed Content-Length means there is nothing to report
	try:
		byte_count = int(content_length)
	except (TypeError, ValueError):
		return template_text

	if (byte_count > 16):
		# I should really put in 404 error handling code, but this has been working just fine.
		if (content_type is not None and re.findall(r'pdf|octet-stream', content_type)):
			return prefix_text + parameter_prefix + binary_notation(content_length) + '<!-- ' + content_type + ', ' + str(content_length) +" bytes -->}}"
		else:
			wikipedia.output(u'Unusual content_type: %s' % content_type)

	# If anything else return the template_text back
	return template_text

def process_article(page):
	"""
	Normalise the PDF templates on page, refresh their file sizes and save
	the page when its text changed.

	page must provide get() returning the wikitext and put(text) saving it.
	Errors raised by get() or put() are not caught here.  The function
	sleeps writeDelay seconds after a save and readDelay seconds otherwise.
	"""
	wikitext = page.get()

	# Fix Casing (Reduces the number of possible expressions)
	wikitext = re.sub(r'\{\{ *(Template:|template:|)(PDF|Pdf|pdf)', r'{{PDF', wikitext)

	# State point.  Count any changes as needing an update if they're after this line
	state0 = wikitext

	# Convert hard coded pdf links  (ex: [http link] (pdf) )
	wikitext = re.sub(r'(\[http[^]]*\]) *\((\[\[[^|\]]*|)(PDF|pdf)(\]\]|)\)', r'{{PDFlink|\1}}', wikitext)

	# Convert from the old style to the new style (ex: [http link] {{PDF}} or {{PDF}} [http link] )
	wikitext = re.sub(r'(\(|)\{\{(PDFlink|PDF)\}\}(\)|) *(\[http[^]]*\])', r'{{\2|\4}}', wikitext)
	wikitext = re.sub(r'("|)(\[http[^]]*\])("|)([^a-zA-Z(]*) *(\(|)\{\{(PDFlink|PDF)\}\}(\)|)', r'{{\6|\2}}\4', wikitext)

	# Remove PDFlink from citation templates.  The optional parenthesis
	# after the template is a closing one, as in the patterns below.
	wikitext = re.sub(r'(format *= *)(\(|)\{\{PDF(|link)\}\}(\)|)', r'\1PDF', wikitext)
	wikitext = re.sub(r'(\{\{(Cite|cite)[^}]*)(}}[^a-zA-Z]*)(\(|)\{\{(PDF|PDFlink)\}\}(\)|)', r'\1 |format=PDF\3', wikitext)
	wikitext = re.sub(r'(\{\{.ite web[^}]*) *(\(|)\{\{(PDF|PDFlink)\}\}(\)|) *([^}]*\}\})', r'\1 |format=PDF \5', wikitext)

	# Fix equal sign problem
	wikitext = re.sub(r'(PDF|PDFlink)\|(1=|)([^{|}]*=[^{|}]*)', r'\1|1=\3', wikitext)

	# Second state point: anything changing after here is a size update
	state1 = wikitext
	m = re.findall(r'\{\{PDF[^|}]*\|[^}]*\}\}', wikitext)

	for s in m:
		if ('http' in s):
			replacetext = update_size_paramter(s)
			# Plain string replacement: the new text comes from URLs and
			# server headers and must not be read as a regex template
			# (backslashes, group references).
			wikitext = wikitext.replace(s, replacetext)

	if (wikitext == state1):
		EditMsg = 'Corrected use of {{[[Template:PDFlink|PDFlink]]}}'
	else:
		EditMsg = 'Updating filesize for external links tagged with {{[[Template:PDFlink|PDFlink]]}}'
	wikipedia.setAction(EditMsg)

	# If the text has changed at all since the state point, upload it
	if (wikitext != state0):
		wikipedia.output(u'Page change by %s bytes.  Writing new version.' % str(len(wikitext)-len(state0)))
		page.put(wikitext)

		# Pause to reduce load on the servers
		time.sleep(writeDelay)
	else:
		time.sleep(readDelay)
	
def main():
	"""
	Build a page generator from the command line and process every page it
	yields whose title does not start with one of the excluded namespace
	prefixes.

	Recognised arguments are -ref:, -file: and -cat:.  The help text is
	shown, and nothing is processed, when an unknown argument is given or
	when no page source is given at all.
	"""
	site  = wikipedia.getSite()
	gen   = None

	for arg in wikipedia.handleArgs():
		if (arg.startswith('-ref:')):
			referredPage = wikipedia.Page(site, arg[5:])
			gen = pagegenerators.ReferringPageGenerator(referredPage)
		elif (arg.startswith('-file:')):
			gen = pagegenerators.TextfilePageGenerator(arg[6:])
		elif (arg.startswith('-cat:')):
			cat = catlib.Category(site, arg[5:])
			gen = pagegenerators.CategorizedPageGenerator(cat)
		else:
			wikipedia.showHelp(u'pdfbot')
			return

	if (gen is None):
		# No page source on the command line: explain the usage instead of
		# failing on an unbound generator below.
		wikipedia.showHelp(u'pdfbot')
		return

	wikipedia.output(u'Read delay is %s seconds.' % readDelay)
	wikipedia.output(u'Write delay is %s seconds.\n' % writeDelay)

	for page in gen:
		# A namespace prefix only counts at the start of the title; an
		# article such as "Foo Talk: bar" must still be processed.
		if (not re.match(r'(User|Wikipedia|Image|MediaWiki|Template|Help|Category|Portal|Talk)(| talk):', page.title())):
			process_article(page)

# Run the bot only when this file is executed as a script
if __name__ == "__main__":
	try:
		main()
	finally:
		# Called whether main() returns normally or raises
		wikipedia.stopme()