User:PDFbot/pdfbot.py

This is an old revision of this page, as edited by Dispenser (talk | contribs) at 23:42, 20 May 2007 (Major addition: Interlanguage support). The present address (URL) is a permanent link to this revision, which may differ significantly from the current revision.
#!/usr/bin/python
# -*- coding: utf-8  -*-
"""
This script can be used to update links transcluded using the {{PDFlink}} template.

Syntax: python pdfbot.py [-ref: TemplateName]

Command line options:

-file:       Update pages listed in a text file.
-ref:        Update pages transcluding from a given page.
-cat:        Update pages from the given category.
-links:      Update pages linked from a given page.
-page:       Update that page.

"""

#
# (c) Dispenser, 2007
#

import re, sys, httplib, time
import wikipedia, pagegenerators, login, config, catlib
from urllib2 import urlparse

# Define global constants
# Throttle delays between page fetches/saves, to reduce server load.
readDelay  = 20	# seconds
writeDelay = 60 # seconds
# Human-readable binary unit labels, smallest to largest (wiki-linked markup).
prefix = ['bytes', '[[Kibibyte|KiB]]', '[[Mebibyte|MiB]]', '[[Gibibyte|GiB]]']
# Matches a bare http(s) URL up to the first character that would terminate
# it inside wikitext: ']', '[', '>', '<', space, newline or '|'.
urlpattern = re.compile(r'http[s]?://[^][>< \n|]*', re.IGNORECASE)
# Sent as the User-Agent header on every HEAD request.
userAgent  = 'PDFbot (http://en.wikipedia.org/wiki/User:PDFbot)'

# Edit summary messages, keyed by site language code.
# Selected via wikipedia.translate() in process_article().
msg_added = {
	'de': u'BOT: hinzufuegen der Dateigroesse markiert als {{PDFlink}}',
	'en': u'Added file size for external links tagged with {{[[Template:PDFlink|PDFlink]]}}',
}
msg_updated = {
	'de': u'BOT: Aktualisieren der Dateigroesse mit Vorlageeinbindung',
	'en': u'Updated file size of transcluded {{[[Template:PDFlink|PDFlink]]}}',
}
msg_fixed = {
	'de': u'BOT: Korrigierte Benutzung der Vorlage {{PDFlink}}',
	'en': u'Corrected usage of {{[[Template:PDFlink|PDFlink]]}}',
}
msg_removed_cite = {
	'de': u'BOT: Entfernen {{PDFlink}} entsprechend der zitierten Vorlage',
	'en': u'Remove {{PDFlink}} from citation template.',
}

def checkLink(___location, redirectCounter = 5):
	try:
		while (redirectCounter > 0 and ___location is not None):
			(scheme, site, path, args, query, frag) = urlparse.urlparse(___location)
			if scheme == "http":
				conn = httplib.HTTPConnection(site)
			elif scheme == "https":
				conn = httplib.HTTPSConnection(site)
			#conn.set_debuglevel(1)
			conn.putrequest('HEAD', path + args + query)
			conn.putheader('User-Agent', userAgent)
			conn.endheaders()
			
			response = conn.getresponse()
			redirect = response.msg.getheader('___location')
			content_length = response.msg.getheader('content-length')
			content_type   = response.msg.getheader('content-type')
			response_code  = response.status
			conn.close()
			
			redirectCounter -= 1
			if(redirect is not None):
				if(redirect[:4] != "http"):
					___location = urlparse.urljoin(___location, redirect)
				else:
					___location = redirect
				wikipedia.output( u'STATUS: HTTP %s Moved: %s' % (response_code, ___location) )
			else:
				___location = None
		return [___location, response_code, content_length, content_type]
	except httplib.error, arg:
		wikipedia.output(u'HTTP Error: %s %s' % (arg, ___location))
		return [___location, 7, None, None]
	except:
		wikipedia.output(u'Error with URL: %s' % ___location)
		return [___location, 6, None, None]

def binary_notation(size):
	"""Convert a byte count to a human-readable value, e.g. '976 [[Kibibyte|KiB]]'.

	size may be an int or a numeric string (HTTP Content-Length headers
	arrive as strings).  The value is repeatedly divided by 1024 while it
	is still >= 1000, then truncated to at most 4 characters with any
	trailing '.' removed.
	"""
	# Fix: the original indexed the unit list with `exponent / 3`, which is
	# a float (and a TypeError) under true division; track the unit index
	# directly instead.  The table is local so the function is self-contained.
	units = ['bytes', '[[Kibibyte|KiB]]', '[[Mebibyte|MiB]]', '[[Gibibyte|GiB]]']
	a = float(size)
	exponent = 0
	while(a >= 1000.):
		a /= 1024.
		exponent += 1

	# Truncate to 4 significant characters and remove a trailing dot
	byteSigs = str(a)[:4]
	if (byteSigs.endswith('.')):
		byteSigs = byteSigs[:3]
	return byteSigs + ' ' + units[exponent]

def fix_broken_links(hypertext):
	"""Rewrite known dead or permanently-redirected URLs inside hypertext.

	Applies a fixed table of pattern -> replacement substitutions, in order,
	and returns the rewritten text.
	"""
	rewrites = (
		# Moved resources
		(r'virginiadot.org/infoservice/resources/', r'virginiadot.org/info/resources/'),
		(r'ncdot.org/transit/aviation/ncairports/locations/pdf/', r'ncdot.org/transit/aviation/download/ncairports/'),
		# 301 Permanent Redirects
		(r'transportation.ky.gov/planning/', r'www.planning.kytc.ky.gov/'),
		(r'official-documents.co.uk/', r'official-documents.gov.uk/'),
	)
	for pattern, replacement in rewrites:
		hypertext = re.sub(pattern, replacement, hypertext)
	return hypertext

def update_size_paramter(template_text):
	"""Return template_text with a file-size parameter appended to {{PDFlink}}.

	Extracts the first URL from the template, HEADs it via checkLink(), and,
	when the response looks like a PDF, rebuilds the template as
	'{{...|<link>|<size><!-- type, N bytes -->}}'.  On any failure the
	original template_text is returned unchanged.
	"""
	# The following characters separate the URL from its title: []"<>\ \n
	# | is included since we're inside a template
	fixed_text = fix_broken_links(template_text)

	___location    = urlpattern.search(fixed_text).group(0)
	prefix_text = re.search(r'(\{\{[^|]*\|[^|]*).*\}\}', fixed_text).group(1) 
	
	# A '=' anywhere in the template forces an explicit '2=' parameter name,
	# otherwise MediaWiki would parse the size value as a named parameter.
	if (re.findall(r'=', template_text)):
		parameter_prefix = '|2='
	else:
		parameter_prefix = '|'
	
	# Parse indirect HTML character references back to literal characters
	___location = re.sub(r'&\#61;', r'=', ___location)
	# Fix: this previously substituted '&' with itself (a no-op); the intent
	# — matching the '&#61;' decode above — was to turn '&amp;' into '&'.
	___location = re.sub(r'&amp;', r'&', ___location)
	
	(redirect, response, content_length, content_type) = checkLink(___location)
	if (content_type is not None and content_length is not None and int(content_length) > 16):
		# I should really put in 404 error handling code, but this has been working just fine.
		if re.findall(r'pdf|octet-stream', content_type):
			return prefix_text + parameter_prefix + binary_notation(content_length) + '<!-- ' + content_type + ', ' + str(content_length) + ' bytes -->}}'
		else:
			wikipedia.output(u'Unusual content_type: %s, code: %s' % (content_type, response))
	# Anything else: hand the template back unchanged
	return template_text	

def process_article(page):
		"""Fetch one page, normalise {{PDFlink}} usage, add file sizes, and save.

		Side effects: writes the page back when any regex changed the text,
		sets the edit summary, and sleeps (writeDelay / readDelay seconds)
		to throttle server load.
		"""
		wikitext = page.get()
		
		# Fix Casing (Reduces the number of possible expressions)
		wikitext = re.compile(r'\{\{\s*(template:|)pdf', re.IGNORECASE).sub(r'{{PDF', wikitext)
		
		# State point.  Count any changes as needing an update if they're after this line
		state0 = wikitext
		
		# Convert hard coded pdf links  (ex: [http link] (pdf) )
		wikitext = re.sub(r'(\[http[^]]*\]) *\((\[\[[^|\]]*|)(PDF|pdf)(\]\]|)\)', r'{{PDFlink|\1}}', wikitext)
		
		# Convert from the old style to the new style (ex: [http link] {{PDF}} or {{PDF}} [http link] )
		wikitext = re.sub(r'(\(|)\{\{(PDFlink|PDF)\}\}(\)|) *(\[http[^]]*\])', r'{{\2|\4}}', wikitext)
		wikitext = re.sub(r'(("|)\[http[^]]*\]("|))([^a-zA-Z(]*) *(\(|)\{\{(PDFlink|PDF)\}\}(\)|)', r'{{\6|\1}}\4', wikitext)
	
		# Experimental: Convert with with tag at the end of a bullet list (ex: * [http link] some text ([[PDF]]) )
		wikitext = re.compile(r'(\n\*[^\n:/]*)(\[http[^]]*\])([^\n:/]*) *\((\[\[|\{\{|)(Portable Document Format\|PDF|PDFlink).?(file|format|datei|)(\}\}|\]\]|)\)', re.IGNORECASE).sub(r'\1{{PDF|\2}}\3', wikitext)
		wikitext = re.sub(r'(\n\*[^\n:/]*)(\[http[^]]*\])([^\n:/]*) *\{\{(PDFlink|PDF)\}\}', r'\1{{\4|\2}}\3', wikitext)
	
		# Remove PDFlink from citation templates (cite templates carry their own format=PDF)
		wikitext = re.sub(r'(format *= *)(PDF|pdf|)(\(|)\{\{PDF[^{}]*\}\}(\)|)', r'\1PDF', wikitext)
		wikitext = re.sub(r'(\{\{(Cite|cite)[^}]*)(}}[^a-zA-Z]*)(\(|)\{\{(PDF|PDFlink)\}\}(\)|)', r'\1 |format=PDF\3', wikitext)
		wikitext = re.sub(r'(\{\{.ite web[^}]*) *(\(|)\{\{(PDF|PDFlink)\}\}(\)|) *([^}]*\}\})', r'\1 |format=PDF \5', wikitext)
	
		# Second state point: changes past here are size-parameter updates
		state1 = wikitext
		m = re.findall(r'\{\{PDF[link]{0,4}\|[^}]*\}\}', wikitext)
		
		# Fill in the file-size parameter for every {{PDF...}} holding a URL
		for s in m:
			if re.findall(r'http[s]?://', s):
				replacetext = update_size_paramter(s)
				wikitext    = re.sub(re.escape(s), replacetext, wikitext)
				# Uncomment the below line to see the replacement text
#				wikipedia.output(replacetext)
	
		# Fix equal sign problem: an unnamed first parameter containing '='
		# must be renamed to '1=' or MediaWiki misparses it
		wikitext = re.sub(r'\{\{(PDF|PDFlink)\|(1=|)(.{2}[^{|}]+=[^{|}]+)', r'{{\1|1=\3', wikitext)
		
		# Test to see if file sizes parameter was untouched
		# NOTE(review): inside this branch wikitext == state1, so the length
		# difference below is always 0 and the <= 4 test always succeeds
		# (msg_fixed is unreachable); the comparison was probably meant to be
		# len(state1) - len(state0) — confirm against the bot's edit history.
		if wikitext == state1:
			if len(wikitext) - len(state1) <= 4:
				# 4 or more bytes removed typically indicate a embed citation removal
				EditMsg = msg_removed_cite
			else:
				EditMsg = msg_fixed
		else:
			if len(wikitext) - len(state1) > 34:
				# Minimum of 34 bytes to add file size information
				EditMsg = msg_added
			else:
				EditMsg = msg_updated
		wikipedia.setAction(wikipedia.translate(page.site().language(), EditMsg))
		
		# If the text has changed at all since the state point, upload it
		if (wikitext != state0):
			try:
				wikipedia.output(u'Page change by %s bytes.  Writing new version.' % str(len(wikitext)-len(state0)))
				page.put(wikitext)	
			except:
				# Broad catch: a failed save should not abort the whole run
				wikipedia.output(u'------- Write error ------')
		
			# Pause to reduce load on the servers
			time.sleep(writeDelay)
		else:
			time.sleep(readDelay)
	
def main():
	"""Parse the command line, build a page generator, and process each page.

	Exactly one of -ref/-file/-cat/-links/-page selects the page source;
	with none given, the help text is shown and the bot exits.
	"""
	site = wikipedia.getSite()
	gen = None
	
	for arg in wikipedia.handleArgs():
		if arg.startswith('-ref:'):
			gen = pagegenerators.ReferringPageGenerator(wikipedia.Page(site, arg[5:]))
		elif arg.startswith('-file:'):
			gen = pagegenerators.TextfilePageGenerator(arg[6:])
		elif arg.startswith('-cat:'):
			gen = pagegenerators.CategorizedPageGenerator(catlib.Category(site, arg[5:]))
		elif arg.startswith('-links:'):
			gen = pagegenerators.LinkedPageGenerator(wikipedia.Page(wikipedia.getSite(), arg[7:]))
		elif arg.startswith('-page:'):
			gen = iter([wikipedia.Page(wikipedia.getSite(), unicode(arg[6:]))])

	if gen is None:
		wikipedia.showHelp(u'pdfbot')
		return
	
	wikipedia.output(u'Read delay is %s seconds.' % readDelay)
	wikipedia.output(u'Write delay is %s seconds.\n' % writeDelay)
	
	# Restrict the run to the main namespace and skip redirect pages
	gen = pagegenerators.NamespaceFilterPageGenerator(gen, [0])
	gen = pagegenerators.RedirectFilterPageGenerator(gen)
	
	for page in gen:
		process_article(page)
	wikipedia.output(u'\nOperation Complete.\n')

if __name__ == "__main__":
	try:
		main()
	finally:
		# Always release the pywikipedia throttle/login state, even on error
		wikipedia.stopme()