User:PDFbot/pdfbot.py

This is an old revision of this page, as edited by Dispenser (talk | contribs) at 02:26, 10 May 2007 (code sync, big feature: Dead external link repair). The present address (URL) is a permanent link to this revision, which may differ significantly from the current revision.
#!/usr/bin/python
# -*- coding: utf-8  -*-
"""
This script can be used to update links transcluded using the {{PDFlink}} template.

Syntax: python pdfbot.py [-ref: TemplateName]

Command line options:

-file:       Update pages listed in a text file.
-ref:        Update pages transcluding from a given page.
-cat:        Update pages from the given category.
-links:      Update pages linked from a given page.

"""

import re, sys, httplib, time
import wikipedia, pagegenerators, login, config, catlib
from urllib2 import urlparse

# Define global variables
writeDelay = 60 # seconds
readDelay  = 20	# seconds
httpDebug  = 0
userAgent  = 'PDFbot (http://en.wikipedia.org/wiki/User:PDFbot)'

def checkLink(___location):
	"""Probe *___location* with HTTP HEAD requests, following up to 6 redirects.

	Returns a 4-element list [final_location, status_code, content_length,
	content_type].  final_location is None once the server stops redirecting;
	content_length / content_type are the raw header strings (or None).
	On any network/parse error, returns [___location, 6, None, None] so the
	caller's length/type checks fail gracefully.
	"""
	redirectCounter = 6
	try:
		while redirectCounter > 0 and ___location is not None:
			(scheme, site, path, args, query, frag) = urlparse.urlparse(___location)
			path = path + args + query
			conn = httplib.HTTPConnection(site)
			conn.set_debuglevel(httpDebug)
			# HEAD: fetch headers only, never the (possibly large) PDF body.
			conn.putrequest('HEAD', path)
			conn.putheader('User-Agent', userAgent)
			conn.endheaders()

			response = conn.getresponse()
			redirect = response.msg.getheader('___location')
			content_length = response.msg.getheader('content-length')
			content_type   = response.msg.getheader('content-type')
			response_code  = response.status
			conn.close()

			redirectCounter -= 1
			if redirect is not None:
				if redirect[:4] != "http":
					# Relative Location header: resolve against the current URL.
					___location = urlparse.urljoin(___location, redirect)
				else:
					___location = redirect
				wikipedia.output( u'Redirecting (%s) to %s' % (response_code, ___location) )
			else:
				___location = None
		return [___location, response_code, content_length, content_type]
	except Exception:
		# BUG FIX: was a bare 'except:' (also trapped KeyboardInterrupt) and
		# returned the octal literal 006 (== 6, and a syntax error in Py3).
		wikipedia.output(u'Error with URL: %s' % ___location)
		return [___location, 6, None, None]

# Convert the byte count to a human readable value
def binary_notation(size):
	"""Format *size* (an int or numeric string of bytes) for wikitext display.

	Examples: 500 -> '500 bytes', 2048 -> '2.0 [[Kibibyte|KiB]]'.
	Each step divides by 1024, but the prefix switches once the displayed
	value reaches 1000 so at most four significant characters are shown.
	"""
	prefix = ['bytes', '[[Kibibyte|KiB]]', '[[Mebibyte|MiB]]', '[[Gibibyte|GiB]]']
	a = float(size)
	exponent = 0
	# BUG FIX: cap at the largest known prefix; the old loop could push
	# exponent past the table (IndexError) for sizes >= ~1000 GiB.
	while a >= 1000. and exponent // 3 < len(prefix) - 1:
		a /= 1024.
		exponent += 3

	# Truncate to 4 characters and remove a trailing decimal point
	byteSigs = str(a)[:4]
	if byteSigs.endswith('.'):
		byteSigs = byteSigs[:3]
	# BUG FIX: '//' makes the integer division explicit (under Python 3 the
	# old 'exponent / 3' yields a float and is not a valid list index).
	return byteSigs + ' ' + prefix[exponent // 3]

def fix_broken_links(hypertext):
	"""Repair known-dead external link URLs in *hypertext* via a fixed table.

	Covers sites that moved their resources and 301 permanent redirects.
	BUG FIX: the old code used re.sub with unescaped '.' (matching any
	character); these are literal substrings, so plain str.replace is both
	correct and clearer.
	"""
	replacements = [
		# Moving of resources
		('virginiadot.org/infoservice/resources/', 'virginiadot.org/info/resources/'),
		('ncdot.org/transit/aviation/ncairports/locations/pdf/', 'ncdot.org/transit/aviation/download/ncairports/'),
		# 301 Permanent Redirects
		('transportation.ky.gov/planning/', 'www.planning.kytc.ky.gov/'),
		('official-documents.co.uk/', 'official-documents.gov.uk/'),
	]
	for old, new in replacements:
		hypertext = hypertext.replace(old, new)
	return hypertext

def update_size_paramter(template_text):
	"""Return *template_text* ({{PDF...|url...}}) with a file-size parameter.

	Repairs known-dead URLs, fetches the link's HTTP headers via checkLink(),
	and when the target looks like a PDF (content-type contains pdf or
	octet-stream, length > 16 bytes) rebuilds the template as
	{{PDF...|url|size<!-- type, N bytes -->}}.  Any other outcome returns
	the template unchanged.  (The misspelled name is kept for callers.)
	"""
	fixed_text = fix_broken_links(template_text)

	___location    = re.search(r'(http[^] |}]*)', fixed_text).group(1)
	# Template name plus first (link) parameter, e.g. '{{PDFlink|http://...'
	prefix_text = re.search(r'(\{\{[^|]*\|[^|}]*)[^}]*\}\}', fixed_text).group(1)

	# An '=' anywhere in the template forces a numbered parameter, otherwise
	# MediaWiki would parse the size text as a named-parameter assignment.
	if re.findall(r'=', template_text):
		parameter_prefix = '|2='
	else:
		parameter_prefix = '|'

	# Parse indirect HTML character references back into raw URL characters.
	___location = re.sub(r'&\#61;', r'=', ___location)
	# BUG FIX: the old line replaced '&' with itself (a no-op); the comment
	# above shows the intent was to decode the '&amp;' entity.
	___location = re.sub(r'&amp;', r'&', ___location)

	(redirect, response, content_length, content_type) = checkLink(___location)
	if content_type is not None and content_length is not None and int(content_length) > 16:
		# I should really put in 404 error handling code, but this has been working just fine.
		if re.findall(r'pdf|octet-stream', content_type):
			return prefix_text + parameter_prefix + binary_notation(content_length) + '<!-- ' + content_type + ', ' + str(content_length) + " bytes -->}}"
		else:
			wikipedia.output(u'Unusual content_type: %s, code: %s' % (content_type, response))
	# If anything else return template_text back
	return template_text

def process_article(page):
		"""Rewrite one wiki page: normalize {{PDFlink}} usage and add file sizes.

		The regex substitutions below are order-dependent: casing is
		normalized first, then bare links are wrapped in {{PDFlink}}, then
		sizes are fetched per template, then malformed '=' parameters are
		repaired.  Saves the page only if the text changed, and sleeps to
		throttle server load either way.
		"""
		wikitext = page.get()
		
		# Fix Casing (Reduces the number of possible expressions)
		wikitext = re.sub(r'\{\{ *(Template:|template:|)(PDF|Pdf|pdf)', r'{{PDF', wikitext)
		
		# State point.  Count any changes as needing an update if they're after this line
		state0 = wikitext
		
		# Convert hard coded pdf links  (ex: [http link] (pdf) )
		wikitext = re.sub(r'(\[http[^]]*\]) *\((\[\[[^|\]]*|)(PDF|pdf)(\]\]|)\)', r'{{PDFlink|\1}}', wikitext)
		
		# Convert from the old style to the new style (ex: [http link] {{PDF}} or {{PDF}} [http link] )
		wikitext = re.sub(r'(\(|)\{\{(PDFlink|PDF)\}\}(\)|) *(\[http[^]]*\])', r'{{\2|\4}}', wikitext)
		wikitext = re.sub(r'(("|)\[http[^]]*\]("|))([^a-zA-Z(]*) *(\(|)\{\{(PDFlink|PDF)\}\}(\)|)', r'{{\6|\1}}\4', wikitext)
	
		# Experimental: Convert with with tag at the end of a bullet list (ex: * [http link] some text ([[PDF]]) )
		wikitext = re.sub(r'(\n\*[^\n:/]*)(\[http[^]]*\])([^\n:/]*) *\((\[\[|\{\{|)(Portable Document Format\|PDF|PDFlink|PDF|pdf)(\}\}|\]\]|)\)', r'\1{{PDF|\2}}\3', wikitext)
		wikitext = re.sub(r'(\n\*[^\n:/]*)(\[http[^]]*\])([^\n:/]*) *\{\{(PDFlink|PDF)\}\}', r'\1{{\4|\2}}\3', wikitext)
	
		# Remove PDFlink from citation templates
		wikitext = re.sub(r'(format *= *)(PDF|pdf|)(\(|)\{\{PDF(|link)\}\}(\(|)', r'\1PDF', wikitext)
		wikitext = re.sub(r'(\{\{(Cite|cite)[^}]*)(}}[^a-zA-Z]*)(\(|)\{\{(PDF|PDFlink)\}\}(\)|)', r'\1 |format=PDF\3', wikitext)
		wikitext = re.sub(r'(\{\{.ite web[^}]*) *(\(|)\{\{(PDF|PDFlink)\}\}(\)|) *([^}]*\}\})', r'\1 |format=PDF \5', wikitext)
	
		# Checkpoint after normalization: size-fetching changes are measured
		# against state1 to pick an accurate edit summary below.
		state1 = wikitext
		m = re.findall(r'\{\{(PDF|PDFlink)\|[^}]*\}\}', wikitext)
		
		for s in m:
			if re.findall(r'http:', s):
				replacetext = update_size_paramter(s)
				wikitext    = re.sub(re.escape(s), replacetext, wikitext)
				# Uncomment the bellow line to see the replacement text
#				wikipedia.output(replacetext)
	
		# Fix equal sign problem
		wikitext = re.sub(r'\{\{(PDF|PDFlink)\|(1=|)([^{|}]+=[^{|}]+)', r'{{\1|1=\3', wikitext)
	
		if wikitext == state1:
			# Nothing was done with the embedded file sizes string
			EditMsg = 'Corrected use of {{[[Template:PDFlink|PDFlink]]}}'
		else:
			if len(wikitext) - len(state1) > 34:
				# Minimum of 34 bytes to add file size information
				EditMsg = 'Added file size for external links tagged with {{[[Template:PDFlink|PDFlink]]}}'
			else:
				EditMsg = 'Updated file size of transcluded {{[[Template:PDFlink|PDFlink]]}}'
		wikipedia.setAction(EditMsg)
			
		# If the text has changed at all since the state point, upload it
		if (wikitext != state0):
			try:
				wikipedia.output(u'Page change by %s bytes.  Writing new version.' % str(len(wikitext)-len(state0)))
				page.put(wikitext)	
			except:
				wikipedia.output(u'------- Write error ------')
		
			# Pause to reduce load on the servers
			time.sleep(writeDelay)
		else:
			time.sleep(readDelay)
	
def main():
	"""Entry point: build a page generator from the command line and run the bot.

	Recognised arguments (see module docstring): -ref:, -file:, -cat:,
	-links:.  An unknown argument, or no generator argument at all, shows
	the help text and exits.  Only main-namespace (ns 0) pages are processed.
	"""
	site = wikipedia.getSite()
	gen  = None  # page generator; stays None until a source argument is seen

	for arg in wikipedia.handleArgs():
		if arg.startswith('-ref:'):
			referredPage = wikipedia.Page(site, arg[5:])
			gen = pagegenerators.ReferringPageGenerator(referredPage)
		elif arg.startswith('-file:'):
			gen = pagegenerators.TextfilePageGenerator(arg[6:])
		elif arg.startswith('-cat:'):
			cat = catlib.Category(site, arg[5:])
			gen = pagegenerators.CategorizedPageGenerator(cat)
		elif arg.startswith('-links:'):
			# Consistency fix: reuse the 'site' fetched above instead of
			# calling wikipedia.getSite() a second time.
			pagelinks = wikipedia.Page(site, arg[7:])
			gen = pagegenerators.LinkedPageGenerator(pagelinks)
		else:
			wikipedia.showHelp(u'pdfbot')
			return

	# BUG FIX: with no arguments the old code fell through to a NameError
	# on 'gen'; show the help text instead.
	if gen is None:
		wikipedia.showHelp(u'pdfbot')
		return

	wikipedia.output(u'Read delay is %s seconds.' % readDelay)
	wikipedia.output(u'Write delay is %s seconds.\n' % writeDelay)

	# Only process pages from the main namespace
	gen = pagegenerators.NamespaceFilterPageGenerator(gen, [0])

	for page in gen:
		process_article(page)
	wikipedia.output(u'\nOperation Complete.\n')

if __name__ == "__main__":
	try:
		main()
	finally:
		# Always release the pywikipedia throttle/login state, even when
		# main() raises or the run is interrupted.
		wikipedia.stopme()