User:PDFbot/pdfbot.py

This is an old revision of this page, as edited by Dispenser (talk | contribs) at 03:30, 13 July 2007 (Lots of small fixes, standardization of output messages, Fixed bug "?" in with query, regex optimizations, support for SI bases). The present address (URL) is a permanent link to this revision, which may differ significantly from the current revision.
#!/usr/bin/python
# -*- coding: utf-8  -*-
"""
This script can be used to update links transcluded using the {{PDFlink}} template.

Syntax: python pdfbot.py [-ref: TemplateName]

Command line options:

-file:       Update pages listed in a text file.
-ref:        Update pages transcluding from a given page.
-cat:        Update pages from the given category.
-links:      Update pages linked from a given page.
-page:       Update that page.

"""

#
# (c) Dispenser, 2007
#

import re, sys, time
import wikipedia, pagegenerators, login, config, catlib
import httplib, socket, urlparse

# Define global constants
readDelay  = 20	# seconds to sleep after a page that needed no write
writeDelay = 60 # seconds to sleep after each page write (throttles server load)
# Wiki-linked unit labels per power of three, indexed by exponent/3.
SI_prefix  = ['bytes', '[[Kilobyte|kB]]', '[[Megabyte|MB]]', '[[Gigabyte|GB]]']
IEC_prefix = ['bytes', '[[Kibibyte|KiB]]', '[[Mebibyte|MiB]]', '[[Gibibyte|GiB]]']
# The following characters separate a URL from its title: []"<>\ \n
# {|} are also excluded since the URL appears inside a template
urlpattern = re.compile(r'http[s]?://[^][<>\s"{|}]*', re.IGNORECASE)
# Headers sent with every HEAD request issued by checkLink()
httpHeader = {
	'User-Agent': 'PDFbot (http://en.wikipedia.org/wiki/User:PDFbot)',
	'Accept': 'application/pdf,application/octet-stream,*/*;q=0.5',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Keep-Alive': '30',
	'Connection': 'keep-alive',
}

# Edit summary messages, keyed by site language code and selected via
# wikipedia.translate() in process_article().
# Summary used when a file size was added to a link for the first time
msg_added = {
	'de': u'BOT: hinzufuegen der Dateigroesse markiert als {{PDFlink}}',
	'en': u'Added file size for external links tagged with {{[[Template:PDFlink|PDFlink]]}}',
}
# Summary used when an existing file-size comment was refreshed
msg_updated = {
	'de': u'BOT: Aktualisieren der Dateigroesse mit Vorlageeinbindung',
	'en': u'Updated file size of transcluded {{[[Template:PDFlink|PDFlink]]}}',
}
# Summary used for pure formatting/usage corrections of the template
msg_fixed = {
	'de': u'BOT: Korrigierte Benutzung der Vorlage {{PDFlink}}',
	'en': u'Corrected usage of {{[[Template:PDFlink|PDFlink]]}}',
}
# Summary used when {{PDFlink}} was stripped out of a citation template
msg_removed_cite = {
	'de': u'BOT: Entfernen {{PDFlink}} entsprechend der zitierten Vorlage',
	'en': u'Remove {{PDFlink}} from citation template.',
}

def checkLink(___location, useHEAD = True, counter = 5):
	try:
		while (counter >= 0 and ___location is not None):
			(scheme, site, path, args, query, frag) = urlparse.urlparse(___location)
			if query != '':
				query = '?' + query
			if path == '':
				path = '/'
			if scheme == "http":
				conn = httplib.HTTPConnection(site)
			elif scheme == "https":
				conn = httplib.HTTPSConnection(site)
			#conn.set_debuglevel(1)
			conn.request('HEAD', path + args + query, None, httpHeader)
			#conn.putheader('User-Agent', userAgent)
			#conn.endheaders()
			
			response = conn.getresponse()
			redirect = response.msg.getheader('___location')
			content_length = response.msg.getheader('content-length')
			content_type   = response.msg.getheader('content-type')
			response_code  = response.status
			response_reason= response.reason
			conn.close()
			
			counter -= 1
			if(redirect is not None):
				wikipedia.output( u'STATUS:	HTTP %s Moved: %s to %s' % (response_code, ___location, redirect) )
				if(redirect[:4] != "http"):
					___location = urlparse.urljoin(___location, redirect)
				else:
					___location = redirect
			else:
				___location = None
		return [___location, response_code, response_reason, content_length, content_type]
	except httplib.error, arg:
		wikipedia.output(u'ERROR:	HTTP %s %s' % (arg, ___location))
		return [___location, 52, "", None, None]
	except socket.error, arg:
		wikipedia.output(u'ERROR:	Socket %s %s' % (arg, ___location))
		return [___location, arg[0], arg[1], None, None]

# Convert the byte count to a human readable value
def binary_notation(size, base = 1024., prefix = None):
	"""Return `size` (a byte count; int or numeric string) as a short
	human readable string, e.g. 1536 -> '1.5 [[Kibibyte|KiB]]'.

	base   -- divisor between successive unit labels (1024. or 1000.)
	prefix -- unit labels per power of three; defaults to the module
	          level IEC_prefix list (resolved lazily at call time).
	"""
	if prefix is None:
		prefix = IEC_prefix
	a = float(size)
	exponent = 0
	# Divide down while four or more integer digits remain, but never
	# past the last available label (the original raised IndexError
	# for multi-terabyte sizes).
	while (a >= 1000. and exponent // 3 < len(prefix) - 1):
		a /= base
		exponent += 3

	# Keep at most four significant characters, dropping a trailing dot
	byteSigs = str(a)[:4]
	if (byteSigs.endswith('.')):
		byteSigs = byteSigs[:3]
	# // keeps the index an int under both Python 2 and Python 3
	return byteSigs + ' ' + prefix[exponent // 3]

def fix_broken_links(link):
	"""Rewrite known-dead URL fragments in `link` to their current
	locations, using a fixed substitution table."""
	# (old fragment, replacement) pairs, applied in order.
	substitutions = (
		# Moved resources
		('virginiadot.org/infoservice/resources/', 'virginiadot.org/info/resources/'),
		('ncdot.org/transit/aviation/ncairports/locations/pdf/', 'ncdot.org/transit/aviation/download/ncairports/'),
		('waitangi-tribunal.govt.nz/doclibrary/researchwhanui/', 'waitangi-tribunal.govt.nz/doclibrary/public/researchwhanui/'),
		# 301 Permanent Redirects
		('transportation.ky.gov/planning/', 'www.planning.kytc.ky.gov/'),
		('official-documents.co.uk/', 'official-documents.gov.uk/'),
		('http://bmj.bmjjournals.com/', 'http://www.bmj.com/'),
		('http://bris.ac.uk/', 'http://www.bristol.ac.uk/'),
		('http://www.shef.ac.uk/socst/', 'http://www.shef.ac.uk/socstudies/'),
		('http://www.sims.berkeley.edu:8000/', 'http://www2.sims.berkeley.edu/'),
		('http://www.cs.wm.edu/hpcs/', 'http://www.cse.ohio-state.edu/hpcs/'),
		('http://www.pchrgaza.org/', 'http://www.pchrgaza.ps/'),
		('http://www.almlondon.org.uk/', 'http://www.mlalondon.org.uk/'),
		('http://www.state.ma.us/eot/', 'http://www.eot.state.ma.us/'),
		('http://www.aapt.org.au/', 'http://www.ausapt.org.au/'),
		('http://berlin.usembassy.gov/', 'http://germany.usembassy.gov/'),
	)
	for old, new in substitutions:
		link = link.replace(old, new)
	return link

def update_size_paramter(template_text):
	"""Return `template_text` ({{PDFlink|...}} transclusion) with a
	refreshed file-size annotation.

	Fetches the link target with a HEAD request and, when the server
	reports a PDF (or octet-stream) with a plausible Content-Length that
	differs from the recorded one, rebuilds the template with a human
	readable size plus a hidden comment holding the exact type and byte
	count.  In every other case the template text is returned unchanged.
	"""
	m = re.search(r'\{\{(?P<tpl>[^|]*)\|(1=)?(?P<text>[^|]*).*(, (?P<size>\d+) bytes .*)??\}\}', fix_broken_links(template_text))
	link_text = m.group('text')
	___location  = urlpattern.search(link_text).group(0)
	
	# Byte count already recorded in the template, if any
	if(m.group('size') is not None and m.group('size') !=''):
		old_size = int(m.group('size'))
	else:
		old_size = 0	# was the string '0'; int keeps the comparison below type-consistent
	
	# An '=' in the link text makes MediaWiki parse it as a named
	# parameter, so the size must then be passed explicitly as 2=
	if (link_text.find('=') != -1):
		parameter_prefix = '2='
	else:
		parameter_prefix = ''
	
	# Parse indirect HTML character references
	# NOTE(review): \d\d captures a decimal reference but %\1 is read as
	# hex by the server -- confirm this mapping is intended.
	___location = re.sub(r'&\#(\d\d);', r'%\1', ___location)
	# Unescape entity-encoded ampersands.  The previous form,
	# re.sub(r'&', r'&', ...), was a no-op left over from HTML mangling.
	___location = re.sub(r'&amp;', r'&', ___location)
	
	(redirect, response, reason, content_length, content_type) = checkLink(___location)
	
	if (content_type is not None and content_length is not None and int(content_length) > 8 and int(content_length) != old_size):
		# I should really put in 404 error handling code, but this has been working just fine.
		if re.findall(r'pdf|octet-stream', content_type):
			return  u'{{%s|%s|%s%s<!-- %s, %s bytes -->}}' % (m.group('tpl'), link_text, parameter_prefix, binary_notation(content_length), content_type, content_length )
		else:
			wikipedia.output(u'FIXME:	Bad response: content_type=%s, code=%s, ___location=%s' % (content_type, response, ___location))
	# In every other case hand the template back unchanged
	return template_text

def process_article(page):
		"""Normalise {{PDFlink}} usage on one page and save the result.

		Steps: canonicalise the template name's casing, convert the
		various hand-written "[http ...] (PDF)" styles into
		{{PDFlink|...}}, strip the template out of citation templates
		(which take |format=PDF instead), refresh the cached file size
		of every remaining transclusion via update_size_paramter(),
		pick an edit summary from the size of the change, and write
		the page back when anything changed.  Sleeps after each page
		to throttle server load.
		"""
		wikitext = page.get()
		
		# Fix casing (reduces the number of expressions matched below)
		wikitext = re.compile(r'\{\{\s*(template:|)pdf', re.IGNORECASE).sub(r'{{PDF', wikitext)
		
		# State point.  Count any changes as needing an update if they're after this line
		state0 = wikitext
		
		# Convert hard coded pdf links  (ex: [http link] (pdf) )
		wikitext = re.sub(r'(\[\w*://[^][]*\]) *\((\[\[[^|\]]*)?\.?(PDF|pdf) *([Ff]ile)? *([Ff]ormat)?(\]\])?\)', r'{{PDFlink|\1}}', wikitext)
		
		# Convert from the old style to the new style (ex: [http link] {{PDF}} or {{PDF}} [http link] )
		wikitext = re.sub(r'[(]?\{\{(PDFlink|PDF)\}\}[)]? *(\[http://[^][]*\])', r'{{\1|\2}}', wikitext)
		wikitext = re.sub(r'("?\[http[^]]*\]"?)([^a-zA-Z0-9()]*) *[(]?\{\{ *(PDFlink|PDF) *\}\}[)]?', r'{{\3|\1}}\2', wikitext)
		
		# Experimental: Convert the tag at the end of a bullet list item (ex: * [http link] some text ([[PDF]]) )
		wikitext = re.compile(r'(\n *\*+[^\n:/]*)(\[http://[^][]*\])([^\n:/]*) *[(](\[\[|\{\{)?(Portable Document Format[|]PDF|pdflink).?(pdf.?)?(file|format|datei)?(\}\}|\]\])?[)]', re.IGNORECASE).sub(r'\1{{PDFlink|\2}}\3', wikitext)
		wikitext = re.sub(r'(\n *\*+[^\n:/]*)(\[http://[^][]*\])([^\n:/]*) *\{\{(PDFlink|PDF)\}\}', r'\1{{\4|\2}}\3', wikitext)
		
		# Remove PDFlink from citation templates, which take |format=PDF instead
		# {{cite |format={{PDF}}}}
		wikitext = re.sub(r'(format *= *)(PDF|pdf|)(\(|)\{\{PDF[^{}]*\}\}(\)|)', r'\1PDF', wikitext)
		#  {{cite.*?}}{{PDF}}
		wikitext = re.sub(r'(\{\{(Cite|cite)[^}]*)(}}[^a-zA-Z]*)(\(|)\{\{(PDF|PDFlink)\}\}(\)|)', r'\1 |format=PDF\3', wikitext)
		# {{cite | lang= EN {{PDF}} }}
		wikitext = re.sub(r'(\{\{.ite web[^}]*) *(\(|)\{\{(PDF|PDFlink)\}\}(\)|) *([^}]*\}\})', r'\1 |format=PDF \5', wikitext)
		
		# Second state point: any change past here is a file-size update
		state1 = wikitext
		m = re.findall(r'\{\{PDF[link]{0,4}\|[^{}]*?\}\}', wikitext)
		
		# Refresh the size annotation of every transclusion carrying a URL
		for s in m:
			if re.findall(r'http[s]?://', s):
				replacetext = update_size_paramter(s)
				wikitext    = re.sub(re.escape(s), replacetext, wikitext)
				# Uncomment the line below to see the replacement text
#			wikipedia.output(u'OUTPUT:	%s' % replacetext)
		
		# Fix equal sign problem: an '=' in the first parameter must be passed as 1=
		wikitext = re.sub(r'\{\{(PDF|PDFlink)\|(1=|)(.{2}[^{|}]+=[^{|}]+)', r'{{\1|1=\3', wikitext)
		
		# Test to see if file sizes parameter was untouched
		if wikitext == state1:
			if len(wikitext) - len(state0) <= 4:
				# 4 or more bytes removed typically indicate an embedded citation removal
				EditMsg = msg_removed_cite
			else:
				EditMsg = msg_fixed
		else:
			if len(wikitext) - len(state1) > 34:
				# Minimum of 34 bytes to add file size information
				EditMsg = msg_added
			else:
				EditMsg = msg_updated
		wikipedia.setAction(wikipedia.translate(page.site().language(), EditMsg))
		
		# Alert me if the page contains a {{PDFlink}} with no link in it
		if re.findall(r'\{\{PDF(link|)\|[^:]*\}\}', wikitext):
			wikipedia.output(u'FIXME:	No link in {{PDFlink}}')
		
		# If the text has changed at all since the state point, upload it
		if (wikitext != state0):
			try:
				wikipedia.output(u'WRITE:	Delta length of %s bytes.' % str(len(wikitext)-len(state0)))
				page.put(wikitext)
			except:
				wikipedia.output(u'ERROR:	Except raised while writing.')
			
			# Pause to reduce load on the servers
			time.sleep(writeDelay)
		else:
			time.sleep(readDelay)
	
def main():
	"""Parse the command line, build a page generator and run the bot
	over every non-redirect page it yields."""
	site = wikipedia.getSite()
	gen = None
	namespaces = [0]	# article namespace by default

	for arg in wikipedia.handleArgs():
		if arg.startswith('-ref:'):
			gen = pagegenerators.ReferringPageGenerator(wikipedia.Page(site, arg[5:]))
		elif arg.startswith('-file:'):
			gen = pagegenerators.TextfilePageGenerator(arg[6:])
		elif arg.startswith('-cat:'):
			gen = pagegenerators.CategorizedPageGenerator(catlib.Category(site, arg[5:]))
		elif arg.startswith('-links:'):
			gen = pagegenerators.LinkedPageGenerator(wikipedia.Page(wikipedia.getSite(), arg[7:]))
		elif arg.startswith('-page:'):
			gen = iter([wikipedia.Page(wikipedia.getSite(), unicode(arg[6:]))])
		elif arg.startswith('-namespace:'):
			namespaces.append(int(arg[11:]))

	# Without a page source there is nothing to do; show usage and quit
	if gen is None:
		wikipedia.showHelp(u'pdfbot')
		return

	wikipedia.output(u'Read delay is %s seconds.' % readDelay)
	wikipedia.output(u'Write delay is %s seconds.\n' % writeDelay)

	if namespaces != []:
		gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
	# Skip redirects entirely
	gen = pagegenerators.RedirectFilterPageGenerator(gen)

	for page in gen:
		process_article(page)
	wikipedia.output(u'\nOperation Complete.\n')

if __name__ == "__main__":
	try:
		main()
	finally:
		# Always release the framework's state (throttle/logout cleanup),
		# even when main() exits via an exception or KeyboardInterrupt.
		wikipedia.stopme()