#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This script can be used to update links transcluded
using the {{PDFlink}} template.

Syntax: python pdfbot.py [-ref: TemplateName]

Command line options:
    -file:       Update pages listed in a text file.
    -ref:        Update pages transcluding from a given page.
    -cat:        Update pages from the given category.
    -links:      Update pages linked from a given page.
    -page:       Update that page.
    -namespace:  Also process pages in the given namespace number
                 (namespace 0 is always included).

"""

#
# (c) Dispenser, 2007
#

import re
import sys
import time

import wikipedia
import pagegenerators
import login
import config
import catlib

import httplib
import socket
import urlparse

# Define global constants
readDelay = 20   # seconds to sleep after a page that needed no edit
writeDelay = 60  # seconds to sleep after attempting to save a page
SI_prefix = ['bytes', '[[Kilobyte|kB]]', '[[Megabyte|MB]]', '[[Gigabyte|GB]]']
IEC_prefix = ['bytes', '[[Kibibyte|KiB]]', '[[Mebibyte|MiB]]', '[[Gibibyte|GiB]]']
# The following characters separate the URL from the title: []"<>\ \n
# {|} is included since we're in a template
urlpattern = re.compile(r'http[s]?://[^][<>\s"{|}]*', re.IGNORECASE)
httpHeader = {
    'User-Agent': 'PDFbot (http://en.wikipedia.org/wiki/User:PDFbot)',
    'Accept': 'application/pdf,application/octet-stream,*/*;q=0.5',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Keep-Alive': '30',
    'Connection': 'keep-alive',
}

# Edit summary messages
msg_added = {
    'de': u'BOT: hinzufuegen der Dateigroesse markiert als {{PDFlink}}',
    'en': u'Added file size for external links tagged with {{[[Template:PDFlink|PDFlink]]}}',
}
msg_updated = {
    'de': u'BOT: Aktualisieren der Dateigroesse mit Vorlageeinbindung',
    'en': u'Updated file size of transcluded {{[[Template:PDFlink|PDFlink]]}}',
}
msg_fixed = {
    'de': u'BOT: Korrigierte Benutzung der Vorlage {{PDFlink}}',
    'en': u'Corrected usage of {{[[Template:PDFlink|PDFlink]]}}',
}
msg_removed_cite = {
    'de': u'BOT: Entfernen {{PDFlink}} entsprechend der zitierten Vorlage',
    'en': u'Remove {{PDFlink}} from citation template.',
}

# Ordered (old, new) URL fragments applied by fix_broken_links().
_BROKEN_LINK_FIXES = (
    # Moving of resources
    ('virginiadot.org/infoservice/resources/',
     'virginiadot.org/info/resources/'),
    ('ncdot.org/transit/aviation/ncairports/locations/pdf/',
     'ncdot.org/transit/aviation/download/ncairports/'),
    ('waitangi-tribunal.govt.nz/doclibrary/researchwhanui/',
     'waitangi-tribunal.govt.nz/doclibrary/public/researchwhanui/'),
    # 301 Permanent Redirects
    ('transportation.ky.gov/planning/', 'www.planning.kytc.ky.gov/'),
    ('official-documents.co.uk/', 'official-documents.gov.uk/'),
    ('http://bmj.bmjjournals.com/', 'http://www.bmj.com/'),
    ('http://bris.ac.uk/', 'http://www.bristol.ac.uk/'),
    ('http://www.shef.ac.uk/socst/', 'http://www.shef.ac.uk/socstudies/'),
    ('http://www.sims.berkeley.edu:8000/', 'http://www2.sims.berkeley.edu/'),
    ('http://www.cs.wm.edu/hpcs/', 'http://www.cse.ohio-state.edu/hpcs/'),
    ('http://www.pchrgaza.org/', 'http://www.pchrgaza.ps/'),
    ('http://www.almlondon.org.uk/', 'http://www.mlalondon.org.uk/'),
    ('http://www.state.ma.us/eot/', 'http://www.eot.state.ma.us/'),
    ('http://www.aapt.org.au/', 'http://www.ausapt.org.au/'),
    ('http://berlin.usembassy.gov/', 'http://germany.usembassy.gov/'),
)


def checkLink(___location, useHEAD = True, counter = 5):
    """Issue HEAD requests for a URL, following redirects.

    Parameters:
        ___location -- absolute http/https URL to check.
        useHEAD     -- accepted for compatibility; currently not consulted,
                       a HEAD request is always sent.
        counter     -- redirect budget; the loop runs while it is >= 0.

    Returns a list
        [redirect, response_code, response_reason, content_length, content_type]
    where ``redirect`` is None when the last response carried no Location
    header, or the still-pending target when the budget ran out.  On an
    httplib or socket error the URL being requested is returned in the
    first slot and both content fields are None.
    """
    # Defaults so the return below is well defined even if the loop body
    # never runs (e.g. a negative counter).
    response_code = None
    response_reason = ''
    content_length = None
    content_type = None
    try:
        while counter >= 0 and ___location is not None:
            (scheme, site, path, args, query, frag) = urlparse.urlparse(___location)
            if scheme == 'http':
                conn = httplib.HTTPConnection(site)
            elif scheme == 'https':
                conn = httplib.HTTPSConnection(site)
            else:
                wikipedia.output(u'ERROR: Unsupported URL scheme %s' % ___location)
                return [___location, None, '', None, None]

            # Rebuild the request target; urlparse strips the separators.
            if path == '':
                path = '/'
            if args != '':
                args = ';' + args
            if query != '':
                query = '?' + query

            try:
                #conn.set_debuglevel(1)
                conn.request('HEAD', path + args + query, None, httpHeader)
                response = conn.getresponse()
                redirect = response.msg.getheader('location')
                content_length = response.msg.getheader('content-length')
                content_type = response.msg.getheader('content-type')
                response_code = response.status
                response_reason = response.reason
            finally:
                # Release the socket even when the request fails.
                conn.close()

            counter -= 1
            if redirect is not None:
                wikipedia.output(u'STATUS: HTTP %s Moved: %s to %s'
                                 % (response_code, ___location, redirect))
                if redirect[:4] != 'http':
                    # Relative redirect: resolve against the current URL.
                    ___location = urlparse.urljoin(___location, redirect)
                else:
                    ___location = redirect
            else:
                ___location = None
        return [___location, response_code, response_reason,
                content_length, content_type]
    except httplib.error as arg:
        wikipedia.output(u'ERROR: HTTP %s %s' % (arg, ___location))
        return [___location, 52, "", None, None]
    except socket.error as arg:
        wikipedia.output(u'ERROR: Socket %s %s' % (arg, ___location))
        # Not every socket error carries an (errno, message) pair.
        details = arg.args
        if len(details) >= 2:
            return [___location, details[0], details[1], None, None]
        return [___location, None, str(arg), None, None]


# Convert the byte count to a human readable value
def binary_notation(size, base = 1024., prefix = IEC_prefix):
    """Format a byte count as a short number followed by a unit label.

    The value is divided by ``base`` until it drops below 1000 (or the
    largest unit in ``prefix`` is reached), then cut to at most four
    characters, e.g. 1536 -> '1.5 [[Kibibyte|KiB]]'.
    """
    value = float(size)
    unit = 0
    last_unit = len(prefix) - 1
    # Stop at the last label instead of indexing past the end of prefix.
    while value >= 1000. and unit < last_unit:
        value /= base
        unit += 1

    # Truncate and remove trailing dot
    digits = str(value)[:4]
    if digits.endswith('.'):
        digits = digits[:3]
    return digits + ' ' + prefix[unit]


def fix_broken_links(link):
    """Rewrite known moved or redirected URL fragments found in ``link``.

    Every (old, new) pair in _BROKEN_LINK_FIXES is applied in order.
    """
    for old, new in _BROKEN_LINK_FIXES:
        link = link.replace(old, new)
    return link


def update_size_paramter(template_text):
    """Return ``template_text`` with its file size parameter refreshed.

    The URL inside the template is probed with checkLink().  When the
    server reports a PDF-like content type and a Content-Length above 8
    bytes that differs from the recorded size, a rebuilt template holding
    the human readable size and a size comment is returned.  In every
    other case the original ``template_text`` is returned unchanged.
    """
    # NOTE(review): the greedy '.*' before the lazy optional size group
    # appears to keep 'size' from ever being captured, so old_size is 0 in
    # practice -- confirm before tightening this expression.
    m = re.search(r'\{\{(?P<tpl>[^|]*)\|(1=)?(?P<text>[^|]*).*(, (?P<size>\d+) bytes .*)??\}\}', fix_broken_links(template_text))
    if m is None:
        return template_text
    link_text = m.group('text')
    url_match = urlpattern.search(link_text)
    if url_match is None:
        # The URL is not in the first parameter; nothing to probe.
        return template_text
    url = url_match.group(0)

    if m.group('size'):
        old_size = int(m.group('size'))
    else:
        old_size = 0

    # An '=' in the link would be read as a named parameter, so the size
    # has to be passed explicitly as parameter 2.
    if '=' in link_text:
        parameter_prefix = '2='
    else:
        parameter_prefix = ''

    # Parse indirect HTML character references: two-digit decimal
    # references become percent-escapes (hex), and &amp; becomes '&'.
    url = re.sub(r'&\#(\d\d);',
                 lambda ref: '%%%02X' % int(ref.group(1)), url)
    url = re.sub(r'&amp;', r'&', url)

    (redirect, response, reason, content_length, content_type) = checkLink(url)
    if content_type is None or content_length is None:
        return template_text
    try:
        new_size = int(content_length)
    except ValueError:
        # Malformed Content-Length header; leave the template alone.
        return template_text

    if new_size > 8 and new_size != old_size:
        # I should really put in 404 error handling code, but this has been working just fine.
        if re.findall(r'pdf|octet-stream', content_type):
            return u'{{%s|%s|%s%s<!-- %s, %s bytes -->}}' % (
                m.group('tpl'), link_text, parameter_prefix,
                binary_notation(content_length), content_type, content_length)
        else:
            wikipedia.output(u'FIXME: Bad response: content_type=%s, code=%s, ___location=%s' % (content_type, response, url))
    # If anything else return template_text back
    return template_text


def process_article(page):
    """Normalise {{PDFlink}} usage on ``page`` and refresh file sizes.

    The page text is fetched, rewritten through a series of regular
    expressions, each linked template is passed to update_size_paramter(),
    and the page is saved when the text differs from the state point.
    Sleeps writeDelay seconds after a save attempt, readDelay otherwise.
    """
    wikitext = page.get()

    # Fix Casing (Reduces the number of possible expressions)
    wikitext = re.compile(r'\{\{\s*(template:|)pdf', re.IGNORECASE).sub(r'{{PDF', wikitext)

    # State point.  Count any changes as needing an update if they're after this line
    state0 = wikitext

    # Convert hard coded pdf links (ex: [http link] (pdf) )
    wikitext = re.sub(r'(\[\w*://[^][]*\]) *\((\[\[[^|\]]*)?\.?(PDF|pdf) *([Ff]ile)? *([Ff]ormat)?(\]\])?\)', r'{{PDFlink|\1}}', wikitext)

    # Convert from the old style to the new style (ex: [http link] {{PDF}} or {{PDF}} [http link] )
    wikitext = re.sub(r'[(]?\{\{(PDFlink|PDF)\}\}[)]? *(\[http://[^][]*\])', r'{{\1|\2}}', wikitext)
    wikitext = re.sub(r'("?\[http[^]]*\]"?)([^a-zA-Z0-9()]*) *[(]?\{\{ *(PDFlink|PDF) *\}\}[)]?', r'{{\3|\1}}\2', wikitext)

    # Experimental: Convert with tag at the end of a bullet list (ex: * [http link] some text ([[PDF]]) )
    wikitext = re.compile(r'(\n *\*+[^\n:/]*)(\[http://[^][]*\])([^\n:/]*) *[(](\[\[|\{\{)?(Portable Document Format[|]PDF|pdflink).?(pdf.?)?(file|format|datei)?(\}\}|\]\])?[)]', re.IGNORECASE).sub(r'\1{{PDFlink|\2}}\3', wikitext)
    wikitext = re.sub(r'(\n *\*+[^\n:/]*)(\[http://[^][]*\])([^\n:/]*) *\{\{(PDFlink|PDF)\}\}', r'\1{{\4|\2}}\3', wikitext)

    # Remove PDFlink from citation templates
    # {{cite |format={{PDF}}}}
    wikitext = re.sub(r'(format *= *)(PDF|pdf|)(\(|)\{\{PDF[^{}]*\}\}(\)|)', r'\1PDF', wikitext)
    # {{cite.*?}}{{PDF}}
    wikitext = re.sub(r'(\{\{(Cite|cite)[^}]*)(}}[^a-zA-Z]*)(\(|)\{\{(PDF|PDFlink)\}\}(\)|)', r'\1 |format=PDF\3', wikitext)
    # {{cite | lang= EN {{PDF}} }}
    wikitext = re.sub(r'(\{\{.ite web[^}]*) *(\(|)\{\{(PDF|PDFlink)\}\}(\)|) *([^}]*\}\})', r'\1 |format=PDF \5', wikitext)

    state1 = wikitext
    templates = re.findall(r'\{\{PDF[link]{0,4}\|[^{}]*?\}\}', wikitext)

    checked = set()
    for template in templates:
        # Identical templates are replaced together; probe each only once.
        if template in checked:
            continue
        checked.add(template)
        if re.findall(r'http[s]?://', template):
            replacetext = update_size_paramter(template)
            # Plain substitution: the new text must not be interpreted as
            # a regex replacement template (backslashes, group references).
            wikitext = wikitext.replace(template, replacetext)
            # Uncomment the line below to see the replacement text
            # wikipedia.output(u'OUTPUT: %s' % replacetext)

    # Fix equal sign problem
    wikitext = re.sub(r'\{\{(PDF|PDFlink)\|(1=|)(.{2}[^{|}]+=[^{|}]+)', r'{{\1|1=\3', wikitext)

    # Test to see if file sizes parameter was untouched
    if wikitext == state1:
        # NOTE(review): the original remark reads "4 or more bytes removed
        # typically indicate an embedded citation removal", yet the test
        # is on a delta of at most +4 -- confirm the intended threshold.
        if len(wikitext) - len(state0) <= 4:
            EditMsg = msg_removed_cite
        else:
            EditMsg = msg_fixed
    else:
        if len(wikitext) - len(state1) > 34:
            # Minimum of 34 bytes to add file size information
            EditMsg = msg_added
        else:
            EditMsg = msg_updated
    wikipedia.setAction(wikipedia.translate(page.site().language(), EditMsg))

    # Alert me if the page contains {{pdflink|no-link}}
    if re.findall(r'\{\{PDF(link|)\|[^:]*\}\}', wikitext):
        wikipedia.output(u'FIXME: No link in {{PDFlink}}')

    # If the text has changed at all since the state point, upload it
    if wikitext != state0:
        try:
            wikipedia.output(u'WRITE: Delta length of %s bytes.' % str(len(wikitext)-len(state0)))
            page.put(wikitext)
        except Exception:
            # Best effort: report and move on to the next page, but let
            # KeyboardInterrupt / SystemExit stop the bot.
            wikipedia.output(u'ERROR: Except raised while writing.')

        # Pause to reduce load on the servers
        time.sleep(writeDelay)
    else:
        time.sleep(readDelay)


def main():
    """Build a page generator from the command line and process each page."""
    site = wikipedia.getSite()
    gen = None
    # Namespace 0 is always processed; -namespace: adds further ones.
    namespaces = [0]

    for arg in wikipedia.handleArgs():
        if arg.startswith('-ref:'):
            referredPage = wikipedia.Page(site, arg[5:])
            gen = pagegenerators.ReferringPageGenerator(referredPage)
        elif arg.startswith('-file:'):
            gen = pagegenerators.TextfilePageGenerator(arg[6:])
        elif arg.startswith('-cat:'):
            cat = catlib.Category(site, arg[5:])
            gen = pagegenerators.CategorizedPageGenerator(cat)
        elif arg.startswith('-links:'):
            pagelinks = wikipedia.Page(site, arg[7:])
            gen = pagegenerators.LinkedPageGenerator(pagelinks)
        elif arg.startswith('-page:'):
            page = wikipedia.Page(site, unicode(arg[6:]))
            gen = iter([page])
        elif arg.startswith('-namespace:'):
            namespaces.append(int(arg[11:]))

    if gen is None:
        wikipedia.showHelp(u'pdfbot')
        return

    wikipedia.output(u'Read delay is %s seconds.' % readDelay)
    wikipedia.output(u'Write delay is %s seconds.\n' % writeDelay)

    if namespaces:
        gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
    gen = pagegenerators.RedirectFilterPageGenerator(gen)

    for page in gen:
        process_article(page)
    wikipedia.output(u'\nOperation Complete.\n')


if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()