#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This script can be used to update links transcluded using the {{PDFlink}} template.

Syntax: python pdfbot.py [-ref: TemplateName]

Command line options:

-file:   Update pages listed in a text file.
-ref:    Update pages transcluding from a given page.
-cat:    Update pages from the given category.
-links:  Update pages linked from a given page.
-page:   Update that page.

"""
#
# (c) Dispenser, 2007
#
import re, sys, httplib, time
import wikipedia, pagegenerators, login, config, catlib
from urllib2 import urlparse

# Define global constants
readDelay = 20    # seconds to sleep after a read-only pass
writeDelay = 60   # seconds to sleep after writing a page

# Binary unit labels, indexed by exponent / 3 (power of 1024)
prefix = ['bytes', '[[Kibibyte|KiB]]', '[[Mebibyte|MiB]]', '[[Gibibyte|GiB]]']

# Matches a bare http/https URL up to the first character that would
# terminate it inside wiki markup (bracket, angle bracket, space,
# newline, or pipe).
urlpattern = re.compile(r'http[s]?://[^][>< \n|]*', re.IGNORECASE)

userAgent = 'PDFbot (http://en.wikipedia.org/wiki/User:PDFbot)'

# Edit summary messages
msg_added = {
    'de': u'BOT: hinzufuegen der Dateigroesse markiert als {{PDFlink}}',
    'en': u'Added file size for external links tagged with {{[[Template:PDFlink|PDFlink]]}}',
}

msg_updated = {
    'de': u'BOT: Aktualisieren der Dateigroesse mit Vorlageeinbindung',
    'en': u'Updated file size of transcluded {{[[Template:PDFlink|PDFlink]]}}',
}

msg_fixed = {
    'de': u'BOT: Korrigierte Benutzung der Vorlage {{PDFlink}}',
    'en': u'Corrected usage of {{[[Template:PDFlink|PDFlink]]}}',
}

msg_removed_cite = {
    'de': u'BOT: Entfernen {{PDFlink}} entsprechend der zitierten Vorlage',
    'en': u'Remove {{PDFlink}} from citation template.',
}


def checkLink(___location, redirectCounter=5):
    """Issue HTTP HEAD requests for ___location, following up to redirectCounter redirects.

    Returns a four-element list:
        [final_location_or_None, status, content_length, content_type]
    where status is the last HTTP status code, or 7 after an httplib
    error and 6 after any other failure.
    """
    try:
        # Initialise so a ___location of None cannot leave these unbound at the return.
        response_code = None
        content_length = None
        content_type = None
        while redirectCounter > 0 and ___location is not None:
            (scheme, site, path, args, query, frag) = urlparse.urlparse(___location)
            if scheme == "http":
                conn = httplib.HTTPConnection(site)
            elif scheme == "https":
                conn = httplib.HTTPSConnection(site)
            else:
                # BUG FIX: any other scheme previously fell through and
                # raised NameError on 'conn'; report it as an error instead.
                wikipedia.output(u'Error with URL: %s' % ___location)
                return [___location, 6, None, None]
            #conn.set_debuglevel(1)
            # BUG FIX: the params and query components need their ';' and
            # '?' separators restored; they were previously concatenated
            # bare (path + args + query), which produced malformed request
            # lines for any URL carrying a query string.
            selector = path or '/'
            if args:
                selector += ';' + args
            if query:
                selector += '?' + query
            conn.putrequest('HEAD', selector)
            conn.putheader('User-Agent', userAgent)
            conn.endheaders()
            response = conn.getresponse()
            redirect = response.msg.getheader('___location')
            content_length = response.msg.getheader('content-length')
            content_type = response.msg.getheader('content-type')
            response_code = response.status
            conn.close()
            redirectCounter -= 1
            if redirect is not None:
                if redirect[:4] != "http":
                    # Relative redirect: resolve against the current URL.
                    ___location = urlparse.urljoin(___location, redirect)
                else:
                    ___location = redirect
                wikipedia.output(u'STATUS: HTTP %s Moved: %s' % (response_code, ___location))
            else:
                ___location = None
        return [___location, response_code, content_length, content_type]
    except httplib.error as arg:
        wikipedia.output(u'HTTP Error: %s %s' % (arg, ___location))
        return [___location, 7, None, None]
    except Exception:
        wikipedia.output(u'Error with URL: %s' % ___location)
        return [___location, 6, None, None]


# Convert the byte count to a human readable value
def binary_notation(size):
    """Format a byte count with binary unit prefixes, e.g. '56.2 [[Kibibyte|KiB]]'.

    Significant digits are truncated, not rounded.
    """
    a = float(size)
    exponent = 0
    # ROBUSTNESS: clamp at the largest known unit instead of indexing
    # past the end of the prefix list for sizes of ~1000 GiB and up.
    while a >= 1000. and (exponent / 3) < len(prefix) - 1:
        a /= 1024.
        exponent += 3

    # Truncate to four characters and remove a trailing dot
    byteSigs = str(a)[:4]
    if byteSigs.endswith('.'):
        byteSigs = byteSigs[:3]
    return byteSigs + ' ' + prefix[exponent / 3]


def fix_broken_links(hypertext):
    """Attempt to fix multiple known-broken links from a fixed rewrite table."""
    # Moving of resources.  Dots in the patterns are escaped so '.' cannot
    # match arbitrary characters in unrelated URLs.
    hypertext = re.sub(r'virginiadot\.org/infoservice/resources/', r'virginiadot.org/info/resources/', hypertext)
    hypertext = re.sub(r'ncdot\.org/transit/aviation/ncairports/locations/pdf/', r'ncdot.org/transit/aviation/download/ncairports/', hypertext)

    # 301 Permanent Redirects
    hypertext = re.sub(r'transportation\.ky\.gov/planning/', r'www.planning.kytc.ky.gov/', hypertext)
    hypertext = re.sub(r'official-documents\.co\.uk/', r'official-documents.gov.uk/', hypertext)

    return hypertext


def update_size_paramter(template_text):
    """Fetch the HTTP headers of the link inside one {{PDFlink|...}} transclusion
    and return the template text with a file size parameter appended.

    Returns template_text unchanged when the link cannot be verified as a
    PDF (or the content type / length are unusable).
    """
    # The following characters separate a url from its title: []"<>\ \n
    # | is included since we're inside a template.
    fixed_text = fix_broken_links(template_text)
    ___location = urlpattern.search(fixed_text).group(0)
    prefix_text = re.search(r'(\{\{[^|]*\|[^|]*).*\}\}', fixed_text).group(1)
    if '=' in template_text:
        # An equals sign anywhere forces the numbered-parameter form.
        parameter_prefix = '|2='
    else:
        parameter_prefix = '|'

    # Parse indirect HTML character references
    ___location = re.sub(r'&\#61;', r'=', ___location)
    # BUG FIX: this previously replaced '&' with '&' (a no-op); decode the
    # '&amp;' entity back into a literal ampersand, matching the '&#61;'
    # decoding above.
    ___location = re.sub(r'&amp;', r'&', ___location)

    (redirect, response, content_length, content_type) = checkLink(___location)

    if content_type is not None and content_length is not None and int(content_length) > 16:
        # I should really put in 404 error handling code, but this has been working just fine.
        if re.findall(r'pdf|octet-stream', content_type):
            return prefix_text + parameter_prefix + binary_notation(content_length) + '<!-- ' + content_type + ', ' + str(content_length) + ' bytes -->}}'
        else:
            wikipedia.output(u'Unusual content_type: %s, code: %s' % (content_type, response))
    # If anything else, hand the template back untouched
    return template_text


def process_article(page):
    """Update every {{PDFlink}} usage on page and save the result.

    Normalises template casing, converts legacy link styles, strips
    {{PDFlink}} out of citation templates, refreshes the file size
    parameter, then writes the page if anything changed.
    """
    wikitext = page.get()

    # Fix casing (reduces the number of possible expressions)
    wikitext = re.compile(r'\{\{\s*(template:|)pdf', re.IGNORECASE).sub(r'{{PDF', wikitext)

    # State point.  Count any changes as needing an upload if they're after this line.
    state0 = wikitext

    # Convert hard coded pdf links (ex: [http link] (pdf) )
    wikitext = re.sub(r'(\[http[^]]*\]) *\((\[\[[^|\]]*|)(PDF|pdf)(\]\]|)\)', r'{{PDFlink|\1}}', wikitext)

    # Convert from the old style to the new style (ex: [http link] {{PDF}} or {{PDF}} [http link] )
    wikitext = re.sub(r'(\(|)\{\{(PDFlink|PDF)\}\}(\)|) *(\[http[^]]*\])', r'{{\2|\4}}', wikitext)
    wikitext = re.sub(r'(("|)\[http[^]]*\]("|))([^a-zA-Z(]*) *(\(|)\{\{(PDFlink|PDF)\}\}(\)|)', r'{{\6|\1}}\4', wikitext)

    # Experimental: Convert with the tag at the end of a bullet list item
    # (ex: * [http link] some text ([[PDF]]) )
    wikitext = re.compile(r'(\n\*[^\n:/]*)(\[http[^]]*\])([^\n:/]*) *\((\[\[|\{\{|)(Portable Document Format\|PDF|PDFlink).?(file|format|datei|)(\}\}|\]\]|)\)', re.IGNORECASE).sub(r'\1{{PDF|\2}}\3', wikitext)
    wikitext = re.sub(r'(\n\*[^\n:/]*)(\[http[^]]*\])([^\n:/]*) *\{\{(PDFlink|PDF)\}\}', r'\1{{\4|\2}}\3', wikitext)

    # Remove PDFlink from citation templates
    wikitext = re.sub(r'(format *= *)(PDF|pdf|)(\(|)\{\{PDF[^{}]*\}\}(\)|)', r'\1PDF', wikitext)
    wikitext = re.sub(r'(\{\{(Cite|cite)[^}]*)(}}[^a-zA-Z]*)(\(|)\{\{(PDF|PDFlink)\}\}(\)|)', r'\1 |format=PDF\3', wikitext)
    wikitext = re.sub(r'(\{\{.ite web[^}]*) *(\(|)\{\{(PDF|PDFlink)\}\}(\)|) *([^}]*\}\})', r'\1 |format=PDF \5', wikitext)

    # Second state point: everything after this adds/updates file sizes.
    state1 = wikitext

    m = re.findall(r'\{\{PDF[link]{0,4}\|[^}]*\}\}', wikitext)
    for s in m:
        if re.findall(r'http[s]?://', s):
            replacetext = update_size_paramter(s)
            # BUG FIX: re.sub(re.escape(s), replacetext, ...) treated
            # backslashes in the fetched replacement as group references;
            # a literal string substitution is what is intended here.
            wikitext = wikitext.replace(s, replacetext)
            # Uncomment the line below to see the replacement text
            # wikipedia.output(replacetext)

    # Fix equal sign problem
    wikitext = re.sub(r'\{\{(PDF|PDFlink)\|(1=|)(.{2}[^{|}]+=[^{|}]+)', r'{{\1|1=\3', wikitext)

    # Test to see if the file size parameter was untouched
    if wikitext == state1:
        # BUG FIX: this branch previously measured wikitext against state1,
        # which is always a zero-byte difference here; measure against the
        # original text (state0) so citation-template removals are detected.
        if len(wikitext) - len(state0) <= -4:
            # 4 or more bytes removed typically indicate an embedded citation removal
            EditMsg = msg_removed_cite
        else:
            EditMsg = msg_fixed
    else:
        if len(wikitext) - len(state1) > 34:
            # Minimum of 34 bytes to add file size information
            EditMsg = msg_added
        else:
            EditMsg = msg_updated
    wikipedia.setAction(wikipedia.translate(page.site().language(), EditMsg))

    # If the text has changed at all since the state point, upload it
    if wikitext != state0:
        try:
            wikipedia.output(u'Page change by %s bytes. Writing new version.' % str(len(wikitext) - len(state0)))
            page.put(wikitext)
        except Exception:
            wikipedia.output(u'------- Write error ------')
        # Pause to reduce load on the servers
        time.sleep(writeDelay)
    else:
        time.sleep(readDelay)


def main():
    """Parse command line arguments, build a page generator, and process each page."""
    site = wikipedia.getSite()
    gen = None

    for arg in wikipedia.handleArgs():
        if arg.startswith('-ref:'):
            referredPage = wikipedia.Page(site, arg[5:])
            gen = pagegenerators.ReferringPageGenerator(referredPage)
        elif arg.startswith('-file:'):
            gen = pagegenerators.TextfilePageGenerator(arg[6:])
        elif arg.startswith('-cat:'):
            cat = catlib.Category(site, arg[5:])
            gen = pagegenerators.CategorizedPageGenerator(cat)
        elif arg.startswith('-links:'):
            pagelinks = wikipedia.Page(site, arg[7:])
            gen = pagegenerators.LinkedPageGenerator(pagelinks)
        elif arg.startswith('-page:'):
            page = wikipedia.Page(site, unicode(arg[6:]))
            gen = iter([page])

    if gen is None:
        wikipedia.showHelp(u'pdfbot')
        return

    wikipedia.output(u'Read delay is %s seconds.' % readDelay)
    wikipedia.output(u'Write delay is %s seconds.\n' % writeDelay)

    # Only process pages from the main namespace, skipping redirects
    gen = pagegenerators.NamespaceFilterPageGenerator(gen, [0])
    gen = pagegenerators.RedirectFilterPageGenerator(gen)

    for page in gen:
        process_article(page)

    wikipedia.output(u'\nOperation Complete.\n')


if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()