User:PDFbot/pdfbot.py: Difference between revisions

Content deleted Content added
Lots of small fixes, standardization of output messages, fixed a bug with "?" in the query string, regex optimizations, support for SI base units
Adds information to the {{cite *}} series of templates; improved parse for {{PDFlink}} which supports and converts to mixed notation; regexp improvements; more general fixes with external programs
Line 1:
<source lang="python">
<pre><nowiki>
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Line 22:
 
import re, sys, time
import wikipedia, pagegenerators, login, config, catlib
import httplib, socket, urlparse
import codecs
# Optional helper modules: the bot degrades gracefully when they are absent.
try:
    import commonfixes
except ImportError:
    wikipedia.output('Unable to import commonfixes')
    commonfixes = None

try:
    import reflinks

    def my_reflink_put_page(self, page, new):
        # Capture the rewritten page text instead of letting the
        # ReferencesRobot save it back to the wiki itself.
        self.page = page
        self.new_text = new

    # Monkey-patch the robot so run() stores its result on the instance.
    reflinks.ReferencesRobot.put_page = my_reflink_put_page
except ImportError:
    wikipedia.output('Unable to import reflinks')
    reflinks = None
 
# Download this file :
# http://www.twoevils.org/files/wikipedia/404-links.txt.gz
# ( maintained by User:Marumari )
listof404pages = '404-links.txt'

# Define global constants
# NOTE(review): the lines below were reconstructed from a MediaWiki revision
# diff in which deleted and added text were fused (e.g. "2010" = old 20 / new
# 10); the added-revision values are kept. Verify against the live bot source.
readDelay = 10 # seconds
writeDelay = 30 # seconds
# Unit-label tables indexed by magnitude (index = exponent // 3):
# mix_prefix: plain "KB" plus wiki-linked larger units ("mixed notation").
mix_prefix = ('bytes', 'KB', '[[Megabyte|MB]]', '[[Gigabyte|GB]]', 'TB', 'PB', 'EB', 'ZB', 'YB')
# SI_prefix: decimal units, all wiki-linked.
SI_prefix = ('bytes', '[[Kilobyte|kB]]', '[[Megabyte|MB]]', '[[Gigabyte|GB]]')
# IEC_prefix: binary units (KiB/MiB/GiB), all wiki-linked.
IEC_prefix = ('bytes', '[[Kibibyte|KiB]]', '[[Mebibyte|MiB]]', '[[Gibibyte|GiB]]')
# the following chars separate a url from its title: []"<>\ \n
# {|} is included since we're in a template
Line 39 ⟶ 60:
'Keep-Alive': '30',
'Connection': 'keep-alive',
}
 
# Edit summary messages, keyed by the language code of the target wiki.

# Used when a file size is added to a {{PDFlink}} transclusion for the first time.
msg_added = dict(
    de=u'BOT: hinzufuegen der Dateigroesse markiert als {{PDFlink}}',
    en=u'Added file size for external links tagged with {{[[Template:PDFlink|PDFlink]]}}',
)

# Used when an already-present file size is refreshed.
msg_updated = dict(
    de=u'BOT: Aktualisieren der Dateigroesse mit Vorlageeinbindung',
    en=u'Updated file size of transcluded {{[[Template:PDFlink|PDFlink]]}}',
)

# Used when malformed {{PDFlink}} usage is corrected.
msg_fixed = dict(
    de=u'BOT: Korrigierte Benutzung der Vorlage {{PDFlink}}',
    en=u'Corrected usage of {{[[Template:PDFlink|PDFlink]]}}',
)

# Used when {{PDFlink}} is stripped out of a citation template.
msg_removed_cite = dict(
    de=u'BOT: Entfernen {{PDFlink}} entsprechend der zitierten Vorlage',
    en=u'Remove {{PDFlink}} from citation template.',
)
 
def checkLink(___location, useHEAD = True, counter = 5):
try:
while (counter >= 0 and ___location is not None):
(scheme, site, path, args, query, frag) = urlparse.urlparseurlsplit(___location)
if query != query and '?' + query or '':
querypath = '?'path +or query'/'
if path == '':
path = '/'
if scheme == "http":
conn = httplib.HTTPConnection(site)
elif scheme == "https":
conn = httplib.HTTPSConnection(site)
else:
#conn.set_debuglevel(1)
conn.request return ('HEAD'___location, path-1, +'Unsupported args + queryProtocol', None, httpHeaderNone)
#conn.putheaderset_debuglevel('User-Agent', userAgent0)
socket.setdefaulttimeout(300)
#conn.endheaders()
try:
request = path.encode('ascii') + query.encode('ascii')
except UnicodeEncodeError:
encoding = 'utf-8'
noencode = '~!^*()_-=&/|,.?;'
request = unicode(urllib.quote(path.encode(encoding) + query.encode(encoding), noencode))
if useHEAD:
conn.request('HEAD', request, None, httpHeader)
else:
conn.request('GET', request, None, httpHeader)
response = conn.getresponse()
Line 80 ⟶ 93:
content_length = response.msg.getheader('content-length')
content_type = response.msg.getheader('content-type')
response_code = response.status
response_reason= response.reason
conn.close()
counter -= 1
if( redirect is not None):
wikipedia.output( u'STATUS: HTTP %s Moved: %s to %s' % (response_coderesponse.status, ___location, redirect) )
if redirect.startswith(redirect[:4] != "http"):
___location = urlparse.urljoin(___location, redirect)
else:
Line 93 ⟶ 104:
else:
___location = None
return [(___location, response_coderesponse.status, response_reasonresponse.reason, content_length, content_type])
except httplib.error, arg:
wikipedia.output(u'ERROR: HTTP %s %s' % (arg, ___location))
return [(___location, 52, "", None, None])
except socket.timeout:
return (___location, 110, 'Connection timeout', None, None)
except socket.error, arg:
wikipedia.output(u'ERROR: Socket %s %s' % (arg, ___location))
return [(___location, arg[0], arg[1], None, None])
except KeyboardInterrupt:
 
raise
# Convert the byte count to a human readable value
except Exception, e: # catches those weird ones
print u'Exception raised: %s' % e
return (___location, 0, "Exception %s" % e, None, None)
def binary_notation(size, base = 1024., prefix = IEC_prefix):
"""
Convert the byte count to a human readable value
"""
a = float(size)
exponent = 0
while( a >= 1000.):
a /= base
exponent += 3
Line 111 ⟶ 131:
# Truncate and remove trailing dot
byteSigs = str(a)[:4]
if (byteSigs.endswith('.')):
byteSigs = byteSigs[:3]
return byteSigs + '&nbsp;' + prefix[exponent / 3]
# return '%3.3g&nbsp;%s' % (byteSigs, prefix[exponent / 3])
 
def fix_broken_links(link):
"""
#This function attempts to fix multipule broken link using its dictionary
Returns link replacement for known broken links
"""
 
# Moving of resources
link = link.replace('virginiadot.org/infoservice/resources/', 'virginiadot.org/info/resources/')
link = link.replace('virginiadot.org/comtravel/', 'virginiadot.org/info/')
link = link.replace('ncdot.org/transit/aviation/ncairports/locations/pdf/', 'ncdot.org/transit/aviation/download/ncairports/')
link = link.replace('waitangi-tribunal.govt.nz/doclibrary/researchwhanui/', 'waitangi-tribunal.govt.nz/doclibrary/public/researchwhanui/')
Line 140 ⟶ 164:
 
def update_size_paramter(template_text):
# NOTE(review): this block is MediaWiki diff residue -- several lines fuse the
# deleted and the added revision text into one token stream (e.g.
# "rewikipedia.subhtml2unicode", "content_typemedia_type", "ifold_size") and
# the block is NOT valid Python as rendered. Only comments are added here;
# reconstructing either revision cleanly would require the original history.
# Intent (from the clean fragments): parse a {{PDFlink}}-style template,
# fetch the target URL's Content-Length, and return the template rewritten
# with a human-readable file size.
# Parse: tpl = template name, text = first parameter, size = byte count from
# an optional trailing ", NNN bytes" annotation.
m = re.search(r'(?s)\{\{(?P<tpl>[^|]*)\|(1=)?(?P<text>[^|]*).*?(, (?P<size>\d[0-9]+) bytes byte.*)??\}\}', fix_broken_links(template_text))
link_text = m.group('text')
# presumably urlpattern matches the first external URL in the link text -- TODO confirm
___location = urlpattern.search(link_text).group(0)
ifold_size = int(m.group('size') isor not None and m.group('size') !=''0):
old_size = int(m.group('size'))
else:
old_size = '0'
parameter_prefix = ''
if (link_text.find('=') != -1):
if '=' in link_text:
# "=" in the link text would be parsed by MediaWiki as a named parameter,
# so the size must be passed explicitly as parameter "2=".
parameter_prefix = '2='
else:
parameter_prefix = ''
# Convert indirect HTML character references (diff merged "Parse"/"Convert")
___location = rewikipedia.subhtml2unicode(r'&\#(\d\d);', r'%\1', ___location)
___location = re.sub(r'&amp;', r'&', ___location)
# HTTP probe; checkLink returns a (___location, status, reason, length, type) tuple.
(redirect, response, reason, content_length, content_typemedia_type) = checkLink(___location)
try:
if (content_type is not None and content_length is not None and= int(content_length) > 8 and int(content_length) != old_size):
except:
# I should really put in 404 error handling code, but this has been working just fine.
content_length = None
if re.findall(r'pdf|octet-stream', content_type):
if media_type and content_length and content_length != old_size:
return u'{{%s|%s|%s%s<!-- %s, %s bytes -->}}' % (m.group('tpl'), link_text, parameter_prefix, binary_notation(content_length), content_type, content_length )
# I should really put in 404 error handling code, but this has been working just fine.
else:
if 'pdf' in media_type or 'octet-stream' in media_type or 'application/download' in media_type:
wikipedia.output(u'FIXME: Bad response: content_type=%s, code=%s, ___location=%s' % (content_type, response, ___location))
# This was the old format using the comment
return template_text
# return u'{{%s|%s|%s%s<!-- %s, %d bytes -->}}' % (m.group('tpl'), link_text, parameter_prefix, binary_notation(content_length), content_type, content_length )
# However, comment was filled with generally non-useful information
return (not (old_size == 0) or template_text.count('|')<2, u'{{%s|%s|%s%s}}' % (m.group('tpl'), link_text, parameter_prefix, binary_notation(content_length, prefix = mix_prefix)))
else:
wikipedia.output(u'FIXME: Bad response: code: %d, type: %s, ___location: %s' % (response, media_type, ___location))
# If anything else return template_text back
if old_size:
return (False, u'{{%s|%s|%s%s}}' % (m.group('tpl'), link_text, parameter_prefix, binary_notation(old_size, prefix = mix_prefix)))
else:
return (False, template_text)
 
def process_article(page):
# NOTE(review): this block is rendered from a MediaWiki revision diff with all
# indentation stripped; some lines fuse deleted+added revision text (e.g.
# "re.compilesub", "iffor m in re.findallfinditer") and are not valid Python
# as shown. Only comments are added/corrected here.
# Overall flow: normalize {{PDF}}/{{PDFlink}} usage in the page wikitext,
# update recorded file sizes via update_size_paramter/checkLink, pick an edit
# summary based on which "state point" snapshots changed, then save the page.
try:
deadLinks = codecs.open(listof404pages, 'r', 'latin_1').read()
except IOError:
wikipedia.output('You need to download http://www.twoevils.org/files/wikipedia/404-links.txt.gz and to ungzip it in the same directory')
raise
wikipedia.output('Getting page %s' % page.aslink())
wikitext = page.get()
# Fix casing (reduces the number of possible expressions)
wikitext = re.compilesub(r'(?i)\{\{\s*(template:|)pdf', re.IGNORECASE).sub(r'{{PDF', wikitext)
wikitext = wikitext.replace('{{PDFLink', '{{PDFlink')
# State point. Count any changes as needing an update if they're after this line
state0 = wikitext
# [http {{PDF}}]
wikitext = re.sub(r'(\[\w+://[^][<>"\s]+\s[^][\n]+?)\s*(\{\{(PDFlink|PDF)\}\})', r'\2\1', wikitext)
# Convert hard coded pdf links (ex: [http link] (pdf) )
wikitext = re.sub(r'(\[\w*+://[^][]*\]) *\((\[\[[^|\]]*)?\.?(PDF|pdf) *([Ff]ile)? *([Ff]ormat)?(\]\])?\)', r'{{PDFlink|\1}}', wikitext)
# Convert from the old style to the new style (ex: [http link] {{PDF}} or {{PDF}} [http link] )
wikitext = re.sub(r'"[(]?\{\{(PDFlink|PDF)\}\}[)]? *((?P<quote>'*)\[http\w+://[^][]*\](?P=quote)'?)", r'{{\1|\2}}', wikitext)
wikitext = re.sub(r'("?\[http\w+://[^]]*\]"?)([^a-zA-Z0-9()]*) *[(]?\{\{ *(PDFlink|PDF) *\}\}[)]?', r'{{\3|\1}}\2', wikitext)
# Experimental: convert with tag at the end of a bullet list (ex: * [http link] some text ([[PDF]]) )
if '{{PDF' in wikitext:
wikitext = re.compile(r'(\n *\*+[^\n:/]*)(\[http://[^][]*\])([^\n:/]*) *[(](\[\[|\{\{)?(Portable Document Format[|]PDF|pdflink).?(pdf.?)?(file|format|datei)?(\}\}|\]\])?[)]', re.IGNORECASE).sub(r'\1{{PDFlink|\2}}\3', wikitext)
wikitext = re.subcompile(r'(\n *\*+[^\n:/]*)(\[http\w+://[^][]*\])([^\n:/]*) *[(](\[\[|\{\{)?(PDFlinkPortable Document Format[|]PDF|pdflink).?(pdf.?)?(file|format|datei)?(\}\}|\]\])?[)]', re.IGNORECASE).sub(r'\1{{\4PDFlink|\2}}\3', wikitext)
wikitext = re.sub(r'(\n *\*+[^\n:/]*)(\[\w+://[^][]*\])([^\n:/]*) *\{\{(PDFlink|PDF)\}\}', r'\1{{\4|\2}}\3', wikitext)

# Experimental: move {{PDF}} back in <ref> tag
wikitext = re.sub(r'(<ref[^][{}<>]*>[^][<>=]*?)("?\[\w+://[^][<>\s"]+[^]\n]*\]"?)([^{}<>]*)\{\{(PDFlink|PDF)\}\}', r'\1{{\4|\2}}\3', wikitext)
# State point. Correction of {{PDFlink}} template
genfixState = wikitext
# Remove PDFlink from citation templates
# {{cite |format={{PDF}}}}
wikitext = re.sub(r'(?s)(format *= *)(PDF|pdf|)?[(\(|)]?\{\{PDF[^{}]*?\}\}(\)|[)]?', r'\1PDF', wikitext)
# {{cite.*?}}{{PDF}}
wikitext = re.sub(r'(?s)(\{\{(Cite|cite) *[Cc]ite[^}]*)(\}\}[^a-zA-Z\w() ]*) *[(\(|)]?\{\{(PDF|PDFlink)\}\}(\)|[)]?', r'\1 |format=PDF\32', wikitext)
# {{cite | lang= EN {{PDF}} }}
wikitext = re.sub(r'(?s)(\{\{. *[Cc]ite web[^}]*) *(\(|)\{\{(PDF|PDFlink)\}\}(\)|) *([^}]*\}\})', r'\1 |format=PDF \5', wikitext)
# {{PDF| {{template...}} }}
wikitext = re.sub(r'(?s)\{\{(PDFlink|PDF)\|\s*(\{\{[^{}]+?(\|[^{|}]+)?\}\})[\s|]*\}\}', r'\2', wikitext)
# {{citation|url={{PDFlink|...}} }}
wikitext = re.sub(r'(?i)\{\{(([Cc]itation|[Cc]ite)[^{}]+?)\{\{(PDFlink|PDF)\|([^{}]*?)(\|[^{|}]+)?\}\}', r'{{\1\4', wikitext)
# State point. Removal of {{PDFlink}} in certain instances
state1 = wikitext
state2 = wikitext
m = re.findall(r'\{\{PDF[link]{0,4}\|[^{}]*?\}\}', wikitext)
cleantext = wikitext
# This is ugly, since we need the comments to check the relative filesize
for m in re.finditer(r'<!--.*?-->|<nowiki[^>]*>.*?</nowiki>', cleantext):
if '{{PDF' in m.group():
cleantext = cleantext.replace(m.group(), '')
sizechange = 0
for s in m:
iffor m in re.findallfinditer(r'http\{\{(?:PDFlink|PDF)\|[s^{}]+?://\}\}', scleantext):
if 'http://' in m.group() or 'https://' in m.group():
replacetext = update_size_paramter(s)
(changed, replacetext) = update_size_paramter(m.group())
wikitext = re.sub(re.escape(s), replacetext, wikitext)
sizechange += changed and 1 or 0
# print "update page? %s"%(sizechange, )
wikitext = wikitext.replace(m.group(), replacetext)
# Uncomment the below line to see the replacement text
# wikipedia.output(u'OUTPUT: %s' % replacetext)
for s in re.findall(ur'(?ui)\{\{(?:cite[\w\s]+)\|[^{}]+?\}\}', cleantext):
# Fix equal sign problem
wikitext murl = re.subsearch(r'\{|\{(PDF|PDFlink)s*url\|(1s*=|)\s*(.{2}?P<url>http[s]?://[^{|}]+=[^{<>"\s|}]+)', r'{{(\1|1=\3|}})', wikitexts)
if murl and 'PDF' in murl.group().upper() and (not re.search(ur'\|\s*format\s*=\s*[^\s{|}]+', s) or not re.search(ur'\|\s*(access\w+)\s*=\s*([^{|}]+?)\s*(?=[{|}])', s)) and not re.search(ur'\|\s*archiveurl\s*=\s*[^\s{|}]+', s):
repl_url = fix_broken_links(murl.group('url'))
# Test to see if file sizes parameter was untouched
(redirect, response, reason, content_length, media_type) = checkLink(repl_url)
if wikitext == state1:
# media_type not given
if len(wikitext) - len(state0) <= 4:
if not media_type:
# 4 or more bytes removed typically indicate a embed citation removal
continue
EditMsg = msg_removed_cite
# Gone/Not Found error code
elif (response == 410 or (response == 404 and (u'\t%s\t' % murl.group(1) in deadLinks))) and repl_url == murl.group('url'):
wikitext = wikitext.replace(s, s + time.strftime("{{dead link|bot=PDFbot|date=%B %Y}}"))
# valid PDF
# python2.6code: any(item in media_type.lower() for item in ('pdf', 'octet-stream'))
elif 'pdf' in media_type.lower() or 'octet-stream' in media_type.lower():
replacetext = s
replacetext = replacetext.replace(murl.group(), murl.group().replace(murl.group('url'), repl_url))
if re.search(ur'\|\s*format\s*=\s*[^{|}]*[|}]', replacetext):
# fill in the format=
replacetext = re.sub(r'(\|\s*format\s*= ??)(\n* *[{|}])', r'\1PDF\2', replacetext)
else:
# add format=PDF (third last parameter)
replacetext = re.sub(r'(\{\{[^{}]+?)((\s*\|\s*)[^[=\]{|}]+(\s*= *)[^{|}]+)(\s*\|[^{|}]+)\}\}', r'\1\3format\4PDF\2\5}}', replacetext)

accessed = re.search(ur'\|\s*(access\w+)\s*=\s*[^{|}\s]+', replacetext)
# no access-anything filled in, add/fill accessdate
if not accessed:
# fill out accessdate if it exists
replacetext = re.sub(r'(\|\s*accessdate\s*= ??)(?=\n* *[{|}])', time.strftime(r'\g<1>%Y-%m-%d'), replacetext)
# if template doesn't contain accessdate then add it (last parameter)
if not re.search(r'\|\s*accessdate\s*=', replacetext):
replacetext = re.sub(r'(\{\{[^{}]+?)((\s*\|\s*)[^[=\]{|}]+?(\s*= *)[^{|}]+?)(\s*)\}\}', time.strftime(r'\1\2\3accessdate\g<4>%Y-%m-%d\5}}'), replacetext)
#replacetext = re.sub(r'(\{\{[^{}]+?)((\s*\|\s*)[^[=\]{|}]+(\s*= *)[^{|}]+)(\s*\|[^{|}]+)\}\}', time.strftime(r'\1\2\5\3accessdate\g<4>%Y-%m-%d}}'), replacetext)

# put back in
wikitext = wikitext.replace(s, replacetext)
sizechange += 1
# Uncomment the below line to see the replacement text
wikipedia.output(u'OUTPUT: %s' % replacetext)

# remove duplicate {{dead link}}
dead_templates = r'[Dd]ead[ _]*link|[Dd]l|[Dd]l-s|404|[Bb]roken[ _]+link|[Cc]leanup-link'
wikitext = re.sub('(\{\{(?:%s)[^}]*?\}\})+((</ref>)?\{\{(?:%s)[^}]*?\}\})'%(dead_templates, dead_templates), r'\2', wikitext)

# Figure out an edit message of what we did
if sizechange:
if state2 != state0:
EditMsg = "Updating %d PDF%s and fixes" % (sizechange, sizechange>1 and 's' or '')
else:
EditMsg = "Updating %d PDF%s" % (sizechange, sizechange>1 and 's' or '')
EditMsg = msg_fixed
else:
# state0: renamed templates
if len(wikitext) - len(state1) > 34:
# genfix: fixPDFlink
# Minimum of 34 bytes to add file size information
# state2: removePDFlink
EditMsg = msg_added
else#wikitext: -
EditMsg = msg_updated"General fixes for PDFs"
if wikitext == state0:
wikipedia.setAction(wikipedia.translate(page.site().language(), EditMsg))
pass # text stayed the same
elif wikitext == genfixState:
EditMsg = "Correct {{PDFlink}} syntax"
elif wikitext == state2:
if genfixState == state0: # no fixes
EditMsg = "Remove incorrect {{PDFlink}}"
else: #fixes+removal
pass
wikipedia.setAction(EditMsg)
updateSizes = wikitext
# alert me if the page contains {{pdflink|no-link}}

if re.findall(r'\{\{PDF(link|)\|[^:]*\}\}', wikitext):
# Fix equal sign problem
wikipedia.output(u'FIXME: No link in {{PDFlink}}')
# moved here to avoid changing edit message
wikitext = re.sub(r'\{\{(PDF|PDFlink)\|(1=|)(.{2}[^{|}]+=[^{|}]+)', r'{{\1|1=\3', wikitext)
# alert me if the page does not contain {{pdflink|no-link}}
if re.search(r'\{\{PDF(link|)\|[^:]+\}\}', wikitext):
wikipedia.output(u'FIXME: No link in {{PDFlink}} on %s' % page.aslink())
# If the text has changed at all since the state point, upload it
if (wikitext != state0 and sizechange) or state2 != state0 or updateSizes != wikitext:
wikipedia.output('PDFs updated: % 3d' % sizechange)

# [[pdf]] -> [[PDF]]
wikitext = re.sub(r'\[\[pdf(?=[|\]])', '[[PDF', wikitext)

# {{cite | format = pdf }}
wikitext = re.sub(r'(?s)(?:([|]\s*format\s*=\s*)(?:\[\[|)[Pp][Dd][Ff](?:\]\]|))+(\s*[{|}])', r'\1PDF\2', wikitext)
# Too many to just fix when we come across, so we don't count it with the fixes
# Unlink PDF in format parameters
wikitext = re.sub(r'(?i)(\|\s*format\s*=\s*)\[\[(adobe|portable|document|file|format|pdf|\.|\s|\(|\)|\|)+\]\]', r'\1PDF', wikitext)
wikitext = re.sub(r'(?i)(\|\s*format\s*=\s*)(\s*\.?(adobe|portable|document|file|format|pdf|\(|\)))+?(?=\s*[|}])', r'\1PDF', wikitext)
# Apply common fixes if available
if commonfixes:
wikitext = commonfixes.fix(page, text=wikitext)

# Apply reflink if available
if reflinks:
# Hackish hook: inject our text so the robot operates on it
page._contents = wikitext
if page.get() != wikitext:
wikipedia.output("Injected text wasn't returned with page.get()")
elif reflinks.linksInRef.search(wikitext):
reflinksbot = reflinks.ReferencesRobot(iter([page]))
reflinksbot.run()
if hasattr(reflinksbot, 'new_text'):
if reflinksbot.page != page:raise 'pages not the same'
wikitext = reflinksbot.new_text
# Reset edit summary
wikipedia.setAction(EditMsg)

try:
wikipedia.output(u'WRITE: Delta length of %s 3d bytes.' % str(len(wikitext)-len(state0)))
page.put(wikitext)
except Exception, e:
wikipedia.output(u'ERROR: Except %s raised while writing.' % e)
# Pause to reduce load on the servers
time.sleep(writeDelay)
else:
wikipedia.put_throttle()
time.sleep(readDelay)
pass
def main():
site = wikipedia.getSite()
Line 262 ⟶ 408:
page = wikipedia.Page(wikipedia.getSite(), unicode(arg[6:]))
gen = iter([page])
elif arg.startswith('-namespacens:'):
namespaces.append(int(arg[11:]))
elif arg.startswith('-delay:'):
global readDelay, writeDelay
readDelay = int(arg[7:])
writeDelay = int(arg[7:])
 
if not gen is None:
wikipedia.showHelp(u'pdfbot')
return
 
wikipedia.output(u'Delays are %s s for read and %s for writes' % (readDelay, writeDelay,) )
wikipedia.output(u'Read delay is %s seconds.' % readDelay)
wikipedia.output(u'Write delay is %s seconds.\n' % writeDelay)
if namespaces != []:
gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
gen = pagegenerators.RedirectFilterPageGenerator(gen)
for page in gen:
if page.site().messages:
process_article(page)
wikipedia.output(u'\nOperationMessages Completeleft on talk page, halting.\n')
return
process_article(page)
wikipedia.output(u'Finished updating')
 
if __name__ == "__main__":
Line 286 ⟶ 438:
finally:
wikipedia.stopme()
</source>
</nowiki></pre>