User:PDFbot/pdfbot.py: Difference between revisions

Content deleted Content added
code sync, big feature: Dead external link repair
Major addition: Interlanguage support
Line 13:
-cat: Update pages from the given category.
-links: Update pages linked from a given page.
-page: Update that page.
 
"""
 
#
# (c) Dispenser, 2007
#
 
import re, sys, httplib, time
Line 20 ⟶ 25:
from urllib2 import urlparse
 
# Define global constants
readDelay = 20 # seconds
writeDelay = 60 # seconds
httpDebug = 0
prefix = ['bytes', '[[Kibibyte|KiB]]', '[[Mebibyte|MiB]]', '[[Gibibyte|GiB]]']
urlpattern = re.compile(r'http[s]?://[^][>< \n|]*', re.IGNORECASE)
userAgent = 'PDFbot (http://en.wikipedia.org/wiki/User:PDFbot)'
 
# Edit summary messages
def checkLink(___location):
msg_added = {
redirectCounter = 6
'de': u'BOT: hinzufuegen der Dateigroesse markiert als {{PDFlink}}',
'en': u'Added file size for external links tagged with {{[[Template:PDFlink|PDFlink]]}}',
}
msg_updated = {
'de': u'BOT: Aktualisieren der Dateigroesse mit Vorlageeinbindung',
'en': u'Updated file size of transcluded {{[[Template:PDFlink|PDFlink]]}}',
}
msg_fixed = {
'de': u'BOT: Korrigierte Benutzung der Vorlage {{PDFlink}}',
'en': u'Corrected usage of {{[[Template:PDFlink|PDFlink]]}}',
}
msg_removed_cite = {
'de': u'BOT: Entfernen {{PDFlink}} entsprechend der zitierten Vorlage',
'en': u'Remove {{PDFlink}} from citation template.',
}
 
def checkLink(___location, redirectCounter = 5):
try:
while (redirectCounter > 0 and ___location is not None):
(scheme, site, path, args, query, frag) = urlparse.urlparse(___location)
if scheme == "http":
path = path + args + query
conn = httplib.HTTPConnection(site)
elif scheme == "https":
conn.set_debuglevel(httpDebug)
conn = httplib.HTTPSConnection(site)
#conn.set_debuglevel(1)
conn.putrequest('HEAD', path + args + query)
conn.putheader('User-Agent', userAgent)
conn.endheaders()
Line 51 ⟶ 76:
else:
___location = redirect
wikipedia.output( u'STATUS: HTTP (%s) Moved: %s' % (response_code, ___location) )
else:
___location = None
return ( [___location, response_code, content_length, content_type] )
except httplib.error, arg:
wikipedia.output(u'HTTP Error: %s %s' % (arg, ___location))
return [___location, 7, None, None]
except:
wikipedia.output(u'Error with URL: %s' % ___location)
return ( [___location, 66, None, None] )
 
# Convert the byte count to a human readable value
Line 66 ⟶ 94:
a /= 1024.
exponent += 3
prefix = ['bytes', '[[Kibibyte|KiB]]', '[[Mebibyte|MiB]]', '[[Gibibyte|GiB]]']
 
# Truncate and remove trailing dot
Line 88 ⟶ 115:
 
def update_size_paramter(template_text):
# following chars separate url from title: []"<>\ \n
# | is included since we're in a template
fixed_text = fix_broken_links(template_text)
 
___location = urlpattern.search(fixed_text).group(0)
prefix_text = re.search(r'(\{\{[^|]*\|[^|}]*)[^}].*\}\}', fixed_text).group(1)
if (re.findall(r'=', template_text)):
Line 106 ⟶ 135:
# I should really put in 404 error handling code, but this has been working just fine.
if re.findall(r'pdf|octet-stream', content_type):
return prefix_text + parameter_prefix + binary_notation(content_length) + '<!-- ' + content_type + ', ' + str(content_length) + ' bytes -->}}'
else:
wikipedia.output(u'Unusual content_type: %s, code: %s' % (content_type, response))
Line 116 ⟶ 145:
# Fix Casing (Reduces the number of possible expressions)
wikitext = re.compile(r'\{\{\s*(Template:|template:|)(PDF|Pdf|pdf)', re.IGNORECASE).sub(r'{{PDF', wikitext)
# State point. Count any changes as needing an update if they're after this line
Line 129 ⟶ 158:
# Experimental: Convert with with tag at the end of a bullet list (ex: * [http link] some text ([[PDF]]) )
wikitext = re.compile(r'(\n\*[^\n:/]*)(\[http[^]]*\])([^\n:/]*) *\((\[\[|\{\{|)(Portable Document Format\|PDF|PDFlink).?(file|format|datei|pdf)(\}\}|\]\]|)\)', re.IGNORECASE).sub(r'\1{{PDF|\2}}\3', wikitext)
wikitext = re.sub(r'(\n\*[^\n:/]*)(\[http[^]]*\])([^\n:/]*) *\{\{(PDFlink|PDF)\}\}', r'\1{{\4|\2}}\3', wikitext)
# Remove PDFlink from citation templates
wikitext = re.sub(r'(format *= *)(PDF|pdf|)(\(|)\{\{PDF(|link)[^{}]*\}\}(\)|)', r'\1PDF', wikitext)
wikitext = re.sub(r'(\{\{(Cite|cite)[^}]*)(}}[^a-zA-Z]*)(\(|)\{\{(PDF|PDFlink)\}\}(\)|)', r'\1 |format=PDF\3', wikitext)
wikitext = re.sub(r'(\{\{.ite web[^}]*) *(\(|)\{\{(PDF|PDFlink)\}\}(\)|) *([^}]*\}\})', r'\1 |format=PDF \5', wikitext)
state1 = wikitext
m = re.findall(r'\{\{(PDF|PDFlink)\|[^}]*\}\}', wikitext)
for s in m:
if re.findall(r'http[s]?://', s):
replacetext = update_size_paramter(s)
wikitext = re.sub(re.escape(s), replacetext, wikitext)
Line 148 ⟶ 177:
# Fix equal sign problem
wikitext = re.sub(r'\{\{(PDF|PDFlink)\|(1=|)(.{2}[^{|}]+=[^{|}]+)', r'{{\1|1=\3', wikitext)
# Test to see if file sizes parameter was untouched
if wikitext == state1:
if len(wikitext) - len(state1) <= 4:
# Nothing was done with the embedded file sizes string
# 4 or more bytes removed typically indicate a embed citation removal
EditMsg = 'Corrected use of {{[[Template:PDFlink|PDFlink]]}}'
EditMsg = msg_removed_cite
else:
EditMsg = msg_fixed
else:
if len(wikitext) - len(state1) > 34:
# Minimum of 34 bytes to add file size information
EditMsg = msg_added
EditMsg = 'Added file size for external links tagged with {{[[Template:PDFlink|PDFlink]]}}'
else:
EditMsg = msg_updated
EditMsg = 'Updated file size of transcluded {{[[Template:PDFlink|PDFlink]]}}'
wikipedia.setAction(wikipedia.translate(page.site().language(), EditMsg))
# If the text has changed at all since the state point, upload it
if (wikitext != state0):
Line 176 ⟶ 209:
def main():
site = wikipedia.getSite()
gen = None
for arg in wikipedia.handleArgs():
Line 189 ⟶ 223:
pagelinks = wikipedia.Page(wikipedia.getSite(), arg[7:])
gen = pagegenerators.LinkedPageGenerator(pagelinks)
elif arg.startswith('-page:'):
else:
page = wikipedia.Page(wikipedia.getSite(), unicode(arg[6:]))
gen = iter([page])
return
 
if gen is None:
wikipedia.showHelp(u'pdfbot')
return
wikipedia.output(u'Read delay is %s seconds.' % readDelay)
Line 197 ⟶ 235:
# Only process pages from the main namespace
gen = pagegenerators.NamespaceFilterPageGenerator(gen, [0])
gen = pagegenerators.RedirectFilterPageGenerator(gen)
for page in gen: