User:PDFbot/pdfbot.py: Difference between revisions

Content deleted Content added
Major addition: Interlanguage support
m top: Replaced deprecated <source> tags with <syntaxhighlight>
 
(2 intermediate revisions by one other user not shown)
Line 1:
<syntaxhighlight lang="python">
<pre><nowiki>
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Line 21:
#
 
import re, sys, httplib, time
import wikipedia, pagegenerators, login, config, catlib
fromimport urllib2httplib, importsocket, urlparse
import codecs
try:
import commonfixes
except ImportError:
wikipedia.output('Unable to import commonfixes')
commonfixes = None
try:
import reflinks
def my_reflink_put_page(self, page, new):
self.page = page
self.new_text = new
reflinks.ReferencesRobot.put_page=my_reflink_put_page
except ImportError:
wikipedia.output('Unable to import reflinks')
reflinks = None
 
# Download this file :
# http://www.twoevils.org/files/wikipedia/404-links.txt.gz
# ( maintained by User:Marumari )
listof404pages = '404-links.txt'

# Define global constants
readDelay = 10 # seconds
writeDelay = 30 # seconds
# Unit labels, smallest first; binary_notation() indexes into one of these
mix_prefix = ('bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB')
SI_prefix = ('bytes', '[[Kilobyte|kB]]', '[[Megabyte|MB]]', '[[Gigabyte|GB]]')
IEC_prefix = ('bytes', '[[Kibibyte|KiB]]', '[[Mebibyte|MiB]]', '[[Gibibyte|GiB]]')
# The following characters separate the URL from the title: []"<>\ \n
# {|} is included since we're in a template
urlpattern = re.compile(r'http[s]?://[^][<>\s"{|}]*', re.IGNORECASE)
# Request headers passed to conn.request() by checkLink()
httpHeader = {
    'User-Agent': 'PDFbot (http://en.wikipedia.org/wiki/User:PDFbot)',
    'Accept': 'application/pdf,application/octet-stream,*/*;q=0.5',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Keep-Alive': '30',
    'Connection': 'keep-alive',
}

# NOTE(review): the names below were interleaved with the constants above and
# look like leftovers of an earlier revision (readDelay = 20, writeDelay = 60
# and a narrower urlpattern were also present but were overwritten by the
# later assignments kept above). They are retained because lines elsewhere in
# the file still refer to msg_added, msg_updated, msg_fixed, msg_removed_cite
# and userAgent -- confirm against the revision history before removing.
prefix = ['bytes', '[[Kibibyte|KiB]]', '[[Mebibyte|MiB]]', '[[Gibibyte|GiB]]']
userAgent = 'PDFbot (http://en.wikipedia.org/wiki/User:PDFbot)'

# Edit summary messages
msg_added = {
    'de': u'BOT: hinzufuegen der Dateigroesse markiert als {{PDFlink}}',
    'en': u'Added file size for external links tagged with {{[[Template:PDFlink|PDFlink]]}}',
}
msg_updated = {
    'de': u'BOT: Aktualisieren der Dateigroesse mit Vorlageeinbindung',
    'en': u'Updated file size of transcluded {{[[Template:PDFlink|PDFlink]]}}',
}
msg_fixed = {
    'de': u'BOT: Korrigierte Benutzung der Vorlage {{PDFlink}}',
    'en': u'Corrected usage of {{[[Template:PDFlink|PDFlink]]}}',
}
msg_removed_cite = {
    'de': u'BOT: Entfernen {{PDFlink}} entsprechend der zitierten Vorlage',
    'en': u'Remove {{PDFlink}} from citation template.',
}
 
def checkLink(___location, redirectCounteruseHEAD = True, counter = 5):
try:
while (redirectCountercounter >= 0 and ___location is not None):
(scheme, site, path, args, query, frag) = urlparse.urlparseurlsplit(___location)
query = query and '?' + query or ''
path = path or '/'
if scheme == "http":
conn = httplib.HTTPConnection(site)
elif scheme == "https":
conn = httplib.HTTPSConnection(site)
else:
#conn.set_debuglevel(1)
return (___location, -1, 'Unsupported Protocol', None, None)
conn.putrequest('HEAD', path + args + query)
conn.putheaderset_debuglevel('User-Agent', userAgent0)
socket.setdefaulttimeout(300)
conn.endheaders()
try:
request = path.encode('ascii') + query.encode('ascii')
except UnicodeEncodeError:
encoding = 'utf-8'
noencode = '~!^*()_-=&/|,.?;'
request = unicode(urllib.quote(path.encode(encoding) + query.encode(encoding), noencode))
if useHEAD:
conn.request('HEAD', request, None, httpHeader)
else:
conn.request('GET', request, None, httpHeader)
response = conn.getresponse()
Line 67 ⟶ 93:
content_length = response.msg.getheader('content-length')
content_type = response.msg.getheader('content-type')
response_code = response.status
conn.close()
redirectCountercounter -= 1
if( redirect is not None):
wikipedia.output( u'STATUS: HTTP %s Moved: %s to %s' % (response.status, ___location, redirect) )
if(redirect[:4] != "http"):
if redirect.startswith("http"):
___location = urlparse.urljoin(___location, redirect)
else:
___location = redirect
wikipedia.output( u'STATUS: HTTP %s Moved: %s' % (response_code, ___location) )
else:
___location = None
return [(___location, response_coderesponse.status, response.reason, content_length, content_type])
except httplib.error, arg:
wikipedia.output(u'HTTP ErrorERROR: HTTP %s %s' % (arg, ___location))
return [(___location, 752, "", None, None])
except socket.timeout:
return (___location, 110, 'Connection timeout', None, None)
wikipedia.output(u'Error with URL: %s' % ___location)
except socket.error, arg:
return [___location, 6, None, None]
wikipedia.output(u'ERROR: Socket %s %s' % (arg, ___location))
 
return (___location, arg[0], arg[1], None, None)
# Convert the byte count to a human readable value
except KeyboardInterrupt:
def binary_notation(size):
raise
except Exception, e: # catches those weird ones
print u'Exception raised: %s' % e
return (___location, 0, "Exception %s" % e, None, None)
def binary_notation(size, base = 1024., prefix = IEC_prefix):
    """
    Convert the byte count to a human readable value

    size   -- the byte count; anything float() accepts
    base   -- divisor applied for every step up to the next unit label
    prefix -- sequence of unit labels, smallest unit first

    Returns the magnitude and the unit label joined by '&nbsp;'. The
    magnitude is cut to at most four characters while it is below 1000.
    """
    magnitude = float(size)
    step = 0
    last = len(prefix) - 1
    # Move to the next label once the value reaches 1000, but stop at the
    # last label instead of indexing past the end of prefix.
    while magnitude >= 1000. and step < last:
        magnitude /= base
        step += 1

    if magnitude >= 1000.:
        # Out of labels: cutting to four characters would drop integer
        # digits, so show the whole integer part instead.
        byteSigs = str(int(magnitude))
    else:
        # Truncate and remove trailing dot
        byteSigs = str(magnitude)[:4]
        if byteSigs.endswith('.'):
            byteSigs = byteSigs[:3]
    return byteSigs + '&nbsp;' + prefix[step]
 
def fix_broken_links(link):
    """
    Returns link replacement for known broken links

    link -- text containing the URL(s) to check; callers in this file pass
            either a bare URL or a whole template call

    Each known old ___location is swapped for its new one by plain substring
    replacement, applied in the order listed. Text that matches none of the
    entries is returned unchanged.
    """
    relocations = (
        # Moving of resources
        ('virginiadot.org/infoservice/resources/', 'virginiadot.org/info/resources/'),
        ('virginiadot.org/comtravel/', 'virginiadot.org/info/'),
        ('ncdot.org/transit/aviation/ncairports/locations/pdf/', 'ncdot.org/transit/aviation/download/ncairports/'),
        ('waitangi-tribunal.govt.nz/doclibrary/researchwhanui/', 'waitangi-tribunal.govt.nz/doclibrary/public/researchwhanui/'),
        # 301 Permanent Redirects
        ('transportation.ky.gov/planning/', 'www.planning.kytc.ky.gov/'),
        ('official-documents.co.uk/', 'official-documents.gov.uk/'),
        ('http://bmj.bmjjournals.com/', 'http://www.bmj.com/'),
        ('http://bris.ac.uk/', 'http://www.bristol.ac.uk/'),
        ('http://www.shef.ac.uk/socst/', 'http://www.shef.ac.uk/socstudies/'),
        ('http://www.sims.berkeley.edu:8000/', 'http://www2.sims.berkeley.edu/'),
        ('http://www.cs.wm.edu/hpcs/', 'http://www.cse.ohio-state.edu/hpcs/'),
        ('http://www.pchrgaza.org/', 'http://www.pchrgaza.ps/'),
        ('http://www.almlondon.org.uk/', 'http://www.mlalondon.org.uk/'),
        ('http://www.state.ma.us/eot/', 'http://www.eot.state.ma.us/'),
        ('http://www.aapt.org.au/', 'http://www.ausapt.org.au/'),
        ('http://berlin.usembassy.gov/', 'http://germany.usembassy.gov/'),
    )
    for old_location, new_location in relocations:
        link = link.replace(old_location, new_location)
    return link
 
def update_size_paramter(template_text):
    """
    Refresh the file size shown by one {{PDFlink}} / {{PDF}} template call.

    template_text -- the complete template call, from '{{' to '}}'

    Returns a (changed, text) tuple. text is the template call to put on
    the page. changed is true only when a new size was fetched that
    differs from the recorded one and either a size was recorded before or
    the template held fewer than two '|'. When no usable response is
    obtained, changed is False and text is the template rebuilt around the
    previously recorded size, or template_text itself if none was recorded.

    Relies on fix_broken_links, urlpattern, checkLink, binary_notation and
    mix_prefix from this module and on wikipedia.html2unicode.
    """
    m = re.search(r'(?s)\{\{(?P<tpl>[^|]*)\|(1=)?(?P<text>[^|]*).*?(, (?P<size>[0-9]+) byte.*)?\}\}', fix_broken_links(template_text))
    if m is None:
        # Not shaped like {{name|...}}: nothing to update
        return (False, template_text)
    link_text = m.group('text')
    url_match = urlpattern.search(link_text)
    if url_match is None:
        # The first parameter holds no http(s) URL: leave the template alone
        return (False, template_text)
    ___location = url_match.group(0)

    old_size = int(m.group('size') or 0)
    # NOTE(review): presumably an '=' in the link would make MediaWiki read
    # the parameter as a named one, hence the explicit '2=' -- confirm.
    parameter_prefix = ''
    if '=' in link_text:
        parameter_prefix = '2='
    # Convert indirect HTML character references
    ___location = wikipedia.html2unicode(___location)
    (redirect, response, reason, content_length, media_type) = checkLink(___location)
    try:
        content_length = int(content_length)
    except (TypeError, ValueError):
        # Header missing (None) or not a number
        content_length = None
    if media_type and content_length and content_length != old_size:
        # I should really put in 404 error handling code, but this has been working just fine.
        if 'pdf' in media_type or 'octet-stream' in media_type or 'application/download' in media_type:
            # The old format appended a '<!-- type, N bytes -->' comment;
            # it was dropped as generally non-useful information.
            return (old_size != 0 or template_text.count('|') < 2, u'{{%s|%s|%s%s}}' % (m.group('tpl'), link_text, parameter_prefix, binary_notation(content_length, prefix = mix_prefix)))
        else:
            wikipedia.output(u'FIXME: Bad response: code: %d, type: %s, ___location: %s' % (response, media_type, ___location))
    # If anything else return template_text back
    if old_size:
        return (False, u'{{%s|%s|%s%s}}' % (m.group('tpl'), link_text, parameter_prefix, binary_notation(old_size, prefix = mix_prefix)))
    else:
        return (False, template_text)
 
# process_article(page): reads the wikitext of `page`, rewrites {{PDF}} /
# {{PDFlink}} usage with the regular expressions below, and calls page.put()
# with the result.
#
# NOTE(review): this span is NOT runnable as it stands. Lines from two
# revisions are interleaved and indentation is lost. Visible symptoms:
#   * fused tokens such as `re.compilesub(`, `re.subcompile(`,
#     `else#wikitext: -`, `EditMsg = msg_updated"General fixes for PDFs"`;
#   * two replacement loops, `for s in m:` calling update_size_paramter(s) for
#     a single string and `for m in re.finditer(...)` unpacking a
#     (changed, replacetext) tuple from the same function;
#   * edit summaries taken both from the msg_* tables and from English literals.
# Several regex literals are fused character by character, so they must be
# restored from the revision history rather than guessed. The code lines
# below are deliberately left untouched.
def process_article(page):
# Load the list of known-dead links; the IOError is re-raised after telling
# the operator where to get the file.
try:
deadLinks = codecs.open(listof404pages, 'r', 'latin_1').read()
except IOError:
wikipedia.output('You need to download http://www.twoevils.org/files/wikipedia/404-links.txt.gz and to ungzip it in the same directory')
raise
wikipedia.output('Getting page %s' % page.aslink())
wikitext = page.get()
# Fix Casing (Reduces the number of possible expressions)
wikitext = re.compilesub(r'(?i)\{\{\s*(template:|)pdf', re.IGNORECASE).sub(r'{{PDF', wikitext)
wikitext = wikitext.replace('{{PDFLink', '{{PDFlink')
# State point. Count any changes as needing an update if they're after this line
state0 = wikitext
# [http {{PDF}}]
wikitext = re.sub(r'(\[\w+://[^][<>"\s]+\s[^][\n]+?)\s*(\{\{(PDFlink|PDF)\}\})', r'\2\1', wikitext)
# Convert hard coded pdf links (ex: [http link] (pdf) )
wikitext = re.sub(r'(\[http\w+://[^][]*\]) *\((\[\[[^|\]]*|)?\.?(PDF|pdf) *([Ff]ile)? *([Ff]ormat)?(\]\]|)?\)', r'{{PDFlink|\1}}', wikitext)
# Convert from the old style to the new style (ex: [http link] {{PDF}} or {{PDF}} [http link] )
wikitext = re.sub(r'"[(\(|)]?\{\{(PDFlink|PDF)\}\}(\)|[)]? *((?P<quote>'*)\[http\w+://[^][]*\](?P=quote)'?)", r'{{\21|\42}}', wikitext)
wikitext = re.sub(r'(("|)?\[http\w+://[^]]*\]("|)?)([^a-zA-ZZ0-9()]*) *[(\(|)]?\{\{(PDFlink|PDF) *\}\}(\)|[)]?', r'{{\63|\1}}\42', wikitext)
# Experimental: Convert with tag at the end of a bullet list (ex: * [http link] some text ([[PDF]]) )
if '{{PDF' in wikitext:
wikitext = re.compile(r'(\n\*[^\n:/]*)(\[http[^]]*\])([^\n:/]*) *\((\[\[|\{\{|)(Portable Document Format\|PDF|PDFlink).?(file|format|datei|)(\}\}|\]\]|)\)', re.IGNORECASE).sub(r'\1{{PDF|\2}}\3', wikitext)
wikitext = re.subcompile(r'(\n *\*+[^\n:/]*)(\[http\w+://[^][]*\])([^\n:/]*) *[(](\[\[|\{\{)?(PDFlinkPortable Document Format[|]PDF|pdflink).?(pdf.?)?(file|format|datei)?(\}\}|\]\])?[)]', re.IGNORECASE).sub(r'\1{{\4PDFlink|\2}}\3', wikitext)
wikitext = re.sub(r'(\n *\*+[^\n:/]*)(\[\w+://[^][]*\])([^\n:/]*) *\{\{(PDFlink|PDF)\}\}', r'\1{{\4|\2}}\3', wikitext)

# Experimental: move {{PDF}} back in <ref> tag
wikitext = re.sub(r'(<ref[^][{}<>]*>[^][<>=]*?)("?\[\w+://[^][<>\s"]+[^]\n]*\]"?)([^{}<>]*)\{\{(PDFlink|PDF)\}\}', r'\1{{\4|\2}}\3', wikitext)
# State point. Correction of {{PDFlink}} template
genfixState = wikitext
# Remove PDFlink from citation templates
# {{cite |format={{PDF}}}}
wikitext = re.sub(r'(?s)(format *= *)(PDF|pdf|)?[(\(|)]?\{\{PDF[^{}]*?\}\}(\)|[)]?', r'\1PDF', wikitext)
# {{cite.*?}}{{PDF}}
wikitext = re.sub(r'(?s)(\{\{(Cite|cite) *[Cc]ite[^}]*)(\}\}[^a-zA-Z\w() ]*) *[(\(|)]?\{\{(PDF|PDFlink)\}\}(\)|[)]?', r'\1 |format=PDF\32', wikitext)
# {{cite | lang= EN {{PDF}} }}
wikitext = re.sub(r'(?s)(\{\{. *[Cc]ite web[^}]*) *(\(|)\{\{(PDF|PDFlink)\}\}(\)|) *([^}]*\}\})', r'\1 |format=PDF \5', wikitext)
# {{PDF| {{template...}} }}
wikitext = re.sub(r'(?s)\{\{(PDFlink|PDF)\|\s*(\{\{[^{}]+?(\|[^{|}]+)?\}\})[\s|]*\}\}', r'\2', wikitext)
state1 = wikitext
# {{citation|url={{PDFlink|...}} }}
m = re.findall(r'\{\{PDF[link]{0,4}\|[^}]*\}\}', wikitext)
wikitext = re.sub(r'(?i)\{\{(([Cc]itation|[Cc]ite)[^{}]+?)\{\{(PDFlink|PDF)\|([^{}]*?)(\|[^{|}]+)?\}\}', r'{{\1\4', wikitext)
# State point. Removal of {{PDFlink}} in certain instances
for s in m:
state2 = wikitext
if re.findall(r'http[s]?://', s):
cleantext = wikitext
replacetext = update_size_paramter(s)
# This is ugly, since we need the comments to check the relative filesize
wikitext = re.sub(re.escape(s), replacetext, wikitext)
for m in re.finditer(r'<!--.*?-->|<nowiki[^>]*>.*?</nowiki>', cleantext):
if '{{PDF' in m.group():
cleantext = cleantext.replace(m.group(), '')
sizechange = 0
for m in re.finditer(r'\{\{(?:PDFlink|PDF)\|[^{}]+?\}\}', cleantext):
if 'http://' in m.group() or 'https://' in m.group():
(changed, replacetext) = update_size_paramter(m.group())
sizechange += changed and 1 or 0
# print "update page? %s"%(sizechange, )
wikitext = wikitext.replace(m.group(), replacetext)
# Uncomment the line below to see the replacement text
# wikipedia.output(u'OUTPUT: %s' % replacetext)
# Fix equal sign problem
wikitext = re.sub(r'\{\{(PDF|PDFlink)\|(1=|)(.{2}[^{|}]+=[^{|}]+)', r'{{\1|1=\3', wikitext)
for s in re.findall(ur'(?ui)\{\{(?:cite[\w\s]+)\|[^{}]+?\}\}', cleantext):
# Test to see if file sizes parameter was untouched
murl = re.search('\|\s*url\s*=\s*(?P<url>http[s]?://[^][<>"\s|]+)(\||}})', s)
if wikitext == state1:
if murl and 'PDF' in murl.group().upper() and (not re.search(ur'\|\s*format\s*=\s*[^\s{|}]+', s) or not re.search(ur'\|\s*(access\w+)\s*=\s*([^{|}]+?)\s*(?=[{|}])', s)) and not re.search(ur'\|\s*archiveurl\s*=\s*[^\s{|}]+', s):
if len(wikitext) - len(state1) <= 4:
repl_url = fix_broken_links(murl.group('url'))
# 4 or more bytes removed typically indicate an embedded citation removal
(redirect, response, reason, content_length, media_type) = checkLink(repl_url)
EditMsg = msg_removed_cite
# media_type not given
if not media_type:
continue
# Gone/Not Found error code
elif (response == 410 or (response == 404 and (u'\t%s\t' % murl.group(1) in deadLinks))) and repl_url == murl.group('url'):
wikitext = wikitext.replace(s, s + time.strftime("{{dead link|bot=PDFbot|date=%B %Y}}"))
# valid PDF
# python2.6code: any(item in media_type.lower() for item in ('pdf', 'octet-stream'))
elif 'pdf' in media_type.lower() or 'octet-stream' in media_type.lower():
replacetext = s
replacetext = replacetext.replace(murl.group(), murl.group().replace(murl.group('url'), repl_url))
if re.search(ur'\|\s*format\s*=\s*[^{|}]*[|}]', replacetext):
# fill in the format=
replacetext = re.sub(r'(\|\s*format\s*= ??)(\n* *[{|}])', r'\1PDF\2', replacetext)
else:
# add format=PDF (third last parameter)
replacetext = re.sub(r'(\{\{[^{}]+?)((\s*\|\s*)[^[=\]{|}]+(\s*= *)[^{|}]+)(\s*\|[^{|}]+)\}\}', r'\1\3format\4PDF\2\5}}', replacetext)

accessed = re.search(ur'\|\s*(access\w+)\s*=\s*[^{|}\s]+', replacetext)
# no access-anything filled in, add/fill accessdate
if not accessed:
# fill out accessdate if it exists
replacetext = re.sub(r'(\|\s*accessdate\s*= ??)(?=\n* *[{|}])', time.strftime(r'\g<1>%Y-%m-%d'), replacetext)
# if template doesn't contain accessdate then add it (last parameter)
if not re.search(r'\|\s*accessdate\s*=', replacetext):
replacetext = re.sub(r'(\{\{[^{}]+?)((\s*\|\s*)[^[=\]{|}]+?(\s*= *)[^{|}]+?)(\s*)\}\}', time.strftime(r'\1\2\3accessdate\g<4>%Y-%m-%d\5}}'), replacetext)
#replacetext = re.sub(r'(\{\{[^{}]+?)((\s*\|\s*)[^[=\]{|}]+(\s*= *)[^{|}]+)(\s*\|[^{|}]+)\}\}', time.strftime(r'\1\2\5\3accessdate\g<4>%Y-%m-%d}}'), replacetext)

# put back in
wikitext = wikitext.replace(s, replacetext)
sizechange += 1
# Uncomment the line below to see the replacement text
wikipedia.output(u'OUTPUT: %s' % replacetext)

# remove duplicate {{dead link}}
dead_templates = r'[Dd]ead[ _]*link|[Dd]l|[Dd]l-s|404|[Bb]roken[ _]+link|[Cc]leanup-link'
wikitext = re.sub('(\{\{(?:%s)[^}]*?\}\})+((</ref>)?\{\{(?:%s)[^}]*?\}\})'%(dead_templates, dead_templates), r'\2', wikitext)

# Figure out an edit message of what we did
# NOTE(review): the branches below mix msg_* table lookups with literal
# English summaries, and two lines are fused with comment text.
if sizechange:
if state2 != state0:
EditMsg = "Updating %d PDF%s and fixes" % (sizechange, sizechange>1 and 's' or '')
else:
EditMsg = "Updating %d PDF%s" % (sizechange, sizechange>1 and 's' or '')
EditMsg = msg_fixed
else:
# state0: renamed templates
if len(wikitext) - len(state1) > 34:
# genfix: fixPDFlink
# Minimum of 34 bytes to add file size information
# state2: removePDFlink
EditMsg = msg_added
else#wikitext: -
EditMsg = msg_updated"General fixes for PDFs"
if wikitext == state0:
wikipedia.setAction(wikipedia.translate(page.site().language(), EditMsg))
pass # text stayed the same
elif wikitext == genfixState:
EditMsg = "Correct {{PDFlink}} syntax"
elif wikitext == state2:
if genfixState == state0: # no fixes
EditMsg = "Remove incorrect {{PDFlink}}"
else: #fixes+removal
pass
wikipedia.setAction(EditMsg)
updateSizes = wikitext

# Fix equal sign problem
# moved here to avoid changing edit message
wikitext = re.sub(r'\{\{(PDF|PDFlink)\|(1=|)(.{2}[^{|}]+=[^{|}]+)', r'{{\1|1=\3', wikitext)
# alert me if the page contains a {{PDFlink}} whose parameters hold no ':' (i.e. no link)
if re.search(r'\{\{PDF(link|)\|[^:]+\}\}', wikitext):
wikipedia.output(u'FIXME: No link in {{PDFlink}} on %s' % page.aslink())
# If the text has changed at all since the state point, upload it
if (wikitext != state0 and sizechange) or state2 != state0 or updateSizes != wikitext:
wikipedia.output('PDFs updated: % 3d' % sizechange)

# [[pdf]] -> [[PDF]]
wikitext = re.sub(r'\[\[pdf(?=[|\]])', '[[PDF', wikitext)

# {{cite | format = pdf }}
wikitext = re.sub(r'(?s)(?:([|]\s*format\s*=\s*)(?:\[\[|)[Pp][Dd][Ff](?:\]\]|))+(\s*[{|}])', r'\1PDF\2', wikitext)
# Too many to just fix when we come across, so we don't count it with the fixes
# Unlink PDF in format parameters
wikitext = re.sub(r'(?i)(\|\s*format\s*=\s*)\[\[(adobe|portable|document|file|format|pdf|\.|\s|\(|\)|\|)+\]\]', r'\1PDF', wikitext)
wikitext = re.sub(r'(?i)(\|\s*format\s*=\s*)(\s*\.?(adobe|portable|document|file|format|pdf|\(|\)))+?(?=\s*[|}])', r'\1PDF', wikitext)
# Apply common fixes if available
if commonfixes:
wikitext = commonfixes.fix(page, text=wikitext)

# Apply reflinks if available
if reflinks:
# Hackish hook: inject the edited text so page.get() hands it to reflinks
page._contents = wikitext
if page.get() != wikitext:
wikipedia.output("Injected text wasn't returned with page.get()")
elif reflinks.linksInRef.search(wikitext):
reflinksbot = reflinks.ReferencesRobot(iter([page]))
reflinksbot.run()
if hasattr(reflinksbot, 'new_text'):
if reflinksbot.page != page:raise 'pages not the same'
wikitext = reflinksbot.new_text
# Reset edit summary
wikipedia.setAction(EditMsg)

try:
wikipedia.output(u'PageWRITE: Delta changelength byof %s 3d bytes. Writing new version.' % str(len(wikitext)-len(state0)))
page.put(wikitext)
except Exception, e:
wikipedia.output(u'-------ERROR: Except Write%s errorraised ------while writing.' % e)
# Pause to reduce load on the servers
time.sleep(writeDelay)
else:
wikipedia.put_throttle()
time.sleep(readDelay)
pass
def main():
site = wikipedia.getSite()
gen = None
namespaces = [0]
for arg in wikipedia.handleArgs():
Line 226 ⟶ 408:
page = wikipedia.Page(wikipedia.getSite(), unicode(arg[6:]))
gen = iter([page])
elif arg.startswith('-ns:'):
namespaces.append(int(arg[11:]))
elif arg.startswith('-delay:'):
global readDelay, writeDelay
readDelay = int(arg[7:])
writeDelay = int(arg[7:])
 
if not gen is None:
wikipedia.showHelp(u'pdfbot')
return
 
wikipedia.output(u'Delays are %s s for read and %s for writes' % (readDelay, writeDelay,) )
if namespaces != []:
wikipedia.output(u'Read delay is %s seconds.' % readDelay)
gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
wikipedia.output(u'Write delay is %s seconds.\n' % writeDelay)
# Only process pages from the main namespace
gen = pagegenerators.NamespaceFilterPageGenerator(gen, [0])
gen = pagegenerators.RedirectFilterPageGenerator(gen)
for page in gen:
if page.site().messages:
wikipedia.output(u'Messages left on talk page, halting.')
return
process_article(page)
wikipedia.output(u'\nOperationFinished Complete.\nupdating')
 
if __name__ == "__main__":
Line 248 ⟶ 438:
finally:
wikipedia.stopme()
</syntaxhighlight>
</nowiki></pre>