User:PDFbot/pdfbot.py: Difference between revisions

Content deleted Content added
Major addition: Interlanguage support
Lots of small fixes, standardization of output messages, fixed bug with "?" in query, regex optimizations, support for SI bases
Line 21:
#
 
import re, sys, httplib, time
import wikipedia, pagegenerators, login, config, catlib
fromimport urllib2httplib, importsocket, urlparse
 
# Define global constants
readDelay = 20 # seconds
writeDelay = 60 # seconds
prefixSI_prefix = ['bytes', '[[KibibyteKilobyte|KiBkB]]', '[[MebibyteMegabyte|MiBMB]]', '[[GibibyteGigabyte|GiBGB]]']
IEC_prefix = ['bytes', '[[Kibibyte|KiB]]', '[[Mebibyte|MiB]]', '[[Gibibyte|GiB]]']
urlpattern = re.compile(r'http[s]?://[^][>< \n|]*', re.IGNORECASE)
# following char sperate url from title: []"<>\ \n
userAgent = 'PDFbot (http://en.wikipedia.org/wiki/User:PDFbot)'
# {|} is included since we're in a template
urlpattern = re.compile(r'http[s]?://[^][>< >\ns"{|}]*', re.IGNORECASE)
httpHeader = {
userAgent = 'User-Agent': 'PDFbot (http://en.wikipedia.org/wiki/User:PDFbot)',
'Accept': 'application/pdf,application/octet-stream,*/*;q=0.5',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Keep-Alive': '30',
'Connection': 'keep-alive',
}
 
# Edit summary messages
Line 50 ⟶ 59:
}
 
def checkLink(___location, redirectCounteruseHEAD = True, counter = 5):
try:
while (redirectCountercounter >= 0 and ___location is not None):
(scheme, site, path, args, query, frag) = urlparse.urlparse(___location)
if query != '':
query = '?' + query
if path == '':
path = '/'
if scheme == "http":
conn = httplib.HTTPConnection(site)
Line 59 ⟶ 72:
conn = httplib.HTTPSConnection(site)
#conn.set_debuglevel(1)
conn.putrequestrequest('HEAD', path + args + query, None, httpHeader)
#conn.putheader('User-Agent', userAgent)
#conn.endheaders()
response = conn.getresponse()
Line 68 ⟶ 81:
content_type = response.msg.getheader('content-type')
response_code = response.status
response_reason= response.reason
conn.close()
redirectCountercounter -= 1
if(redirect is not None):
wikipedia.output( u'STATUS: HTTP %s Moved: %s to %s' % (response_code, ___location, redirect) )
if(redirect[:4] != "http"):
___location = urlparse.urljoin(___location, redirect)
else:
___location = redirect
wikipedia.output( u'STATUS: HTTP %s Moved: %s' % (response_code, ___location) )
else:
___location = None
return [___location, response_code, response_reason, content_length, content_type]
except httplib.error, arg:
wikipedia.output(u'HTTP ErrorERROR: HTTP %s %s' % (arg, ___location))
return [___location, 752, "", None, None]
except socket.error, arg:
wikipedia.output(u'Error with URLERROR: Socket %s %s' % (arg, ___location))
return [___location, 6arg[0], arg[1], None, None]
 
# Convert the byte count to a human readable value
def binary_notation(size, base = 1024., prefix = IEC_prefix):
a = float(size)
exponent = 0
while(a >= 1000.):
a /= 1024.base
exponent += 3
 
Line 101 ⟶ 115:
return byteSigs + '&nbsp;' + prefix[exponent / 3]
 
def fix_broken_links(link):
    """Rewrite known stale URLs to their current ___location.

    Applies a fixed table of host/path substitutions for resources that
    have moved or whose sites issue permanent (301) redirects.

    Parameters:
        link: URL string (may be a larger text containing the URL).
    Returns:
        The text with any known stale prefixes replaced; returned
        unchanged when no rule matches.
    """
    # Moving of resources
    link = link.replace('virginiadot.org/infoservice/resources/', 'virginiadot.org/info/resources/')
    link = link.replace('ncdot.org/transit/aviation/ncairports/locations/pdf/', 'ncdot.org/transit/aviation/download/ncairports/')
    link = link.replace('waitangi-tribunal.govt.nz/doclibrary/researchwhanui/', 'waitangi-tribunal.govt.nz/doclibrary/public/researchwhanui/')
    # 301 Permanent Redirects
    link = link.replace('transportation.ky.gov/planning/', 'www.planning.kytc.ky.gov/')
    link = link.replace('official-documents.co.uk/', 'official-documents.gov.uk/')
    link = link.replace('http://bmj.bmjjournals.com/', 'http://www.bmj.com/')
    link = link.replace('http://bris.ac.uk/', 'http://www.bristol.ac.uk/')
    link = link.replace('http://www.shef.ac.uk/socst/', 'http://www.shef.ac.uk/socstudies/')
    link = link.replace('http://www.sims.berkeley.edu:8000/', 'http://www2.sims.berkeley.edu/')
    link = link.replace('http://www.cs.wm.edu/hpcs/', 'http://www.cse.ohio-state.edu/hpcs/')
    link = link.replace('http://www.pchrgaza.org/', 'http://www.pchrgaza.ps/')
    link = link.replace('http://www.almlondon.org.uk/', 'http://www.mlalondon.org.uk/')
    link = link.replace('http://www.state.ma.us/eot/', 'http://www.eot.state.ma.us/')
    link = link.replace('http://www.aapt.org.au/', 'http://www.ausapt.org.au/')
    link = link.replace('http://berlin.usembassy.gov/', 'http://germany.usembassy.gov/')
    return link
 
def update_size_paramter(template_text):
m = re.search(r'\{\{(?P<tpl>[^|]*)\|(1=)?(?P<text>[^|]*).*(, (?P<size>\d+) bytes .*)??\}\}', fix_broken_links(template_text))
# following char sperate url from title: []"<>\ \n
link_text = m.group('text')
# | is included since we're in a template
___location = urlpattern.search(fixed_textlink_text).group(0)
fixed_text = fix_broken_links(template_text)
 
___location = urlpattern.search(fixed_text).group(0)
prefix_text = re.search(r'(\{\{[^|]*\|[^|]*).*\}\}', fixed_text).group(1)
if(m.group('size') is not None and m.group('size') !=''):
if (re.findall(r'=', template_text)):
old_size = int(m.group('size'))
parameter_prefix = '|2='
else:
parameter_prefixold_size = '|0'
if (link_text.find('=') != -1):
parameter_prefix = '|2='
else:
parameter_prefix = ''
# Parse indirect HTML character references
___location = re.sub(r'&\#61(\d\d);', r'=%\1', ___location)
___location = re.sub(r'&amp;', r'&', ___location)
(redirect, response, reason, content_length, content_type) = checkLink(___location)
if (content_type is not None and content_length is not None and int(content_length) > 168 and int(content_length) != old_size):
# I should really put in 404 error handling code, but this has been working just fine.
if re.findall(r'pdf|octet-stream', content_type):
return u'{{%s|%s|%s%s<!-- %s, %s bytes -->}}' % (m.group('tpl'), link_text, parameter_prefix, binary_notation(content_length), content_type, content_length )
return prefix_text + parameter_prefix + binary_notation(content_length) + '<!-- ' + content_type + ', ' + str(content_length) + ' bytes -->}}'
else:
wikipedia.output(u'UnusualFIXME: Bad content_typeresponse: content_type=%s, code:=%s, ___location=%s' % (content_type, response, ___location))
return template_text
# If anything else return template_text back
Line 151 ⟶ 179:
# Convert hard coded pdf links (ex: [http link] (pdf) )
wikitext = re.sub(r'(\[http\w*://[^][]*\]) *\((\[\[[^|\]]*|)?\.?(PDF|pdf) *([Ff]ile)? *([Ff]ormat)?(\]\]|)?\)', r'{{PDFlink|\1}}', wikitext)
# Convert from the old style to the new style (ex: [http link] {{PDF}} or {{PDF}} [http link] )
wikitext = re.sub(r'[(\(|)]?\{\{(PDFlink|PDF)\}\}(\)|[)]? *(\[http://[^][]*\])', r'{{\21|\42}}', wikitext)
wikitext = re.sub(r'(("|)?\[http[^]]*\]("|)?)([^a-zA-ZZ0-9()]*) *[(\(|)]?\{\{ *(PDFlink|PDF) *\}\}(\)|[)]?', r'{{\63|\1}}\42', wikitext)
# Experimental: Convert with with tag at the end of a bullet list (ex: * [http link] some text ([[PDF]]) )
wikitext = re.compile(r'(\n *\*+[^\n:/]*)(\[http://[^][]*\])([^\n:/]*) *\[(](\[\[|\{\{|)?(Portable Document Format\[|]PDF|PDFlinkpdflink).?(pdf.?)?(file|format|datei|)?(\}\}|\]\]|)\?[)]', re.IGNORECASE).sub(r'\1{{PDFPDFlink|\2}}\3', wikitext)
wikitext = re.sub(r'(\n *\*+[^\n:/]*)(\[http://[^][]*\])([^\n:/]*) *\{\{(PDFlink|PDF)\}\}', r'\1{{\4|\2}}\3', wikitext)
# Remove PDFlink from citation templates
# {{cite |format={{PDF}}}}
wikitext = re.sub(r'(format *= *)(PDF|pdf|)(\(|)\{\{PDF[^{}]*\}\}(\)|)', r'\1PDF', wikitext)
# {{cite.*?}}{{PDF}}
wikitext = re.sub(r'(\{\{(Cite|cite)[^}]*)(}}[^a-zA-Z]*)(\(|)\{\{(PDF|PDFlink)\}\}(\)|)', r'\1 |format=PDF\3', wikitext)
# {{cite | lang= EN {{PDF}} }}
wikitext = re.sub(r'(\{\{.ite web[^}]*) *(\(|)\{\{(PDF|PDFlink)\}\}(\)|) *([^}]*\}\})', r'\1 |format=PDF \5', wikitext)
state1 = wikitext
m = re.findall(r'\{\{PDF[link]{0,4}\|[^{}]*?\}\}', wikitext)
for s in m:
Line 174 ⟶ 205:
wikitext = re.sub(re.escape(s), replacetext, wikitext)
# Uncomment the bellow line to see the replacement text
# wikipedia.output(u'OUTPUT: %s' % replacetext)
# Fix equal sign problem
wikitext = re.sub(r'\{\{(PDF|PDFlink)\|(1=|)(.{2}[^{|}]+=[^{|}]+)', r'{{\1|1=\3', wikitext)
Line 181 ⟶ 212:
# Test to see if file sizes parameter was untouched
if wikitext == state1:
if len(wikitext) - len(state1state0) <= 4:
# 4 or more bytes removed typically indicate a embed citation removal
EditMsg = msg_removed_cite
Line 193 ⟶ 224:
EditMsg = msg_updated
wikipedia.setAction(wikipedia.translate(page.site().language(), EditMsg))
# altert me if the page contains {{pdflink|no-link}}
if re.findall(r'\{\{PDF(link|)\|[^:]*\}\}', wikitext):
wikipedia.output(u'FIXME: No link in {{PDFlink}}')
# If the text has changed at all since the state point, upload it
if (wikitext != state0):
try:
wikipedia.output(u'PageWRITE: Delta changelength byof %s bytes. Writing new version.' % str(len(wikitext)-len(state0)))
page.put(wikitext)
except:
wikipedia.output(u'-------ERROR: Except Writeraised errorwhile ------writing.')
# Pause to reduce load on the servers
time.sleep(writeDelay)
Line 210 ⟶ 245:
site = wikipedia.getSite()
gen = None
namespaces = [0]
for arg in wikipedia.handleArgs():
Line 226 ⟶ 262:
page = wikipedia.Page(wikipedia.getSite(), unicode(arg[6:]))
gen = iter([page])
elif arg.startswith('-namespace:'):
namespaces.append(int(arg[11:]))
 
if gen is None:
Line 233 ⟶ 271:
wikipedia.output(u'Read delay is %s seconds.' % readDelay)
wikipedia.output(u'Write delay is %s seconds.\n' % writeDelay)
if namespaces != []:
# Only process pages from the main namespace
gen = pagegenerators.NamespaceFilterPageGenerator(gen, [0]namespaces)
gen = pagegenerators.RedirectFilterPageGenerator(gen)
for page in gen:
process_article(page)
wikipedia.output(u'\nOperation Complete.\n')