User:PDFbot/pdfbot.py: Difference between revisions

Content deleted Content added
Posting source. Please note there are bugs related to & in the display.
 
Many fixes, more commenting
Line 9:
Command line options:
 
-file: Update all article pages listed in a text file.
-ref: Update all article pages transcluding from a given page.
-cat: Update article pages from the given category.
 
"""
 
import re, sys, httplib, time
import wikipedia, pagegenerators, login, config, codecscatlib
from urllib2 import urlparse
 
# Module-wide tuning constants.
writeDelay = 60 # seconds to pause after writing a page (throttles server load)
readDelay = 15 # seconds to pause between successive page reads
httpDebug = 0 # debug level handed to httplib's set_debuglevel(); 0 = silent
userAgent = 'PDFbot (http://en.wikipedia.org/wiki/User:PDFbot)' # HTTP User-Agent sent with HEAD requests
 
# NOTE(review): this block is a pasted MediaWiki revision diff, not runnable
# Python — indentation was stripped, old and new revision tokens are fused on
# some lines, and the "Line 34 ⟶ 41:" style lines below are diff elision
# markers (the code between them is missing from this view).
def whichURL(___location):
# Follow HTTP redirects for `___location` by issuing HEAD requests, up to a
# fixed cap, and return metadata about the final target as
# [site, path, content_length, content_type].
redirectCounter = 86 # maximum number of redirects to chase before giving up
try:
while (redirectCounter > 0 and ___location is not None ):
# Split the URL so the HEAD request can be issued against site + path.
(scheme, site, path, args, query, frag) = urlparse.urlparse(___location)
path = path + args + query
conn = httplib.HTTPConnection(site)
# "0httpDebug" is a diff-merge artifact: old literal 0 fused with the new
# httpDebug global.
conn.set_debuglevel(0httpDebug)
conn.putrequest('HEAD', path)
# diff-merge artifact: the hard-coded UA string was replaced by the
# userAgent global in the new revision.
conn.putheader('User-Agent', 'PDFbot (http://en.wikipedia.org/wiki/User:PDFbot)'userAgent)
conn.endheaders()
Line 34 ⟶ 41:
# (elided lines: presumably the response is read here and `___location` is
# refreshed from the Location header — missing from this view, TODO confirm)
redirectCounter -= 1
if(redirectCounter > 0 and ___location is not None ):
conn.close()
wikipedia.output( u'Redirecting to %s' % ___location )
Line 40 ⟶ 47:
# Metadata of the final (non-redirect) response; `response` is bound in the
# elided code above.
content_length = response.msg.getheader('content-length')
content_type = response.msg.getheader('content-type')
response_code = response.status
conn.close()
return ( [site, path, content_length, content_type] )
Line 50 ⟶ 58:
# NOTE(review): fragment of the byte-size pretty-printer (binary_notation,
# judging from the call in update_size_paramter). Its def line and return
# statement are elided by the surrounding wiki-diff markers, so the full
# contract cannot be stated from here.
a = float(size)
exponent = 0
# Scale down by 1024 per step; exponent grows by 3 per step, presumably to
# index `prefix` below (0 -> bytes, 3 -> KiB, 6 -> MiB, 9 -> GiB) — TODO confirm.
while a >= 1000. :
a /= 1024.
exponent += 3
prefix = ['bytes', '[[Kibibyte|KiB]]', '[[Mebibyte|MiB]]', '[[Gibibyte|GiB]]']
 
# Truncate and remove trailing dot
Line 62 ⟶ 70:
 
def update_size_paramter(template_text):
# Build an updated {{PDFlink}} template string: extract the external link
# from `template_text`, HEAD it via whichURL(), and append a human-readable
# file size plus an HTML comment with the raw metadata.  Returns
# `template_text` unchanged when the size/type cannot be confirmed.
# NOTE(review): merged wiki-diff text; the "Line 74 ⟶ 82:" line below is an
# elision marker — `parameter_prefix` is defined in the missing code.
___location = re.search(r'\[(http[^] |}]*) ', template_text ).group(1)
prefix_text = re.search(r'(\{\{[^|]*\|[^|}]*)[^}]*\}\}', template_text ).group(1)
if (re.findall(r'=', template_text)):
Line 74 ⟶ 82:
# NOTE(review): the substitution below looks like a no-op; the page header
# warns of "bugs related to & in the display" — the first pattern was
# presumably '&amp;' before the wiki unescaped it.  TODO confirm.
___location = re.sub(r'&', r'&', ___location)
(site, path, content_length, content_type ) = whichURL(___location)
if (___location.lower()[:4] == 'http'):
if (content_length is not None and int(content_length) > 16):
(site, path, content_length, content_type ) = whichURL(___location)
# I should really put in 404 error handling code, but this has been working just fine.
if (content_length is not None):
if (re.findall(r'pdf|octet-stream', content_type)):
# Rebuild the template with the size annotation and a hidden comment
# recording content type and exact byte count.
return prefix_text + parameter_prefix + binary_notation(content_length) + '<!-- ' + content_type + ', ' + str(content_length) +" bytes -->}}"
else:
return template_text
wikipedia.output(u'Unusual content_type: ' + content_type)
return template_text
# NOTE(review): everything below is merged MediaWiki revision-diff text, not
# runnable Python — indentation is stripped and old/new revision tokens are
# fused on single lines (e.g. "startTextstate0" is old name "startText" fused
# with new name "state0").  Comments describe the apparent intent only.
def process_article(site, pageName):
# If anything else return the template_text back
page = wikipedia.Page(site, pageName)

# The def line below fuses the old signature with the new one; the new
# revision appears to receive the Page object directly — TODO confirm.
def process_article(site, pageNamepage):
wikitext = page.get()
# Fix Casing (Reduces the number of possible expressions)
wikitext = re.sub(r'\{\{ *(Template:|template:|)(PDF|Pdf|pdf)', r'{{PDF', wikitext)
# State point. Count any changes as needing an update if they're after this line
startTextstate0 = wikitext
# Convert fromhard thecoded oldpdf stylelinks to the(ex: new[http stylelink] (pdf) )
wikitext = re.sub(r'(\[http[^]]*\]) *\((\[\[[^|\]]*|)(PDF|pdf)(\]\]|)\)', r'{{PDFlink|\1}}', wikitext)
# Convert from the old style to the new style (ex: [http link] {{PDF}} or {{PDF}} [http link] )
wikitext = re.sub(r'(\(|)\{\{(PDFlink|PDF)\}\}(\)|) *(\[http[^]]*\])', r'{{\2|\4}}', wikitext)
wikitext = re.sub(r'("|)(\[http[^]]*\])("|)([^a-zA-Z(]*) *(\(|)\{\{(PDFlink|PDF)\}\}(\)|)', r'{{\36|\12}}\4', wikitext)
# Remove PDFlink forfrom citation Templatestemplates
wikitext = re.sub(r'(format *= *)(\(|)\{\{PDF(|link)\}\}(\(|)', r'\1PDF', wikitext)
wikitext = re.sub(r'(\{\{(Cite|cite)[^}]*)(}}[^a-zA-Z]*)(\(|)\{\{(PDF|PDFlink)\}\}(\)|)', r'\1 |format=PDF\3', wikitext)
wikitext = re.sub(r'(\{\{.ite web[^}]*) *(\(|)\{\{(PDF|PDFlink)\}\}(\)|) *([^}]*\}\})', r'\1 |format=PDF \5', wikitext)
# Fix equal sign problem
m = re.findall(r'(\{\{PDF[^|}]*\|[^}]*\}\})', wikitext )
wikitext = re.sub(r'(PDF|PDFlink)\|(1=|)([^{|}]*=[^{|}]*)', r'\1|1=\3', wikitext)
state1 = wikitext
if (m is None):
m = re.findall(r'(\{\{PDF[^|}]*\|[^}]*\}\})', wikitext )
# NOTE(review): "Tempate" typo is in the runtime message of the original;
# left untouched here (doc-only edit).
wikipedia.output(u"Error: Tempate:PDFlink not found.")
return
# For each {{PDFlink}} occurrence that contains an external link, recompute
# its size parameter and splice the replacement back into the page text.
for s in m:
if (re.findall(r'\[http', s)):
replacetext = update_size_paramter(s)
wikitext = re.sub(re.escape(s), replacetext, wikitext)
# Uncomment the bellow line to see the replacement text
# wikipedia.output(replacetext)
# print replacetext.encode('ascii', 'replace')
wikitext = re.sub(re.escape(s), replacetext, wikitext)
sizeChangeif = len(wikitext) -== len(startTextstate1):
if (sizeChange > 0):
EditMsg = 'Corrected use of {{[[Template:PDFlink|PDFlink]]}}'
else:
Line 118 ⟶ 132:
wikipedia.setAction(EditMsg)
# If the text has changed at all since, upload the newstate point, upload versionit
if (startTextwikitext != wikitextstate0):
wikipedia.output(u'UploadingPage updatedchange version.by %s Detlabytes. byte count:Writing %snew version.' % str(sizeChangelen(wikitext)-len(state0)))
# page.put(wikitext)

# Pause to reduce load on the servers
# NOTE(review): the lines below interleave process_article's tail with the
# old revision's deleted serverlist() helper (superseded by
# pagegenerators.ReferringPageGenerator in the new revision).
def serverlist(site, pageName):
time.sleep(timer*60writeDelay)
s = wikipedia.Page(site, unicode(pageName))
else:
return [page for page in s.getReferences(onlyTemplateInclusion=True)]
time.sleep(readDelay)
def main():
    """Entry point: build a page generator from the command-line arguments
    and run process_article() over every qualifying article page.

    Recognised arguments (see the module docstring):
      -ref:Page   update pages transcluding the given page
      -file:name  update pages listed in a text file
      -cat:Name   update pages in the given category

    NOTE(review): the original text of this function was a merged MediaWiki
    revision diff (old and new revision tokens fused on single lines) and was
    not valid Python.  This body reconstructs the newer revision implied by
    the fused tokens; confirm against the actual page history before running.
    """
    site = wikipedia.getSite()
    # Parse command-line arguments into a page generator.
    for arg in wikipedia.handleArgs():
        if arg.startswith('-ref:'):
            referredPage = wikipedia.Page(site, arg[5:])
            gen = pagegenerators.ReferringPageGenerator(referredPage)
        elif arg.startswith('-file:'):
            gen = pagegenerators.TextfilePageGenerator(arg[6:])
        elif arg.startswith('-cat:'):
            cat = catlib.Category(site, arg[5:])
            gen = pagegenerators.CategorizedPageGenerator(cat)
        else:
            wikipedia.showHelp(u'pdfbot')
            return

    wikipedia.output(u'Read delay is %s seconds.' % readDelay)
    wikipedia.output(u'Write delay is %s seconds.\n' % writeDelay)

    for page in gen:
        # Skip non-article namespaces (User, Wikipedia, Image, talk pages, ...).
        if not re.findall(r'(User|Wikipedia|Image|MediaWiki|Template|Help|Category|Portal)(|Talk)(| talk):', page.title()):
            # presumably the new revision passes the Page object itself, since
            # process_article's new signature reads page.get() — TODO confirm
            process_article(site, page)
 
# Script entry point.  The try/finally ensures wikipedia.stopme() runs even if
# main() raises, so the bot framework is shut down cleanly.
# NOTE(review): indentation was stripped by the wiki paste; restore the usual
# 4-space indent under try/finally before running.
if __name__ == "__main__":
try:
main()
finally:
wikipedia.stopme()
</nowiki></pre>