User:PDFbot/pdfbot.py: Difference between revisions

Content deleted Content added
Many fixes, more commenting
code sync, big feature: Dead external link repair
Line 9:
Command line options:
 
-file: Update article pages listed in a text file.
-ref: Update article pages transcluding from a given page.
-cat: Update article pages from the given category.
-links: Update pages linked from a given page.
 
"""
Line 20 ⟶ 21:
 
# Define global variables
writeDelay = 60 # seconds
readDelay = 1520 # seconds
httpDebug = 0
userAgent = 'PDFbot (http://en.wikipedia.org/wiki/User:PDFbot)'
 
def whichURLcheckLink(___location):
redirectCounter = 6
try:
Line 38 ⟶ 39:
response = conn.getresponse()
___locationredirect = response.msg.getheader('___location')
content_length = response.msg.getheader('content-length')
content_type = response.msg.getheader('content-type')
response_code = response.status
conn.close()
redirectCounter -= 1
if(redirectCounter > 0 and ___locationredirect is not None):
if(redirect[:4] != "http"):
conn.close()
wikipedia ___location = urlparse.outputurljoin( u'Redirecting to %s' % ___location, redirect)
else:
___location = redirect
content_length = response.msg.getheader('content-length')
wikipedia.output( u'Redirecting (%s) to %s' % (response_code, ___location) )
content_type = response.msg.getheader('content-type')
else:
response_code = response.status
___location = None
conn.close()
return ( [site___location, pathresponse_code, content_length, content_type] )
except:
wikipedia.output(u'Error with URL: %s' % ___location)
return ( [None___location, None006, None, None] )
 
# Convert the byte count to a human readable value
Line 58 ⟶ 63:
a = float(size)
exponent = 0
while (a >= 1000. ):
a /= 1024.
exponent += 3
Line 67 ⟶ 72:
if (byteSigs.endswith('.')):
byteSigs = byteSigs[:3]
return ( byteSigs + ' ' + prefix[exponent / 3] )
 
def fix_broken_links(hypertext):
#This function attempts to fix multiple broken links using its dictionary
 
# Moving of resources
hypertext = re.sub(r'virginiadot.org/infoservice/resources/', r'virginiadot.org/info/resources/', hypertext)
hypertext = re.sub(r'ncdot.org/transit/aviation/ncairports/locations/pdf/', r'ncdot.org/transit/aviation/download/ncairports/', hypertext)
# 301 Permanent Redirects
hypertext = re.sub(r'transportation.ky.gov/planning/', r'www.planning.kytc.ky.gov/', hypertext)
hypertext = re.sub(r'official-documents.co.uk/', r'official-documents.gov.uk/', hypertext)
return hypertext
 
def update_size_paramter(template_text):
___location fixed_text = re.searchfix_broken_links(r'(http[^] |}]*)', template_text).group(1)
 
prefix_text = re.search(r'(\{\{[^|]*\|[^|}]*)[^}]*\}\}', template_text).group(1)
___location = re.search(r'(http[^] |}]*)', fixed_text).group(1)
prefix_text = re.search(r'(\{\{[^|]*\|[^|}]*)[^}]*\}\}', template_textfixed_text).group(1)
if (re.findall(r'=', template_text)):
Line 78 ⟶ 98:
parameter_prefix = '|'
# Parse indirect HTML character references
___location = re.sub(r'&\#61;', r'=', ___location)
___location = re.sub(r'&', r'&', ___location)
(siteredirect, pathresponse, content_length, content_type) = whichURLcheckLink(___location)
if (content_type is not None and content_length is not None and int(content_length) > 16):
# I should really put in 404 error handling code, but this has been working just fine.
if (re.findall(r'pdf|octet-stream', content_type)):
return prefix_text + parameter_prefix + binary_notation(content_length) + '<!-- ' + content_type + ', ' + str(content_length) +" bytes -->}}"
else:
wikipedia.output(u'Unusual content_type: %s, code: %s' +% (content_type, response))
return template_text
# If anything else return the template_text back
 
def process_article(page):
Line 106 ⟶ 126:
# Convert from the old style to the new style (ex: [http link] {{PDF}} or {{PDF}} [http link] )
wikitext = re.sub(r'(\(|)\{\{(PDFlink|PDF)\}\}(\)|) *(\[http[^]]*\])', r'{{\2|\4}}', wikitext)
wikitext = re.sub(r'(("|)(\[http[^]]*\])("|))([^a-zA-Z(]*) *(\(|)\{\{(PDFlink|PDF)\}\}(\)|)', r'{{\6|\21}}\4', wikitext)
# Experimental: Convert with tag at the end of a bullet list (ex: * [http link] some text ([[PDF]]) )
wikitext = re.sub(r'(\n\*[^\n:/]*)(\[http[^]]*\])([^\n:/]*) *\((\[\[|\{\{|)(Portable Document Format\|PDF|PDFlink|PDF|pdf)(\}\}|\]\]|)\)', r'\1{{PDF|\2}}\3', wikitext)
wikitext = re.sub(r'(PDF|PDFlink)\|(1=|n\*[^\n:/]*)(\[http[^{|}]]*=\])([^{|}\n:/]*) *\{\{(PDFlink|PDF)\}\}', r'\1{{\4|1=\2}}\3', wikitext)
# Remove PDFlink from citation templates
wikitext = re.sub(r'(format *= *)(PDF|pdf|)(\(|)\{\{PDF(|link)\}\}(\(|)', r'\1PDF', wikitext)
wikitext = re.sub(r'(\{\{(Cite|cite)[^}]*)(}}[^a-zA-Z]*)(\(|)\{\{(PDF|PDFlink)\}\}(\)|)', r'\1 |format=PDF\3', wikitext)
wikitext = re.sub(r'(\{\{.ite web[^}]*) *(\(|)\{\{(PDF|PDFlink)\}\}(\)|) *([^}]*\}\})', r'\1 |format=PDF \5', wikitext)
# Fix equal sign problem
wikitext = re.sub(r'(PDF|PDFlink)\|(1=|)([^{|}]*=[^{|}]*)', r'\1|1=\3', wikitext)
state1 = wikitext
m = re.findall(r'\{\{(PDF[^|}]*PDFlink)\|[^}]*\}\}', wikitext)
for s in m:
if (re.findall(r'http:', s)):
replacetext = update_size_paramter(s)
wikitext = re.sub(re.escape(s), replacetext, wikitext)
# Uncomment the below line to see the replacement text
# wikipedia.output(replacetext)
wikitext = re.sub(re.escape(s), replacetext, wikitext)
# Fix equal sign problem
wikitext = re.sub(r'\{\{(PDF|PDFlink)\|(1=|)([^{|}]+=[^{|}]+)', r'{{\1|1=\3', wikitext)
if (wikitext == state1):
if (wikitext == state1):
# Nothing was done with the embedded file sizes string
EditMsg = 'Corrected use of {{[[Template:PDFlink|PDFlink]]}}'
else:
if len(wikitext) - len(state1) > 34:
EditMsg = 'Updating filesize for external links tagged with {{[[Template:PDFlink|PDFlink]]}}'
# Minimum of 34 bytes to add file size information
EditMsg = 'UpdatingAdded filesizefile size for external links tagged with {{[[Template:PDFlink|PDFlink]]}}'
else:
EditMsg = 'Updated file size of transcluded {{[[Template:PDFlink|PDFlink]]}}'
wikipedia.setAction(EditMsg)
# If the text has changed at all since the state point, upload it
if (wikitext != state0):
try:
wikipedia.output(u'Page change by %s bytes. Writing new version.' % str(len(wikitext)-len(state0)))
page.put(wikitext)
except:
wikipedia.output(u'------- Write error ------')
# Pause to reduce load on the servers
time.sleep(writeDelay)
Line 146 ⟶ 178:
for arg in wikipedia.handleArgs():
if (arg.startswith('-ref:')):
referredPage = wikipedia.Page(site, arg[5:])
gen = pagegenerators.ReferringPageGenerator(referredPage)
elif (arg.startswith('-file:')):
gen = pagegenerators.TextfilePageGenerator(arg[6:])
elif (arg.startswith('-cat:')):
cat = catlib.Category(site, arg[5:])
gen = pagegenerators.CategorizedPageGenerator(cat)
elif arg.startswith('-links:'):
pagelinks = wikipedia.Page(wikipedia.getSite(), arg[7:])
gen = pagegenerators.LinkedPageGenerator(pagelinks)
else:
wikipedia.showHelp(u'pdfbot')
Line 160 ⟶ 195:
wikipedia.output(u'Read delay is %s seconds.' % readDelay)
wikipedia.output(u'Write delay is %s seconds.\n' % writeDelay)
# Only process pages from the main namespace
gen = pagegenerators.NamespaceFilterPageGenerator(gen, [0])
for page in gen:
process_article(page)
if (not re.findall(r'(User|Wikipedia|Image|MediaWiki|Template|Help|Category|Portal|Talk)(| talk):', page.title())):
wikipedia.output(u'\nOperation Complete.\n')
process_article(page)
 
if __name__ == "__main__":