User:PDFbot/pdfbot.py: Difference between revisions

Content deleted Content added
Many fixes, more commenting
code sync, big feature: Dead external link repair
Line 9:
Command line options:
 
-file: Update article pages listed in a text file.
-ref: Update article pages transcluding from a given page.
-cat: Update article pages from the given category.
-links: Update pages linked from a given page.
 
"""
Line 20 ⟶ 21:
 
# Define global variables
writeDelay = 60 # seconds
readDelay = 1520 # seconds
httpDebug = 0
userAgent = 'PDFbot (http://en.wikipedia.org/wiki/User:PDFbot)'
 
def whichURLcheckLink(___location):
redirectCounter = 6
try:
Line 38 ⟶ 39:
response = conn.getresponse()
___locationredirect = response.msg.getheader('___location')
content_length = response.msg.getheader('content-length')
content_type = response.msg.getheader('content-type')
response_code = response.status
conn.close()
redirectCounter -= 1
if(redirectCounter > 0 and ___locationredirect is not None):
if(redirect[:4] != "http"):
conn.close()
wikipedia ___location = urlparse.outputurljoin( u'Redirecting to %s' % ___location, redirect)
else:
___location = redirect
content_length = response.msg.getheader('content-length')
wikipedia.output( u'Redirecting (%s) to %s' % (response_code, ___location) )
content_type = response.msg.getheader('content-type')
else:
response_code = response.status
___location = None
conn.close()
return ( [site___location, pathresponse_code, content_length, content_type] )
except:
wikipedia.output(u'Error with URL: %s' % ___location)
return ( [None___location, None006, None, None] )
 
# Convert the byte count to a human readable value
Line 58 ⟶ 63:
a = float(size)
exponent = 0
while (a >= 1000. ):
a /= 1024.
exponent += 3
Line 67 ⟶ 72:
if (byteSigs.endswith('.')):
byteSigs = byteSigs[:3]
return ( byteSigs + ' ' + prefix[exponent / 3] )
 
def fix_broken_links(hypertext):
#This function attempts to fix multiple broken links using its dictionary
 
# Moving of resources
hypertext = re.sub(r'virginiadot.org/infoservice/resources/', r'virginiadot.org/info/resources/', hypertext)
hypertext = re.sub(r'ncdot.org/transit/aviation/ncairports/locations/pdf/', r'ncdot.org/transit/aviation/download/ncairports/', hypertext)
# 301 Permanent Redirects
hypertext = re.sub(r'transportation.ky.gov/planning/', r'www.planning.kytc.ky.gov/', hypertext)
hypertext = re.sub(r'official-documents.co.uk/', r'official-documents.gov.uk/', hypertext)
return hypertext
 
def update_size_paramter(template_text):
___location fixed_text = re.searchfix_broken_links(r'(http[^] |}]*)', template_text).group(1)
 
prefix_text = re.search(r'(\{\{[^|]*\|[^|}]*)[^}]*\}\}', template_text).group(1)
___location = re.search(r'(http[^] |}]*)', fixed_text).group(1)
prefix_text = re.search(r'(\{\{[^|]*\|[^|}]*)[^}]*\}\}', template_textfixed_text).group(1)
if (re.findall(r'=', template_text)):
Line 78 ⟶ 98:
parameter_prefix = '|'
# Parse indirect HTML character references
___location = re.sub(r'&\#61;', r'=', ___location)
___location = re.sub(r'&', r'&', ___location)
(siteredirect, pathresponse, content_length, content_type) = whichURLcheckLink(___location)
if (content_type is not None and content_length is not None and int(content_length) > 16):
# I should really put in 404 error handling code, but this has been working just fine.
if (re.findall(r'pdf|octet-stream', content_type)):
return prefix_text + parameter_prefix + binary_notation(content_length) + '<!-- ' + content_type + ', ' + str(content_length) +" bytes -->}}"
else:
wikipedia.output(u'Unusual content_type: %s, code: %s' +% (content_type, response))
return template_text
# If anything else return the template_text back
 
def process_article(page):
Line 106 ⟶ 126:
# Convert from the old style to the new style (ex: [http link] {{PDF}} or {{PDF}} [http link] )
wikitext = re.sub(r'(\(|)\{\{(PDFlink|PDF)\}\}(\)|) *(\[http[^]]*\])', r'{{\2|\4}}', wikitext)
wikitext = re.sub(r'(("|)(\[http[^]]*\])("|))([^a-zA-Z(]*) *(\(|)\{\{(PDFlink|PDF)\}\}(\)|)', r'{{\6|\21}}\4', wikitext)
# Experimental: Convert with tag at the end of a bullet list (ex: * [http link] some text ([[PDF]]) )
wikitext = re.sub(r'(\n\*[^\n:/]*)(\[http[^]]*\])([^\n:/]*) *\((\[\[|\{\{|)(Portable Document Format\|PDF|PDFlink|PDF|pdf)(\}\}|\]\]|)\)', r'\1{{PDF|\2}}\3', wikitext)
wikitext = re.sub(r'(PDF|PDFlink)\|(1=|n\*[^\n:/]*)(\[http[^{|}]]*=\])([^{|}\n:/]*) *\{\{(PDFlink|PDF)\}\}', r'\1{{\4|1=\2}}\3', wikitext)
# Remove PDFlink from citation templates
wikitext = re.sub(r'(format *= *)(PDF|pdf|)(\(|)\{\{PDF(|link)\}\}(\(|)', r'\1PDF', wikitext)
wikitext = re.sub(r'(\{\{(Cite|cite)[^}]*)(}}[^a-zA-Z]*)(\(|)\{\{(PDF|PDFlink)\}\}(\)|)', r'\1 |format=PDF\3', wikitext)
wikitext = re.sub(r'(\{\{.ite web[^}]*) *(\(|)\{\{(PDF|PDFlink)\}\}(\)|) *([^}]*\}\})', r'\1 |format=PDF \5', wikitext)
# Fix equal sign problem
wikitext = re.sub(r'(PDF|PDFlink)\|(1=|)([^{|}]*=[^{|}]*)', r'\1|1=\3', wikitext)
state1 = wikitext
m = re.findall(r'\{\{(PDF[^|}]*PDFlink)\|[^}]*\}\}', wikitext)
for s in m:
if (re.findall(r'http:', s)):
replacetext = update_size_paramter(s)
wikitext = re.sub(re.escape(s), replacetext, wikitext)
# Uncomment the below line to see the replacement text
# wikipedia.output(replacetext)
wikitext = re.sub(re.escape(s), replacetext, wikitext)
# Fix equal sign problem
wikitext = re.sub(r'\{\{(PDF|PDFlink)\|(1=|)([^{|}]+=[^{|}]+)', r'{{\1|1=\3', wikitext)
if (wikitext == state1):
if (wikitext == state1):
# Nothing was done with the embedded file sizes string
EditMsg = 'Corrected use of {{[[Template:PDFlink|PDFlink]]}}'
else:
if len(wikitext) - len(state1) > 34:
EditMsg = 'Updating filesize for external links tagged with {{[[Template:PDFlink|PDFlink]]}}'
# Minimum of 34 bytes to add file size information
EditMsg = 'UpdatingAdded filesizefile size for external links tagged with {{[[Template:PDFlink|PDFlink]]}}'
else:
EditMsg = 'Updated file size of transcluded {{[[Template:PDFlink|PDFlink]]}}'
wikipedia.setAction(EditMsg)
# If the text has changed at all since the state point, upload it
if (wikitext != state0):
try:
wikipedia.output(u'Page change by %s bytes. Writing new version.' % str(len(wikitext)-len(state0)))
page.put(wikitext)
except:
wikipedia.output(u'------- Write error ------')
# Pause to reduce load on the servers
time.sleep(writeDelay)
Line 146 ⟶ 178:
for arg in wikipedia.handleArgs():
if (arg.startswith('-ref:')):
referredPage = wikipedia.Page(site, arg[5:])
gen = pagegenerators.ReferringPageGenerator(referredPage)
elif (arg.startswith('-file:')):
gen = pagegenerators.TextfilePageGenerator(arg[6:])
elif (arg.startswith('-cat:')):
cat = catlib.Category(site, arg[5:])
gen = pagegenerators.CategorizedPageGenerator(cat)
elif arg.startswith('-links:'):
pagelinks = wikipedia.Page(wikipedia.getSite(), arg[7:])
gen = pagegenerators.LinkedPageGenerator(pagelinks)
else:
wikipedia.showHelp(u'pdfbot')
Line 160 ⟶ 195:
wikipedia.output(u'Read delay is %s seconds.' % readDelay)
wikipedia.output(u'Write delay is %s seconds.\n' % writeDelay)
# Only process pages from the main namespace
gen = pagegenerators.NamespaceFilterPageGenerator(gen, [0])
for page in gen:
process_article(page)
if (not re.findall(r'(User|Wikipedia|Image|MediaWiki|Template|Help|Category|Portal|Talk)(| talk):', page.title())):
wikipedia.output(u'\nOperation Complete.\n')
process_article(page)
 
if __name__ == "__main__":