User:PDFbot/pdfbot.py: Difference between revisions

Content deleted Content added
Posting source. Please note there are bugs related to & in the display.
 
Many fixes, more commenting
Line 9:
Command line options:
 
-file: Update all article pages listed in a text file.
-ref: Update all article pages transcluding from a given page.
-cat: Update article pages from the given category.
 
"""
 
import re, sys, httplib, time
import wikipedia, pagegenerators, login, config, codecscatlib
from urllib2 import urlparse
 
# Module-wide tuning constants.
writeDelay = 60 # seconds to pause after writing a page (throttles server load)
readDelay = 15 # seconds to pause between successive page reads
httpDebug = 0 # debug level handed to httplib's set_debuglevel(); 0 = silent
userAgent = 'PDFbot (http://en.wikipedia.org/wiki/User:PDFbot)' # HTTP User-Agent sent with HEAD requests
 
# NOTE(review): this block is a pasted MediaWiki revision diff, not runnable
# Python — indentation was stripped, old and new revision tokens are fused on
# some lines, and the "Line 34 ⟶ 41:" style lines below are diff elision
# markers (the code between them is missing from this view).
def whichURL(___location):
# Follow HTTP redirects for `___location` by issuing HEAD requests, up to a
# fixed cap, and return metadata about the final target as
# [site, path, content_length, content_type].
redirectCounter = 86 # maximum number of redirects to chase before giving up
try:
while (redirectCounter > 0 and ___location is not None ):
# Split the URL so the HEAD request can be issued against site + path.
(scheme, site, path, args, query, frag) = urlparse.urlparse(___location)
path = path + args + query
conn = httplib.HTTPConnection(site)
# "0httpDebug" is a diff-merge artifact: old literal 0 fused with the new
# httpDebug global.
conn.set_debuglevel(0httpDebug)
conn.putrequest('HEAD', path)
# diff-merge artifact: the hard-coded UA string was replaced by the
# userAgent global in the new revision.
conn.putheader('User-Agent', 'PDFbot (http://en.wikipedia.org/wiki/User:PDFbot)'userAgent)
conn.endheaders()
Line 34 ⟶ 41:
# (elided lines: presumably the response is read here and `___location` is
# refreshed from the Location header — missing from this view, TODO confirm)
redirectCounter -= 1
if(redirectCounter > 0 and ___location is not None ):
conn.close()
wikipedia.output( u'Redirecting to %s' % ___location )
Line 40 ⟶ 47:
# Metadata of the final (non-redirect) response; `response` is bound in the
# elided code above.
content_length = response.msg.getheader('content-length')
content_type = response.msg.getheader('content-type')
response_code = response.status
conn.close()
return ( [site, path, content_length, content_type] )
Line 50 ⟶ 58:
# NOTE(review): fragment of the byte-size pretty-printer (binary_notation,
# judging from the call in update_size_paramter). Its def line and return
# statement are elided by the surrounding wiki-diff markers, so the full
# contract cannot be stated from here.
a = float(size)
exponent = 0
# Scale down by 1024 per step; exponent grows by 3 per step, presumably to
# index `prefix` below (0 -> bytes, 3 -> KiB, 6 -> MiB, 9 -> GiB) — TODO confirm.
while a >= 1000. :
a /= 1024.
exponent += 3
prefix = ['bytes', '[[Kibibyte|KiB]]', '[[Mebibyte|MiB]]', '[[Gibibyte|GiB]]']
 
# Truncate and remove trailing dot
Line 62 ⟶ 70:
 
def update_size_paramter(template_text):
# Build an updated {{PDFlink}} template string: extract the external link
# from `template_text`, HEAD it via whichURL(), and append a human-readable
# file size plus an HTML comment with the raw metadata.  Returns
# `template_text` unchanged when the size/type cannot be confirmed.
# NOTE(review): merged wiki-diff text; the "Line 74 ⟶ 82:" line below is an
# elision marker — `parameter_prefix` is defined in the missing code.
___location = re.search(r'\[(http[^] |}]*) ', template_text ).group(1)
prefix_text = re.search(r'(\{\{[^|]*\|[^|}]*)[^}]*\}\}', template_text ).group(1)
if (re.findall(r'=', template_text)):
Line 74 ⟶ 82:
# NOTE(review): the substitution below looks like a no-op; the page header
# warns of "bugs related to & in the display" — the first pattern was
# presumably '&amp;' before the wiki unescaped it.  TODO confirm.
___location = re.sub(r'&', r'&', ___location)
(site, path, content_length, content_type ) = whichURL(___location)
if (___location.lower()[:4] == 'http'):
if (content_length is not None and int(content_length) > 16):
(site, path, content_length, content_type ) = whichURL(___location)
# I should really put in 404 error handling code, but this has been working just fine.
if (content_length is not None):
if (re.findall(r'pdf|octet-stream', content_type)):
# Rebuild the template with the size annotation and a hidden comment
# recording content type and exact byte count.
return prefix_text + parameter_prefix + binary_notation(content_length) + '<!-- ' + content_type + ', ' + str(content_length) +" bytes -->}}"
else:
return template_text
wikipedia.output(u'Unusual content_type: ' + content_type)
return template_text
# NOTE(review): everything below is merged MediaWiki revision-diff text, not
# runnable Python — indentation is stripped and old/new revision tokens are
# fused on single lines (e.g. "startTextstate0" is old name "startText" fused
# with new name "state0").  Comments describe the apparent intent only.
def process_article(site, pageName):
# If anything else return the template_text back
page = wikipedia.Page(site, pageName)

# The def line below fuses the old signature with the new one; the new
# revision appears to receive the Page object directly — TODO confirm.
def process_article(site, pageNamepage):
wikitext = page.get()
# Fix Casing (Reduces the number of possible expressions)
wikitext = re.sub(r'\{\{ *(Template:|template:|)(PDF|Pdf|pdf)', r'{{PDF', wikitext)
# State point. Count any changes as needing an update if they're after this line
startTextstate0 = wikitext
# Convert fromhard thecoded oldpdf stylelinks to the(ex: new[http stylelink] (pdf) )
wikitext = re.sub(r'(\[http[^]]*\]) *\((\[\[[^|\]]*|)(PDF|pdf)(\]\]|)\)', r'{{PDFlink|\1}}', wikitext)
# Convert from the old style to the new style (ex: [http link] {{PDF}} or {{PDF}} [http link] )
wikitext = re.sub(r'(\(|)\{\{(PDFlink|PDF)\}\}(\)|) *(\[http[^]]*\])', r'{{\2|\4}}', wikitext)
wikitext = re.sub(r'("|)(\[http[^]]*\])("|)([^a-zA-Z(]*) *(\(|)\{\{(PDFlink|PDF)\}\}(\)|)', r'{{\36|\12}}\4', wikitext)
# Remove PDFlink forfrom citation Templatestemplates
wikitext = re.sub(r'(format *= *)(\(|)\{\{PDF(|link)\}\}(\(|)', r'\1PDF', wikitext)
wikitext = re.sub(r'(\{\{(Cite|cite)[^}]*)(}}[^a-zA-Z]*)(\(|)\{\{(PDF|PDFlink)\}\}(\)|)', r'\1 |format=PDF\3', wikitext)
wikitext = re.sub(r'(\{\{.ite web[^}]*) *(\(|)\{\{(PDF|PDFlink)\}\}(\)|) *([^}]*\}\})', r'\1 |format=PDF \5', wikitext)
# Fix equal sign problem
m = re.findall(r'(\{\{PDF[^|}]*\|[^}]*\}\})', wikitext )
wikitext = re.sub(r'(PDF|PDFlink)\|(1=|)([^{|}]*=[^{|}]*)', r'\1|1=\3', wikitext)
state1 = wikitext
if (m is None):
m = re.findall(r'(\{\{PDF[^|}]*\|[^}]*\}\})', wikitext )
# NOTE(review): "Tempate" typo is in the runtime message of the original;
# left untouched here (doc-only edit).
wikipedia.output(u"Error: Tempate:PDFlink not found.")
return
# For each {{PDFlink}} occurrence that contains an external link, recompute
# its size parameter and splice the replacement back into the page text.
for s in m:
if (re.findall(r'\[http', s)):
replacetext = update_size_paramter(s)
wikitext = re.sub(re.escape(s), replacetext, wikitext)
# Uncomment the bellow line to see the replacement text
# wikipedia.output(replacetext)
# print replacetext.encode('ascii', 'replace')
wikitext = re.sub(re.escape(s), replacetext, wikitext)
sizeChangeif = len(wikitext) -== len(startTextstate1):
if (sizeChange > 0):
EditMsg = 'Corrected use of {{[[Template:PDFlink|PDFlink]]}}'
else:
Line 118 ⟶ 132:
wikipedia.setAction(EditMsg)
# If the text has changed at all since, upload the newstate point, upload versionit
if (startTextwikitext != wikitextstate0):
wikipedia.output(u'UploadingPage updatedchange version.by %s Detlabytes. byte count:Writing %snew version.' % str(sizeChangelen(wikitext)-len(state0)))
# page.put(wikitext)

# Pause to reduce load on the servers
# NOTE(review): the lines below interleave process_article's tail with the
# old revision's deleted serverlist() helper (superseded by
# pagegenerators.ReferringPageGenerator in the new revision).
def serverlist(site, pageName):
time.sleep(timer*60writeDelay)
s = wikipedia.Page(site, unicode(pageName))
else:
return [page for page in s.getReferences(onlyTemplateInclusion=True)]
time.sleep(readDelay)
def main():
    """Entry point: build a page generator from the command-line arguments
    and run process_article() over every qualifying article page.

    Recognised arguments (see the module docstring):
      -ref:Page   update pages transcluding the given page
      -file:name  update pages listed in a text file
      -cat:Name   update pages in the given category

    NOTE(review): the original text of this function was a merged MediaWiki
    revision diff (old and new revision tokens fused on single lines) and was
    not valid Python.  This body reconstructs the newer revision implied by
    the fused tokens; confirm against the actual page history before running.
    """
    site = wikipedia.getSite()
    # Parse command-line arguments into a page generator.
    for arg in wikipedia.handleArgs():
        if arg.startswith('-ref:'):
            referredPage = wikipedia.Page(site, arg[5:])
            gen = pagegenerators.ReferringPageGenerator(referredPage)
        elif arg.startswith('-file:'):
            gen = pagegenerators.TextfilePageGenerator(arg[6:])
        elif arg.startswith('-cat:'):
            cat = catlib.Category(site, arg[5:])
            gen = pagegenerators.CategorizedPageGenerator(cat)
        else:
            wikipedia.showHelp(u'pdfbot')
            return

    wikipedia.output(u'Read delay is %s seconds.' % readDelay)
    wikipedia.output(u'Write delay is %s seconds.\n' % writeDelay)

    for page in gen:
        # Skip non-article namespaces (User, Wikipedia, Image, talk pages, ...).
        if not re.findall(r'(User|Wikipedia|Image|MediaWiki|Template|Help|Category|Portal)(|Talk)(| talk):', page.title()):
            # presumably the new revision passes the Page object itself, since
            # process_article's new signature reads page.get() — TODO confirm
            process_article(site, page)
 
# Script entry point.  The try/finally ensures wikipedia.stopme() runs even if
# main() raises, so the bot framework is shut down cleanly.
# NOTE(review): indentation was stripped by the wiki paste; restore the usual
# 4-space indent under try/finally before running.
if __name__ == "__main__":
try:
main()
finally:
wikipedia.stopme()
</nowiki></pre>