User:PDFbot/pdfbot.py: Difference between revisions

Content deleted Content added
Major addition: Interlanguage support
Lots of small fixes, standardization of output messages, fixed bug with "?" in query, regex optimizations, support for SI bases
Line 21:
#
 
import re, sys, httplib, time
import wikipedia, pagegenerators, login, config, catlib
fromimport urllib2httplib, importsocket, urlparse
 
# Define global constants
readDelay = 20 # seconds
writeDelay = 60 # seconds
prefixSI_prefix = ['bytes', '[[KibibyteKilobyte|KiBkB]]', '[[MebibyteMegabyte|MiBMB]]', '[[GibibyteGigabyte|GiBGB]]']
IEC_prefix = ['bytes', '[[Kibibyte|KiB]]', '[[Mebibyte|MiB]]', '[[Gibibyte|GiB]]']
urlpattern = re.compile(r'http[s]?://[^][>< \n|]*', re.IGNORECASE)
# following char sperate url from title: []"<>\ \n
userAgent = 'PDFbot (http://en.wikipedia.org/wiki/User:PDFbot)'
# {|} is included since we're in a template
urlpattern = re.compile(r'http[s]?://[^][>< >\ns"{|}]*', re.IGNORECASE)
httpHeader = {
userAgent = 'User-Agent': 'PDFbot (http://en.wikipedia.org/wiki/User:PDFbot)',
'Accept': 'application/pdf,application/octet-stream,*/*;q=0.5',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Keep-Alive': '30',
'Connection': 'keep-alive',
}
 
# Edit summary messages
Line 50 ⟶ 59:
}
 
def checkLink(___location, redirectCounteruseHEAD = True, counter = 5):
try:
while (redirectCountercounter >= 0 and ___location is not None):
(scheme, site, path, args, query, frag) = urlparse.urlparse(___location)
if query != '':
query = '?' + query
if path == '':
path = '/'
if scheme == "http":
conn = httplib.HTTPConnection(site)
Line 59 ⟶ 72:
conn = httplib.HTTPSConnection(site)
#conn.set_debuglevel(1)
conn.putrequestrequest('HEAD', path + args + query, None, httpHeader)
#conn.putheader('User-Agent', userAgent)
#conn.endheaders()
response = conn.getresponse()
Line 68 ⟶ 81:
content_type = response.msg.getheader('content-type')
response_code = response.status
response_reason= response.reason
conn.close()
redirectCountercounter -= 1
if(redirect is not None):
wikipedia.output( u'STATUS: HTTP %s Moved: %s to %s' % (response_code, ___location, redirect) )
if(redirect[:4] != "http"):
___location = urlparse.urljoin(___location, redirect)
else:
___location = redirect
wikipedia.output( u'STATUS: HTTP %s Moved: %s' % (response_code, ___location) )
else:
___location = None
return [___location, response_code, response_reason, content_length, content_type]
except httplib.error, arg:
wikipedia.output(u'HTTP ErrorERROR: HTTP %s %s' % (arg, ___location))
return [___location, 752, "", None, None]
except socket.error, arg:
wikipedia.output(u'Error with URLERROR: Socket %s %s' % (arg, ___location))
return [___location, 6arg[0], arg[1], None, None]
 
# Convert the byte count to a human readable value
def binary_notation(size, base = 1024., prefix = IEC_prefix):
a = float(size)
exponent = 0
while(a >= 1000.):
a /= 1024.base
exponent += 3
 
Line 101 ⟶ 115:
return byteSigs + '&nbsp;' + prefix[exponent / 3]
 
def fix_broken_links(link):
    """Rewrite known stale URLs to their current ___location.

    Applies a fixed table of host/path substitutions for resources that
    have moved or whose sites issue permanent (301) redirects.

    Parameters:
        link: URL string (may be a larger text containing the URL).
    Returns:
        The text with any known stale prefixes replaced; returned
        unchanged when no rule matches.
    """
    # Moving of resources
    link = link.replace('virginiadot.org/infoservice/resources/', 'virginiadot.org/info/resources/')
    link = link.replace('ncdot.org/transit/aviation/ncairports/locations/pdf/', 'ncdot.org/transit/aviation/download/ncairports/')
    link = link.replace('waitangi-tribunal.govt.nz/doclibrary/researchwhanui/', 'waitangi-tribunal.govt.nz/doclibrary/public/researchwhanui/')
    # 301 Permanent Redirects
    link = link.replace('transportation.ky.gov/planning/', 'www.planning.kytc.ky.gov/')
    link = link.replace('official-documents.co.uk/', 'official-documents.gov.uk/')
    link = link.replace('http://bmj.bmjjournals.com/', 'http://www.bmj.com/')
    link = link.replace('http://bris.ac.uk/', 'http://www.bristol.ac.uk/')
    link = link.replace('http://www.shef.ac.uk/socst/', 'http://www.shef.ac.uk/socstudies/')
    link = link.replace('http://www.sims.berkeley.edu:8000/', 'http://www2.sims.berkeley.edu/')
    link = link.replace('http://www.cs.wm.edu/hpcs/', 'http://www.cse.ohio-state.edu/hpcs/')
    link = link.replace('http://www.pchrgaza.org/', 'http://www.pchrgaza.ps/')
    link = link.replace('http://www.almlondon.org.uk/', 'http://www.mlalondon.org.uk/')
    link = link.replace('http://www.state.ma.us/eot/', 'http://www.eot.state.ma.us/')
    link = link.replace('http://www.aapt.org.au/', 'http://www.ausapt.org.au/')
    link = link.replace('http://berlin.usembassy.gov/', 'http://germany.usembassy.gov/')
    return link
 
def update_size_paramter(template_text):
m = re.search(r'\{\{(?P<tpl>[^|]*)\|(1=)?(?P<text>[^|]*).*(, (?P<size>\d+) bytes .*)??\}\}', fix_broken_links(template_text))
# following char sperate url from title: []"<>\ \n
link_text = m.group('text')
# | is included since we're in a template
___location = urlpattern.search(fixed_textlink_text).group(0)
fixed_text = fix_broken_links(template_text)
 
___location = urlpattern.search(fixed_text).group(0)
prefix_text = re.search(r'(\{\{[^|]*\|[^|]*).*\}\}', fixed_text).group(1)
if(m.group('size') is not None and m.group('size') !=''):
if (re.findall(r'=', template_text)):
old_size = int(m.group('size'))
parameter_prefix = '|2='
else:
parameter_prefixold_size = '|0'
if (link_text.find('=') != -1):
parameter_prefix = '|2='
else:
parameter_prefix = ''
# Parse indirect HTML character references
___location = re.sub(r'&\#61(\d\d);', r'=%\1', ___location)
___location = re.sub(r'&amp;', r'&', ___location)
(redirect, response, reason, content_length, content_type) = checkLink(___location)
if (content_type is not None and content_length is not None and int(content_length) > 168 and int(content_length) != old_size):
# I should really put in 404 error handling code, but this has been working just fine.
if re.findall(r'pdf|octet-stream', content_type):
return u'{{%s|%s|%s%s<!-- %s, %s bytes -->}}' % (m.group('tpl'), link_text, parameter_prefix, binary_notation(content_length), content_type, content_length )
return prefix_text + parameter_prefix + binary_notation(content_length) + '<!-- ' + content_type + ', ' + str(content_length) + ' bytes -->}}'
else:
wikipedia.output(u'UnusualFIXME: Bad content_typeresponse: content_type=%s, code:=%s, ___location=%s' % (content_type, response, ___location))
return template_text
# If anything else return template_text back
Line 151 ⟶ 179:
# Convert hard coded pdf links (ex: [http link] (pdf) )
wikitext = re.sub(r'(\[http\w*://[^][]*\]) *\((\[\[[^|\]]*|)?\.?(PDF|pdf) *([Ff]ile)? *([Ff]ormat)?(\]\]|)?\)', r'{{PDFlink|\1}}', wikitext)
# Convert from the old style to the new style (ex: [http link] {{PDF}} or {{PDF}} [http link] )
wikitext = re.sub(r'[(\(|)]?\{\{(PDFlink|PDF)\}\}(\)|[)]? *(\[http://[^][]*\])', r'{{\21|\42}}', wikitext)
wikitext = re.sub(r'(("|)?\[http[^]]*\]("|)?)([^a-zA-ZZ0-9()]*) *[(\(|)]?\{\{ *(PDFlink|PDF) *\}\}(\)|[)]?', r'{{\63|\1}}\42', wikitext)
# Experimental: Convert with with tag at the end of a bullet list (ex: * [http link] some text ([[PDF]]) )
wikitext = re.compile(r'(\n *\*+[^\n:/]*)(\[http://[^][]*\])([^\n:/]*) *\[(](\[\[|\{\{|)?(Portable Document Format\[|]PDF|PDFlinkpdflink).?(pdf.?)?(file|format|datei|)?(\}\}|\]\]|)\?[)]', re.IGNORECASE).sub(r'\1{{PDFPDFlink|\2}}\3', wikitext)
wikitext = re.sub(r'(\n *\*+[^\n:/]*)(\[http://[^][]*\])([^\n:/]*) *\{\{(PDFlink|PDF)\}\}', r'\1{{\4|\2}}\3', wikitext)
# Remove PDFlink from citation templates
# {{cite |format={{PDF}}}}
wikitext = re.sub(r'(format *= *)(PDF|pdf|)(\(|)\{\{PDF[^{}]*\}\}(\)|)', r'\1PDF', wikitext)
# {{cite.*?}}{{PDF}}
wikitext = re.sub(r'(\{\{(Cite|cite)[^}]*)(}}[^a-zA-Z]*)(\(|)\{\{(PDF|PDFlink)\}\}(\)|)', r'\1 |format=PDF\3', wikitext)
# {{cite | lang= EN {{PDF}} }}
wikitext = re.sub(r'(\{\{.ite web[^}]*) *(\(|)\{\{(PDF|PDFlink)\}\}(\)|) *([^}]*\}\})', r'\1 |format=PDF \5', wikitext)
state1 = wikitext
m = re.findall(r'\{\{PDF[link]{0,4}\|[^{}]*?\}\}', wikitext)
for s in m:
Line 174 ⟶ 205:
wikitext = re.sub(re.escape(s), replacetext, wikitext)
# Uncomment the bellow line to see the replacement text
# wikipedia.output(u'OUTPUT: %s' % replacetext)
# Fix equal sign problem
wikitext = re.sub(r'\{\{(PDF|PDFlink)\|(1=|)(.{2}[^{|}]+=[^{|}]+)', r'{{\1|1=\3', wikitext)
Line 181 ⟶ 212:
# Test to see if file sizes parameter was untouched
if wikitext == state1:
if len(wikitext) - len(state1state0) <= 4:
# 4 or more bytes removed typically indicate a embed citation removal
EditMsg = msg_removed_cite
Line 193 ⟶ 224:
EditMsg = msg_updated
wikipedia.setAction(wikipedia.translate(page.site().language(), EditMsg))
# altert me if the page contains {{pdflink|no-link}}
if re.findall(r'\{\{PDF(link|)\|[^:]*\}\}', wikitext):
wikipedia.output(u'FIXME: No link in {{PDFlink}}')
# If the text has changed at all since the state point, upload it
if (wikitext != state0):
try:
wikipedia.output(u'PageWRITE: Delta changelength byof %s bytes. Writing new version.' % str(len(wikitext)-len(state0)))
page.put(wikitext)
except:
wikipedia.output(u'-------ERROR: Except Writeraised errorwhile ------writing.')
# Pause to reduce load on the servers
time.sleep(writeDelay)
Line 210 ⟶ 245:
site = wikipedia.getSite()
gen = None
namespaces = [0]
for arg in wikipedia.handleArgs():
Line 226 ⟶ 262:
page = wikipedia.Page(wikipedia.getSite(), unicode(arg[6:]))
gen = iter([page])
elif arg.startswith('-namespace:'):
namespaces.append(int(arg[11:]))
 
if gen is None:
Line 233 ⟶ 271:
wikipedia.output(u'Read delay is %s seconds.' % readDelay)
wikipedia.output(u'Write delay is %s seconds.\n' % writeDelay)
if namespaces != []:
# Only process pages from the main namespace
gen = pagegenerators.NamespaceFilterPageGenerator(gen, [0]namespaces)
gen = pagegenerators.RedirectFilterPageGenerator(gen)
for page in gen:
process_article(page)
wikipedia.output(u'\nOperation Complete.\n')