Utente:Wisbot/coordbot.py

Versione del 24 giu 2007 alle 18:28 di Wiso (discussione | contributi) (Nuova pagina: <code language=phyton> # -*- coding: utf-8 -*- """ This bot will make direct text replacements. It will retrieve information on which pages might need changes either from an XML dump o...)
(diff) ← Versione meno recente | Versione attuale (diff) | Versione più recente → (diff)

# -*- coding: utf-8 -*-

""" This bot will make direct text replacements. It will retrieve information on which pages might need changes either from an XML dump or a text file, or only change a single page.

You can run the bot with the following commandline parameters:

-file - Work on all pages given in a local text file.

              Will read any wiki link and use these articles.
              Argument can also be given as "-file:filename".

-cat - Work on all pages which are in a specific category.

              Argument can also be given as "-cat:categoryname".

-page - Only edit a specific page.

              Argument can also be given as "-page:pagetitle". You can give this
              parameter multiple times to edit multiple pages.

-ref - Work on all pages that link to a certain page.

              Argument can also be given as "-ref:referredpagetitle".

-filelinks - Works on all pages that link to a certain image.

              Argument can also be given as "-filelinks:ImageName".

-links - Work on all pages that are linked to from a certain page.

              Argument can also be given as "-links:linkingpagetitle".

-start - Work on all pages in the wiki, starting at a given page. Choose

              "-start:!" to start at the beginning.
              NOTE: You are advised to use -xml instead of this option; this is
              meant for cases where there is no recent XML dump.

-except:XYZ - Ignore pages which contain XYZ. If the -regex argument is given,

              XYZ will be regarded as a regular expression.

-summary:XYZ - Set the summary message text for the edit to XYZ, bypassing the

              predefined message texts with original and replacements inserted.

-template:XYZ - Accepted on the command line, but the value is currently unused.

-namespace:n - Number of namespace to process. The parameter can be used

              multiple times. It works in combination with all other
              parameters, except for the -start parameter. If you e.g. want to
              iterate over all user pages starting at User:M, use
              -start:User:M.

-always - Don't prompt you for each replacement. (NOTE: this script has no handling for -always; run() asks for confirmation before every edit.)

NOTE: Only use either -xml or -file or -page, but don't mix them.

Examples:

"""

# Utente:Wiso 2007
# Distributed under the terms of the GPL licence

from __future__ import generators

import re
import sys

import catlib
import config
import pagegenerators
import wikipedia

# Version identifier string for this script (looks like a CVS/RCS-style $Id$ keyword).
__version__='$Id: coordbot.py,v 0.1 $'

# Summary messages in different languages
# NOTE: Predefined replacement tasks might use their own dictionary, see 'fixes'
# below.

# Edit summary template. main() installs it as the default action, and
# CoordRobot.run() fills %s with page_en.aslink() before saving a change.
msg = u'robot Aggiungo Template:Coord dalla pagina %s'

# Search/replace table used to convert coordinate templates.
# Each entry is a (regex, replacement) pair, grouped under the keys 'safe' and
# 'notsafe'. CoordRobot.compileregex() compiles the regex of every entry in
# place; CoordRobot.run() only iterates over the 'safe' list.
#
# NOTE(review): the replacement strings below look like rendered wiki output
# (the error text of the {{Coord}} template) rather than the original
# replacement patterns, and the raw string literals span several lines, which
# is not valid Python. Restore them from the original script before running.
# NOTE(review): in the last 'safe' pattern ("coor d") the second numeric group
# `([0-9\.+-])` has no `+` quantifier, so it matches a single character only --
# confirm whether that is intended.
templates = {

   'safe': [
   (r'\{\{ ?[Cc]oord(.*?)\}\}',   r"Template:Coord\1\n"),

(r'{{coor[_ ]title[_ ]d\|([0-9\.-]+)\|([NS])\|([0-9\.-]+)\|([EW])\|?([^}]*?)}}', r"

Il template {{Coord}} ha riscontrato degli errori (istruzioni):
  • d format: latd non è un numero
  • d format: latc diverso da N e da S
  • d format: longd non è un numero
  • d format: longc diverso da E e da W

\n"), (r'{{coor[_ ]title[_ ]dm\|([0-9\.-]+)\|([0-9\.-]+)\|([NS])\|([0-9\.-]+)\|([0-9\.-]+)\|([EW])\|?([^\}]*?)\}\}', r"

Il template {{Coord}} ha riscontrato degli errori (istruzioni):
  • dm format: latd non è un numero
  • dm format: latm non è un numero
  • dm format: latc diverso da N e da S
  • dm format: longd non è un numero
  • dm format: longm non è un numero
  • dm format: longc diverso da E e da W

\n"), (r'{{coor[_ ]title[_ ]dms\|([0-9\.-]+)\|([0-9\.-]+)\|([0-9\.-]+)\|([NS])\|([0-9\.-]+)\|([0-9\.-]+)\|([0-9\.-]+)\|([EW])\|?([^}]*?)}}', r"

Il template {{Coord}} ha riscontrato degli errori (istruzioni):
  • dms format: latd non è un numero
  • dms format: latm non è un numero
  • dms format: lats non è un numero
  • dms format: latc diverso da N e da S
  • dms format: longd non è un numero
  • dms format: longm non è un numero
  • dms format: longs non è un numero
  • dms format: longc diverso da E e da W

\n"), (r'\{\{ ?[Cc]oor[ _]d\|([0-9\.+-]+)\|([0-9\.+-])(\|?[^\|]*)\}\}', r"

Il template {{Coord}} ha riscontrato degli errori (istruzioni):
  • dec format: latd non è un numero
  • dec format: longd non è un numero

\n"),

   ],
   'notsafe': [

(r'\{\{ ?[Cc]oord[ _]dm\|([0-9]+)\|([0-9\.]+)\|([NS])\|([0-9\.]+)\|([0-9\.]+)\|([EW])(\|?[^\|]*)\}\}', r"

Il template {{Coord}} ha riscontrato degli errori (istruzioni):
  • dm format: latd non è un numero
  • dm format: latm non è un numero
  • dm format: latc diverso da N e da S
  • dm format: longd non è un numero
  • dm format: longm non è un numero
  • dm format: longc diverso da E e da W

\n"), (r'\{\{ ?[Cc]oor[ _]dms\|([0-9]+)\|([0-9\.]+)\|([0-9\.]+)\|([NS])\|([0-9\.]+)\|([0-9\.]+)\|([0-9\.]+)\|([EW])(\|?[^\|]*)\}\}', r"

Il template {{Coord}} ha riscontrato degli errori (istruzioni):
  • dms format: latd non è un numero
  • dms format: latm non è un numero
  • dms format: lats non è un numero
  • dms format: latc diverso da N e da S
  • dms format: longd non è un numero
  • dms format: longm non è un numero
  • dms format: longs non è un numero
  • dms format: longc diverso da E e da W

\n"), (r'\{\{.*latd *= *([0-9\.]+).*longd ?= ?([0-9\.]+)', r"

Il template {{Coord}} ha riscontrato degli errori (istruzioni):
  • dec format: latd non è un numero
  • dec format: longd non è un numero

\n")

   ]
   }

# Regular expressions for wiki markup that makes the bot skip a page:
# CoordRobot.checkExceptions() searches the local page text for each of
# them and the page is left untouched on the first hit.
exceptions = [
    r'\{\{ *?Geobox',
    r'\{\{ ?[Cc]oord',
    r'\{\{ ?Template:[Cc]oord',
    r'\{\{ ?[mM]ontagna',
    r'\{\{ ?(Template:)?[cC]omune',
    r'\{\{ ?[cC]ittà',
    r'\{\{ ?[mM]unicipalità',
    r'\{\{ ?[aA]eroporto\|',
    r'\{\{ ?[Mm]unicipi',
    r'\{\{ ?[iI]nfobox[ _]Azienda\|',
    r'\{\{ ?[Ss]\|aziende',
    r'\{\{ ?[Dd]isambigua\|',
    r'\{\{ ?[Ff]razione',
    r'\{\{ ?[Ss]quadra',
    r'\{\{ ?[Pp]asso ?(\||\n)',
    r'\{\{ ?[Bb]undesland[ _]tedesco',
]
   

class CoordRobot:
    """
    A bot that imports coordinates from another Wikipedia.

    For every page yielded by the generator the bot looks for an interwiki
    link to the English Wikipedia, searches the English page text with the
    patterns in ``templates['safe']`` and, after interactive confirmation,
    saves the local page with the converted template prepended to its text.
    """

    def __init__(self, generator, autoTitle = False, autoText = False):
        """
        Store the page generator and compile the module-level patterns.

        generator -- iterable of page objects to work on.
        autoTitle, autoText -- accepted so existing callers keep working;
                               neither value is used by this class.
        """
        self.generator = generator
        self.compileregex()

    def compileregex(self):
        """
        Compile, in place, the regexes in the module-level ``templates`` and
        ``exceptions`` tables.

        Entries that are already compiled are left untouched, so creating a
        second robot does not fail: re.compile() refuses a flags argument
        together with an already compiled pattern.
        """
        for entries in templates.values():
            for index, (old, new) in enumerate(entries):
                if not hasattr(old, 'search'):
                    entries[index] = re.compile(old, re.UNICODE), new
        for index, pattern in enumerate(exceptions):
            if not hasattr(pattern, 'search'):
                exceptions[index] = re.compile(pattern)

    def checkExceptions(self, text):
        """
        Return the text matched by the first entry of ``exceptions`` that is
        found in ``text``, or None when no entry matches.
        """
        for exception in exceptions:
            hit = exception.search(text)
            if hit:
                return hit.group(0)
        return None

    def change(self, page, new_text):
        """
        Save ``new_text`` to ``page``.

        Edit conflicts and spam-blacklist rejections are reported through
        wikipedia.output() and otherwise ignored; other errors propagate.
        """
        try:
            page.put(new_text)
        except wikipedia.EditConflict:
            wikipedia.output(u'Skipping %s because of edit conflict' % (page.title()))
        except wikipedia.SpamfilterError as url:
            wikipedia.output(u'Cannot change %s because of blacklist entry %s' % (page.title(),url))

    def run(self):
        """
        Process every page of the generator.

        Pages are skipped when they cannot be edited, do not exist, are
        redirects, have no interwiki link to the English Wikipedia, or
        contain one of the ``exceptions`` patterns.
        """
        sen = wikipedia.Site('en')
        for page in self.generator:
            try:
                if not page.canBeEdited():
                    wikipedia.output(u'Skipping locked page %s' % page.title())
                    continue
                interwiki_list = page.interwiki()
            except wikipedia.NoPage:
                wikipedia.output(u'Page %s not found' % page.title())
                continue
            except wikipedia.IsRedirectPage:
                wikipedia.output(u'Page %s is a redirect, skip' % page.title())
                continue

            # Pick the first interwiki link that points to the English site.
            page_en = None
            for candidate in interwiki_list:
                if candidate.site() == sen:
                    page_en = candidate
                    break
            if page_en is None:
                continue

            wikipedia.output(page.title())
            wikipedia.output(u'en: %s' %page_en.title())
            text_it = page.get()
            match = self.checkExceptions(text_it)
            # skip all pages that contain certain texts
            if match:
                colors = [None] * 9 + [None] * len(page.title()) + [None] * 21 + [10] * len(match)
                wikipedia.output(u'Skipping %s because it contains %s' % (page.title(), match), colors = colors)
                continue
            try:
                text_en = page_en.get()
            except wikipedia.NoPage:
                wikipedia.output(u'Page %s not found' %page_en.title())
                continue
            except wikipedia.IsRedirectPage:
                wikipedia.output(u'Page %s is a redirect, follow redirect' %page_en.title())
                # NOTE(review): presumably this returns the text of the
                # redirect page itself rather than of its target -- confirm.
                text_en = page_en.get(get_redirect=True)

            # NOTE(review): every matching pattern is offered separately and
            # each accepted change is built from the original text_it, so
            # accepting two of them saves the page twice and the second save
            # does not contain the first template -- confirm this is wanted.
            for old, new in templates['safe']:
                match = old.search(text_en)
                if not match:
                    continue
                colors = [None] * 5 + [13] * len(page.title()) + [None] * 4
                wikipedia.output(u'\n>>> %s <<<' % page.title(), colors = colors)
                found = text_en[match.start():match.end()]
                wikipedia.output(u'Trovato %s: ' % found)
                template_new = old.sub(new, found)
                wikipedia.output(template_new)
                new_text_it = template_new + text_it

                choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No'], ['y', 'N'], 'N')
                if choice in ['y', 'Y']:
                    wikipedia.setAction(msg % page_en.aslink())
                    self.change(page, new_text_it)
   


def main():
    """
    Parse the command line, build the page generator and run the bot.

    Recognised arguments are -autotitle, -autotext, -page[:title],
    -except:text, -template:name (ignored), -namespace:n and -summary:text;
    anything else is handed to pagegenerators.GeneratorFactory. When no page
    source is given, the help text is shown and the script exits.
    """
    gen = None
    # Literal texts given with -except:; pages containing them are skipped.
    extra_exceptions = []
    # Which namespaces should be processed?
    # default to [] which means all namespaces will be processed
    namespaces = []
    PageTitles = []
    autoText = False
    autoTitle = False
    # This factory is responsible for processing command line arguments
    # that are also used by other scripts and that determine on which pages
    # to work on.
    genFactory = pagegenerators.GeneratorFactory()
    # Load default summary message.
    # BUG WARNING: This is probably incompatible with the -lang parameter.
    wikipedia.setAction(msg)
    # Read commandline parameters.
    for arg in wikipedia.handleArgs():
        if arg == '-autotitle':
            autoTitle = True
        elif arg == '-autotext':
            autoText = True
        elif arg.startswith('-page'):
            if len(arg) == 5:
                PageTitles.append(wikipedia.input(u'Which page do you want to chage?'))
            else:
                PageTitles.append(arg[6:])
        elif arg.startswith('-except:'):
            extra_exceptions.append(arg[8:])
        elif arg.startswith('-template:'):
            # Still accepted so existing command lines keep working; the
            # value was never used by this script.
            pass
        elif arg.startswith('-namespace:'):
            namespaces.append(int(arg[11:]))
        elif arg.startswith('-summary:'):
            wikipedia.setAction(arg[9:])
        else:
            generator = genFactory.handleArg(arg)
            if generator:
                gen = generator
    if PageTitles:
        pages = [wikipedia.Page(wikipedia.getSite(), PageTitle) for PageTitle in PageTitles]
        gen = iter(pages)
    if not gen:
        # syntax error, show help text from the top of this file
        wikipedia.showHelp('coordbot')
        wikipedia.stopme()
        sys.exit()
    if namespaces:
        gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
    # Redirect filtering stays disabled, as in the original script:
    # gen = pagegenerators.RedirectFilterPageGenerator(gen)
    preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber = 20)
    # Make -except effective: add the user supplied texts to the module-level
    # skip list before the bot compiles it. There is no -regex option here,
    # so the texts are escaped and matched literally.
    exceptions.extend(re.escape(text) for text in extra_exceptions)
    bot = CoordRobot(preloadingGen, autoTitle, autoText)
    bot.run()
               

# Script entry point: run main() only when the file is executed directly.
if __name__ == "__main__":

   # The finally clause guarantees wikipedia.stopme() is called on the way
   # out, whether main() returns normally, raises, or exits via sys.exit().
   try:
       main()
   finally:
       wikipedia.stopme()