User:AnomieBOT/source/tasks/WatchlistUpdater.pm

This is an old revision of this page, as edited by AnomieBOT (talk | contribs) at 17:04, 23 August 2008 (Updating published sources: General: * fullquery: If multiple continues are returned, process them in parallel. WatchlistUpdater: * More changing around, to allow output other than a table. * Improve error handling.). The present address (URL) is a permanent link to this revision, which may differ significantly from the current revision.
package tasks::WatchlistUpdater;

=pod

=begin metadata

Task:   WatchlistUpdater
BRFA:   N/A
Status: Begun 2008-08-15
Rate:   As needed, at most every 6 hours

Updates algorithmically-defined "watchlists" (like [[User:Anomie/uw-templates]])
when pages are created or deleted. The bot only edits when something actually
changes.

=end metadata

=cut

use strict;

use AnomieBOT::Task;
use vars qw/@ISA/;
@ISA=qw/AnomieBOT::Task/;

use POSIX qw/strftime/;
use Data::Dumper;

# Output callbacks shared by every page that renders its list as a two-column
# wikitable (page link, talk page link). run() invokes each callback with a
# scratch hash ref as the first argument; outformat additionally receives the
# API record for the row, and outerror receives the message text to display.
my %wikitable_output=(
    # Emitted once, before any rows
    outprefix => sub { "{| class=\"wikitable\"\n" },

    # Emitted once per listed page
    outformat => sub {
        my $rec=$_[1];
        my $ns=$rec->{'ns'};
        my $title=$rec->{'title'};

        # Namespaces 14 and 6 (Category and File/Image in MediaWiki's
        # default numbering) need a leading colon to render as a link.
        my $main=($ns==14 || $ns==6) ? ':'.$title : $title;

        my $talk;
        if($ns==0){
            # Main namespace titles carry no prefix of their own
            $talk='Talk:'.$title;
        } else {
            # "Template:Foo" becomes "Template talk:Foo"
            $talk=$title;
            substr($talk, index($talk, ':'), 0)=' talk';
        }
        return "|-\n|[[$main]]||[[$talk]]\n";
    },

    # Emitted when run() has a message (e.g. truncation) to show in the table
    outerror  => sub { "|-\n|colspan=\"2\"|".$_[1]."\n" },

    # Emitted once, after all rows
    outsuffix => sub { "|}" },
);

# One entry per maintained page. Keys, as consumed by run():
#   page        - title of the wiki page to edit
#   beginmarker - text after which the generated list starts ('' = start of page)
#   endmarker   - text before which the generated list ends ('' = end of page)
#   frequency   - minimum number of seconds between updates
#   maxrows     - row limit; exceeding it adds a truncation notice
#   query       - API query parameters handed to fullquery
#   gcontinue   - continuation parameter name handed to fullquery
#   result      - key under 'query' in the API response that holds the records
#   match       - filter specification evaluated by _match against each record
#   summary     - edit summary
#   botflag     - passed as the last argument of $api->edit
#   outprefix / outformat / outerror / outsuffix - output callbacks
my @cfg_pages=(
    {
        page        => 'User:Anomie/uw-templates',
        beginmarker => "\n<!-- SNIP HERE -->\n",
        endmarker   => '',
        frequency   => 6*60*60,
        maxrows     => 10000,
        query       => {
            list        => 'allpages',
            apprefix    => 'Uw-',
            apnamespace => '10',
            aplimit     => 'max'
        },
        gcontinue   => 'apcontinue',
        result      => 'allpages',
        match       => {},
        summary     => 'Automatically updating list of uw-* templates',
        botflag     => 1,
        %wikitable_output
    },
    {
        page        => 'User:AnomieBOT/index',
        beginmarker => "\n<!-- SNIP HERE -->\n",
        endmarker   => '',
        frequency   => 6*60*60,
        maxrows     => 10000,
        query       => {
            list        => 'allpages',
            apprefix    => 'AnomieBOT/',
            apnamespace => '2',
            aplimit     => 'max'
        },
        gcontinue   => 'apcontinue',
        result      => 'allpages',
        match       => {},
        summary     => 'Automatically updating userspace index',
        botflag     => 1,
        %wikitable_output
    }
);

# Constructor. Builds the object through the parent class, then attaches a
# fresh array holding the entries of @cfg_pages under the 'pages' key. The
# array is new but the per-page hash refs inside it are the same ones stored
# in @cfg_pages, so state cached on them by run() lives on those hashes.
sub new {
    my ($class) = @_;

    my $self = $class->SUPER::new;
    $self->{'pages'} = [ @cfg_pages ];

    # Bless explicitly into the requested class before handing it back
    return bless $self, $class;
}

=pod

=for info
Per [[WP:BOT#Approval]], any bot or automated editing process that affects
only the operator's user and talk pages (or subpages thereof), and which is
not otherwise disruptive, may be run without prior approval.

=cut

# Reports whether this task may run. Always answers with the constant 1.
sub approved {
    my $is_approved = 1;
    return $is_approved;
}

# Bring every configured page up to date, editing only when the generated
# text differs from what is already on the page.
#
# Parameters:
#   $self - this task object; $self->{'pages'} holds the per-page
#           configuration, on which this method caches a 'lastrun' time.
#   $api  - the bot API object used for queries, edit tokens and edits.
#
# Returns the number of seconds until the task should be run again:
#   0    when the 10-minute time budget ran out before all pages were handled
#   60   after a failed query or edit-token request
#   300  when the edit token reports the task as shut off
#   otherwise the time until the earliest page is next due, capped at 864000.
#   This value can be zero or negative when a page is overdue, e.g. after a
#   failed write that left its 'lastrun' unchanged.
sub run {
    my ($self, $api)=@_;

    $api->task('WatchlistUpdater');
    $api->read_throttle(6);
    $api->edit_throttle(10);

    # Do not start on another page once 10 minutes have passed
    my $endtime=time()+600;

    foreach my $data (@{$self->{'pages'}}){
        my $page=$data->{'page'};

        # We've run too long, wait on the rest until next time
        return 0 if time()>=$endtime;

        # Check last run time if we haven't already recorded it
        if(!exists($data->{'lastrun'})){
            my $res=$api->query(
                titles  => $page,
                prop    => 'revisions',
                rvuser  => $api->user,
                rvprop  => 'timestamp',
                rvlimit => 1
            );
            if($res->{'code'} ne 'success'){
                $self->warn("Failed to retrieve last edit date for $page: ".$res->{'error'});
                return 60;
            }
            $res=[values(%{$res->{'query'}{'pages'}})];
            if(exists($res->[0]{'revisions'}[0]{'timestamp'})){
                $data->{'lastrun'}=$self->ISO2timestamp($res->[0]{'revisions'}[0]{'timestamp'});
            } else {
                # No revision by $api->user came back; treat as never updated
                $data->{'lastrun'}=0;
            }
        }

        # Time to check again?
        next unless time()>=$data->{'lastrun'}+$data->{'frequency'};

        # Get edit token
        my $tok=$api->edittoken($page);
        if($tok->{'code'} eq 'shutoff'){
            $self->warn("Task disabled: ".$tok->{'content'}."\n");
            return 300;
        }
        if($tok->{'code'} ne 'success'){
            $self->warn("Failed to retrieve edit token for $page: ".$tok->{'error'});
            return 60;
        }
        if(exists($tok->{'missing'})){
            $self->warn("Page $page does not exist");
            $data->{'lastrun'}=time();
            next;
        }
        my $intxt=$tok->{'revisions'}[0]{'*'};

        # Collect the records to list, keyed by title
        my %out=();
        my $rows=0;
        my $truncated=0;
        my %cont=();
        do {
            my $res=$self->fullquery($api, $data->{'gcontinue'}, %{$data->{'query'}}, %cont);
            if($res->{'code'} ne 'success'){
                $self->warn("Failed to retrieve data for $page: ".$res->{'error'});
                return 60;
            }

            # Merge all continuation parameters for the next request
            %cont=();
            if(exists($res->{'query-continue'})){
                foreach my $n (values %{$res->{'query-continue'}}){
                    %cont=(%cont, %$n);
                }
            }

            $res=$res->{'query'}{$data->{'result'}};
            my @r;
            if(ref($res) eq 'ARRAY'){
                @r=@$res;
            } elsif(ref($res) eq 'HASH'){
                @r=values %$res;
            } else {
                $self->warn("Invalid data for $page: Not an array or hash ref");
                return 60;
            }
            foreach (@r){
                # Odd namespace numbers are talk namespaces in MediaWiki; the
                # talk page is linked from its subject page's row instead.
                next if ($_->{'ns'}&1)==1;
                next unless _match($data->{'match'}, $_);

                # Check the limit before storing, so that at most maxrows
                # rows are ever emitted.
                if($rows>=$data->{'maxrows'}){
                    $truncated=1;
                    last;
                }
                $out{$_->{'title'}}=$_;
                $rows++;
            }
        } while(!$truncated && %cont);

        # Generate new table
        my $x={};   # scratch hash handed to every output callback
        my $table=$data->{'outprefix'}($x);
        foreach my $title (sort keys %out){
            $table.=$data->{'outformat'}($x,$out{$title});
        }
        $table.=$data->{'outerror'}($x,"<strong class=\"error\">List truncated at $rows rows</strong>") if $truncated;
        $table.=$data->{'outsuffix'}($x);

        # Perform edit, if needed
        my $outtxt=$intxt;
        my ($begin,$end);
        if($data->{'beginmarker'} eq ''){
            $begin=0;
        } else {
            $begin=index($outtxt, $data->{'beginmarker'});
            $begin+=length($data->{'beginmarker'}) if $begin>=0;
        }
        if($data->{'endmarker'} eq ''){
            $end=length($outtxt);
        } else {
            $end=index($outtxt, $data->{'endmarker'}, $begin);
        }
        if($begin<0 || $end<0){
            $self->warn("Begin/end markers not found, refusing to edit $page\n");
        } else {
            # Replace everything between the markers with the new table
            substr($outtxt,$begin,$end-$begin)=$table;
            if($intxt eq $outtxt){
                $self->warn("No update needed for $page\n");
            } else {
                my $res=$api->edit($tok, $outtxt, $data->{'summary'}, 0, $data->{'botflag'});
                if($res->{'code'} ne 'success'){
                    $self->warn("Write for $page failed: ".$res->{'error'});
                    # 'lastrun' is deliberately left unchanged here
                    next;
                }
                $self->warn("Updated $page\n");
            }
        }

        # Record last update time
        $data->{'lastrun'}=time();
    }

    # We processed all pages, calculate the number of seconds until the next
    # time we're needed.
    my $t=864000; # arbitrary initial/max value
    foreach (@{$self->{'pages'}}){
        # NOTE(review): a page whose first write failed still has lastrun==0
        # and is ignored here; confirm that this is the intended retry policy.
        next if $_->{'lastrun'}==0;
        my $tt=$_->{'lastrun'}+$_->{'frequency'}-time();
        $t=$tt if $tt<$t;
    }
    return $t;
}

# Recursively test $value against the match specification $match.
#
# The specification is interpreted by its type, checked in this order:
#   code ref   - called with $value; its result is returned
#   array ref  - matches if any element, used as a specification, matches
#   (an array ref $value matches if any of its elements matches)
#   undef      - matches only an undefined $value
#   plain text - string equality with $value
#   qr// regex - pattern match against $value
#   hash ref   - $value must be a hash ref and every key of the specification
#                must match the same key of $value (a missing key is tested
#                as undef); an empty hash matches any hash ref
# Anything else never matches. Returns a true or false value.
sub _match {
    my ($match, $value) = @_;
    my $kind = ref($match);

    # A callback decides on its own
    return $match->($value) if $kind eq 'CODE';

    # List of alternatives: stop at the first one that matches
    if ($kind eq 'ARRAY') {
        my $verdict = 0;
        for (@$match) {
            $verdict = _match($_, $value);
            last if $verdict;
        }
        return $verdict;
    }

    # List of values: stop at the first element that matches
    if (ref($value) eq 'ARRAY') {
        my $verdict = 0;
        for (@$value) {
            $verdict = _match($match, $_);
            last if $verdict;
        }
        return $verdict;
    }

    return !defined($value) unless defined($match);
    return 0 unless defined($value);
    return ($match eq $value) unless $kind;
    return $value=~/$match/ if $kind eq 'Regexp';

    if ($kind eq 'HASH') {
        return 0 unless ref($value) eq 'HASH';

        # Every listed key has to match; stop at the first failure. keys()
        # is used so that leaving early cannot strand a hash iterator.
        my $verdict = 1;
        for my $field (keys %$match) {
            my $candidate = exists($value->{$field}) ? $value->{$field} : undef;
            $verdict = _match($match->{$field}, $candidate);
            last unless $verdict;
        }
        return $verdict;
    }

    return 0;
}

1;