User:AnomieBOT/source/tasks/LanguageCategoryCreator.pm

This is an old revision of this page, as edited by AnomieBOT (talk | contribs) at 00:30, 11 April 2025 (Updating published sources: LanguageCategoryCreator: * New task!). The present address (URL) is a permanent link to this revision, which may differ significantly from the current revision.
(diff) ← Previous revision | Latest revision (diff) | Newer revision → (diff)
package tasks::LanguageCategoryCreator;

=pod

=begin metadata

Bot:      AnomieBOT
Task:     LanguageCategoryCreator
BRFA:     Wikipedia:Bots/Requests for approval/AnomieBOT 84
Status:   BRFA
Created:  2025-04-10

Create needed categories under [[:Category:Wikipedia maintenance categories sorted by month]] and [[:Category:Wikipedia categories sorted by month]].

=end metadata

=cut

use utf8;
use strict;

use POSIX;
use Data::Dumper;
use AnomieBOT::API;
use AnomieBOT::Task qw/:time bunchlist/;
use vars qw/@ISA/;
@ISA=qw/AnomieBOT::Task/;

my @textcats = (
    'Articles containing (?<n>.+)-language text',
    'Articles with text in (?<n>.+) languages',
);
my @sourcecats = (
    'Articles with (?<n>.+)-language sources \((?<c>.+)\)',
);

sub new {
    my $class=shift;
    my $self=$class->SUPER::new();
    $self->{'iter'}=undef;
    bless $self, $class;
    return $self;
}

=pod

=for info
Approval requested.<br />[[Wikipedia:Bots/Requests for approval/AnomieBOT 84]]

=cut

sub approved {
    return 0;
}

sub run {
    my ($self, $api)=@_;

    $api->task('LanguageCategoryCreator',0,0,qw/d::Talk d::Redirects d::Templates/);

    my $cont = $self->{'dbcontinue'} // '';

    my ($dbh);
    eval {
        ($dbh) = $api->connectToReplica( 'enwiki' );
    };
    if ( $@ ) {
        $api->warn( "Error connecting to replica: $@\n" );
        return 300;
    }
    $dbh->do( q{SET NAMES 'utf8'} );

    my $catsql = join( ' OR ', map { my $c = $_; $c =~ s/\\//g; $c =~ s/ /_/g; $c =~ s/\(\?<[nc]>\.\+\)/%/g; "cat_title LIKE " . $dbh->quote( $c ) } ( @textcats, @sourcecats ) );

    # Spend a max of 5 minutes on this task before restarting
    my $endtime=time()+300;

    my $didstart = ( $cont eq '' );
    while ( 1 ) {
        return 0 if $api->halting;

        # Load the list of categories needing creation
        my @rows;
        my $t0 = Time::HiRes::time();
        eval {
            @rows = @{ $dbh->selectall_arrayref( qq{
                SET STATEMENT max_statement_time=300 FOR
                SELECT cat_title
                FROM category
                  LEFT JOIN page ON (page_namespace=14 AND page_title=cat_title)
                WHERE
                  ($catsql)
                  AND page_id IS NULL
                  $cont
                ORDER BY cat_title
                LIMIT 50
            }, { Slice => {} } ) };
        };
        if ( $@ ) {
            $api->warn( "Error fetching page list from replica: $@\n" );
            return 300;
        }
        my $t1 = Time::HiRes::time();
        $api->log( 'DB query took ' . ($t1-$t0) . ' seconds' );
        unless ( @rows ) {
            last if $didstart;
            $didstart = 1;
            $cont = '';
            $self->{'dbcontinue'} = $cont;
            next;
        }

        for my $row (@rows) {
            utf8::decode( $row->{'cat_title'} ); # Data from database is binary
            my $title = $row->{'cat_title'};
            $title =~ s/_/ /g;
            my $ret = $self->make_cat_if_needed( $api, $title );
            if ( $ret ) {
                $title = $dbh->quote( $title );
                $cont = " AND (cat_title > $title)";
                $self->{'dbcontinue'} = $cont;
                return $ret;
            }
        }

        # On the next time around, skip any we've already processed this run
        my $title = $rows[$#rows]->{'cat_title'};
        $title = $dbh->quote( $title );
        $cont = " AND (cat_title > $title)";
        $self->{'dbcontinue'} = $cont;

        # If we've been at it long enough, let another task have a go.
        return 0 if time()>=$endtime;
    }

    $self->{'dbcontinue'} = '';

    # No more pages to check for now
    return 7200;
}

# Create the category, if it seems to be sane
sub make_cat_if_needed {
    my ($self,$api,$title)=@_;

    # Figure out the language name or code.
    my ($n,$c,$src) = ('', '', '');
    if ( $src eq '' ) {
        for my $cat (@textcats) {
            if ( $title =~ /^$cat$/ ) {
                $n = $+{'n'} // '';
                $c = $+{'c'} // '';
                $src = '{{Non-English-language text category}}';
                last;
            }
        }
    }
    if ( $src eq '' ) {
        for my $cat (@sourcecats) {
            if ( $title =~ /^$cat$/ ) {
                $n = $+{'n'} // '';
                $c = $+{'c'} // '';
                $src = '{{Non-English-language sources category}}';
                last;
            }
        }
    }
    if ( $src eq '' ) {
        $api->log( "Skipping $title, not a valid category" );
        return 0;
    }

    # Ask Module:Lang if the name and/or tag are known.
    my $res = $api->query(
        action => 'parse',
        title => 'Foobar',
        text => "<p>Tag: {{#invoke:Lang|tag_from_name|$n}}</p>\n<p>Name: {{#invoke:Lang|name_from_tag|$c}}</p>\n",
        prop => 'text',
        disablelimitreport => 1,
        formatversion => 2,
    );
    if($res->{'code'} ne 'success'){
        $api->warn( "For $title: Failed to check: " . $res->{'content'} . "\n" );
        return 60;
    }

    my $txt = $res->{'parse'}{'text'};

    if ( $n ne '' ) {
        if ( $txt =~ /Error: language: .* not found/ ) {
            $api->warn( "Sanity check failed for $title: Language $n not found\n" );
            return 0;
        }
        unless ( $txt =~ /<p>Tag: ([a-z-]+)<\/p>/ ) {
            $api->warn( "For $title: Failed to check name $n: $txt\n" );
            return 60;
        }
        if ( $c ne '' && $c ne $1 ) {
            $api->log( "Sanity check failed for $title: For $n, $c !== $1\n" );
            return 0;
        }
    }
    if ( $c ne '' ) {
        if ( $txt =~ /Error: unrecognized language tag/ ) {
            $api->log( "Sanity check failed for $title: Language tag $c not found\n" );
            return 0;
        }
        unless ( $txt =~ /<p>Name: ([^<]+)<\/p>/ ) {
            $api->warn( "For $title: Failed to check tag $c: $txt\n" );
            return 60;
        }
        if ( $n ne '' && $n ne $1 ) {
            $api->log( "Sanity check failed for $title: For $c, $n !== $1\n" );
            return 0;
        }
    }

    # Check if the category has members.
    $res = $api->query(
        list => 'categorymembers',
        cmtitle => "Category:$title",
        cmlimit => 1,
        formatversion => 2,
    );
    if ( $res->{'code'} ne 'success' ) {
        $api->warn( "Failed to get category members for $title: " . $res->{'error'} . "\n" );
        return 60;
    }
    unless ( @{$res->{'query'}{'categorymembers'}} ) {
        $api->log( "Skipping $title, has no members" );
        return 0;
    }

    # Good enough.
    my $tok=$api->edittoken( "Category:$title", EditRedir => 1 );
    if ( $tok->{'code'} eq 'shutoff' ) {
        $api->warn( "Task disabled: " . $tok->{'content'} . "\n" );
        return 300;
    }
    if ( $tok->{'code'} eq 'pageprotected' || $tok->{'code'} eq 'botexcluded' ) {
        # Skip protected and excluded pages
        $api->log( "Skipping $title, $tok->{code}" );
        return 0;
    }
    if ( $tok->{'code'} ne 'success' ) {
        $api->warn( "Failed to get edit token for $title: " . $tok->{'error'} . "\n" );
        return 60;
    }
    unless ( exists($tok->{'missing'}) ) {
        $api->log( "Skipping $title, already exists" );
        return 0;
    }

    $api->log( "Creating language category in Category:$title" );
    my $r = $api->edit( $tok, $src, "Creating non-empty language category", 1, 1 );
    if ( $r->{'code'} ne 'success' ) {
        $api->warn( "Write failed on Category:$title: " . $r->{'error'} . "\n" );
        return 60;
    }

    return 0;
}

1;