User:AnomieBOT/source/tasks/EnDashRedirectCreator.pm

package tasks::EnDashRedirectCreator;

=pod

=begin metadata

Bot:      AnomieBOT
Task:     EnDashRedirectCreator
BRFA:     Wikipedia:Bots/Requests for approval/AnomieBOT 74
Status:   Approved 2016-03-08
+BRFA:    Wikipedia:Bots/Requests for approval/AnomieBOT 80
+Status:  Approved 2020-06-19
+BRFA:    Wikipedia:Bots/Requests for approval/AnomieBOT 86
+Status:  BRFA
Created:  2016-03-03

Create redirects for articles with titles containing en-dashes from the
corresponding title with ASCII hyphens. Update these redirects later as
targets change.

=end metadata

=cut

use utf8;
use strict;

use AnomieBOT::Task;
use Data::Dumper;
use Time::HiRes;
use vars qw/@ISA/;
@ISA=qw/AnomieBOT::Task/;

my @skipNs = (
    2, 3, # User, probably not useful in most cases
    14, 15, # Category, doesn't use normal redirects
    118, 119, # Draft, probably not useful
    446, 447, # Education Program, probably not useful
    828, 829, # Module, doesn't use normal redirects
    2300, 2301, # Gadget, probably doesn't use normal redirects
    2302, 2303, # Gadget definition, probably doesn't use normal redirects
    2600, 2601, # Topic, probably doesn't use normal redirects
);

# Titles that the bot can't and shouldn't create redirects for, to avoid logspam.
my %skipTitles = (
);

my %crossNsOk = (
    0 => 1, # Not actually cross
    4 => 1, # Wikipedia, not eligible for CSD:R2
    10 => 1, # Template, not eligible for CSD:R2
    12 => 1, # Help, not eligible for CSD:R2
    14 => 1, # Category, not eligible for CSD:R2
    100 => 1, # Portal, not eligible for CSD:R2
);

my %dashes = (
    '–' => 'en-dashes',
);

sub new {
    my $class = shift;
    my $self = $class->SUPER::new();
    bless $self, $class;
    return $self;
}

=pod

=for info
Approved 2016-03-08<br />[[Wikipedia:Bots/Requests for approval/AnomieBOT 74]]

=for info
Supplemental BFRA approved 2020-06-19<br />[[Wikipedia:Bots/Requests for approval/AnomieBOT 80]]

=for info
Supplemental BFRA approval requested 2025-08-23<br />[[Wikipedia:Bots/Requests for approval/AnomieBOT 86]]

=cut

sub approved {
    return 3;
}

sub run {
    my ($self, $api) = @_;

    $api->task('EnDashRedirectCreator', 0, 10, qw/d::Redirects d::IWNS d::Talk d::Timestamp d::Templates/);
    my $screwup=' Errors? [[User:'.$api->user.'/shutoff/EnDashRedirectCreator]]';

    my $BRFA86 = 0;

    my %ns = $api->namespace_map();
    my %rns = $api->namespace_reverse_map();
    my $nsre = $api->namespace_re(qw/! 0/);

    my ($dbh);
    eval {
        ($dbh) = $api->connectToReplica( 'enwiki' );
    };
    if ( $@ ) {
        $api->warn( "Error connecting to replica: $@\n" );
        return 300;
    }

    # Templates to preserve.
    my %preserve = $api->redirects_to_resolved(
        'Template:NASTRO comment',
    );
    if(exists($preserve{''})){
        if($preserve{''}{'code'} eq 'shutoff'){
            $api->warn("Task disabled: " . $preserve{''}{'content'} . "\n");
            return 300;
        }
        $api->warn("Failed to get preserve template redirects: " . $preserve{''}{'error'} . "\n");
        return 60;
    }

    # Templates to preserve.
    my %preserveBefore = $api->redirects_to_resolved(
        'Template:Old CfD',
        'Template:Old MfD',
        'Template:Old prod',
        'Template:Old RfD',
        'Template:Old TfD',
        'Template:Old XfD multi',
        'Template:WikiProject banner shell',
    );
    if(exists($preserveBefore{''})){
        if($preserveBefore{''}{'code'} eq 'shutoff'){
            $api->warn("Task disabled: " . $preserveBefore{''}{'content'} . "\n");
            return 300;
        }
        $api->warn("Failed to get preserve template redirects: " . $preserveBefore{''}{'error'} . "\n");
        return 60;
    }

    # Rcat templates to preserve. This should probably be limited to particularly unprintworthy things, e.g. misspelling but not alternative spelling.
    my %rpreserve = $api->redirects_to_resolved(
        'Template:R from incorrect disambiguation',
        'Template:R from incorrect hyphenation',
        'Template:R from incorrect name',
        'Template:R from miscapitalization',
        'Template:R from misquotation',
        'Template:R from misspelling',
        'Template:R from remote talk page',
        'Template:R unprintworthy',
    );
    if(exists($rpreserve{''})){
        if($rpreserve{''}{'code'} eq 'shutoff'){
            $api->warn("Task disabled: " . $rpreserve{''}{'content'} . "\n");
            return 300;
        }
        $api->warn("Failed to get r-preserve template redirects: " . $rpreserve{''}{'error'} . "\n");
        return 60;
    }

    my $dofixupuntil = $self->{'dofixupuntil'} // 0;
    $dofixupuntil = 0 if $dofixupuntil < time();

    my $cont = $self->{'dbcontinue'} // '';
    my $skipNs = join( ',', @skipNs );
    my $dashstr = join( '', keys %dashes );
    my $dashcond = join( ' OR ', map { "p1.page_title LIKE '%$_%'" } keys %dashes );

    # Spend a max of 5 minutes on this task before restarting
    my $endtime=time()+300;

    $dbh->do( q{SET NAMES 'utf8'} );

    my $actorIds;
    eval {
        $actorIds = join( ',', @{ $dbh->selectcol_arrayref( "SELECT actor_id FROM actor_user WHERE actor_name = 'AnomieBOT'" ) } );
    };
    if ( $@ ) {
        $api->warn( "Error fetching actor ID from replica: $@\n" );
        return 300;
    }

    my $targetid;
    eval {
        ( $targetid ) = $dbh->selectrow_array( "SELECT lt_id FROM linktarget WHERE lt_namespace=2 AND lt_title = 'AnomieBOT/Auto-G8'" );
    };
    if ( $@ ) {
        $api->warn( "Error fetching linktarget ID from replica: $@\n" );
        return 300;
    }

    my $autoG8Mismatch = '0=1';
    my $needAvoidedRedir = '0=1';
    my $needAutoG8 = '0=1';
    if ( $BRFA86 && $dofixupuntil ) {
        $autoG8Mismatch = "EXISTS( SELECT 1 FROM externallinks WHERE el_from = p2.page_id AND el_to_domain_index = 'urn:.' and el_to_path = 'x-anomiebot-auto-g8-mismatch:endash' )";
        $needAutoG8 = "templatelinks.tl_from IS NULL";
        $needAvoidedRedir = "r1.rd_from IS NOT NULL AND NOT EXISTS( SELECT 1 FROM categorylinks WHERE cl_from=p2.page_id AND cl_to='Avoided_double_redirects' )";
    }

    my $botOwned = "revision.rev_page IS NOT NULL";
    if ( $BRFA86 ) {
        $botOwned = "( $botOwned OR templatelinks.tl_from IS NOT NULL )";
    }

    while ( 1 ) {
        return 0 if $api->halting;

        # Load the list of redirects needing creation
        my @rows;
        my $t0 = Time::HiRes::time();
        eval {
            @rows = @{ $dbh->selectall_arrayref( qq{
                SET STATEMENT max_statement_time=300 FOR
                SELECT p1.page_namespace AS ns, p1.page_title AS title
                FROM page as p1
                    LEFT JOIN page AS p2 ON ( p1.page_namespace = p2.page_namespace AND REGEXP_REPLACE( CONVERT(p1.page_title USING utf8), '[$dashstr]', '-' ) = p2.page_title )
                    LEFT JOIN redirect AS r1 ON(r1.rd_from=p1.page_id)
                    LEFT JOIN redirect AS r2 ON(r2.rd_from=p2.page_id)
                    LEFT JOIN revision ON (rev_page = p2.page_id AND rev_actor IN ($actorIds) AND rev_parent_id = 0)
                    LEFT JOIN templatelinks ON (tl_from=p2.page_id AND tl_target_id=$targetid)
                WHERE
                    ($dashcond) AND p1.page_namespace NOT IN ($skipNs)
                    AND (
                        p2.page_id IS NULL
                        OR $botOwned AND (
                            r2.rd_namespace != COALESCE( r1.rd_namespace, p1.page_namespace )
                            OR r2.rd_title != COALESCE( r1.rd_title, p1.page_title )
                            OR r2.rd_fragment != r1.rd_fragment
                            OR $autoG8Mismatch
                            OR $needAutoG8
                            OR $needAvoidedRedir
                        )
                    )
                    $cont
                ORDER BY p1.page_namespace, p1.page_title
                LIMIT 50
            }, { Slice => {} } ) };
        };
        if ( $@ ) {
            $api->warn( "Error fetching page list from replica: $@\n" );
            return 300;
        }
        my $t1 = Time::HiRes::time();
        $api->log( 'DB query took ' . ($t1-$t0) . ' seconds' );
        last unless @rows;

        my %redirects = ();
        for my $row (@rows) {
            utf8::decode( $row->{'title'} ); # Data from database is binary

            next if exists( $skipTitles{$row->{'ns'} & ~1}{$row->{'title'}} );
            next if $row->{'ns'} == 10 && $row->{'title'} =~ m!^Editnotices/!; # None of these will be editable by the bot
            next if ( $row->{'ns'} & ~1 ) == 10 && $row->{'title'} =~ m!\.css$!; # Skip pages that will be TemplateStyles css, and their talk pages

            my $to = ( $row->{'ns'} ? $rns{$row->{'ns'}} . ':' : '' ) . $row->{'title'};
            $to =~ s/_/ /g;
            my $from = $to;
            $from =~ s/[$dashstr]/-/g;
            $redirects{$to} = [ $from, $to, $to, undef ];
        }

        if ( %redirects ) {
            # Bypass double redirects and remove missing target pages
            my $res = $api->query(
                titles => join('|', keys %redirects),
                redirects => 1
            );
            if($res->{'code'} ne 'success'){
                $api->warn("Failed to retrieve redirect list: ".$res->{'error'}."\n");
                return 60;
            }
            my %map = ();
            if ( exists($res->{'query'}{'normalized'} ) ) {
                $map{$_->{'from'}} = [ $_->{'to'}, $_->{'tofragment'} // undef ] foreach @{$res->{'query'}{'normalized'}};
            }
            if ( exists($res->{'query'}{'redirects'} ) ) {
                $map{$_->{'from'}} = [ $_->{'to'}, $_->{'tofragment'} // undef ] foreach @{$res->{'query'}{'redirects'}};
            }
            my %exists = ();
            if ( exists($res->{'query'}{'pages'} ) ) {
                for my $p (values %{$res->{'query'}{'pages'}}) {
                    $exists{$p->{'title'}} = 1 if $p->{'pageid'}//0;
                }
            }
            while( my ($key, $targets) = each( %redirects ) ) {
                my ($redir, $origtarget, $target, $fragment) = @$targets;
                my %seen=( $target => 1 );
                while ( exists( $map{$target} ) ) {
                    $fragment = $map{$target}[1] // $fragment;
                    $target = $map{$target}[0];
                    $redirects{$key} = [ $redir, $origtarget, $target, $fragment ];
                    if ( exists( $seen{$target} ) ) {
                        $api->warn("Redirect loop involving [[$target]]");
                        delete $redirects{$key};
                        last;
                    }
                    $seen{$target}=1;
                }
                delete $redirects{$key} unless exists( $exists{$target} );
            }

            # Now, create the redirects
            while( my ($key, $targets) = each( %redirects ) ) {
                return 0 if $api->halting;

                my ($redir, $origtarget, $target, $fragment) = @$targets;

                my $redirNs = ( $redir =~ /^([^:]+):/ && exists( $ns{$1} ) ? $ns{$1} : 0 );
                my $targetNs = ( $target =~ /^([^:]+):/ && exists( $ns{$1} ) ? $ns{$1} : 0 );
                my $extrabefore = '';
                my $rextra = '';
                my $extra = '';

                my $tok=$api->edittoken($redir, EditRedir => 1, imageinfo => { prop => '', limit => 1 });
                if($tok->{'code'} eq 'shutoff'){
                    $api->warn("Task disabled: ".$tok->{'content'}."\n");
                    return 300;
                }
                if($tok->{'code'} ne 'success'){
                    $api->warn("Failed to get edit token for $redir: ".$tok->{'error'}."\n");
                    next;
                }
                if ( !exists( $tok->{'missing'} ) ) {
                    my $res = $api->query(
                        titles => $redir,
                        prop => 'revisions',
                        rvprop => 'user',
                        rvdir => 'newer',
                        rvlimit => 1,
                        formatversion => 2,
                    );
                    my $user = $res->{'query'}{'pages'}[0]{'revisions'}[0]{'user'} // '';
                    if ( $user ne 'AnomieBOT' ) {
                        $api->log("$redir already exists and wasn't originally created by the bot, skipping");
                        next;
                    }

                    if ( !exists( $tok->{'redirect'} ) ) {
                        $api->log("$redir was created by the bot, but it is not a redirect anymore, skipping");
                        next;
                    }

                    my $txt = $tok->{'revisions'}[0]{'slots'}{'main'}{'*'};
                    if ( $txt =~ m!\{\{User:AnomieBOT/Auto-G8\|(?:[^{}|]*\|)?target=(.*?)\}\}! && $1 ne $origtarget ) {
                        my $oldtarget = $1;
                        my $tmp = $oldtarget;
                        $tmp =~ s/[$dashstr]/-/gu;
                        if ( $tmp eq $redir ) {
                            if ( $redirNs & 1 ) {
                                my $subjNsPrefix = $redirNs > 1 ? "$rns{$redirNs & ~1}:" : '';
                                my ( $oldtargetSubj, $origtargetSubj );
                                ( $origtargetSubj = $origtarget ) =~ s/^$nsre:/$subjNsPrefix/;
                                ( $oldtargetSubj = $oldtarget ) =~ s/^$nsre:/$subjNsPrefix/;

                                my %tgts = $api->resolve_redirects( $origtargetSubj, $oldtargetSubj, $origtarget, $oldtarget );

                                my ($oldsubjtgt, $oldtalktgt, $origsubjtgt, $origtalktgt);
                                $oldsubjtgt = $tgts{$oldtargetSubj};
                                ( $oldtalktgt = $tgts{$oldtarget} ) =~ s/^($nsre):/ $ns{$1} > 1 ? "$rns{$ns{$1} & ~1}:" : '' /e;
                                $origsubjtgt = $tgts{$origtargetSubj};
                                ( $origtalktgt = $tgts{$origtarget} ) =~ s/^($nsre):/ $ns{$1} > 1 ? "$rns{$ns{$1} & ~1}:" : '' /e;

                                if ( $oldsubjtgt eq $origsubjtgt && $oldsubjtgt eq $oldtalktgt && $origsubjtgt ne $origtalktgt ) {
                                    $api->log("Skipping [[$origtarget]], [[$redir]] already exists for [[$oldtarget]] and [[$oldtargetSubj]] matches that while [[$origtarget]] does not match [[$origtargetSubj]]");
                                    next;
                                } elsif ( $oldsubjtgt eq $origsubjtgt && $oldsubjtgt ne $oldtalktgt && $origsubjtgt eq $origtalktgt ) {
                                    $api->warn("Updating [[$redir]] to [[$target]]: [[$redir]] already exists for [[$oldtarget]], but that does not match [[$oldtargetSubj]] while [[$origtarget]] does match [[$origtargetSubj]]");
                                } else {
                                    $api->warn("[[$redir]] apparently exists for both [[$oldtarget]] and [[$origtarget]], not updating");
                                    next;
                                }
                            } else {
                                $api->warn("[[$redir]] apparently exists for both [[$oldtarget]] and [[$origtarget]], not updating");
                                next;
                            }
                        } else {
                            $api->warn("[[$redir]] claims to exist for [[$oldtarget]], but that's not valid so overwriting");
                        }
                    }

                    # Check for certain extra templates that we should probably preserve when updating.
                    $api->process_templates( $txt, sub {
                        my $name = shift;
                        my $params = shift;
                        my $wikitext = shift;

                        if ( exists( $preserveBefore{"Template:$name"} ) || exists( $preserveBefore{$name} ) ) {
                            $extrabefore .= "$wikitext\n";
                        } elsif ( exists( $rpreserve{"Template:$name"} ) || exists( $rpreserve{$name} ) ) {
                            $rextra .= "\n$wikitext";
                        } elsif ( exists( $preserve{"Template:$name"} ) || exists( $preserve{$name} ) ) {
                            $extra .= "\n$wikitext";
                        }

                        return undef;
                    } );

                    # Also keep any categories from the existing redirect page.
                    if ( $txt =~ /\[\[\s*(?i:Category)\s*:/ ) {
                        my $txt2 = $txt;
                        $txt2 =~ s/^#REDIRECT\s*\[\[.*?\]\]//i;
                        my $nowiki = {};
                        $txt2 = $api->strip_nowiki( $txt2, $nowiki );
                        my @cats = $txt2 =~ /\[\[\s*(?i:Category)\s*:.*?\]\]/g;
                        $extra .= "\n\n" . $api->replace_nowiki( join( "\n", @cats ), $nowiki ) if @cats;
                    }
                }
                if ( exists( $tok->{'imagerepository'} ) && $tok->{'imagerepository'} ne '' ) {
                    $api->log("$redir is an existing image (repo=$tok->{imagerepository}), skipping");
                    next;
                }

                if ( $redirNs == 0 && !( $crossNsOk{$targetNs} // 0 ) ) {
                    $api->log("$redir to $target would be a cross-namespace redirect, skipping");
                    next;
                }

                if ( $redirNs == 1 && !( $crossNsOk{$targetNs & ~1} // 0 ) ) {
                    $api->log("$redir to $target is the talk page of what would be a cross-namespace redirect, skipping");
                    next;
                }

                if ( $targetNs == 7 ) {
                    # Special rule for File talk: If the corresponding file doesn't exist, forget it.
                    my $n = $target;
                    $n =~ s/^[^:]*/File/;
                    my $res = $api->query( titles => $n );
                    if($res->{'code'} eq 'shutoff'){
                        $api->warn("Task disabled: ".$res->{'content'}."\n");
                        return 300;
                    }
                    if($res->{'code'} ne 'success'){
                        $api->warn("Failed to get status for $n: ".$res->{'error'}."\n");
                        next;
                    }
                    if ( exists( (values %{$res->{'query'}{'pages'}} )[0]{'missing'} ) ) {
                        #$api->log("File talk page redirect [[$redir]] -> [[$target]] has no corresponding target file page, skipping");
                        next;
                    }
                }

                $target .= '#' . $fragment if defined( $fragment );
                $rextra = "\n{{R to section}}$rextra" if defined( $fragment );
                $rextra = "\n{{R from ASCII-only}}$rextra" if $redir =~ /^[ -~]+$/;
                $rextra = "\n{{R avoided double redirect|1=$origtarget}}$rextra" if $origtarget ne $target;
                my $txt = "#REDIRECT [[:$target]]\n\n${extrabefore}{{Redirect category shell|\n{{R from alternative hyphenation|1={{-r|1=$origtarget}}}}$rextra\n}}\n{{User:AnomieBOT/Auto-G8|endash|target=$origtarget}}$extra";
                my $summary;
                if ( exists( $tok->{'missing'} ) ) {
                    my @what = ();
                    while ( my ($char, $what) = each %dashes ) {
                        push @what, $what if $origtarget =~ /$char/;
                    }
                    $what[$#what] = 'and ' . $what[$#what] if @what > 1;
                    my $what = join( @what > 2 ? ", " : " ", @what );
                    $summary = "Redirecting to [[:$origtarget]] because titles with $what are hard to type";
                } else {
                    $summary = "Updating redirect to [[:$origtarget]]";
                }
                $summary.=" (and resolving the double redirect to [[:$target]])" if $origtarget ne $target;

                # Create/update page
                $api->log("$summary in $redir");
                my $r = $api->edit($tok, $txt, "$summary. $screwup", 0, 1);
                if($r->{'code'} ne 'success'){
                    $api->warn("Write failed on $redir: ".$r->{'error'}."\n");
                    next;
                }

                # Check for edit warring.
                if ( ! exists( $tok->{'missing'} ) ) {
                    my $res = $api->query(
                        titles => $redir,
                        prop => 'revisions',
                        rvprop => 'user|sha1',
                        rvlimit => 'max',
                        rvend => $api->timestamp2ISO( time() - 30 * 86400 ),
                        formatversion => 2,
                    );
                    my %shas = ();
                    my $bot1058 = 0;
                    for my $rev (@{$res->{'query'}{'pages'}[0]{'revisions'}}) {
                        $shas{$rev->{'sha1'}} = ( $shas{$rev->{'sha1'}} // -1 ) + 1;
                        $bot1058++ if $rev->{'user'} eq 'Bot1058';
                    }
                    my $ct = 0;
                    for my $sha (keys %shas) {
                        $ct += $shas{$sha};
                    }
                    if ( $ct > 2 ) {
                        my $extranote = $bot1058 > 2 ? "If it's AnomieBOT and Bot1058 fighting, that probably means {{-r|1=$origtarget}} needs to be updated to match {{-r|1={{subst:SUBJECTPAGENAME:$origtarget}}}} or vice versa." : "";
                        $api->whine( "Possible edit warring on [[:$redir]]", "In the past 30 days, there appear to have been $ct reverts on {{-r|1=$redir}}. This suggests that vandalism or edit warring of some sort may be occurring (there or on {{-r|1=$origtarget}}). A human should look into it. $extranote" );
                    }
                }

                # If we've been at it long enough, let another task have a go.
                return 0 if time()>=$endtime;
            }
        }

        # On the next time around, skip any we've already processed this run
        my ($ns, $title) = @{$rows[$#rows]}{'ns','title'};
        $title = $dbh->quote( $title );
        $cont = " AND (p1.page_namespace > $ns OR p1.page_namespace = $ns AND p1.page_title > $title)";
        $self->{'dbcontinue'} = $cont;

        # If we've been at it long enough, let another task have a go.
        return 0 if time()>=$endtime;
    }

    $self->{'dbcontinue'} = '';

    # The intention is that first we run through all new creations needed, and only after those are done do we run through things needing the Auto-G8 template re-added.
    if ( ! $dofixupuntil ) {
        $self->{'dofixupuntil'} = time() + 21600;
        return 0;
    }

    $self->{'dofixupuntil'} = 0;
    return $dofixupuntil - time();
}

1;