# Approval requested: [[Wikipedia:Bots/Requests for approval/AnomieBOT 30]]
package tasks::OnThisDayTagger;
=pod
=begin metadata
Task: OnThisDayTagger
BRFA: Wikipedia:Bots/Requests for approval/AnomieBOT 30
Status: BRFA
Rate: Max 6 edits/minute
Created: 2009-05-14
Process each anniversary article (e.g. [[Wikipedia:Selected
anniversaries/October 31]] for 2008-10-31, 2007-10-31, 2006-10-31, and so on)
to extract the bolded links and tag the corresponding article talk pages with
{{tl|OnThisDay}}.
=end metadata
=cut
use utf8;
use strict;
use AnomieBOT::Task;
use Data::Dumper;
use POSIX;
use Digest::SHA qw/sha256_base64/;
use vars qw/@ISA/;
@ISA=qw/AnomieBOT::Task/;
my $skip_links_re=join('|',
'(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}',
'\d{4}',
'List of historical anniversaries',
);
$skip_links_re=qr/^(?:$skip_links_re)$/;
my @skip_templates=(
'Template:Archive box',
'Template:Archives',
);
my $whine_to='User talk:AnomieBOT';
my $mode='list'; # 'list' or 'live'
my $list='/tmp/otd-pages.txt';
sub new {
my $class=shift;
my $self=$class->SUPER::new();
$self->{'nextday'}=0;
$self->{'redir'}={};
bless $self, $class;
return $self;
}
=pod
=for info
Approval requested<br />[[Wikipedia:Bots/Requests for approval/AnomieBOT 30]]
=cut
sub approved {
return 0;
}
sub run {
my ($self, $api)=@_;
my $res;
$api->task('OnThisDayTagger', 0, 10, qw/d::Talk d::Timestamp d::Templates d::Redirects/);
my $screwup=' Errors? [[User:'.$api->user.'/shutoff/OnThisDayTagger]]';
my $starttime=time;
my $today=day_from_timestamp($starttime);
my %redir=%{$self->{'redir'}};
if($mode eq 'list'){
die "Could not open $list: $!\n" unless open(X, '<:utf8', $list);
my %revs=();
for(my $m=1; $m<=12; $m++){
for(my $d=1; $d<=31; $d++){
my $md=sprintf("%02d-%02d",$m,$d);
next unless POSIX::strftime("%m-%d",0,0,0,$d,$m-1,100) eq $md;
my $res=$api->query([],
titles => POSIX::strftime("Wikipedia:Selected anniversaries/%B %-d",0,0,0,$d,$m-1,100),
prop => 'revisions',
rvprop => 'ids|timestamp',
rvlimit => 'max',
);
if($res->{'code'} eq 'shutoff'){
$api->warn("Task disabled: ".$res->{'content'}."\n");
return 300;
}
if($res->{'code'} ne 'success'){
$api->warn("Failed to fetch revisions for $md: $res->{error}\n");
sleep(10);
redo;
}
$revs{$md}=(values(%{$res->{'query'}{'pages'}}))[0]{'revisions'};
}
}
my %revids=();
for(my $day=0; $day<$today; $day++){
my $start=$api->ISO2timestamp(day("%Y-%m-%dT00:00:00Z",$day+1));
foreach (@{$revs{day('%m-%d',$day)}}){
my $t=$api->ISO2timestamp($_->{'timestamp'});
if($t<$start){
$revids{day('%F',$day)}=$_->{'revid'};
last;
}
}
}
my %pages=();
while(<X>){
next unless /^\* \[\[(.*?)\]\]: (\d{4}-\d{2}-\d{2}(?:, \d{4}-\d{2}-\d{2})*)\s*$/;
my $title=$1;
my @dates=split /, /, $2;
my %dates=();
foreach (@dates){
die "Bad date: $_" unless exists($revids{$_});
$dates{$_}=$revids{$_};
}
# Check the page
my $res=$api->query(
titles => $title,
prop => 'categories',
cllimit => 'max',
clcategories => 'Category:All disambiguation pages',
);
if($res->{'code'} eq 'shutoff'){
$api->warn("Task disabled: ".$res->{'content'}."\n");
return 300;
}
if($res->{'code'} ne 'success'){
$api->warn(day("Failed to load links for %F: ", $self->{'nextday'}).$res->{'error'}."\n");
sleep(10);
redo;
}
my $page=(values(%{$res->{'query'}{'pages'}}))[0];
if(exists($page->{'missing'})){
die "$title does not exist";
} elsif(exists($page->{'redirect'})){
die "$title is a redirect";
} elsif($page->{'ns'}!=0){
die "$title is a non-article";
} elsif(exists($page->{'categories'}) && @{$page->{'categories'}}){
die "$title is a disambiguation page";
}
# Tag the talk page
$title="Talk:$title";
$api->log("Tagging $title");
my $tok=$api->edittoken("$title", EditRedirect => 1);
if($tok->{'code'} eq 'shutoff'){
$api->warn("Task disabled: ".$tok->{'content'}."\n");
return 300;
}
if($tok->{'code'} ne 'success'){
$api->warn("Failed to get edit token for $title: ".$tok->{'error'}."\n");
sleep(10);
redo;
}
if(exists($tok->{'redirect'})){
die "$title is a redirect, cannot tag";
}
my $txt=$self->tag($api, $tok->{'revisions'}[0]{'*'}, %dates);
$res=$api->edit($tok, $txt, "Adding/updating {{OnThisDay}}. $screwup", 0, 1);
if($res->{'code'} ne 'success'){
$api->warn("Write failed on $title: $res->{error}\n");
sleep(10);
redo;
}
}
close X;
return undef;
}
if($mode eq 'live'){
if($self->{'nextday'}==0 && exists($api->store->{'nextday'})){
my $t=$api->store->{'nextday'};
$self->{'nextday'}=$t if $t=~/^\d+$/;
}
# Iterate over all our pages
while($self->{'nextday'}<$today){
my $day=$self->{'nextday'};
my $fday=day("%F", $self->{'nextday'});
my $page=load_page_for_day($api, $day);
if($page->{'code'} eq 'shutoff'){
$api->warn("Task disabled: ".$page->{'content'}."\n");
return 300;
}
if($page->{'code'} ne 'success'){
$api->warn("Failed to fetch page for $fday: $page->{error}\n");
return 60;
}
my $revid=$page->{'revisions'}[0]{'revid'};
# Strip out non-rendered content
my ($txt, $nowiki)=$api->strip_nowiki($page->{'revisions'}[0]{'*'});
while(my ($k,$v)=each %$nowiki){
$nowiki->{$k}='' if $v=~/^<!--/;
}
$txt=$1 if $txt=~m!<onlyinclude>(.*?)</onlyinclude>!;
$txt=~s!<noinclude>(.*?)</noinclude>!!g;
$txt=~s!</?includeonly>!!g;
# Replace time-varying templates
$txt=~s/{{IsLeapYear}}/{{IsLeapYear|{{CURRENTYEAR}}}}/g;
my $x=day("%-d", $self->{'nextday'});
$txt=~s/{{CURRENTDAY}}/$x/g;
$x=day("%w", $self->{'nextday'});
$txt=~s/{{CURRENTDOW}}/$x/g;
$x=day("%m", $self->{'nextday'});
$txt=~s/{{CURRENTMONTH}}/$x/g;
$x=day("%Y", $self->{'nextday'});
$txt=~s/{{CURRENTYEAR}}/$x/g;
$txt=$api->replace_nowiki($txt, $nowiki);
# Expand templates
$res=$api->query(
action => 'expandtemplates',
title => $page->{'title'},
text => $txt,
);
if($res->{'code'} eq 'shutoff'){
$api->warn("Task disabled: ".$res->{'content'}."\n");
return 300;
}
if($res->{'code'} ne 'success'){
$api->warn(day("Failed to expand templates for %F: ", $self->{'nextday'}).$res->{'error'}."\n");
$api->debug(2,"Text was $txt\n");
return 60;
}
# Transform ''' to <b>
$txt=doAllQuotes($api, $res->{'expandtemplates'}{'*'});
# Extract just the bold parts
$txt=join('',$txt=~m!<b>(.*?)</b>!g);
# Extract the links
$res=$api->query(
action => 'parse',
title => $page->{'title'},
text => $txt,
prop => 'links',
);
if($res->{'code'} eq 'shutoff'){
$api->warn("Task disabled: ".$res->{'content'}."\n");
return 300;
}
if($res->{'code'} ne 'success'){
$api->warn(day("Failed to expand templates for %F: ", $self->{'nextday'}).$res->{'error'}."\n");
$api->debug(2,"Text was $txt\n");
return 60;
}
my @links=map($_->{'*'}, grep($_->{'ns'}==0 && exists($_->{'exists'}), @{$res->{'parse'}{'links'}}));
# Filter out month/year links
@links=grep(!/$skip_links_re/o, @links);
# Resolve redirects & check for dabs
my %res=();
my @err=();
while(@links){
my @l=splice(@links,0,500);
my $res=$api->query(
titles => join('|',@l),
prop => 'categories',
cllimit => 'max',
clcategories => 'Category:All disambiguation pages',
redirects => 1,
);
if($res->{'code'} eq 'shutoff'){
$api->warn("Task disabled: ".$res->{'content'}."\n");
return 300;
}
if($res->{'code'} ne 'success'){
$api->warn(day("Failed to load links for %F: ", $self->{'nextday'}).$res->{'error'}."\n");
return 60;
}
my %map=();
$map{$_->{'from'}}=$_->{'to'} foreach (@{$res->{'query'}{'redirects'}});
my %pages=map { $_->{'title'}=>$_ } values %{$res->{'query'}{'pages'}};
foreach my $l (@l){
my $t=$l;
$t=$map{$t} while(exists($map{$t}));
if(!exists($pages{$t})){
$api->warn("No result for $l ".day("(%F)", $day)."\n");
return 60;
}
my $page=$pages{$t};
if(exists($page->{'missing'})){
push @err, "* [[:$l]] does not exist";
} elsif(exists($page->{'redirect'})){
push @err, "* [[:$l]] is a double redirect";
} elsif($page->{'ns'}!=0){
push @err, "* [[:$l]] redirects to a non-article";
} elsif(exists($page->{'categories'}) && @{$page->{'categories'}}){
push @err, "* [[:$l]] is a disambiguation page";
} else {
$res{$t}=[] unless exists($res{$t});
push @{$res{$t}}, $l;
}
}
}
# Tag the talk pages
foreach my $title (keys %res){
$title="Talk:$title";
$api->log("Tagging $title for ".day("%F", $day));
my $tok=$api->edittoken("$title", EditRedirect => 1);
if($tok->{'code'} eq 'shutoff'){
$api->warn("Task disabled: ".$tok->{'content'}."\n");
return 300;
}
if($tok->{'code'} ne 'success'){
$api->warn("Failed to get edit token for $title: ".$tok->{'error'}."\n");
return 60;
}
if(exists($tok->{'redirect'})){
push @err, "* [[$title]] is a redirect, cannot tag for [[".join(']] / [[', @{$res{$title}})."]]";
next;
}
my $txt=$self->tag($api, $tok->{'revisions'}[0]{'*'}, $fday => $revid);
$res=$api->edit($tok, $txt, "Adding/updating {{OnThisDay}}. $screwup", 0, 1);
if($res->{'code'} ne 'success'){
$api->warn("Write failed on $title: $res->{error}\n");
return 60;
}
}
# Whine about errors
if(@err){
$api->log("Whining about bad links on ".day("%F", $day));
my $err=day("Errors processing [[Wikipedia:Selected anniversaries/%B %-d]]", $self->{'nextday'});
$res=$api->whine($err, "The following links could not be processed:\n".join("\n", @err)."\nPlease add {{tl|OnThisDay}} to them manually using oldid $revid, I will not be retrying. Thanks.", Summary => $err, Pagename => $whine_to, NoSmallPrint => 1);
if($res->{'code'} eq 'shutoff'){
$api->warn("Task disabled: ".$res->{'content'}."\n");
return 300;
}
if($res->{'code'} ne 'success'){
$api->warn(day("Could not complain about %F: ", $self->{'nextday'}).$res->{'error'}."\n");
return 60;
}
}
# Done! Next day
$api->store->{"nextday"}=++$self->{'nextday'};
}
my $next=86400-($starttime%86400)-(time-$starttime);
return 86400-(time%86400) if($self->{'nextday'}>=$today);
}
die "Invalid mode '$mode'";
}
# The anniversary pages were created 2004-02-26 through 2004-02-28; We count
# 2004-02-26 as day 0. This function does POSIX::strftime on the day number.
# Thankfully, it correctly converts
sub day {
my $fmt=shift;
my $day=shift;
return POSIX::strftime($fmt,0,0,0,$day+26,1,104);
}
sub day_from_timestamp {
my $ts=shift;
my ($min,$max)=(0,5000);
my $target=POSIX::strftime("%F", gmtime $ts);
while(1){
my $day=day("%F",$max);
if($day eq $target){
return $max;
} elsif($day lt $target){
$min=$max;
$max+=$max;
} else {
last;
}
}
while($min!=$max){
my $d=POSIX::floor(($min+$max)/2);
my $day=day("%F",$d);
if($day eq $target){
return $d;
} elsif($day lt $target){
$min=$d+1;
} else {
$max=$d;
}
}
die "WTF? Searching for $target, got to $min which is ".day("%F",$min);
}
sub load_page_for_day {
my $api=shift;
my $day=shift;
return $api->store->{"day $day"} if(exists($api->store->{"day $day"}));
my $start=$api->ISO2timestamp(day("%Y-%m-%dT00:00:00Z",$day+1));
my %q=(
titles => day("Wikipedia:Selected anniversaries/%B %-d",$day),
rvlimit => 1,
rvprop => 'ids|timestamp|content',
rvdir => 'older',
rvstart => $start,
prop => 'revisions',
);
while(1){
my $res=$api->query(%q);
return $res unless $res->{'code'} eq 'success';
my $page=(values(%{$res->{'query'}{'pages'}}))[0];
$page->{'code'}='success';
my $t=$api->ISO2timestamp($page->{'revisions'}[0]{'timestamp'});
if($t<$start){
$api->store->{"day $day"}=$page;
$api->store->{"day $day"}{'cached'}=1;
$res->{'cached'}=0;
return $page;
}
return undef unless exists($res->{'query-continue'});
$q{'rvstartid'}=$res->{'query-continue'}{'revisions'}{'rvstartid'};
delete $q{'rvstart'};
}
}
# Parse single-quotes in the same way MediaWiki does.
sub doAllQuotes {
my $api=shift;
my ($text,$nowiki)=$api->strip_nowiki(shift);
my $outtxt='';
my @lines=split(/\n/, $text);
$outtxt.=doQuotes($_)."\n" foreach (@lines);
$outtxt=substr($outtxt,0,-1);
return $api->replace_nowiki($outtxt,$nowiki);
}
sub doQuotes {
my $text=shift;
my @arr=split(/(''+)/, $text);
return $text if @arr == 1;
my $numbold=0;
my $numitalics=0;
for(my $i=1; $i<@arr; $i+=2){
my $l=length($arr[$i]);
if($l==4){
# Four 's => assume one plain text + bold
$arr[$i-1].="'";
$arr[$i]="'''";
$l=3;
} elsif($l>5){
# More than five 's => assume N-5 plain text + bold + italic
$arr[$i-1].="'"x($l-5);
$arr[$i]="'''''";
$l=5;
}
$numitalics++ if($l==2 || $l==5);
$numbold++ if($l==3 || $l==5);
}
# Odd number of each => guess one of the bolds is really plain + italic
# Somewhat odd, but to match MediaWiki's parser...
if(($numbold&1) && ($numitalics&1)){
my $firstsingleletterword = -1;
my $firstmultiletterword = -1;
my $firstspace = -1;
for(my $i=1; $i<@arr; $i+=2){
next unless length($arr[$i])==3;
my $x1=substr($arr[$i-1],-1);
my $x2=substr($arr[$i-1],-2,1);
if($x1 eq ' '){
$firstspace=$i if $firstspace == -1;
} elsif($x2 eq ' '){
$firstsingleletterword=$i if $firstsingleletterword == -1;
} else {
$firstmultiletterword=$i if $firstmultiletterword == -1;
}
}
if($firstsingleletterword != -1){
$arr[$firstsingleletterword]="''";
$arr[$firstsingleletterword-1].="'";
} elsif($firstmultiletterword != -1){
$arr[$firstmultiletterword]="''";
$arr[$firstmultiletterword-1].="'";
} elsif($firstspace != -1){
$arr[$firstspace]="''";
$arr[$firstspace-1].="'";
}
}
# Now, convert to HTML
my $output='';
my $buffer='';
my $state='';
for(my $i=0; $i<@arr; $i++){
my $r=$arr[$i];
if($i&1){
if(length($r)==2){
if($state eq 'i'){
$output.='</i>'; $state='';
} elsif($state eq 'bi'){
$output.='</i>'; $state='b';
} elsif($state eq 'ib'){
$output.='</b></i><b>'; $state='b';
} elsif($state eq 'both'){
$output.="<b><i>$buffer</i>"; $state='b';
} else {
$output.='<i>'; $state.='i';
}
} elsif(length($r)==3){
if($state eq 'b'){
$output.='</b>'; $state='';
} elsif($state eq 'bi'){
$output.='</i></b><i>'; $state='i';
} elsif($state eq 'ib'){
$output.='</b>'; $state='i';
} elsif($state eq 'both'){
$output.="<i><b>$buffer</b>"; $state='i';
} else {
$output.='<b>'; $state.='b';
}
} elsif(length($r)==5){
if($state eq 'b'){
$output.='</b><i>'; $state='i';
} elsif($state eq 'i'){
$output.='</i><b>'; $state='b';
} elsif($state eq 'bi'){
$output.='</i></b>'; $state='';
} elsif($state eq 'ib'){
$output.='</b></i>'; $state='';
} elsif($state eq 'both'){
$output.="<i><b>$buffer</b></i>"; $state='';
} else {
$buffer=''; $state='both';
}
}
} else {
if($state eq 'both'){
$buffer.=$r;
} else {
$output.=$r;
}
}
}
$output.='</b>' if($state eq 'b' || $state eq 'ib');
$output.='</i>' if($state eq 'i' || $state eq 'bi' || $state eq 'ib');
$output.='</b>' if($state eq 'bi');
$output.="<b><i>$buffer</i></b>" if($state eq 'both' && $buffer ne '');
return $output;
}
sub tag {
my $self=shift;
my $api=shift;
my $txt=shift;
my %dates=@_;
my %redir=%{$self->{'redir'}};
if(!%redir){
%redir=$api->redirects_to('Template:OnThisDay');
if(exists($redir{''})){
$api->warn("Could not load list of redirects to Template:OnThisDay: ".$redir{''}{'error'}."\n");
return undef;
}
$self->{'redir'}=\%redir;
}
# Update an existing template?
my $done=0;
$txt=$api->process_templates($txt, sub {
my $name=shift;
my $params=shift;
shift; # $wikitext
shift; # $data
my $oname=shift;
return unless exists($redir{"Template:$name"});
@$params=grep(/^\s*(?:oldid|date)\d+\s*=/, @$params);
my $i=1;
$i++ while grep(/^\s*oldid$i\s*=/, @$params);
foreach my $date (sort keys %dates){
push @$params, "date$i=$date", "oldid$i=".$dates{$date};
$i++;
}
$done=1;
return "{{$oname|".join('|', @$params)."}}";
});
return $txt if $done;
if(!exists($self->{'loaded skip redirects'})){
my %skip=$api->redirects_to(@skip_templates);
if(exists($skip{''})){
$api->warn("Could not load list of redirects for skip templates: ".$skip{''}{'error'}."\n");
return undef;
}
@skip_templates=keys %skip;
$self->{'loaded skip redirects'}=1;
}
# No, add a new one.
my $templ='{{OnThisDay';
my $i=1;
foreach my $date (sort keys %dates){
$templ.="|date$i=$date|oldid$i=".$dates{$date};
$i++;
}
$templ.="}}";
my $nowiki;
($txt,$nowiki)=$api->strip_nowiki($txt);
my $outtmpl={};
$txt=$api->process_templates($txt, \&_strip_templates, $outtmpl);
$txt="$templ\n$txt" unless $txt=~s/^((?:\s*\x02[a-zA-Z0-9_-]+\x03)*[ \t]*)(?:$|(?=\n))/$1\n$templ/;
$txt=_unstrip_templates($txt, $outtmpl);
return $api->replace_nowiki($txt, $nowiki);
}
sub _strip_templates {
my ($name, $params, $wikitext, $data) = @_;
return undef if $name=~/^#/;
return undef if grep(/^\s*small\s*=\s*(?!no|n|0)\S/, @$params);
return undef if grep("Template:$name" eq $_, @skip_templates);
$wikitext=_unstrip_templates($wikitext,$data);
my $tag="\x02".sha256_base64($wikitext)."\x03";
$tag=~tr!+/=!-_!d;
$data->{$tag}=$wikitext;
return $tag;
}
sub _unstrip_templates {
my $wikitext=shift;
my $templ=shift;
$wikitext=~s!(\x02[a-zA-Z0-9_-]+\x03)! exists($templ->{$1})?$templ->{$1}:$1 !gioe;
return $wikitext;
}
1;