#!/usr/bin/perl # 10c-removal # # A bot to remove NFCC #10c-incompliant images from pages use strict; use warnings; use Date::Calc; use Data::Dumper; use libBot; my @common_links = ("Copyright", "Copyright infringement", "Fair use", "Logo", "Trademark", "United States copyright law", "Wikimedia", "Computer game", "Counterfeit", "Currency", "Free software", "Portable Network Graphics", "Poster", "Public ___domain", "Screenshot", "Station identification", "United States Code", "U.S. state", "Video game", "Wikimedia Foundation", "Work of the United States Government"); my $common_links = join "|", @common_links; my $test = 0; my $homedir = '/home/mark/Desktop/wikibots/10cbot'; my $permit_interruptions = 1; # Allow talkpage messages to stop the bot? Pearle::init("FairuseBot", "", "$homedir/removebot.log","$homedir/removebot-cookies.txt"); Pearle::config(nullOK => 1, printlevel => 4); config(username => "FairuseBot"); if(!Pearle::login()) { exit; } # Check for a running copy if(-e "$homedir/pid") { # Possible other copy. Compare PIDs open PIDFILE, "<", "$homedir/pid"; my $pid = <PIDFILE>; close PIDFILE; my $psresult = `ps -p $pid`; if($psresult =~ /10c-removal.pl/) { botwarnlog("*Previous run is taking longer than normal\n"); exit; } } open PIDFILE, ">", "$homedir/pid"; print PIDFILE $$; close PIDFILE; my $total_images = 0; my @logs; { my @images; my $image; my $images_removed = 0; @images = (); Pearle::myLog(2, "Beginning set at " . time() . "\n"); # Get the log if($test) { @images = ('Image:Dummy316.png'); } else { my $CURRENT_DIR; my @files; # Scan the directory for log files opendir($CURRENT_DIR, $homedir) or (print "Failed: $!\n" and return); @files = readdir $CURRENT_DIR or (print "Failed: $!\n" and return); closedir $CURRENT_DIR; @files = grep {/^partial_failures.*txt$/} @files; foreach my $file (@files) { my ($year, $month, $day) = $file =~ /_(\d{4})-(\d{1,2})-(\d{1,2})/; if(Date::Calc::Delta_Days( $year, $month, $day, (Date::Calc::Today(1))) > 5) { open INFILE, "<:utf8", "$homedir/$file"; my @new_images = <INFILE>; close INFILE; chomp @new_images; push @images, @new_images; push @logs, "$homedir/$file"; } } } Pearle::myLog(3, join("\n", @images)); Pearle::myLog(3, "\n" . scalar(@images) . " images found\n"); if(scalar(@images) == 0) { Pearle::myLog(1, "*No images to remove\n"); } foreach $image (@images) { my $image_url; my $image_regex = $image; my $page; my $full_comment = ""; my $removal_prefix = "Image with inadequate rationale removed:"; my $removal_comment = "Removing image with inadequate [[WP:NFCC|rationale]]"; # Fetch image info Pearle::myLog(2, "Processing image $image\n"); # Fetch the image data my $image_data; if($test) { $image_data = Pearle::APIQuery(titles => [$image], prop => ['links', 'revisions', 'imageinfo', 'categories'], plnamespace => [0, 2], # Links rvprop => ['content'], # Article body iiprop => ['user', 'comment', 'sha1'], iilimit => 500, # Upload history meta => 'userinfo', uiprop => ['hasmsg'], # Check for talkpage messages list => 'imageusage', iutitle => $image, iunamespace => [0, 2], iulimit => 500); # Image usage } else { $image_data = Pearle::APIQuery(titles => [$image], prop => ['links', 'revisions', 'imageinfo', 'categories'], plnamespace => [0], # Links rvprop => ['content'], # Article body iiprop => ['user', 'comment', 'sha1'], iilimit => 500, # Upload history meta => 'userinfo', uiprop => ['hasmsg'], # Check for talkpage messages list => 'imageusage', iutitle => $image, iunamespace => [0], iulimit => 500); # Image usage } if(!defined($image_data)) { Pearle::myLog(0, "Server did not return an appropriate response. Exiting.\n"); last; } # Extract the list of pages where it's used. my @pages = GetPageList($image_data); my $num_pages = scalar(@pages); my @failed_pages; # Extract the categories my @categories = GetPageCategories($image_data); # Extract a list of pages this image links to. my @links = GetPageLinks($image_data); # Filter out common links @links = grep {$_ !~ /^($common_links)$/} @links; if($permit_interruptions and DoIHaveMessages($image_data)) { Pearle::myLog(0, "Talkpage message found; exiting on image $image.\n"); exit; } # Sanity check: Does the image still exist? if($image_data =~ /missing=""/) { Pearle::myLog(2, "*Image [[:$image]] has been deleted.\n"); next; } # Sanity check: Is this still tagged as non-free? if(!grep {$_ eq 'Category:All non-free media'} @categories) { Pearle::myLog(2, "*Image [[:$image]] is no longer marked as non-free.\n"); next; } # Sanity check: Is the image used? if(scalar(@pages) == 0) { # Orphaned fairuse image Pearle::myLog(2, "*Image [[:$image]] is not used anywhere\n"); # Is this image already disputed? if(grep {$_ eq 'Category:All disputed non-free images'} @categories) { Pearle::myLog(2, "*Image [[:$image]] is already marked for deletion.\n"); } else { if(!grep {$_ eq 'Category:All orphaned fairuse images'} @categories) { my $text = "\n{{subst:orfud}}\n"; wikilog($image, $text, "Non-free image is not used in any article\n"); } } next; } # Sanity check: Is the image still tagged as disputed? if(!grep {$_ eq 'Category:All disputed non-free images'} @categories) { Pearle::myLog(2, "*Image [[:$image]] is not marked for deletion.\n"); next; } # Remove the NFCC-failure tag and the list of pages # Blindly removing the tag is safe: # 1) If the program fails, 10cbot will pick the image up on its next pass # 2) If the image is orphaned, or will be orphaned by removal (unlikely), 10cbot or another bot will pick it up # 3) If the image is non-compliant on all pages, 10cbot will pick it up on the next pass my $wikipage = Pearle::getPage($image); my $text = $wikipage->getEditableText(); $text =~ s/\x03\x44i-missing article links[^\x04]*\x04//s; Pearle::myLog(4, "Text after processing:\n$text\n"); $wikipage->setEditableText($text); Pearle::postPage($wikipage, "Removing tag", 0); Pearle::limit(); # Build the image-matching regex my ($raw_image) = $image =~ /Image:(.*)/; $raw_image = MakeWikiRegex($raw_image); if($image !~ /(\.jpg|\.jpeg|\.png|\.gif|\.svg)$/i) { $image_regex = "[ _]*(:?[Ii][Mm][Aa][Gg][Ee]|[Mm][Ee][Dd][Ii][Aa])[ _]*:[ _]*${raw_image}[ _]*"; Pearle::myLog(2, "*Non-image media file [[:$image]] found.\n"); next; # Non-image media are too hard to work with } else { $image_regex = "[ _]*[Ii][Mm][Aa][Gg][Ee][ _]*:[ _]*${raw_image}[ _]*"; } # Sanity check if(!defined($raw_image) or $image !~ /$raw_image/) { botwarnlog("*Parse error on image [[:$image]] ($raw_image)\n"); next; } Pearle::myLog(3, "Image regex: $image_regex\n"); # Check for best-case compliance: each use has a matching direct link in the body of the text - tested Pearle::myLog(4, "Image is used in " . scalar(@pages) . " pages.\n"); Pearle::myLog(4, "Image is used on " . join("|", @pages) . "\n"); Pearle::myLog(4, "Image links to " . join("|", @links) . "\n"); foreach my $page (@links) # Filter out pages that match a link { @pages = grep {$_ ne $page} @pages; } Pearle::myLog(4, "Image failed best-case test for " . scalar(@pages) . " pages.\n"); next if(scalar(@pages) == 0); # Check for liberal compliance: # For each use, remove it from the list if there's a case-insensitive match in the body text - tested foreach my $page (@pages) { my $page_match_regex = MakeWikiRegex($page); push @failed_pages, $page unless($text =~ /$page_match_regex/i); } @pages = @failed_pages; @failed_pages = (); Pearle::myLog(4, "Image failed text test for " . scalar(@pages) . " pages.\n"); next if(scalar(@pages) == 0); # Check for strict compliance: # For each link, chase redirects - tested if(scalar(@links) > 0) { my $page_data = Pearle::APIQuery(titles => \@links, redirects => 1); my $parsed_xml = Pearle::getXMLParser()->XMLin($page_data); my @redirects; Pearle::myLog(4, Dumper($parsed_xml)); if(exists($parsed_xml->{query}->{redirects}->{r}) and defined($parsed_xml->{query}->{redirects}->{r})) { if(ref($parsed_xml->{query}->{redirects}->{r}) eq 'ARRAY') { @redirects = @{$parsed_xml->{query}->{redirects}->{r}}; } else { @redirects = ($parsed_xml->{query}->{redirects}->{r}); } } foreach my $page (@pages) { my $matched = 0; foreach my $redirect (@redirects) { if($redirect->{to} eq $page) { # We can get there by a redirect UpdateLink($image, $redirect->{from}, $page); Pearle::limit(); $matched = 1; last; } } if(!$matched) { push @failed_pages, $page; } } @pages = @failed_pages; @failed_pages = (); } Pearle::myLog(4, "Image failed redirect test for " . scalar(@pages) . " pages.\n"); next if(scalar(@pages) == 0); # Check for near-compliance: # For each use, if we can get to it by means of a disambiguation page, update the link - tested foreach my $page (@links) { # Fetch the page text and page links my $page_data = Pearle::APIQuery(titles => [$page], prop => ['links', 'revisions'], plnamespace => [2], # Links rvprop => ['content']); # Article body # If the page text indicates disambig, see if any of the links is one we're looking for my $page_text = GetPageText($page_data); if($page_text =~ /{{disambig}}/i) { my @page_links = GetPageLinks($page_data); foreach my $disambig_link (@page_links) { if(grep {$_ eq $disambig_link} @pages) { # It's a match. Remove it from the list @pages = grep {$_ ne $disambig_link} @pages; # Post to the page my $success = UpdateLink($image, $page, $disambig_link); if(!$success) { botwarnlog("*Failed to update disambiguation link for [[:$image]] from [[$page]] to [[$disambig_link]]\n"); } Pearle::limit(); } } } } Pearle::myLog(4, "Image failed disambiguation test for " . scalar(@pages) . " pages.\n"); next if(scalar(@pages) == 0); # Test for compliance # Over-use (some compliant, some non-compliant): Remove from any non-compliant articles, OrphanBot-style. Leave a note on the article talk page. if(scalar(@pages) > 0 and $num_pages > scalar(@pages)) { Pearle::myLog(2, "Image $image failed on " . scalar(@pages) . " pages.\n"); my $parsed_removal_comment = $removal_comment; $parsed_removal_comment =~ s/image/[[:$image|image]]/; foreach $page (@pages) { my $hits = 0; notelog("Page for removal: $page\n"); if($hits = RemoveImageFromPage($image, $page, $image_regex, $removal_prefix, $parsed_removal_comment)) # Don't limit if we just touched the article { Pearle::myLog(2, "Removed image $image from article $page ($hits times)\n"); Pearle::limit(); } $images_removed += $hits; } } elsif(scalar(@pages) > 0) { # Fully-non-compliant. Should never occur, but if it does, let 10cbot pick it up on the next pass. Pearle::myLog(2, "Image $image failed on all pages\n"); } else { Pearle::myLog(2, "Image $image is now fully-compliant\n"); } } Pearle::myLog(2, "Finished with set. Removed $images_removed images.\n"); $total_images += $images_removed; } # Remove the processed logs unlink @logs; unlink "$homedir/pid"