User:FairuseBot/10c-removal.pl

#!/usr/bin/perl


# 10c-removal
#
# A bot to remove NFCC #10c-incompliant images from pages

use strict;
use warnings;

use Date::Calc;
use Data::Dumper;

use libBot;

my @common_links = ("Copyright", "Copyright infringement", "Fair use", "Logo", "Trademark", "United States copyright law", "Wikimedia",
                    "Computer game", "Counterfeit", "Currency", "Free software", "Portable Network Graphics", "Poster", "Public ___domain",
                    "Screenshot", "Station identification", "United States Code", "U.S. state", "Video game", "Wikimedia Foundation",
                    "Work of the United States Government");
my $common_links = join "|", @common_links;

my $test = 0;

my $homedir = '/home/mark/Desktop/wikibots/10cbot';
my $permit_interruptions = 1;	# Allow talkpage messages to stop the bot?

Pearle::init("FairuseBot", "", "$homedir/removebot.log","$homedir/removebot-cookies.txt");
Pearle::config(nullOK => 1, printlevel => 4);
config(username => "FairuseBot");

if(!Pearle::login())
{
	exit;
}

# Check for a running copy
if(-e "$homedir/pid")
{
	# Possible other copy.  Compare PIDs
	open PIDFILE, "<", "$homedir/pid";
	my $pid = <PIDFILE>;
	close PIDFILE;

	my $psresult = `ps -p $pid`;
	if($psresult =~ /10c-removal.pl/)
	{
		botwarnlog("*Previous run is taking longer than normal\n");
		exit;
	}
}

open PIDFILE, ">", "$homedir/pid";
print PIDFILE $$;
close PIDFILE;

my $total_images = 0;
my @logs;

{
	my @images;
	my $image;
	my $images_removed = 0;
	
	@images = ();
	
	Pearle::myLog(2, "Beginning set at " . time() . "\n");

	# Get the log
	if($test)
	{
		@images = ('Image:Dummy316.png');
	}
	else
	{
		my $CURRENT_DIR;
		my @files;
		# Scan the directory for log files
		opendir($CURRENT_DIR, $homedir) or (print "Failed: $!\n" and return);
		@files = readdir $CURRENT_DIR or (print "Failed: $!\n" and return);
		closedir $CURRENT_DIR;
		@files = grep {/^partial_failures.*txt$/} @files;
		foreach my $file (@files)
		{
			my ($year, $month, $day) = $file =~ /_(\d{4})-(\d{1,2})-(\d{1,2})/;
			if(Date::Calc::Delta_Days( $year, $month, $day, (Date::Calc::Today(1))) > 5)
			{
				open INFILE, "<:utf8", "$homedir/$file";
				my @new_images = <INFILE>;
				close INFILE;
				chomp @new_images;
				push @images, @new_images;
				
				push @logs, "$homedir/$file";
			}
		}
	}
		
	Pearle::myLog(3, join("\n", @images));
	Pearle::myLog(3, "\n" . scalar(@images) . " images found\n");
	
	if(scalar(@images) == 0)
	{
		Pearle::myLog(1, "*No images to remove\n");
	}

	foreach $image (@images)
	{
		my $image_url;
		my $image_regex = $image;
		my $page;
		
		my $full_comment = "";
		my $removal_prefix = "Image with inadequate rationale removed:";
		my $removal_comment = "Removing image with inadequate [[WP:NFCC|rationale]]";
		
		# Fetch image info
		Pearle::myLog(2, "Processing image $image\n");
		# Fetch the image data
		my $image_data;
		if($test)
		{
			$image_data = Pearle::APIQuery(titles => [$image], prop => ['links', 'revisions', 'imageinfo', 'categories'], 
							plnamespace => [0, 2],  							# Links
							rvprop => ['content'],							# Article body
							iiprop => ['user', 'comment', 'sha1'], iilimit => 500,			# Upload history
							meta => 'userinfo', uiprop => ['hasmsg'], 					# Check for talkpage messages
							list => 'imageusage', iutitle => $image, iunamespace => [0, 2], iulimit => 500);	# Image usage
		}
		else
		{
			$image_data = Pearle::APIQuery(titles => [$image], prop => ['links', 'revisions', 'imageinfo', 'categories'], 
							plnamespace => [0],	  							# Links
							rvprop => ['content'],							# Article body
							iiprop => ['user', 'comment', 'sha1'], iilimit => 500,			# Upload history
							meta => 'userinfo', uiprop => ['hasmsg'], 					# Check for talkpage messages
							list => 'imageusage', iutitle => $image, iunamespace => [0], iulimit => 500);	# Image usage
		}
		
		if(!defined($image_data))
		{
			Pearle::myLog(0, "Server did not return an appropriate response.  Exiting.\n");
			last;
		}
	
		# Extract the list of pages where it's used.
		my @pages = GetPageList($image_data);
		my $num_pages = scalar(@pages);
		my @failed_pages;
		# Extract the categories
		my @categories = GetPageCategories($image_data);
		# Extract a list of pages this image links to.
		my @links = GetPageLinks($image_data);
		# Filter out common links
		@links = grep {$_ !~ /^($common_links)$/} @links;

		if($permit_interruptions and DoIHaveMessages($image_data))
		{
			Pearle::myLog(0, "Talkpage message found; exiting on image $image.\n");
			exit;
		}
		
		# Sanity check: Does the image still exist?
		if($image_data =~ /missing=""/)
		{
			Pearle::myLog(2, "*Image [[:$image]] has been deleted.\n");
			next;
		}
		# Sanity check: Is this still tagged as non-free?
		if(!grep {$_ eq 'Category:All non-free media'} @categories)
		{
			Pearle::myLog(2, "*Image [[:$image]] is no longer marked as non-free.\n");
			next;
		}
		# Sanity check: Is the image used?
		if(scalar(@pages) == 0)
		{
			# Orphaned fairuse image
			Pearle::myLog(2, "*Image [[:$image]] is not used anywhere\n");
			# Is this image already disputed?
			if(grep {$_ eq 'Category:All disputed non-free images'} @categories)
			{
				Pearle::myLog(2, "*Image [[:$image]] is already marked for deletion.\n");
			}
			else
			{
				if(!grep {$_ eq 'Category:All orphaned fairuse images'} @categories)
				{
					my $text = "\n{{subst:orfud}}\n";
					wikilog($image, $text, "Non-free image is not used in any article\n");
				}
			}
			next;
		}
		# Sanity check: Is the image still tagged as disputed?
		if(!grep {$_ eq 'Category:All disputed non-free images'} @categories)
		{
			Pearle::myLog(2, "*Image [[:$image]] is not marked for deletion.\n");
			next;
		}
		
		# Remove the NFCC-failure tag and the list of pages
		# Blindly removing the tag is safe:
		# 1) If the program fails, 10cbot will pick the image up on its next pass
		# 2) If the image is orphaned, or will be orphaned by removal (unlikely), 10cbot or another bot will pick it up
		# 3) If the image is non-compliant on all pages, 10cbot will pick it up on the next pass
		my $wikipage = Pearle::getPage($image);
		my $text = $wikipage->getEditableText();
		$text =~ s/\x03\x44i-missing article links[^\x04]*\x04//s;
		Pearle::myLog(4, "Text after processing:\n$text\n");
		$wikipage->setEditableText($text);
		Pearle::postPage($wikipage, "Removing tag", 0);
		Pearle::limit();
		
		# Build the image-matching regex
		my ($raw_image) = $image =~ /Image:(.*)/;
		$raw_image = MakeWikiRegex($raw_image);
		if($image !~ /(\.jpg|\.jpeg|\.png|\.gif|\.svg)$/i)
		{
			$image_regex = "[ _]*(:?[Ii][Mm][Aa][Gg][Ee]|[Mm][Ee][Dd][Ii][Aa])[ _]*:[ _]*${raw_image}[ _]*";
			Pearle::myLog(2, "*Non-image media file [[:$image]] found.\n");
			next;			# Non-image media are too hard to work with
		}
		else
		{
			$image_regex = "[ _]*[Ii][Mm][Aa][Gg][Ee][ _]*:[ _]*${raw_image}[ _]*";
		}
				
		# Sanity check
		if(!defined($raw_image) or $image !~ /$raw_image/)
		{
			botwarnlog("*Parse error on image [[:$image]] ($raw_image)\n");
			next;
		}
		Pearle::myLog(3, "Image regex: $image_regex\n");
		
		# Check for best-case compliance: each use has a matching direct link in the body of the text - tested
		Pearle::myLog(4, "Image is used in " . scalar(@pages) . " pages.\n");
		Pearle::myLog(4, "Image is used on " . join("|", @pages) . "\n");
		Pearle::myLog(4, "Image links to " . join("|", @links) . "\n");
		
		foreach my $page (@links)	# Filter out pages that match a link
		{
			@pages = grep {$_ ne $page} @pages;
		}
		Pearle::myLog(4, "Image failed best-case test for " . scalar(@pages) . " pages.\n");
		next if(scalar(@pages) == 0);
			
		# Check for liberal compliance:
		# For each use, remove it from the list if there's a case-insensitive match in the body text - tested
		foreach my $page (@pages)
		{
			my $page_match_regex = MakeWikiRegex($page);
			push @failed_pages, $page unless($text =~ /$page_match_regex/i);
		}
		@pages = @failed_pages;
		@failed_pages = ();
		
		Pearle::myLog(4, "Image failed text test for " . scalar(@pages) . " pages.\n");
		next if(scalar(@pages) == 0);
		
		# Check for strict compliance:
		# For each link, chase redirects - tested
		if(scalar(@links) > 0)
		{
			my $page_data = Pearle::APIQuery(titles => \@links, redirects => 1);
			my $parsed_xml = Pearle::getXMLParser()->XMLin($page_data);
			my @redirects;
			Pearle::myLog(4, Dumper($parsed_xml));
			if(exists($parsed_xml->{query}->{redirects}->{r}) and defined($parsed_xml->{query}->{redirects}->{r}))
			{
				if(ref($parsed_xml->{query}->{redirects}->{r}) eq 'ARRAY')
				{
					@redirects = @{$parsed_xml->{query}->{redirects}->{r}};
				}
				else
				{
					@redirects = ($parsed_xml->{query}->{redirects}->{r});
				}
			}
			foreach my $page (@pages)
			{
				my $matched = 0;
				foreach my $redirect (@redirects)
				{
					if($redirect->{to} eq $page)
					{
						# We can get there by a redirect
						UpdateLink($image, $redirect->{from}, $page);
						Pearle::limit();
						$matched = 1;
						last;
					}
				}
				if(!$matched)
				{
					push @failed_pages, $page;
				}
			}
			@pages = @failed_pages;
			@failed_pages = ();
		}
		
		Pearle::myLog(4, "Image failed redirect test for " . scalar(@pages) . " pages.\n");
		next if(scalar(@pages) == 0);
	
		# Check for near-compliance:
		# For each use, if we can get to it by means of a disambiguation page, update the link - tested
		foreach my $page (@links)
		{
			# Fetch the page text and page links
			my $page_data = Pearle::APIQuery(titles => [$page], prop => ['links', 'revisions'], 
						plnamespace => [2],  						# Links
						rvprop => ['content']);					# Article body
			# If the page text indicates disambig, see if any of the links is one we're looking for
			my $page_text = GetPageText($page_data);
			if($page_text =~ /{{disambig}}/i)
			{
				my @page_links = GetPageLinks($page_data);
				foreach my $disambig_link (@page_links)
				{
					if(grep {$_ eq $disambig_link} @pages)
					{
						# It's a match.  Remove it from the list
						@pages = grep {$_ ne $disambig_link} @pages;
						# Post to the page
						my $success = UpdateLink($image, $page, $disambig_link);
						if(!$success)
						{
							botwarnlog("*Failed to update disambiguation link for [[:$image]] from [[$page]] to [[$disambig_link]]\n");
						}
						Pearle::limit();
					}
				}
			}
		}
		
		Pearle::myLog(4, "Image failed disambiguation test for " . scalar(@pages) . " pages.\n");
		next if(scalar(@pages) == 0);
		
		# Test for compliance
		# Over-use (some compliant, some non-compliant): Remove from any non-compliant articles, OrphanBot-style.  Leave a note on the article talk page.
		if(scalar(@pages) > 0 and $num_pages > scalar(@pages))
		{
			Pearle::myLog(2, "Image $image failed on " . scalar(@pages) . " pages.\n");
			
			my $parsed_removal_comment = $removal_comment;
			$parsed_removal_comment =~ s/image/[[:$image|image]]/;
			foreach $page (@pages)
			{
				my $hits = 0;
				notelog("Page for removal: $page\n");
				if($hits = RemoveImageFromPage($image, $page, $image_regex, $removal_prefix, $parsed_removal_comment)) 	# Don't limit if we just touched the article
				{
					Pearle::myLog(2, "Removed image $image from article $page ($hits times)\n");
					Pearle::limit();
				}
				$images_removed += $hits;
			}
		}
		elsif(scalar(@pages) > 0)
		{
			# Fully-non-compliant.  Should never occur, but if it does, let 10cbot pick it up on the next pass.
			Pearle::myLog(2, "Image $image failed on all pages\n");
		}
		else
		{
			Pearle::myLog(2, "Image $image is now fully-compliant\n");
		}
	}
	Pearle::myLog(2, "Finished with set.  Removed $images_removed images.\n");
	$total_images += $images_removed;
}

# Remove the processed logs
unlink @logs;

unlink "$homedir/pid"