Wikipedia:WikiProject Chemistry/Structure drawing workgroup/Mysid's script

This is an old revision of this page, as edited by Slashme (talk | contribs) at 20:26, 21 October 2007 (Yes, a blur effect solves the problem with missing things that are separated by blank space.). The present address (URL) is a permanent link to this revision, which may differ significantly from the current revision.
#Modify svg file from BKchem to work with librsvg, for use on Wikimedia.
use Image::Magick;

#Does the input file exist?
if(! -f $ARGV[0])
{print "$ARGV[0]: File not found\n";
 exit};
#Open files.  TODO: check that output file is writeable.
open INPUT,"<$ARGV[0]";
open OUTPUT,">$ARGV[0]_tmp.svg";

#initialize variables for minimum an maximum image size.
$miny=-1;
$maxy=0;
$minx=-1;
$maxx=0;
while (<INPUT>) {
    #I must skip over "defs" blocks, because they have objects with relative
    #dimensions.  TODO:This is nasty code. I'm sure there's a better way to do
    #this.
    if (/defs/) {
      print OUTPUT $_;
      my $line  = $_;
      while (!($line =~ /\/defs/)){
        $line  = <INPUT>;
	print OUTPUT $line;
      }
      next;
    }
    #Sans is the most general font definition we can use, and librsvg chokes
    #on Helvetica.
    s/helvetica/Sans/gi;
    #Figure out max and min coordinates for later.
    if (!/viewBox/ && /\d/) {
      #We're already matching digits, might as well round the numbers here.
      s/(\d+\.\d+)/sprintf("%.1f", $&)/ge unless(/version/);
      #and fix font sizes
      #s/(\d+)pt/$1*4 ."pt"/ge; # commented out, doesn't seem to be needed with "sans".
      s/75\%/9pt/g;
      #now let's find the approximate max and min coords, so the crop algorithm
      #runs faster:
      #Match "x1=" and "y1=", but don't match on x radius and y radius.
      while(/[^r]([xy])[12]{0,1}="(\d+)/g) {
	if($1 eq "y"){
	  $maxy = $2 if($2>$maxy);
	  $miny = $2 if(($2<$miny) || ($miny<0));
	}
	if($1 eq "x"){
	  $maxx = $2 if($2>$maxx);
	  $minx = $2 if(($2<$minx) || ($minx<0));
	}
      }
      #Match on comma-separated coordinates
      while(/(\d+),(\d+)/g) {
	$maxy = $2 if($2>$maxy);
	$miny = $2 if(($2<$miny) || ($miny<0));
	$maxx = $1 if($1>$maxx);
	$minx = $1 if(($1<$minx) || ($minx<0));
      }
      #Match on space-separated coordinates, but not when preceded
      #by a comma.
      while(/[^\d,.]([\d.]+)\s+([\d.]+)/g){
	$maxy = $2 if($2>$maxy);
	$miny = $2 if(($2<$miny) || ($miny<0));
	$maxx = $1 if($1>$maxx);
	$minx = $1 if(($1<$minx) || ($minx<0));
      }
    }
    #Put in prelim. coordinates.
    if(/<svg .*viewBox="0 0 (\d+) (\d+)"/) {
	$_ = "<svg width=\"".$1."px\" height=\"".$2."px\" version=\"1.0\" viewBox=\"0 0 $1 $2\" xmlns=\"http://www.w3.org/2000/svg\">\n";
    }
    # Replace baseline-shift="sub" with a numeric baseline-shift
    if (/y="([\d\.]+)">.*<tspan baseline-shift="sub"/) {
	$vy = $1;
	$oy = $1+3.25;
	s/baseline-shift="sub"/y="$oy"/g;
	s/<\/tspan>([^<]+)</<\/tspan><tspan y="$vy">$1<\/tspan></g;
    }
    #write each line out after mangling it.
    print OUTPUT $_;
}
#close the files (done with the original input file, output file becomes
#input for next pass.
close OUTPUT;
close INPUT;

#If you have a file called tmp.png, now's the time to say goodbye.
unlink("tmp.png");
print STDERR "bitmap...\n";
#use librsvg, because that's what Wikimedia uses.
system("rsvg-convert -o tmp.png $ARGV[0]_tmp.svg");

$img = Image::Magick->new;
$x = $img->ReadImage("tmp.png");
warn "$x" if "$x";
$he = $img->Get('height');
$wi = $img->Get('width');
#Blur the image so that we don't miss the edge later on, and to make a nice
#border
$x = $img->Blur(geometry=>'5.0x5.0', sigma=>5.0, radius=>5.0, channel=>"All");
print "$x\n";
$img->Write(filename=>'test.png');


print STDERR "crop...\n";

$sumpix=0;
#read the file one pixel row at a time, always starting at the edge value
#detected, going outwards until we hit a completely empty line (pure
#transparent pixels) This seems to work for all BKchem files, but would of
#course fail for general SVG files.
for ($y=$maxy; $y<$he; $y++) {
  @row = $img->GetPixels(width=>$wi, height=>1, x=>1, y=>$y, map=>'RGBA');
  foreach(@row){$sumpix+=$_;}
  if (!$sumpix) {
    $maxy=$y;
    last;
  }
  #Have to reset the sum each time, because we are reading non-zero data until we
  #hit the edge.
  else {
    $sumpix=0;
  }
}
$sumpix=0;
for ($y=$miny; $y>0; $y--) {
  @row = $img->GetPixels(width=>$wi, height=>1, x=>1, y=>$y, map=>'RGBA');
  foreach(@row){$sumpix+=$_;}
  if (!$sumpix) {
    $miny=$y;
    last;
  }
  else {
    $sumpix=0;
  }
}
$sumpix=0;
for ($x=$minx; $x>0; $x--) {
  @col = $img->GetPixels(width=>1, height=>$he, x=>$x, y=>1, map=>'RGBA');
  foreach(@col){$sumpix+=$_;}
  if (!$sumpix) {
    $minx=$x;
    last;
  }
  else {
    $sumpix=0;
  }
}
$sumpix=0;
for ($x=$maxx; $x<$wi; $x++) {
  @col = $img->GetPixels(width=>1, height=>$he, x=>$x, y=>1, map=>'RGBA');
  foreach(@col){$sumpix+=$_;}
  if (!$sumpix) {
    $maxx=$x;
    last;
  }
  else {
    $sumpix=0;
  }
}

$w=($maxx-$minx);
$h=($maxy-$miny);

$outfilename = $ARGV[0];
$outfilename =~ s/.svg/2.svg/;
open OUTPUT,">$outfilename";

open INPUT,"<$ARGV[0]_tmp.svg";
#$firstgroupflag=0;
while (<INPUT>) {
    if(/<svg .*viewBox="0 0 \d+ \d+"/) {
	$_ = "<svg width=\"".$w."\" height=\"".$h."\" version=\"1.0\" viewBox=\"0 0 $w $h\"
               xmlns=\"http://www.w3.org/2000/svg\">\n";
    }
    #Have to move the diagram into the view box.  A simple translation works
    #well.  There is a group which encompasses all the tags, so put it there.
    if (!$firstgroupflag && /<g/) {
      s/<g/<g transform=\"translate(-$minx,-$miny)\"/;
      #Only do it once!
      $firstgroupflag = 1 ;
    }
    #Take out redundant groups.
    if (/<\/g>/) {
      my $line  = $_;
      $nextline = <INPUT>;
      if (!($nextline =~ /<g stroke="\#000000" stroke-width="[\d.]+">\s/ )) {
	print OUTPUT $line;
	print OUTPUT $nextline;
      }
      next;
    }
    print OUTPUT $_;
}
#clean up
close OUTPUT;
unlink($ARGV[0]."_tmp.svg");
print "$outfilename written\n";