User:Ilmari Karonen/fixbmp.pl
Jump to navigation
Jump to search
#!/usr/bin/perl -w
# Copyright (c) 2010 Ilmari Karonen <vyznev@toolserver.org>.
#
# Permission to use, copy, modify, and/or distribute this
# software for any purpose with or without fee is hereby granted,
# provided that the above copyright notice and this permission
# notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
# THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR
# CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
# NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
use utf8;
use strict;
use LWP::UserAgent;
use HTTP::Request::Common qw'POST $DYNAMIC_FILE_UPLOAD';
$DYNAMIC_FILE_UPLOAD = 1;
use XML::Simple;
use Data::Dumper 'Dumper';
use Getopt::Long 'GetOptions';
use Term::ReadKey 'ReadMode';
use POSIX 'strftime';
use Digest::SHA1;
use Time::HiRes qw'time sleep';
# Treat the standard streams and the command line arguments as UTF-8 text:
foreach my $stream (\*STDIN, \*STDOUT, \*STDERR) {
    binmode $stream, ":utf8";
}
foreach my $arg (@ARGV) {
    utf8::decode($arg);    # decodes the @ARGV element in place
}

# Canary string sent with API queries as the request id; the main loop
# aborts if the server does not echo it back unchanged:
my $unicode = "Üñıç∅∂\x{3F5}";

# Option defaults (each may be overridden on the command line):
my $username = "Ilmari Karonen";
my $server   = "commons.wikimedia.org";
my $prefix   = $ENV{TMPDIR} || $ENV{TEMP} || "/tmp";    # directory for temporary files
my $delay    = 0;                                       # seconds between uploads

# Options without a default stay undefined until set by the user:
my ($filelist, $watch, $confirm, $pngout);
# Help text, printed when the command line cannot be used.  The current
# defaults are interpolated into the message:
my $usage = <<"USAGE";
Usage: $0 [options] <file(s)>
Options:
-u, --user, --username=<name>
User name to log in as (default: $username).
-s, --server=<hostname>
Hostname of wiki server (default: $server).
-f, --filelist=<file>
File to read file names from.
-p, --prefix=<path>
Path to save temporary files under (default: $prefix).
-w, --watch
Automatically add files to watchlist.
-c, --confirm
Prompt for confirmation before each upload.
-d, --delay=<seconds>
How many seconds to sleep between uploads (default: $delay).
--pngout=<cmd>
Command to optimize PNG files before reupload.
USAGE
# '
# Option specifications paired with the variables they fill in:
my @optspec = (
    'username|u=s' => \$username,
    'server|s=s'   => \$server,
    'filelist|f=s' => \$filelist,
    'prefix|p=s'   => \$prefix,
    'watch|w'      => \$watch,
    'confirm|c'    => \$confirm,
    'delay|d=i'    => \$delay,
    'pngout=s'     => \$pngout,
);

# Parse the command line.  Give up with the usage message if parsing fails,
# or if no file names were supplied either directly or through a file list:
my $parsed_ok = GetOptions(@optspec);
unless ($parsed_ok and ($filelist || @ARGV)) {
    die $usage;
}
# Read extra file names (one per line) from the optional list file and
# append them to the names given on the command line.  Dies if the list
# file cannot be opened or closed.
if ($filelist) {
    # Lexical handle instead of a global bareword one.
    open my $list_fh, '<:utf8', $filelist or die "Error opening $filelist: $!\n";
    while (my $name = <$list_fh>) {
        chomp $name;
        # Skip blank lines: they would otherwise be processed as an empty
        # file title by the upload loop.
        next unless $name =~ /\S/;
        push @ARGV, $name;
    }
    close $list_fh or die "Error closing $filelist: $!\n";
}
# Build the HTTP client shared by all API requests and file downloads.
my %ua_options = (
    agent      => "Mozilla/4.0 (compatible; $0)",
    from       => 'vyznev@toolserver.org',
    cookie_jar => {},    # in-memory cookie jar, so the login session persists
    parse_head => 0,
);
my $ua = LWP::UserAgent->new(%ua_options);
# Endpoint for all MediaWiki API requests.  HTTPS is used by default so that
# the login password and session cookies are not sent in clear text.  For a
# wiki that is only reachable over plain HTTP, --server may be given with an
# explicit scheme (e.g. "http://localhost").
# NOTE: HTTPS requires LWP's TLS support to be installed at runtime.
my $apiURI = ($server =~ m{^https?://}i ? $server : "https://$server") . "/w/api.php";

# General purpose routine for making MediaWiki API requests.
#
# Arguments: a flat list of parameter name/value pairs.  A value may be an
#   array reference (as used for the "file" parameter of uploads), in which
#   case its first element is UTF-8 encoded in place.
# Returns: the response body parsed by XMLin.
#
# The request is POSTed as multipart/form-data with format=xml prepended.
# If the HTTP request fails, or the server reports a "maxlag" error, the
# request is retried after a delay that starts at 5 seconds and doubles each
# time.  Once the pending delay exceeds three hours the routine gives up and
# returns whatever the last response contained.
sub apireq {
    my $query = [format => 'xml', @_];
    my $sleep = 5;

    # Encode character strings to UTF-8 bytes before handing them to LWP:
    for my $item (@$query) {
        if (ref $item) {
            utf8::encode($item->[0]);
        }
        else {
            utf8::encode($item);
        }
    }

    while (1) {
        my $res = $ua->post($apiURI, $query, Content_Type => 'form-data');
        my $err = $res->header('MediaWiki-API-Error') || "";
        return XMLin( $res->content ) if $res->is_success and $err ne 'maxlag';

        print STDERR "API request failed, ", ($err || $res->status_line), "...";
        if ($sleep > 3*60*60) {
            warn "giving up\n";
            return XMLin( $res->content );
        }
        warn "sleeping $sleep seconds\n";
        sleep $sleep;
        $sleep *= 2;    # exponential backoff
    }
}
# Prompt for the password on STDERR with terminal echo switched off, then
# restore the previous terminal mode:
ReadMode 'noecho';
print STDERR "Password for $username \@ $server: ";
my $pass = <STDIN>;
chomp $pass;
print STDERR "\n";
ReadMode 'restore';

# Log in.  If the first attempt is answered with "NeedToken", repeat the
# request with the token that came back:
warn "Logging in to $server as $username...\n";
my @credentials = ( action => 'login', lgname => $username, lgpassword => $pass );
my $login = apireq(@credentials);
if (($login->{login}{result} || '') eq 'NeedToken') {
    $login = apireq( @credentials, lgtoken => $login->{login}{token} );
}

# Anything other than an explicit success is fatal:
if ($login->{error}) {
    die "Login as $username failed ($login->{error}{code}): $login->{error}{info}\n";
}
unless ($login->{login}{result} eq 'Success') {
    die "Login as $username failed: $login->{login}{result}\n";
}
# Do the uploads.
#
# For each title this loop:
#   1. fetches file info, page text and an edit token through the API;
#   2. skips titles that do not exist or whose MIME type is not BMP;
#   3. downloads the BMP (unless a local copy exists), verifies its size and
#      SHA-1 against the API data, and converts it with ImageMagick;
#   4. uploads the converted file, under a new .png name if the original
#      suffix was neither .png nor .jp(e)g, appending the original upload
#      log to the description in that case;
#   5. marks the old title as superseded if the name changed;
#   6. sleeps so that uploads are at least $delay seconds apart.
# Most unexpected conditions are fatal (die); expected ones skip the file.
my $lasttime = time;
FILE: foreach my $title (@ARGV) {
    # Normalize filename to MediaWiki DB key form:
    s/[\s_]+/_/g, s/^_//, s/_$// for $title;
    $title = ucfirst $title;
    warn "Loading info for $title...\n";

    # Load file info and edit/upload token via API:
    my $starttime = time;
    my $data = apireq(
        maxlag => 5,
        action => 'query',
        prop => 'info|imageinfo|revisions',
        intoken => 'edit',
        iiprop => 'url|mime|size|sha1|comment|user|timestamp',
        iilimit => 50,
        rvprop => 'content|timestamp',
        rvlimit => 1,
        titles => "File:$title",
        redirects => 1,
        requestid => $unicode,
    );
    $data->{requestid} eq $unicode
        or die "Unicode round trip failed: expected \"$unicode\", got \"$data->{requestid}\".\n";
    exists $data->{query}{pages}{page}{missing}
        and warn "Skipping $title, file does not exist.\n" and next;
    my $token = $data->{query}{pages}{page}{edittoken}
        or die "Failed to get token, got:\n", Dumper($data), "\n";
    my $text = $data->{query}{pages}{page}{revisions}{rev}{content}
        or die "Failed to get page text, got:\n", Dumper($data), "\n";
    my $timestamp = $data->{query}{pages}{page}{revisions}{rev}{timestamp}
        or die "Failed to get timestamp, got:\n", Dumper($data), "\n";
    my $imageinfo = $data->{query}{pages}{page}{imageinfo}{ii}
        or die "Failed to get image info, got:\n", Dumper($data), "\n";
    # A single file revision comes back as a bare hash; always work with a list:
    $imageinfo = [$imageinfo] unless ref $imageinfo eq 'ARRAY';
    my $url = $imageinfo->[0]{url}
        or die "Failed to get file URL, got:\n", Dumper($data), "\n";
    my $size = $imageinfo->[0]{size}
        or die "Failed to get file size, got:\n", Dumper($data), "\n";
    $imageinfo->[0]{mime} =~ m(^image/(x-(ms-)?)?bmp$)
        or warn "Skipping $title due to unexpected MIME type \"$imageinfo->[0]{mime}\".\n" and next;

    # We might get a different filename back from the API; if so, use it:
    my $curtitle = $data->{query}{pages}{page}{title}
        or die "Failed to get normalized title, got:\n", Dumper($data), "\n";
    s/^[^:]*://, tr/ /_/ for $curtitle;    # strip namespace prefix, spaces to underscores
    warn "$title normalized/redirected to $curtitle.\n" if $title ne $curtitle;
    $title = $curtitle;

    # If suffix isn't .png or .jpe?g, rename to force PNG format:
    $title =~ s/\.[^.]*$/.png/ or die unless $title =~ /\.(jpe?g|png)$/i;

    # Extract target format from new suffix, used in various places later:
    my $format = uc +($title =~ /\.([^.]*)$/)[0] or die;
    $format = "JPEG" if $format eq 'JPG';

    # Local names of converted and unconverted files:
    my $file = "$prefix/$title";
    my $bmpfile = "$prefix/$curtitle.bmp";

    # If neither file exists, download the BMP first:
    unless (-e $file or -e $bmpfile) {
        print STDERR "Downloading $title ($size bytes, $imageinfo->[0]{mime}) from $url to $bmpfile... ";
        my $dl = $ua->get($url, ':content_file' => $bmpfile);
        die "FAILED: " . $dl->status_line . "\n" unless $dl->is_success;
        warn $dl->status_line . "\n";
    }

    # If only the BMP file exists, convert it:
    unless (-e $file) {
        # Check that the BMP file really contains what API says it should:
        -s $bmpfile == $size or die "Size mismatch: expect $size bytes, got ".(-s _).".\n";
        open my $bmp_fh, "<", $bmpfile or die "Error opening $bmpfile: $!\n";
        binmode $bmp_fh;
        my $sha1 = Digest::SHA1->new;
        $sha1->addfile($bmp_fh);
        my $digest = lc $sha1->hexdigest;
        $digest eq lc $imageinfo->[0]{sha1} or die "SHA1 mismatch: expected $imageinfo->[0]{sha1}, got $digest.\n";
        close $bmp_fh or die "Error closing $bmpfile: $!\n";

        # Run ImageMagick convert command (system returns non-zero on failure):
        warn "Converting $bmpfile to $file...\n";
        system 'convert', $bmpfile, '-quality', 100, $file
            and die "Convert failed with status $?: $!\n";

        # Optimize converted file using PNGOUT (optional) or jpegtran:
        if ($pngout and $format eq 'PNG') {
            warn "Optimizing $file with $pngout...\n";
            system $pngout, $file
                and die "$pngout failed with status $?: $!\n";
        }
        if ($format eq 'JPEG') {
            warn "Optimizing $file with jpegtran...\n";
            system qw'jpegtran -copy none -optimize -outfile', "$file.opttmp~", $file
                and die "jpegtran failed with status $?: $!\n";
            rename "$file.opttmp~", $file or die "Error moving $file.opttmp~ over $file: $!\n";
        }
    }

    # Show info for converted and unconverted files for comparison:
    system 'identify', $_ and die "identify failed with status $?: $!\n" for grep -e $_, $bmpfile, $file;

    # Preset some variables for upload; these may be changed below:
    my $assert = 'assert'; # for AssertEdit existence check
    my $comment = "convert from BMP to $format format";
    $comment .= " (with quality 100)" if $format eq 'JPEG';
    my $newtext = $text;

    # Adjust things if we need to reupload under a different name:
    if ($title ne $curtitle) {
        # Adjust upload variables:
        $assert = 'nassert'; # in this case we _don't_ want the target file to exist yet
        $comment .= " from [[:File:$curtitle]]";

        # It's unlikely but possible that a file could have more than 50 revisions; if so, abort:
        $data->{'query-continue'}{imageinfo} and die "Too many image revisions.\n";

        # Early existence check, better than having the assert fail later:
        my $check = apireq( action => 'query', prop => 'info', titles => "File:$title" );
        exists $check->{query}{pages}{page}{missing}
            or warn "Skipping $curtitle, $title already exists.\n" and next;
        warn "Reuploading $curtitle as $title:\n";

        # Construct list of original file revisions and append it to the description page:
        my @log;
        my %seen;
        foreach my $ii (reverse @$imageinfo) {
            # Skip all but the first upload of each distinct file version:
            next if $ii->{sha1} and $seen{$ii->{sha1}}++;

            # Neuter wiki syntax in upload log entries; simple links are permitted, as in edit summaries.
            # (Named differently from the upload $comment above to avoid shadowing it.)
            my $logcomment = $ii->{comment};
            for ($logcomment) {
                tr/\r\n/ /; # rm line breaks
                s/([&{}<>\'\[\]])/sprintf "&#%d;", ord $1/eg; # escape metacharacters
                # Unescape links.  By now every "[" and "]" has been turned into
                # "&#91;" / "&#93;" by the line above, so the escaped forms are
                # what must be matched here.  A link whose body contains a
                # further escaped "[" is left neutered.
                s/(&#91;&#91;\s*:?(.*?)&#93;&#93;)/index($2, "&#91;") < 0 ? "[[:$2]]" : $1/eg;
            }

            # Construct log entry:
            push @log, "* $ii->{timestamp} [[User:$ii->{user}|]] $ii->{width}x$ii->{height} ($ii->{size} Bytes, $ii->{mime}) '' $logcomment ''\n";

            # Stop as soon as we find a version matching the one we're converting, ignore possible later reverts:
            last if $ii->{sha1} and $ii->{sha1} eq $imageinfo->[0]{sha1};
        }

        # Make a note if we omitted some entries:
        my $note = (@log < @$imageinfo ? " (excluding reverts)" : "");

        # Append log section to description page and display it for review:
        my $log = join "", "== Original upload log ==\nThis image was originally uploaded in BMP format as [[:File:$curtitle]]. The original upload history$note is shown below:\n", reverse @log;
        warn $log;
        $newtext =~ s/\s*$/\n$log/;
    }

    # Optionally, pause and ask for confirmation:
    while ($confirm) {
        print STDERR " Do you want to upload $file? (Y/N): ";
        my $reply = <STDIN>;
        # At end of input nobody can answer any more; stop instead of
        # re-prompting forever.
        defined $reply
            or die "\nNo answer to confirmation prompt (end of input), aborting.\n";
        last if $reply =~ /^y(es)?$/i;
        next unless $reply =~ /^no?$/i;    # unrecognized answer: ask again
        #2 == unlink $bmpfile, $file or die "Error unlinking temp files: $!\n";
        next FILE;
    }

    # Do the upload:
    print STDERR "Uploading $file (".(-s $file)." bytes) as $title... ";
    my $upload = apireq(
        action => 'upload',
        file => [$file],
        filename => $title,
        comment => $comment,
        text => $newtext,
        # MediaWiki treats a boolean parameter as true whenever it is
        # present, whatever its value, so only send it when --watch was given:
        ($watch ? (watch => 1) : ()),
        ignorewarnings => 1,
        token => $token,
        $assert => 'exists',
    );
    if (ref $upload ne 'HASH') {
        die "Got unexpected result:\n", Dumper($upload), "\n";
    } elsif ($upload->{error}) {
        die "Uploading $file failed ($upload->{error}{code}): $upload->{error}{info}\n";
    } elsif ($upload->{upload}{result} ne 'Success') {
        die "Uploading $file did not succeed ($upload->{upload}{result}):\n", Dumper($upload), "\n";
    } else {
        warn "OK\n";
    }

    # If the file name was changed, mark old name as superseded:
    if ($title ne $curtitle) {
        print STDERR "Marking $curtitle as superseded... ";
        my $edit = apireq(
            action => 'edit',
            text => "{{Superseded|$title|Converted from BMP to PNG format}}\n$text",
            title => "File:$curtitle",
            basetimestamp => $timestamp,
            # NOTE(review): confirm 'lasttimestamp' is the parameter name the
            # edit API expects for the time the edit process started.
            lasttimestamp => strftime("%Y-%m-%dT%H:%M:%S", gmtime $starttime),
            summary => "superseded by [[:File:$title]], converted from BMP to PNG",
            minor => 1,
            bot => 1,
            token => $token,
            watchlist => ($watch ? 'watch' : 'nochange'),
            assert => 'exists',
        );
        if (ref $edit ne 'HASH') {
            die "Got unexpected result:\n", Dumper($edit), "\n";
        } elsif ($edit->{error}) {
            die "Editing $curtitle failed ($edit->{error}{code}): $edit->{error}{info}\n";
        } elsif ($edit->{edit}{result} ne 'Success') {
            die "Editing $curtitle did not succeed ($edit->{edit}{result}):\n", Dumper($edit), "\n";
        } else {
            warn "OK\n";
        }
    }

    #2 == unlink $bmpfile, $file or die "Error unlinking temp files: $!\n";

    # Throttle: wait out whatever remains of $delay since the previous upload,
    # except after the last file:
    my $sleep = $delay - (time - $lasttime);
    warn "Sleeping $sleep seconds before next upload...\n" and sleep $sleep if $sleep > 0 and $title ne $ARGV[-1];
    $lasttime = time;
}
__END__