User:MIMEStatBot/Source
This is a copy of the source code of MIMEStatBot, taken from the actual script running on the Toolserver.
#!/usr/bin/perl -w
# Copyright (c) 2008-2010 Ilmari Karonen <vyznev@toolserver.org>.
#
# Permission to use, copy, modify, and/or distribute this
# software for any purpose with or without fee is hereby granted,
# provided that the above copyright notice and this permission
# notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
# THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR
# CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
# NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
# The following lines are for running this script as an SGE job:
#
#$ -N commons-mime-statistics
#$ -S /usr/bin/perl
#$ -o public_html/stats/commons_mime_statistics_log.txt -j y
#$ -M vyznev@toolserver.org -m ea
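# The next line presumably requests a node with access to the sql-s4 round-robin replica (the cluster hosting commonswiki):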
#$ -l sql-s4-rr=1
# Typical runtimes as of 2012 are under 30 minutes, but they have been longer in the past
#$ -l h_rt=2:05:00
#$ -l s_rt=2:00:00
# Typical peak memory usage on Solaris seems to be around 22M, but Linux reports about 200M for some reason
#$ -l virtual_free=200M
# This script should work on both Linux and Solaris
#$ -l arch=*
use strict;
use utf8; # not really needed for now, but for the sake of consistency
use Time::HiRes 'time';
use POSIX 'strftime';
use DBI;
use LWP::UserAgent;
use XML::Simple;
use Digest::MD5 'md5_hex';
use Data::Dumper 'Dumper';
use URI;
use Getopt::Long;
use constant TIMEOUT => 1200; # stop db retries after this many secs
# Command line parsing (currently used only for debug flag):
my $debug;
GetOptions('debug' => \$debug)
    or die "Usage: $0 [--debug]\n";
# Configuration and general setup:
# TODO: read these from the env or command line
my $dbname = "commonswiki";
my $server = "commons.wikimedia.org";
my $botname = "MIMEStatBot";
my $pwfile = "mimestatbot.pass";
my $dumpfile = "public_html/stats/commons_mime_statistics_data.txt";
my $sitename = "Commons";
my $pageprefix = ($debug ? "User:$botname/test/" : "$sitename:");
my $statpage = $pageprefix . "MIME type statistics";
my $listpage = "$statpage/Unusual types";
my $interval = ($debug ? "test" : "weekly");
my $list_max = 500;
my @now = gmtime;
my $date = strftime "%Y-%m-%d", @now;
my $time = strftime "%H:%M:%S", @now;
my ($t0, $t1, $dt, $sql, $rows); # common vars for queries and timeouts
my $botsummary = "bot updating $interval statistics at $date $time";
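# In debug mode, restrict the queries to files uploaded during the last week, so test runs stay small: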
my $testcond = ($debug ? strftime("img_timestamp >= '%Y%m%d%H%M%S'", gmtime(time-7*24*60*60)) : "1=1");
# Start script:
warn "\n--- Generating $interval MIME statistics for $dbname at $date $time (UTC) ---\n";
# Knowing the system we're running on should help debugging:
system qw(uname -a);
$| = 1;
# SGE should start us in home dir anyway, but better safe than sorry...
chdir or die "Error changing to home directory: $!\n";
# Connect to database:
my $data_source = "DBI:mysql:database=${dbname}_p;host=${dbname}-p.rrdb.toolserver.org;mysql_read_default_group=client";
my $dbh;
$t0 = time;
until ($dbh = DBI->connect($data_source)) {
warn "SQL connect failed: $DBI::errstr\n";
my $sleep = time - $t0 + 1;
die "Giving up after $sleep seconds.\n" if $sleep > TIMEOUT;
warn "Waiting $sleep seconds before retry...\n";
sleep $sleep;
}
$dbh->{RaiseError} = 1; # all DB errors should be fatal
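# The statistics don't need a consistent snapshot, and READ UNCOMMITTED presumably keeps overhead on the shared replica low: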
$dbh->do("SET SESSION TRANSACTION ISOLATION LEVEL READ UNCOMMITTED");
# Set up LWP:
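# The cookie jar is required so the login session cookie persists across API requests.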
my $ua = LWP::UserAgent->new(
    agent => "Mozilla/4.0 (compatible; $0)",
    from => 'vyznev@toolserver.org',
    cookie_jar => {},
    parse_head => 0,
);
# Generic MediaWiki API request handler:
my $apiURI = "http://$server/w/api.php";
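# apireq() posts a request to the API and retries with exponential backoff (5 s, doubling each time,
# giving up after ~3 h); a 'maxlag' response from the server is treated as retryable.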
sub apireq {
    my $query = [format => 'xml', @_];
    my $sleep = 5;
    if ($URI::VERSION < 1.36) {
        # Handling of Unicode strings changed in URI.pm v1.36, which $ua->post() uses internally
        utf8::encode($_) for @$query;
    }
    while (1) {
        my $res = $ua->post($apiURI, $query);
        my $err = $res->header('MediaWiki-API-Error') || "";
        return XMLin( $res->content ) if $res->is_success and $err ne 'maxlag';
        print STDERR "API request failed, ", ($err || $res->status_line), "...";
        if ($sleep > 3*60*60) {
            warn "giving up\n";
            return XMLin( $res->content );
        }
        warn "sleeping $sleep seconds\n";
        sleep $sleep;
        $sleep *= 2;
    }
}
# Subroutine to post data to the wiki:
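# postpage() fetches an edit token, submits the edit, and on failure dumps the wikitext to a temp file under public_html/temp.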
sub postpage {
    my ($title, $content, $summary) = @_;
    # XXX: inputs are assumed to be Unicode strings; use utf8::decode() on data from MySQL first!
    my $md5 = md5_hex(do { my $x = $content; utf8::encode($x); $x }); # md5_hex() wants octets
    warn "Getting edit token for [[$title]]\n";
    my $data = apireq(
        action => 'query',
        prop => 'info',
        intoken => 'edit',
        titles => $title,
    );
    my $token = $data->{query}{pages}{page}{edittoken}
        or die "Failed to get token, got:\n", Dumper($data), "\n";
    # warn "pretending to save $title ($summary):\n$content\n"; return; ## DEBUG
    warn "Editing [[$title]] ($summary)\n";
    my $edit = eval { apireq(
        maxlag => 5,
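        # maxlag makes the server reject the edit while replication lag exceeds 5 s; apireq() retries on that error.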
        action => 'edit',
        title => $title,
        summary => $summary,
        recreate => 1,
        md5 => $md5,
        text => $content,
        token => $token,
    ) };
    warn $@ if $@;
    if (ref $edit ne 'HASH' or $edit->{error} or $edit->{edit}{result} ne 'Success') {
        if (ref $edit ne 'HASH') {
            warn "Got unexpected result:\n", Dumper($edit), "\n";
        } elsif ($edit->{error}) {
            warn "Editing $title failed ($edit->{error}{code}): $edit->{error}{info}\n";
        } elsif ($edit->{edit}{result} ne 'Success') {
            warn "Editing $title did not succeed ($edit->{edit}{result}):\n", Dumper($edit), "\n";
        } else {
            warn "Qweebl zzyzx bleep blort?\n", Dumper($edit), "\n"; # should be impossible
        }
        require File::Temp;
        my $dump = File::Temp->new( UNLINK => 0, SUFFIX => "-$botname-dump-$date.txt", DIR => "public_html/temp" );
        warn "Error detected, dumping content to ", $dump->filename, "\n";
        binmode $dump, ":utf8" or warn "binmode failed: $!\n";
        print $dump $content or warn "print failed: $!\n";
        close $dump or warn "close failed: $!\n";
        chmod 0644, $dump->filename or warn "chmod failed: $!\n";
        warn "Dump complete.\n";
        return 0;
    }
    warn "Page [[$title]] successfully saved.\n";
    return 1;
}
# Run stat query:
$t0 = time;
warn "Starting stat query at ".gmtime($t0)."\n";
$sql = <<"END";
SELECT img_major_mime, img_minor_mime, img_media_type,
COUNT(*) AS files, SUM(img_size) AS bytes
FROM image
WHERE $testcond
GROUP BY img_major_mime, img_minor_mime, img_media_type
/* SLOW_OK LIMIT:3600 */
END
warn "Running query:\n$sql" if $debug;
{
    $rows = eval { $dbh->selectall_arrayref($sql) };
    if ($@) {
        warn "Stat query failed: $@\n";
        my $sleep = time - $t0 + 1;
        die "Giving up after $sleep seconds.\n" if $sleep > TIMEOUT;
        warn "Waiting $sleep seconds before retry...\n";
        sleep $sleep;
        redo;
    }
}
$t1 = time;
$dt = sprintf "%.1f", $t1 - $t0;
warn "Stat query done in $dt seconds at ".gmtime($t1)."\n";
# Dump stats to file:
if (open my $dump, ">>", $dumpfile) {
    if ($debug) {
        warn "Debug mode, not actually writing ".@$rows." rows to $dumpfile\n";
    }
    else {
        print $dump "\n$date\n";
        for my $row (@$rows) {
            my ($major, $minor, $media, $files, $bytes) = @$row;
            print $dump "$major/$minor\t$media\t$files\t$bytes\n";
        }
        warn @$rows." rows dumped to $dumpfile\n";
    }
    close $dump or warn "ERROR writing to $dumpfile: $!\n";
}
else {
warn "ERROR opening $dumpfile for append: $!\n";
}
# Generate stat page:
my $stat_text = <<"END";
<noinclude>
This page is updated $interval by [[User:$botname|]]. Any other edits made to this page will be lost on the next update.
</noinclude>
'''Files on $sitename by [[w:MIME type|]] as of $date $time (UTC)'''
See also: [[Commons:Project scope/Allowable file types]]
{| class="wikitable sortable"
! MIME type !! Media type !! Files !! Bytes
END
my $total_f = 0;
my $total_b = 0;
my %rare_types;
my $total_rare = 0;
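# Tally the types with at most $list_max files for the detail list; their names are SQL-quoted
# inside the loop for reuse in the list query further below.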
for my $row (@$rows) {
    my ($major, $minor, $media, $files, $bytes) = @$row;
    $total_f += $files;
    $total_b += $bytes;
    my $type = "$major/$minor || $media";
    if ($files <= $list_max) {
        $type = "[[$listpage#$major/$minor ($media)|$major/$minor]] || $media";
        $_ = $dbh->quote($_) for $major, $minor, $media;
        $rare_types{$media}{$major}{$minor}++;
        $total_rare += $files;
    }
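    # Insert thousands separators: reverse the digit string, add a comma after every third digit, reverse back.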
    $_ = reverse($_), s/(\d{3})\B/$1,/g, $_ = reverse($_) for $files, $bytes;
    utf8::decode($type); # should not be necessary, but let's be consistent
    $stat_text .= <<"END";
|-
| $type
| align="right" | $files
| align="right" | $bytes
END
}
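# Format the grand totals with the same thousands-separator trick: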
$_ = reverse($_), s/(\d{3})\B/$1,/g, $_ = reverse($_) for $total_f, $total_b;
$stat_text .= <<"END";
|- class="sortbottom"
! style="border-right:0" | '''Total'''
! style="border-left:0" | <!-- workaround for regression in table sort JS -->
| align="right" | $total_f
| align="right" | $total_b
|}
[[Category:Commons statistics|MIME type statistics]]
<!-- Generated in $dt seconds. -->
END
# Run list query:
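# Build a WHERE clause matching only the rare types collected above (the values were already
# SQL-quoted when they were stored in %rare_types):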
my @cond;
for my $q_media (sort keys %rare_types) {
    my $major_types = $rare_types{$q_media};
    for my $q_major (sort keys %$major_types) {
        my $minor_types = $major_types->{$q_major};
        my $q_minor = join ", ", sort keys %$minor_types;
        push @cond, "(img_major_mime = $q_major AND img_media_type = $q_media AND img_minor_mime IN ($q_minor))";
    }
}
my $cond = join "\n OR ", @cond;
$t0 = time;
warn "Starting list query at ".gmtime($t0).", expecting $total_rare rows\n";
$sql = <<"END";
SELECT img_major_mime, img_minor_mime, img_media_type, img_name
FROM image
WHERE ($cond) AND ($testcond)
/* SLOW_OK LIMIT:3600 */
END
warn "Running query:\n$sql" if $debug;
{
    $rows = eval { $dbh->selectall_arrayref($sql) };
    if ($@) {
        warn "List query failed: $@\n";
        my $sleep = time - $t0 + 1;
        die "Giving up after $sleep seconds.\n" if $sleep > TIMEOUT;
        warn "Waiting $sleep seconds before retry...\n";
        sleep $sleep;
        redo;
    }
}
$t1 = time;
$dt = sprintf "%.1f", $t1 - $t0;
warn "List query done in $dt seconds at ".gmtime($t1)."\n";
my %list;
for my $row (@$rows) {
    my ($major, $minor, $media, $title) = @$row;
    $title =~ tr/_/ /;
    push @{ $list{"$major/$minor ($media)"} }, $title;
}
# Generate list page:
my $list_text = <<"END";
<noinclude>
This page is updated $interval by [[User:$botname|]]. Any other edits made to this page will be lost on the next update.
</noinclude>
This page lists all files on $sitename for [[w:MIME type|]]s with $list_max or fewer files each, as of $date $time (UTC). For a list of currently permitted types, see [[Commons:Project scope/Allowable file types]].
END
for my $type (sort keys %list) {
    my $heading = $type;
    utf8::decode($heading); # should not be necessary, but let's be consistent
    $list_text .= "\n== $heading ==\n\n";
    for my $title (sort @{ $list{$type} }) {
        utf8::decode($title);
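        # Escape characters with wikitext meaning so unusual titles can't break the link markup: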
        $title =~ s/([&<>\[\]{}|\x27])/sprintf "&#%d;", ord $1/eg; # \x27 = single quote
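        # Replace non-breaking spaces with ordinary spaces (presumably so the links render and resolve consistently):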
        $title =~ s/\xA0/ /g;
        $list_text .= "* [[:File:$title|$title]]\n";
    }
}
$list_text .= "\n<!-- Generated in $dt seconds. -->\n";
# Disconnect from database:
$dbh->disconnect;
# Log in to wiki:
open PW, "<", $pwfile or die "Error opening $pwfile: $!";
my $botpass = <PW>;
chomp $botpass;
close PW or warn "Error closing $pwfile: $!\n";
warn "Logging in to $server as $botname, will post to [[$statpage]]\n";
my $login = apireq( action => 'login', lgname => $botname, lgpassword => $botpass );
$login = apireq( action => 'login', lgname => $botname, lgpassword => $botpass, lgtoken => $login->{login}{token} )
    if ($login->{login}{result} || '') eq 'NeedToken';
$login->{error} and die "Login as $botname failed ($login->{error}{code}): $login->{error}{info}\n";
$login->{login}{result} eq 'Success' or die "Login as $botname failed: $login->{login}{result}\n";
# Post pages to wiki:
postpage($statpage, $stat_text, $botsummary);
postpage($listpage, $list_text, $botsummary);
warn "All done, exiting.\n";
__END__