User:HYanWong/imagenames2credits.pl
Jump to navigation
Jump to search
This script makes credit lines for images on Wikimedia commons from the Commons database dumps. It requires 3 or 4 file names:
- A file containing a list of image names, one per line. Anything up to the first tab is used, and lines with a tab or newline at the start are simply passed through as blank. (There is some trickery with the file name: if -, use stdin. If name starts and ends in a "/", treat this as a regular expression to match against file names).
- A commonswiki xml dump file (e.g. unpacked from http://dumps.wikimedia.org/commonswiki/latest/commonswiki-latest-pages-articles.xml.bz2
- A commonswiki categorylinks sql table (e.g. unpacked from http://dumps.wikimedia.org/commonswiki/latest/commonswiki-latest-categorylinks.sql.gz . You will need to pre-process this file using my categorylinks parser.
- Optionally, an output filename, otherwise dump to stdout
Depending on the command-line switches, the following lines are output, in utf format
- The image name
- A short text-only version of the attribution line
- A longer text-only version of the attribution line
- An html version of the attribution line
Code
[edit]#!/usr/bin/perl
use strict;
use warnings;
use Encode;
use URI::Escape;
use HTML::Entities;
use HTML::TagFilter;
# N.B. Images can be downloaded by grabbing the first name in the output file and piping it to a download program, e.g. curl
# something like the following unix command should do it
# perl -ne 'print "-O\nurl=\"$1\"\n" if (/(^[^\t]+).*/)' OutputFile | curl -K -
# Notes:
# The commons dump file is huge, and it will take several minutes to parse it. You could try slimming it down
# by 20% by removing non file pages, plus category information, with a one-liner something like
# perl -ie "print $/='<title>'; while (<>) { if (/^File:|^Image:/) {s/\[\[\:?Category.*?\]\]//g; print;}}" hugefile.xml
die ("The name of a file containing a list of images is required as a first argument (or - to take from stdin)") unless ($ARGV[0]);
die("The name of an unpacked wikicommons xml dump file is required as the second argument. Get one from http://dumps.wikimedia.org/commonswiki/latest/commonswiki-latest-pages-articles-multistream.xml.bz2\n") unless ($ARGV[1]);
die("The name of a processed wikispecies xml dump file is required as the third argument. Get one from
http://dumps.wikimedia.org/specieswiki/latest/......sql.bz2\n") unless ($ARGV[2]);
my %image;
my $match_image; #ref to function to match
my $match_re = undef;
my $lines_in;
if ($ARGV[0] =~ m|^/(.*)/$|) { #use a RE to match images, rather than taking names from a file
$match_re=qr/$1/;
$match_image = sub {($_[0] =~ /$match_re/)};
} else {
if ($ARGV[0] eq "-") {
*IMG_NAMES = *STDIN;
binmode IMG_NAMES, ":utf8";
} else {
open(IMG_NAMES, "<:utf8", $ARGV[0])
or die "cannot open ".$ARGV[0]." for reading: $!";
}
while(<IMG_NAMES>) {
m/^([^\n\t]*)/; #name is anything up to the first tab or newline
#caution - this could be a blank line
$image{$1} = {'SortBy'=>$.} if ($1 ne '');
};
$lines_in = $.;
$match_image = sub {exists $_[1]->{$_[0]}};
close(IMG_NAMES);
}
open(COMMONS_DUMP, "<:utf8", $ARGV[1])
or die "cannot open ".$ARGV[1]." for reading: $!";
my $sqlfile = $ARGV[2];
if ($sqlfile =~ s/sql$//i) {
my $txtfile = $sqlfile.".txt";
if (-e $txtfile) {
open(CATEGORIES, "<:utf8", $txtfile)
or die "cannot open ".$txtfile." for reading: $!";
} else {
print "I need to create a text summary of your sql file.\n";
print "This should take about 10 mins\n";
die "Not done yet!";
}
} elsif ($sqlfile =~ /\.txt/) {
open(CATEGORIES, "<:utf8", $sqlfile)
or die "cannot open ".$sqlfile." for reading: $!";
}
#search for details to construct attribution line
#similar to http://commons.wikimedia.org/wiki/MediaWiki:Gadget-Stockphoto.js
my %ID; #reverse mapping from ID -> image_name, to help searching category file, which is in ID order
my $match_list = \%image;
my %redirects = ();
select(STDERR); #for outputting progress
$|=1; #turn on autoflushing
print "Searching WikiCommons dump (".$ARGV[1]."). Printing a dot for every 100 thousand pages searched: ";
do {
local $/ = '<title>';
seek COMMONS_DUMP,$. = 0,0; #make sure we are at the start, resetting $. to 0.
while(<COMMONS_DUMP>) {
if (m|^(.+?)</title>|) {
unless ($. % 100000) {$. % 1000000 ? print "." : print "|"};
my $name = wikiref_to_urlref($1);
if ($match_image->($name, $match_list)) {
$image{$name}={SortBy=>$name} unless (exists $image{$name});
#first look for redirects: update %images if so
if (m{<redirect title="(?:File|Image):(.+?)"\s*/>}) {
my $new_name = wikiref_to_urlref(decode_entities($1));
$redirects{$new_name} = $image{$new_name} = $image{$name};
delete $image{$name};
print "*";
next;
}
#otherwise look for proper author detail
if (m|<id>(\d+)</id>|g) {
$image{$name}->{ID}=$1;
$ID{$1}=$name;
} else {
$image{$name}->{ID}="Problem: <id> tag not found in WikiCommons dump";
}
#now we just want to look through the text - cut directly to there. Saves time and odd problems
#as e.g. with http://commons.wikimedia.org/wiki/File:AZ_gray_squirrel.jpg
m|(<text.*?</text>)|s;
get_author_info($1, $image{$name});
}
}
}
print " done!\n";
delete @redirects{grep {exists $redirects{$_}->{ID}} keys %redirects};
if (%redirects) {
$match_list = \%redirects; #next time through the do loop, only look at the redirects
print "Redirects found, and some of their targets missing on most recent pass. Doing another pass: ";
}
} while (%redirects); #don't need to worry about tied hashes - see http://www.perlmonks.org/?node_id=173677
close(COMMONS_DUMP);
print "Searching category list (extracted from sql file at ".$ARGV[2].") for licences: ";
foreach my $k (sort {$a <=> $b} keys(%ID)) { #do this in order of ID number
$image{$ID{$k}}->{licence}='NONE';
#go to that line in the file.
while (my $line = <CATEGORIES>) {
unless ($. % 100000) {$. % 1000000 ? print "." : print "|"};
if ($. == $k) {
my @cats = split(/(?<!\\)(?:\\\\)*'/,$line);
#Split on non escaped ' marks, i.e. 'foo','bar' iterates over foo, comma, bar.
#This avoids having to cope with commans inside quotes. To explain the RE, see
# http://stackoverflow.com/questions/56554/what-is-the-proper-regular-expression-for-an-unescaped-backslash-before-a-charac
foreach my $cat (map { $cats[$_*2+1] } 0..int(@cats/2)-1) { #use every other $cat (forget the intermediate comma).
if (LicenceAcceptable($image{$ID{$k}}->{licence}) < LicenceAcceptable($cat)) {
$image{$ID{$k}}->{licence} = $cat; #just pick the licence with the highest number
}
}
last;
}
}
}
close(CATEGORIES);
print " done!\n";
#construct HTML, long text, and short text credit lines for images.
# WARNING - HTML links could point anywhere - even to dangerous sites.
my $filter_all = HTML::TagFilter->new(allow=>{}); #delete everything.
my $filter_most = new HTML::TagFilter; #defaults allow text markup, links, & images
$filter_most->deny_tags({ img => { all => [] }});
my $copyright = Encode::decode_utf8("©");
foreach my $name (keys %image) {
my $url = 'http://commons.wikimedia.org/wiki/File:'.$name;
my $im = $image{$name};
if ($im->{custom_credit_line}) {
$im->{html_credit} = "<a href='$url'>".$im->{custom_credit_line}."</a>";
$im->{long_credit} = $im->{custom_credit_line};
$im->{short_credit} = $im->{custom_credit_line};
next;
};
unless (defined $im->{licence}) {
$im->{licence} = -1; #now use licence simply to flag as file missing
print $im->{html_credit}=$im->{long_credit}=$im->{short_credit}="Cannot find 'File:$name' in the Commons xml dump. Look online?";
print "\n";
next;
}
my $lv = LicenceAcceptable($im->{licence});
unless ($lv>0) {
print $im->{html_credit}=$im->{long_credit}=$im->{short_credit}="Cannot use $url: licence $im->{licence} not acceptable.";
print "\n";
$im->{licence} = 0; #now use licence simply to flag as unacceptable
next;
}
#the following cases all construct a longer credit string
if (! exists $im->{attribution}) { #no appropriate templates found
$im->{html_credit} = "Author unclear (<a href='$url'>see here</a>)";
$im->{long_credit} = "Author unclear, see wiki URL for details";
$im->{short_credit} = "Author unclear, see File: ";
} elsif (! defined ($im->{attribution})) { #matching template, but no author field
$im->{html_credit} = "Author not defined (<a href='$url'>see here</a>)";
$im->{long_credit} = "Author not defined, see wiki URL for details";
$im->{short_credit} = "Author unclear, see File: ";
} elsif (! $im->{attribution}) { #author field in Information template is blank.
$im->{html_credit} = "Author unspecified (<a href='$url'>see here</a>)";
$im->{long_credit} = "Author unspecified, see wiki URL for details";
$im->{short_credit} = "Author unclear, see File: ";
} else {
if ($im->{attribution} =~ /upload/i) {
$im->{html_credit} = "Image uploaded & possibly $copyright ";
$im->{long_credit} = "Image uploaded & possibly $copyright ";
$im->{short_credit} = "?$copyright";
} else {
$im->{html_credit} = "Image $copyright ";
$im->{long_credit} = "Image $copyright ";
$im->{short_credit} = "$copyright";
}
my $html = wikiauthor2html($im->{attribution});
print "\n\n\n".$name." --".$im->{attribution}."--\n\n\n" unless $html;
#Construct credit lines
$im->{html_credit} .= $filter_most->filter($html);
$im->{long_credit} .= decode_entities($filter_all->filter($html)); #the filter encodes html entities by default
$im->{long_credit} =~ s/^(.{100}).*/$1\x{2026}/; #only use 1st 100 chars, add ellipsis if truncated
$im->{short_credit} .= decode_entities($filter_all->filter($html));
$im->{short_credit} =~ s/^(.{50}).*/$1\x{2026}/; #only use 1st 50 chars, add ellipsis if truncated
}
$im->{html_credit} .= " / ".$filter_all->filter(LicenceCat2HTML($im->{licence}) || $im->{licence});
$im->{html_credit} .= " / from <a href='$url'>File:".$filter_all->filter($name)."</a> on <a href='http://commons.wikimedia.org/'>WikiCommons</a>";
$im->{long_credit} .= " / ".(LicenceCat2Text($im->{licence}) || $im->{licence});
$im->{long_credit} .= " / from $url";
$im->{short_credit} .= "/".(LicenceCat2Txt($im->{licence}) || $im->{licence});
$im->{short_credit} .= "/from File:$name on WikiCommons";
$im->{licence} = $lv; #now use licence to flag acceptability
next;
}
#Now print the details
if ($ARGV[3] && $ARGV[3] ne "-") {
open(OUTPUT, ">:utf8", $ARGV[3])
or die "cannot open ".$ARGV[3]." for (over)writing: $!";
} else {
*OUTPUT = *STDOUT;
binmode OUTPUT, ":utf8";
}
my @key_list;
if (defined $match_re) {#output lines alphabetically by file name
@key_list = sort keys %image;
} else { #output lines in the same order as they went in
my %reverse_map = map {$image{$_}->{SortBy} => $_} keys %image;
@key_list = map {$reverse_map{$_} || ""} 1..$lines_in;
}
foreach my $name (@key_list) {
unless ($name eq '') {
print OUTPUT $name if ($image{$name}->{licence} > 0 ); #don't print the fn if licence not accepted
print OUTPUT "\t".$image{$name}->{licence}; #print the licence value (from LicenceAcceptable())
print OUTPUT "\t".$image{$name}->{html_credit};
print OUTPUT "\t".$image{$name}->{long_credit};
print OUTPUT "\t".$image{$name}->{short_credit};
};
print OUTPUT "\n";
};
sub LicenceAcceptable
{ #if acceptable, return a positive number. If an image has multiple licences use the one with the highest number.
for(shift) { #this should match against wikicommons Category: tags, such as [[Category:GFDL]]
return 0 if ($_ eq 'NONE');
return 0 if (/^GFDL$/);
return 2.99 if (/^CC-BY-SA-1.0\+$/); #any future BY-SA version
return 2+$1/100 if (/^CC-BY-SA-?([\d\.]*)/);
#values above 10 are for licences which do not place limits
# on the licence you yourself need to use when reusing these pictures
return 10 if (/^Attribution$/);
return 10 if (/^Copyrighted_free_use$/);
return 10+$1/10 if (/^CC-BY-?([\d\.]*)/);
return 11 if (/^FAL$/);
return 12 if (/^CC-PD-Mark/);
return 12 if (/^PD[-_]/);
return 13 if (/^CC-Zero$/);
return -1; #if not a licence category
};
# *** TO DO *** do we need to look for categories defined by the templates 'Anonymous[ _]work', 'Anonymous-EU'
};
# *** TO DO *** fill out all the licence categories here. There's a list at
# https://commons.wikimedia.org/wiki/Commons:Copyright_tags
sub LicenceCat2HTML {
for(shift) {
if (/^GFDL$/) {return '<a href="https://www.gnu.org/copyleft/fdl.html">GFDL</a>'}
if (/^FAL$/) {return '<a href="http://artlibre.org/licence/lal/en">GFDL</a>'}
if (/^CC-BY-SA-3.0/) {return '<a href="https://creativecommons.org/licenses/by-sa/3.0/">CC-BY-SA-3.0</a>'}
if (/^CC-BY-2.5/) {return '<a href="https://creativecommons.org/licenses/by-sa/2.5/">CC-BY-2.5</a>'}
if (/^CC-BY-2.0/) {return '<a href="https://creativecommons.org/licenses/by-sa/2.0/">CC-BY-2.0</a>'}
if (/^CC-BY-3.0/) {return '<a href="https://creativecommons.org/licenses/by/3.0/">CC-BY-3.0</a>'}
if (/^CC-BY-2.5/) {return '<a href="https://creativecommons.org/licenses/by/2.5/">CC-BY-2.5</a>'}
if (/^CC-BY-2.0/) {return '<a href="https://creativecommons.org/licenses/by/2.0/">CC-BY-2.0</a>'}
if (/^CC-Zero$/) {return '<a href="https://creativecommons.org/publicdomain/zero/1.0/deed.en">CC Public Domain</a>'}
if (/^Attribution$/) {return 'Attribution req.'}
if (/^Copyrighted_free_use$/) {return '<a href="https://commons.wikimedia.org/wiki/Template:Copyrighted_free_use">Free for any use: no restriction</a>'}
if (/^PD_US_Government$/) {return '<a href="http://commons.wikimedia.org/wiki/Template:PD-USGov">Public domain</a>'}
if (/^PD[-_]/) {return 'Public domain'}; #could give more specific PD- text above, e.g.
return '';
};
};
sub LicenceCat2Text { #long plain text form
for(shift) {
if (/^GFDL$/) {return 'Gnu Free Documentation Licence (POSSIBLY INCOMPATIBLE WITH DISTRIBUTION UNLESS FULL GFDL TERMS PRINTED HERE)'};
if (/^FAL$/) {return 'Free Art Licence (http://artlibre.org/licence/lal/en)'}
if (/^CC-BY-SA-3.0/) {return 'CreativeCommons BY-SA 3.0 (https://creativecommons.org/licenses/by-sa/3.0)'}
if (/^CC-BY-SA-2.5/) {return 'CreativeCommons BY-SA 2.5 (https://creativecommons.org/licenses/by-sa/2.5)'}
if (/^CC-BY-SA-2.0/) {return 'CreativeCommons BY-SA 2.0 (https://creativecommons.org/licenses/by-sa/2.0)'}
if (/^CC-BY-3.0/) {return 'CreativeCommons BY 3.0 (https://creativecommons.org/licenses/by/3.0)'}
if (/^CC-BY-2.5/) {return 'CreativeCommons BY 2.5 (https://creativecommons.org/licenses/by/2.5)'}
if (/^CC-BY-2.0/) {return 'CreativeCommons BY 2.0 (https://creativecommons.org/licenses/by/2.0)'}
if (/^CC-Zero$/) {return '<a href="https://creativecommons.org/publicdomain/zero/1.0/deed.en">CC Public Domain</a>'}
if (/^Attribution$/) {return 'Attribution required, see link'}
if (/^Copyrighted_free_use$/) {return 'Free for any use: no restriction'}
if (/^PD_US_Government$/) {return 'Public domain: US government work'}
if (/^PD[-_]/) {return 'Public domain'}
return '';
};
};
sub LicenceCat2Txt { #short text form. Note that some licences might require a full link.
#to do - this needs legal checking - what is the shortest form allowed?
for(shift) {
if (/^GFDL$/) {return 'GFDL'}
if (/^FAL$/) {return 'Free Art Licence'}
if (/^CC-BY-3.0/) {return 'CC BY 3.0'}
if (/^Attribution$/) {return 'Attribution req.'}
if (/^Copyrighted_free_use$/) {return 'Free for any use'}
if (/^PD[-_]/) {return 'Pub. domain'}
return '';
};
}
sub wikiref_to_urlref {
my $name=uri_unescape($_[0]);
$name =~ s/^\s*(?:File|Image):\s*//; #trim leading whitespace & start
$name =~ s/[\s\p{FORMAT}]*$//; #trim trailing whitespace & unicode formatting characters
$name =~ s/^(\w)/uc($1)/e;
$name =~ s/ /_/g;
return($name);
}
sub split_wikitemplate
{
# first returned item is the template name,
# second is the parameters passed to the template in a hashref whose hash contains
# the named template vars and the unnamed vars with keys {1}, {2}, {3} etc.
my $name_out;
my $name_in=shift;
$name_in =~ s/[_ ]/[_ ]/g; #either space or underscore should be subbed to match the other
$name_in =~ s/^(\w)/[\U$1\L$1]/; #allow first letter to be either UC or LC
my $string = shift;
my $depthcurly=0;
my $depthsquare=0;
my @params=(); #collect the template in here
#Parse the template string.
#This is bit complex, because templates can contain other templates. Just ignore these nested ones
my $prev_fragment="";
foreach (split /( {+ | }+ | \[+ | \]+ | = | [|] )/x, $string) { # cut into pieces by {, }, [, ], |, and =, allowing multiple brackets
if (scalar(@params)==0) { #not yet found template name
if (($prev_fragment =~ /{{/) && (/^\s*($name_in)\s*/)) {
push(@params, [$1]); #start capturing
}
} else {
if (/\[/) {
$depthsquare+= length;
} elsif (/\]/) {
$depthsquare-= length;
$depthsquare = 0 if $depthsquare < 0; #ignore extra "]" which might have been inserted by error
};
unless ($depthsquare > 0) { #don't count curly brackets if within a [hyperlink {{ like this | foo]
if (/\{/) {$depthcurly+= length}
elsif (/\}/) {$depthcurly-= length};
}
if ($depthcurly>0) { # in the depths of some bracketed expression - just bung everything in
$params[-1]->[-1] .= decode_entities($_)}
elsif ($depthcurly==0) { # we're in in the uppermost {{ bracketed expression }} - tread carefully
if ($depthsquare == 0) { #not inside a [hyperlink]
if (/[|]/) {
push(@params, [""])
} elsif (/=/) {
push(@{$params[-1]}, "") if ((scalar(@params) > 1) && #not the template name
(scalar(@{$params[-1]})==1)); #still on the LHS of an = sign
} else {
$params[-1]->[-1] .= decode_entities($_);}
} else {
$params[-1]->[-1] .= decode_entities($_) } #not a special char: add to the current param
} else { #depth < 0, end of template
last;
}
};
$prev_fragment = $_;
}
my $name_ref = shift(@params);
return () unless ($name_ref && ($name_out = $name_ref->[0]));
$name_out =~ s/^\s*//g;
$name_out =~ s/\s*$//g;
$name_out =~ s/^(\w)/\U$1/;
$name_out =~ s/_/ /g;
my $unnamed_pos=0;
my %params_out=();
foreach my $p (@params) {
#see http://meta.wikimedia.org/wiki/Help:Template#Mix_of_named_and_unnamed_parameters
if (scalar(@$p) == 1) {
$params_out{++$unnamed_pos}=$p->[0];
} else {
$p->[0] =~ s/^\s*//;
$p->[0] =~ s/\s*$//;
$p->[0] =~ s/_/ /g;
$p->[1] =~ s/^\s*//s;
$p->[1] =~ s/\s*$//s;
$params_out{$p->[0]} = $p->[1];
}
};
return ($name_out, \%params_out);
}
sub wikiauthor2html { #rather heuristic routine for making nice author text
local $_=shift;
s|\s+| |gs; #remove whitespace incl newlines
#some crude wikitext parsing
s|''''(.+?)''''|<strong><em>$1</em></strong>|gs;
s|'''(.+?)'''|<strong>$1</strong>|gs;
s|''(.+?)''|<em>$1</em>|gs;
#replace [[ foo | bar ]] with bar
s@(?<!\[) \[\[ ([^|]+) [|] (.+?) \]\] @<a href='http://commons.wikimedia.org/wiki/$1'>$2</a>@gx;
#replace [[User:foo bar]] with (WikiCommons: <a>User:foo_bar</a>)
s|(?<!\[) \[\[ (.+?) \]\] |(my $x=$1) =~ s/ /_/g; "(WikiCommons: <a href='http://commons.wikimedia.org/wiki/$x'>".$x."</a>)"|egx;
#replace [ http://foo bar baz ] with <a href='http://foo'>bar baz</a>
s|(?<!\[) \[ \s*([^\s\]]+)\s+ (.+?) \]|<a href="$1">$2</a>|gx;
#replace [ http://www.foo.com ] with <a href="http://www.foo.com">http://www.foo.com</a>
s|(?<!\[) \[\s*(.+?)\s*\]|<a href="$1">$1</a>|gx;
# Some specific templates
# ****TO DO **** should also cope with e.g. |Author={{de|Fotograf: [[User:Frank C. Müller|Frank C. Müller]]}} {{en|Photographer: [[User:Frank C. Müller|Frank C. Müller]]}} etc.
s/\{\{[Cc]reator:(.+?)\}\}/$1/;
s/\{\{[Uu]nknown\|[Aa]uthor\}\}/Unknown/;
#A common notation: replace e.g. {{User:John Smith/Author}} with Wikicommons user John Smith
s!\{\{(User:([^|/}]+)).*?\}\}!WikiCommons user <a href='http://commons.wikimedia.org/wiki/$1'>$2</a>!;
# Exceptions as implemented in http://commons.wikimedia.org/wiki/MediaWiki:Gadget-Stockphoto.js
s/\(talk\)//;
s/^Original\suploader\swas\s*//;
#might as well trim whitespace
s/^\s+//;
s/\s+$//;
return $_;
}
sub get_author_info {
my $wikitext = $_[0];
my $ref = $_[1];
#don't need extra spaces, newlines etc.
$wikitext =~ s/\s+/ /sg;
#Search wikitext for credit line
if ((my @t = split_wikitemplate('Credit line', $wikitext))[0]) {
$ref->{custom_credit_line} = ($t[1]->{Author} || $t[1]->{author} || "");
$ref->{custom_credit_line} .= " / ";
$ref->{custom_credit_line} .= ($t[1]->{Other} || $t[1]->{other} || "");
$ref->{custom_credit_line} .= " / ";
$ref->{custom_credit_line} .= ($t[1]->{License} || $t[1]->{license} || "");
} elsif ((@t = split_wikitemplate('Attribution', $wikitext))[0]) {
if ($t[1]->{Text}) {
$ref->{custom_credit_line} = $t[1]->{Text}}
elsif ($t[1]->{text}) {
$ref->{custom_credit_line} = $t[1]->{text}}
elsif ($t[1]->{2}) {
$ref->{attribution} = $t[1]->{2}}
};
# Look for authors in templates such as {{Information}}
#
#add in templates to recognise here. Perhaps cull extras from
# http://commons.wikimedia.org/wiki/Category:Infobox_templates:_based_on_Information_template
my %template_func=( #for each template name, return the attribution text from a parameter of the template
# an additional param to a cc-by template gives "the attribution string as specified by the licensor, that re-users are obliged to name" see https://commons.wikimedia.org/wiki/Template:Cc-by-2.0
'Cc-by-3.0'=> sub{$_[1]->{1} || $_[1]->{1}},
'Cc-by-2.5'=> sub{$_[1]->{1} || $_[1]->{1}},
'Cc-by-2.0'=> sub{$_[1]->{1} || $_[1]->{1}},
'Cc-by-sa-3.0'=>sub{$_[1]->{1} || $_[1]->{1}},
'Cc-by-sa-2.5'=>sub{$_[1]->{1} || $_[1]->{1}},
'Cc-by-sa-2.0'=>sub{$_[1]->{1} || $_[1]->{1}},
'Information'=> sub{$_[1]->{Author} || $_[1]->{author}},
'Specimen'=> sub{$_[1]->{Author} || $_[1]->{author}},
'Flickr'=> sub{$_[1]->{Photographer} || $_[1]->{photographer}},
'Artwork'=> sub{$_[1]->{Artist} || $_[1]->{artist}},
'Self' => sub{$_[1]->{Attribution} || $_[1]->{attribution} || $_[1]->{Author} || $_[1]->{author}},
'[\w\/\:\s_]*[Ii]nformation' => #getting desperate here - look for customised templates, hoping the end in the word "information" - yuk. An example is {{User:Aka/Information}}. But it won't catch, e.g. {{User:Biopics/infong}}. There should be a better way, e.g. see http://commons.wikimedia.org/wiki/Commons_talk:Machine-readable_data#List_of_templates_that_use_machine_readable_markup
sub{$_[1]->{author} || $_[1]->{Author} || $_[1]->{photographer} || $_[1]->{Photographer}},
'User:Biopics/infong' => sub{'Hans Hillewaert'},#example of a user-specified template
);
#now look for these templates in the wikitext
foreach my $t_name (keys %template_func) {
if (my @t = split_wikitemplate($t_name, $wikitext)) {
last if ($ref->{attribution} = $template_func{$t_name}->(@t))
}
};
};