User:HYanWong/imagenames2credits.pl

This script makes credit lines for images on Wikimedia commons from the Commons database dumps. It requires 3 or 4 file names:
A file containing a list of image names, one per line. Anything up to the first tab is used, and lines with a tab or newline at the start are simply passed through as blank. (There is some trickery with the file name: if -, use stdin. If name starts and ends in a "/", treat this as a regular expression to match against file names).
A commonswiki xml dump file (e.g. unpacked from http://dumps.wikimedia.org/commonswiki/latest/commonswiki-latest-pages-articles.xml.bz2
A commonswiki categorylinks sql table (e.g. unpacked from http://dumps.wikimedia.org/commonswiki/latest/commonswiki-latest-categorylinks.sql.gz . You will need to pre-process this file using my categorylinks parser.
Optionally, an output filename, otherwise dump to stdout
Depending on the command-line switches, the following lines are output, in utf format
The image name
A short text-only version of the attribution line
A longer text-only version of the attribution line
An html version of the attribution line
Code

#!/usr/bin/perl
use strict;
use warnings;
use Encode;
use URI::Escape;
use HTML::Entities;
use HTML::TagFilter;

# N.B. Images can be downloaded by grabbing the first name in the output file and piping it to a download program, e.g. curl
# something like the following unix command should do it
# perl -ne 'print "-O\nurl=\"$1\"\n" if (/(^[^\t]+).*/)' OutputFile | curl -K -


#  Notes:
#  The commons dump file is huge, and it will take several minutes to parse it. You could try slimming it down 
# by 20% by removing non file pages, plus category information, with a one-liner something like
# perl -ie "print $/='<title>'; while (<>) { if (/^File:|^Image:/) {s/\[\[\:?Category.*?\]\]//g; print;}}" hugefile.xml 

die ("The name of a file containing a list of images is required as a first argument (or - to take from stdin)") unless ($ARGV[0]);
die("The name of an unpacked wikicommons xml dump file is required as the second argument. Get one from http://dumps.wikimedia.org/commonswiki/latest/commonswiki-latest-pages-articles-multistream.xml.bz2\n") unless ($ARGV[1]);
die("The name of a processed wikispecies xml dump file is required as the third argument. Get one from
http://dumps.wikimedia.org/specieswiki/latest/......sql.bz2\n") unless ($ARGV[2]);

my %image;
my $match_image; #ref to function to match 
my $match_re = undef;
my $lines_in;

if ($ARGV[0] =~ m|^/(.*)/$|) { #use a RE to match images, rather than taking names from a file
	$match_re=qr/$1/;
	$match_image = sub {($_[0] =~ /$match_re/)};
} else {
	if ($ARGV[0] eq "-") {
		*IMG_NAMES = *STDIN;
		binmode IMG_NAMES, ":utf8";
	} else {
		open(IMG_NAMES, "<:utf8", $ARGV[0])
			or die "cannot open ".$ARGV[0]." for reading: $!";
	}
	while(<IMG_NAMES>) {
		m/^([^\n\t]*)/; #name is anything up to the first tab or newline
		#caution - this could be a blank line
		$image{$1} = {'SortBy'=>$.} if ($1 ne '');
	};
	$lines_in = $.;
	$match_image = sub {exists $_[1]->{$_[0]}};
	close(IMG_NAMES);
}

open(COMMONS_DUMP, "<:utf8", $ARGV[1])
	or die "cannot open ".$ARGV[1]." for reading: $!";

my $sqlfile = $ARGV[2];
if ($sqlfile =~ s/sql$//i) {
	my $txtfile = $sqlfile.".txt";
	if (-e $txtfile) {
		open(CATEGORIES, "<:utf8", $txtfile)
			or die "cannot open ".$txtfile." for reading: $!";
	} else {
		print "I need to create a text summary of your sql file.\n";
		print "This should take about 10 mins\n";
		die "Not done yet!";
	}
} elsif ($sqlfile =~ /\.txt/) {
	open(CATEGORIES, "<:utf8", $sqlfile)
		or die "cannot open ".$sqlfile." for reading: $!";
}



#search for details to construct attribution line
#similar to http://commons.wikimedia.org/wiki/MediaWiki:Gadget-Stockphoto.js
my %ID; #reverse mapping from ID -> image_name, to help searching category file, which is in ID order 
my $match_list = \%image;
my %redirects = ();
select(STDERR); #for outputting progress
$|=1; #turn on autoflushing
print "Searching WikiCommons dump (".$ARGV[1]."). Printing a dot for every 100 thousand pages searched: ";

do {
	local $/ = '<title>';
	seek COMMONS_DUMP,$. = 0,0; #make sure we are at the start, resetting $. to 0.	
	while(<COMMONS_DUMP>) {
		if (m|^(.+?)</title>|) {
			unless ($. % 100000) {$. % 1000000 ? print "." : print "|"};
			my $name = wikiref_to_urlref($1);
			if ($match_image->($name, $match_list)) {
				$image{$name}={SortBy=>$name} unless (exists $image{$name});

				#first look for redirects: update %images if so
				if (m{<redirect title="(?:File|Image):(.+?)"\s*/>}) {
					my $new_name = wikiref_to_urlref(decode_entities($1));
					$redirects{$new_name} = $image{$new_name} = $image{$name};
					delete $image{$name};
					print "*";
					next;
				}
				
				#otherwise look for proper author detail
				if (m|<id>(\d+)</id>|g) {
					$image{$name}->{ID}=$1;
					$ID{$1}=$name;
				} else {
					$image{$name}->{ID}="Problem: <id> tag not found in WikiCommons dump";
				}
				
				#now we just want to look through the text - cut directly to there. Saves time and odd problems
				#as e.g. with http://commons.wikimedia.org/wiki/File:AZ_gray_squirrel.jpg
				m|(<text.*?</text>)|s;
				get_author_info($1, $image{$name});
			}
		}
	}

	print " done!\n";	

	delete @redirects{grep {exists $redirects{$_}->{ID}} keys %redirects};
	if (%redirects) {
		$match_list = \%redirects; #next time through the do loop, only look at the redirects
		print "Redirects found, and some of their targets missing on most recent pass. Doing another pass: ";		
	}

} while (%redirects); #don't need to worry about tied hashes - see http://www.perlmonks.org/?node_id=173677

close(COMMONS_DUMP);

print "Searching category list (extracted from sql file at ".$ARGV[2].") for licences: ";
foreach my $k (sort {$a <=> $b} keys(%ID)) { #do this in order of ID number
	$image{$ID{$k}}->{licence}='NONE';
	#go to that line in the file.
	while (my $line = <CATEGORIES>) {
		unless ($. % 100000) {$. % 1000000 ? print "." : print "|"};
    	if ($. == $k) {
    		my @cats = split(/(?<!\\)(?:\\\\)*'/,$line); 
    		#Split on non escaped ' marks, i.e. 'foo','bar' iterates over foo, comma, bar.
    		#This avoids having to cope with commans inside quotes. To explain the RE, see
    		# http://stackoverflow.com/questions/56554/what-is-the-proper-regular-expression-for-an-unescaped-backslash-before-a-charac
			foreach my $cat (map { $cats[$_*2+1] } 0..int(@cats/2)-1) { #use every other $cat (forget the intermediate comma).
				if (LicenceAcceptable($image{$ID{$k}}->{licence}) < LicenceAcceptable($cat)) {
			    	$image{$ID{$k}}->{licence} = $cat; #just pick the licence with the highest number
				}
			}
			last;
    	}
   	}
}
	
close(CATEGORIES);
print " done!\n";

#construct HTML, long text, and short text credit lines for images. 
# WARNING - HTML links could point anywhere - even to dangerous sites.
my $filter_all =  HTML::TagFilter->new(allow=>{}); #delete everything.
my $filter_most = new HTML::TagFilter; #defaults allow text markup, links, & images
$filter_most->deny_tags({ img => { all => [] }});

my $copyright = Encode::decode_utf8("©");
foreach my $name (keys %image) {
	my $url = 'http://commons.wikimedia.org/wiki/File:'.$name;
	my $im = $image{$name};
	if ($im->{custom_credit_line}) {
		$im->{html_credit} = "<a href='$url'>".$im->{custom_credit_line}."</a>";
		$im->{long_credit} = $im->{custom_credit_line};
		$im->{short_credit} = $im->{custom_credit_line};
		next;
	};
	
	unless (defined $im->{licence}) {
		$im->{licence} = -1; #now use licence simply to flag as file missing
		print $im->{html_credit}=$im->{long_credit}=$im->{short_credit}="Cannot find 'File:$name' in the Commons xml dump. Look online?";
		print "\n";
		next;
	}
	
	my $lv = LicenceAcceptable($im->{licence});

	unless ($lv>0) {
		print $im->{html_credit}=$im->{long_credit}=$im->{short_credit}="Cannot use $url: licence $im->{licence} not acceptable.";
		print "\n";
		$im->{licence} = 0; #now use licence simply to flag as unacceptable
		next;
	}
		
	#the following cases all construct a longer credit string
	if (! exists $im->{attribution}) { #no appropriate templates found
		$im->{html_credit} = "Author unclear (<a href='$url'>see here</a>)";
		$im->{long_credit} = "Author unclear, see wiki URL for details";
		$im->{short_credit} = "Author unclear, see File: ";
	} elsif (! defined ($im->{attribution})) { #matching template, but no author field
		$im->{html_credit} = "Author not defined (<a href='$url'>see here</a>)";
		$im->{long_credit} = "Author not defined, see wiki URL for details";
		$im->{short_credit} = "Author unclear, see File: ";
	} elsif (! $im->{attribution}) { #author field in Information template is blank.
		$im->{html_credit} = "Author unspecified (<a href='$url'>see here</a>)";
		$im->{long_credit} = "Author unspecified, see wiki URL for details";
		$im->{short_credit} = "Author unclear, see File: ";
	} else {
		if ($im->{attribution} =~ /upload/i) {
			$im->{html_credit} = "Image uploaded & possibly $copyright ";
			$im->{long_credit} = "Image uploaded & possibly $copyright ";
			$im->{short_credit} = "?$copyright";
		} else {
			$im->{html_credit} = "Image $copyright ";
			$im->{long_credit} = "Image $copyright ";
			$im->{short_credit} = "$copyright";
		}
		my $html = wikiauthor2html($im->{attribution});

		print "\n\n\n".$name." --".$im->{attribution}."--\n\n\n" unless $html;

		#Construct credit lines
		$im->{html_credit} .= $filter_most->filter($html);
		$im->{long_credit} .= decode_entities($filter_all->filter($html)); #the filter encodes html entities by default
		$im->{long_credit} =~ s/^(.{100}).*/$1\x{2026}/; #only use 1st 100 chars, add ellipsis if truncated
		
		$im->{short_credit} .= decode_entities($filter_all->filter($html));
		$im->{short_credit} =~ s/^(.{50}).*/$1\x{2026}/; #only use 1st 50 chars, add ellipsis if truncated
	}
				
	$im->{html_credit} .= " / ".$filter_all->filter(LicenceCat2HTML($im->{licence}) || $im->{licence});
	$im->{html_credit} .= " / from <a href='$url'>File:".$filter_all->filter($name)."</a> on <a href='http://commons.wikimedia.org/'>WikiCommons</a>";
	$im->{long_credit} .= " / ".(LicenceCat2Text($im->{licence}) || $im->{licence});
	$im->{long_credit} .= " / from $url";
			
	$im->{short_credit} .= "/".(LicenceCat2Txt($im->{licence}) || $im->{licence});
	$im->{short_credit} .= "/from File:$name on WikiCommons";
	$im->{licence} = $lv; #now use licence to flag acceptability
	next;
}


#Now print the details

if ($ARGV[3] && $ARGV[3] ne "-") {
	open(OUTPUT, ">:utf8", $ARGV[3])
		or die "cannot open ".$ARGV[3]." for (over)writing: $!";
} else {
	*OUTPUT = *STDOUT;
	binmode OUTPUT, ":utf8";
}

my @key_list;
if (defined $match_re) {#output lines alphabetically by file name
	@key_list = sort keys %image;
} else {				#output lines in the same order as they went in
	my %reverse_map = map {$image{$_}->{SortBy} => $_} keys %image;
	@key_list = map {$reverse_map{$_} || ""} 1..$lines_in;
}



foreach my $name (@key_list) {
	unless ($name eq '') {
		print OUTPUT $name if ($image{$name}->{licence} > 0 ); #don't print the fn if licence not accepted
		print OUTPUT "\t".$image{$name}->{licence}; #print the licence value (from LicenceAcceptable())
		print OUTPUT "\t".$image{$name}->{html_credit};
		print OUTPUT "\t".$image{$name}->{long_credit};
		print OUTPUT "\t".$image{$name}->{short_credit};	
	};
	print OUTPUT "\n";
};


sub LicenceAcceptable 
{ #if acceptable, return a positive number. If an image has multiple licences use the one with the highest number.
	for(shift) { #this should match against wikicommons Category: tags, such as [[Category:GFDL]]
		return 0 if ($_ eq 'NONE');		
		return 0 if (/^GFDL$/);
		return 2.99 if (/^CC-BY-SA-1.0\+$/); #any future BY-SA version
		return 2+$1/100 if (/^CC-BY-SA-?([\d\.]*)/);
		
		#values above 10 are for licences which do not place limits 
		# on the licence you yourself need to use when reusing these pictures
		return 10 if (/^Attribution$/);
		return 10 if (/^Copyrighted_free_use$/);
		return 10+$1/10 if (/^CC-BY-?([\d\.]*)/);
		return 11 if (/^FAL$/);
		return 12 if (/^CC-PD-Mark/);
		return 12 if (/^PD[-_]/);
		return 13 if (/^CC-Zero$/);

		return -1; #if not a licence category
	};
	# *** TO DO *** do we need to look for categories defined by the templates 'Anonymous[ _]work', 'Anonymous-EU'
};

# *** TO DO *** fill out all the licence categories here. There's a list at 
# https://commons.wikimedia.org/wiki/Commons:Copyright_tags
sub LicenceCat2HTML {
	for(shift) {
		if (/^GFDL$/)         {return '<a href="https://www.gnu.org/copyleft/fdl.html">GFDL</a>'} 
		if (/^FAL$/)          {return '<a href="http://artlibre.org/licence/lal/en">GFDL</a>'}
		if (/^CC-BY-SA-3.0/)  {return '<a href="https://creativecommons.org/licenses/by-sa/3.0/">CC-BY-SA-3.0</a>'}
		if (/^CC-BY-2.5/)     {return '<a href="https://creativecommons.org/licenses/by-sa/2.5/">CC-BY-2.5</a>'}
		if (/^CC-BY-2.0/)     {return '<a href="https://creativecommons.org/licenses/by-sa/2.0/">CC-BY-2.0</a>'}
		if (/^CC-BY-3.0/)     {return '<a href="https://creativecommons.org/licenses/by/3.0/">CC-BY-3.0</a>'}
		if (/^CC-BY-2.5/)     {return '<a href="https://creativecommons.org/licenses/by/2.5/">CC-BY-2.5</a>'}
		if (/^CC-BY-2.0/)     {return '<a href="https://creativecommons.org/licenses/by/2.0/">CC-BY-2.0</a>'}
		if (/^CC-Zero$/)      {return '<a href="https://creativecommons.org/publicdomain/zero/1.0/deed.en">CC Public Domain</a>'}
		if (/^Attribution$/)  {return 'Attribution req.'} 
		if (/^Copyrighted_free_use$/) {return '<a href="https://commons.wikimedia.org/wiki/Template:Copyrighted_free_use">Free for any use: no restriction</a>'}
		if (/^PD_US_Government$/) {return '<a href="http://commons.wikimedia.org/wiki/Template:PD-USGov">Public domain</a>'}
		if (/^PD[-_]/)        {return 'Public domain'}; #could give more specific PD- text above, e.g. 
		return '';
	};
};

sub LicenceCat2Text { #long plain text form
	for(shift) {
		if (/^GFDL$/)        {return 'Gnu Free Documentation Licence (POSSIBLY INCOMPATIBLE WITH DISTRIBUTION UNLESS FULL GFDL TERMS PRINTED HERE)'};
		if (/^FAL$/)         {return 'Free Art Licence (http://artlibre.org/licence/lal/en)'}
		if (/^CC-BY-SA-3.0/) {return 'CreativeCommons BY-SA 3.0 (https://creativecommons.org/licenses/by-sa/3.0)'}
		if (/^CC-BY-SA-2.5/) {return 'CreativeCommons BY-SA 2.5 (https://creativecommons.org/licenses/by-sa/2.5)'}
		if (/^CC-BY-SA-2.0/) {return 'CreativeCommons BY-SA 2.0 (https://creativecommons.org/licenses/by-sa/2.0)'}
		if (/^CC-BY-3.0/)    {return 'CreativeCommons BY 3.0 (https://creativecommons.org/licenses/by/3.0)'}
		if (/^CC-BY-2.5/)    {return 'CreativeCommons BY 2.5 (https://creativecommons.org/licenses/by/2.5)'}
		if (/^CC-BY-2.0/)    {return 'CreativeCommons BY 2.0 (https://creativecommons.org/licenses/by/2.0)'}
		if (/^CC-Zero$/)     {return '<a href="https://creativecommons.org/publicdomain/zero/1.0/deed.en">CC Public Domain</a>'}
		if (/^Attribution$/) {return 'Attribution required, see link'}
		if (/^Copyrighted_free_use$/) {return 'Free for any use: no restriction'}
		if (/^PD_US_Government$/) {return 'Public domain: US government work'}
		if (/^PD[-_]/)       {return 'Public domain'}
		return '';
	};
};

sub LicenceCat2Txt { #short text form. Note that some licences might require a full link.
#to do - this needs legal checking - what is the shortest form allowed?
	for(shift) {
		if (/^GFDL$/)         {return 'GFDL'}
		if (/^FAL$/)          {return 'Free Art Licence'}
		if (/^CC-BY-3.0/)    {return 'CC BY 3.0'}
		if (/^Attribution$/)  {return 'Attribution req.'}
		if (/^Copyrighted_free_use$/) {return 'Free for any use'}
		if (/^PD[-_]/)      {return 'Pub. domain'}
		return '';
	};
}



sub wikiref_to_urlref {
	my $name=uri_unescape($_[0]);
	$name =~ s/^\s*(?:File|Image):\s*//; #trim leading whitespace & start
	$name =~ s/[\s\p{FORMAT}]*$//; #trim trailing whitespace & unicode formatting characters
	$name =~ s/^(\w)/uc($1)/e;
	$name =~ s/ /_/g;
	return($name);
}

sub split_wikitemplate 
{
#   first returned item is the template name, 
#   second is the parameters passed to the template in a hashref whose hash contains 
#   the named template vars and the unnamed vars with keys {1}, {2}, {3} etc.
	my $name_out;
	my $name_in=shift;
	$name_in =~ s/[_ ]/[_ ]/g; #either space or underscore should be subbed to match the other
	$name_in =~ s/^(\w)/[\U$1\L$1]/; #allow first letter to be either UC or LC
	my $string = shift;
	my $depthcurly=0;
	my $depthsquare=0;
	my @params=(); #collect the template in here
	#Parse the template string. 
	#This is bit complex, because templates can contain other templates. Just ignore these nested ones
	my $prev_fragment="";
	foreach (split /( {+ | }+ | \[+ | \]+ | = | [|] )/x, $string) { # cut into pieces by {, }, [, ], |, and =, allowing multiple brackets
   		if (scalar(@params)==0) { #not yet found template name
   			if (($prev_fragment =~ /{{/) && (/^\s*($name_in)\s*/)) {
   				push(@params, [$1]); 	#start capturing
   			}   		
   		} else {
 			if    (/\[/)      {
 				$depthsquare+= length;
	   		} elsif (/\]/)      {
 				$depthsquare-= length; 
 				$depthsquare = 0 if $depthsquare < 0; #ignore extra "]" which might have been inserted by error
   			};
			unless ($depthsquare > 0) { #don't count curly brackets if within a [hyperlink {{ like this | foo]
	 			if    (/\{/)      {$depthcurly+= length}
 				elsif (/\}/)      {$depthcurly-= length};
			}
 
 			if ($depthcurly>0) { # in the depths of some bracketed expression - just bung everything in
 				$params[-1]->[-1] .= decode_entities($_)} 
	 		elsif ($depthcurly==0) { # we're in in the uppermost {{ bracketed expression }} - tread carefully
 				if ($depthsquare == 0) { #not inside a [hyperlink]
 					if (/[|]/) {
 						push(@params, [""])
 					} elsif (/=/) {
 						push(@{$params[-1]}, "") if ((scalar(@params) > 1) &&      #not the template name
 			    									 (scalar(@{$params[-1]})==1)); #still on the LHS of an = sign
  			    	} else {
  			    		$params[-1]->[-1] .= decode_entities($_);}
 			    } else {
 			    	$params[-1]->[-1] .= decode_entities($_) }  #not a special char: add to the current param
	 		} else { #depth < 0, end of template
	 			last;
			}
   		};
   		$prev_fragment = $_;
	}

	my $name_ref = shift(@params);
	return () unless ($name_ref && ($name_out = $name_ref->[0]));
	
	$name_out =~ s/^\s*//g;
	$name_out =~ s/\s*$//g;
	$name_out =~ s/^(\w)/\U$1/;
	$name_out =~ s/_/ /g;
	
	my $unnamed_pos=0;
	my %params_out=();
	foreach my $p (@params) {
		#see http://meta.wikimedia.org/wiki/Help:Template#Mix_of_named_and_unnamed_parameters
		if (scalar(@$p) == 1) {
			$params_out{++$unnamed_pos}=$p->[0];
		} else {
			$p->[0] =~ s/^\s*//;
			$p->[0] =~ s/\s*$//;
			$p->[0] =~ s/_/ /g;
			$p->[1] =~ s/^\s*//s;
			$p->[1] =~ s/\s*$//s;
			$params_out{$p->[0]} = $p->[1];
		}
	};
	return ($name_out, \%params_out);
}

sub wikiauthor2html { #rather heuristic routine for making nice author text
	local $_=shift;
	s|\s+| |gs; #remove whitespace incl newlines	
		
	#some crude wikitext parsing
	s|''''(.+?)''''|<strong><em>$1</em></strong>|gs;
	s|'''(.+?)'''|<strong>$1</strong>|gs;
	s|''(.+?)''|<em>$1</em>|gs;
	
	#replace [[ foo | bar ]] with bar
	s@(?<!\[) \[\[  ([^|]+)  [|]  (.+?) \]\] @<a href='http://commons.wikimedia.org/wiki/$1'>$2</a>@gx; 
	
	#replace [[User:foo bar]] with (WikiCommons: <a>User:foo_bar</a>)
	s|(?<!\[)  \[\[  (.+?) \]\] |(my $x=$1) =~ s/ /_/g; "(WikiCommons: <a href='http://commons.wikimedia.org/wiki/$x'>".$x."</a>)"|egx;
	
	#replace [ http://foo  bar baz ] with <a href='http://foo'>bar baz</a>
	s|(?<!\[)  \[ \s*([^\s\]]+)\s+  (.+?)  \]|<a href="$1">$2</a>|gx; 
				
	#replace [ http://www.foo.com ] with <a href="http://www.foo.com">http://www.foo.com</a>
	s|(?<!\[)  \[\s*(.+?)\s*\]|<a href="$1">$1</a>|gx; 

	# Some specific templates
	# ****TO DO **** should also cope with e.g. |Author={{de|Fotograf: [[User:Frank C. Müller|Frank C. Müller]]}} {{en|Photographer: [[User:Frank C. Müller|Frank C. Müller]]}} etc.
	s/\{\{[Cc]reator:(.+?)\}\}/$1/;
	s/\{\{[Uu]nknown\|[Aa]uthor\}\}/Unknown/;
	#A common notation: replace e.g. {{User:John Smith/Author}} with Wikicommons user John Smith
	s!\{\{(User:([^|/}]+)).*?\}\}!WikiCommons user <a href='http://commons.wikimedia.org/wiki/$1'>$2</a>!; 		

	# Exceptions as implemented in http://commons.wikimedia.org/wiki/MediaWiki:Gadget-Stockphoto.js
	s/\(talk\)//;
	s/^Original\suploader\swas\s*//; 
	
	#might as well trim whitespace
	s/^\s+//;
	s/\s+$//;		
	return $_;
}

sub get_author_info {
	my $wikitext = $_[0];
	my $ref = $_[1];
				
	#don't need extra spaces, newlines etc.
	$wikitext =~ s/\s+/ /sg; 

    #Search wikitext for credit line
	if ((my @t = split_wikitemplate('Credit line', $wikitext))[0]) {
		$ref->{custom_credit_line}  = ($t[1]->{Author} || $t[1]->{author} || "");
		$ref->{custom_credit_line} .= " / ";
		$ref->{custom_credit_line} .= ($t[1]->{Other} || $t[1]->{other} || "");
		$ref->{custom_credit_line} .= " / "; 
		$ref->{custom_credit_line} .= ($t[1]->{License} || $t[1]->{license} || "");
	} elsif ((@t = split_wikitemplate('Attribution', $wikitext))[0]) {
		if ($t[1]->{Text}) {
			$ref->{custom_credit_line}  = $t[1]->{Text}}
		elsif ($t[1]->{text}) {
			$ref->{custom_credit_line}  = $t[1]->{text}}
		elsif ($t[1]->{2}) {
			$ref->{attribution} = $t[1]->{2}}
	};

	# Look for authors in templates such as {{Information}}
	#
	#add in templates to recognise here. Perhaps cull extras from 
	# http://commons.wikimedia.org/wiki/Category:Infobox_templates:_based_on_Information_template

	my %template_func=( #for each template name, return the attribution text from a parameter of the template
	
	# an additional param to a cc-by template gives "the attribution string as specified by the licensor, that re-users are obliged to name" see https://commons.wikimedia.org/wiki/Template:Cc-by-2.0
		'Cc-by-3.0'=>   sub{$_[1]->{1} || $_[1]->{1}},
		'Cc-by-2.5'=>   sub{$_[1]->{1} || $_[1]->{1}},
		'Cc-by-2.0'=>   sub{$_[1]->{1} || $_[1]->{1}},
		'Cc-by-sa-3.0'=>sub{$_[1]->{1} || $_[1]->{1}},
		'Cc-by-sa-2.5'=>sub{$_[1]->{1} || $_[1]->{1}},
		'Cc-by-sa-2.0'=>sub{$_[1]->{1} || $_[1]->{1}},

		'Information'=> sub{$_[1]->{Author} || $_[1]->{author}},
		'Specimen'=>    sub{$_[1]->{Author} || $_[1]->{author}},
		'Flickr'=>      sub{$_[1]->{Photographer} || $_[1]->{photographer}},
		'Artwork'=>     sub{$_[1]->{Artist} || $_[1]->{artist}},
		'Self' =>       sub{$_[1]->{Attribution} || $_[1]->{attribution} || $_[1]->{Author} || $_[1]->{author}},
		'[\w\/\:\s_]*[Ii]nformation' => #getting desperate here - look for customised templates, hoping the end in the word "information" - yuk. An example is {{User:Aka/Information}}. But it won't catch, e.g. {{User:Biopics/infong}}. There should be a better way, e.g. see http://commons.wikimedia.org/wiki/Commons_talk:Machine-readable_data#List_of_templates_that_use_machine_readable_markup
			sub{$_[1]->{author} || $_[1]->{Author} || $_[1]->{photographer} || $_[1]->{Photographer}},
		'User:Biopics/infong' => sub{'Hans Hillewaert'},#example of a user-specified template
	);
	
	#now look for these templates in the wikitext
	foreach my $t_name (keys %template_func) {
		if (my @t = split_wikitemplate($t_name, $wikitext)) {
			last if ($ref->{attribution} = $template_func{$t_name}->(@t))
		}
	};
};
User:HYanWong/imagenames2credits.pl

Code

Navigation menu

Search