# User:FlickreviewR 2/flickreviewr.py
#
# From Wikimedia Commons, the free media repository
# Jump to navigation Jump to search
"""
 Copyright (c) 2006-2009 Bryan Tong Minh
 
 Permission is hereby granted, free of charge, to any person
 obtaining a copy of this software and associated documentation
 files (the "Software"), to deal in the Software without
 restriction, including without limitation the rights to use,
 copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the
 Software is furnished to do so, subject to the following
 conditions:
 
 The above copyright notice and this permission notice shall be
 included in all copies or substantial portions of the Software.
 
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 OTHER DEALINGS IN THE SOFTWARE.
"""
import sys, os, time
import traceback
import urllib, urlparse
try:
	from hashlib import sha1
except ImportError:
	from sha import sha as sha1

import re

from botbase import FlickrBotBase
import flickr_tools
import mwclient
import flickr
import database

class Review(object):
	""" Constants shared by the review process. """

	# Maps a review result name to a (prefix, template name) pair. Both
	# elements are interpolated, in that order, into the wikitext that
	# FlickreviewR.post_result() writes; every prefix is currently empty.
	Templates = {
		'passed': ('', 'User:FlickreviewR/reviewed-pass'),
		'failed': ('', 'User:FlickreviewR/reviewed-fail-recent'),
		'passed_changed': ('', 'User:FlickreviewR/reviewed-pass-change'),
		'hash_not_matching': ('', 'User:FlickreviewR/reviewed-notmatching'),
		'size_not_found': ('', 'User:FlickreviewR/reviewed-error'),
		'no_flickr_link': ('', 'User:FlickreviewR/reviewed-nosource'),
		'flickr_not_found': ('', 'User:FlickreviewR/reviewed-notfound'),
		'bad_author': ('', 'User:FlickreviewR/bad-author'),
		'library_of_congress': ('', 'User:FlickreviewR/library-of-congress'),
		'powerhouse_museum': ('', 'User:FlickreviewR/powerhouse-museum'),
	}

	# Elements of a CC license name (ND / NC) that make
	# FlickreviewR.review_license() return 'failed'.
	Failures = ['nd', 'nc']
	
		

class FlickreviewR(FlickrBotBase):
	def __init__(self):
		""" Set up the connection cache, the regexes and the bad author list. """
		FlickrBotBase.__init__(self)

		# Connections to the Flickr static farm, keyed by host name
		self.FlickrStatic = {}

		regex_flags = re.S | re.I
		# Matches the review template that post_result() replaces
		self.review_template_regex = re.compile(
			r'(\{\{(?:flickrr?eview)?(?:User\:FlickreviewR\/reviewed\-.*?)?\}\})',
			regex_flags)
		# Matches a cc-by / cc-by-sa license template; group 1 is the license name
		self.cc_license_template_regex = re.compile(
			r'\{\{(cc\-by(?:\-sa)?(?:\-[0-9]\.[0-9])?)(?:\|.*?)?\}\}',
			regex_flags)

		# Lines of the on-wiki list that do not start with '#' or whitespace
		bad_authors_page = mwclient.page.Page(self.site, "User:FlickreviewR/bad-authors")
		self.bad_authors = re.findall(r"^[^#\s].+$", bad_authors_page.edit(), re.M)
	
	
	# Functions part of the review process
	def run(self, max = -1):
		""" Main function: review the pages in Category:Flickr review needed.

		Stops once `max` pages have been reviewed and posted. The default
		of -1 never equals the counter, so by default there is no limit.

		InsufficientPermission and APIError are logged and the page is
		skipped without being counted. Any other exception is logged and
		re-raised.
		"""

		reviewed = 0
		for image in self.site.Categories['Flickr review needed']:
			if reviewed == max:
				return

			# Only pages in namespace 6 (MediaWiki's file namespace) are reviewed
			if image.namespace != 6:
				continue

			try:
				# Review
				review_result, data = self.review(image)
				self.output(u'* [[:%s]] %s %s %s' % (image, review_result, data['license'], data['data']), True)
				# Post
				self.post_result(image, review_result, data)
			except mwclient.InsufficientPermission:
				self.output(u'Insufficient permission editting %s' % image.name)
				continue
			except mwclient.APIError:
				# Fetch the exception being handled before doing anything else
				exc_type, exc_value = sys.exc_info()[:2]
				self.output(u'APIError editting %s' % image.name)
				self.output(''.join(traceback.format_exception_only(exc_type, exc_value)))
				continue
			except:
				# Log which image broke the run, then let the error propagate
				self.output(u'Exception reviewing image %s' % image.name)
				raise

			# Store
			#self.store_author(data['author'])
			#self.store_result(image.name, review_result, data['license'], data['photo_id'], 
			#	data['author'][0], data['uploaded'])

			reviewed += 1
	
	def review(self, image):
		""" Reviews an image.

		Returns a tuple (result, data):
			result	name of the review outcome: 'no_flickr_link',
				'flickr_not_found', 'size_not_found', 'hash_not_matching',
				'bad_author', 'library_of_congress', 'powerhouse_museum'
				or whatever review_license() reports
			data	dict with the keys 'author' ((nsid, username)),
				'photo_id', 'license', 'data' and 'uploaded'; 'size' is
				added once the Commons image size is known
		"""

		return_data = {'author':('',''), 'photo_id':'', 'license':'', 'data':'', 'uploaded': False}
		data = image.edit()
		categories = list(image.categories(generator = False))
		imageinfo = image.imageinfo
		photo_id = flickr_tools.get_photo_id(data)

		# Without a link to a Flickr page there is nothing to compare against
		if not photo_id:
			return 'no_flickr_link', return_data

		return_data['photo_id'] = photo_id
		try:
			flickr_image = self.Flickr[photo_id]
		except KeyError:
			return 'flickr_not_found', return_data
		owner = flickr_image['owner']
		author = (owner['nsid'], owner['username'])
		return_data['author'] = author

		if not imageinfo:
			return 'size_not_found', return_data

		# Try reviewing by EXIF information
		exif_passed, exif_failed, exif_unverifiable = self.review_exif(
			imageinfo['metadata'], flickr_image)
		# EXIF alone is not enough with fewer than 10 matching tags or any mismatch
		exif_inconclusive = len(exif_passed) < 10 or exif_failed

		size = (int(imageinfo['width']), int(imageinfo['height']))
		return_data['size'] = size

		# Do SHA1 review if:
		##	- The EXIF information is too limited to be certain
		##	- The EXIF information does not match
		##	- Uploading a high resolution version is possible
		hash_review = None
		if exif_inconclusive or self.can_upload(size, flickr_image):
			flickr_file = self.get_flickr_image(size, int(imageinfo['size']), flickr_image)
			if not flickr_file:
				return 'size_not_found', return_data

			hash_review = self.review_sha1(imageinfo['sha1'], flickr_file)
			# If the SHA1 review does not match and the EXIF comparison was not sufficient:
			if not hash_review and exif_inconclusive:
				return 'hash_not_matching', return_data

		# Images match, review license
		nsid = author[0]
		if nsid in self.bad_authors:
			return 'bad_author', return_data

		# "No known restrictions" special case
		flickr_license_id = flickr_image['license']
		if flickr_license_id == u'7':
			special_accounts = {
				'8623220@N02': 'library_of_congress',
				'24785917@N03': 'powerhouse_museum',
			}
			if nsid in special_accounts:
				return_data['license'] = 'pd'
				return special_accounts[nsid], return_data

		flickr_license = (flickr_license_id, self.Flickr.licenses[flickr_license_id])
		license_check, license_name = self.review_license(data, flickr_license, categories)
		return_data['license'] = license_name

		# Try substituting the license tag if necessary
		if license_check == 'passed_changed':
			return_data['data'] = self.try_license_subst(image, license_name)

		# If licensing is ok and the SHA1 review passed, try uploading hires
		if license_check != 'failed' and hash_review:
			if self.upload_hires(image.name, size, flickr_image):
				return_data['uploaded'] = True

		return license_check, return_data
	
	def review_exif(self, metadata, flickr_image):
		""" Compares exif information from Commons and Flickr.

		metadata is the Commons metadata: either None or an iterable of
		dicts with 'name' and 'value' keys. The Flickr values are read
		from flickr_image.exif.

		Returns a tuple containing (similar, dissimilar, missing) exif tags:
			similar		names of tags whose values are equal
			dissimilar	(tag, commons_value, flickr_value) tuples
			missing		names of tags present on one side only
		"""
		if metadata is None:
			metadata = {}
		else:
			metadata = dict([(i['name'], i['value']) for i in metadata])
		flickr_exif = flickr_image.exif

		# Prepare a list of all tags and remove duplicates in a single pass.
		# Walking backwards keeps the last occurrence of each tag, which is
		# the order the previous count()/remove() loop produced.
		all_tags = list(metadata)
		all_tags.extend(flickr_exif.iterkeys())
		seen = set()
		tags = []
		for tag in reversed(all_tags):
			if tag not in seen:
				seen.add(tag)
				tags.append(tag)
		tags.reverse()

		# Init return value
		passed, failed, unverifiable = [], [], []

		for tag in tags:
			if tag in metadata and tag in flickr_exif:
				# Commons values are converted to unicode before comparing
				if unicode(metadata[tag]) == flickr_exif[tag]:
					passed.append(tag)
				else:
					failed.append((tag, metadata[tag], flickr_exif[tag]))
			else:
				unverifiable.append(tag)

		return passed, failed, unverifiable
		
	def review_sha1(self, commons_hash, flickr_file):
		""" Compares a file with a certain SHA1 hash in base16.

		flickr_file is read in 8192 byte chunks until it is exhausted and
		is always closed afterwards, even when reading fails.

		Returns True if the hex digest of the file equals commons_hash.
		"""

		flickr_hash = sha1()

		try:
			while True:
				chunk = flickr_file.read(8192)
				if not chunk:
					break
				flickr_hash.update(chunk)
		finally:
			# Explicitly close files to avoid problems with httplib
			flickr_file.close()

		return commons_hash == flickr_hash.hexdigest()
		
	def review_license(self, image_data, (flickr_license_id, flickr_license), categories):
		""" Compares the licenses on Commons and Flickr. """
		
		# Flickr images with license 7 should have a license template
		# which categorizes in Category:The Commons
		if flickr_license_id == u'7':
			if u'Category:The Commons' in categories:
				return 'passed', flickr_license
			else:
				return 'passed_changed', flickr_license
		# United States government work, no specific category
		if flickr_license_id == u'8':
			return 'passed', flickr_license
			
			
		# Unconditionally pass images with OTRS permission confirmed	
		if u'Category:Items with OTRS permission confirmed' in categories:
			return 'passed', 'otrs'
		
		# Verify CC licenses
		flickr_license = flickr_license.split('-')
		if flickr_license[0] == 'cc':
			for item in flickr_license[1:-1]:
				if item in Review.Failures:
					# Image has one of the ND/NC licenses
					return 'failed', '-'.join(flickr_license)
			# Image is under a free CC license
			if u'Category:' + u'-'.join(flickr_license).upper() in categories:
				# License on Flickr and Commons are the same
				return 'passed', '-'.join(flickr_license)
			else:
				# License on Flickr and Commons differ
				return 'passed_changed', '-'.join(flickr_license)
		else:
			# A non CC license
			return 'failed', '-'.join(flickr_license)
		
	
	# Various helper functions to get something from Flickr

	def get_static_connection(self, location):
		""" Obtain a connection to the Flickr static farm.

		location is a parsed URL; connections are cached per
		location.netloc in self.FlickrStatic and reused on later calls.
		"""

		host = location.netloc
		if host in self.FlickrStatic:
			return self.FlickrStatic[host]

		# Open a new persistent connection to the Flickr static farm
		connection = mwclient.http.HTTPPersistentConnection(host)
		self.FlickrStatic[host] = connection
		return self.FlickrStatic[host]
		
	
	def get_flickr_image(self, commons_image_size, file_size, flickr_image):
		""" Return a file object containing the Flickr image
		    matching the size of the Commons image.

		    Only a Flickr size with the same dimensions whose HEAD request
		    answers 200 with a Content-Length equal to file_size is fetched.
		    Returns None if there is no such size. """
		for flickr_image_size, url in flickr_image.sizes:
			if commons_image_size == flickr_image_size:
				# Found an image with the same image size
				# Perform a HEAD request to obtain the Content-Length
				location = urlparse.urlparse(url) # Ignore query string
				connection = self.get_static_connection(location)
				status, headers = connection.head(location.netloc, location.path)

				# First Content-Length header, or 0 if there is none
				lengths = [int(value) for key, value in headers if key.lower() == 'content-length']
				lengths.append(0)
				content_length = lengths[0]

				if status == 200 and content_length == file_size:
					# Get the file
					return connection.get(location.netloc, location.path)
				self.output(u'File sizes do not match. Commons: %s; Flickr: %s' % (file_size, content_length))
		return None
	
	# Upload functions
	
	def upload_hires(self, name, commons_size, flickr_image):
		""" Upload a hires version of a flickr image.

		Returns the size of the uploaded version, or None when nothing
		was uploaded.
		"""

		# Only upload if it is sure that the current image is a thumbnail
		known_sizes = [size for size, loc in flickr_image.sizes]
		if commons_size not in known_sizes:
			self.output(u'No matching image size found for %s' %name)
			return

		# Find the highest resolution image, judged by width
		def width_of(entry):
			return entry[0][0]
		hires = max(flickr_image.sizes, key = width_of)
		hires_size = hires[0]

		# Nothing to do unless it is larger than the Commons image
		if not hires_size[0] > commons_size[0]:
			return

		hires_image = flickr_tools.download_temporary(flickr_image)

		# Perform rotation if necessary
		rotation = flickr_tools.get_rotation(flickr_image)
		if rotation:
			self.output(u'Rotated %s %s degrees' % (name, rotation))
			hires_image = flickr_tools.rotate(hires_image,
					rotation, self.config['jpegtran'])

		# Upload under the title without its namespace prefix
		title = name.split(':', 1)[-1]
		self.site.upload(hires_image, title,
			'Replacing image by its original image from Flickr', ignore = True)
		return hires_size
		
	def can_upload(self, commons_size, flickr_image):
		"""Only upload in case the current version matches some version from Flickr!

		commons_size is the (width, height) of the Commons image and
		flickr_image.sizes yields (size, location) pairs.

		Returns True if commons_size is one of the Flickr sizes and a
		wider Flickr version exists, False otherwise.
		"""
		flickr_sizes = [size for size, loc in flickr_image.sizes]
		if commons_size not in flickr_sizes:
			return False
		# Compare widths. The previous code compared the width with a whole
		# (size, location) entry, which is never equal, so it always
		# reported that a larger version was available.
		widest = max([size[0] for size in flickr_sizes])
		return commons_size[0] < widest
	
	
	# Store and post functions
	
	def store_result(self, image, review_result, license, photo_id, author_nsid, upload):
		""" Push the result to the database.

		Inserts one row into the 'review' table and commits. image is the
		full page title; it is stored without its namespace prefix and
		with spaces replaced by underscores. Empty values are stored as
		0 (photo_id) or None (author_nsid, license). upload is stored as
		an int.
		"""

		# Init defaults
		if not photo_id: photo_id = 0
		if not author_nsid: author_nsid = None
		if not license: license = None
		# Public domain results are recorded as a plain pass
		if license == 'pd': review_result = 'passed'

		# Strip the namespace up to the first colon instead of assuming a
		# six character 'Image:' prefix, so 'File:' titles are kept intact
		title = image[image.find(':') + 1:].replace(' ', '_')

		self.database.insert('review',
			('rv_timestamp', 'rv_image', 'rv_result', 'rv_license', 'rv_photo_id', 'rv_nsid', 'rv_uploaded'),
			(self.database.now(), title, review_result, license,
				photo_id, author_nsid, int(upload)))
		self.database.commit()
		
	def post_result(self, page, result, data):
		""" Push the result to Commons.

		Replaces every match of review_template_regex in page.text with
		the template for `result` from Review.Templates and saves the page.
		The template gets the author's username, a Flickr photo URL built
		from the author's nsid and the photo id, the current time, the
		license and the extra data from `data`.
		"""

		t_data = u'%s{{%s|%s|http://flickr.com/photos/%s/%s|%s|%s|%s}}' % \
			(Review.Templates[result] + (data['author'][1], data['author'][0], 
				data['photo_id'], time.strftime('%Y-%m-%d %H:%M:%S'), data['license'], data['data']))
		# Pass the replacement through a function so that backslashes or
		# group references in the Flickr supplied values (e.g. the username)
		# are inserted literally instead of being read as a regex template
		text = self.review_template_regex.sub(lambda match: t_data, page.text)
		summary = '[[User:FlickreviewR 2|FlickreviewR 2]]: %s %s' % (result, data['license'])
		page.save(text, summary = summary)
		
	def try_license_subst(self, image, license):
		""" Replace the CC license named in image.text by `license`.

		Looks for the first template matched by cc_license_template_regex.
		Returns the old license name, or '' if no such template is found,
		in which case image.text is left untouched.
		"""
		found = self.cc_license_template_regex.search(image.text)
		if not found:
			return ''

		old_license = found.group(1)
		# NOTE(review): replace() substitutes every occurrence of the old
		# name in the text, not only the matched template - confirm intended
		image.text = image.text.replace(old_license, license)
		return old_license

if __name__ == '__main__':
	import sys

	# The bot is always constructed; reviewing only starts with -r
	fr = FlickreviewR()
	run_requested = '-r' in sys.argv
	if run_requested:
		fr.run()