# Source: wiki page "User:FlickreviewR 2/flickreviewr.py"
"""
Copyright (c) 2006-2009 Bryan Tong Minh
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
"""
import sys, os, time
import traceback
import urllib, urlparse
try:
from hashlib import sha1
except ImportError:
from sha import sha as sha1
import re
from botbase import FlickrBotBase
import flickr_tools
import mwclient
import flickr
import database
class Review(object):
    """Constant lookup tables used while reviewing and posting results."""

    # Review outcome key -> (text emitted in front of the template,
    # name of the template page that is written to the file page).
    Templates = {
        # Licence verdicts
        'passed': ('', 'User:FlickreviewR/reviewed-pass'),
        'failed': ('', 'User:FlickreviewR/reviewed-fail-recent'),
        'passed_changed': ('', 'User:FlickreviewR/reviewed-pass-change'),
        # The file could not be matched against Flickr
        'hash_not_matching': ('', 'User:FlickreviewR/reviewed-notmatching'),
        'size_not_found': ('', 'User:FlickreviewR/reviewed-error'),
        'no_flickr_link': ('', 'User:FlickreviewR/reviewed-nosource'),
        'flickr_not_found': ('', 'User:FlickreviewR/reviewed-notfound'),
        # Author based special cases
        'bad_author': ('', 'User:FlickreviewR/bad-author'),
        'library_of_congress': ('', 'User:FlickreviewR/library-of-congress'),
        'powerhouse_museum': ('', 'User:FlickreviewR/powerhouse-museum'),
    }

    # Components of a CC licence name that make the licence unacceptable.
    Failures = ['nd', 'nc']
class FlickreviewR(FlickrBotBase):
    def __init__(self):
        """ Set up the base bot, the connection cache, the template
        regexes and the list of blacklisted authors. """
        FlickrBotBase.__init__(self)

        # Connections to the Flickr static hosts, keyed by network location
        self.FlickrStatic = {}

        # Both patterns span lines and ignore case
        regex_flags = re.S | re.I
        # Matches an unreviewed tag or an earlier FlickreviewR result template
        self.review_template_regex = re.compile(
            r'(\{\{(?:flickrr?eview)?(?:User\:FlickreviewR\/reviewed\-.*?)?\}\})',
            regex_flags)
        # Captures the name of a cc-by / cc-by-sa template, version optional
        self.cc_license_template_regex = re.compile(
            r'\{\{(cc\-by(?:\-sa)?(?:\-[0-9]\.[0-9])?)(?:\|.*?)?\}\}',
            regex_flags)

        # Every line of the blacklist page that does not start with '#'
        # or whitespace is taken as a bad author entry
        blacklist_page = mwclient.page.Page(self.site, "User:FlickreviewR/bad-authors")
        self.bad_authors = re.findall(r"^[^#\s].+$", blacklist_page.edit(), re.M)
# Functions part of the review process
def run(self, max = -1):
    """ Main function.

    Walks the 'Flickr review needed' category, reviews every page in
    namespace 6 and posts the result. Stops once `max` pages have been
    handled successfully; the default of -1 never equals the counter,
    so it means no limit.
    """
    handled = 0
    for image in self.site.Categories['Flickr review needed']:
        if handled == max:
            return
        if image.namespace != 6:
            continue

        try:
            # Review
            review_result, data = self.review(image)
            report = u'* [[:%s]] %s %s %s' % (
                image, review_result, data['license'], data['data'])
            self.output(report, True)
            # Post
            self.post_result(image, review_result, data)
        except mwclient.InsufficientPermission:
            self.output(u'Insufficient permission editting %s' % image.name)
        except mwclient.APIError:
            error = sys.exc_info()[1]
            self.output(u'APIError editting %s' % image.name)
            self.output(''.join(traceback.format_exception_only(type(error), error)))
        except:
            # Anything unexpected is logged and then propagated
            self.output(u'Exception reviewing image %s' % image.name)
            raise
        else:
            # Store
            #self.store_author(data['author'])
            #self.store_result(image.name, review_result, data['license'], data['photo_id'],
            #    data['author'][0], data['uploaded'])
            # Only pages that were reviewed and posted count towards max
            handled = handled + 1
def review(self, image):
    """ Reviews an image.

    Returns a tuple (result, return_data). `result` is one of
    'no_flickr_link', 'flickr_not_found', 'size_not_found',
    'hash_not_matching', 'bad_author', 'library_of_congress',
    'powerhouse_museum' or the verdict of review_license().
    `return_data` is a dict with the keys 'author', 'photo_id',
    'license', 'data' and 'uploaded', plus 'size' once the image
    dimensions have been read.
    """
    return_data = {'author':('',''), 'photo_id':'', 'license':'', 'data':'', 'uploaded': False}

    data = image.edit()
    categories = list(image.categories(generator = False))
    imageinfo = image.imageinfo
    photo_id = flickr_tools.get_photo_id(data)

    # Without a link to a Flickr page there is nothing to compare against
    if not photo_id:
        return 'no_flickr_link', return_data
    return_data['photo_id'] = photo_id

    try:
        flickr_image = self.Flickr[photo_id]
    except KeyError:
        return 'flickr_not_found', return_data

    owner = flickr_image['owner']
    author = (owner['nsid'], owner['username'])
    return_data['author'] = author

    if not imageinfo:
        return 'size_not_found', return_data

    # Try reviewing by EXIF information
    exif_passed, exif_failed, exif_unverifiable = self.review_exif(
        imageinfo['metadata'], flickr_image)
    # Fewer than ten matching tags, or any mismatch, is not conclusive
    exif_inconclusive = len(exif_passed) < 10 or bool(exif_failed)

    size = (int(imageinfo['width']), int(imageinfo['height']))
    return_data['size'] = size

    # Do the SHA1 review when the EXIF comparison is inconclusive or
    # when a higher resolution version could be uploaded
    hash_review = None
    if exif_inconclusive or self.can_upload(size, flickr_image):
        flickr_file = self.get_flickr_image(size, int(imageinfo['size']), flickr_image)
        if not flickr_file:
            return 'size_not_found', return_data
        hash_review = self.review_sha1(imageinfo['sha1'], flickr_file)

    # Neither the hash nor the EXIF data confirm that the files match
    if exif_inconclusive and not hash_review:
        return 'hash_not_matching', return_data

    # Images match, review license
    if author[0] in self.bad_authors:
        return 'bad_author', return_data

    license_id = flickr_image['license']

    # "No known restrictions" special case for two specific accounts
    if license_id == u'7':
        institutions = {
            '8623220@N02': 'library_of_congress',
            '24785917@N03': 'powerhouse_museum',
        }
        if author[0] in institutions:
            return_data['license'] = 'pd'
            return institutions[author[0]], return_data

    license_check, license_name = self.review_license(
        data, (license_id, self.Flickr.licenses[license_id]), categories)
    return_data['license'] = license_name

    # Try substituting the license tag if necessary
    if license_check == 'passed_changed':
        return_data['data'] = self.try_license_subst(image, license_name)

    # If licensing is ok and the SHA1 review passed, try uploading hires
    if hash_review and license_check != 'failed':
        if self.upload_hires(image.name, size, flickr_image):
            return_data['uploaded'] = True

    return license_check, return_data
def review_exif(self, metadata, flickr_image):
    """ Compares exif information from Commons and Flickr.

    metadata     -- iterable of dicts with 'name' and 'value' keys, or
                    None when no metadata is available.
    flickr_image -- object whose `exif` attribute maps tag names to
                    values.

    Returns a tuple (passed, failed, unverifiable):
      passed       -- tags whose Commons value, converted to unicode,
                      equals the Flickr value
      failed       -- (tag, commons_value, flickr_value) tuples for
                      tags present on both sides with differing values
      unverifiable -- tags present on one side only
    """
    if metadata is None:
        commons_exif = {}
    else:
        commons_exif = dict([(item['name'], item['value']) for item in metadata])
    flickr_exif = flickr_image.exif

    passed, failed = [], []

    # Tags known to Commons only. Visiting them first and the Flickr
    # tags afterwards yields every tag exactly once, in a single pass,
    # without the quadratic count()/remove() de-duplication.
    unverifiable = [tag for tag in commons_exif if tag not in flickr_exif]

    for tag in flickr_exif.iterkeys():
        if tag not in commons_exif:
            unverifiable.append(tag)
        elif unicode(commons_exif[tag]) == flickr_exif[tag]:
            passed.append(tag)
        else:
            failed.append((tag, commons_exif[tag], flickr_exif[tag]))

    return passed, failed, unverifiable
def review_sha1(self, commons_hash, flickr_file):
    """ Compares a file with a certain SHA1 hash in base16.

    commons_hash -- hexadecimal SHA1 digest to compare against
    flickr_file  -- file-like object providing read() and close()

    Returns True when the digest of the file contents equals
    commons_hash. The file is always closed, also when reading fails.
    """
    flickr_hash = sha1()
    try:
        # Hash in 8 KiB chunks so the file never has to fit in memory
        chunk = flickr_file.read(8192)
        while chunk:
            flickr_hash.update(chunk)
            chunk = flickr_file.read(8192)
    finally:
        # Explicitly close files to avoid problems with httplib
        flickr_file.close()
    return commons_hash == flickr_hash.hexdigest()
def review_license(self, image_data, flickr_license_info, categories):
    """ Compares the licenses on Commons and Flickr.

    image_data          -- text of the file page (not used here)
    flickr_license_info -- tuple (flickr_license_id, flickr_license)
    categories          -- container of category titles of the file page

    Returns a tuple (verdict, license) where verdict is 'passed',
    'passed_changed' or 'failed'.
    """
    # Unpacked here rather than in the signature; callers still pass a
    # single tuple positionally.
    flickr_license_id, flickr_license = flickr_license_info

    # Flickr images with license 7 should have a license template
    # which categorizes in Category:The Commons
    if flickr_license_id == u'7':
        if u'Category:The Commons' in categories:
            return 'passed', flickr_license
        return 'passed_changed', flickr_license

    # United States government work, no specific category
    if flickr_license_id == u'8':
        return 'passed', flickr_license

    # Unconditionally pass images with OTRS permission confirmed
    if u'Category:Items with OTRS permission confirmed' in categories:
        return 'passed', 'otrs'

    # Verify CC licenses
    components = flickr_license.split('-')
    if components[0] != 'cc':
        # A non CC license
        return 'failed', flickr_license

    # Check every component after 'cc'. The last one is included so a
    # name without a trailing version (e.g. 'cc-by-nd') is rejected too.
    for component in components[1:]:
        if component in Review.Failures:
            # Image has one of the ND/NC licenses
            return 'failed', flickr_license

    # Image is under a free CC license
    if u'Category:' + flickr_license.upper() in categories:
        # License on Flickr and Commons are the same
        return 'passed', flickr_license
    # License on Flickr and Commons differ
    return 'passed_changed', flickr_license
# Various helper functions to get something from Flickr
def get_static_connection(self, location):
    """ Return the connection to the Flickr static farm host named by
    location.netloc, creating and caching it on first use. """
    host = location.netloc
    try:
        return self.FlickrStatic[host]
    except KeyError:
        # First request for this host: open a new persistent connection
        connection = mwclient.http.HTTPPersistentConnection(host)
        self.FlickrStatic[host] = connection
        return connection
def get_flickr_image(self, commons_image_size, file_size, flickr_image):
    """ Return a file object containing the Flickr image
    matching the size of the Commons image.

    Only Flickr sizes with the same dimensions as the Commons image are
    considered. For those a HEAD request is made and the file is fetched
    when the status is 200 and the Content-Length equals file_size.
    Returns None when no candidate qualifies.
    """
    for flickr_image_size, url in flickr_image.sizes:
        if commons_image_size == flickr_image_size:
            # Only netloc and path are used, which ignores the query string
            parsed = urlparse.urlparse(url)
            connection = self.get_static_connection(parsed)

            # Perform a HEAD request to obtain the Content-Length
            status, headers = connection.head(parsed.netloc, parsed.path)
            lengths = [int(value) for key, value in headers
                       if key.lower() == 'content-length']
            if lengths:
                content_length = lengths[0]
            else:
                content_length = 0

            if status == 200 and content_length == file_size:
                # Get the file
                return connection.get(parsed.netloc, parsed.path)
            self.output(u'File sizes do not match. Commons: %s; Flickr: %s' % (file_size, content_length))
# Upload functions
def upload_hires(self, name, commons_size, flickr_image):
    """ Upload a hires version of a flickr image.

    Returns the (width, height) of the uploaded version, or None when
    nothing was uploaded.
    """
    # Only upload if it is sure that the current image is a thumbnail
    flickr_sizes = [size for size, loc in flickr_image.sizes]
    if commons_size not in flickr_sizes:
        self.output(u'No matching image size found for %s' %name)
        return None

    # Find the highest resolution image, judged by width
    widest = max(flickr_image.sizes, key = lambda entry: entry[0][0])
    widest_size = widest[0]
    if widest_size[0] <= commons_size[0]:
        # Commons already has the largest version
        return None

    hires_image = flickr_tools.download_temporary(flickr_image)

    # Perform rotation if necessary
    rotation = flickr_tools.get_rotation(flickr_image)
    if rotation:
        self.output(u'Rotated %s %s degrees' % (name, rotation))
        hires_image = flickr_tools.rotate(hires_image,
            rotation, self.config['jpegtran'])

    # Upload under the title without its namespace prefix
    title = name.split(':', 1)[-1]
    self.site.upload(hires_image, title,
        'Replacing image by its original image from Flickr', ignore = True)
    return widest_size
def can_upload(self, commons_size, flickr_image):
    """Only upload in case the current version matches some version from Flickr!

    commons_size -- (width, height) of the Commons image
    flickr_image -- object whose `sizes` attribute yields
                    ((width, height), location) pairs

    Returns True when commons_size equals one of the Flickr sizes and
    Flickr offers a wider version, False otherwise.
    """
    flickr_sizes = [size for size, loc in flickr_image.sizes]
    if commons_size not in flickr_sizes:
        return False
    # Compare widths with widths. Comparing against the whole
    # ((width, height), location) entry was always unequal.
    return commons_size[0] < max([size[0] for size in flickr_sizes])
# Store and post functions
def store_result(self, image, review_result, license, photo_id, author_nsid, upload):
    """ Push the result to the database.

    image         -- page title including its namespace prefix
    review_result -- review outcome key
    license       -- license name; empty values are stored as None
    photo_id      -- Flickr photo id; empty values are stored as 0
    author_nsid   -- Flickr author id; empty values are stored as None
    upload        -- whether a hires version was uploaded, stored as int
    """
    # Init defaults
    if not photo_id: photo_id = 0
    if not author_nsid: author_nsid = None
    if not license: license = None
    # Public domain special cases are recorded as a plain pass
    if license == 'pd': review_result = 'passed'

    # Strip the namespace up to the first colon instead of a fixed six
    # characters, so 'File:' titles are handled like 'Image:' titles.
    title = image[image.find(':') + 1:].replace(' ', '_')

    self.database.insert('review',
        ('rv_timestamp', 'rv_image', 'rv_result', 'rv_license', 'rv_photo_id', 'rv_nsid', 'rv_uploaded'),
        (self.database.now(), title, review_result, license,
        photo_id, author_nsid, int(upload)))
    self.database.commit()
def post_result(self, page, result, data):
    """ Push the result to Commons.

    page   -- page object providing `text` and save(text, summary=...)
    result -- review outcome, a key of Review.Templates
    data   -- dict with the keys 'author' (nsid, username), 'photo_id',
              'license' and 'data'
    """
    t_data = u'%s{{%s|%s|http://flickr.com/photos/%s/%s|%s|%s|%s}}' % \
        (Review.Templates[result] + (data['author'][1], data['author'][0],
        data['photo_id'], time.strftime('%Y-%m-%d %H:%M:%S'), data['license'], data['data']))

    # Use a callable replacement so the template text is inserted
    # literally. As a replacement string, backslashes or group references
    # in e.g. the Flickr username would be interpreted by re.sub.
    text = self.review_template_regex.sub(lambda match: t_data, page.text)

    summary = '[[User:FlickreviewR 2|FlickreviewR 2]]: %s %s' % (result, data['license'])
    page.save(text, summary = summary)
def try_license_subst(self, image, license):
    """ Replace the CC license template on the page by `license`.

    The first template matched by cc_license_template_regex determines
    the old license name. Every matched template carrying exactly that
    name gets its name replaced; image.text is updated in memory only.

    Returns the old license name, or '' when no template was found.
    """
    text = image.text
    match = self.cc_license_template_regex.search(text)
    if not match:
        return ''
    old_license = match.group(1)

    def swap_name(found):
        # Leave templates with a different name untouched
        if found.group(1) != old_license:
            return found.group(0)
        template = found.group(0)
        offset = found.start(0)
        return (template[:found.start(1) - offset] + license +
                template[found.end(1) - offset:])

    # Rewrite template names only. A plain str.replace would also hit
    # the same substring inside other templates or in running text.
    image.text = self.cc_license_template_regex.sub(swap_name, text)
    return old_license
if __name__ == '__main__':
    import sys

    # The bot is constructed unconditionally; the review loop only runs
    # when -r is given on the command line.
    bot = FlickreviewR()
    run_requested = '-r' in sys.argv
    if run_requested:
        bot.run()