User:JarektUploadBot/Upload Open Access Images.py
Jump to navigation
Jump to search
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
A program to upload Open Access images to Wikimedia Commons based on a CSV file
'''
import sys, os.path, glob, re, hashlib, base64, StringIO
import wikipedia, upload, csv, urllib2, string, catlib
# File description page; every %(name)s placeholder is filled from the
# metadata dict built in processFile().  category00 is the flag slot,
# category01..category19 hold the categories listed in the CSV row.
_DESCRIPTION_TEMPLATE = u"""{{subst:User:JarektUploadBot/Open Access Subst
|description = %(description)s
|date = %(date)s
|source = %(source)s
|author = %(author)s
|permission = %(permission)s
|other_versions =
|journal = %(journal)s
|category00 = %(category00)s
|category01 = %(category01)s
|category02 = %(category02)s
|category03 = %(category03)s
|category04 = %(category04)s
|category05 = %(category05)s
|category06 = %(category06)s
|category07 = %(category07)s
|category08 = %(category08)s
|category09 = %(category09)s
|category10 = %(category10)s
|category11 = %(category11)s
|category12 = %(category12)s
|category13 = %(category13)s
|category14 = %(category14)s
|category15 = %(category15)s
|category16 = %(category16)s
|category17 = %(category17)s
|category18 = %(category18)s
|category19 = %(category19)s
}}
"""

# Number of category slots (category01..category19) the template offers.
_CATEGORY_SLOTS = 19


def _readMetadata(row, enc='utf-8'):
    """Decode the CSV cells used by the description template to unicode.

    row -- mapping from CSV column name to the raw (byte string) cell.
    enc -- encoding used to decode the cells.

    Returns a dict keyed by template field name.  A column for which
    row.get() returns None is decoded as an empty string instead of
    raising TypeError inside unicode().
    """
    columns = (
        (u'description', u'en'),
        (u'date', u'date'),
        (u'filename', u'file location'),
        (u'author', u'author'),
        (u'source', u'source'),
        (u'categories', u'categories'),
        (u'journal', u'journal'),
        (u'permission', u'permission'),
    )
    metadata = {}
    for field, column in columns:
        metadata[field] = unicode(row.get(column) or '', enc)
    return metadata


def _assignCategories(metadata, targetSite):
    """Fill metadata['category00'] .. metadata['category19'] in place.

    metadata['categories'] holds one category per line.  Blank lines (for
    example a trailing newline in the CSV cell) are ignored so they neither
    occupy a slot nor trigger an existence check on an empty title.
    category00 is set to a marker text when at least one listed category
    does not exist on targetSite.  Categories past the 19th are still
    stored and checked, but the template has no slot that displays them.
    """
    categories = []
    for line in metadata['categories'].split('\n'):
        name = line.strip()
        if name:
            categories.append(name)

    metadata['category00'] = ''
    for slot in range(1, _CATEGORY_SLOTS + 1):
        metadata['category%02i' % slot] = ''

    slot = 0
    for name in categories:
        slot = slot + 1
        metadata['category%02i' % slot] = name
        if not catlib.Category(targetSite, name).exists():
            metadata['category00'] = 'File with non-existing species category'


def _normalizeDate(date):
    """Zero-pad the month and day of every YYYY-M-D date found in date.

    The trailing (?!\\d) keeps the day group from matching only the first
    digit of a two-digit day, which would turn 2010-12-25 into 2010-12-025.
    Text that does not contain a full year-month-day date is left as is.
    """
    def pad(match):
        return u'%s-%02d-%02d' % (match.group(1),
                                  int(match.group(2)),
                                  int(match.group(3)))
    return re.sub(r'(\d{4})-(\d{1,2})-(\d{1,2})(?!\d)', pad, date)


def _sha1OfUrl(url):
    """Download url and return the SHA1 of its content as upper-case hex."""
    response = urllib2.urlopen(url)
    try:
        data = response.read()
    finally:
        # Release the connection even when the download fails midway.
        response.close()
    return base64.b16encode(hashlib.sha1(data).digest())


def processFile(row):
    """Upload the image described by one CSV row to Commons.

    row -- mapping from CSV column name to raw cell value; the columns read
           are 'en', 'date', 'file location', 'author', 'source',
           'categories', 'journal' and 'permission'.

    The file description is built from the row, the target file name is the
    last path component of 'file location' (prefixed with the first category
    when there is one), and the upload is skipped when the site already
    reports a file with the same SHA1 hash.  Returns None in all cases.
    """
    # Read line of metadata
    metadata = _readMetadata(row)
    if not metadata['filename']:
        wikipedia.output(u'Skipping row without a file location')
        return

    targetSite = wikipedia.getSite('commons', 'commons')
    _assignCategories(metadata, targetSite)

    # [-1] instead of [1]: a location without any '/' is used unchanged
    # rather than raising IndexError.
    fname = metadata['filename'].rsplit('/', 1)[-1]
    if metadata['category01']:
        fname = metadata['category01'] + ' - ' + fname

    # Format file description
    metadata['date'] = _normalizeDate(metadata['date'])
    description = _DESCRIPTION_TEMPLATE % metadata
    wikipedia.output(fname)
    wikipedia.output(description)

    # We don't want to upload duplicates
    # So take the photo, calculate the SHA1 hash and ask the mediawiki api
    # for a list of duplicates.
    sha1 = _sha1OfUrl(metadata['filename'])
    duplicates = targetSite.getFilesFromAnHash(sha1)
    if duplicates:
        duplicate = duplicates.pop()
        wikipedia.output(u'Duplicate image: %s = %s'
                         % (duplicate, metadata['filename']))
        return

    # upload file to Commons
    bot = upload.UploadRobot(url=metadata['filename'],
                             description=description,
                             useFilename=fname,
                             keepFilename=True,
                             verifyDescription=False,
                             targetSite=targetSite)
    bot.run()  # Comment out this line to have a dry run
def main(args):
    """Run processFile() on every row of the CSV file.

    args -- command line arguments; currently unused, the CSV file name is
            fixed below.

    Exits through sys.exit() with the file name and line number when the
    csv module reports a parsing error.
    """
    csvFile = 'Open Access1.csv'
    maxFiles = -1  # Set a number of files to upload, or set to -1 for all files
    # The with block closes the CSV file on every exit path, including the
    # early return and sys.exit() below.
    with open(csvFile, "rb") as csvHandle:
        reader = csv.DictReader(csvHandle, dialect='excel', delimiter=',')
        processed = 0
        try:
            for row in reader:
                #print 'Row read successfully:', row
                processFile(row)
                processed = processed + 1
                # processed is always >= 1 here, so -1 never matches and
                # every row is handled.
                if processed == maxFiles:
                    return
        except csv.Error as e:
            sys.exit('file %s, line %d: %s' % (csvFile, reader.line_num, e))
if __name__ == "__main__":
    # Script entry point: hand everything after the script name to main().
    cliArgs = sys.argv[1:]
    try:
        main(cliArgs)
    finally:
        # Printed on every exit path, including when main() raises or
        # calls sys.exit().
        print("All done!")