User:JarektUploadBot/FixWGAMetadataInfo.py
Jump to navigation
Jump to search
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Update the file descriptions of images from the Web Gallery of Art website
(http://www.wga.hu/) that were manually uploaded to Wikimedia Commons.
This code is for files using the {{Information}} template.
'''
import sys, os.path, glob, re, hashlib, base64, StringIO
sys.path.append("C:/Programs/pywikipedia/")
sys.path.append("../")
import wikipedia, upload, csv, urllib2, string, catlib
# CSV columns that are decoded from UTF-8 byte strings into unicode text.
_TEXT_FIELDS = (
    'CREATOR', 'DATE', 'TITLE', 'DIMENSIONS', 'TECHNIQUE', 'FILENAME',
    'FILENAME2', 'FORM', 'TYPE', 'SCHOOL', 'TIMELINE', 'INSTITUTION',
    'CREATOR_CAT', 'INSTITUTION_CAT', 'TITLE_CAT', 'DATE_CAT', 'URL',
    'IMAGEURL', 'FRAME',
)

# Wikitext of the new file description; filled in from the metadata dict
# with the % operator.
_ARTWORK_TEMPLATE = u"""{{Artwork
|artist = %(CREATOR)s
|title = {{en|%(TITLE)s}}
|description =
|date = %(DATE)s
|medium = %(TECHNIQUE)s
|dimensions = %(DIMENSIONS)s
|institution = %(INSTITUTION)s
|location = <!-- location within the gallery/museum -->
|references =
|object history =
|credit line =
|inscriptions =
|notes =
|accession number =
|source = {{WGA link|ID=%(IMG_ID)s|pic-url=%(IMAGEURL)s|info-url=%(URL)s}}
|permission = {{PD-art|PD-old-100}}
|other_versions =
}}
%(FRAME)s
{{WGA tag|%(FORM)s|%(TYPE)s|%(SCHOOL)s|%(TIMELINE)s}}
[[Category:WGA form: %(FORM)s]]
[[Category:WGA type: %(TYPE)s]]
[[Category:WGA School: %(SCHOOL)s]]
[[Category:WGA time period: %(TIMELINE)s]]
"""

# Category name only (stops at "]" or at the "|" that starts a sort key).
_CATEGORY_NAME_RE = re.compile(r"\[\[[Cc]ategory:([^\]\|]*)")
# Category name together with its sort key, if any (stops at "]").
_CATEGORY_LINK_RE = re.compile(r"\[\[[Cc]ategory:([^\]]*)")

_RULE = "================================================================================"


def _read_metadata(row):
    """Decode one CSV row into the dict used to fill the wikitext templates."""
    enc = 'utf-8'
    metadata = {'IMG_ID': int(row.get(u'IMG_ID'))}
    for field in _TEXT_FIELDS:
        # csv.DictReader leaves columns missing from a short row as None;
        # treat those as empty text instead of failing inside unicode().
        metadata[field] = unicode(row.get(field) or '', enc)
    metadata['FORM1'] = metadata['FORM'].capitalize()
    metadata['FILENAME2'] = 'File:' + metadata['FILENAME2'].strip()
    metadata['CREATOR'] = metadata['CREATOR'].strip()
    metadata['INSTITUTION'] = metadata['INSTITUTION'].strip()
    return metadata


def _report(heading, text):
    """Print a banner with the given heading followed by the text."""
    wikipedia.output(_RULE)
    wikipedia.output(heading)
    wikipedia.output(_RULE)
    wikipedia.output(text)


def _parent_category_text(site, desc):
    """Return the page's categories and their supercategories, one title per line."""
    titles = []
    for match in _CATEGORY_NAME_RE.finditer(desc):
        name = match.group(1).strip()
        if not name:
            # "[[Category:]]" names no category, so there is nothing to look up.
            continue
        # The title handed to catlib carries no trailing newline; the
        # newline is only a separator in the returned text.
        title = u'Category:%s' % name
        titles.append(title)
        for parent in catlib.Category(site, title).supercategoriesList():
            titles.append(parent.title())
    return u''.join(title + u'\n' for title in titles)


def _existing_category_links(desc):
    """Return the page's own category links (sort keys kept), each one once."""
    seen = set()
    links = []
    for match in _CATEGORY_LINK_RE.finditer(desc):
        target = match.group(1)
        key = target.strip()
        # Compare whole targets: a substring test against the text built so
        # far would drop e.g. "Rembrandt" after "Paintings by Rembrandt".
        if not key or key in seen:
            continue
        seen.add(key)
        links.append(u'[[Category:%s]]\n' % target)
    return u''.join(links)


def _suggested_category_links(metadata, parentCats):
    """Return wikitext for the CSV-suggested categories not yet covered."""
    # The "in" tests below are deliberately substring tests against the
    # text of existing and parent category titles.  An empty suggestion is
    # a substring of anything, so blank CSV cells never add a category.
    links = []
    for field in ('TITLE_CAT', 'DATE_CAT'):
        cat = metadata[field]
        if cat not in parentCats:
            links.append(u'\n[[Category:' + cat + u']]')
    # Only the last space-separated word of the creator category is
    # checked against the existing categories.
    last_word = metadata['CREATOR_CAT'].rpartition(' ')[2]
    if last_word not in parentCats:
        links.append(u'\n{{subst:#ifexist:Category:%(FORM1)ss by %(CREATOR_CAT)s|[[Category:%(FORM1)ss by %(CREATOR_CAT)s]]|[[Category:%(CREATOR_CAT)s]]}}' % metadata)
    if metadata['INSTITUTION_CAT'] not in parentCats:
        links.append(u'\n{{subst:#ifexist:Category:%(FORM1)ss in the %(INSTITUTION_CAT)s|[[Category:%(FORM1)ss in the %(INSTITUTION_CAT)s]]|[[Category:%(INSTITUTION_CAT)s]]}}' % metadata)
    return u''.join(links)


def _tidy_category_links(cats):
    """Normalise spacing in the category wikitext and drop empty links."""
    cats = cats.replace(" |", "|")
    cats = cats.replace("| ", "|")
    # NOTE(review): both arguments are identical as written, so this call
    # changes nothing; it probably targeted a spacing variant -- confirm.
    cats = cats.replace("Paintings by ", "Paintings by ")
    # Strip the space after the colon first so that "[[Category: ]]" is
    # also recognised as an empty link by the two removals below.
    cats = cats.replace("[[Category: ", "[[Category:")
    cats = cats.replace("[[Category:]]\n", "")
    cats = cats.replace("[[Category:]]", "")
    return cats


def processFile(row):
    """Rewrite the description of one Commons file page from a CSV row.

    A new {{Artwork}} description is built from the row, the current page
    text is fetched, its category links are carried over, and the
    categories suggested by the row are appended unless they already show
    up among the page's categories or their supercategories.  The result
    replaces the whole page text.

    row -- mapping of CSV column name to UTF-8 encoded byte string (as
           produced by csv.DictReader under Python 2).  IMG_ID must be
           convertible with int(); text columns that are None are treated
           as empty.

    Returns None.  Exceptions raised by int() for a bad IMG_ID and by the
    wikipedia/catlib calls are not caught here.
    """
    metadata = _read_metadata(row)
    description = _ARTWORK_TEMPLATE % metadata

    # Fetch the current page text.
    targetSite = wikipedia.getSite('commons', 'commons')
    page = wikipedia.Page(targetSite, metadata['FILENAME2'])
    desc = page.get()
    _report("=== BEFORE =====================================================================", desc)

    # Titles of the file's categories and of the parents of those.
    parentCats = _parent_category_text(targetSite, desc)
    _report("=== parent =====================================================================", parentCats)

    cats = _existing_category_links(desc)
    cats += _suggested_category_links(metadata, parentCats)
    description = description + _tidy_category_links(cats)

    description = description.replace("{{}}", "")
    # Collapse every run of newlines; a single "\n\n" -> "\n" pass would
    # still leave a blank line wherever three or more newlines met.
    description = re.sub(r"\n{2,}", u"\n", description)
    _report("=== AFTER ======================================================================", description)
    # NOTE(review): the trailing None/False are positional options of
    # Page.put (presumably watchArticle and minorEdit) -- confirm.
    page.put(description, "Update metadata and categories", None, False)
def main(args):
    """Run processFile on every row of the batch CSV file.

    args -- command-line arguments; accepted for the caller's convenience
            but not used.

    The file name is fixed ('WGA_infobatch1b.csv', relative to the current
    directory).  A csv.Error while reading terminates the program through
    sys.exit with the file name, line number and error text.
    """
    csvFile = 'WGA_infobatch1b.csv'
    # Binary mode is what the Python 2 csv module expects.  The with-block
    # makes sure the handle is closed even when a row fails.
    with open(csvFile, "rb") as csv_handle:
        reader = csv.DictReader(csv_handle, dialect='excel', delimiter=',')
        try:
            for row in reader:
                processFile(row)
        except csv.Error as e:
            sys.exit('file %s, line %d: %s' % (csvFile, reader.line_num, e))
# Script entry point: process the batch, then always report completion.
if __name__ == "__main__":
    try:
        main(sys.argv[1:])
    finally:
        # Tell the framework this bot has stopped talking to the wiki so
        # its throttle entry is released, whether or not main() succeeded.
        wikipedia.stopme()
        # Parenthesised form prints the same text under Python 2.
        print("All done!")