User:JarektUploadBot/FixWGAMetadataArt.py
Jump to navigation
Jump to search
<source lang="python">
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
A program to update the metadata and categories on the Wikimedia Commons description pages of images from the Web Gallery of Art website at http://www.wga.hu/
"""
import sys, os.path, glob, re, hashlib, base64, StringIO, time sys.path.append("C:/Programs/pywikipedia/") sys.path.append("../") import wikipedia, upload, csv, urllib2, string, catlib
def processFile(row):
    """Rewrite the Commons description page of the file named in one CSV row.

    The row's columns are decoded into a metadata dict, the current
    description of ``File:<FILENAME2>`` is fetched from Wikimedia Commons,
    its ``{{Artwork`` opener is replaced by a substituted
    ``User:Jarekt/WGA`` template call filled from the metadata, category text
    is appended, and the page is saved.  The function then sleeps 30 seconds.

    Args:
        row: Mapping from CSV column name to value, as produced by
            ``csv.DictReader``.  Every value except ``IMG_ID`` is decoded as
            UTF-8; ``IMG_ID`` must be convertible with ``int()``.

    Returns:
        None.

    Exceptions raised while decoding the row or by the ``wikipedia`` /
    ``catlib`` calls are not handled here and propagate to the caller.
    """
    rule = "================================================================================"

    # --- Read line of metadata -------------------------------------------
    enc = 'utf-8'
    text_fields = (
        'CREATOR', 'DATE', 'TITLE', 'DIMENSIONS', 'TECHNIQUE', 'FILENAME',
        'FILENAME2', 'FORM', 'TYPE', 'SCHOOL', 'TIMELINE', 'INSTITUTION',
        'CREATOR_CAT', 'INSTITUTION_CAT', 'TITLE_CAT', 'DATE_CAT', 'URL',
        'IMAGEURL', 'FRAME',
    )
    metadata = {'IMG_ID': int(row.get(u'IMG_ID'))}
    for field in text_fields:
        metadata[field] = unicode(row.get(field), enc)
    metadata['FORM1'] = metadata['FORM'].capitalize()
    metadata['FILENAME2'] = 'File:' + metadata['FILENAME2'].strip()
    metadata['CREATOR'] = metadata['CREATOR'].strip()
    metadata['INSTITUTION'] = metadata['INSTITUTION'].strip()

    # --- Get current file description ------------------------------------
    targetSite = wikipedia.getSite('commons', 'commons')
    page = wikipedia.Page(targetSite, metadata['FILENAME2'])
    desc = page.get()
    wikipedia.output(rule)
    wikipedia.output("=== BEFORE %(FILENAME2)s" % metadata)
    wikipedia.output(rule)
    wikipedia.output(desc)

    # --- Merge sources ---------------------------------------------------
    # If the original came from www.allartpainting.com or The Yorck Project,
    # keep that source next to the WGA one: both of those sources sometimes
    # had the same binaries as WGA.
    # NOTE(review): the trailing "\n" and the leading "- " of the list items
    # below are reproduced as they appear in this copy of the script; they
    # may originally have been other wiki markup (e.g. a line-break tag and
    # "*" bullets) -- confirm before running.
    wga_source = (
        u"Web Gallery of Art: [%(IMAGEURL)s Image] "
        u"[%(URL)s Info about artwork]\n" % metadata
    )
    metadata['SOURCE'] = wga_source
    m = re.search(r"http:\/\/www\.allartpainting\.com\/[^\n\]\s]*", desc)
    if m is not None:
        metadata['SOURCE'] = (
            u"\n- [%s www.allartpainting.com]\n- %s"
            % (m.group(0), wga_source)
        )
    elif "The Yorck Project" in desc:
        metadata['SOURCE'] = (
            u"\n- The Yorck Project: 10.000 Meisterwerke der Malerei. "
            u"DVD-ROM, 2002. ISBN 3936122202. "
            u"Distributed by DIRECTMEDIA Publishing GmbH."
            u"\n- %s" % wga_source
        )

    # --- Prefer templates already present in the original description ----
    # A field is blanked when the existing text already carries the
    # corresponding institution / creator / technique template.
    desc = desc.replace("{{:museum:", "{{Institution:")
    if re.search(r"\{\{[Ii]nstitution:", desc) is not None:
        metadata['INSTITUTION'] = ""
    if re.search(r"[Cc]reator:", desc) is not None:
        metadata['CREATOR'] = ""
    if re.search(r"\{\{[Tt]echnique\|", desc) is not None:
        metadata['TECHNIQUE'] = ""

    # --- Format file description -----------------------------------------
    # The template call is deliberately left unclosed: it replaces only the
    # "{{Artwork" opener, so whatever followed that opener in the existing
    # description stays in place after it.
    article_template = (
        u"{{subst:User:Jarekt/WGA "
        u"|CREATOR = %(CREATOR)s "
        u"|TITLE = %(TITLE)s "
        u"|DATE = %(DATE)s "
        u"|TECHNIQUE = %(TECHNIQUE)s "
        u"|DIMENSIONS = %(DIMENSIONS)s "
        u"|INSTITUTION = %(INSTITUTION)s "
        u"|FRAME = %(FRAME)s "
        u"|FORM = %(FORM)s "
        u"|TYPE = %(TYPE)s "
        u"|SCHOOL = %(SCHOOL)s "
        u"|TIMELINE = %(TIMELINE)s "
        u"|SOURCE = %(SOURCE)s\n"
    )
    description = desc.replace("{{Artwork", article_template % metadata)

    # --- Collect the file's categories and their parent categories -------
    # The result is one newline-separated string that is only used for the
    # substring tests further down.
    listed = []
    for m in re.finditer(r"\[\[[Cc]ategory:([^\]\|]*)", desc):
        cat = u'Category:%s\n' % m.group(1)
        listed.append(cat)
        catO = catlib.Category(targetSite, cat)
        for parent in catO.supercategoriesList():
            listed.append(parent.title() + '\n')
    parentCats = ''.join(listed)
    wikipedia.output(rule)
    wikipedia.output("=== parent =====================================================================")
    wikipedia.output(rule)
    wikipedia.output(parentCats)

    # --- Build the category text to append -------------------------------
    # NOTE(review): in this copy of the script every literal added to `cats`
    # is just a newline, although the clean-up below expects "[[Category:"
    # links.  The category wikitext was presumably lost from these literals;
    # restore it from the original script before running.
    cats = u"\n" % metadata
    # A category is skipped when its name already occurs among the file's
    # categories or their parents.
    if metadata['TITLE_CAT'] not in parentCats:
        cats += '\n'
    if metadata['DATE_CAT'] not in parentCats:
        cats += '\n'
    # For the creator only the text after the last space is looked up.
    if metadata['CREATOR_CAT'].rpartition(' ')[2] not in parentCats:
        cats += u'\n' % metadata
    if metadata['INSTITUTION_CAT'] not in parentCats:
        cats += u'\n' % metadata

    # Tidy the generated category links; the order of the replacements is
    # kept as in the original.
    # NOTE(review): the "Paintings by " pair is a no-op as written here;
    # presumably the search text originally differed (e.g. in whitespace).
    cleanups = (
        (" |", "|"),
        ("| ", "|"),
        ("Paintings by ", "Paintings by "),
        ("[[Category:]]\n", ""),
        ("[[Category:]]", ""),
        ("[[Category: ", "[[Category:"),
        ("[[Category::", "[[Category:"),
    )
    for old, new in cleanups:
        cats = cats.replace(old, new)

    description = description + cats
    # NOTE(review): replacing "" with "" is a no-op; the search text was
    # presumably lost in this copy -- confirm against the original script.
    description = description.replace("", "")
    description = description.replace("{{}}", "")

    # --- Show and save the result ----------------------------------------
    wikipedia.output(rule)
    wikipedia.output("=== AFTER ======================================================================")
    wikipedia.output(rule)
    wikipedia.output(description)
    # NOTE(review): the two positional flags are presumably watchArticle and
    # minorEdit -- confirm against pywikipedia's Page.put signature.
    page.put(description, "Update metadata and categories. Please check!", True, False)
    # Pause for 30 seconds after each saved page.
    time.sleep(30)
def main(args):
    """Run processFile() on every row of the batch CSV file.

    Args:
        args: Command-line arguments.  Accepted for interface compatibility;
            the function does not read them.

    The input file name is fixed to ``WGA_artbatch2.csv`` in the current
    working directory.  A ``csv.Error`` raised while a row is being
    processed is reported as ``skip`` and the loop continues; any other
    exception propagates and ends the run.
    """
    csvFile = 'WGA_artbatch2.csv'
    # The with-block closes the file even when a row aborts the run.
    with open(csvFile, "rb") as csv_handle:
        reader = csv.DictReader(csv_handle, dialect='excel', delimiter=',')
        for row in reader:
            try:
                processFile(row)
            except csv.Error:
                wikipedia.output('skip')
if __name__ == "__main__":
    # Hand the command-line arguments (without the script name) to main().
    # The closing message is printed whether main() returns or raises.
    try:
        main(sys.argv[1:])
    finally:
        print("All done!")