User:JarektUploadBot/Upload Pilsudski Institute.py

From Wikimedia Commons, the free media repository
Jump to navigation Jump to search
#!/usr/bin/python
# -*- coding: utf-8  -*-
'''
Upload Józef Piłsudski Institute of America archive documents to Wikimedia
Commons, using per-document metadata read from an @-separated CSV file.
'''
import sys, os.path, glob, re, hashlib, base64, StringIO
import wikipedia, upload, csv, urllib2, string, catlib
 
# Wikitext of the file description page; expanded with ``% metadata``.
_DESCRIPTION_TEMPLATE = u"""{{Piłsudski Institute document
|title               = {{pl|1=%(title)s}}  
|alternative_title   = {{pl|1=%(title_alternative)s}}  
|date                = %(date)s
|author              = {{Creator:Józef Piłsudski}} 
|contributor         = %(contributor)s
|wiki_description    = 
|archive_description = {{pl|1=%(description)s }} 
|archive_abstract    = {{pl|1=%(description_abstract)s }}
|subject             = {{pl|1=%(subject)s }} 
|language            = %(language)s
|object_type         = %(type)s
|people              = %(person)s
|places              = %(coverage_spatial)s
|dates               = %(coverage_temporal)s
|medium              = 
|dimensions          = 
|department          = 
|references          = %(relation)s
|object_history      =
|exhibition_history  =
|credit_line         =
|inscriptions        =
|notes               =
|accession_number    = 701/%(fondsID)s/%(folderID)s/%(page_first)s
|identifier_uri      = %(identifier_uri)s
|relation.uri        = %(relation_uri)s
|id                  = %(id)s
|fonds               = Archiwum Józefa Piłsudskiego
|pages               = %(pages)s
|permission          = {{PD-old-auto-1996|deathyear=1935|country={{Poland}}|date={{date|1996|1|1}}|reason=[[User:Piotrus/PolishCopyright|PD in Poland & US]]}}
|other_versions      =
|text                = yes
}}

[[Category:Collections of Józef Piłsudski Institute of America]]
[[Category:Letters by Józef Piłsudski]]
[[Category:%(year)s in Poland]]
"""

# (metadata key, CSV column name) pairs read from every row.
_METADATA_COLUMNS = (
    (u'id', u'id'),
    (u'identifier_uri', u'identifier.uri'),
    (u'fondsID', u'fondsID'),
    (u'folderID', u'folderID'),
    (u'page_first', u'page.first'),
    (u'page_last', u'page.last'),
    (u'license', u'license'),
    (u'contributor.author', u'contributor.author'),
    (u'contributor', u'contributor'),
    (u'coverage_spatial', u'coverage.spatial'),
    (u'coverage_temporal', u'coverage.temporal'),
    (u'date', u'dateOriginal'),
    (u'description_abstract', u'description.abstract'),
    (u'description', u'description'),
    (u'identifier', u'identifier'),
    (u'language', u'language.iso'),
    (u'relation_uri', u'relation.uri'),
    (u'relation', u'relation'),
    (u'subject', u'subject'),
    (u'title_alternative', u'title.alternative'),
    (u'title', u'title'),
    (u'type', u'type'),
    (u'person', u'person'),
    (u'filename_in', u'filename_in'),
    (u'filename_out', u'filename_out'),
)

# Clean-up substitutions applied, in this order, to the expanded template.
_DESCRIPTION_CLEANUPS = (
    (u'{{pl|1= }}', u''),
    (u'{{pl|1=}}', u''),
    (u'{{City|}}', u''),
    (u'{{City| ', u'{{City|'),
    (u'Piłsudski Józef', u'Józef Piłsudski'),
    (u'{{Piłsudski Institute document/people|}}', u''),
    (u'||', u'; '),
)

# Location of the scanned files; ``filename_in`` is appended to it.
_SOURCE_URL_PREFIX = u'file:///C:/Users/tuszynskij/Documents/Wiki/Pilsudzki Institute/set2/'


def _readMetadata(row, enc='utf-8'):
    '''Return the row's columns decoded to unicode, keyed by template field.

    A column that is absent from the row (row.get() yields None) becomes an
    empty string instead of raising TypeError inside unicode().
    '''
    return dict((key, unicode(row.get(column) or '', enc))
                for key, column in _METADATA_COLUMNS)


def _wrapEach(templateName, value):
    '''Wrap each ';'-separated item of value in {{templateName|...}}.

    The wrapped items are joined with <br/>.
    '''
    opener = u'{{' + templateName + u'|'
    return opener + value.replace(u';', u'}}<br/>' + opener) + u'}}'


def _sha1OfUrl(url):
    '''Return the upper-case hexadecimal SHA1 of the data behind url.'''
    handle = urllib2.urlopen(url)
    try:
        data = handle.read()
    finally:
        # Always release the handle, also when read() fails.
        handle.close()
    return base64.b16encode(hashlib.sha1(data).digest())


def processFile(row):
    '''Upload the document described by one CSV row to Commons.

    row is a mapping of CSV column name to UTF-8 encoded byte string (as
    produced by csv.DictReader).  The row's metadata is formatted into a
    {{Piłsudski Institute document}} description page, the source file is
    hashed, and the file is uploaded unless the target site already reports
    a file with the same SHA1.

    Rows that cannot be formatted (no four-digit year in the date, or a
    non-numeric page range) are reported through wikipedia.output() and
    skipped rather than aborting the whole run.  Returns None in all cases.
    '''
    metadata = _readMetadata(row)
    targetSite = wikipedia.getSite('commons', 'commons')

    # Zero-pad a single-digit month (YYYY-M-D -> YYYY-0M-D).  A single-digit
    # day is left as it is.
    metadata['date'] = re.sub(u'(\\d\\d\\d\\d)-([1-9])-(\\d\\d?)',
                              u'\\1-0\\2-\\3', metadata['date'])
    temporal = re.sub(u'(\\d\\d\\d\\d)-(\\d\\d)-(\\d\\d?)',
                      u'{{ISOdate|\\1-\\2-\\3}}',
                      metadata['coverage_temporal'])
    metadata['coverage_temporal'] = temporal.replace(u';', u'<br/>')
    metadata['coverage_spatial'] = _wrapEach(u'City',
                                             metadata['coverage_spatial'])
    metadata['person'] = _wrapEach(u'Piłsudski Institute document/people',
                                   metadata['person'])

    # The year feeds the "[[Category:<year> in Poland]]" line.
    yearMatch = re.search(u'(\\d\\d\\d\\d)', metadata['date'])
    if yearMatch is None:
        wikipedia.output(
            u'Skipping %(filename_out)s: no four-digit year in date "%(date)s"'
            % metadata)
        return
    metadata['year'] = yearMatch.group(0)

    # Pages are numbered relative to the document: "1" or "1-<count>".
    try:
        pageCount = int(float(metadata['page_last'])
                        - float(metadata['page_first']) + 1)
    except (ValueError, OverflowError):
        wikipedia.output(
            u'Skipping %(filename_out)s: page range '
            u'"%(page_first)s"-"%(page_last)s" is not numeric' % metadata)
        return
    if pageCount == 1:
        metadata['pages'] = u'%d' % pageCount
    else:
        metadata['pages'] = u'1-%d' % pageCount

    # Turn a "Niepodległość, T. <vol>, Warszawa <year>, s. <pages>" reference
    # into a {{cite journal}} call.
    metadata['relation'] = re.sub(
        u'Niepodległość,\\s*T\\.\\s*([IVX]*),\\s*Warszawa (19\\d\\d),\\s*s\\.\\s*([\\d\\-]*)',
        u'{{cite journal|journal=[[:pl:Niepodległość (czasopismo)|Niepodległość]]|location=Warsaw|publisher=[[:pl:Instytut Józefa Piłsudskiego Poświęcony Badaniu Najnowszej Historii Polski|Instytut Józefa Piłsudskiego]]|year=\\2|volume=\\1|issue=|pages=\\3|doi=|pmc=|pmid=|issn=|}}',
        metadata['relation'])

    # Format the file description and strip wrappers left around empty values.
    description = _DESCRIPTION_TEMPLATE % metadata
    for old, new in _DESCRIPTION_CLEANUPS:
        description = description.replace(old, new)

    fname = _SOURCE_URL_PREFIX + metadata['filename_in']
    wikipedia.output(fname)
    wikipedia.output(metadata['filename_out'])
    wikipedia.output(description)

    # We don't want to upload duplicates, so hash the file and ask the
    # target site for files with the same SHA1.
    duplicates = targetSite.getFilesFromAnHash(_sha1OfUrl(fname))
    if duplicates:
        strng = duplicates.pop()
        wikipedia.output(u'Duplicate image: %s' % strng
                         + u' = %(filename_out)s' % metadata)
        return

    # Upload the file to Commons.
    bot = upload.UploadRobot(url=fname,
                             description=description,
                             useFilename=metadata['filename_out'].strip(),
                             keepFilename=True,
                             verifyDescription=False,
                             targetSite=targetSite)
    bot.run()  # Comment out this line to have a dry run


 
def main(args, limit=-1):
    '''Process every row of Pilsudski.csv with processFile().

    args  -- command-line arguments; accepted for the caller's benefit but
             not interpreted here.
    limit -- stop after this many rows when positive; zero or a negative
             value (the default) processes all rows.

    The CSV file is '@'-delimited and is closed when the function returns or
    raises.  A csv.Error ends the program through sys.exit() with the file
    name and line number; other exceptions propagate to the caller.
    '''
    csvFile = 'Pilsudski.csv'

    # Binary mode is what the Python 2 csv module expects.
    with open(csvFile, "rb") as csvHandle:
        reader = csv.DictReader(csvHandle, dialect='excel', delimiter='@')
        try:
            for count, row in enumerate(reader, 1):
                processFile(row)
                if count == limit:
                    return
        except csv.Error as e:
            sys.exit('file %s, line %d: %s' % (csvFile, reader.line_num, e))
 
 
# Script entry point: pass the command-line arguments (without the program
# name) to main() and announce completion whether or not main() raised.
if __name__ == "__main__":
    cliArgs = sys.argv[1:]
    try:
        main(cliArgs)
    finally:
        print("All done!")