#!/usr/bin/python
# -*- coding: utf-8 -*-
# Wiki page: User:JarektUploadBot/Upload Pilsudski Institute.py
'''
A program to upload Józef Piłsudski Institute of America archive documents to Wikimedia Commons, driven by an @-separated CSV file.
'''
import sys, os.path, glob, re, hashlib, base64, StringIO
import wikipedia, upload, csv, urllib2, string, catlib
def processFile(row):
    """Build the Commons description page for one CSV row and upload the file.

    ``row`` maps CSV column names to UTF-8 encoded byte strings (one record
    from the ``csv.DictReader`` created in ``main``).  The function prints the
    source URL, the target file name and the generated description, then
    asks Commons for files with the same SHA-1 hash.  If one exists the
    upload is skipped; otherwise the file is handed to ``upload.UploadRobot``.

    Missing columns are treated as empty cells.  A date without a four-digit
    year drops the "<year> in Poland" category, and unparsable page numbers
    leave the ``pages`` field empty, instead of aborting the whole run.

    Returns ``None``.
    """
    enc = 'utf-8'

    # (metadata key, CSV column) pairs.  The metadata keys are the names used
    # by the %(...)s placeholders of the template below.
    columns = (
        (u'id', u'id'),
        (u'identifier_uri', u'identifier.uri'),
        (u'fondsID', u'fondsID'),
        (u'folderID', u'folderID'),
        (u'page_first', u'page.first'),
        (u'page_last', u'page.last'),
        (u'license', u'license'),
        (u'contributor.author', u'contributor.author'),
        (u'contributor', u'contributor'),
        (u'coverage_spatial', u'coverage.spatial'),
        (u'coverage_temporal', u'coverage.temporal'),
        (u'date', u'dateOriginal'),
        (u'description_abstract', u'description.abstract'),
        (u'description', u'description'),
        (u'identifier', u'identifier'),
        (u'language', u'language.iso'),
        (u'relation_uri', u'relation.uri'),
        (u'relation', u'relation'),
        (u'subject', u'subject'),
        (u'title_alternative', u'title.alternative'),
        (u'title', u'title'),
        (u'type', u'type'),
        (u'person', u'person'),
        (u'filename_in', u'filename_in'),
        (u'filename_out', u'filename_out'),
    )
    # row.get() yields None for a column that is absent from this row;
    # unicode(None, enc) would raise TypeError, so fall back to ''.
    metadata = dict((key, unicode(row.get(column) or '', enc))
                    for key, column in columns)

    # Format file description
    template = u"""{{Piłsudski Institute document
|title = {{pl|1=%(title)s}}
|alternative_title = {{pl|1=%(title_alternative)s}}
|date = %(date)s
|author = {{Creator:Józef Piłsudski}}
|contributor = %(contributor)s
|wiki_description =
|archive_description = {{pl|1=%(description)s }}
|archive_abstract = {{pl|1=%(description_abstract)s }}
|subject = {{pl|1=%(subject)s }}
|language = %(language)s
|object_type = %(type)s
|people = %(person)s
|places = %(coverage_spatial)s
|dates = %(coverage_temporal)s
|medium =
|dimensions =
|department =
|references = %(relation)s
|object_history =
|exhibition_history =
|credit_line =
|inscriptions =
|notes =
|accession_number = 701/%(fondsID)s/%(folderID)s/%(page_first)s
|identifier_uri = %(identifier_uri)s
|relation.uri = %(relation_uri)s
|id = %(id)s
|fonds = Archiwum Józefa Piłsudskiego
|pages = %(pages)s
|permission = {{PD-old-auto-1996|deathyear=1935|country={{Poland}}|date={{date|1996|1|1}}|reason=[[User:Piotrus/PolishCopyright|PD in Poland & US]]}}
|other_versions =
|text = yes
}}
[[Category:Collections of Józef Piłsudski Institute of America]]
[[Category:Letters by Józef Piłsudski]]
[[Category:%(year)s in Poland]]
"""
    targetSite = wikipedia.getSite('commons', 'commons')

    # Zero-pad a single-digit month, e.g. 1920-5-12 -> 1920-05-12.
    metadata['date'] = re.sub(r'(\d\d\d\d)-([1-9])-(\d\d?)', r'\1-0\2-\3',
                              metadata['date'])

    # Wrap every date in {{ISOdate}} and put ';'-separated entries on
    # separate lines.
    temporal = re.sub(r'(\d\d\d\d)-(\d\d)-(\d\d?)', r'{{ISOdate|\1-\2-\3}}',
                      metadata['coverage_temporal'])
    metadata['coverage_temporal'] = temporal.replace(u';', u'<br/>')

    # Each ';'-separated place / person gets its own template call.
    metadata['coverage_spatial'] = (
        u'{{City|'
        + metadata['coverage_spatial'].replace(u';', u'}}<br/>{{City|')
        + u'}}')
    metadata['person'] = (
        u'{{Piłsudski Institute document/people|'
        + metadata['person'].replace(
            u';', u'}}<br/>{{Piłsudski Institute document/people|')
        + u'}}')

    # First four-digit run in the date is used as the category year.
    year_match = re.search(r'\d\d\d\d', metadata['date'])
    metadata['year'] = year_match.group(0) if year_match else u''

    # Number of pages covered by this document, rendered as "1" or "1-N".
    try:
        page_count = int(float(metadata['page_last'])
                         - float(metadata['page_first']) + 1)
    except ValueError:
        # Empty or non-numeric page fields: leave the template field blank.
        metadata['pages'] = u''
    else:
        if page_count == 1:
            metadata['pages'] = u'1'
        else:
            metadata['pages'] = u'1-%d' % page_count

    # Turn "Niepodległość, T. <vol>, Warszawa <year>, s. <pages>" references
    # into a {{cite journal}} call.  The u'' + r'' concatenation keeps the
    # regex escapes raw while the whole literal stays unicode.
    metadata['relation'] = re.sub(
        u'Niepodległość'
        + r',\s*T\.\s*([IVX]*),\s*Warszawa (19\d\d),\s*s\.\s*([\d\-]*)',
        u'{{cite journal|journal=[[:pl:Niepodległość (czasopismo)|Niepodległość]]|location=Warsaw|publisher=[[:pl:Instytut Józefa Piłsudskiego Poświęcony Badaniu Najnowszej Historii Polski|Instytut Józefa Piłsudskiego]]'
        + r'|year=\2|volume=\1|issue=|pages=\3|doi=|pmc=|pmid=|issn=|}}',
        metadata['relation'])

    description = template % metadata

    # Clean up artefacts left by empty fields; the order of the
    # replacements is significant and must be kept.
    cleanups = (
        (u'{{pl|1= }}', u''),
        (u'{{pl|1=}}', u''),
        (u'{{City|}}', u''),
        (u'{{City| ', u'{{City|'),
        (u'Piłsudski Józef', u'Józef Piłsudski'),
        (u'{{Piłsudski Institute document/people|}}', u''),
        (u'||', u'; '),
    )
    for old, new in cleanups:
        description = description.replace(old, new)
    if not metadata['year']:
        # No year could be extracted: do not emit a nameless year category.
        description = description.replace(u'[[Category: in Poland]]\n', u'')

    fname = (u'file:///C:/Users/tuszynskij/Documents/Wiki/Pilsudzki Institute/set2/'
             + metadata['filename_in'])
    wikipedia.output(fname)
    wikipedia.output(metadata['filename_out'])
    wikipedia.output(description)

    # We don't want to upload duplicates, so hash the file contents (SHA-1)
    # and ask the MediaWiki API for files with the same hash.
    handle = urllib2.urlopen(fname)
    try:
        imageFile = handle.read()
    finally:
        # Always release the handle, even if the read fails.
        handle.close()
    SHA1 = base64.b16encode(hashlib.sha1(imageFile).digest())
    duplicates = targetSite.getFilesFromAnHash(SHA1)
    if duplicates:
        wikipedia.output(u'Duplicate image: %s = %s'
                         % (duplicates.pop(), metadata['filename_out']))
        return

    # Upload file to Commons
    bot = upload.UploadRobot(url=fname,
                             description=description,
                             useFilename=metadata['filename_out'].strip(),
                             keepFilename=True,
                             verifyDescription=False,
                             targetSite=targetSite)
    bot.run()  # Comment out this line to have a dry run
def main(args):
    """Feed every row of ``Pilsudski.csv`` to ``processFile``.

    ``args`` (the command-line arguments passed by the entry point) is
    accepted but not used.  The CSV file is read from the current working
    directory with ``@`` as field delimiter.  A ``csv.Error`` terminates the
    script through ``sys.exit`` with the file name and line number.
    """
    csvFile = 'Pilsudski.csv'
    # Stop after this many rows.  The row counter starts at 1, so a value
    # below 1 (as here) never matches and the whole file is processed.
    upload_limit = -20

    # The with-block guarantees the CSV file is closed on every exit path,
    # including the early return and sys.exit below.
    with open(csvFile, "rb") as handle:
        reader = csv.DictReader(handle, dialect='excel', delimiter='@')
        try:
            for count, row in enumerate(reader, 1):
                processFile(row)
                if count == upload_limit:
                    return
        except csv.Error as e:
            sys.exit('file %s, line %d: %s' % (csvFile, reader.line_num, e))
# Script entry point: run the upload, then always report completion and
# release the bot's pywikipedia throttle slot.
if __name__ == "__main__":
    try:
        main(sys.argv[1:])
    finally:
        # Printed whether or not main() succeeded.
        print("All done!")
        # Tell pywikipedia this bot has stopped talking to the wiki so it
        # no longer slows down other running bots.
        wikipedia.stopme()