User:US National Archives bot/script

From Wikimedia Commons, the free media repository
Jump to navigation Jump to search
#!/usr/bin/python
# -*- coding: utf-8  -*-
'''
Bot to upload NARA images to Commons.
 
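The bot expects three arguments on the command line: a directory containing the original images, a text file mapping each image filename to its ARC ID, and a directory for derivative files. An optional fourth argument names the file to start from.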
 
The bot uses http://toolserver.org/~slakr/archives.php to get the description
'''

# Executable that createDerivatives() runs to turn a TIFF into a JPEG
# (presumably ImageMagick's "convert" -- confirm on the target machine).
convertCmdPath = 'convert'
# Alternative absolute path, kept for reference:
#convertCmdPath = '/opt/local/bin/convert'
 
import sys, os.path, hashlib, base64, glob, re, urllib, time, unicodedata
#sys.path.append("/Users/Dominic/pywikipedia")
sys.path.append("/home/john/src/pywikipedia")
import wikipedia, config, query, upload
import shutil, socket
import subprocess
 
########################################################
### start effbot code
### source: http://effbot.org/zone/re-sub.htm#unescape-html
########################################################
#import re, htmlentitydefs
import htmlentitydefs
 
##
# Removes HTML or XML character references and entities from a text string.
#
# @param text The HTML (or XML) source text.
# @return The plain text, as a Unicode string, if necessary.
 
def unescape(text):
    """
    Replace HTML/XML character references and named entities in text.

    Numeric references (&#65; and &#x41;) and entities known to
    htmlentitydefs are replaced by the character they stand for.  Anything
    that cannot be decoded is left exactly as it was.
    """
    def decode(match):
        token = match.group(0)

        if token.startswith("&#"):
            # numeric character reference, hexadecimal when written "&#x...;"
            try:
                if token.startswith("&#x"):
                    return unichr(int(token[3:-1], 16))
                return unichr(int(token[2:-1]))
            except ValueError:
                return token  # not a usable number: leave as is

        # named entity
        try:
            return unichr(htmlentitydefs.name2codepoint[token[1:-1]])
        except KeyError:
            return token  # unknown entity: leave as is

    return re.sub(r"&#?\w+;", decode, text)
########################################################
### end effbot code
########################################################
 

########################################################
### Start template parsing code
########################################################

def _parse_template_fields(templateText):
    """
    Parses the parameters of a single template call.

        templateText : the raw text of the template, from '{{' to '}}'

    Returns:
        A dict mapping each parameter name to
        {'val': stripped value, 'index': order in which it was stored}.
        A parameter written without '=' is stored under its own text with an
        empty value.  A template without parameters gives an empty dict.
    """
    #state machine states
    OUTSIDE, IN_NAME, IN_VAL = range(3)

    fields = {}
    name_chars = []
    value_chars = []
    stored = [] #names in storing order; its length is the next index

    def flush():
        # Store the pending parameter, if there is one, and start afresh so
        # the next parameter cannot inherit this one's name or value.
        param_name = ''.join(name_chars).strip()
        if param_name:
            fields[param_name] = {'val': ''.join(value_chars).strip(),
                                  'index': len(stored)}
            stored.append(param_name)
        del name_chars[:]
        del value_chars[:]

    brace_count = -2 #count of {, ignore the ones around {{template}}
    sq_brk_count = 0 #count of [
    state = OUTSIDE

    for char in templateText:

        if char == '{':
            brace_count += 1
        elif char == '}':
            brace_count -= 1

            if brace_count == -1: #first of the template's own closing braces
                flush()
                break

        elif char == '[':
            sq_brk_count += 1
        elif char == ']':
            sq_brk_count -= 1

        elif char == '|' and brace_count == 0 and sq_brk_count == 0:
            # a pipe at the top level of the template starts a new parameter
            flush()
            state = IN_NAME
            continue #skip "|" char

        elif (char == '=' and state == IN_NAME
                and brace_count == 0 and sq_brk_count == 0):
            # the first top-level equals sign ends the parameter name; one
            # inside a nested template or link belongs to that construct
            state = IN_VAL
            continue #skip "=" char

        if state == IN_NAME:
            name_chars.append(char)
        elif state == IN_VAL:
            value_chars.append(char)

    return fields


def _find_matching_brace(text, first_brace_index):
    """
    Finds the end of the bracketed span that opens at first_brace_index.

        text: text to search in
        first_brace_index : index of the opening brace: one of { [ ( <

    Returns:
        The index just past the closing brace that balances the opening one,
        so that text[first_brace_index:result] is the whole span.
        None if the character is not a brace or the span is never closed.
    """
    partners = {'{': '}', '[': ']', '(': ')', '<': '>'}

    lbrace = text[first_brace_index]
    rbrace = partners.get(lbrace)

    if rbrace is None:
        print('"%s" is not a brace, cannot find a partner.'% lbrace)
        return None

    depth = 0
    for offset, char in enumerate(text[first_brace_index:]):
        if char == lbrace:
            depth += 1
        elif char == rbrace:
            depth -= 1
            if depth == 0:
                # Stop at once: text that follows (for instance another
                # template right behind this one) is not part of the span.
                return first_brace_index + offset + 1

    print('Cannot find a partner for "%s" in the string.'% lbrace)
    return None


def find_template( name, wikitext):
    """
    Finds every call of the template 'name' in wikitext.

        name     : template name; the case of the first letter is ignored
        wikitext : text to search in

    Returns:
        A list with one dict per call, in order of appearance:
            'name'   : the name that was asked for
            'fields' : parameters, as returned by _parse_template_fields
            'start'  : index of the opening '{{'
            'end'    : index just past the closing '}}'
        Calls whose braces are never closed are left out.
    """
    # The name is escaped so that it is matched literally, and it has to be
    # followed by a pipe or the closing braces so that a longer template name
    # that merely starts with 'name' is not picked up.
    regex = (r'\{\{\s*['
             + re.escape(name[0].lower()) + re.escape(name[0].upper()) + ']'
             + re.escape(name[1:])
             + r'(?=\s*(?:\||\}\}))')

    templates = []

    for templateRange in re.finditer(regex, wikitext):

        template_start = templateRange.start()
        template_end = _find_matching_brace(wikitext, template_start)

        if template_end is None: #malformed - missing braces
            continue

        fields = _parse_template_fields(wikitext[template_start:template_end])

        templates.append( {'name':name, 'fields':fields,
                           'start':template_start, 'end':template_end} )

    return templates
    
def format_fields( name, fields, compact=False):
    """
    Builds the wikitext of a template call from parsed parameters.

        name    : template name
        fields  : dict of parameter name -> {'val': value, 'index': position},
                  the layout that find_template gives under 'fields'
        compact : if True, everything goes on one line and the names are not
                  padded; otherwise each parameter gets its own line and the
                  equals signs are lined up

    Returns:
        The template text.  Parameters are written in ascending 'index'
        order.  An empty 'fields' gives a call without parameters.
    """
    fieldList = sorted( (fields[entry]['index'], entry, fields[entry]['val'])
                        for entry in fields )

    # "or [0]" keeps max() from failing when there are no parameters at all
    longestField = max( [ len(entry[1]) for entry in fieldList ] or [0] )

    separator = ' | ' if compact else '\n | '

    parts = ['{{' + name]

    for index, paramName, paramVal in fieldList:
        spacing = '' if compact else ' '*(longestField-len(paramName) )
        parts.append(separator + paramName + spacing + ' = ' + paramVal)

    parts.append('}}' if compact else '\n}}')

    return ''.join(parts)
 

########################################################
### End template parsing code
########################################################
 
def getRecords(textfile):
    """
    Reads the mapping of image filenames to ARC IDs.

        textfile : path of a text file with one "<filename> <ARC ID>" pair
                   per line

    The ARC ID is the last whitespace-separated word of the line, so the
    filename itself may contain spaces.  Blank lines are skipped.

    Returns:
        A dict of filename -> ARC ID (int).

    Raises:
        ValueError if a line has no ARC ID or the ARC ID is not an integer.
    """
    result = {}

    with open(textfile, "r") as f:
        for lineNumber, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue

            parts = line.rsplit(None, 1)
            if len(parts) != 2:
                raise ValueError('%s, line %d: expected "<filename> <ARC ID>", got %r'
                                 % (textfile, lineNumber, line))

            filename, arc = parts
            try:
                result[filename] = int(arc)
            except ValueError:
                raise ValueError('%s, line %d: ARC ID %r is not an integer'
                                 % (textfile, lineNumber, arc))

    return result
 
 
def findDuplicateImagesByHash(filename, site = None):
    '''
    Takes the photo, calculates the SHA1 hash and asks the mediawiki api for a list of duplicates.

        filename : path of the local file
        site     : site to ask; the Commons site is looked up when this is
                   left out.  The lookup happens at call time, not when the
                   module is imported.

    Returns whatever site.getFilesFromAnHash() gives for the hash, which is
    passed as upper-case base16.

    TODO: Add exception handling
    '''
    if site is None:
        site = wikipedia.getSite(u'commons', u'commons')

    hashObject = hashlib.sha1()

    # Hash in 1 MiB pieces so a large scan is never held in memory as a
    # whole, and close the file as soon as it has been read.
    with open(filename, 'rb') as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b''):
            hashObject.update(chunk)

    return site.getFilesFromAnHash(base64.b16encode(hashObject.digest()))
    
def findDuplicateImagesByName(filename, site = None):
    '''
    Checks whether a file page with the given name can be fetched.

        filename : file name without the "File:" prefix
        site     : site to ask; the Commons site is looked up when this is
                   left out.  The lookup happens at call time, not when the
                   module is imported.

    Returns [filename] if the page could be fetched, [] if wikipedia.NoPage
    was raised.
    '''
    if site is None:
        site = wikipedia.getSite(u'commons', u'commons')

    try:
        # the text is not needed; get() is only called to see if it succeeds
        wikipedia.Page(site, 'File:' + filename).get()
    except wikipedia.NoPage:
        return [] #no file by that name

    return [filename] # duplicate exists, return it
    
def addDuplicatesToList(fileInfo, foundDuplicates, duplicateFiletypes):
    """
    Records one duplicate, if any was found, under the file's extension.

        fileInfo           : dict with at least 'ext' and 'name'
        foundDuplicates    : list of duplicate names; the last one is
                             removed from it and recorded
        duplicateFiletypes : dict of extension -> duplicate name, updated
                             in place

    Returns duplicateFiletypes.
    """
    if len(foundDuplicates) < 1:
        return duplicateFiletypes

    match = foundDuplicates.pop()
    duplicateFiletypes[fileInfo['ext']] = match
    wikipedia.output(u'Found duplicate of %s at %s' % (fileInfo['name'], match) )

    return duplicateFiletypes
 
 
def fetchDescriptionFromWebtool(fileId):
    """
    Asks the webtool for the description of a file until it answers.

        fileId : the ARC ID (formatted with %d, so it has to be a number)

    getDescription() is called up to eleven times.  Returns the first
    non-empty description, or the last (empty) answer when every call came
    back empty.
    """
    # No metadata handling. We use a webtool
    wikipedia.output(u'Attempting to fetch description for file ID %d from the webtool.' % fileId)

    attempts = 0

    while True:
        text = getDescription(fileId)
        attempts += 1

        if text:
            wikipedia.output(u'Decription text found.')
            return text

        if attempts > 10:
            wikipedia.output(u'Decription text cannot be found for this file.')
            return text

        wikipedia.output(u'No reply from the webtool, retrying.')
 
def getDescription(fileId):
    """
    Fetches the description text for one file from the webtool.

        fileId  : the ARC ID

    The page is requested up to ten times; a request that ends in an IOError
    or a socket timeout is repeated.  The description is the content of the
    textarea on the page, with HTML entities decoded.

    Returns:
        The description, or u'' if every request failed or the page has no
        textarea.
    """

    url = u'http://toolserver.org/~slakr/archives.php?archiveHint=%s' % (fileId,)

    textareaRe = re.compile(r'^<textarea rows="\d+" cols="\d+">(.+)</textarea>$', re.MULTILINE | re.DOTALL)

    maxtries = 10

    for attempt in range(maxtries):
        try:
            archivesPage = urllib.urlopen(url)
            try:
                pageBytes = archivesPage.read()
            finally:
                # release the connection whether or not the read worked
                archivesPage.close()
        except IOError:
            wikipedia.output(u'Got an IOError, let\'s try again')
            continue
        except socket.timeout:
            wikipedia.output(u'Got a timeout, let\'s try again')
            continue

        # The page arrived: one answer is final, even without a textarea.
        matches = textareaRe.search(pageBytes.decode('utf-8'))
        if matches:
            return unescape(matches.group(1))
        return u''

    return u''
 
def getTitle(fileId, description):
    """
    Builds the destination file name from the "|Title=" line of a description.

        fileId      : the ARC ID, appended to the title
        description : description text to search

    The title text is cut to 120 characters with truncateWithEllipsis() and
    the finished name is passed through cleanUpTitle().

    Returns the file name (always ending in .tif), or False if the
    description has no title line.
    """
    found = re.search('^\|Title=(.+)$', description, re.MULTILINE)

    if not found:
        wikipedia.output(u'No title found in the webtool output!' )
        return False

    shortened = truncateWithEllipsis(found.group(1), 120, "...")
    return cleanUpTitle(u'%s - NARA - %s.tif' % (shortened, fileId))
 
def cleanUpTitle(title):
    """
    Rewrite a title so that it can be used as a mediawiki page name.

    The title is stripped, a fixed list of substitutions is applied in
    order, and finally every space becomes an underscore.
    """
    # (pattern, replacement) pairs. The order matters: later rules work on
    # the output of earlier ones.
    substitutions = (
        (u"[<{\\[]", u"("),             # opening brackets
        (u"[>}\\]]", u")"),             # closing brackets
        (u"[ _]?\\(!\\)", u""),         # "(!)" and one space/underscore before it
        (u",:[ _]", u", "),
        (u"[;:][ _]", u", "),
        (u"[\t\n ]+", u" "),            # runs of whitespace
        (u"[\r\n ]+", u" "),
        (u"[\n]+", u""),
        (u"[?!]([.\"]|$)", u"\\1"),     # ? or ! before . or " or at the end
        (u"[#%?!]", u"^"),
        (u"[;]", u","),
        (u"[/+\\\\:]", u"-"),
        (u"--+", u"-"),                 # repeated dashes
        (u",,+", u","),                 # repeated commas
        (u"[-,^]([.]|$)", u"\\1"),      # - , ^ before . or at the end
    )

    cleaned = title.strip()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.replace(u" ", u"_")
 
def truncateWithEllipsis(s, limit, ellipsis=u"\u2026"):
    """
    Shortens s to at most limit characters, ending it with an ellipsis.

        s        : text to shorten
        limit    : greatest length of the result
        ellipsis : text that marks the cut

    A string that already fits is returned unchanged.  Otherwise the cut is
    made at the last space separator (Unicode category Zs) that leaves room
    for the ellipsis; if there is none, the text is cut hard so that text
    and ellipsis together are limit characters long.
    """
    if len(s) <= limit:
        return s

    room = limit - len(ellipsis)

    # Only positions that leave room for the ellipsis can be cut points.
    for i in range(room, 0, -1):
        if unicodedata.category(s[i]) == 'Zs':
            return s[:i] + ellipsis

    # No usable space: cut at the limit.  Slicing relative to the end of s
    # would keep nearly all of a long string.
    return s[:max(0, room)] + ellipsis
        
def createDerivatives(sourcefilename, derivativeDirectory):
    """
    Create any derivative files needed.

    TIFFs will be converted to JPGs

        sourcefilename      : path of the original file
        derivativeDirectory : directory the derivatives are written to

    Returns:
        A list of {'ext': lower-case extension, 'name': path} dicts: the
        original first, then the JPG.  The JPG is left out if the conversion
        did not produce a file.  For a source that is not a TIFF the list is
        empty.
    """
    srcName, srcExt = os.path.splitext(os.path.basename(sourcefilename))
    srcExt = srcExt.lower()

    # only TIFFs need a derivative
    if srcExt not in ('.tif', '.tiff'):
        return []

    filelist = [{'ext':srcExt, 'name':sourcefilename}]

    convertExt = '.jpg'
    derivativeName = os.path.join(derivativeDirectory, srcName + convertExt)

    #generate if it doesn't exist already
    if not os.path.exists(derivativeName):
        cmd = [convertCmdPath, sourcefilename, '-quality', '100', derivativeName]
        status = subprocess.call(cmd)

        # Judge by the output file, not by the exit status alone: what
        # matters to the caller is whether there is something to upload.
        if not os.path.exists(derivativeName):
            wikipedia.output(u'Converting %s failed (exit status %s), no %s derivative was created.'
                             % (sourcefilename, status, convertExt))
            return filelist

    filelist.append({'ext':convertExt, 'name':derivativeName})

    return filelist
    
def setDestinations(fileList, title):
    """
    Give every file its destination name on the wiki.

        fileList : list of file dicts, each with an 'ext' entry
        title    : destination name of the original file

    The destination is the title with its extension replaced by the file's
    own 'ext'.  It is stored under 'dest' in each dict (the dicts are changed
    in place), and the dicts are returned in a new list.
    """
    stem = os.path.splitext(title)[0]

    def withDestination(entry):
        entry['dest'] = stem + entry['ext']
        return entry

    return [withDestination(entry) for entry in fileList]
    
def createDerivativeGallery(fileList, title):
    """
    Builds the gallery wikitext that lists all versions of a file.

        fileList : list of file dicts with 'dest' (destination filename) and
                   'ext' (used as the caption)
        title    : intended destination filename (not used)

    Returns an empty string unless there are at least two files.
    """
    if not len(fileList) > 1:
        return ''

    rows = ['\nFile:%s|%s' % (entry['dest'], entry['ext']) for entry in fileList]

    return '<gallery>' + ''.join(rows) + '\n</gallery>'
    
def addDerivativesToDescription(description, gallery, title):
    """
    If there are any derivative files, add a gallery of them to the
    description under "Other versions"

        description: raw description text
        gallery    : gallery to add to the description
        title      : not used

    Returns:
        The description with the first NARA-image-full template rewritten so
        that its "other versions" parameter starts with the gallery.
        False if the description has no such template, or if the parameter
        holds a gallery already.
    """
    templates = find_template( 'NARA-image-full', description)

    if not templates:
        return False #we didn't find a template to update

    infoTemplate = templates[0]
    fields = infoTemplate['fields']

    # if the parameter appears under more than one spelling, the last
    # spelling in this list is the one that is used
    otherVersionsParam = None
    for name in ['Other_versions', 'other_versions', 'Other versions', 'other versions']:
        if name in fields:
            otherVersionsParam = name

    if otherVersionsParam is None:
        wikipedia.output(u"Couldn't find the 'other versions' parameter.")
        otherVersionsParam = 'Other_versions'
        # create the parameter after all existing ones, however many there are
        lastIndex = max( [ fields[entry]['index'] for entry in fields ] or [-1] )
        fields[otherVersionsParam] = {'val':'', 'index':lastIndex + 1}

    otherVersions = fields[otherVersionsParam]['val']

    if '<gallery>' in otherVersions:
        return False #there is a gallery already

    fields[otherVersionsParam]['val'] = gallery + otherVersions

    #reinsert modified template
    return (description[0:infoTemplate['start']] +
        format_fields( 'NARA-image-full', fields ) +
        description[infoTemplate['end']:] )
    

def removeTIFFParameter(description, type):
    """
    Blanks the "TIFF = yes" parameter in descriptions of non-TIFF files.

        description : description text
        type        : file extension with its dot, in any case

    For a .tif or .tiff extension the description is returned unchanged.
    For any other, the "yes" is removed; the parameter name and the equals
    sign stay.
    """
    if type.lower() in ('.tiff', '.tif'):
        return description

    tiffYes = re.compile(r'(\s*\|\s*TIFF\s*=)\s*yes')
    return tiffYes.sub(r'\1', description)
     
def main(args):
    '''
    Uploads the TIFFs found in a directory, and a JPG made from each, to Commons.

        args : the command-line arguments without the program name:
               <original dir> <textfile> <derivative dir> [start filename]

    Every *.TIF file in the original directory that has an ARC ID in the text
    file is processed in sorted order, starting at the start filename if one
    was given.  For each file the derivatives are created, the wiki is
    searched for copies that exist already, and then every version is either
    uploaded or, where a copy exists, the existing page gets a gallery of the
    other versions.

    Calls sys.exit() if the arguments cannot be used.
    '''
    workdir = u''
    textfile = u''
    records = {}

 
    site = wikipedia.getSite(u'commons', u'commons')
    wikipedia.setSite(site)
 
    if  (len(args) < 3):
        wikipedia.output(u'Too few arguments. Usage: NARA_uploader.py <original dir> <textfile> <derivative dir> [start filename]')
        sys.exit()
 
    if os.path.isdir(args[0]):
        workdir = args[0]
    else:
        wikipedia.output(u'%s doesn\'t appear to be a directory. Exiting.' % (args[0],))
        sys.exit()
        
    # the derivative directory is created if it is missing
    derivativeDirectory = args[2]
    if os.path.exists(derivativeDirectory) and not os.path.isdir(derivativeDirectory):
        wikipedia.output(u"%s exists, but isn't a directory. Exiting." % derivativeDirectory)
        sys.exit()
    elif not os.path.exists(derivativeDirectory):
        wikipedia.output(u'%s doesn\'t appear to exist. Creating.' % derivativeDirectory)
        os.mkdir(derivativeDirectory)
        
    # optional fourth argument: the file (inside workdir) to start from
    try:
        startFile = args[3]
        startFileFound = False
        
        startPath = os.path.join(workdir, startFile)
        
        if not os.path.exists(startPath) or os.path.isdir(startPath):
            wikipedia.output(u"%s doesn't exist, or it is directory. Exiting." % startPath)
            sys.exit()
    
    except IndexError:
        # no fourth argument: start with the first file
        startFile = None
        
 
    textfile = args[1]
    records = getRecords(textfile)
    #print records
 
    # only files with an upper-case .TIF extension are picked up
    sourcefilenames = glob.glob(workdir + u"/*.TIF")
    sourcefilenames.sort()
 
    for sourcefilename in sourcefilenames:
        
        wikipedia.output(u'\nProcessing %s' % sourcefilename)
        
        if startFile: #if we want to skip to a file
            fileHead, fileTail = os.path.split(sourcefilename)
            
            if not startFileFound:
                if fileTail != startFile:
                    wikipedia.output('Skipping %s' % sourcefilename)
                    continue
                else: #we have found the start point
                    startFileFound = True
        
        filename = os.path.basename(sourcefilename)
        # A file without an ARC ID in the text file is skipped (an ID of 0
        # is treated the same way), and so is a file of 100 MiB or more.
        if not records.get(filename):
             wikipedia.output(u'Can\'t find %s in %s. Skipping this file.' % (filename, textfile))
        elif os.path.getsize(sourcefilename) >= 1024 * 1024 * 100:
             wikipedia.output(u'%s too big. Skipping this file.' % (sourcefilename,))
        else:
            fileId = records.get(filename)
            
            wikipedia.output(u'Found file ID: %d' % fileId)
 
            
            #generate all the files we might need to upload
            filesToUpload = createDerivatives(sourcefilename, derivativeDirectory)
            
            
            # extension -> name of the copy that is on the wiki already
            duplicateFiletypes = {}
            #check for duplicates of the original on wiki
            for fileInfo in filesToUpload:   
            
                # only entries with the extension '.tif' are looked up by hash
                if fileInfo['ext'] != '.tif' :
                    continue
                
                foundDuplicates = findDuplicateImagesByHash(fileInfo['name'])
                
                duplicateFiletypes = addDuplicatesToList(fileInfo, foundDuplicates, duplicateFiletypes)
                    
            # follow the naming + description from the tif if it exists, or make it up from the description
            if '.tif' in duplicateFiletypes:
                title = duplicateFiletypes['.tif']
                
                wikipedia.output(u'Fetching description from TIF file page: %s' % title )
                description = wikipedia.Page(site, 'File:' + title).get()

            else:
                description = fetchDescriptionFromWebtool(fileId)
                
                if not description:
                    wikipedia.output(u'No description! Skipping this file.' )
                    continue
                else:
                    # new uploads are tagged as uncategorized, with the date
                    # filled in by the wiki when the page is saved
                    categories = u'{{Uncategorized-NARA|year={{subst:CURRENTYEAR}}|month={{subst:CURRENTMONTHNAME}}|day={{subst:CURRENTDAY}}}}\n'
                    description = description + categories
                    
                    title = getTitle(fileId, description)
                    
                    # getTitle gives False when the description has no title line
                    if not title:
                        continue
                
            #check for duplicates of the derivatives (using the filename we just made)
            for fileInfo in filesToUpload:   
            
                if fileInfo['ext'] == '.tif' :
                    continue
                    
                titleRoot, ext = os.path.splitext(title)
                fileTitle = titleRoot + fileInfo['ext']
                
                # derivatives are looked up by name, not by hash
                foundDuplicates = findDuplicateImagesByName(fileTitle)
                
                duplicateFiletypes = addDuplicatesToList(fileInfo, foundDuplicates, duplicateFiletypes)
                
            #construct the gallery
            filesToUpload = setDestinations(filesToUpload, title)
            gallery = createDerivativeGallery(filesToUpload, title)
            
            #for every file, including original and derivatives
            for fileInfo in filesToUpload:
                
                titleRoot, ext = os.path.splitext(title)
                fileTitle = titleRoot + fileInfo['ext']

                if fileInfo['ext'] in duplicateFiletypes: #we have a duplicate: add derivs if needed
                
                    currentFilename = duplicateFiletypes[fileInfo['ext']]
                    
                    currentFilePage = wikipedia.Page(site, 'File:' + currentFilename)
                    
                    currentDescription = currentFilePage.get()
                    
                    # False when the page has no NARA-image-full template or
                    # has a gallery already; the message below is shown in
                    # both cases
                    currentDescription = addDerivativesToDescription(currentDescription, gallery, title)
                    
                    if currentDescription:
                        wikipedia.output('Updating the description for %s:\n\n%s' % (currentFilename, currentDescription))
                        currentFilePage.put( currentDescription, comment="Adding other versions to the description." )
                    else:
                        wikipedia.output('Gallery exists on page %s' % currentFilename)
                
                else: #upload the file with generated info   
                    
                    wikipedia.output(fileInfo['name'] +' --> '+ fileInfo['dest'])
                    
                    newDescription = addDerivativesToDescription(description, gallery, title)
                    
                    if newDescription: #if the gallery add failed due to existing gallery, just carry on with the original
                        description = newDescription

                    # the TIFF parameter is only kept for the TIFF itself
                    fileDescription = removeTIFFParameter(description, fileInfo['ext'])
                    wikipedia.output(fileDescription)
                    bot = upload.UploadRobot(url=fileInfo['name'].decode(sys.getfilesystemencoding()), description=fileDescription, useFilename=fileInfo['dest'], keepFilename=True, verifyDescription=False)
                    bot.run()
 
if __name__ == "__main__":
    commandLineArgs = sys.argv[1:]
    try:
        main(commandLineArgs)
    finally:
        # printed however main() ended: normal return, sys.exit() or an error
        print(u'All done')