User:Tpt/djvuocr.py
< User:Tpt
This script for pywikipedia on GNU/Linux performs the OCR of a DjVu file by adding a text layer. It is completely automated: it downloads the file, runs the OCR and uploads the new version. It uses Tesseract 3.00 and, if you use a language other than English, the corresponding language pack (available in all good repositories such as Debian, Ubuntu or Fedora).
Sample : python djvuocr.py -filelang:fra -djvu:"Name of the djvu.djvu"
djvuocr.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This bot does the OCR of a djvu file. It is intended to be used for Commons.
The following parameters are supported:
-djvu:... Name of the djvu file on commons
-filelang:... The language of the text, like eng or fra
(Default: eng)
"""
#
# (C) Pywikipedia bot team, 2008-2010
#
# Distributed under the terms of the MIT license.
#
__version__ = '$Id: djvuocr.py 9246 2011-07-29 15:42:46Z xqt $'
import codecs
import os
import subprocess
import sys
import urllib

import wikipedia as pywikibot
import config
import query
class AppURLopener(urllib.FancyURLopener):
    """URL opener that announces itself as the Pywikipedia bot."""

    # urllib sends this value as the User-Agent of every request made
    # through the opener.
    version = 'Pywikipediabot/1.0'


# Install an instance as urllib's module-wide default opener so that
# urllib.urlretrieve() goes through it.
urllib._urlopener = AppURLopener()
# Needed for the help text printed when the script is started with -help.
# This script defines no replacements, hence the empty mapping.
docuReplacements = {}
class DjVuOCRBot:
    """Bot that adds a Tesseract-generated text layer to a DjVu file.

    The file is downloaded into ``tmp_dir``; each selected page is rendered
    to TIFF with ``ddjvu``, recognised with ``tesseract`` and written back as
    hidden text with ``djvused``; the modified file is then uploaded under
    the same name.
    """

    # Edit summary message that should be used.
    # NOTE: Put a good description here, and add translations, if possible!
    msg = {
        'en': u'Robot: OCR of the file with Tesseract',
        'fr': u'Bot: OCR du fichier avec Tesseract',
    }

    # Directory that receives the downloaded file and the per-page scratch
    # files (<name>.tiff, <name>.txt, <name>.djvu-txt).
    tmp_dir = u'/tmp'

    def __init__(self, djvu, filelang='eng', ask=False, debug=False,
                 pages=None):
        """
        Constructor. Parameters:
            djvu     : name of the DjVu file on the wiki
            filelang : Tesseract language code, e.g. 'eng' or 'fra'
            ask      : stored as self.ask; not consulted by this class
            debug    : dry run - do the OCR locally but do not upload
            pages    : optional page selection: None for every page, 'N' for
                       one page, 'N-M' for a range (either end may be
                       omitted)
        """
        self.djvu = djvu
        self.dry = debug
        self.ask = ask
        self.filelang = filelang
        # PagesGenerator() reads this attribute, so it must always exist.
        self.pages = pages
        self.targetSite = pywikibot.getSite()

    def _path(self, suffix=u''):
        """Return the scratch path of the DjVu file plus an optional suffix."""
        return u'%s/%s%s' % (self.tmp_dir, self.djvu, suffix)

    @staticmethod
    def _argv(args):
        """Return args as an argv list the running Python can execute.

        On Python 2 unicode arguments are encoded with the file system
        encoding; native ``str`` arguments are passed through untouched.
        """
        encoding = sys.getfilesystemencoding() or 'utf-8'
        return [a if isinstance(a, str) else a.encode(encoding) for a in args]

    def _run(self, args):
        """Run an external tool without a shell and return its exit status.

        Passing an argument list instead of a shell string means file names
        containing quotes, spaces or ``$`` need no escaping, and the command
        no longer depends on sys.stdout.encoding (None when output is piped).
        Raises OSError when the executable cannot be started.
        """
        return subprocess.call(self._argv(args))

    @staticmethod
    def _remove_quietly(path):
        """Delete path, ignoring the case where it cannot be removed."""
        try:
            os.remove(path)
        except OSError:
            pass

    @staticmethod
    def _parse_page_range(pages, last_page):
        """Translate a page selection into a (start, end) tuple.

        pages is None/empty (all pages), 'N' (single page) or 'N-M'; an
        omitted N defaults to 1 and an omitted M to last_page.
        Raises ValueError when a bound is not a number.
        """
        start, end = 1, last_page
        if pages:
            first, dash, last = pages.partition('-')
            if dash:
                if first:
                    start = int(first)
                if last:
                    end = int(last)
            else:
                start = end = int(pages)
        return start, end

    @staticmethod
    def _format_text_layer(lines):
        """Return the djvused text S-expression for the OCRed lines.

        Returns an empty string when there are no lines. Every line is
        written with the constant box ``0 0 1 1``, i.e. the text is not
        positioned on the page. Backslashes and double quotes are escaped
        rather than dropped so the recognised text is kept intact.
        """
        if not lines:
            return u''
        parts = [u'(page 0 0 1 1\n']
        for line in lines:
            text = line.strip()
            # Backslash first, otherwise the escapes added for the quotes
            # would themselves be doubled.
            text = text.replace(u'\\', u'\\\\').replace(u'"', u'\\"')
            parts.append(u'(line 0 0 1 1 "%s")\n' % text)
        parts.append(u')\n')
        return u''.join(parts)

    def NoOfImages(self):
        """Return the number of pages of the downloaded DjVu file.

        Raises ValueError when djvused prints no page count (e.g. the file
        is not a valid DjVu document).
        """
        proc = subprocess.Popen(
            self._argv(['djvused', '-e', 'n', self._path()]),
            stdout=subprocess.PIPE)
        output = proc.communicate()[0]
        count = int(output.strip())
        pywikibot.output("page count = %d" % count)
        return count

    def PagesGenerator(self):
        """Return the list of 1-based page numbers selected by self.pages."""
        start, end = self._parse_page_range(self.pages, self.NoOfImages())
        pywikibot.output(u"Processing pages %d-%d" % (start, end))
        return range(start, end + 1)

    def run(self):
        """Download the file, OCR the selected pages and upload the result.

        Scratch files are removed even when a step fails. In dry-run mode
        nothing is uploaded and the processed DjVu file is left in tmp_dir.
        """
        self.djvu = self.djvu.replace(' ', '_')
        if not self.djvu.endswith('.djvu'):
            pywikibot.output("It isn't a djvu file")
            return
        local = self._path()
        keep_result = False
        try:
            # download the djvu
            djvuPage = pywikibot.ImagePage(self.targetSite,
                                           'File:%s' % self.djvu)
            if not djvuPage:
                pywikibot.output("The djvu file can't be downloaded")
                return
            pywikibot.output('Download of the djvu file')
            try:
                urllib.urlretrieve(djvuPage.fileUrl(), local)
            except Exception:
                # Not a bare except: Ctrl-C must still abort the bot.
                pywikibot.output("The djvu file can't be downloaded")
                return
            if not os.path.exists(local):
                pywikibot.output("The djvu file can't be downloaded")
                return
            # OCR
            pywikibot.output(u"OCRing text from %s" % (self.djvu))
            for pageno in self.PagesGenerator():
                pywikibot.output("Processing page %d" % pageno)
                self.ocr_page(pageno)
            # upload
            if self.dry:
                keep_result = True
                pywikibot.output(
                    u"Dry run: nothing uploaded, result kept as %s" % local)
            else:
                # DjVu is a binary format: read the bytes untouched.
                with open(local, 'rb') as f:
                    content = f.read()
                self.upload(content)
        finally:
            if not keep_result:
                self._remove_quietly(local)
            for suffix in (u'.txt', u'.tiff', u'.djvu-txt'):
                self._remove_quietly(self._path(suffix))

    def ocr_page(self, pageno):
        """Replace the text layer of page pageno with Tesseract's output.

        A page that cannot be rendered or recognised is reported and left
        unchanged. A page on which Tesseract finds no text only has its old
        text layer removed.
        """
        djvu = self._path()
        tiff = self._path(u'.tiff')
        text = self._path(u'.txt')
        layer = self._path(u'.djvu-txt')

        # The scratch names are shared by all pages. Remove the previous
        # page's files so that a failing tool is detected by its missing
        # output instead of silently reusing stale data.
        self._remove_quietly(tiff)
        self._remove_quietly(text)

        self._run(['ddjvu', '-format=tiff', '-page=%d' % pageno, djvu, tiff])
        if not os.path.exists(tiff):
            pywikibot.output(u"Page %d could not be rendered, skipped"
                             % pageno)
            return
        # tesseract appends ".txt" to the output base name by itself.
        self._run(['tesseract', tiff, djvu, '-l', self.filelang])
        if not os.path.exists(text):
            pywikibot.output(u"Page %d could not be OCRed, skipped" % pageno)
            return

        # Tesseract writes UTF-8 and djvused reads UTF-8, independently of
        # the encoding configured for the bot's own text files.
        with codecs.open(text, 'r', 'utf-8', 'replace') as f:
            lines = f.readlines()

        self._run(['djvused', djvu,
                   '-e', 'select %d; remove-txt' % pageno, '-s'])
        layer_text = self._format_text_layer(lines)
        if not layer_text:
            return
        with codecs.open(layer, 'w', 'utf-8') as ft:
            ft.write(layer_text)
        self._run(['djvused', djvu,
                   '-e', u'select %d; set-txt "%s"' % (pageno, layer), '-s'])

    # function from upload.py of pywikipedia
    # (C) Rob W.W. Hooft, Andre Engels 2003-2004
    # (C) Pywikipedia bot team, 2003-2010
    def upload(self, djvu):
        """Upload new file content under the name self.djvu via the API.

        djvu is the raw content of the processed file. The outcome is only
        reported through pywikibot.output(); nothing is returned and no
        retry is attempted.
        """
        if (not self.targetSite.has_api()
                or self.targetSite.versionnumber() < 16):
            pywikibot.output("The file can't be uploaded : the wiki have a too old configuration")
            return
        params = {
            'action': 'upload',
            'token': self.targetSite.getToken(),
            'comment': pywikibot.translate(pywikibot.getSite(), self.msg),
            'ignorewarnings': 1,
            'filename': self.djvu,
            'file': djvu,
        }
        pywikibot.output(u'Uploading file to %s via API....'
                         % self.targetSite)
        data = query.GetData(params, self.targetSite)
        if pywikibot.verbose:
            pywikibot.output("%s" % data)
        if 'error' in data:
            pywikibot.output("%s" % data)
            return
        if data.get('upload', {}).get('result') == 'Success':
            pywikibot.output("Upload successful.")
        else:
            # Do not stay silent when the API answers without an error but
            # also without a successful result.
            pywikibot.output("Upload failed: %s" % data)
def main():
    """Parse the command line and run the OCR bot on the requested file.

    Recognised arguments: -djvu:<name> (required), -filelang:<code>
    (defaults to 'eng'), -dry and -ask. Without -djvu the help text is
    shown instead.
    """
    djvu = None
    # Default language; without it the name was unbound whenever -filelang
    # was not given and the bot could not even be constructed.
    filelang = 'eng'
    dry = False
    ask = False

    # Parse command line arguments
    for arg in pywikibot.handleArgs():
        if arg.startswith("-dry"):
            dry = True
        elif arg.startswith("-ask"):
            ask = True
        elif arg.startswith("-djvu:"):
            djvu = arg[6:]
        elif arg.startswith("-filelang:"):
            filelang = arg[10:]
        else:
            pywikibot.output(u"Unknown argument %s" % arg)

    if djvu:
        bot = DjVuOCRBot(djvu, filelang, ask, dry)
        bot.run()
    else:
        pywikibot.showHelp()
if __name__ == '__main__':
    # Script entry point: stopme() is called whether main() returns normally
    # or raises.
    try:
        main()
    finally:
        pywikibot.stopme()