# Source: [[User:Dispenser/Absurd overhead.py]] (saved from a wiki page; navigation links removed)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Absurd Overhead
usage: python absurd_overhead.py [language] [img_name|file page id]
# Ubuntu dependencies
# Programs: exiftool, jpegtran, optipng, unrar-nonfree, identify, pngcheck, gifsicle
sudo apt-get install libimage-exiftool-perl libjpeg-progs optipng unrar imagemagick pngcheck gifsicle
sudo pip install phpserialize oursql
# NON-FREE SOFTWARE, DO NOT USE ON WIKIMEDIA SERVERS
# pngout <http://www.advsys.net/ken/utils.htm> [Freeware]
wget http://static.jonof.id.au/dl/kenutils/pngout-20150319-linux.tar.gz -O - | tar -zxv
cd pngout-20150319-linux/ # Find your files for your system
# sudo cp . /opt
"""
import hashlib, mmap, os, re, time, urllib
import dbm, oursql, shutil, subprocess
import phpserialize
try:
# https://github.com/toollabs/embeddeddata
from detection import detect
except ImportError:
detect = None
os.sys.path.append(os.path.expanduser('~/pywikibot/'))  # so `import pywikibot` resolves later
StartTime=time.time()  # wall-clock start, used by the timing printouts
os.nice(10) # Lower CPU priority so this batch job doesn't starve the host
# Configuration
skip_ifunder = 250*1024  # skip files where the potential saving is under 250 KB
lang = (os.sys.argv[1:2] or ['commons'])[0]  # wiki database prefix (default: commons)
test_img_name = (os.sys.argv[2:3] or [''])[0]  # optional single image name or page id for testing
log_db = dbm.open(os.path.expanduser('./absurd_cache'), 'c')  # persistent per-SHA1 result cache; expanduser is a no-op on a relative path
download_dir = os.path.expanduser('/user-data/images/%s/'%lang)  # local mirror of downloaded originals
output_file = os.path.expanduser('./Absurd_overhead.%s.txt'%lang)  # wikitable report destination
overhead_table = 'u2815__.file_overhead'  # SQL table caching per-file metadata overhead
os.chdir( './' ) # Change working directory (no-op as written)
# Magic numbers
# Maps a short key to (signature bytes, human-readable description).
# Only the signature (element [0]) is used for scanning; the description is
# informational.
magic_numbers = {
    # Hidden archive formats
    '7zip': (b'7z\xBC\xAF\x27\x1C', '7z archive', ),
    'rar4': (b'Rar!\x1A\x07\x00', 'RAR 1.5 to 4.0', ),
    'rar5': (b'Rar!\x1A\x07\x01\x00', 'RAR 5+', ),
    #'tar': (b'ustar \x00', 'Tar archive', ),
    # BUGFIX: a missing comma made the signature and description concatenate
    # into a single string, so the 'tar' needle could never match.
    'tar': (b'ustar\x0000', 'Tar archive', ),
    'zip': (b'PK\x03\x04', 'ZIP archive', ),
    'zips': (b'PK\x07\x08', 'ZIP spanned archive', ),
    # too short
    #'bz2': (b'BZh', 'bzip2 archive' ), # BHh[1-9]
    #'gz': (b'\x1F\x8B\x08', 'GZip', ),
    # Metadata
    'exif': (b'EXIF\x00', ''),
    'jfif': (b'JFIF\x00', ''),
    'xmp': (b'<x:xmpmeta', ''),
    'icc': (b'ICC_PROFILE', 'ICC profile', ),
    # Misc
    'asf': (b'\x30\x26\xB2\x75\x8E\x66\xCF\x11', 'WMA/ASF media', ),# http://www.digitalpreservation.gov/formats/fdd/fdd000027.shtml
    'wmv': (b'\x30\x26\xB2\x75\x8E\x66\xCF', 'Windows Video file', ),
    'msi': (b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1', 'Microsoft Office file/MSI', ),
    'mkv': (b'\x1A\x45\xDF\xA3', 'Matroska/WebM video', ),
    'djvu': (b'AT&TFORM', 'DjVu document',),
    'jp2': (b'\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A', 'JPEG 2000 graphic file', ),
    'gif9': (b'GIF89', 'GIF graphic file', ),
    'gif': (b'GIF87', 'GIF graphic file', ),
    # BUGFIX: the EPS header reads "EPSF-3.0", not "EPSF-3 0".
    'eps': (b'%!PS-Adobe-3.0 EPSF-3.0', 'EPS File', ),
    'pdf': (b'%PDF', 'PDF Document', ),
    'docx': (b'\x50\x4B\x03\x04PK', 'Office 2010 file', ),
    # MPEG 4 seems to be r'...ftyp\w{2,4}' http://www.ftyps.com/
    'mp4': (b'ftyp', 'MPEG-4 video|QuickTime file', ),
}
connections = {}  # (host, dbname) -> live oursql connection
def getConn(dbname, host=None, reconnect=False):
    """Return a cached oursql connection to *dbname*, reconnecting if dead.

    host defaults to the Labs convention: the database name minus its
    trailing '_p' suffix plus '.labsdb'.
    """
    try:
        connections[host, dbname].ping()
    except Exception:
        # Missing cache entry (KeyError) or a dead/unpingable connection:
        # either way a fresh connect is needed.  (BUGFIX: was a bare except.)
        reconnect = True
    if (host, dbname) not in connections or reconnect:
        connections[host, dbname] = oursql.connect(
            db=dbname,
            host=host or dbname[:-2]+'.labsdb',
            read_default_file=os.path.expanduser('~/.my.cnf'),
            charset=None,
            compress=True,
            use_unicode=False,
            autoping=True
        )
    return connections[host, dbname]
def base36encode(number):
    """Encode a non-negative integer as lowercase base-36, zero-padded to 31
    digits (MediaWiki's img_sha1 representation of a 160-bit SHA-1)."""
    digits = '0123456789abcdefghijklmnopqrstuvwxyz'
    encoded = []
    while number:
        number, rem = divmod(number, 36)
        encoded.append(digits[rem])
    # Digits were produced least-significant first; reverse before padding.
    return ''.join(reversed(encoded)).zfill(31)
def sha1file(filepath):
    """Return the hex SHA-1 digest of *filepath*.

    Reads in 1 MB chunks instead of slurping the whole file -- the images
    processed here can be hundreds of megabytes.
    """
    sha1 = hashlib.sha1()
    with open(filepath, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):
            sha1.update(chunk)
    return sha1.hexdigest()
def sha1file36(filepath):
    """SHA-1 of *filepath* as a 31-character base-36 string, matching the
    format of MediaWiki's img_sha1 column."""
    hex_digest = sha1file(filepath)
    return base36encode(int(hex_digest, 16))
def mkdir_p(path):
    """Create *path* like `mkdir -p`: parents included, no error if it
    already exists as a directory."""
    import errno
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        # Swallow only "already exists as a directory"; re-raise anything else.
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise
def reduction(end_size, img_size):
    """Format the size change from img_size to end_size as a signed percent
    byte string (e.g. b'-12%'); b'-' when end_size is negative (no result)."""
    if end_size < 0:
        return b'-'
    percent = 100.0 * (end_size - img_size) / float(img_size)
    return b'%+2.0f%%' % (percent,)
def put_pywikibot(lang, title, new_text, summary=None, prompt=True):
    """Replace the first wikitable on *title* with *new_text* (Python 2 only).

    The page's existing `{| ... |}` table is stripped and new_text appended.
    NOTE(review): as written this can never actually save -- `summary` is
    forced to '' below so the confirmation prompt is unreachable, and
    pywikibot.config.simulate is set before the (commented-out) page.put().
    """
    import pywikibot
    site = pywikibot.Site('commons', 'commons') if lang=='commons' else pywikibot.Site(lang)
    page = pywikibot.Page(site, title)
    # Strip the previous report table (multiline, dot-matches-newline regex).
    new_text = re.sub(ur'^\{\|.*?^\|\}$', "", page.get(), flags=re.U | re.M | re.DOTALL) + new_text
    # Sanity guard: only proceed if the page would not shrink by 200+ lines.
    if new_text.count('\n') - page.get().count('\n') > -200:
        pywikibot.showDiff(page.get(), new_text)
        site.login()
        summary = ''#raw_input('Summary for edit:') or 'Update '
        print page.title(asLink=True)
        if page.botMayEdit():
            # `summary` is always '' here, so this branch never runs.
            if summary and (raw_input('Save to wiki as User:%s? [yes/No] ' % (site.user(), )) in ('yes', 'y')):
                pywikibot.config.simulate = True
                #page.put(new_text, summary, minorEdit=False, botflag=False)
# Pregenerate table for quick debugging
def cache_images(cursor):
    """Rebuild u2815__.absurd_images: names of JPEG/PNG bitmaps whose stored
    size exceeds a rough raw-pixel estimate plus 16 KB slack.

    Used on Commons to pre-filter the huge `image` table so debugging runs of
    the main query are quicker.  Takes ~15 minutes.
    """
    # Two execute() calls chained on one line; the `-- ` line comments keep
    # the text valid when pasted straight into a mysql shell.
    cursor.execute('''
-- ; mysql -h commonswiki.labsdb commonswiki_p
DROP TABLE IF EXISTS u2815__.absurd_images;
-- ''');cursor.execute('''
/* absurd_images 15 min <SLOW_OK> */
CREATE TABLE u2815__.absurd_images (
ao_img_name VARBINARY(255) NOT NULL PRIMARY KEY
) ENGINE=MyISAM AS
SELECT img_name AS ao_img_name
FROM image
WHERE img_media_type="BITMAP"
AND img_major_mime="image"
AND img_minor_mime IN ("jpeg", "png")
AND img_size > IF(img_minor_mime="jpeg", 3,
IF(img_metadata LIKE '%s:16:"truecolour-alpha"%', 4,
IF(img_bits<8 OR img_metadata LIKE '%s:14:"index-coloured"%' OR img_metadata LIKE '%s:9:"greyscale"%', 1, 3)
) * img_bits / 8
) * img_width * img_height + 16*1024;
''')
def main():
cursor = getConn('%swiki_p'%lang).cursor()
cursor.execute('CREATE DATABASE IF NOT EXISTS '+overhead_table.partition('.')[0])
query_where = ["img_size > ?"]
query_data = [skip_ifunder]
if test_img_name:
if test_img_name.isdigit():
query_where.append("page_id=?")
query_data.append(test_img_name)
else:
query_where.append("img_name=?")
query_data.append(test_img_name)
else:
if lang=='commons':
#cursor.execute('DROP TABLE IF EXISTS '+overhead_table)
#cache_images(cursor)
pass
cursor.execute('''
CREATE TABLE IF NOT EXISTS '''+overhead_table+''' (
fo_page INT NOT NULL,
fo_sha1 VARCHAR(32) NOT NULL PRIMARY KEY,
fo_icc_size INT,
fo_identify_size INT,
fo_exiftool_size INT,
fo_size INT NOT NULL
);''')
cursor.execute('''/* absurd_images 20 min <SLOW_OK> */
SELECT
img_name,
img_size,
img_width,
img_height,
img_bits,
IF(img_minor_mime="jpeg", 3, /* Workaround for [[phab:T132986]] */
IF(img_metadata LIKE '%s:16:"truecolour-alpha"%', 4,
IF(img_bits<8 OR img_metadata LIKE '%s:14:"index-coloured"%' OR img_metadata LIKE '%s:9:"greyscale"%', 1, 3)
) * img_bits / 8
) * img_width * img_height + IFNULL(fo_size, 50*1024) AS est_size,
img_minor_mime,
img_metadata,
user_name,
user_editcount,
img_timestamp,
img_sha1,
EXISTS (SELECT 1 FROM ipblocks_ipindex WHERE ipb_user=user_id AND (ipb_expiry="infinity" OR ipb_expiry>NOW()) LIMIT 1) AS user_block,
fo_size,
'''+('(SELECT COUNT(*) FROM globalimagelinks WHERE gil_to=img_name)' if lang=='commons' else '-1')+''' AS img_usage
FROM image '''+
('' if lang!='commons' else 'JOIN u2815__.absurd_images ON img_name=ao_img_name /*quicker debugging*/')+
'''
JOIN page ON page_namespace=6 AND page_title=img_name
LEFT JOIN user ON user_id=img_user
LEFT JOIN '''+overhead_table+''' ON fo_sha1=img_sha1
LEFT JOIN categorylinks ON cl_from=page_id AND cl_to IN ("Animated_PNG", "Fireworks_PNG_files")
WHERE img_width>0 AND img_height>0
AND img_media_type="BITMAP" AND img_major_mime="image"
AND img_minor_mime IN ("jpeg", "png")
AND '''+' AND '.join(query_where)+'''
AND img_size * 0.996 > IF(img_minor_mime="jpeg", 3,
IF(img_metadata LIKE '%s:16:"truecolour-alpha"%', 4,
IF(img_bits<8 OR img_metadata LIKE '%s:14:"index-coloured"%' OR img_metadata LIKE '%s:9:"greyscale"%', 1, 3)
) * img_bits / 8
) * img_width * img_height + IFNULL(fo_size, 0) + 4 * 1024
AND cl_from IS NULL
ORDER BY CAST(img_size AS SIGNED)-est_size DESC
LIMIT 50000;
''', tuple(query_data))
print 'Queried for images in %2.4g minutes, %swiki, rows: %s'%((time.time()-StartTime)/60.0,lang,cursor.rowcount)
f = open(output_file, 'w+b')
f.write(b'\xEF\xBB\xBF')
def write(text):
f.write(text.encode('utf-8') if isinstance(text, unicode) else str(text))
f.write(b'\n')
write(b"""\
{| class="wikitable sortable plainlinks" style="text-align:center"
|-
! Name
! Date
! Size (KB)
! BMP
! Zip
! Trim
! Opti
! Links
! Usage
! Uploader
! Notes""")
count = 0
for img_name, img_size, img_width, img_height, img_bits, est_size, img_minor_mime, img_metadata, user_name, user_editcount, img_timestamp, img_sha1, user_block, fo_size, img_usage in cursor.fetchall():
if img_sha1+'.skip' in log_db:
# We skipped this before
continue
notes = []
print
print img_name.center(len(img_name)+2).decode('utf-8').center(79, '=').encode('utf-8')
metadata = {}
if img_metadata not in ('', '-1', '0'): # https://phabricator.wikimedia.org/T155741
try:
metadata = phpserialize.loads(img_metadata)
except ValueError as e:
print 'img_metadata decode error:', e
# Check for animation
img_framecount = metadata.get('frameCount', 1)
if img_framecount > 1:
print 'Skipping animation [[%s]] (%s frames)' % (img_name, img_framecount)
continue
# Use color channels from MediaWiki
if metadata.get('colorType') in (b'index-coloured', 'greyscale') or img_bits < 8:
img_channels = 1
elif metadata.get('colorType') == b'truecolour':
img_channels = 3
elif metadata.get('colorType') == b'truecolour-alpha':
img_channels = 4
elif metadata.get('colorType'):
raise Exception('Unknown colorType: %s' % metadata.get('colorType'))
else: # Assume 3 channels
img_channels = 1 if img_minor_mime=='gif' else 3 if img_minor_mime=='jpeg' else 4
# Make image URL
img_name_md5 = hashlib.md5(img_name).hexdigest()
mw_url = 'https://upload.wikimedia.org/wikipedia/%s/%s/%s/%s' % (
lang,
img_name_md5[0:1],
img_name_md5[0:2],
urllib.quote(img_name),
)
mkdir_p(download_dir)
if os.path.exists('absurd.img'):
os.remove('absurd.img')
# Do we already have the file?
if os.path.isfile(os.path.join(download_dir, img_name)):
# Is it the right size?
new_name = os.path.join(download_dir, img_name)
if os.path.getsize(new_name) != img_size:
print 'File size mismatch: DL %s != SQL %s for %s' % (os.path.getsize(new_name), img_size, img_name)
while os.path.exists(new_name):
new_name += '_'
os.rename(os.path.join(download_dir, img_name),os.path.join(download_dir, new_name))
# Either (re)download it or copy it over
if not os.path.isfile(os.path.join(download_dir, img_name)):
ec = os.system('wget "%s" --output-document="absurd.img" --no-clobber --limit-rate=2M'%(mw_url,))
if ec: raise ec
shutil.copy2('absurd.img', os.path.join(download_dir, img_name))
else:
shutil.copy2(os.path.join(download_dir, img_name), 'absurd.img')
# SHA1 check
if sha1file36('absurd.img') != img_sha1:
print sha1file36('absurd.img'), '!=', img_sha1
raise ValueError('SHA-1 hash mismatch for [[File:%s]]' % img_name)
# Find metadata overhead
if fo_size == None:
# ImageMagick's identify (better for JPEGs)
identify_size = 0
fo_icc_size = None
p7zip = subprocess.Popen(
['identify', '-verbose', 'absurd.img'],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
)
stdout, stderr = p7zip.communicate()
if stderr or p7zip.returncode:
print stderr
print 'Exit code:', p7zip.returncode
for m in re.finditer(r' *(.*?): (\d+) bytes', stdout):
print m.group()
identify_size += int(m.group(2))
if m.group(1) == 'Profile-icc':
fo_icc_size = int(m.group(2))
# ExifTool (better for PNGs)
exiftool_size = 0
exif= subprocess.Popen(
['exiftool', '-a', '-b', 'absurd.img'],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
)
#{
# 'Profile-8bim': '',
# 'Profile-APP3': '',
# 'Profile-exif': '-exif',
# 'Profile-icc': '-icc_profile',
# 'Profile-iptc': '-iptc',
# 'Profile-xmp': '-xmp',
#}
stdout, stderr = exif.communicate()
if stderr:
print 'ERROR', stderr
if exif.returncode:
print 'Exit code:', exif.returncode, ' xmp bytes: ', len(stdout)
elif stdout:
exiftool_size = len(stdout)
print '\tExifTool: %d bytes' % (exiftool_size,)
#print 'stdout', stdout,
fo_size = identify_size if img_minor_mime=='jpeg' else exiftool_size
cursor = getConn('%swiki_p'%lang).cursor()
cursor.execute(
"INSERT INTO "+overhead_table+
"(fo_page, fo_sha1, fo_icc_size, fo_identify_size, fo_exiftool_size, fo_size) "+
"VALUES (?, ?, ?, ?, ?, ?) "+
"ON DUPLICATE KEY UPDATE fo_size=fo_size",
(0, img_sha1, fo_icc_size, identify_size, exiftool_size, fo_size)
)
if img_minor_mime=='png':
pngcheck = subprocess.Popen(
['pngcheck', '-v', 'absurd.img'],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
)
stdout, stderr = pngcheck.communicate()
if 'Macromedia Fireworks private' in stdout and not notes:
print 'Skipping Fireworks file'
log_db[img_sha1+'.skip'] = "Fireworks"
continue
elif 'Microsoft Picture It private' in stdout and not notes:
print 'Skipping Picture It! file'
log_db[img_sha1+'.skip'] = "PictureIt"
continue
else:
print stdout if stdout.count('\n') < 10 else stdout[stdout.rstrip().rfind('\n')+1:]
if 'ERRORS DETECTED' in stdout:
notes.append(stdout.strip().split('\n')[-2].strip())
if stderr or pngcheck.returncode:
print stderr
print '!'*60
if '-bit RGB+alpha, ' in stdout:
img_channels = 4
elif '-bit RGB, 'in stdout:
img_channels = 3
elif '-bit palette, ' in stdout:
img_channels = 1
elif '-bit grayscale, ' in stdout:
img_channels = 1
else:
print stdout
print stderr
raise Exception('Unrecongized pngcheck -v output')
def magicSearch(needle):
with open('absurd.img', 'r+b') as f:
data = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
result = data.find(needle)
return result != -1
magic = dict((k, magicSearch(v[0])) for k,v in magic_numbers.iteritems())
if any(magic.values()):
print 'Magic numbers: %s'%(', '.join((str(k) for k,v in magic.iteritems() if v)))
# Check common archive formats
is_7z_file = 0
is_rar_file = 0
is_encrypted = False
if magic['rar4'] or magic['rar5']:
unrar = subprocess.Popen(
['unrar-nonfree', 'Lb', '-ppassword', 'absurd.img'],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
)
stdout, stderr = unrar.communicate()
is_rar_file = stdout.count('\n')
print stdout, stderr
print 'RAR: v1-4:%5s v5+:%5s %3s files ' % (magic['rar4'], magic['rar5'], is_rar_file,)
if unrar.returncode:
print '!'*60
print unrar.returncode
raise Exception(stdout)
if magic['7zip'] or (not is_rar_file and magic['rar4']) \
or magic['zip'] or magic['zips']:
p7zip = subprocess.Popen(
['7z', 'l', '-p', 'absurd.img'],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
)
stdout, stderr = p7zip.communicate()
print stdout, stderr, 'exit code: ', p7zip.returncode
m = re.search(r'\s(\d) files, ', stdout)
is_7z_file = m.group(1) if m else 0
print '7z: %s %3s files ' % (magic['7zip'], is_7z_file,)
if 'encrypted archive' in stdout:
is_encrypted = True
if p7zip.returncode:
print '!'*30
#raise Exception(p7zip.returncode)
# Recompute with new img_channels and overhead adjustment
est_size = img_width * img_height * img_channels * img_bits/8.0 + fo_size + 4 * 1024
# Identify PNG channels, oversized?
if est_size >= img_size * 0.996:
if is_encrypted or is_rar_file or is_7z_file:
raise Exception('found something strange')
if not notes:
print 'Skipping RGBA' if img_channels > 3 else 'Skipping overhead adjusted', '{:,} < {:,}'.format(img_size, est_size)
continue
else:
print 'Would skip, but note: %s'%(notes,)
# Test entropy with DEFLATE
if img_sha1+'.compressed' not in log_db:
os.system('gzip -9 < absurd.img > compressed.img')
log_db[img_sha1+'.compressed'] = str(os.path.getsize('compressed.img'))
os.remove('compressed.img')
cmpr_size = int(log_db[img_sha1+'.compressed'])
# Format specific for trimmers and optimizers
if img_sha1+'.optimized' not in log_db or img_sha1+'.trimmed' not in log_db:
if img_minor_mime=='jpeg':
_=subprocess.call('jpegtran -copy all absurd.img > trimmed.img', shell=True)
_=subprocess.call('jpegtran -copy all -optimize absurd.img > optimized.img', shell=True)
elif img_minor_mime=='png':
# trim
_=subprocess.call('pngout -force -ks -s4 -y absurd.img trimmed.img', shell=True)
if _: # PNGOUT error
print('PNGOUT exit code: %s'%(_,))
shutil.copy2('absurd.img', 'trimmed.img.png')
os.system('mv trimmed.img.png trimmed.img')
# optimize
_=subprocess.call('optipng -fix -force -quiet absurd.img -out optimized.img', shell=True)
_=subprocess.call('pngout -s0 -q -y optimized.img optimized.img.png', shell=True)
os.system('mv -f optimized.img.png optimized.img')
#elif img_minor_mime=='gif':
# subprocess.call('gifsicle absurd.img --output optimized.img')
else:
raise 'Optimization Unsupported for %s' % img_minor_mime
log_db[img_sha1+'.trimmed'] = str(os.path.getsize('trimmed.img'))
log_db[img_sha1+'.optimized'] = str(os.path.getsize('optimized.img'))
os.remove('trimmed.img')
os.remove('optimized.img')
trim_size = int(log_db[img_sha1+'.trimmed'])
opti_size = int(log_db[img_sha1+'.optimized'])
if detect:
res = detect('absurd.img')
if res:
notes.append(repr(res))
# Delete our work files
os.remove('absurd.img')
# Skip heuristics
if not notes:
# Trim is low, but image seems to be uncompressed
if cmpr_size + 0.10*img_size > opti_size > cmpr_size - 0.10*img_size and trim_size > img_size * 0.90:
print 'Skipping uncompressed image: opti {:4.2f}% within compr'.format(
opti_size/float(cmpr_size)*100.0,
)
continue
# Indication of "zero" padding at the end of the image
if opti_size < est_size and trim_size + 0.05*img_size > cmpr_size: # TODO improve allow [Zip: -20, Opti: -90%], disalllow [Zip -1%, opti-90%]
print "Skipping padded image"
continue
# Files under threshold
if img_size - min(est_size, trim_size, opti_size) < skip_ifunder:
print "Skipping {:,} - min({:,}, {:,}, {:,}) = {:,} under {:,}".format(
img_size, est_size, trim_size, opti_size,
img_size - min(est_size, trim_size, opti_size),
skip_ifunder,
)
continue
notes += [
str(metadata.get('Software', '')),
'%d KB metadata' % (fo_size/1024.0) if fo_size > img_size * 0.05 > 1024 else ''
# arhives
'PK Zip header' if magic['zip'] or magic['zips'] else '',
"'''7z''' (%s files)"%is_7z_file if is_7z_file != 0 else '',
"'''RAR''' (%s files)"%is_rar_file if is_rar_file != 0 else '',
"'''encrypted archive?'''" if is_encrypted else '',
]
notes.append(';'.join(k.upper() for k,v in magic.iteritems() if v and k in ('mp4', 'mp4-', 'gp5', 'asf', 'tar', 'jar')))
count += 1
write('\n'.join((
'|-',
'| align=left | [[:File:{}]]'.format(img_name.replace('_', ' ')),
'| '+time.strftime('%Y-%m-%d', time.strptime(img_timestamp, '%Y%m%d%H%M%S')),
'| align=right| {:,.0f}'.format(img_size // 1024.0),
'| '+reduction(est_size, img_size),
'| '+reduction(cmpr_size, img_size),
'| '+reduction(trim_size, img_size),
'| '+reduction(opti_size, img_size),
'| [http://imgops.com/{{filepath:%s}} ImgOps], [http://exif.regex.info/exif.cgi?url={{urlencode:https:{{filepath:%s}}|QUERY}} Exif], [//images.google.com/searchbyimage?site=search&image_url={{filepath:%s|%s}} Google]' % (
img_name, img_name, img_name, 120 if img_width<=300 else 300 if img_width<=800 else 800,
),
'| %s'%img_usage,
'| align=left | [[User:%(user_name)s|%(user_name)s]] ([[User talk:%(user_name)s|talk]]%(blocked)s%(lowcount)s)' % dict(
user_name=user_name,
blocked=", '''BLOCKED'''" if user_block else '',
lowcount=', %s edits'%user_editcount if user_editcount <=100 else '',
),
'| align=left | %s' % ', '.join(note for note in notes if note) if notes else '',
)))
write('|}')
f.close()
print 'Found %s images in %4.2f minutes' % (count, (time.time()-StartTime)/60.0,)
if __name__ == '__main__':
    main()
    # Save: re-read the finished report and publish (or just print) it.
    with open(output_file, 'r') as f:
        f.seek(3)  # skip the 3-byte UTF-8 BOM written by main()
        new_text = f.read().decode('utf-8')
    if lang=='commons':
        try:
            put_pywikibot(lang, 'User:Dispenser/Absurd overhead', new_text)
        except Exception as e:
            # Dump the table to stdout so the run's output isn't lost,
            # then re-raise for the traceback.
            print 'Error pasting new page: %s' % (e,)
            print new_text.encode('utf-8')
            raise
    else:
        # Non-Commons wikis: print the table only if it has actual rows.
        if len(new_text) > 200:
            print new_text.encode('utf-8')