User:Fæ/code/batchRussianplanes.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
# batchRussianplanes.py
# Customized batch upload for the Russian planes forum (russianplanes.net).
# Cropping is needed and must be done via a separate routine.
# 
# There is dead code here! You can have quick and dirty
# or clean and never. ;-)
#
# Date: Feb 2014
# Author: Fae, http://j.mp/faewm
# Permissions: CC-BY-SA
#
# Ethical statement (to be included in derivatives as part of the required attribution):
# This source code is published with the request that derivatives are used only to support the
# release of *freely reusable images and files*. It is well established that non-free licences,
# including Fair Use or Non-Commercial, damage open knowledge and educational value, because
# such files cannot be easily reused by the majority of the public who have an educational need.
# See:
# * http://wikimediafoundation.org/wiki/Vision
# * http://wikimediafoundation.org/wiki/Resolution:Licensing_policy
'''

import wikipedia, upload, sys, config, urllib2, urllib, re, string, time, catlib, pagegenerators, os.path, hashlib, pprint, subprocess
import webbrowser, itertools
from unidecode import unidecode
from BeautifulSoup import BeautifulSoup
from sys import argv
from sys import stdout
import collections
from time import sleep
from os import remove
from colorama import Fore, Back, Style
from colorama import init
init()
''' Terminal colours:
Fore: BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE, RESET.
Back: BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE, RESET.
Style: DIM, NORMAL, BRIGHT, RESET_ALL
'''
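# A minimal usage sketch (not part of the upload logic, hypothetical message text):
# colorama codes are simply concatenated around the text and reset afterwards, e.g.
#   print Fore.GREEN + Style.BRIGHT + "Upload OK" + Style.RESET_ALL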

# Colours only on mac
Red="\033[0;31m"     #Red
Green="\033[0;32m"   #Green
GreenB="\033[1;32m" #Green bold
GreenU="\033[4;32m" #Green underlined
Yellow="\033[0;33m"  #Yellow
Blue="\033[0;34m"    #Blue
Purple="\033[0;35m"  #Purpley
Cyan="\033[0;36m"    #Cyan
White="\033[0;37m"   #White

if len(sys.argv)>1:
  if sys.argv[-1]=="w":
    Red,White,Green,Cyan,Yellow='**','','','','' # Fix if in Windoze
  if sys.argv[1]=="help":
    print Yellow
    print '*'*60
    print """Optional parameters
#last <"w"> Switch off colours, to be windows console safe
"""+"*"*60,White
    sys.exit(0)

site = wikipedia.getSite('commons', 'commons')
#cat = catlib.Category(site,u'Category:'+category)
#gen = pagegenerators.CategorizedPageGenerator(cat,recurse=True)

subs=[['%20',' '],['%28','('],['%29',')'],['%2C',','],['%3A',':']]
def pquote(s):
  s=urllib.quote(s)
  for c in range(len(subs)):
    s=re.sub(subs[c][0],subs[c][1],s)
  return s

def gettag(tag,name):
  if tag=='class':
    try:
      r=string.split(html,'class="'+name+'"')[1]
      r=string.split(r,'>')[1]
      r=string.split(r,'<')[0]
    except:
      return ''
    return r
  if tag=='td':
    try:
      r=string.split(html,'<tr><td>'+name+'</td><td>')[1]
      r=string.split(r,'</td>')[0]
    except:
      return ''
    return r
  return ''
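# NOTE: this two-argument gettag(tag,name) is shadowed by the later one-argument
# gettag(h) definition below, and neither appears to be called in this script --
# part of the dead code mentioned in the header.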

def p(tag, val): # Print nicely
  if len(val)>1:
    print Cyan+tag+" : "+Yellow+val+White
  return
def pp(tag): # Print nicely
  val=eval(tag)
  if len(val)>0:
    print Cyan+tag+" : "+Yellow+val+White
  else:
    print Cyan+tag+Red+" Missing data"+White
  return

def trim(s):   # Trim leading and trailing spaces
 return re.sub('^[\s\r\t\f\n]*|[\s\r\t\f\n]*$','',s)
def trimmore(s): # Trim trailing or leading punctuation
  return re.sub('^[\.,;\-:\!]*|[\.,;\-:\!]*$','',s)

def up(filename, pagetitle, desc):
    url = filename
    keepFilename=True        #set to True to skip double-checking/editing destination filename
    verifyDescription=False    #set to False to skip double-checking/editing description => change to bot-mode
    targetSite = wikipedia.getSite('commons', 'commons')
    bot = upload.UploadRobot(url, description=desc, useFilename=pagetitle, keepFilename=keepFilename, verifyDescription=verifyDescription, targetSite = targetSite)
    bot.upload_image(debug=True)
      
def urltry(u):
 headers = { 'User-Agent' : 'Mozilla/5.0' } # Spoof header
 countErr=0
 x=''
 while x=='':
   try:
     req = urllib2.Request(u,None,headers)
     x = urllib2.urlopen(req)
     time.sleep(1)
   except:
     x=''
     countErr+=1
     if countErr>300: countErr=300 # 5 minutes between read attempts, eventually
     print Cyan,'** ERROR',countErr,'\n ** Failed to read from '+Yellow+u+Cyan+'\n ** Pause for '+str(countErr*1)+' seconds and try again ['+time.strftime("%H:%M:%S")+']',White
     time.sleep(1*countErr)
 return x

def htmltry(x,u):
  countErr=0
  r=True
  while r:
    try:
      return x.read()
    except:
      x=urltry(u)
      countErr+=1
      if countErr>200:
        p=300
      else:
        p=countErr*2
      print Cyan,'** ERROR',countErr,'\n ** Failed to read xml'
      if countErr==1:
        print Blue+'xml ='+str(x)
        print 'url ='+u+Cyan
      print ' ** Pause for '+str(p)+' seconds and try again'+White
      time.sleep(p)
    else:
      r=False
  return
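# Usage sketch for the retry pair above (hypothetical query URL): urltry() opens the
# resource with increasing back-off, htmltry() reads it and re-opens on failure, e.g.
#   u = "http://commons.wikimedia.org/w/api.php?action=query&format=xml"
#   x = urltry(u)
#   html = htmltry(x, u)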

def xmlfind(item):
  if data.find('<metadata name="'+item)>-1:
    return data.split('<metadata name="'+item+'" value="')[1].split('"')[0]
  else:
    return ''

def gettag(h):
  #<span class="link-11"><a href="/tag/thunderstorm" rel="tag" title="">Thunderstorm</a></span>
  t=re.split('href="/tag/[^"]*" rel="tag" title="">',h)
  for i in range(len(t)):
     t[i]=t[i].split('<')[0]
  t[0]=re.sub('^\s*|\s$','',t[0])
  if t[0]=='': t.pop(0)
  return ", ".join(t)

def getatt(t,h):
 if h.find('>'+t+':</th>')==-1: return ''
 r=re.split('<th scope="row"[^>]*>'+t+':</th>',h)[1].split('</td>')[0]
 r=re.sub('[\n\r\t\s]',' ',r)
 r=re.sub('\s+',' ',r)
 return re.sub('^\s*|<[^>]*>|\s*$','',r)

def getcat(h):
  # <div class="item-list"><ul><li><a href="/type/natural">Natural</a></li><li><a href="/type/characteristic">Characteristic</a></li></ul></div>
  if h.find('<div class="terms">Type</div>')==-1: return '[[Category:Pdsounds.org]]'
  r=h.split('<div class="terms">Type</div>')[1]
  r=r.split('class="item-list"><ul>')[1]
  r=r.split('</a></li></ul>')[0]
  r=r.split('</a></li>')
  c=''
  for i in range(len(r)):
    r[i]=re.sub('<[^>]*?>','',r[i])
  return ", ".join(r)

def dupcheck(ff): # Using the SHA1 checksum, find if the file is already uploaded to Commons
  df=urllib2.urlopen(ff)
  #df=open(ff,'rb')
  notread=True # Try to deal with socket.timeout
  while notread:
   notread=False
   try:
    sha1 = hashlib.sha1(df.read()).hexdigest()
   except:
    notread=True
    print Red+"Trouble getting SHA1 for file, trying again in 5 seconds."
    time.sleep(5)
  #print Cyan+'SHA1='+sha1
  u="http://commons.wikimedia.org/w/api.php?action=query&list=allimages&format=xml&ailimit=1&aiprop=sha1&aisha1="+sha1
  notread=True
  while notread:
    notread=False
    try:
      x=urllib2.urlopen(u)
    except:
      notread=True
      print Red+"Trouble reading",u
      time.sleep(5)
  xd=x.read()
  x.close()
  if xd.find("<img")>-1:
    t=xd.split('title="')[1].split('"')[0]
    return True,t
  return False,''
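# Usage sketch (hypothetical URL): dupcheck() downloads the remote file, hashes it
# and asks the Commons API whether that SHA1 already exists, e.g.
#   duplicate, duptitle = dupcheck("http://example.com/photo.jpg")
#   # duplicate is True and duptitle the existing "File:..." title if a match is found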

# Check if a file title is already used on commons
def nameused(name):
  u="http://commons.wikimedia.org/w/api.php?action=query&prop=imageinfo&format=xml&titles=File:"+urllib.quote(name)
  ux=urltry(u)
  x=htmltry(ux,u)
  if x.find('<imageinfo>')>-1:
    return True
  return False

def trimtags(s):
 return trim(re.sub('<[^>]*?>','',s))

def titlecase(s):
 s=re.sub('&#0?39;',"'",s)
 s=re.sub('&amp;','and',s)
 s=re.sub(':','-',s)
 words=s.split(" ")
 smallwords=['at','the','of','by','a','during','work','on','in','and']
 bigwords=['UK','US','USA','U\.S\.','H\.M\.S\.','HMS', 'RAF', 'R\.A\.F\.', 'YWCA', 'YMCA']
 for i in range(len(words)):
   staybig=False
   for j in bigwords:
    if re.search('^'+j+'[,\.;\(\)\-]?',words[i]):
      staybig=True
      continue
   if not staybig:
    words[i]=words[i][0:1]+words[i][1:].lower()
   else:
    continue
   if i==0:
     continue
   else:
     for j in smallwords:
       if words[i].lower()==j: words[i]=words[i].lower()
 return ' '.join(words)
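# Example of the intended behaviour (sketch): acronyms in bigwords keep their case
# and smallwords are lowered except at the start of the title, e.g.
#   titlecase("IMAGES OF THE RAF")  ->  "Images of the RAF"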

# *** Grab description ***
def flatten(l):
    for el in l:
        if isinstance(el, collections.Iterable) and not isinstance(el, basestring):
            for sub in flatten(el):
                yield sub
        else:
            yield el

def twodigits(d):
  if len(d)>1: return d
  return "0"+d

def getdesc(html):
  soup=BeautifulSoup(html)
  try:
    image=soup.findAll("img")[-1]
  except:
    return ''
  attrib=html.split("If you are going to publish, redistribute this image on the Internet place this link:")[1].split("</td>")[0]
  title=attrib.split("title=&quot;")[1].split("&quot")[0]
  site=attrib.split("href=&quot;")[1].split("&quot")[0]
  try:
    author=re.sub("\n","",urllib.quote(attrib.split("&gt; by ")[1].split("</em")[0], " ,").encode('ascii','ignore'))
  except:
    author="Public Domain Images"
  return '{{information\n| description = {{en|1=<br/>\n:''Image title: '+image['title']+'\n:Image from Public domain images website, '+site+"}}\n| source = "+image['src']+"\n| author = "+author+'\n| date = Not given\n*Transferred by [[User:{{subst:User:Fae/Fae}}|]] on {{subst:today}}\n| permission = This file is in public domain, not copyrighted, no rights reserved, free for any use. You can use this picture for any use including commercial purposes without the prior written permission and without fee or obligation.\n}}\n=={{int:license}}==\n{{PD-author|1='+author+'}}\n[[Category:Images uploaded by {{subst:User:Fae/Fae}}]]\n[[Category:Public-domain-image.com]]'+allcat, re.sub("\s{2,}"," ",image['title'].encode('ascii','ignore'))+'.jpg', image['src']

def catexists(cat): # Does this Commons category exist?
  urlpath="http://commons.wikimedia.org/w/api.php?action=query&prop=info&format=xml&titles=Category:"+urllib.quote(cat)
  url=urltry(urlpath)
  xml=htmltry(url,urlpath)
  if re.search('missing=""',xml):
    return False
  else:
    return True
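# Usage sketch (hypothetical category name): catexists() queries the Commons API and
# treats a missing="" attribute in the reply as "category does not exist", e.g.
#   catexists("Aircraft at Domodedovo International Airport")  ->  True or False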

plist=[]

def exists(url): # Does this Webpage exist?
  try:
    f = urllib2.urlopen(urllib2.Request(url))
    return True
  except:
    #print Red+"Could not find",url,White
    return False

def uptry(source,filename,desc):
  countErr=0
  r=True
  while r:
    try:
      up(source,filename,desc)
      return
    except:
      countErr+=1
      if countErr>200:
        p=300
      else:
        p=countErr*5
      print Cyan,'** ERROR Upload failed'
      print ' ** Pause for '+str(p)+' seconds and try again'+White
      time.sleep(p)
  return

def createcat(cat,txt):
  wikipedia.setAction("Create category")
  p=wikipedia.Page(site,"Category:"+cat)
  print Green,"Creating category",cat,White
  p.put(txt)
  return

plist=[]
def virinSearch(v):
  vcount=0
  countErr=0
  loop=True
  while loop:
    try:
      vgen = pagegenerators.SearchPageGenerator(v, namespaces = "6")
      for vPage in vgen:
        plist.append(vPage.title())
        vcount+=1
      loop=False
    except:
      loop=True
      countErr+=1
      print Red+"Problem running search, sleeping for",countErr,"seconds",Fore.WHITE
      time.sleep(countErr)
    if countErr>30:
      loop=False
      vcount=-1
  return vcount


def boxnotice(s,center=False,ctext=Fore.GREEN,cbox=Fore.YELLOW):
  sArr=s.split("\n")
  width=0
  for ss in sArr:
    if len(ss)>width-2: width=len(ss)+2
  #print os.name
  ascii=False
  if os.name=="nt":
    ascii=True
  if ascii:
    ulc,urc,lnc,blc,brc,llc="+","+","-","+","+","|"
  else:
    ulc,urc,lnc,blc,brc,llc=u"\u250F",u"\u2513",u"\u2501",u"\u2517",u"\u251B",u"\u2503"
  print cbox+ulc+lnc*width+urc  # http://en.wikipedia.org/wiki/Box-drawing_character
  for ss in sArr:
    if len(ss)<width-3 and center:
      ss=" "*int((width-2-len(ss))/2)+ss
    print cbox+llc+" "+ctext+ss+" "*(width-1-len(ss))+cbox+llc
  print cbox+blc+lnc*width+brc,Fore.WHITE


def ansearch(mysoup,mystring,href=False):
  if href:
    r=mysoup.find('a',{'href':re.compile(mystring)})['href']
  else:
    r=mysoup.find('a',{'href':re.compile(mystring)}).string
  if r is None or len(r)==0:
    return ""
  return r

def make_unique(lst):
    # Remove consecutive duplicate entries in place.
    if len(lst) <= 1:
        return lst
    last = lst[-1]
    for i in range(len(lst) - 2, -1, -1):
        item = lst[i]
        if item == last:
            del lst[i]
        else:
            last = item
    return lst

def rptag(tsoup, tag):
  a = tsoup.find('a', href=re.compile("http://russianplanes.net/"+tag+"/.*"))
  return { 'en':re.sub("_"," ", a['href'].split(tag+'/')[-1]), 'ru':a.contents[0] }

def rprint(dic):
  if dic=='': return ''
  #print Fore.CYAN,"{{en|1="+dic['en']+"}}"+Fore.GREEN+"{{ru|1="+dic['ru']+"}}", Fore.WHITE
  en=dic['en']
  ru=dic['ru']
  return "{{en|1="+en+"}}"+"{{ru|1="+ru+"}}"

'''
*** SCRAPE FILENAMES ***
'''

# Custom scripts for scraping this website

# Main loop

#
# 11,000 photos at http://www.flickr.com/photos/33950163@N03/ have an OTRS release but ARR on Flickr
#

skip=0
if len(argv)>1:
  skip=int(float(argv[1]))

photographer=1

if len(argv)>2:
  photographer=int(float(argv[2]))

photographers=[
['http://russianplanes.net/f!b!t!a!c!d!l20!g!m1!s0!u!r!k!v!h!i!reg!ser!n!p', u'PavelAdzhigildaev'],
#1
['http://russianplanes.net/f!b!t!a!c!d!l20!g!m34!s0!u!r!k!v!h!i!reg!ser!n!p', u'AlexBeltyukov'],
#2
['http://russianplanes.net/f!b!t!a!c!d!l20!g!m786!s0!u!r!k!v!h!i!reg!ser!n!p', u'IgorBubin'],
#3
['http://russianplanes.net/f!b!t!a!c!d!l20!g!m895!s0!u!r!k!v!h!i!reg!ser!n!p', u'IgorDvurekov'],
#4
['http://russianplanes.net/f!b!t!a!c!d!l20!g!m245!s0!u!r!k!v!h!i!reg!ser!n!p', u'VladimirGalkin'],
]

print Fore.GREEN+"*"*60
print argv[0]
print "skip =",skip
print "photographer =", photographer, photographers[photographer][1]
print "*"*60, Fore.WHITE

rawlist=wikipedia.Page(site, "Commons:Batch uploading/Airliners/Airportlist").get()
airportslist=rawlist.split("<pre>")[1].split("# NOT AIRPORTS")[0]
airports=[line.split("; ") for line in airportslist.split("\n") if len(line)>4 and line[:1]!="#"]
#notairportslist = rawlist.split("<pre>")[1].split("# NOT AIRPORTS")[1]
#notairports=[line.split("; ") for line in notairportslist.split("\n") if len(line)>4 and line[:1]!="#"]

def ICAO(code):
  if code=='': return ''
  for a in airports:
    for r in a:
      if re.search(" ?"+code, r):
        return a[0]
  return ''
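# Usage sketch (hypothetical list entry): if airports contained
#   ["Domodedovo International Airport", "UUDD/DME"]
# then ICAO("UUDD") would return "Domodedovo International Airport", the name
# used to build the "Aircraft at ..." category further down.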

count=0
for page in range(0,260):
  if count+20<skip:
    count+=20
    continue
  # Get gallery page
  url=photographers[photographer][0]+str(page+1)
  uri=urltry(url)
  html=htmltry(uri, url)
  soup=BeautifulSoup(html)
  photoids=soup.findAll('tr', {'class':'photoheader'})
  for photoid in photoids:
    count+=1
    if count<=skip : continue
    pid = photoid.find('b').contents[0]
    idtable = photoid
    detailtable = photoid.nextSibling
    detailtable.find('div', id=re.compile("thanked.*")).extract()
    try:
      actype = rptag(photoid, 'kb')
    except:
      actype = {'en':'', 'ru':''} # avoid NameError when the 'kb' tag is missing
    pg = rptag(detailtable, 'photer')
    try:
      actypebottom = rptag(photoid, 'type')
    except:
      actypebottom = {'en':'', 'ru':''} # avoid NameError when the 'type' tag is missing
    actypebottom['en'] = re.sub(actype['en']+'/', '', actypebottom['en'])
    source = re.sub("-200.jpg", ".jpg", detailtable.find('img').extract()['src'])
    try:
      sern = rptag(detailtable, 'sern')['en']
    except:
      sern = ''
    city = rptag(detailtable, 'city')
    try:
      citycode = detailtable.find('a', href=re.compile("http://russianplanes.net/city/.*")).i.contents[0]
    except:
      citycode = ''
    if re.search("/[A-Z]", citycode):
      airportcode = citycode.split("/")[1][:4]
    else:
      airportcode = ''
    country = rptag(detailtable, 'country')
    try:
      airline = rptag(detailtable, 'al')
    except:
      airline = ''
    try:
      latlon = detailtable.find('a', href=re.compile("http://maps.google.ru/.*")).extract()['href'].split('ll=')[1].split('&')[0].split(',')
    except:
      latlon = ''
    datet=detailtable.find('a', href=re.compile("http://russianplanes.net/search.*"))
    date={'en':datet['href'].split('=')[-1], 'ru':datet.contents[0]}
    filename=actype['ru']+" "+actypebottom['ru']+" "+sern+u", "+city['ru']+u" RP"+pid+".jpg"
    for s in [[" {2,}",''], ['/','-'], [' , ', ', ']]:
      filename=re.sub(s[0], s[1], filename)
    if not re.search(" RP", filename):
      print Fore.RED+filename, Fore.WHITE
      filename=re.sub("RP"+pid," RP"+pid, filename)
      print filename
    print "*", count, Fore.GREEN, pid
    print "*", filename
    #print Fore.YELLOW, pg['en'], Fore.CYAN+ pg['ru']
    #print Fore.YELLOW, source
    #print Fore.YELLOW, sern, Fore.WHITE
    d="{{watermark}}\n=={{int:filedesc}}==\n{{Infobox aircraft image"
    d+="\n|description="+""
    d+="\n|aircraft={{en|1="+actype['en']+" "+actypebottom['en']+"}}\n{{ru|1=" +actype['ru']+" "+actypebottom['ru']+"}}"
    d+="\n|aircraftid="+sern
    d+="\n|aircraftop="+rprint(airline)
    d+="\n|aircraftact="
    d+="\n|imagetype=Photograph"
    d+="\n|imageloc={{en|1="+ city['en'] +", "+ country['en']
    if citycode!='':
     d+=" "+citycode
    d+="}}"
    d+=re.sub(" , ", ", ", "\n{{ru|1="+ city['ru'] +", "+ country['ru'] +" "+citycode+"}}")
    d+="\n|imagedate={{en|1={{ISOdate|"+date['en']+"}}}}\n{{ru|1="+date['ru']+"}}"
    d+="\n|imageauthor="+rprint(pg)
    sourcegp=url[:5]+urllib.quote(url[5:],"?=/")
    sourcep=source[:5]+urllib.quote(source[5:])
    d+="\n|imagesource="+"\n*Photo "+sourcep+"\n*Gallery page "+sourcegp
    d+="\n|permission=\n|other_versions="
    if airportcode!='':
      d+="\n|other_fields={{Information field|name=ICAO Airport Code|value="+airportcode+"}}"
    d+="\n}}"
    if latlon!='':
      d+="\n{{location dec|1="+latlon[0]+"|2="+latlon[1]+"}}"
    d+="\n\n=={{int:license-header}}=="
    d+="\n{{"+photographers[photographer][1]+"}}\n"
    airportcat = ICAO(airportcode)
    if airportcat!='':
      d+="\n[[Category:Aircraft at "+ airportcat +"]]"
    cat=unidecode(sern)+" (aircraft)" # Potential aircraft category
    if len(sern)>2 and catexists(cat):
      d+="\n[[Category:"+cat+"]]"
    d+="\n[[Category:Images uploaded by {{subst:User:Fae/Fae}}]]\n[[Category:Russianplanes.net photos (check needed)]]\n[[Category:Russianplanes.net photos (credit bar)]]"
    d+="\n{{WMUK equipment|Faebot Macmini|year={{subst:CURRENTYEAR}} }}"
    # Quick check of filename in use - these should be unique
    if nameused(filename.encode('utf-8')):
      print Red+' Filename found', White
      continue
    # Quick check of uniqueness of database number on Commons
    plist=[]
    ANmatch='"RP"+"'+pid+'"'
    vc=virinSearch(ANmatch)
    if vc==-1:
      print Yellow+"Error generating search matches for '"+ANmatch+"', problem, this should not happen.",White
      continue
    if vc>0 and len(pid)>4:
      print Red+"-"*75
      print Red+"File appears to be in use already for",ANmatch+" ("+str(vc)+" matches). Check "+Blue+"http://commons.wikimedia.org/w/index.php?search="+urllib.quote(ANmatch)
      print Red+"-"*75,White
      continue
    # Check if image is a duplicate on Commons
    try:
      duplicate,duptitle = dupcheck(source)
    except:
      print Red+"Problem when running the duplicate check for",source,White
      time.sleep(10)
      try:
        duplicate,duptitle = dupcheck(source)
      except:
        print Red+"Failed on second try, giving up and skipping"
        continue
    if duplicate:
      print Red+'File is already on Commons as',duptitle,White
      time.sleep(2)
      continue
    uptry(source,filename,d)
    #print Fore.YELLOW,d, Fore.WHITE
    # Nice pause for human oversight
    lag=1
    for i in range(lag):
      stdout.write("\r%d " % (lag-i))
      stdout.flush()
      sleep(1)
    stdout.write("\r  ")
    stdout.flush()
    stdout.write("\r")