# Saved from the wiki page "User:Fæ/code/batchRussianplanes.py".
# The original "Jump to navigation" / "Jump to search" lines were wiki page
# chrome, commented out here so the file parses as Python.
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
# batchRussianplanes.py
# Customized batch upload for the Russian planes forum.
# Cropping is needed and must be via a separate routine.
#
# There is dead code here! You can have quick and dirty
# or clean and never. ;-)
#
# Date: Feb 2014
# Author: Fae, http://j.mp/faewm
# Permissions: CC-BY-SA
#
# Ethical statement (to be included in derivatives as part of the required attribution):
# This source code is published with the request that derivatives be used only to support the
# release of *freely reusable images and files*. It is well established that non-free licences,
# including Fair Use or Non-Commercial, damage open knowledge and educational value, due to
# files not being easily used by the majority of the public who have an educational need.
# See:
# * http://wikimediafoundation.org/wiki/Vision
# * http://wikimediafoundation.org/wiki/Resolution:Licensing_policy
'''
import wikipedia, upload, sys, config, urllib2, urllib, re, string, time, catlib, pagegenerators, os.path, hashlib, pprint, subprocess
import webbrowser, itertools
from unidecode import unidecode
from BeautifulSoup import BeautifulSoup
from sys import argv
from sys import stdout
import collections
from time import sleep
from os import remove
from colorama import Fore, Back, Style
from colorama import init
init()
''' Terminal colours:
Fore: BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE, RESET.
Back: BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE, RESET.
Style: DIM, NORMAL, BRIGHT, RESET_ALL
'''
# Colours only on mac
# Raw ANSI escape codes, used alongside colorama's Fore/Style constants.
Red="\033[0;31m" #Red
Green="\033[0;32m" #Green
GreenB="\033[1;32m" #Green bold
GreenU="\033[4;32m" #Green underlined
Yellow="\033[0;33m" #Yellow
Blue="\033[0;34m" #Blue
Purple="\033[0;35m" #Purpley
Cyan="\033[0;36m" #Cyan
White="\033[0;37m" #White
# Command-line handling: a trailing "w" argument blanks the colour codes for
# the Windows console; "help" as the first argument prints usage and exits.
if len(sys.argv)>1:
    if sys.argv[-1]=="w":
        Red,White,Green,Cyan,Yellow='**','','','','' # Fix if in Windoze
    if sys.argv[1]=="help":
        print Yellow
        print '*'*60
        print """Optional parameters
#last <"w"> Switch off colours, to be windows console safe
"""+"*"*60,White
        sys.exit(0)
# Target wiki for all page reads/writes and uploads.
site = wikipedia.getSite('commons', 'commons')
#cat = catlib.Category(site,u'Category:'+category)
#gen = pagegenerators.CategorizedPageGenerator(cat,recurse=True)
# Percent-escapes that pquote() converts back to readable characters.
subs=[['%20',' '],['%28','('],['%29',')'],['%2C',','],['%3A',':']]
def pquote(s):
    """Percent-encode s, then restore the escapes listed in the module-level
    'subs' table so the result stays human-readable."""
    s = urllib.quote(s)
    for pattern, readable in subs:
        s = re.sub(pattern, readable, s)
    return s
def gettag(tag,name):
    """Scrape a value out of the module-global 'html'.

    tag=='class': text of the first element carrying class="name".
    tag=='td'   : cell following a <tr><td>name</td> label.
    Returns '' when nothing matches.
    NOTE(review): dead code — shadowed by the one-argument gettag() defined
    later in this file.
    """
    if tag=='class':
        try:
            chunk = html.split('class="'+name+'"')[1]
            return chunk.split('>')[1].split('<')[0]
        except:
            return ''
    if tag=='td':
        try:
            return html.split('<tr><td>'+name+'</td><td>')[1].split('</td>')[0]
        except:
            return ''
    return ''
def p(tag, val): # Print nicely
if len(val)>1:
print Cyan+tag+" : "+Yellow+val+White
return
def pp(tag): # Print nicely
val=eval(tag)
if len(val)>0:
print Cyan+tag+" : "+Yellow+val+White
else:
print Cyan+tag+Red+" Missing data"+White
return
def trim(s): # Trim leading and trailing spaces
    """Strip leading and trailing whitespace from s.

    Replaces the original hand-rolled regex ('^[\s\r\t\f\n]*|[\s\r\t\f\n]*$')
    with the equivalent built-in str.strip().
    """
    return s.strip()
def trimmore(s): # Trim trailing or leading punctuation
    """Strip leading and trailing punctuation characters (. , ; - : !)."""
    punct_edges = '^[\.,;\-:\!]*|[\.,;\-:\!]*$'
    return re.sub(punct_edges, '', s)
def up(filename, pagetitle, desc):
    """Upload one file (URL or path) to Commons as 'pagetitle' with wikitext
    'desc', fully unattended: no filename or description prompts (bot mode)."""
    target = wikipedia.getSite('commons', 'commons')
    robot = upload.UploadRobot(
        filename,
        description=desc,
        useFilename=pagetitle,
        keepFilename=True,        # skip double-checking/editing destination filename
        verifyDescription=False,  # skip double-checking/editing description
        targetSite=target)
    robot.upload_image(debug=True)
def urltry(u):
    """Open URL u with a spoofed browser User-Agent, retrying forever on error.

    The pause between attempts grows by 1 second per failure, capped at 300s
    (5 minutes). Returns the urllib2 response object on success.
    """
    headers = { 'User-Agent' : 'Mozilla/5.0' } # Spoof header
    countErr=0
    x=''
    while x=='':
        try:
            req = urllib2.Request(u,None,headers)
            x = urllib2.urlopen(req)
            time.sleep(1)  # polite pause after every successful open
        except:
            x=''
            countErr+=1
            if countErr>300: countErr=300 # 5 minutes between read attempts, eventually
            print Cyan,'** ERROR',countErr,'\n ** Failed to read from '+Yellow+u+Cyan+'\n ** Pause for '+str(countErr*1)+' seconds and try again ['+time.strftime("%H:%M:%S")+']',White
            time.sleep(1*countErr)
    return x
def htmltry(x,u):
    """Return x.read(), re-fetching u via urltry() and retrying on any error.

    Back-off is 2 seconds per failure, capped at 300. Note: the try/else
    branch and the trailing bare 'return' are unreachable, because a clean
    x.read() returns straight out of the loop.
    """
    countErr=0
    r=True
    while r:
        try:
            return x.read()
        except:
            x=urltry(u)  # re-open the URL and try reading again
            countErr+=1
            if countErr>200:
                p=300
            else:
                p=countErr*2
            print Cyan,'** ERROR',countErr,'\n ** Failed to read xml'
            if countErr==1:
                print Blue+'xml ='+str(x)
                print 'url ='+u+Cyan
            print ' ** Pause for '+str(p)+' seconds and try again'+White
            time.sleep(p)
        else:
            r=False  # unreachable: the try block always returns on success
    return
def xmlfind(item):
    """Return the value="..." of the named <metadata> element in the global
    'data', or '' when absent.

    NOTE(review): dead code — no 'data' global is defined anywhere in this
    script, so calling this would raise NameError.
    """
    marker = '<metadata name="' + item
    if data.find(marker) == -1:
        return ''
    return data.split(marker + '" value="')[1].split('"')[0]
def gettag(h):
    """Return a comma-joined list of tag names scraped from
    <a href="/tag/..." rel="tag" title=""> links in h.

    (Shadows the two-argument gettag() defined earlier in this file.)
    """
    #<span class="link-11"><a href="/tag/thunderstorm" rel="tag" title="">Thunderstorm</a></span>
    pieces = re.split('href="/tag/[^"]*" rel="tag" title="">', h)
    names = [piece.split('<')[0] for piece in pieces]
    names[0] = re.sub('^\s*|\s$', '', names[0])
    if names[0] == '':
        names.pop(0)  # drop the pre-first-link fragment when it was only markup
    return ", ".join(names)
def getatt(t,h):
    """Return the table cell following the row header "t:" in h, with HTML
    tags stripped and whitespace collapsed; '' when the header is absent."""
    if '>'+t+':</th>' not in h:
        return ''
    cell = re.split('<th scope="row"[^>]*>'+t+':</th>', h)[1].split('</td>')[0]
    cell = re.sub('\s+', ' ', re.sub('[\n\r\t\s]', ' ', cell))
    return re.sub('^\s*|<[^>]*>|\s*$', '', cell)
def getcat(h):
    """Return the comma-joined "Type" terms of a pdsounds item page, or the
    default Pdsounds category string when the page has no Type block."""
    # <div class="item-list"><ul><li><a href="/type/natural">Natural</a></li><li><a href="/type/characteristic">Characteristic</a></li></ul></div>
    if '<div class="terms">Type</div>' not in h:
        return '[[Category:Pdsounds.org]]'
    block = h.split('<div class="terms">Type</div>')[1]
    block = block.split('class="item-list"><ul>')[1]
    block = block.split('</a></li></ul>')[0]
    items = block.split('</a></li>')
    return ", ".join(re.sub('<[^>]*?>', '', item) for item in items)
def dupcheck(ff): # Using the SHA1 checksum, find if the file is already uploaded to Commons
    """Return (True, existing_title) when the file at URL ff already exists on
    Commons, else (False, '').

    Downloads ff, computes its SHA-1, then queries the Commons allimages API
    by checksum. Both the download-read and the API read retry every 5 seconds
    until they succeed.
    """
    df=urllib2.urlopen(ff)
    #df=open(ff,'rb')
    notread=True # Try to deal with socket.timeout
    while notread:
        notread=False
        try:
            sha1 = hashlib.sha1(df.read()).hexdigest()
        except:
            notread=True
            print Red+"Trouble getting SHA1 for file, trying again in 5 seconds."
            time.sleep(5)
    #print Cyan+'SHA1='+sha1
    u="http://commons.wikimedia.org/w/api.php?action=query&list=allimages&format=xml&ailimit=1&aiprop=sha1&aisha1="+sha1
    notread=True
    while notread:
        notread=False
        try:
            x=urllib2.urlopen(u)
        except:
            notread=True
            print Red+"Trouble reading",u
            time.sleep(5)
    xd=x.read()
    x.close()
    # An <img .../> element in the reply means a file with this SHA-1 exists.
    if xd.find("<img")>-1:
        t=xd.split('title="')[1].split('"')[0]
        return True,t
    return False,''
# Check if a file title is already used on commons
def nameused(name):
    """True when File:name already exists on Commons (the imageinfo API query
    returns an <imageinfo> element)."""
    u="http://commons.wikimedia.org/w/api.php?action=query&prop=imageinfo&format=xml&titles=File:"+urllib.quote(name)
    response = urltry(u)
    xml = htmltry(response, u)
    return xml.find('<imageinfo>') > -1
def trimtags(s):
    """Remove all HTML tags from s, then trim surrounding whitespace."""
    untagged = re.sub('<[^>]*?>', '', s)
    return trim(untagged)
def titlecase(s):
    """Normalise a scraped title for use as a Commons filename.

    Decodes &#39; to an apostrophe, replaces '&' with 'and' and ':' with '-',
    then lower-cases the tail of each word (first letter kept as-is), leaving
    known acronyms untouched and fully lower-casing small linking words
    except in first position.

    Bug fix: the entity pattern was mojibake ('�?39;'); restored to '&#39;'
    and applied before the '&' substitution so apostrophes survive.
    """
    s = re.sub('&#39;', "'", s)
    s = re.sub('&', 'and', s)
    s = re.sub(':', '-', s)
    words = s.split(" ")
    smallwords = ['at', 'the', 'of', 'by', 'a', 'during', 'work', 'on', 'in', 'and']
    bigwords = ['UK', 'US', 'USA', 'U\.S\.', 'H\.M\.S\.', 'HMS', 'RAF', 'R\.A\.F\.', 'YWCA', 'YMCA']
    for i in range(len(words)):
        # Acronyms (optionally followed by punctuation) are left untouched.
        if any(re.search('^' + j + '[,\.;\(\)\-]?', words[i]) for j in bigwords):
            continue
        words[i] = words[i][0:1] + words[i][1:].lower()
        if i == 0:
            continue  # first word never gets small-word lowering
        if words[i].lower() in smallwords:
            words[i] = words[i].lower()
    return ' '.join(words)
# *** Grab description ***
def flatten(l):
    """Recursively yield the leaf elements of an arbitrarily nested iterable.

    Strings are treated as leaves, not iterables. Python 2 only: relies on
    'basestring' and 'collections.Iterable' (moved to collections.abc and
    removed from collections in Python 3.10).
    """
    for el in l:
        if isinstance(el, collections.Iterable) and not isinstance(el, basestring):
            for sub in flatten(el):
                yield sub
        else:
            yield el
def twodigits(d):
    """Left-pad a single-character numeric string with '0' to two characters;
    longer strings are returned unchanged."""
    return d if len(d) > 1 else "0" + d
def getdesc(html):
    """Build an upload description for a public-domain-image.com page.

    Returns (description_wikitext, filename, image_url), or '' when the page
    has no usable <img>. NOTE(review): dead code from an earlier batch — it
    references a module global 'allcat' that is not defined in this script.

    Bug fix: the two attribution splits below had their quote characters
    corrupted by &quot; entity mangling ('title=""' / 'href=""'); restored to
    the intended 'title="' / '"' delimiters.
    """
    soup=BeautifulSoup(html)
    try:
        image=soup.findAll("img")[-1]  # the last image on the page is the photo
    except:
        return ''
    attrib=html.split("If you are going to publish, redistribute this image on the Internet place this link:")[1].split("</td>")[0]
    title=attrib.split('title="')[1].split('"')[0]  # NOTE(review): computed but unused; image['title'] is used instead
    site=attrib.split('href="')[1].split('"')[0]
    try:
        author=re.sub("\n","",urllib.quote(attrib.split("> by ")[1].split("</em")[0], " ,").encode('ascii','ignore'))
    except:
        author="Public Domain Images"
    return '{{information\n| description = {{en|1=<br/>\n:''Image title: '+image['title']+'\n:Image from Public domain images website, '+site+"}}\n| source = "+image['src']+"\n| author = "+author+'\n| date = Not given\n*Transferred by [[User:{{subst:User:Fae/Fae}}|]] on {{subst:today}}\n| permission = This file is in public domain, not copyrighted, no rights reserved, free for any use. You can use this picture for any use including commercial purposes without the prior written permission and without fee or obligation.\n}}\n=={{int:license}}==\n{{PD-author|1='+author+'}}\n[[Category:Images uploaded by {{subst:User:Fae/Fae}}]]\n[[Category:Public-domain-image.com]]'+allcat, re.sub("\s{2,}"," ",image['title'].encode('ascii','ignore'))+'.jpg', image['src']
def catexists(cat): # Does this Commons category exist?
    """True when Category:cat exists on Commons, i.e. the API info query does
    not flag the title with missing=""."""
    urlpath="http://commons.wikimedia.org/w/api.php?action=query&prop=info&format=xml&titles=Category:"+urllib.quote(cat)
    xml = htmltry(urltry(urlpath), urlpath)
    return not re.search('missing=""', xml)
plist=[]  # shared accumulator, re-initialised again before virinSearch() below
def exists(url): # Does this Webpage exist?
    """True when url can be opened; any failure (404, DNS, timeout) counts as
    the page not existing."""
    try:
        f = urllib2.urlopen(urllib2.Request(url))
        return True
    except:
        #print Red+"Could not find",url,White
        return False
def uptry(source,filename,desc):
countErr=0
r=True
while r:
try:
up(source,filename,desc)
return
except:
countErr+=1
if countErr>200:
p=300
else:
p=countErr*5
print Cyan,'** ERROR Upload failed'
print ' ** Pause for '+str(p)+' seconds and try again'+White
time.sleep(p)
return
def createcat(cat,txt):
wikipedia.setAction("Create category")
p=wikipedia.Page(site,"Category:"+cat)
print Green,"Creating category",cat,White
p.put(txt)
return
plist=[]  # file titles matched by the most recent virinSearch() call
def virinSearch(v):
    """Search the Commons File: namespace (6) for string v, appending every
    matching title to the global 'plist'.

    Returns the number of matches, or -1 after more than 30 failed attempts.
    Waits countErr seconds between retries.
    """
    vcount=0
    countErr=0
    loop=True
    while loop:
        try:
            vgen = pagegenerators.SearchPageGenerator(v, namespaces = "6")
            for vPage in vgen:
                plist.append(vPage.title())
                vcount+=1
            loop=False
        except:
            loop=True
            countErr+=1
            print Red+"Problem running search, sleeping for",countErr,"seconds",Fore.WHITE
            time.sleep(countErr)
            if countErr>30:
                loop=False
                vcount=-1  # sentinel: search kept failing, caller must skip
    return vcount
def boxnotice(s,center=False,ctext=Fore.GREEN,cbox=Fore.YELLOW):
    """Print the multi-line string s inside a drawn box.

    Uses Unicode box-drawing characters, falling back to plain ASCII (+, -, |)
    on Windows consoles. center=True roughly centres each short line; ctext
    and cbox set the text and border colours.
    """
    sArr=s.split("\n")
    # Box width grows to fit the longest line plus 2 characters of padding.
    width=0
    for ss in sArr:
        if len(ss)>width-2: width=len(ss)+2
    #print os.name
    ascii=False
    if os.name=="nt":
        ascii=True
    if ascii:
        ulc,urc,lnc,blc,brc,llc="+","+","-","+","+","|"
    else:
        ulc,urc,lnc,blc,brc,llc=u"\u250F",u"\u2513",u"\u2501",u"\u2517",u"\u251B",u"\u2503"
    print cbox+ulc+lnc*width+urc # http://en.wikipedia.org/wiki/Box-drawing_character
    for ss in sArr:
        if len(ss)<width-3 and center:
            ss=" "*int((width-2-len(ss))/2)+ss
        print cbox+llc+" "+ctext+ss+" "*(width-1-len(ss))+cbox+llc
    print cbox+blc+lnc*width+brc,Fore.WHITE
def ansearch(mysoup,mystring,href=False):
    """Return the href (href=True) or the link text of the first <a> element
    in mysoup whose href matches the regex mystring; '' when the value is
    empty or None.

    Bug fix: the guard was 'len(r)==0 or r is None', which raised TypeError
    on len(None) before the None test could run; the None check now comes
    first.
    """
    anchor = mysoup.find('a', {'href': re.compile(mystring)})
    if href:
        r = anchor['href']
    else:
        r = anchor.string  # may be None when the tag has mixed content
    if r is None or len(r) == 0:
        return ""
    return r
def make_unique(lst):
    """Collapse runs of consecutive duplicate items in lst, in place, and
    return lst.

    Bug fix: the original returned lst only for len(lst) <= 1 and implicitly
    returned None otherwise; it now returns the list in every case (backward
    compatible — callers ignoring the return value are unaffected).
    """
    if len(lst) <= 1:
        return lst
    last = lst[-1]
    # Walk backwards so deletions do not disturb the indices still to visit.
    for i in range(len(lst) - 2, -1, -1):
        if lst[i] == last:
            del lst[i]
        else:
            last = lst[i]
    return lst
def rptag(tsoup, tag):
    """Return {'en': ..., 'ru': ...} scraped from the first
    russianplanes.net/<tag>/... link in tsoup: the URL slug (underscores
    replaced by spaces) as English, the link text as Russian."""
    link = tsoup.find('a', href=re.compile("http://russianplanes.net/"+tag+"/.*"))
    slug = link['href'].split(tag+'/')[-1]
    return {'en': re.sub("_", " ", slug), 'ru': link.contents[0]}
def rprint(dic):
    """Render an en/ru dict as adjacent {{en}}/{{ru}} templates; an empty
    string passes straight through."""
    if dic == '':
        return ''
    return "{{en|1=%s}}{{ru|1=%s}}" % (dic['en'], dic['ru'])
'''
*** SCRAPE FILENAMES ***
'''
# Custom scripts for scraping this website
# Main loop
#
# 11,000 photos at http://www.flickr.com/photos/33950163@N03/ have an OTRS release but ARR on Flickr
#
# skip: number of already-processed photos to jump over (first CLI argument).
skip=0
if len(argv)>1:
    skip=int(float(argv[1]))
# photographer: index into the photographers table below (second CLI argument).
photographer=1
if len(argv)>2:
    photographer=int(float(argv[2]))
# Each entry: [gallery search-URL prefix (page number is appended),
#              Commons credit-template name for that photographer].
photographers=[
    ['http://russianplanes.net/f!b!t!a!c!d!l20!g!m1!s0!u!r!k!v!h!i!reg!ser!n!p', u'PavelAdzhigildaev'],
    #1
    ['http://russianplanes.net/f!b!t!a!c!d!l20!g!m34!s0!u!r!k!v!h!i!reg!ser!n!p', u'AlexBeltyukov'],
    #2
    ['http://russianplanes.net/f!b!t!a!c!d!l20!g!m786!s0!u!r!k!v!h!i!reg!ser!n!p', u'IgorBubin'],
    #3
    ['http://russianplanes.net/f!b!t!a!c!d!l20!g!m895!s0!u!r!k!v!h!i!reg!ser!n!p', u'IgorDvurekov'],
    #4
    ['http://russianplanes.net/f!b!t!a!c!d!l20!g!m245!s0!u!r!k!v!h!i!reg!ser!n!p', u'VladimirGalkin'],
]
# Startup banner: show which photographer/offset this run will process.
print Fore.GREEN+"*"*60
print argv[0]
print "skip =",skip
print "photographer =", photographer, photographers[photographer][1]
print "*"*60, Fore.WHITE
# Airport lookup table maintained on-wiki: "; "-separated fields per line,
# first field being the Commons airport category name used by ICAO() below.
rawlist=wikipedia.Page(site, "Commons:Batch uploading/Airliners/Airportlist").get()
airportslist=rawlist.split("<pre>")[1].split("# NOT AIRPORTS")[0]
airports=[line.split("; ") for line in airportslist.split("\n") if len(line)>4 and line[:1]!="#"]
#notairportslist = rawlist.split("<pre>")[1].split("# NOT AIRPORTS")[1]
#notairports=[line.split("; ") for line in notairportslist.split("\n") if len(line)>4 and line[:1]!="#"]
def ICAO(code):
    """Map an airport code to its Commons airport-category name via the global
    'airports' table (first field of the matching row); '' when the code is
    empty or unknown."""
    if code == '':
        return ''
    for row in airports:
        if any(re.search(" ?"+code, field) for field in row):
            return row[0]
    return ''
count=0
# Main loop: walk the photographer's gallery pages (20 photos per page),
# scrape each photo's details, build the Commons description, run duplicate
# checks and upload.
for page in range(0,260):
    if count+20<skip:
        count+=20  # fast-forward a whole gallery page of 20 photos
        continue
    # Get gallery page
    url=photographers[photographer][0]+str(page+1)
    uri=urltry(url)
    html=htmltry(uri, url)
    soup=BeautifulSoup(html)
    photoids=soup.findAll('tr', {'class':'photoheader'})
    for photoid in photoids:
        count+=1
        if count<=skip : continue
        pid = photoid.find('b').contents[0]  # site's numeric photo id
        idtable = photoid
        detailtable = photoid.nextSibling
        # Remove the "thanked" widget so it cannot pollute later scrapes.
        detailtable.find('div', id=re.compile("thanked.*")).extract()
        try:
            actype = rptag(photoid, 'kb')
        except:
            # NOTE(review): if rptag fails on the very first photo, 'actype'
            # is undefined here and this raises NameError; on later photos it
            # silently reuses the previous photo's dict. TODO confirm intent.
            actype['en'], actype['ru'] = '',''
        pg = rptag(detailtable, 'photer')  # photographer name (en/ru)
        try:
            actypebottom = rptag(photoid, 'type')
        except:
            # NOTE(review): same stale-dict concern as 'actype' above.
            actypebottom['en'], actypebottom['ru'] = '', ''
        actypebottom['en'] = re.sub(actype['en']+'/', '', actypebottom['en'])
        # Thumbnail URL minus its "-200" suffix gives the full-size image.
        source = re.sub("-200.jpg", ".jpg", detailtable.find('img').extract()['src'])
        try:
            sern = rptag(detailtable, 'sern')['en']
        except:
            sern = ''
        city = rptag(detailtable, 'city')
        try:
            citycode = detailtable.find('a', href=re.compile("http://russianplanes.net/city/.*")).i.contents[0]
        except:
            citycode = ''
        if re.search("/[A-Z]", citycode):
            airportcode = citycode.split("/")[1][:4]  # ICAO code after the slash
        else:
            airportcode = ''
        country = rptag(detailtable, 'country')
        try:
            airline = rptag(detailtable, 'al')
        except:
            airline = ''
        try:
            latlon = detailtable.find('a', href=re.compile("http://maps.google.ru/.*")).extract()['href'].split('ll=')[1].split('&')[0].split(',')
        except:
            latlon = ''
        datet=detailtable.find('a', href=re.compile("http://russianplanes.net/search.*"))
        date={'en':datet['href'].split('=')[-1], 'ru':datet.contents[0]}
        # Destination filename (Russian text), keyed with the site id "RP<pid>".
        filename=actype['ru']+" "+actypebottom['ru']+" "+sern+u", "+city['ru']+u" RP"+pid+".jpg"
        for s in [[" {2,}",''], ['/','-'], [' , ', ', ']]:
            filename=re.sub(s[0], s[1], filename)
        if not re.search(" RP", filename):
            # The slash-substitution above can eat the space before "RP"; restore it.
            print Fore.RED+filename, Fore.WHITE
            filename=re.sub("RP"+pid," RP"+pid, filename)
            print filename
        print "*", count, Fore.GREEN, pid
        print "*", filename
        #print Fore.YELLOW, pg['en'], Fore.CYAN+ pg['ru']
        #print Fore.YELLOW, source
        #print Fore.YELLOW, sern, Fore.WHITE
        # Build the {{Infobox aircraft image}} description wikitext.
        d="{{watermark}}\n=={{int:filedesc}}==\n{{Infobox aircraft image"
        d+="\n|description="+""
        d+="\n|aircraft={{en|1="+actype['en']+" "+actypebottom['en']+"}}\n{{ru|1=" +actype['ru']+" "+actypebottom['ru']+"}}"
        d+="\n|aircraftid="+sern
        d+="\n|aircraftop="+rprint(airline)
        d+="\n|aircraftact="
        d+="\n|imagetype=Photograph"
        d+="\n|imageloc={{en|1="+ city['en'] +", "+ country['en']
        if citycode!='':
            d+=" "+citycode
        d+="}}"
        d+=re.sub(" , ", ", ", "\n{{ru|1="+ city['ru'] +", "+ country['ru'] +" "+citycode+"}}")
        d+="\n|imagedate={{en|1={{ISOdate|"+date['en']+"}}}}\n{{ru|1="+date['ru']+"}}"
        d+="\n|imageauthor="+rprint(pg)
        # Re-quote the URLs, leaving the "http:" scheme prefix unencoded.
        sourcegp=url[:5]+urllib.quote(url[5:],"?=/")
        sourcep=source[:5]+urllib.quote(source[5:])
        d+="\n|imagesource="+"\n*Photo "+sourcep+"\n*Gallery page "+sourcegp
        d+="\n|permission=\n|other_versions="
        if airportcode!='':
            d+="\n|other_fields={{Information field|name=ICAO Airport Code|value="+airportcode+"}}"
        d+="\n}}"
        if latlon!='':
            d+="\n{{location dec|1="+latlon[0]+"|2="+latlon[1]+"}}"
        d+="\n\n=={{int:license-header}}=="
        d+="\n{{"+photographers[photographer][1]+"}}\n"
        airportcat = ICAO(airportcode)
        if airportcat!='':
            d+="\n[[Category:Aircraft at "+ airportcat +"]]"
        cat=unidecode(sern)+" (aircraft)" # Potential aircraft category
        if len(sern)>2 and catexists(cat):
            d+="\n[[Category:"+cat+"]]"
        d+="\n[[Category:Images uploaded by {{subst:User:Fae/Fae}}]]\n[[Category:Russianplanes.net photos (check needed)]]\n[[Category:Russianplanes.net photos (credit bar)]]"
        d+="\n{{WMUK equipment|Faebot Macmini|year={{subst:CURRENTYEAR}} }}"
        # Quick check of filename in use - these should be unique
        if nameused(filename.encode('utf-8')):
            print Red+' Filename found', White
            continue
        # Quick check of uniqueness of database number on Commons
        plist=[]
        # NOTE(review): search term is the literal string "RP"+"<pid>" —
        # presumably exploiting MediaWiki search syntax; confirm before reuse.
        ANmatch='"RP"+"'+pid+'"'
        vc=virinSearch(ANmatch)
        if vc==-1:
            print Yellow+"Error generating search matches for '"+ANmatch+"', problem, this should not happen.",White
            continue
        if vc>0 and len(pid)>4:
            print Red+"-"*75
            print Red+"File appears to be in use already for",ANmatch+" ("+str(vc)+" matches). Check "+Blue+"http://commons.wikimedia.org/w/index.php?search="+urllib.quote(ANmatch)
            print Red+"-"*75,White
            continue
        # Check if image is a duplicate on Commons
        try:
            duplicate,duptitle = dupcheck(source)
        except:
            print Red+"Problem when running the duplicate check for",source,White
            time.sleep(10)
            try:
                duplicate,duptitle = dupcheck(source)
            except:
                print Red+"Failed on second try, giving up and skipping"
                continue
        if duplicate:
            print Red+'File is already on Commons as',duptitle,White
            time.sleep(2)
            continue
        uptry(source,filename,d)
        #print Fore.YELLOW,d, Fore.WHITE
        # Nice pause for human oversight
        lag=1
        for i in range(lag):
            stdout.write("\r%d " % (lag-i))
            stdout.flush()
            sleep(1)
        stdout.write("\r ")
        stdout.flush()
        stdout.write("\r")