User:Faebot/Geograph/Code

From Wikimedia Commons, the free media repository
Jump to navigation Jump to search

Source Code[edit]

GeoLonCatsFromFile.py (Stage 1: London)[edit]

'''
Project C: Geograph regional categorization, Stage 1: London boroughs
Link:    http://commons.wikimedia.org/wiki/User:Faebot/Geograph
What:    This is Python code which relies on pywikipedia being installed
Description:
        This code takes a selected file of Commons image page names and uses their
        embedded geo-coordinates to query up to three websites for location description.
        If the image is a Geograph project image, the location information is used to
        add the image to a Commons category to assist in further categorization.
        See the above Link to see queries to generate the source file of image names.
        
        This variation of the script for London boroughs, selects 'London borough' from
        the MapIt data (based on Ordnance Survey Open Data). Where Open Street Map does
        not appear to match MapIt, a request is put to Google Maps for 'administrative
        level 3' location which is the London borough.

        Google Maps is used only for third opinions as usage rates are more limited
        than the other two sites (2,500 calls per day).

        OS data uses locations like 'Camberwell Borough Council' and these are mapped
        to 'London Borough of Camberwell' to match MapIt and Google Maps conventions.
        The 'wanted' list is automatically generated from the category
        "Geograph images in London".

Date:    November 2012
Author:  http://commons.wikimedia.org/wiki/User:Fae
License:  CC-BY-SA
Status:  Working in OSX
        Colours need to be switched off in Windoze
        Needs spring cleaning to remove spaghetti
        3-site vote working better in this version than others.

Special Notes:
Python regex call to generate source file --
python replace.py -xml:"//Volumes/Fae_32GB/commonswiki.xml" -regex -dotall '([Ll]ocation dec\s*\|\s*51\.[2-6]\d*\s*\|\s*(-0\.[0-4]|-0\.51[0-1]|-0\.50|-0\.[0-4]|0\.[0-2]|0\.3[0-3]).*([Gg]eograph\.org\.uk|\{\{[Gg]eograph\|)|([Gg]eograph\.org\.uk|\{\{[Gg]eograph\|).*[Ll]ocation dec\s*\|\s*51\.[2-6]\d*\s*\|\s*(-0\.[0-4]|-0\.51[0-1]|-0\.50|-0\.[0-4]|0\.[0-2]|0\.3[0-3]))' '\1FAEBOT-marker-FAEBOT' -nocase -savenew:"//Volumes/Fae_32GB/Geograph/Stage1/GeoboxLondonList.txt" -ns:6

'''

import wikipedia, re, time, catlib, pagegenerators, json, urllib2, urllib

#  Input file of candidate Commons page names (one '#[[:File:...]]' entry per line)
filename="Stage1/GeoboxLondonList.txt"

#  Read the whole file and split it into one entry per candidate page
ff=open("/Volumes/Fae_32GB/Geograph/"+filename,'r')
gen=ff.read().split('#[[:')
gen[0]=re.sub('^.*?File','File',gen[0])  # trim any preamble before the first File: title
totalPages = str(len(gen))
#  Reduce each entry to the bare page title (drop ']]' and anything after it)
for i in range(len(gen)):
    gen[i]=re.sub('\]\].*$','',gen[i].split('\n')[0])

#  testmode=True analyses pages without saving any edits to Commons
testmode=False
#testmode=True

#  Base name (no extension) used for the local log file name
category=filename.split('.')[0]

site = wikipedia.getSite('commons', 'commons')
#cat = catlib.Category(site,'Category:'+category)
#gen = pagegenerators.CategorizedPageGenerator(cat)  #  Get pages listed in Category from Commons
count=0
countF=0
countMin=0       # skip entries before this index (resume support)
countMax=200000  # stop after this many entries
lag=2  #  Arbitrary pause between url opens to reduce server load
hcats=''
scats=''
pageSafe=''
errors=''
data=''
commit=''

#  Naming fixes: pairs of (name as returned by a geo service, preferred
#  Wikimedia Commons name).
mappings=[('London Borough of Greenwich','Royal Borough of Greenwich'),
('City of London Corporation','City of London'),
('Westminster City','City of Westminster'),
]

def mapping(borough):
  """Return the preferred Commons name for *borough*, or the input unchanged."""
  for wrong, preferred in mappings:
    if borough == wrong:
      borough = preferred
  return borough

def xmltry(url):
  # Open `url` and return the (unread) urllib2 response object, retrying
  # forever with a linearly growing back-off (15s, 30s, 45s, ...).
  # NOTE(review): the bare `except` swallows every error, including
  # KeyboardInterrupt on Python 2, and there is no retry cap.
  countErr=0
  xml=''
  while xml=='':
      try:
          xml = urllib2.urlopen(url)
          time.sleep(lag)  # throttle between requests (module-level `lag`)
      except:
          xml=''
          countErr+=1
          print Cyan,'** ERROR',countErr,'\n ** Failed to read from '+url+'\n ** Pause for '+str(countErr*15)+' seconds and try again'+White
          time.sleep(15*countErr)
  return xml

def xmlget(mystr):
  """Return the text between <mystr> and </mystr> in the module-level
  string `data`, or '' when either tag is absent (or at position 0)."""
  open_tag = '<' + mystr + '>'
  close_tag = '</' + mystr + '>'
  if data.find(open_tag) < 1 or data.find(close_tag) < 1:
    return ''
  return data.split(open_tag)[1].split(close_tag)[0]

# If a category defines the wanted list, pull it in here
#  Build `wanted`: the Commons subcategories of the wanted category, scraped
#  from the XML of a single API call (first 500 members only).
wantedCat='Geograph images in London'
url="https://commons.wikimedia.org/w/api.php?action=query&list=categorymembers&cmtitle=Category:"+urllib.quote(wantedCat)+"&cmlimit=500&format=xml"
data=xmltry(url).read()
print 'Call to:',url
catsp=xmlget('categorymembers')
#  Turn each '<cm ... title="X" />' element into a bare title, one per line
catsp=re.sub('" \/>','\n',catsp)
catsp=re.sub('<cm.*?title="','',catsp).split('\n')
catsp.pop(len(catsp)-1)  # drop the trailing empty entry left by the final newline
catsp[0]=re.sub('.*?(Category)','\\1',catsp[0])
count=0
wanted=[]
# Pop everything that is not a category
while count<len(catsp):
    if catsp[count].find('Category:')==-1:
        catsp.pop(count)
    else:
        count+=1
wanted.extend(catsp)
print 'Produces:\n',wanted

def iswanted(w):
    """True when *w* matches an entry of the module-level `wanted` list,
    either verbatim or with the 'Category:Geograph images in the ' prefix
    removed from the entry."""
    for entry in wanted:
        stripped = re.sub('Category:Geograph images in the ', '', entry)
        if w == entry or w == stripped:
            return True
    return False

def mapit(lat, lon):
    # Query mySociety's MapIt for the point (lon, lat) and return the pair
    # (UK Parliament constituency name, London borough name). Either element
    # is '' when the service returns an empty result set.
    ekey=''
    key=''
    mapit="http://mapit.mysociety.org/point/4326/"+str(lon)+','+str(lat)
    print Blue,mapit+'.html',White
    # Escalating retries: immediately, then after 5, 15 and 60 minutes.
    # The final attempt is unguarded, so a fourth failure raises.
    try:
        data = json.load(xmltry(mapit))
    except:
        try:
            print Red,'** Mapit data failed to load, trying again in 5 minutes **'
            time.sleep(300)
            data = json.load(xmltry(mapit))
        except:
            print Red,'** Mapit data failed to load, trying again in 15 minutes **'
            time.sleep(900)
            try:
                data = json.load(xmltry(mapit))
            except:
                print Red,'** Mapit data failed to load, trying for a last time in 60 minutes **'
                time.sleep(60*60)
                data = json.load(xmltry(mapit))
    if data=={}:
        print Red,'** No data returned from:'
        print Red, mapit+'.html', White
        time.sleep(10)
        return ('','')
    # NOTE(review): when no area of the wanted type exists, `key`/`ekey`
    # are left set to the last key iterated (not ''), so the '' guards
    # below only cover the empty-dict case above.
    for key in data.keys():
        type_name=[b for (a,b) in data[key].items() if a=='type_name'][0]
        if type_name=='UK Parliament constituency':
            break
    for ekey in data.keys():
        type_name=[b for (a,b) in data[ekey].items() if a=='type_name'][0]
        if type_name=='London borough':  # was Unitary Authority was European region but Unitary authority seems to work better in Scotland
            break
    if key=='':
        pc=''
    else:
        pc=[b for (a,b) in data[key].items() if a==u'name'][0]      #  UK Parliament constituency
    if ekey=='':
        eurr=''
    else:
        eurr=[b for (a,b) in data[ekey].items() if a==u'name'][0]  #  European region
        #London only
        # Normalise OS-style 'X Borough Council' to 'London Borough of X'
        if eurr.find('Borough Council')>-1:
            eurr=eurr.split(' Borough Council')[0]
            eurr='London Borough of '+eurr
    return (pc, eurr)

#  Mark the local log file name when this is a live (commit) run
if not testmode:
    commit=' (commit)'
logFile="/Volumes/Fae_32GB/Geograph/"+category+commit+".txt"
#f = open(logFile, 'w')
#  Wikitable header for the (currently disabled) on-wiki report
log = '\n==Test on images in '+filename+'==\n'+time.asctime( time.localtime(time.time()) )+':start\n{{cot}}\n{|class="wikitable sortable"\n!Page!!Coords!!<abbr title="Check if the image is geo-tagged as being the camera or object position.">Object</abbr>!!<abbr title="Is this a Geograph project image?">Geo</abbr>!!Borough!!Action'

# Regular Colors (\033[)<effect 0=normal 1=bold 4=underline>(;3)<color 0=black>(m)
# ANSI terminal escape codes used throughout for console output
Red="\033[0;31m"     #Red
Green="\033[0;32m"   #Green
GreenB="\033[1;32m"  #Green bold
GreenU="\033[4;32m"  #Green underlined
Yellow="\033[0;33m"  #Yellow
Blue="\033[0;34m"    #Blue
Purple="\033[0;35m"  #Purple
Cyan="\033[0;36m"    #Cyan
White="\033[0;37m"   #White

print '\n',Purple,'*'*60,White,'\n'
if testmode:
    print Cyan,'TEST MODE '*6,White,'\n\n',Purple,'*'*60,White
def trimquotes(mystr):
  """Strip surrounding double quotes, then a leading 'Category:' prefix."""
  unquoted = re.sub('^"|"$', '', mystr)
  return re.sub('^Category:', '', unquoted)
def xmlget(mystr):
  # Return the text between <mystr> and </mystr> in the global `data`, or ''.
  # NOTE(review): this is a byte-identical redefinition of the xmlget
  # declared earlier in this script; it silently replaces it.
  if data.find('<'+mystr+'>')<1 or data.find('</'+mystr+'>')<1:
    return ''
  else:
    return data.split('<'+mystr+'>')[1].split('</'+mystr+'>')[0]

for p in gen:
  #  Get categories for candidate image using API call [on error: wait, try 3 times then add to error log]
  count += 1
  if p.find("<!--")>-1 or len(p)<5:
      # NOTE(review): the `continue` above the print means p='[blank]' is
      # never actually printed for empty entries.
      if len(p)<1:
          p='[blank]'
          continue
      print Cyan+"Page",count,p
      continue
  if count<countMin:
      continue
  if count>countMax:
      break
  countS = str(count)
  try:
      page=wikipedia.Page(site,p)
  except:
      print Red,'** Page read failed',count,'page:',p
      time.sleep(5)
      continue
  #  Strip the '[[commons:...]]' wrapper from the page's repr to get a title
  pageSafe = re.sub('\[\[commons:(.*)\]\]',r'\1',str(page))
  catCall='http://commons.wikimedia.org/w/api.php?action=query&titles='+urllib.quote_plus(pageSafe,'[]:')+'&prop=categories&clprop=hidden&format=xml'
  print Yellow+'Page',countS+'/'+totalPages,pageSafe[5:60],White  #  catCall:',catCall,'\n'
  xml= xmltry(catCall)
  data = xml.read()
  # NOTE(review): the two uses of `catcall` below are a NameError waiting to
  # happen -- the variable is spelled `catCall`.
  if data.find('<categories>')<1:  ###  Need to add a go back and try again loop in case of timeout problems, 1,5,15 minute pauses
    print '*****\nNo categories found using',catcall,'\nOn ',page
    print data,'\n*****\n'
    errors=errors+'\nNo categories found using:'+catcall+'\n----\n'+str(data)+'\n----'
    continue
  #  Reduce the XML to a '|'-separated list of category attributes/titles
  data = data.split('<categories>')[1].split('</categories>')[0]
  data = re.sub(r'<cl ns="14" title=(.*?)\s*/>',r'\1|',data)
  data = re.sub(r'\|$','',data)
  hcats=[]  # hidden categories
  gcats=[]  # Geograph categories
  scats=[]  # visible, non-Geograph categories
  #  Flag whether any category string mentions the Geograph project
  if data.lower().find('geograph ')==-1:
      isGeograph=0
  else:
      isGeograph=-1
  #  Partition the page's categories into hidden / Geograph / visible.
  #  Fixed: the elif/else branches were over-indented relative to their if,
  #  which was a hard IndentationError under CPython.
  for c in data.split('|'):
    if c.find('hidden=')>0:
      hcats.append(c)
    elif c.find('Geograph')>0:
      gcats.append(c)
    else:
      scats.append(c) # Visible cats
#    If candidate image is already categorized against a Geograph borough then next
  #  Sort this out later
#
#    Get image page text and extract data from Object location dec or Location dec templates [not found: error log, next]
#
  text=page.get()  #  This will cause bot to sleep for 4 seconds
  textlower=text.lower()

  #  Reality check - is this a Geograph image
  # NOTE(review): `textlower` is already lower-cased, so the
  # '{{Geograph|' and 'Geograph.org.uk' branches can never match; also
  # `>1` misses a match at index 0 or 1 (should presumably be `>-1`).
  if textlower.find('{{Geograph|')>1 or textlower.find('{{geograph|')>1 or  textlower.find('geograph.org.uk')>1 or textlower.find('Geograph.org.uk')>1:
      pass
  else:
      print Red,'** This is not a Geograph image - skip',White
      continue
  #  Check if object location rather than camera location
  isObject= (textlower.find('{{object location dec')>-1)
  # NOTE(review): `isObject>-1` is always True for a bool, so this skip
  # triggers whenever '{{location dec' is absent -- even when an
  # '{{object location dec' template is present. Probably intended:
  # `and not isObject`.
  if textlower.find('{{location dec')<0 and isObject>-1:
    print Red+'    There is no geodata in this page, so skipping\n'+White
    #log+='*'+countS+' [[:'+pageSafe+']] has no geo data.\n'
    #f.write(log)
    #log=''
    continue
    # Extract coordinates
  lat = re.search(r'\{\{([Oo]bject)?\s?[Ll]ocation dec\s*\|[\s\-]*[\d\.]+',text).group(0).split('|')[1]
  lon = re.search(r'\{\{([Oo]bject)?\s?[Ll]ocation dec\s*\|[\s\-]*[\d\.]+\s*\| ?[\s\-]*[\d\.]+',text).group(0).split('|')[2]
  lat = re.sub(r'\s','',lat)
  lon = re.sub(r'\s','',lon)
  latn = float(lat)
  lonn = float(lon)
  lonSafe=urllib.quote_plus(lon)
  latSafe=urllib.quote_plus(lat)
#
#    For each image test if within OSM bounding box [if not: next]
#
  #  Greater London bounding box (hard-coded)
  if not (latn > 51.28676 and latn < 51.69188 and lonn > -0.51104 and lonn < 0.33402):
      #print Red,' Location not in London as outside OSM bounding box, so skipping **'+White
      #log+='** Location not in London as outside OSM bounding box\n'
      #f.write(log)
      #log=''
      continue
  log+='\n|-\n|'+countS+' [[:'+pageSafe+']]||[http://www.uk-postcodes.com/latlng/'+lat+','+lon+' '+lat+','+lon+']'
  if isObject:
      print GreenB+'  ** This page uses {{Object location dec}}\n'+White
      log += '|| <b>object</b> '
  else:
      log+='|| camera '
  if isGeograph:
      log += '|| {{tick}} '
  else:
      log += '|| N '
  #f.write(log)
  log=''
  #  uk-postcodes.com lookup (result currently unused beyond the fetch)
  urlukpc = 'http://www.uk-postcodes.com/latlng/'+latSafe+','+lonSafe+'.xml'
  xml = xmltry(urlukpc)
  data=xml.read()
  ukPostcode=''
  # MapIt - JSON
  (mapitConstituency,mapitEuropean)=mapit(lat,lon)
  mapitConstituency=mapping(mapitConstituency)
  mapitEuropean=mapping(mapitEuropean.split(' Council')[0].split(' 0')[0])
  print Cyan,'Mapit Const.',mapitConstituency,'('+mapitEuropean+')',White
  #  Open Street Map
  urlosm='http://nominatim.openstreetmap.org/reverse?format=xml&lat='+latSafe+'&lon='+lonSafe+'&zoom=18&addressdetails=1'
  xml=xmltry(urlosm)
  data=xml.read()
  #print '\n',data,'\n'
  #print Purple+'      Address from Open Street Map'
  #print '         Road '+xmlget('road')
  #print '       Suburb '+xmlget('suburb')
  #  OSM city given London Borough
  #  Test against uk-postcode data as OSM seems not reliable enough
  osmBorough=xmlget('city')
  osmBorough=mapping(osmBorough)
  if mapitConstituency.find(osmBorough)==-1 and mapitEuropean.find(osmBorough)>-1:
      mapitConstituency=mapitEuropean
  print Cyan,' OSM Borough',osmBorough,White
  if len(osmBorough)<2:
      #print Purple+'    District '+ukDistrict+Red+' (uk-p as OSM gave null return)'+White
      log+='|| OSM:(blank) mapit:[[:Category:'+mapitConstituency+'|]] '
      boroughCheck=-1  #  Null return means a third check is needed
  else:
      if osmBorough.lower().find(mapitConstituency.lower())>-1:
          print Purple,'      County '+osmBorough+Green+' confirmed'+Purple
          county=osmBorough
          boroughCheck=0
      else:
          print Purple,'        City '+Red+osmBorough+Purple+' (mapit:'+mapitConstituency+')'
          boroughCheck=-1  #  Failure here is the most important to follow-up on
  borough=mapitEuropean
  #  For London County should always be London
  county=xmlget('county')
  #print '       County '+county
  #print '      Country '+xmlget('country')+White
  #  Neither source puts the point in London: drop it from consideration
  if county.lower().find('london')==-1 and borough.lower().find('london')==-1:
      boroughCheck=0
      borough=''
      isGeograph=0
  #
  #  3rd check of Borough using Google Maps, only when needed, max of 2,500 queries per day
  #
  if boroughCheck==-1:
      gmap=u'http://maps.googleapis.com/maps/api/geocode/json?latlng='+lat+','+lon+'&sensor=true'
      data = json.load(xmltry(gmap))
      gmapBorough=[b['long_name'] for a in data['results'] for b in a['address_components'] for c in b['types'] if c=='administrative_area_level_2']
      if len(gmapBorough)>0:
          gmapBorough=gmapBorough[0]
      else:
          gmapBorough=''
      #  'London'/'Greater London' is too coarse: drill into admin level 3
      if gmapBorough=='London' or gmapBorough=='Greater London' :
          gmapBorough=[b['long_name'] for a in data['results'] for b in a['address_components'] for c in b['types'] if c=='administrative_area_level_3']
          if len(gmapBorough)>0:
              gmapBorough=gmapBorough[0]
      if len(gmapBorough)>1:
          print Purple+'    GMap Borough '+gmapBorough+White
      # We know a!=b test a=c and b=c
      if osmBorough==gmapBorough:
          borough=gmapBorough
          print Cyan,'Google Maps agrees with Open Street Map',White
          print Cyan,'Going with',borough,'by 2 out of 3',White
      elif mapitConstituency==gmapBorough or mapitEuropean==gmapBorough:
          borough=gmapBorough
          print Cyan,'Google Maps agrees with MapIt',White
          print Cyan,'Going with',borough,'by 2 out of 3',White
      else:
          print Red,'** No majority vote **',White
          borough=''
  borough=mapping(borough)
  if isGeograph==-1 and len(borough)>2:
    if text.find('[[Category:Geograph images in the '+borough+']]')>0:
        log+='|| <i>Already in [[:Category:Geograph images in the '+borough+'|]]</i>'
    else:
        log+='|| <b>Add</b> to [[:Category:Geograph images in the '+borough+'|]]'
  else:
    log+='|| None'
#  f.write(log)
  log=''
  if not iswanted(borough):
      if borough!='':
          print Red+'   ** '+borough,White
      print Red+'   ** Not on the wanted list'+White
      continue
  #urlsoc='http://mapit.mysociety.org/point/4326/'+lonSafe+','+latSafe
  #datajson = json.load(urllib2.urlopen(urlsoc))
  #print '\nDump from mapit:\n'+str([a for b in datajson for a in b])+White
#    Map OSM given borough (=locality) to existing Commons category in Category:London boroughs
#    If the number of visible non-Geograph categories on the image are 0, then
#        Add an existing, visible, Commons London borough category
  if len(scats)<1:
    text=text+'\n[[Category:'+borough+']]'
#        If template exists, then remove Uncategorized-Geograph
#        Add Check categories-Geograph
#    Add hidden Geograph by London borough category
  isCooked=(text.find('[[Category:Geograph images in the '+borough+']]')>-1)
  if (not isCooked) and (not testmode):
      text=text+'\n[[Category:Geograph images in the '+borough+']]'
      wikipedia.setAction('Add to [[Category:Geograph images in the '+borough+']], see [[User:Faebot/Geograph#C1]] for project details)')
      time.sleep(30)
      page.put(text)
      print Yellow+'  Page updated on Commons'+White
  if isCooked:
      print Red+'  This page is already categorized under\n    Geograph images in the '+borough+White
#    Write record to local log

#  Close the wikitable in the (disabled) log. NOTE(review): '{{cob}}n' looks
#  like it was meant to be '{{cob}}\n' -- left unchanged as it is runtime text.
log = '\n|}\n{{cob}}n'+time.asctime( time.localtime(time.time()) )+':end\n'
#f.write(log)
#f.close()  #  Close the log file

if testmode:
  print Green+'Test run complete on '+category+White
  #f = open(logFile, 'r')
  #log = f.read()
  #f.close()
  #wikipedia.setAction('Geograph '+filename+' analysis report')
  #page=wikipedia.Page(site,'User:Faebot/SandboxL')
  #log=page.get()+log
  #page.put(log)
else:
  print Green+'Commit run complete on '+filename+White

GeoStage2bFromFile.py (Stage 2b: Cornwall, Devon, Somerset and the Isles of Scilly)[edit]

'''
Project C: Geograph regional categorization, Stage 2b
Link:    http://commons.wikimedia.org/wiki/User:Faebot/Geograph
What:    This is Python code which relies on pywikipedia being installed
Description:
        This code takes a selected file of Commons image page names and uses their
        embedded geo-coordinates to query up to three websites for location description.
        If the image is a Geograph project image, the location information is used to
        add the image to a Commons category to assist in further categorization.
        See the above Link to see queries to generate the source file of image names.
Date:    November 2012
Author:  http://commons.wikimedia.org/wiki/User:Fae
License:  CC-BY-SA
Status:  Working in OSX
        Colours need to be switched off in Windoze
        Use of Mapit data could be better
        Needs spring cleaning to remove spaghetti
'''
import wikipedia, re, time, catlib, pagegenerators, json, urllib2, urllib

#  Wiki link used in edit summaries for this stage
project="[[User:Faebot/Geograph#C2b]]"

#filename="GeoboxLondonList.txt"
#filename="Listed buildings in London list.txt"
filename="Stage2/GeoboxCornwallList.txt"

#  Read the input file and split into one entry per candidate page,
#  then reduce each entry to the bare page title.
ff=open("/Volumes/Fae_32GB/Geograph/"+filename,'r')
gen=ff.read().split('#[[:')
gen[0]=re.sub('^.*?File','File',gen[0])
totalPages = str(len(gen))
for i in range(len(gen)):
    gen[i]=re.sub('\]\].*$','',gen[i].split('\n')[0])

#  testmode=True analyses pages without saving any edits to Commons
testmode=False
#testmode=True

#  Pairs of (name from a geo service, preferred Commons county name).
#  NOTE(review): several entries are duplicated ('South West Devon',
#  'Taunton Deane', 'Torbay', 'North Somerset') -- harmless but redundant.
mappings=[('London Borough of Greenwich','Royal Borough of Greenwich'),
('Bath & North East Somerset','Somerset'),
('Bath and North East Somerset','Somerset'),
('Taunton Deane','Somerset'),
('Devon County','Devon'),
('South West Devon','Devon'),
('Plymouth','Devon'),
('Torbay','Devon'),
('Central Devon','Devon'),
('Somerset County','Somerset'),
('North Somerset','Somerset'),
('Poole','Dorset'),
('South Dorset','Dorset'),
('Purbeck District','Dorset'),
('Purbeck','Dorset'),
('Bournemouth','Dorset'),
('Newton Abbot','Devon'),
('Torridge and West Devon','Devon'),
('East Devon','Devon'),
('Exeter','Devon'),
('North Devon','Devon'),
('Plymouth, Moor View','Devon'),
('Plymouth, Sutton and Devonport','Devon'),
('South West Devon','Devon'),
('Tiverton and Honiton','Devon'),
('Torbay','Devon'),
('Totnes','Devon'),
('South Somerset','Somerset'),
('Taunton Deane','Somerset'),
('West Somerset','Somerset'),
('Sedgemoor','Somerset'),
('Mendip','Somerset'),
('North Somerset','Somerset'),
('Yeovil','Somerset'),
('Bridgwater and West Somerset','Somerset')
]  #  Mappings are based on Wikimedia Commons use

#  Fixed wanted list for this stage (script 1 builds its list from a category)
wanted=['Devon','Cornwall','Somerset','Dorset','Isles of Scilly']

#  NOTE(review): unlike script 1 this keeps the '.txt' extension, so the log
#  file below ends up named '...GeoboxCornwallList.txt (commit).txt'.
category=filename

site = wikipedia.getSite('commons', 'commons')
#cat = catlib.Category(site,'Category:'+category)
#gen = pagegenerators.CategorizedPageGenerator(cat)  #  Get pages listed in Category from Commons
count=0
countF=0
countMin=470     # resume point: skip the first 470 entries
countMax=148000
lag=0  #  Arbitrary pause between url opens to reduce server load
hcats=''
scats=''
pageSafe=''
errors=''
data=''
commit=''


def mapping(borough):  #  If borough is in mappings then return the mapped name
  # Return the preferred Commons name for `borough`, or the input unchanged.
  for i in mappings:
    if borough==i[0]:
      borough=i[1]
  return borough

def wantedlist(borough):
  """Return *borough* when it appears in the module-level `wanted` list,
  otherwise ''.  (Fixed: removed an unreachable `break` that sat after the
  `return` statement.)"""
  for i in wanted:
      if borough==i:
          return borough
  return ''

def xmltry(url):
  # Open `url` and return the (unread) response object, retrying forever
  # with a linearly growing back-off.  Identical to script 1's xmltry.
  # NOTE(review): bare `except` swallows every error; no retry cap.
  countErr=0
  xml=''
  while xml=='':
      try:
          xml = urllib2.urlopen(url)
          time.sleep(lag)  # throttle between requests (lag is 0 in this stage)
      except:
          xml=''
          countErr+=1
          print Cyan,'** ERROR',countErr,'\n ** Failed to read from '+url+'\n ** Pause for '+str(countErr*15)+' seconds and try again'+White
          time.sleep(15*countErr)
  return xml

def mapit(lat, lon):
    # Query MapIt for the point (lon, lat) and return the pair
    # (UK Parliament constituency, Unitary Authority name); '' for either
    # element when the service returns an empty result set.  Differs from
    # script 1's mapit: matches 'Unitary Authority' and does no London
    # borough renaming.
    ekey=''
    key=''
    mapit="http://mapit.mysociety.org/point/4326/"+str(lon)+','+str(lat)
    print Blue,mapit+'.html',White
    # Escalating retries: immediately, then after 5, 15 and 60 minutes;
    # a fourth failure raises.
    try:
        data = json.load(xmltry(mapit))
    except:
        try:
            print Red,'** Mapit data failed to load, trying again in 5 minutes **'
            time.sleep(300)
            data = json.load(xmltry(mapit))
        except:
            print Red,'** Mapit data failed to load, trying again in 15 minutes **'
            time.sleep(900)
            try:
                data = json.load(xmltry(mapit))
            except:
                print Red,'** Mapit data failed to load, trying for a last time in 60 minutes **'
                time.sleep(60*60)
                data = json.load(xmltry(mapit))
    if data=={}:
        print Red,'** No data returned from:'
        print Red, mapit+'.html', White
        time.sleep(10)
        return ('','')
    # NOTE(review): if no area of the wanted type exists, `key`/`ekey` keep
    # the last key iterated, so the '' guards below only cover the
    # empty-dict case.
    for key in data.keys():
        type_name=[b for (a,b) in data[key].items() if a=='type_name'][0]
        if type_name=='UK Parliament constituency':
            break
    for ekey in data.keys():
        type_name=[b for (a,b) in data[ekey].items() if a=='type_name'][0]
        if type_name=='Unitary Authority':  # was European region but Unitary authority seems to work better in Scotland
            break
    if key=='':
        pc=''
    else:
        pc=[b for (a,b) in data[key].items() if a==u'name'][0]      #  UK Parliament constituency
    if ekey=='':
        eurr=''
    else:
        eurr=[b for (a,b) in data[ekey].items() if a==u'name'][0]  #  European region
    return (pc, eurr)

#  Mark the local log file name when this is a live (commit) run
if not testmode:
    commit=' (commit)'
logFile="/Volumes/Fae_32GB/Geograph/"+category+commit+".txt"
f = open(logFile, 'w')
#  Wikitable header for the on-wiki style report written to the local log
log = '\n==Test on images in '+filename+'==\n'+time.asctime( time.localtime(time.time()) )+':start\n{{cot}}\n{|class="wikitable sortable"\n!Page!!Coords!!<abbr title="Check if the image is geo-tagged as being the camera or object position.">Object</abbr>!!<abbr title="Is this a Geograph project image?">Geo</abbr>!!Borough!!Action'

# Regular Colors (\033[)<effect 0=normal 1=bold 4=underline>(;3)<color 0=black>(m)
# ANSI terminal escape codes used throughout for console output
Red="\033[0;31m"     #Red
Green="\033[0;32m"   #Green
GreenB="\033[1;32m"  #Green bold
GreenU="\033[4;32m"  #Green underlined
Yellow="\033[0;33m"  #Yellow
Blue="\033[0;34m"    #Blue
Purple="\033[0;35m"  #Purple
Cyan="\033[0;36m"    #Cyan
White="\033[0;37m"   #White

print '\n',Purple,'*'*60,White,'\n'
if testmode:
    print Cyan,'TEST MODE '*6,White,'\n\n',Purple,'*'*60,White
def trimquotes(mystr):
  # Strip surrounding double quotes, then a leading 'Category:' prefix.
  return re.sub('^Category:','',re.sub('^"|"$','',mystr))
def xmlget(mystr):
  # Return the text between <mystr> and </mystr> in the global `data`, or ''.
  if data.find('<'+mystr+'>')<1 or data.find('</'+mystr+'>')<1:
    return ''
  else:
    return data.split('<'+mystr+'>')[1].split('</'+mystr+'>')[0]

for p in gen:
  #  Get categories for candidate image using API call [on error: wait, try 3 times then add to error log]
  count += 1
  if p.find("<!--")>-1 or len(p)<5:
      # NOTE(review): the `continue` above the print means p='[blank]' is
      # never actually printed for empty entries.
      if len(p)<1:
          p='[blank]'
          continue
      print Cyan+"Page",count,p
      continue
  if count<countMin:
      continue
  if count>countMax:
      break
  countS = str(count)
  try:
      page=wikipedia.Page(site,p)
  except:
      # NOTE(review): despite the message, `continue` skips the page rather
      # than retrying it.
      print Red,'** Page read failed',count,'page:',p
      print Red,'** I will try once more in 5 minutes'
      time.sleep(300)
      continue
  #  Strip the '[[commons:...]]' wrapper from the page's repr to get a title
  pageSafe = re.sub('\[\[commons:(.*)\]\]',r'\1',str(page))
  catCall='http://commons.wikimedia.org/w/api.php?action=query&titles='+urllib.quote_plus(pageSafe,'[]:')+'&prop=categories&clprop=hidden&format=xml'
  print Yellow+'Page',countS+'/'+totalPages,pageSafe[5:60],White  #  catCall:',catCall,'\n'
  xml= xmltry(catCall)
  try:
      data = xml.read()
  except:
      print Red,'** failed to read xml, trying for a second time in 5 minutes'
      time.sleep(300)
      data = xml.read()
  # NOTE(review): the two uses of `catcall` below are a NameError waiting to
  # happen -- the variable is spelled `catCall`.
  if data.find('<categories>')<1:  ###  Need to add a go back and try again loop in case of timeout problems, 1,5,15 minute pauses
    print '*****\nNo categories found using',catcall,'\nOn ',page
    print data,'\n*****\n'
    errors=errors+'\nNo categories found using:'+catcall+'\n----\n'+str(data)+'\n----'
    continue
  #  Reduce the XML to a '|'-separated list of category attributes/titles
  data = data.split('<categories>')[1].split('</categories>')[0]
  data = re.sub(r'<cl ns="14" title=(.*?)\s*/>',r'\1|',data)
  data = re.sub(r'\|$','',data)
  hcats=[]  # hidden categories
  gcats=[]  # Geograph categories
  scats=[]  # visible, non-Geograph categories
  #  Partition the page's categories into hidden / Geograph / visible.
  #  Fixed: the elif/else branches were over-indented relative to their if,
  #  which was a hard IndentationError under CPython.
  for c in data.split('|'):
    if c.find('hidden=')>0:
      hcats.append(c)
    elif c.find('Geograph')>0:
      gcats.append(c)
    else:
      scats.append(c) # Visible cats
#
#    Get image page text and extract data from Object location dec or Location dec templates [not found: error log, next]
#
  text=page.get()  #  This will cause bot to sleep for 4 seconds
  textlower=text.lower()
  isObject= (textlower.find('{{object location dec')>-1)
  # NOTE(review): `isObject>-1` is always True for a bool, so this skip
  # triggers whenever '{{location dec' is absent, even with an object
  # location present. Probably intended: `and not isObject`.
  if textlower.find('{{location dec')<0 and isObject>-1:
    print Red,'** There is no geodata in this page, so skipping'+White
    #log+='*'+countS+' [[:'+pageSafe+']] has no geo data.\n'
    #f.write(log)
    #log=''
    continue
  #
  # Extract coordinates
  #
  lat = re.search(r'\{\{([Oo]bject)?\s?[Ll]ocation dec\s*\|[\s\-]*[\d\.]+',text).group(0).split('|')[1]
  lon = re.search(r'\{\{([Oo]bject)?\s?[Ll]ocation dec\s*\|[\s\-]*[\d\.]+\s*\| ?[\s\-]*[\d\.]+',text).group(0).split('|')[2]
  lat = re.sub(r'\s','',lat)
  lon = re.sub(r'\s','',lon)
  latn = float(lat)  # NOTE(review): latn/lonn are unused in this stage (no bounding-box test)
  lonn = float(lon)
  lonSafe=urllib.quote_plus(lon)
  latSafe=urllib.quote_plus(lat)

  #  Reality check - is this a Geograph image
  # NOTE(review): `textlower` is lower-cased, so the '{{Geograph|' and
  # 'Geograph.org.uk' branches can never match; `>1` also misses a match
  # at index 0 or 1.
  if textlower.find('{{Geograph|')>1 or textlower.find('{{geograph|')>1 or  textlower.find('geograph.org.uk')>1 or textlower.find('Geograph.org.uk')>1:
      pass
  else:
      print Red,'** This is not a Geograph image - skip',White
      continue
  #    Get OSM address data
  #  uk-postcodes.com
  #urlukpc='http://www.uk-postcodes.com/latlng/'+latSafe+','+lonSafe+'.xml'
  #print Blue,'UKP',urlukpc
  #xml = xmltry(urlukpc)
  #data=xml.read()
  ukPostcode=''#xmlget('postcode')
  #ukPostcode=re.sub('\s*(\s\w?[\d]).*$',r'\1',ukPostcode)  #  Trim postcode
  #ukDistrict=xmlget('district')
  #if ukDistrict.find('<title>')>0 and ukDistrict.find('</title>')>0:
  #    ukDistrict=ukDistrict.split('<title>')[1].split('</title>')[0]
  #else:
  #      ukDistrict=''
  # MapIt - JSON (constituency, unitary authority)
  (mapitConstituency,mapitEuropean)=mapit(lat,lon)
  mapitConstituency=mapping(mapitConstituency)
  mapitEuropean=mapping(mapitEuropean.split(' Council')[0].split(' 0')[0])
  print Cyan,'Mapit Const.',mapitConstituency,'('+mapitEuropean+')',White
  #  Open Street Map
  urlosm='http://nominatim.openstreetmap.org/reverse?format=xml&lat='+latSafe+'&lon='+lonSafe
  print Blue,'OSM',urlosm,White
  xml=xmltry(urlosm)
  data=xml.read()
  #print '\n',data,'\n'
  #print Purple+'      Address from Open Street Map'
  #print '         Road '+xmlget('road')
  #print '       Suburb '+xmlget('suburb')
  #  OSM city given London Borough
  #  Test against uk-postcode data as OSM seems not reliable enough
  osmBorough=xmlget('county')
  osmBorough=mapping(osmBorough)
  if mapitConstituency.find(osmBorough)==-1 and mapitEuropean.find(osmBorough)>-1:
      mapitConstituency=mapitEuropean
  if osmBorough==mapitEuropean:
      mapitConstituency=mapitEuropean  #  Cut down on the number of GMap checks
  print Cyan,'  OSM County',osmBorough,White
  if len(osmBorough)<2:
      #print Purple+'    District '+ukDistrict+Red+' (uk-p as OSM gave null return)'+White
      log+='|| OSM:(blank) mapit:[[:Category:'+mapitConstituency+'|]] '
      boroughCheck=-1  #  Null return means a third check is needed
  else:
      if osmBorough.lower().find(mapitConstituency.lower())>-1 and len(mapitConstituency)>1:
          print Purple,'      County '+osmBorough+Green+' confirmed'+Purple
          log+='|| [[:Category:'+osmBorough+'|]]'
          boroughCheck=0
      else:
          print Purple,'        City '+Red+osmBorough+Purple+' (mapit:'+mapitConstituency+')'
          log+='||OSM:[[:Category:'+osmBorough+'|]] uk-p:'+mapitConstituency+' '
          boroughCheck=-1  #  Failure here is the most important to follow-up on
  borough=osmBorough
  #  For London County should always be London
  county=xmlget('county')
  print '              '+county
  osmPostcode=xmlget('postcode')
  osmPostcode=re.sub('\s*(\s\w?[\d]).*$',r'\1',osmPostcode)
  #if osmPostcode.lower().find(ukPostcode.lower()[:2])>-1:
      #print '     Postcode '+osmPostcode+Green+' confirmed'+Purple #postcodes from OSM appear rubbish
      #log+='** Postcode: '+osmPostcode+' (check says '+ukPostcode+')\n'
  #else:
      #print '     Postcode '+Red+osmPostcode+Purple+' (uk-p):'+ukPostcode
      #log+='** Postcode: '+osmPostcode+' <b>but</b> uk-postcodes says '+ukPostcode+'\n'
  #print '      Country '+xmlget('country')+White
  #
  #  3rd check of Borough using Google Maps, only when needed, max of 2,500 queries per day
  #
  if boroughCheck==-1:
      gmap=u'http://maps.googleapis.com/maps/api/geocode/json?latlng='+lat+','+lon+'&sensor=true'
      data = json.load(xmltry(gmap))
      gmapCounty=[b['long_name'] for a in data['results'] for b in a['address_components'] for c in b['types'] if c=='administrative_area_level_2']
      if len(gmapCounty)>0:
          gmapCounty=gmapCounty[0]
      else:
          gmapCounty=''
      if len(gmapCounty)>1:
          print Purple+'    GMap County '+gmapCounty+White
          if mapitConstituency.lower().find(gmapCounty.lower())>-1 or gmapCounty.lower().find(mapitConstituency.lower())>-1:
              print Cyan+ '   ** Use '+Green+gmapCounty+Cyan+' as preferred county name'+White
              log+='gmap:<b>[[:Category:'+gmapCounty+'|]]</b> by a poll of 3 websites\n'
              county=gmapCounty
  county=mapping(county)
  if len(county)>2:
    if text.find('[[Category:Geograph images in '+county+']]')>0:
        log+='|| <i>Already in [[:Category:Geograph images in '+county+'|]]</i>'
    else:
        log+='|| <b>Add</b> to [[:Category:Geograph images in '+county+'|]]'
  else:
    log+='|| None'
  #f.write(log)
  log=''
  if len(county)==0:
      print Red,'** Failed county mapping',White
      continue
  #urlsoc='http://mapit.mysociety.org/point/4326/'+lonSafe+','+latSafe
  #datajson = json.load(urllib2.urlopen(urlsoc))
  #print '\nDump from mapit:\n'+str([a for b in datajson for a in b])+White
#    Map OSM given county (=locality) to existing Commons category in Category:London countys
#    If the number of visible non-Geograph categories on the image are 0, then
#        Add an existing, visible, Commons London county category
  if len(scats)<1:
    text=text+'\n[[Category:'+county+']]'
#        If template exists, then remove Uncategorized-Geograph
#        Add Check categories-Geograph
#    Add hidden Geograph by London county category
  oldcounty=county
  county=wantedlist(county)
  if county=='':
      print Red,'**',oldcounty
      print Red,'** Failed county wanted list',White
      continue
  #  Category names read '...in the Isles of Scilly'
  if county.find('Isles ')>-1:
      county="the "+county
  isCooked=(text.find('[[Category:Geograph images in '+county+']]')>-1)
  if (not isCooked) and (not testmode):
      text=text+'\n[[Category:Geograph images in '+county+']]'
      wikipedia.setAction('Add to [[Category:Geograph images in '+county+']], see '+project+' for details')
      time.sleep(15)
      page.put(text)
      #print text
      print Yellow+'  Page updated on Commons'+White
  if isCooked:
      print Red+'  This page is already categorized under\n    Geograph images in '+county+White

#    Write record to local log

#  Close the wikitable in the local log. NOTE(review): '{{cob}}n' looks like
#  it was meant to be '{{cob}}\n' -- left unchanged as it is runtime text.
log = '\n|}\n{{cob}}n'+time.asctime( time.localtime(time.time()) )+':end\n'
f.write(log)
f.close()  #  Close the log file

if testmode:
  print Green+'Test run complete on '+filename+White
  f = open(logFile, 'r')
  log = f.read()  # read back the log (currently not posted anywhere)
  f.close()
else:
  print Green+'Uploads complete on '+filename+White