User:Faebot/Geograph/Code
< User:Faebot | Geograph
Source Code
GeoLonCatsFromFile.py (Stage 1: London)
''' Project C: Geograph regional categorization, Stage 1: London boroughs Link: http://commons.wikimedia.org/wiki/User:Faebot/Geograph What: This is Python code which relies on pywikipedia being installed Description: This code takes a selected file of Commons image page names and uses their embedded geo-coordinates to query up to three websites for location description. If the image is a Geograph project image, the location information is used to add the image to a Commons category to assist in further categorization. See the above Link to see queries to generate the source file of image names. This variation of the script for London boroughs, selects 'London borough' from the MapIt data (based on Ordnance Survey Open Data). Where Open Street Map does not appear to match MapIt, a request is put to Google Maps for 'administrative level 3' location which is the London borough. Google Maps is used only for third opinions as usage rates are more limited than the other two sites (2,500 calls per day). OS data uses locations like 'Camberwell Borough Council' and these are mapped to 'London Borough of Camberwell' to match MapIt and Google Maps conventions. The 'wanted' list is automatically generated from the category "Geograph images in London". Date: November 2012 Author: http://commons.wikimedia.org/wiki/User:Fae License: CC-BY-SA Status: Working in OSX Colours need to be switched off in Windoze Needs spring cleaning to remove spaghetti 3-site vote working better in this version than others. 
Special Notes: Python regex call to generate source file -- python replace.py -xml:"//Volumes/Fae_32GB/commonswiki.xml" -regex -dotall '([Ll]ocation dec\s*\|\s*51\.[2-6]\d*\s*\|\s*(-0\.[0-4]|-0\.51[0-1]|-0\.50|-0\.[0-4]|0\.[0-2]|0\.3[0-3]).*([Gg]eograph\.org\.uk|\{\{[Gg]eograph\|)|([Gg]eograph\.org\.uk|\{\{[Gg]eograph\|).*[Ll]ocation dec\s*\|\s*51\.[2-6]\d*\s*\|\s*(-0\.[0-4]|-0\.51[0-1]|-0\.50|-0\.[0-4]|0\.[0-2]|0\.3[0-3]))' '\1FAEBOT-marker-FAEBOT' -nocase -savenew:"//Volumes/Fae_32GB/Geograph/Stage1/GeoboxLondonList.txt" -ns:6 ''' import wikipedia, re, time, catlib, pagegenerators, json, urllib2, urllib filename="Stage1/GeoboxLondonList.txt" ff=open("/Volumes/Fae_32GB/Geograph/"+filename,'r') gen=ff.read().split('#[[:') gen[0]=re.sub('^.*?File','File',gen[0]) totalPages = str(len(gen)) for i in range(len(gen)): gen[i]=re.sub('\]\].*$','',gen[i].split('\n')[0]) testmode=False #testmode=True category=filename.split('.')[0] site = wikipedia.getSite('commons', 'commons') #cat = catlib.Category(site,'Category:'+category) #gen = pagegenerators.CategorizedPageGenerator(cat) # Get pages listed in Category from Commons count=0 countF=0 countMin=0 countMax=200000 lag=2 # Arbitrary pause between url opens to reduce server load hcats='' scats='' pageSafe='' errors='' data='' commit='' mappings=[('London Borough of Greenwich','Royal Borough of Greenwich'), ('City of London Corporation','City of London'), ('Westminster City','City of Westminster'), ] def mapping(borough): for i in mappings: if borough==i[0]: borough=i[1] return borough def xmltry(url): countErr=0 xml='' while xml=='': try: xml = urllib2.urlopen(url) time.sleep(lag) except: xml='' countErr+=1 print Cyan,'** ERROR',countErr,'\n ** Failed to read from '+url+'\n ** Pause for '+str(countErr*15)+' seconds and try again'+White time.sleep(15*countErr) return xml def xmlget(mystr): if data.find('<'+mystr+'>')<1 or data.find('</'+mystr+'>')<1: return '' else: return data.split('<'+mystr+'>')[1].split('</'+mystr+'>')[0] 
# If a category defines the wanted list, pull it in here wantedCat='Geograph images in London' url="https://commons.wikimedia.org/w/api.php?action=query&list=categorymembers&cmtitle=Category:"+urllib.quote(wantedCat)+"&cmlimit=500&format=xml" data=xmltry(url).read() print 'Call to:',url catsp=xmlget('categorymembers') catsp=re.sub('" \/>','\n',catsp) catsp=re.sub('<cm.*?title="','',catsp).split('\n') catsp.pop(len(catsp)-1) catsp[0]=re.sub('.*?(Category)','\\1',catsp[0]) count=0 wanted=[] # Pop everything that is not a category while count<len(catsp): if catsp[count].find('Category:')==-1: catsp.pop(count) else: count+=1 wanted.extend(catsp) print 'Produces:\n',wanted def iswanted(w): for i in wanted: if w==i or w==re.sub('Category:Geograph images in the ','',i): return True return False def mapit(lat, lon): ekey='' key='' mapit="http://mapit.mysociety.org/point/4326/"+str(lon)+','+str(lat) print Blue,mapit+'.html',White try: data = json.load(xmltry(mapit)) except: try: print Red,'** Mapit data failed to load, trying again in 5 minutes **' time.sleep(300) data = json.load(xmltry(mapit)) except: print Red,'** Mapit data failed to load, trying again in 15 minutes **' time.sleep(900) try: data = json.load(xmltry(mapit)) except: print Red,'** Mapit data failed to load, trying for a last time in 60 minutes **' time.sleep(60*60) data = json.load(xmltry(mapit)) if data=={}: print Red,'** No data returned from:' print Red, mapit+'.html', White time.sleep(10) return ('','') for key in data.keys(): type_name=[b for (a,b) in data[key].items() if a=='type_name'][0] if type_name=='UK Parliament constituency': break for ekey in data.keys(): type_name=[b for (a,b) in data[ekey].items() if a=='type_name'][0] if type_name=='London borough': # was Unitary Authority was European region but Unitary authority seems to work better in Scotland break if key=='': pc='' else: pc=[b for (a,b) in data[key].items() if a==u'name'][0] # UK Parliament constituency if ekey=='': eurr='' else: 
eurr=[b for (a,b) in data[ekey].items() if a==u'name'][0] # European region #London only if eurr.find('Borough Council')>-1: eurr=eurr.split(' Borough Council')[0] eurr='London Borough of '+eurr return (pc, eurr) if not testmode: commit=' (commit)' logFile="/Volumes/Fae_32GB/Geograph/"+category+commit+".txt" #f = open(logFile, 'w') log = '\n==Test on images in '+filename+'==\n'+time.asctime( time.localtime(time.time()) )+':start\n{{cot}}\n{|class="wikitable sortable"\n!Page!!Coords!!<abbr title="Check if the image is geo-tagged as being the camera or object position.">Object</abbr>!!<abbr title="Is this a Geograph project image?">Geo</abbr>!!Borough!!Action' # Regular Colors (\033[)<effect 0=normal 1=bold 4=underline>(;3)<color 0=black>(m) Red="\033[0;31m" #Red Green="\033[0;32m" #Green GreenB="\033[1;32m" #Green bold GreenU="\033[4;32m" #Green underlined Yellow="\033[0;33m" #Yellow Blue="\033[0;34m" #Blue Purple="\033[0;35m" #Purple Cyan="\033[0;36m" #Cyan White="\033[0;37m" #White print '\n',Purple,'*'*60,White,'\n' if testmode: print Cyan,'TEST MODE '*6,White,'\n\n',Purple,'*'*60,White def trimquotes(mystr): return re.sub('^Category:','',re.sub('^"|"$','',mystr)) def xmlget(mystr): if data.find('<'+mystr+'>')<1 or data.find('</'+mystr+'>')<1: return '' else: return data.split('<'+mystr+'>')[1].split('</'+mystr+'>')[0] for p in gen: # Get categories for candidate image using API call [on error: wait, try 3 times then add to error log] count += 1 if p.find("<!--")>-1 or len(p)<5: if len(p)<1: p='[blank]' continue print Cyan+"Page",count,p continue if count<countMin: continue if count>countMax: break countS = str(count) try: page=wikipedia.Page(site,p) except: print Red,'** Page read failed',count,'page:',p time.sleep(5) continue pageSafe = re.sub('\[\[commons:(.*)\]\]',r'\1',str(page)) catCall='http://commons.wikimedia.org/w/api.php?action=query&titles='+urllib.quote_plus(pageSafe,'[]:')+'&prop=categories&clprop=hidden&format=xml' print 
Yellow+'Page',countS+'/'+totalPages,pageSafe[5:60],White # catCall:',catCall,'\n' xml= xmltry(catCall) data = xml.read() if data.find('<categories>')<1: ### Need to add a go back and try again loop in case of timeout problems, 1,5,15 minute pauses print '*****\nNo categories found using',catcall,'\nOn ',page print data,'\n*****\n' errors=errors+'\nNo categories found using:'+catcall+'\n----\n'+str(data)+'\n----' continue data = data.split('<categories>')[1].split('</categories>')[0] data = re.sub(r'<cl ns="14" title=(.*?)\s*/>',r'\1|',data) data = re.sub(r'\|$','',data) hcats=[] gcats=[] scats=[] if data.lower().find('geograph ')==-1: isGeograph=0 else: isGeograph=-1 for c in data.split('|'): if c.find('hidden=')>0: hcats.append(c) elif c.find('Geograph')>0: gcats.append(c) else: scats.append(c) # Visible cats # If candidate image is already categorized against a Geograph borough then next # Sort this out later # # Get image page text and extract data from Object location dec or Location dec templates [not found: error log, next] # text=page.get() # This will cause bot to sleep for 4 seconds textlower=text.lower() # Reality check - is this a Geograph image if textlower.find('{{Geograph|')>1 or textlower.find('{{geograph|')>1 or textlower.find('geograph.org.uk')>1 or textlower.find('Geograph.org.uk')>1: pass else: print Red,'** This is not a Geograph image - skip',White continue # Check if object location rather than camera location isObject= (textlower.find('{{object location dec')>-1) if textlower.find('{{location dec')<0 and isObject>-1: print Red+' There is no geodata in this page, so skipping\n'+White #log+='*'+countS+' [[:'+pageSafe+']] has no geo data.\n' #f.write(log) #log='' continue # Extract coordinates lat = re.search(r'\{\{([Oo]bject)?\s?[Ll]ocation dec\s*\|[\s\-]*[\d\.]+',text).group(0).split('|')[1] lon = re.search(r'\{\{([Oo]bject)?\s?[Ll]ocation dec\s*\|[\s\-]*[\d\.]+\s*\| ?[\s\-]*[\d\.]+',text).group(0).split('|')[2] lat = re.sub(r'\s','',lat) lon 
= re.sub(r'\s','',lon) latn = float(lat) lonn = float(lon) lonSafe=urllib.quote_plus(lon) latSafe=urllib.quote_plus(lat) # # For each image test if within OSM bounding box [if not: next] # if not (latn > 51.28676 and latn < 51.69188 and lonn > -0.51104 and lonn < 0.33402): #print Red,' Location not in London as outside OSM bounding box, so skipping **'+White #log+='** Location not in London as outside OSM bounding box\n' #f.write(log) #log='' continue log+='\n|-\n|'+countS+' [[:'+pageSafe+']]||[http://www.uk-postcodes.com/latlng/'+lat+','+lon+' '+lat+','+lon+']' if isObject: print GreenB+' ** This page uses {{Object location dec}}\n'+White log += '|| <b>object</b> ' else: log+='|| camera ' if isGeograph: log += '|| {{tick}} ' else: log += '|| N ' #f.write(log) log='' urlukpc = 'http://www.uk-postcodes.com/latlng/'+latSafe+','+lonSafe+'.xml' xml = xmltry(urlukpc) data=xml.read() ukPostcode='' # MapIt - JSON (mapitConstituency,mapitEuropean)=mapit(lat,lon) mapitConstituency=mapping(mapitConstituency) mapitEuropean=mapping(mapitEuropean.split(' Council')[0].split(' 0')[0]) print Cyan,'Mapit Const.',mapitConstituency,'('+mapitEuropean+')',White # Open Street Map urlosm='http://nominatim.openstreetmap.org/reverse?format=xml&lat='+latSafe+'&lon='+lonSafe+'&zoom=18&addressdetails=1' xml=xmltry(urlosm) data=xml.read() #print '\n',data,'\n' #print Purple+' Address from Open Street Map' #print ' Road '+xmlget('road') #print ' Suburb '+xmlget('suburb') # OSM city given London Borough # Test against uk-postcode data as OSM seems not reliable enough osmBorough=xmlget('city') osmBorough=mapping(osmBorough) if mapitConstituency.find(osmBorough)==-1 and mapitEuropean.find(osmBorough)>-1: mapitConstituency=mapitEuropean print Cyan,' OSM Borough',osmBorough,White if len(osmBorough)<2: #print Purple+' District '+ukDistrict+Red+' (uk-p as OSM gave null return)'+White log+='|| OSM:(blank) mapit:[[:Category:'+mapitConstituency+'|]] ' boroughCheck=-1 # Null return means a third check is 
needed else: if osmBorough.lower().find(mapitConstituency.lower())>-1: print Purple,' County '+osmBorough+Green+' confirmed'+Purple county=osmBorough boroughCheck=0 else: print Purple,' City '+Red+osmBorough+Purple+' (mapit:'+mapitConstituency+')' boroughCheck=-1 # Failure here is the most important to follow-up on borough=mapitEuropean # For London County should always be London county=xmlget('county') #print ' County '+county #print ' Country '+xmlget('country')+White if county.lower().find('london')==-1 and borough.lower().find('london')==-1: boroughCheck=0 borough='' isGeograph=0 # # 3rd check of Borough using Google Maps, only when needed, max of 2,500 queries per day # if boroughCheck==-1: gmap=u'http://maps.googleapis.com/maps/api/geocode/json?latlng='+lat+','+lon+'&sensor=true' data = json.load(xmltry(gmap)) gmapBorough=[b['long_name'] for a in data['results'] for b in a['address_components'] for c in b['types'] if c=='administrative_area_level_2'] if len(gmapBorough)>0: gmapBorough=gmapBorough[0] else: gmapBorough='' if gmapBorough=='London' or gmapBorough=='Greater London' : gmapBorough=[b['long_name'] for a in data['results'] for b in a['address_components'] for c in b['types'] if c=='administrative_area_level_3'] if len(gmapBorough)>0: gmapBorough=gmapBorough[0] if len(gmapBorough)>1: print Purple+' GMap Borough '+gmapBorough+White # We know a!=b test a=c and b=c if osmBorough==gmapBorough: borough=gmapBorough print Cyan,'Google Maps agrees with Open Street Map',White print Cyan,'Going with',borough,'by 2 out of 3',White elif mapitConstituency==gmapBorough or mapitEuropean==gmapBorough: borough=gmapBorough print Cyan,'Google Maps agrees with MapIt',White print Cyan,'Going with',borough,'by 2 out of 3',White else: print Red,'** No majority vote **',White borough='' borough=mapping(borough) if isGeograph==-1 and len(borough)>2: if text.find('[[Category:Geograph images in the '+borough+']]')>0: log+='|| <i>Already in [[:Category:Geograph images in the 
'+borough+'|]]</i>' else: log+='|| <b>Add</b> to [[:Category:Geograph images in the '+borough+'|]]' else: log+='|| None' # f.write(log) log='' if not iswanted(borough): if borough!='': print Red+' ** '+borough,White print Red+' ** Not on the wanted list'+White continue #urlsoc='http://mapit.mysociety.org/point/4326/'+lonSafe+','+latSafe #datajson = json.load(urllib2.urlopen(urlsoc)) #print '\nDump from mapit:\n'+str([a for b in datajson for a in b])+White # Map OSM given borough (=locality) to existing Commons category in Category:London boroughs # If the number of visible non-Geograph categories on the image are 0, then # Add an existing, visible, Commons London borough category if len(scats)<1: text=text+'\n[[Category:'+borough+']]' # If template exists, then remove Uncategorized-Geograph # Add Check categories-Geograph # Add hidden Geograph by London borough category isCooked=(text.find('[[Category:Geograph images in the '+borough+']]')>-1) if (not isCooked) and (not testmode): text=text+'\n[[Category:Geograph images in the '+borough+']]' wikipedia.setAction('Add to [[Category:Geograph images in the '+borough+']], see [[User:Faebot/Geograph#C1]] for project details)') time.sleep(30) page.put(text) print Yellow+' Page updated on Commons'+White if isCooked: print Red+' This page is already categorized under\n Geograph images in the '+borough+White # Write record to local log log = '\n|}\n{{cob}}n'+time.asctime( time.localtime(time.time()) )+':end\n' #f.write(log) #f.close() # Close the log file if testmode: print Green+'Test run complete on '+category+White #f = open(logFile, 'r') #log = f.read() #f.close() #wikipedia.setAction('Geograph '+filename+' analysis report') #page=wikipedia.Page(site,'User:Faebot/SandboxL') #log=page.get()+log #page.put(log) else: print Green+'Commit run complete on '+filename+White
GeoStage2bFromFile.py (Stage 2b: Cornwall, Devon, Somerset and the Isles of Scilly)
''' Project C: Geograph regional categorization, Stage 2b Link: http://commons.wikimedia.org/wiki/User:Faebot/Geograph What: This is Python code which relies on pywikipedia being installed Description: This code takes a selected file of Commons image page names and uses their embedded geo-coordinates to query up to three websites for location description. If the image is a Geograph project image, the location information is used to add the image to a Commons category to assist in further categorization. See the above Link to see queries to generate the source file of image names. Date: November 2012 Author: http://commons.wikimedia.org/wiki/User:Fae License: CC-BY-SA Status: Working in OSX Colours need to be switched off in Windoze Use of Mapit data could be better Needs spring cleaning to remove spaghetti ''' import wikipedia, re, time, catlib, pagegenerators, json, urllib2, urllib project="[[User:Faebot/Geograph#C2b]]" #filename="GeoboxLondonList.txt" #filename="Listed buildings in London list.txt" filename="Stage2/GeoboxCornwallList.txt" ff=open("/Volumes/Fae_32GB/Geograph/"+filename,'r') gen=ff.read().split('#[[:') gen[0]=re.sub('^.*?File','File',gen[0]) totalPages = str(len(gen)) for i in range(len(gen)): gen[i]=re.sub('\]\].*$','',gen[i].split('\n')[0]) testmode=False #testmode=True mappings=[('London Borough of Greenwich','Royal Borough of Greenwich'), ('Bath & North East Somerset','Somerset'), ('Bath and North East Somerset','Somerset'), ('Taunton Deane','Somerset'), ('Devon County','Devon'), ('South West Devon','Devon'), ('Plymouth','Devon'), ('Torbay','Devon'), ('Central Devon','Devon'), ('Somerset County','Somerset'), ('North Somerset','Somerset'), ('Poole','Dorset'), ('South Dorset','Dorset'), ('Purbeck District','Dorset'), ('Purbeck','Dorset'), ('Bournemouth','Dorset'), ('Newton Abbot','Devon'), ('Torridge and West Devon','Devon'), ('East Devon','Devon'), ('Exeter','Devon'), ('North Devon','Devon'), ('Plymouth, Moor View','Devon'), ('Plymouth, Sutton 
and Devonport','Devon'), ('South West Devon','Devon'), ('Tiverton and Honiton','Devon'), ('Torbay','Devon'), ('Totnes','Devon'), ('South Somerset','Somerset'), ('Taunton Deane','Somerset'), ('West Somerset','Somerset'), ('Sedgemoor','Somerset'), ('Mendip','Somerset'), ('North Somerset','Somerset'), ('Yeovil','Somerset'), ('Bridgwater and West Somerset','Somerset') ] # Mappings are based on Wikimedia Commons use wanted=['Devon','Cornwall','Somerset','Dorset','Isles of Scilly'] category=filename site = wikipedia.getSite('commons', 'commons') #cat = catlib.Category(site,'Category:'+category) #gen = pagegenerators.CategorizedPageGenerator(cat) # Get pages listed in Category from Commons count=0 countF=0 countMin=470 countMax=148000 lag=0 # Arbitrary pause between url opens to reduce server load hcats='' scats='' pageSafe='' errors='' data='' commit='' def mapping(borough): # If borough is in mappings then return the mapped name for i in mappings: if borough==i[0]: borough=i[1] return borough def wantedlist(borough): for i in wanted: if borough==i: return borough break return '' def xmltry(url): countErr=0 xml='' while xml=='': try: xml = urllib2.urlopen(url) time.sleep(lag) except: xml='' countErr+=1 print Cyan,'** ERROR',countErr,'\n ** Failed to read from '+url+'\n ** Pause for '+str(countErr*15)+' seconds and try again'+White time.sleep(15*countErr) return xml def mapit(lat, lon): ekey='' key='' mapit="http://mapit.mysociety.org/point/4326/"+str(lon)+','+str(lat) print Blue,mapit+'.html',White try: data = json.load(xmltry(mapit)) except: try: print Red,'** Mapit data failed to load, trying again in 5 minutes **' time.sleep(300) data = json.load(xmltry(mapit)) except: print Red,'** Mapit data failed to load, trying again in 15 minutes **' time.sleep(900) try: data = json.load(xmltry(mapit)) except: print Red,'** Mapit data failed to load, trying for a last time in 60 minutes **' time.sleep(60*60) data = json.load(xmltry(mapit)) if data=={}: print Red,'** No data 
returned from:' print Red, mapit+'.html', White time.sleep(10) return ('','') for key in data.keys(): type_name=[b for (a,b) in data[key].items() if a=='type_name'][0] if type_name=='UK Parliament constituency': break for ekey in data.keys(): type_name=[b for (a,b) in data[ekey].items() if a=='type_name'][0] if type_name=='Unitary Authority': # was European region but Unitary authority seems to work better in Scotland break if key=='': pc='' else: pc=[b for (a,b) in data[key].items() if a==u'name'][0] # UK Parliament constituency if ekey=='': eurr='' else: eurr=[b for (a,b) in data[ekey].items() if a==u'name'][0] # European region return (pc, eurr) if not testmode: commit=' (commit)' logFile="/Volumes/Fae_32GB/Geograph/"+category+commit+".txt" f = open(logFile, 'w') log = '\n==Test on images in '+filename+'==\n'+time.asctime( time.localtime(time.time()) )+':start\n{{cot}}\n{|class="wikitable sortable"\n!Page!!Coords!!<abbr title="Check if the image is geo-tagged as being the camera or object position.">Object</abbr>!!<abbr title="Is this a Geograph project image?">Geo</abbr>!!Borough!!Action' # Regular Colors (\033[)<effect 0=normal 1=bold 4=underline>(;3)<color 0=black>(m) Red="\033[0;31m" #Red Green="\033[0;32m" #Green GreenB="\033[1;32m" #Green bold GreenU="\033[4;32m" #Green underlined Yellow="\033[0;33m" #Yellow Blue="\033[0;34m" #Blue Purple="\033[0;35m" #Purple Cyan="\033[0;36m" #Cyan White="\033[0;37m" #White print '\n',Purple,'*'*60,White,'\n' if testmode: print Cyan,'TEST MODE '*6,White,'\n\n',Purple,'*'*60,White def trimquotes(mystr): return re.sub('^Category:','',re.sub('^"|"$','',mystr)) def xmlget(mystr): if data.find('<'+mystr+'>')<1 or data.find('</'+mystr+'>')<1: return '' else: return data.split('<'+mystr+'>')[1].split('</'+mystr+'>')[0] for p in gen: # Get categories for candidate image using API call [on error: wait, try 3 times then add to error log] count += 1 if p.find("<!--")>-1 or len(p)<5: if len(p)<1: p='[blank]' continue print 
Cyan+"Page",count,p continue if count<countMin: continue if count>countMax: break countS = str(count) try: page=wikipedia.Page(site,p) except: print Red,'** Page read failed',count,'page:',p print Red,'** I will try once more in 5 minutes' time.sleep(300) continue pageSafe = re.sub('\[\[commons:(.*)\]\]',r'\1',str(page)) catCall='http://commons.wikimedia.org/w/api.php?action=query&titles='+urllib.quote_plus(pageSafe,'[]:')+'&prop=categories&clprop=hidden&format=xml' print Yellow+'Page',countS+'/'+totalPages,pageSafe[5:60],White # catCall:',catCall,'\n' xml= xmltry(catCall) try: data = xml.read() except: print Red,'** failed to read xml, trying for a second time in 5 minutes' time.sleep(300) data = xml.read() if data.find('<categories>')<1: ### Need to add a go back and try again loop in case of timeout problems, 1,5,15 minute pauses print '*****\nNo categories found using',catcall,'\nOn ',page print data,'\n*****\n' errors=errors+'\nNo categories found using:'+catcall+'\n----\n'+str(data)+'\n----' continue data = data.split('<categories>')[1].split('</categories>')[0] data = re.sub(r'<cl ns="14" title=(.*?)\s*/>',r'\1|',data) data = re.sub(r'\|$','',data) hcats=[] gcats=[] scats=[] for c in data.split('|'): if c.find('hidden=')>0: hcats.append(c) elif c.find('Geograph')>0: gcats.append(c) else: scats.append(c) # Visible cats # # Get image page text and extract data from Object location dec or Location dec templates [not found: error log, next] # text=page.get() # This will cause bot to sleep for 4 seconds textlower=text.lower() isObject= (textlower.find('{{object location dec')>-1) if textlower.find('{{location dec')<0 and isObject>-1: print Red,'** There is no geodata in this page, so skipping'+White #log+='*'+countS+' [[:'+pageSafe+']] has no geo data.\n' #f.write(log) #log='' continue # # Extract coordinates # lat = re.search(r'\{\{([Oo]bject)?\s?[Ll]ocation dec\s*\|[\s\-]*[\d\.]+',text).group(0).split('|')[1] lon = re.search(r'\{\{([Oo]bject)?\s?[Ll]ocation 
dec\s*\|[\s\-]*[\d\.]+\s*\| ?[\s\-]*[\d\.]+',text).group(0).split('|')[2] lat = re.sub(r'\s','',lat) lon = re.sub(r'\s','',lon) latn = float(lat) lonn = float(lon) lonSafe=urllib.quote_plus(lon) latSafe=urllib.quote_plus(lat) # Reality check - is this a Geograph image if textlower.find('{{Geograph|')>1 or textlower.find('{{geograph|')>1 or textlower.find('geograph.org.uk')>1 or textlower.find('Geograph.org.uk')>1: pass else: print Red,'** This is not a Geograph image - skip',White continue # Get OSM address data # uk-postcodes.com #urlukpc='http://www.uk-postcodes.com/latlng/'+latSafe+','+lonSafe+'.xml' #print Blue,'UKP',urlukpc #xml = xmltry(urlukpc) #data=xml.read() ukPostcode=''#xmlget('postcode') #ukPostcode=re.sub('\s*(\s\w?[\d]).*$',r'\1',ukPostcode) # Trim postcode #ukDistrict=xmlget('district') #if ukDistrict.find('<title>')>0 and ukDistrict.find('</title>')>0: # ukDistrict=ukDistrict.split('<title>')[1].split('</title>')[0] #else: # ukDistrict='' (mapitConstituency,mapitEuropean)=mapit(lat,lon) mapitConstituency=mapping(mapitConstituency) mapitEuropean=mapping(mapitEuropean.split(' Council')[0].split(' 0')[0]) print Cyan,'Mapit Const.',mapitConstituency,'('+mapitEuropean+')',White # Open Street Map urlosm='http://nominatim.openstreetmap.org/reverse?format=xml&lat='+latSafe+'&lon='+lonSafe print Blue,'OSM',urlosm,White xml=xmltry(urlosm) data=xml.read() #print '\n',data,'\n' #print Purple+' Address from Open Street Map' #print ' Road '+xmlget('road') #print ' Suburb '+xmlget('suburb') # OSM city given London Borough # Test against uk-postcode data as OSM seems not reliable enough osmBorough=xmlget('county') osmBorough=mapping(osmBorough) if mapitConstituency.find(osmBorough)==-1 and mapitEuropean.find(osmBorough)>-1: mapitConstituency=mapitEuropean if osmBorough==mapitEuropean: mapitConstituency=mapitEuropean # Cut down on the number of GMap checks print Cyan,' OSM County',osmBorough,White if len(osmBorough)<2: #print Purple+' District '+ukDistrict+Red+' 
(uk-p as OSM gave null return)'+White log+='|| OSM:(blank) mapit:[[:Category:'+mapitConstituency+'|]] ' boroughCheck=-1 # Null return means a third check is needed else: if osmBorough.lower().find(mapitConstituency.lower())>-1 and len(mapitConstituency)>1: print Purple,' County '+osmBorough+Green+' confirmed'+Purple log+='|| [[:Category:'+osmBorough+'|]]' boroughCheck=0 else: print Purple,' City '+Red+osmBorough+Purple+' (mapit:'+mapitConstituency+')' log+='||OSM:[[:Category:'+osmBorough+'|]] uk-p:'+mapitConstituency+' ' boroughCheck=-1 # Failure here is the most improtant to follow-up on borough=osmBorough # For London County should always be London county=xmlget('county') print ' '+county osmPostcode=xmlget('postcode') osmPostcode=re.sub('\s*(\s\w?[\d]).*$',r'\1',osmPostcode) #if osmPostcode.lower().find(ukPostcode.lower()[:2])>-1: #print ' Postcode '+osmPostcode+Green+' confirmed'+Purple #postcodes from OSM appear rubbish #log+='** Postcode: '+osmPostcode+' (check says '+ukPostcode+')\n' #else: #print ' Postcode '+Red+osmPostcode+Purple+' (uk-p):'+ukPostcode #log+='** Postcode: '+osmPostcode+' <b>but</b> uk-postcodes says '+ukPostcode+'\n' #print ' Country '+xmlget('country')+White # # 3rd check of Borough using Google Maps, only when needed, max of 2,500 queries per day # if boroughCheck==-1: gmap=u'http://maps.googleapis.com/maps/api/geocode/json?latlng='+lat+','+lon+'&sensor=true' data = json.load(xmltry(gmap)) gmapCounty=[b['long_name'] for a in data['results'] for b in a['address_components'] for c in b['types'] if c=='administrative_area_level_2'] if len(gmapCounty)>0: gmapCounty=gmapCounty[0] else: gmapCounty='' if len(gmapCounty)>1: print Purple+' GMap County '+gmapCounty+White if mapitConstituency.lower().find(gmapCounty.lower())>-1 or gmapCounty.lower().find(mapitConstituency.lower())>-1: print Cyan+ ' ** Use '+Green+gmapCounty+Cyan+' as preferred county name'+White log+='gmap:<b>[[:Category:'+gmapCounty+'|]]</b> by a poll of 3 websites\n' 
county=gmapCounty county=mapping(county) if len(county)>2: if text.find('[[Category:Geograph images in '+county+']]')>0: log+='|| <i>Already in [[:Category:Geograph images in '+county+'|]]</i>' else: log+='|| <b>Add</b> to [[:Category:Geograph images in '+county+'|]]' else: log+='|| None' #f.write(log) log='' if len(county)==0: print Red,'** Failed county mapping',White continue #urlsoc='http://mapit.mysociety.org/point/4326/'+lonSafe+','+latSafe #datajson = json.load(urllib2.urlopen(urlsoc)) #print '\nDump from mapit:\n'+str([a for b in datajson for a in b])+White # Map OSM given county (=locality) to existing Commons category in Category:London countys # If the number of visible non-Geograph categories on the image are 0, then # Add an existing, visible, Commons London county category if len(scats)<1: text=text+'\n[[Category:'+county+']]' # If template exists, then remove Uncategorized-Geograph # Add Check categories-Geograph # Add hidden Geograph by London county category oldcounty=county county=wantedlist(county) if county=='': print Red,'**',oldcounty print Red,'** Failed county wanted list',White continue if county.find('Isles ')>-1: county="the "+county isCooked=(text.find('[[Category:Geograph images in '+county+']]')>-1) if (not isCooked) and (not testmode): text=text+'\n[[Category:Geograph images in '+county+']]' wikipedia.setAction('Add to [[Category:Geograph images in '+county+']], see '+project+' for details') time.sleep(15) page.put(text) #print text print Yellow+' Page updated on Commons'+White if isCooked: print Red+' This page is already categorized under\n Geograph images in '+county+White # Write record to local log log = '\n|}\n{{cob}}n'+time.asctime( time.localtime(time.time()) )+':end\n' f.write(log) f.close() # Close the log file if testmode: print Green+'Test run complete on '+filename+White f = open(logFile, 'r') log = f.read() f.close() else: print Green+'Uploads complete on '+filename+White