User:Inkwina/catlistcount.py

From Wikimedia Commons, the free media repository
Jump to navigation Jump to search
#!/usr/bin/python
# -*- coding: UTF-8  -*-

import shelve
import urllib
import simplejson
import time
import re
import mwclient


# --- Report settings --------------------------------------------------------
# NOTE(review): Howmany is not read anywhere in the visible script; it
# presumably mirrors the "200" in the page title below -- confirm before
# relying on it.
Howmany = 200
Whichcategory = 'Category:Images that should use vector graphics'
Wheretosave = u'Top 200 Images that should use vector graphics by usage'

# Local shelve file that caches crawl progress between runs.
shelffile = "./catlistcount.cache"

# --- Category listing (MediaWiki API) ---------------------------------------
apiurl = "http://commons.wikimedia.org/w/api.php"
apiparams = {
    'format': "json",
    'action': "query",
    'list': "categorymembers",
    'cmlimit': "50",
    'cmprop': 'title',
    'cmtitle': Whichcategory,
}

# --- Usage lookup (CheckUsage tool) -----------------------------------------
checkusageurl = "http://toolserver.org/~daniel/WikiSense/CheckUsage.php"
checkusageparams = {
    # filename(s); starts empty and is rebuilt for every batch by the crawl
    # loop.  The empty-string literal had been lost here, leaving `'i': ,`,
    # which does not parse.
    'i': '',
    'w': '_wp_20',  # which wikis to check (top 20 wikipedias not to kill server)
    'x': 'main',    # what kind of pages
    'r': 'on',      # RAW
    'b': '1',       # not Bulk, we check 1 by 1
}

# Matches a "[<wiki>] <count>" line: group(1) is the text inside the
# brackets, group(2) the (possibly empty) run of digits that follows.
wikire = re.compile(r'\s*\[([^\]]*)\]\s*(\d*)')

# Open (or create) the on-disk progress cache.  writeback=True makes the shelf
# keep accessed entries in memory, so the nested dicts stored under "items"
# can be mutated in place and are written out on sync().
datastore = shelve.open(shelffile, writeback=True)
if "items" not in datastore:  # new file
    datastore["items"] = {}
    datastore["all-done"] = False
elif "query-continue" in datastore:
    # Pick up where we left last time.  The token only exists once a previous
    # run has received a continuation from the API, so its absence is not an
    # error: the crawl simply starts from the first page again.
    apiparams["cmcontinue"] = datastore["query-continue"]

# Main crawl: fetch the category one API page at a time, register every member
# in the cache, then ask CheckUsage where the image members of that page are
# used and tally the hits per file and per wiki.
while not datastore["all-done"]:
    # Newline-separated file names for this batch's usage query.
    checkusageparams['i'] = ''
    query = urllib.urlopen(apiurl, urllib.urlencode(apiparams))
    data = simplejson.load(query)
    members = data["query"]["categorymembers"]
    for item in members:
        # Drop only the namespace prefix; a title may itself contain ':'.
        Fname = item["title"].split(':', 1)[-1]
        Fname = Fname.encode('UTF-8').replace(' ', '_')
        datastore["items"][Fname] = {"ns": item["ns"]}
        if item["ns"] == 6:  # pick out Image:
            checkusageparams['i'] += Fname + "\n"
            datastore["items"][Fname]["countof"] = {}
            datastore["items"][Fname]["counttotal"] = 0
            datastore["items"][Fname]["checked"] = False
            print("Added: " + Fname)
    datastore.sync()
    print("--- Cached Data ---")
    if members:  # an empty page has no first/last title to report
        print("From: " + members[0]["title"])
        print("To: " + members[-1]["title"])

    # Remember the continuation token but do not persist it yet: if the run
    # dies during the usage lookup below, the next run must redo this batch
    # (re-registering a file resets its counts, so redoing is safe).
    if "query-continue" in data:
        next_token = data["query-continue"]["categorymembers"]["cmcontinue"].encode("UTF-8")
    else:
        next_token = None

    query = urllib.urlopen(checkusageurl, urllib.urlencode(checkusageparams))
    whichwiki = None  # set by the most recent "[wiki] count" line of this reply
    for line in query.readlines():
        header = wikire.match(line)
        if header is not None:
            print(header.group(1) + " : " + header.group(2))
            whichwiki = header.group(1)
            continue
        try:
            page, fname = line.split()
        except ValueError:
            # Not a two-field "page file" line; ignore it.
            continue
        entry = datastore["items"].get(fname)
        if whichwiki is None or entry is None or "countof" not in entry:
            # Usage line with no preceding wiki header, or for a name that was
            # never registered as an image: nothing to attribute it to.
            continue
        entry["countof"][whichwiki] = entry["countof"].get(whichwiki, 0) + 1
        entry["counttotal"] += 1
        print(fname + "," + whichwiki + "," + str(entry["countof"][whichwiki]) + "," + str(entry["counttotal"]))

    # The batch is fully processed: record where the next request continues.
    if next_token is None:
        datastore["all-done"] = True
    else:
        apiparams["cmcontinue"] = next_token
        datastore["query-continue"] = next_token
    datastore.sync()
    time.sleep(2)
    
# Roll the per-file tallies up into one total per wiki (stored under
# datastore["wikis"]) and one grand total over all files (ftotal).
datastore["wikis"] = {}
ftotal = 0
for entry in datastore["items"].values():
    if entry['ns'] != 6:
        # Only namespace-6 entries carry usage counters.
        continue
    ftotal += int(entry["counttotal"])
    for wiki, hits in entry['countof'].items():
        datastore["wikis"][wiki] = datastore["wikis"].get(wiki, 0) + hits

# Build the report text and save it to the wiki page named by Wheretosave.
# The statements below had been collapsed onto a single line together with the
# closing quotes of the header string, which does not parse; they are restored
# one per line here.  Every string is unchanged.
output= """
This Page is an Automatically generated list of the 200 most used 
Images that should use vector graphics

The code for making this list is available here
The images are only checkd for use in Articles (not talk pages etc.) on the 20 largest wikipedias

--Inkwina (talk · contribs)

"""
output += "\nLast Update " + time.strftime("%a, %d %b %Y %H:%M:%S %Z") + "\n"

# Summary figures: the per-wiki totals and the per-file totals are summed
# separately and both are printed.
wtotal = sum(int(count) for count in datastore["wikis"].values())
output += "\n*Items in Total: " + str(len(datastore["items"]))
output += "\n**Total use(from wikis) : " + str(wtotal)
output += "\n**Total use(from files) : " + str(ftotal)
output += "\n----\n"

# One numbered line per wiki, highest count first.  Sorting (count, name)
# pairs in reverse gives the same order as sort() followed by reverse().
wikisort = sorted([(v, k) for k, v in datastore["wikis"].items()], reverse=True)
for count, wikiname in wikisort:
    output += "\n# " + str(wikiname) + ": " + str(count)


def mycmp(x, y):
    """Order two item names by their cached total usage, largest first."""
    return cmp(datastore["items"][x]["counttotal"], datastore["items"][y]["counttotal"]) * -1


# Names of the namespace-6 items, most used first.
# NOTE(review): sortall is built and sorted but never written into `output`,
# and Howmany is never applied -- the part that lists the individual images
# appears to be missing from this copy of the script; confirm against the
# original before publishing.
sortall = [x for x in datastore["items"] if datastore["items"][x]["ns"] == 6]
sortall.sort(mycmp)
output += "\n\n"

# Publish.  "username"/"passwd" are placeholders to be replaced with real
# bot credentials.
site = mwclient.Site('commons.wikimedia.org')
site.login("username", "passwd")
page = site.Pages[Wheretosave]
page.save(output, summary = u'Inkwina Bot Update')