User:WikiLovesESBot/code/WLE stats generation.py

#!/usr/bin/python
# -*- coding: utf-8 -*-

# WLE_stats_generation.py  Creates the statistics for WLE 2016 in Spain. It takes as inputs the category where the images 
#                          are ('WLE_CATEGORY') and a CSV-like page with information about sites of community importance 
#                          in Spain ('SCI_DB_PAGE'). That page has to be created beforehand.
# author:                  Discasto (WM-ES)
# date:                    2016-05-11
#
# Distributed under the terms of the MIT license.
########################################################################################################################
#
import pywikibot as pb
from pywikibot import pagegenerators
import mwparserfromhell as mwh
from optparse import OptionParser
from datetime import datetime
import pandas as pd
from urllib import urlencode, urlopen
import json
from random import sample
import copy
import re
from collections import OrderedDict
from operator import itemgetter

# Information about the SCI sources
#
# One row per region. The positions match the columns given to annexes_df below:
#   [0] aut_com           region code (e.g. 'ES-AN')
#   [1] aut_com_name      English name of the region
#   [2] aut_com_es_annex  title of the annex page (main() loads it from the Spanish Wikipedia)
#   [3] aut_com_gl_annex  title of a 'gl' list page, '' when there is none (presumably Galician Wikipedia -- confirm)
#   [4] aut_com_ca_annex  title of a 'ca' list page, '' when there is none (presumably Catalan Wikipedia -- confirm)
#
# NOTE: 'ES-CE' and 'ES-ML' point to the same annex page; main() skips 'ES-ML' when it scans the annexes.
annexes = [
            ['ES-AN', u'Andalusia', u'Anexo:Lugares de importancia comunitaria de Andalucía', '', ''],
            ['ES-AR', u'Aragon', u'Anexo:Lugares de importancia comunitaria de Aragón', '', ''],
            ['ES-AS', u'Asturias', u'Anexo:Lugares de importancia comunitaria de Asturias', '', ''],
            ['ES-CB', u'Cantabria', u'Anexo:Lugares de importancia comunitaria de Cantabria', '', ''],
            ['ES-CE', u'Ceuta and Melilla', u'Anexo:Lugares de importancia comunitaria de Ceuta y Melilla', '', ''],
            ['ES-CL', u'Castile and León', u'Anexo:Lugares de importancia comunitaria de Castilla y León', '', ''],
            ['ES-CM', u'Castile-La Mancha', u'Anexo:Lugares de importancia comunitaria de Castilla-La Mancha', '', ''],
            ['ES-CN', u'Canary Islands', u'Anexo:Lugares de importancia comunitaria de las Islas Canarias', '', ''],
            ['ES-CT', u'Catalonia', u'Anexo:Lugares de importancia comunitaria de Cataluña', '', u'Viquiprojecte:Patrimoni natural/Llista de llocs d\'importància comunitària de Catalunya'],
            ['ES-EX', u'Extremadura', u'Anexo:Lugares de importancia comunitaria de Extremadura', '', ''],
            ['ES-GA', u'Galicia', u'Anexo:Lugares de importancia comunitaria de Galicia', u'Lugares de importancia comunitaria en Galicia', ''],
            ['ES-IB', u'Balearic Islands', u'Anexo:Lugares de importancia comunitaria de las Islas Baleares', '', u'Llista de Llocs d\'Importància Comunitària de les Illes Balears'],
            ['ES-MC', u'Region of Murcia', u'Anexo:Lugares de importancia comunitaria de la Región de Murcia', '', ''],
            ['ES-MD', u'Community of Madrid', u'Anexo:Lugares de importancia comunitaria de la Comunidad de Madrid', '', ''],
            ['ES-ML', u'Melilla', u'Anexo:Lugares de importancia comunitaria de Ceuta y Melilla', '', ''],
            ['ES-NC', u'Navarre', u'Anexo:Lugares de importancia comunitaria de Navarra', '', ''],
            ['ES-PV', u'Basque Country', u'Anexo:Lugares de importancia comunitaria del País Vasco', '', ''],
            ['ES-RI', u'La Rioja', u'Anexo:Lugares de importancia comunitaria de La Rioja', '', ''],
            ['ES-VC', u'Valencian Community', u'Anexo:Lugares de importancia comunitaria de la Comunidad Valenciana', '', u'Llista de llocs d\'importància comunitària del País Valencià'],
            ['ES-MAGRAMA', u'Ministry of Agriculture, Food and Enviroment', u'Anexo:Lugares de importancia comunitaria del MAGRAMA', '', '']
        ]
# Same table as a dataframe, so it can be merged with the SCI data on the 'aut_com' column
annexes_df = pd.DataFrame(annexes, columns=['aut_com', 'aut_com_name', 'aut_com_es_annex', 'aut_com_gl_annex', 'aut_com_ca_annex'])

# Palette of named colors from which the pie-chart slice colors are picked
aut_com_colors = [
    'Ivory', 'Beige', 'Wheat', 'Tan',
    'DarkKhaki', 'Silver', 'Gray', 'DarkMagenta',
    'Navy', 'RoyalBlue', 'LightSteelBlue', 'Purple',
    'Teal', 'ForestGreen', 'Olive', 'Chartreuse',
    'Lime', 'GoldenRod', 'PaleGoldenRod', 'LightCoral',
    'Salmon', 'DeepPink', 'Fuchsia', 'Lavender',
    'Plum', 'Indigo', 'Maroon', 'Crimson',
]

# Mediawiki API calls to get image global usage.
# API_QUERY_STRING is a template: main() stores a '|'-joined batch of file titles under "titles"
# right before each request, so this module-level dict is mutated in place and reused across calls.
API_BASE_URL = u'https://commons.wikimedia.org/w/api.php'
API_QUERY_STRING = {"action": "query",
                    "format": "json",
                    "gulimit": "10",            # presumably caps the usage entries returned per request -- confirm in the API docs
                    "prop": "globalusage",
                    "guprop": "url|namespace",
                    "titles": None              # placeholder, filled in by main() before every call
                    }

# Commons category that holds the contest images (input of the script)
WLE_CATEGORY = u"Category:Images from Wiki Loves Earth 2016 in Spain"

# Commons pages handled by the script; all of them are subpages of the contest page.
# Most are outputs; the SCI DB page is read as an input and must already exist.
BASE_WLE2016_NAME = u"Commons:Wiki Loves Earth 2016 in Spain"
STATISTICS_PAGE = BASE_WLE2016_NAME + u"/Stats"
IMAGE_LOG_PAGE = BASE_WLE2016_NAME + u"/Log"
SCI_DB_PAGE = BASE_WLE2016_NAME + u"/SCI DB"
GALLERY_PAGE_AUTHORS = BASE_WLE2016_NAME + u"/Contributors"
GALLERY_PAGE_SCIS = BASE_WLE2016_NAME + u"/SCIs"
GALLERY_FEATURED_ARTICLES = BASE_WLE2016_NAME + u"/QI"

# Time references, all expressed in milliseconds since the Unix epoch.
# They are used to number the contest days and to find out who is a new editor.
DAY_LENGTH = 24 * 60 * 60 * 1000                # one day
START_TIME = 1462053600000                      # 2016 May 01, 00:00:00 CEST
END_TIME = START_TIME + 31 * DAY_LENGTH         # 2016 June 01, 00:00:00 CEST (i.e. the end of May 31)
OLD_TIME = START_TIME - 90 * DAY_LENGTH         # 90 days before the start
NEW_USER_TIME = START_TIME - 14 * DAY_LENGTH    # 14 days before the start

def unix_time(zulu_time_string):
    """Convert a 'YYYY-MM-DDTHH:MM:SSZ' timestamp into milliseconds since the Unix epoch.

    The string is parsed as a naive datetime and measured against
    1970-01-01 00:00:00, so the result is only meaningful for UTC ("Zulu")
    timestamps. Raises ValueError when the string does not match the format.
    """
    parsed = datetime.strptime(zulu_time_string, "%Y-%m-%dT%H:%M:%SZ")
    elapsed = parsed - datetime(1970, 1, 1)
    return int(elapsed.total_seconds() * 1000)

def return_day(epoch_time):
    """Return the 1-based contest day that a timestamp falls on.

    epoch_time is a value in milliseconds since the Unix epoch (anything
    int() accepts). Timestamps in the first DAY_LENGTH milliseconds after
    START_TIME map to day 1; timestamps before START_TIME map to 0 or a
    negative number.
    """
    # Explicit floor division: the result is later used as a list index, so it
    # must stay an integer even if true division is in effect
    # (Python 3 or `from __future__ import division`).
    elapsed = int(epoch_time) - START_TIME
    return elapsed // DAY_LENGTH + 1

def get_quality_color(grade):
    """Return the hex color (without the leading '#') that represents a percentage.

    The color changes at 10, 30, 50, 70 and 90: each listed color applies to
    grades strictly below its threshold, and anything from 90 upwards gets
    the last one.
    """
    # (exclusive upper bound, color) pairs, checked in ascending order
    thresholds = (
        (10.0, 'ff6666'),
        (30.0, 'ffb366'),
        (50.0, 'ffff66'),
        (70.0, 'd9ff66'),
        (90.0, '66ff66'),
    )
    for upper_bound, color in thresholds:
        if grade < upper_bound:
            return color
    # The original literal carried a stray trailing space ('668cff '), unlike every
    # other color returned here; it is removed so all values are plain hex codes.
    return '668cff'

def create_pie_chart(input_dict, input_colors, suffix, special_item_key=None):
    """Build the wikitext of a '{{#invoke:Chart|pie chart}}' call.

    input_dict        mapping of slice label -> numeric value (anything dict()
                      can convert to such a mapping)
    input_colors      mapping of slice label -> color name; it must contain
                      every label of input_dict (and special_item_key, if given),
                      otherwise KeyError is raised
    suffix            text appended to the 'units suffix' parameter
    special_item_key  optional label whose slice is always emitted last,
                      regardless of its value; if the label is not present in
                      input_dict the slice is emitted with value 0

    Slices are listed by decreasing value. Returns the wikitext as a string.
    """
    slice_template = u'    ( %d: %s : %s)\n'

    # items() instead of the Python-2-only iteritems(); the sorted list already
    # fixes the order, so no OrderedDict is needed
    ranked_items = sorted(dict(input_dict).items(), key=itemgetter(1), reverse=True)

    special_value = 0
    chunks = [u'{{#invoke:Chart|pie chart\n'
              u'| radius = 180\n'
              u'| slices = \n']
    for key, value in ranked_items:
        if special_item_key is not None and key == special_item_key:
            # held back so that it can be written after all the regular slices
            special_value = value
            continue
        chunks.append(slice_template % (value, key, input_colors[key]))

    if special_item_key is not None:
        chunks.append(slice_template % (special_value, special_item_key, input_colors[special_item_key]))

    chunks.append(u'| units suffix = _%s\n'
                  u'| percent = true\n'
                  u'}}\n' % (suffix))
    return u''.join(chunks)

def split_range(input_range, n):
    """Split input_range items into n consecutive chunks of near-equal size.

    Returns a list with the n cumulative end boundaries of the chunks, so the
    i-th chunk of a sequence `seq` is seq[boundaries[i-1]:boundaries[i]]
    (starting at 0 for the first one). When the split is not exact, the first
    `input_range % n` chunks get one extra item. The last boundary always
    equals input_range.

    Raises ZeroDivisionError when n is 0.
    """
    # divmod uses floor division, so the boundaries stay integers (and remain
    # usable as slice indices) even where `/` means true division
    quotient, remainder = divmod(input_range, n)
    boundaries = []
    upper = 0
    for chunk_index in range(n):
        # the first `remainder` chunks absorb the leftover items, one each
        upper += quotient + (1 if chunk_index < remainder else 0)
        boundaries.append(upper)
    return boundaries

def main():
    parser = OptionParser()
    parser.add_option('-c', '--cached', action="store_true", default=False, dest='cached', help="works on cached image list in commons")
    (options, args) = parser.parse_args()

    wikipedia_site = pb.Site("es", "wikipedia")
    commons_site = pb.Site("commons", "commons")

    image_list_page = pb.Page(commons_site, IMAGE_LOG_PAGE)
    sci_list_page   = pb.Page(commons_site, SCI_DB_PAGE)
    qi_gallery_page = pb.Page(commons_site, GALLERY_FEATURED_ARTICLES)
    stats_page      = pb.Page(commons_site, STATISTICS_PAGE)
    authors_page    = pb.Page(commons_site, GALLERY_PAGE_AUTHORS)
    scis_page       = pb.Page(commons_site, GALLERY_PAGE_SCIS)

    # SCI dataframe creation
    pb.output('Retrieving --> WLE 2016 SCI list from cache')
    sci_repository = list()
    sci_csv_text = sci_list_page.text

    # useful indices from ['name', 'code', 'magrama_record', 'aut_com', 'bio_region', 'continent', 'alt_max', 'alt_min', 'alt_med',
    #'lon', 'lat', 'area', 'sea_area_percentage', 'sea_area', 'image', 'cat-commons', 'wikidata_id']
    sci_csv_indices = [0, 1, 3, 14, 15, 16]
    for line in sci_csv_text.splitlines(True) :
        tokens = line[:-1].split(';')
        if len(tokens) > 1 :
            valid_tokens = [tokens[i] for i in sci_csv_indices]
            sci_repository.append(valid_tokens)
    pb.output('Retrieved --> WLE 2016 SCI list from cache')
    scis_df = pd.DataFrame(sci_repository, columns=['name', 'code', 'aut_com', 'image', 'category', 'wikidata_id'])
    scis_df = pd.merge(scis_df, annexes_df[['aut_com', 'aut_com_name', 'aut_com_es_annex']], on='aut_com')
    sci_list = list(scis_df["code"])

    api_call_counter = 0
    image_usage_counter = 0
    image_perwiki_counter = dict()
    article_perwiki_counter = dict()
    raw_api_query_string = u''
    query_string_items = list()

    QI_list = list()

    if options.cached == False :
       # Retrieving images from the actual WLE category
        image_repository = list()

        pb.output('Retrieving --> WLE 2016 images from category')
        cat = pb.Category(commons_site, WLE_CATEGORY)
        gen = pagegenerators.CategorizedPageGenerator(cat)

        list_text = u'<pre>\n'
        image_counter = 0
        for page in gen:
            if page.isImage():
                if (image_counter != 0) and (image_counter % 50 == 0) :
                    pb.output ('Retrieving --> '+str(image_counter)+" image descriptions downloaded")
                image_counter += 1
                print '.'

                image_item = [None] * 4
                title = page.title(withNamespace=False)
                creation = page.oldest_revision
                image_item[0] = title
                image_item[2] = creation["user"]
                image_item[3] = str(creation.timestamp)

                text = page.text
                wikicode = mwh.parse(text)
                templates = wikicode.filter_templates()
                WLE_template_found = False
                QI_template_found = False
                WLE_identifier = ''
                for template in templates :
                    if template.name.lower().strip() == u"lic" :
                        WLE_template_found = True
                        match = re.search(r'ES[0-9]{5,6}', template.get(1).value.strip())
                        if match and template.get(1).value.strip() in sci_list:
                            WLE_identifier = template.get(1).value.strip()
                        if QI_template_found : break
                    elif template.name.lower().strip() == u"qualityimage" :
                        QI_template_found = True
                        QI_list.append(image_item[0])
                        if WLE_template_found: break
                image_item[1] = WLE_identifier
                image_repository.append(image_item)
                row_text = u'%s;\n' % (';'.join(image_item))
                list_text += row_text

                title = 'File:' + page.title(withNamespace=False)
                api_call_counter += 1
                query_string_items.append(title)
                if api_call_counter % 5 == 0:
                    raw_api_query_string = unicode(u'|'.join(query_string_items)).encode('utf-8')
                    API_QUERY_STRING["titles"] = raw_api_query_string
                    f = urlopen(API_BASE_URL, urlencode(API_QUERY_STRING))
                    response = f.read()
                    response_dict = json.loads(response)
                    for key, value in response_dict["query"]["pages"].iteritems():
                        if len(value[u'globalusage']) > 0:
                            found_dict = dict()
                            image_usage_counter += 1
                            for item in value[u'globalusage']:
                                if (item[u'ns'] == u'0') or (item[u'ns'] == u'104'):
                                    if item[u'wiki'] in article_perwiki_counter:
                                        article_perwiki_counter[item[u'wiki']] += 1
                                    else:
                                        article_perwiki_counter[item[u'wiki']] = 1
                                    found_dict[item[u'wiki']] = True
                            for key, value in found_dict.iteritems():
                                if key in image_perwiki_counter:
                                    image_perwiki_counter[key] += 1
                                else:
                                    image_perwiki_counter[key] = 1
                    query_string_items = list()

        pb.output ('Retrieving --> '+str(image_counter)+" images downloaded")
        pb.output('Retrieved --> WLE 2016 image list')

        list_text += u'</pre>'
        image_list_page.text = list_text
        image_list_page.save(u"WLE Spain 2016 image log")
        pb.output('Publishing --> WLE 2016 image log')
    else :
        # Taking from a previously uploaded list of images
        image_repository = list()

        csv_text = image_list_page.text
        pb.output('Retrieving --> WLE 2016 images list from cache')

        image_counter = 0
        for line in csv_text.splitlines(True) :
            tokens = line[:-1].split(';')
            if len(tokens) > 1 :
                if (image_counter != 0) and (image_counter % 50 == 0) :
                    pb.output ('Retrieving --> '+str(image_counter)+" image descriptions retrieved from log")
                image_counter += 1
                tokens.pop()
                image_repository.append(tokens)

                page = pb.Page(commons_site, tokens[0], ns=6)
                text = page.text
                wikicode = mwh.parse(text)
                templates = wikicode.filter_templates()

                for template in templates:
                    if template.name.lower().strip() == u"qualityimage":
                        QI_list.append(page.title(withNamespace=False))
                        break

                title = 'File:' + tokens[0]
                api_call_counter += 1
                query_string_items.append(title)
                if api_call_counter % 5 == 0:
                    raw_api_query_string = unicode(u'|'.join(query_string_items)).encode('utf-8')
                    API_QUERY_STRING["titles"] = raw_api_query_string
                    f = urlopen(API_BASE_URL, urlencode(API_QUERY_STRING))
                    response = f.read()
                    response_dict = json.loads(response)
                    for key, value in response_dict["query"]["pages"].iteritems():
                        if len(value[u'globalusage']) > 0:
                            found_dict = dict()
                            image_usage_counter += 1
                            for item in value[u'globalusage']:
                                if (item[u'ns'] == u'0') or (item[u'ns'] == u'104'):
                                    if item[u'wiki'] in article_perwiki_counter:
                                        article_perwiki_counter[item[u'wiki']] += 1
                                    else:
                                        article_perwiki_counter[item[u'wiki']] = 1
                                    found_dict[item[u'wiki']] = True
                            for key, value in found_dict.iteritems():
                                if key in image_perwiki_counter:
                                    image_perwiki_counter[key] += 1
                                else:
                                    image_perwiki_counter[key] = 1
                    query_string_items = list()

        pb.output('Retrieved --> WLE 2016 image list from cache')

    # Panda management
    images_df = pd.DataFrame(image_repository, columns=['image_title', 'code', 'uploader', 'timestamp'])
    images_df["timestamp"] = images_df["timestamp"].apply(unix_time)
    images_df["month_day"] = images_df["timestamp"].apply(return_day)

    sci_coverage_df = pd.DataFrame(columns=['aut_com', 'es_annex', 'gl_annex', 'ca_annex', 'code', 'category', 'image'])
    for annex in annexes :
        if annex[0] == u'ES-ML' : continue
        page = pb.Page(wikipedia_site, annex[2])
        text = page.text
        wikicode = mwh.parse(text)
        templates = wikicode.filter_templates()

        for template in templates :
            if template.name.lower().strip() == u"fila lic" :
                df_row = dict()
                df_row["aut_com"] = annex[1]
                df_row["es_annex"] = annex[2]
                df_row["gl_annex"] = annex[3]
                df_row["ca_annex"] = annex[4]
                df_row["code"] = None
                df_row["image"] = None
                df_row["category"] = None

                try :
                    if len(template.get(u"código").value.strip()) > 3 :
                        df_row["code"] = template.get(u"código").value.strip()
                except :
                    pass
                try :
                    if len(template.get(u"imagen").value.strip()) > 3 :
                        df_row["image"] = template.get(u"imagen").value.strip()
                except :
                    pass
                try :
                    if len(template.get(u"categoría-Commons").value.strip()) > 3 :
                        df_row["category"] = template.get(u"categoría-Commons").value.strip()
                except :
                    pass

                sci_coverage_df = sci_coverage_df.append(df_row, ignore_index=True)
    pb.output('Retrieving --> WLE 2016 SCI list')

    # 'queries' for further creation of wikitext
    images_total_df     = pd.merge(images_df, scis_df, on='code', how='left') # This dataframe add SCI information to image information
    authors             = images_total_df["uploader"].value_counts()          # This query shows the number of images per uploader
    authors_with_code   = images_total_df[images_total_df["code"] != u'']["uploader"].value_counts()
                                                                              # This query shows the number of valid images per uploader
    SCIs                = images_total_df["code"].value_counts()              # This query shows the number of images per SCI
    month_days          = images_total_df["month_day"].value_counts()         # This query shows the number of uploaded images in every day
    images_per_autcom   = images_total_df["aut_com_name"].value_counts()      # This query shows the number of uploaded images per autonomous community
    images_per_autcom   = images_per_autcom.append(pd.Series([images_total_df.count()['image_title']-images_total_df["aut_com"].value_counts().sum()], index=['From no valid site']))

    authors_with_SCIs   = images_total_df.groupby(['uploader'])     # This query groups images per uploader
    scis_with_pictures  = images_total_df.groupby(['code'])         # This query groups images per SCI

    aut_com_with_SCI    = scis_df[scis_df['code'].isin(images_total_df['code'].unique())]["aut_com_name"]

    # Month day uploads computation
    day_counter = [0] * 35
    for day, counter in month_days.iteritems() :
        if day > 31 :
            day_counter[34] += counter
        elif day > 0 :
            day_counter[day+1] = counter
        else :
            day_counter[0] += counter

    # Creation and publication of quality images gallery
    qi_gallery_text = u'This page lists the %d [[Commons:Quality Images|quality images]] uploaded as part of ' \
                      u'the first period of the [[Commons:Wiki Loves Earth 2016 in Spain|Wiki Loves Earth]] contest ' \
                      u'in Spain in 2016.\n\n' % (len(QI_list))
    qi_gallery_text += u"'''Statistics generation date''': {{subst:CURRENTTIME}} UTC, {{subst:CURRENTMONTHNAME}} " \
                       u"{{subst:CURRENTDAY}}, {{subst:CURRENTYEAR}}\n"
    qi_gallery_text += u'<gallery>\n'
    qi_gallery_text += u'\n'.join (QI_list)
    qi_gallery_text += u'\n</gallery>\n\n'
    qi_gallery_text += u'[[Category:Wiki Loves Earth 2016 in Spain| Quality]]'
    qi_gallery_page.text = qi_gallery_text
    pb.output('Publishing --> WLE 2016 quality images gallery')
    qi_gallery_page.save(u"WLE 2016 quality images gallery")

    # Creation and publication of sites of community importance gallery
    sci_gallery_text = u''
    for code, item in scis_with_pictures:
        sitelinks_string = u''
        if len(code) > 0 :
            sci_gallery_text += '== %s (%s) ==\n' % (scis_df[scis_df["code"] == code]["name"].iloc[0], code)
            sci_gallery_text += u"\'\'\'Local name\'\'\': %s\n" % (scis_df[scis_df["code"] == code]["name"].iloc[0])
            sci_gallery_text += u"\'\'\'Uploaded images\'\'\': [[:Category:Images of site of community importance " \
                                u"with code %s from Wiki Loves Earth 2016 in Spain|category]]" % (code)
        else :
            sci_gallery_text += '== No valid site of community importance ==\n\n'
        sci_gallery_text += '<gallery>\n'
        for i, value in item["image_title"].iteritems() :
            sci_gallery_text += u'%s\n' % (value)
        sci_gallery_text += '</gallery>\n\n'
    pb.output('Generating --> WLE 2016 SCI gallery')
    sci_gallery_text += u'[[Category:Wiki Loves Earth 2016 in Spain| Site]]'
    scis_page.text = sci_gallery_text
    pb.output('Publishing --> WLE 2016 SCI gallery')
    scis_page.save(u"WLE Spain 2016 gallery (per SCI)")

    # Statistics page and authors gallery
    author_gallery_text = u''
    statisticts_text =  u'{| align=right\n' \
                        u'|[[File:WLE Austria Logo (transparent).svg|200px|link=]]\n' \
                        u'|-\n' \
                        u'| style="text-align:center; font-family:arial black; font-size:200%; color:grey" ' \
                        u'| {{LangSwitch| es=España|ca=Espanya|en=Spain}}&nbsp;&nbsp;&nbsp;\n' \
                        u'|}\n'
    statisticts_text += u"Welcome to the '''WLE Spain 2016''' statistics. Below you will find information about the " \
                        u"number of uploaded images, the contributors and the WLE sites the pictures belong to. " \
                        u"Enjoy!!!\n\n"
    statisticts_text += u"==Images==\n"
    statisticts_text += u"* '''Main category''': " \
                        u"[[:Category:Images from Wiki Loves Earth 2016 in Spain|Images from Wiki Loves Earth 2016 in Spain]]\n"
    statisticts_text += u"* '''Total''': %d pictures\n" % (images_df.count()['image_title'])
    statisticts_text += u"** '''Quality Images''': %d pictures ([[%s|see]])\n" % (len(QI_list), GALLERY_FEATURED_ARTICLES)
    statisticts_text += u"* '''Statistics generation date''': {{subst:CURRENTTIME}} UTC, {{subst:CURRENTMONTHNAME}} " \
                        u"{{subst:CURRENTDAY}}, {{subst:CURRENTYEAR}}\n"
    statisticts_text += u'==Participants==\n' \
                        u'<br clear="all"/>\n' \
                        u'{| class="wikitable sortable" style="width:65%; font-size:89%; margin-top:0.5em;"\n' \
                        u'|- valign="middle"\n' \
                        u'! Author<br/><small>(registration time)</small>\n' \
                        u'! Uploaded images (total)\n' \
                        u'! Uploaded images<br/>(from a site of community importance)\n' \
                        u'! Contributed to SCIs\n'
    new_authors = list()
    for author, count in authors.iteritems():
        author_gallery_text += u'== %s ==\n\n' % (author)
        author_gallery_text += u'<gallery>\n'
        user = pb.User(commons_site, title=author)
        if unix_time(str(user.registration())) > NEW_USER_TIME : new_authors.append(author)
        author_with_SCIs = list(authors_with_SCIs.get_group(author)["code"].unique())
        author_with_pictures = list(authors_with_SCIs.get_group(author)["image_title"].unique())
        for index, value in enumerate(author_with_pictures) :
            author_gallery_text += u'%s\n' %(value)
        if u'' in author_with_SCIs :
            author_with_SCIs.remove(u'')
        for index, value in enumerate(author_with_SCIs) :
            author_with_SCIs[index] = u'[http://natura2000.eea.europa.eu/Natura2000/SDF.aspx?site=%s %s] (%s)' \
                                      % (value,
                                       value,
                                       scis_df[scis_df["code"] == value]["name"].iloc[0]
                                       )
        if author not in authors_with_code :
            authors_with_code[author] = 0
        if len(author_with_SCIs) == 0 :
            trailing_break = u''
        else :
            trailing_break = u'<br/>'
        statisticts_text += u'|-\n' \
                            u'| {{u|%s}} ([[%s#%s|contribs]])<br/><small>(registered on %d-%s-%s)</small>\n' \
                            u'| align="center" | %s\n' \
                            u'| align="center" | %s\n' \
                            u'| align="center" | %s%s(\'\'\'%d\'\'\')\n' % (author,
                                                          GALLERY_PAGE_AUTHORS,
                                                          author,
                                                          user.registration().year,
                                                          str(user.registration().month).zfill(2),
                                                          str(user.registration().day).zfill(2),
                                                          str(count),
                                                          authors_with_code[author],
                                                          '<br/>'.join(author_with_SCIs),
                                                          trailing_break,
                                                          len(author_with_SCIs))
        author_gallery_text += u'</gallery>\n\n'
    author_gallery_text += u'[[Category:Wiki Loves Earth 2016 in Spain| Contributors]]'
    pb.output('Generating --> WLE 2016 contributors gallery')

    statisticts_text += u'|-\n' \
                        u'! Total: %d contributors\n' \
                        u'! align="center" | %d pictures\n' \
                        u'! align="center" | %d pictures from<br/>a site of community importance\n' \
                        u'! align="center" | %d sites of<br/>community importance\n' \
                        u'|}\n' % (authors.size, images_df.count()['image_title'], authors_with_code.sum(), (len(SCIs)-1))

    statisticts_text += u"\n===New contributors===\n" \
                        u"'''Number of contributors registered during (or just before) the contest''':" \
                        u" '''%d'''\n\n" % (len(new_authors))
    statisticts_text += u"{| border = 0\n" \
                        u"| -\n" \
                        u"|\n"
    boundaries = split_range(len(new_authors), 3)
    statisticts_text += u"*{{u|%s}}\n" % ('}}\n* {{u|'.join(new_authors[:boundaries[0]]))
    statisticts_text += u"|\n"
    statisticts_text += u"*{{u|%s}}\n" % ('}}\n* {{u|'.join(new_authors[boundaries[0]:boundaries[1]]))
    statisticts_text += u"|\n"
    statisticts_text += u"*{{u|%s}}\n" % ('}}\n* {{u|'.join(new_authors[boundaries[1]:]))
    statisticts_text += u"|}\n\n"
    pb.output('Generating --> WLE 2016 contributor statistics')

    statisticts_text += u'=== Per-day contributions chart ===\n' \
                        u'<br clear="all"/>\n'

    chart_text = u'{{ #invoke:Chart | bar chart\n' \
                 u'| height = 450\n' \
                 u'| width = 800\n' \
                 u'| group 1 = %s:%s:%s:%s:%s:%s:%s:%s:%s:%s:%s:%s:%s:%s:%s:%s:%s:%s:%s:%s:%s:%s:%s:%s:%s:%s:%s:%s:%s:%s:%s:%s:%s:%s:%s\n' \
                 u'| x legends = <1::1::::5:::::10:::::15:::::20:::::25::::::31::>31\n' \
                 u'| group names = Pictures\n' \
                 u'| colors = orange\n' \
                 u'}}\n' % tuple(day_counter)
    statisticts_text += chart_text
    pb.output('Generating --> WLE 2016 per-day contribution statistics')

    statisticts_text += u'==Sites of Community Importance==\n' \
                        u'<br clear="all"/>\n'

    statisticts_text += u'{| class="wikitable sortable" style="width:55%; font-size:89%; margin-top:0.5em;"\n' \
                        u'|- valign="middle"\n' \
                        u'! Site of Community Importance<br/>(Commons category)\n' \
                        u'! Autonomous community\n' \
                        u'! Uploaded images\n'

    not_found_counter=0
    for site_code, count in SCIs.iteritems():
        if len(site_code) == 0 :
            not_found_counter = str(count)
        else :
            commons_cat_link = u''
            commons_cat = scis_df[scis_df["code"] == site_code]["category"].iloc[0]
            if len(commons_cat) > 0 :
                commons_cat_link = u'<br/>\'\'\'Commons category\'\'\': [[:Category:%s|%s]]' % (commons_cat, commons_cat)
            try:
                aut_com = scis_df[scis_df["code"] == site_code]["aut_com_name"].iloc[0]
            except :
                aut_com_name = u''
            site_name = scis_df[scis_df["code"] == site_code]["name"].iloc[0]
            statisticts_text += u'|-\n' \
                                u'| %s ([http://natura2000.eea.europa.eu/Natura2000/SDF.aspx?site=%s %s])%s\n' \
                                u'| align="center" | %s \n' \
                                u'| align="center" | %d<br/>(see [[%s#%s (%s)|gallery]], ' \
                                u'[[:Category:Images of site of community importance with code %s ' \
                                u'from Wiki Loves Earth 2016 in Spain|category]])\n' % (site_name,
                                                                              site_code,
                                                                              site_code,
                                                                              commons_cat_link,
                                                                              aut_com,
                                                                              count,
                                                                              GALLERY_PAGE_SCIS,
                                                                              site_name,
                                                                              site_code,
                                                                              site_code)
    statisticts_text += u'|-\n' \
                        u'| Images from no valid site of community importance\n' \
                        u'| align="center" | N/A\n' \
                        u'| align="center" | %s ([[%s#No_valid_site|see]])\n' %(not_found_counter, GALLERY_PAGE_SCIS)

    statisticts_text += u'|-\n' \
                        u"! '''Total''': %d/%d sites of community importance (%.2f%%)\n" \
                        u'! %d aut. communities\n' \
                        u'! %s\n' \
                        u'|}\n' % ((len(SCIs)-1),
                                   scis_df.count()['aut_com'],
                                   float((len(SCIs)-1)*100)/float(scis_df.count()['aut_com']),
                                   aut_com_with_SCI.unique().size,
                                   images_df.count()['image_title']
                                   )

    statisticts_text += u'=== Per-autonomous community contributions chart ===\n' \
                        u'{| border="0" width="90%"\n' \
                        u'|-\n' \
                        u'| valign="top" | \n'
    scis_per_autcom = aut_com_with_SCI.value_counts().to_dict()
    selected_colors = sample(aut_com_colors, len(scis_per_autcom)+1)
    reduced_selected_colors = copy.deepcopy(selected_colors)
    reduced_selected_colors.pop()
    expanded_sci_per_autcom_keys = copy.deepcopy(scis_per_autcom.keys())
    expanded_sci_per_autcom_keys.append('From no valid site')

    statisticts_text += create_pie_chart(images_per_autcom, dict(zip(expanded_sci_per_autcom_keys, selected_colors)), 'pictures', 'From no valid site')
    statisticts_text += u'| valign="top" |\n'
    statisticts_text += create_pie_chart(scis_per_autcom, dict(zip(scis_per_autcom.keys(), reduced_selected_colors)), 'SCIs')
    statisticts_text += u'|}\n'

    coverage_statisticts_text = u'==Coverage==\n' \
                        u'Coverage statistics measure the number of sites of community importance ' \
                        u'listed in the annexes in the Spanish Wikipedia. Direct access to said annexes is provided ' \
                        u'through the <nowiki>[es]</nowiki> wikilink in the \'Autonomous Community\' column within ' \
                        u'the table (other links to lists in other Wikipedias may be provided for the sake of ' \
                        u'completeness; however, only the ones in the Spanish Wikipedia are used to create ' \
                        u'this set of statistics).\n\n' \
                        u'{| class="wikitable sortable" style="width:55%; font-size:89%; margin-top:0.5em;"\n' \
                        u'|- valign="middle"\n' \
                        u'! Autonomous Community\n' \
                        u'! Sites (total)\n' \
                        u'! Sites (in lists)<br/> with category\n' \
                        u'! Sites (in lists)<br/> with image\n'

    grouped = sci_coverage_df.groupby(['aut_com'])
    for name, group in grouped:
        percentage_scis_with_cat = float (group.count()['category']*100.0/float(group.count()['aut_com']))
        percentage_scis_with_image = float (group.count()['image']*100.0/float(group.count()['aut_com']))
        wikisites_string = u'[[:es:%s|[es]]]' % group["es_annex"].unique()[0]
        if group["ca_annex"].unique()[0] != None and group["ca_annex"].unique()[0] > 3 :
            casite_string = u' [[:ca:%s|[ca]]]' % group["ca_annex"].unique()[0]
            wikisites_string += casite_string
        if group["gl_annex"].unique()[0] != None and group["gl_annex"].unique()[0] > 3 :
            glsite_string = u' [[:gl:%s|[gl]]]' % group["gl_annex"].unique()[0]
            wikisites_string += glsite_string
        coverage_statisticts_text += u'|-\n' \
                            u'| %s %s\n' \
                            u'| align="center" | %d\n' \
                            u'| align="center" bgcolor="#%s" | %d (%.2f%%)\n' \
                            u'| align="center" bgcolor="#%s" | %d (%.2f%%)\n' % \
                                                           (group["aut_com"].unique()[0],
                                                            wikisites_string,
                                                            group.count()['aut_com'],
                                                            get_quality_color(percentage_scis_with_cat),
                                                            group.count()['category'],
                                                            percentage_scis_with_cat,
                                                            get_quality_color(percentage_scis_with_image),
                                                            group.count()['image'],
                                                            percentage_scis_with_image)
    coverage_statisticts_text += u'|-\n' \
                        u'| Total\n' \
                        u'| align="center" | %d\n' \
                        u'| align="center" bgcolor="#%s" | %d (%.2f%%)\n' \
                        u'| align="center" bgcolor="#%s" | %d (%.2f%%)\n' \
                        u'|}\n\n' % (sci_coverage_df.count()['aut_com'],
                                        get_quality_color(float (sci_coverage_df.count()['category']*100.0/float(sci_coverage_df.count()['aut_com']))),
                                        sci_coverage_df.count()['category'],
                                        float (sci_coverage_df.count()['category']*100.0/float(sci_coverage_df.count()['aut_com'])),
                                        get_quality_color(float (sci_coverage_df.count()['image']*100.0/float(sci_coverage_df.count()['aut_com']))),
                                        sci_coverage_df.count()['image'],
                                        float (sci_coverage_df.count()['image']*100.0/float(sci_coverage_df.count()['aut_com'])))

    pb.output('Generating --> WLE 2016 Coverage Statistics')
    statisticts_text += coverage_statisticts_text

    image_usage_text = u'==Image usage==\n\n' \
                       u'{| class="wikitable sortable" style="width:50%; font-size:89%; margin-top:0.5em;"\n' \
                       u'|- valign="middle"\n' \
                       u'! Item\n' \
                       u'! Counter\n'
    for key, value in image_perwiki_counter.iteritems():
        image_usage_text += u'|-\n' \
                            u'| width="80%%" | WLE 2016 images used in %s\n' \
                            u'| align="center" | %d\n' % (key, value)
    image_usage_text += u'|-\n' \
                        u'| width="80%%" | Distinct WLE 2016 images used in any Wikipedia\n' \
                        u'| align="center" | \'\'\'%d\'\'\' (%.2f%%)\n' % (
                                                            image_usage_counter,
                                                            float(image_usage_counter)*100/float(images_df.count()['image_title'])
                                                            )
    for key, value in article_perwiki_counter.iteritems():
        image_usage_text += u'|-\n' \
                            u'| width="80%%" | Articles with WLE 2016 images in %s\n' \
                            u'| align="center" | %d\n' % (key, value)
    image_usage_text += u'|-\n' \
                        u'| width="80%%" | Articles with WLE 2016 images in any Wikipedia\n' \
                        u'| align="center" | \'\'\'%d\'\'\'\n' % (sum(article_perwiki_counter.values()))
    image_usage_text += u'|}\n\n'
    pb.output('Generating --> WLE 2016 Image Usage Statistics')
    statisticts_text += image_usage_text

    statisticts_text += u'\n[[Category:Wiki Loves Earth 2016 in Spain| Stats]]'
    stats_page.text = statisticts_text
    pb.output('Publishing --> WLE 2016 Statistics')
    stats_page.save(u"WLE Spain 2016 statistics")

    authors_page.text = author_gallery_text
    pb.output('Publishing --> WLE 2016 Authors gallery')
    authors_page.save(u"WLE Spain 2016 gallery (per author)")

# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
# User:WikiLovesESBot/code/WLE stats generation.py
#
# Navigation menu
#
# Search