User:Zhuyifei1999/poty/potylist.py

From Wikimedia Commons, the free media repository
Jump to navigation Jump to search
#! /usr/bin/python
# -*- coding: utf-8 -*-
# LICENSE: WTFPL <http://www.wtfpl.net/txt/copying/>
# Script is really ugly and hacky; please don't [[:zh:吐槽]] ;)

from __future__ import absolute_import, unicode_literals

import calendar
import collections
import datetime
import locale
import re
import sys

from pyquery import PyQuery as pq
import pywikibot

# locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

# Month number to restrict the listing to, read from the first command-line
# argument; a value of 0 switches the month filter in get_FPs() off.
MONTH = int(sys.argv[1])

# The year being processed: the calendar year before the current one.
YEAR = datetime.datetime.now().year - 1
# Copy of the calendar month names; get_FPs() turns a heading's month name
# into a month number via its index in this list.
MONTHS = calendar.month_name[:]

# Site object used for every page lookup in this script.
SITE = pywikibot.Site('commons', 'commons')


def if_redirct_get_target(page):
    """Return the redirect target of ``page``, or ``page`` if not a redirect.

    A target that lives in namespace 6 is wrapped in ``pywikibot.FilePage``
    before being returned; any other target is returned as obtained.
    """
    if not page.isRedirectPage():
        return page

    target = page.getRedirectTarget()
    if target.namespace() == 6:
        return pywikibot.FilePage(target)
    return target


def get_FPs():
    """Yield the featured pictures listed on the chronological FP pages.

    Reads the two chronological pages (``-A`` and ``-B``) for ``YEAR`` and
    walks their wikitext line by line.  A ``== <Month> <YEAR> ==`` heading
    outside a gallery sets the current month; every line inside a
    ``<gallery>`` of the form ``<file title>|<number> ...`` is yielded,
    unless ``MONTH`` is non-zero and differs from the current month.

    Yields:
        Tuples ``(month, order, filepage)``: the month number of the most
        recent heading, the integer that starts the gallery caption, and
        the ``pywikibot.FilePage`` of the entry with redirects resolved.

    Raises:
        RuntimeError: if a line cannot be interpreted, either inside a
            gallery (and not mentioning a demote/delist) or outside one.
        AssertionError: if a header template or ``<gallery>`` tag shows up
            while the gallery state says it should not.
    """
    # None of these patterns depends on the line being examined, so build
    # them once instead of once per line.
    header_template_re = re.compile(r'^{{Fp-log-chron-header\|[^{}]+}}$')
    entry_re = re.compile(r'([^|]*?)\|(\d+) .+')
    month_heading_re = re.compile(
        r'^== +(%s) %s +==$' % ('|'.join(filter(None, MONTHS)), YEAR))

    for title in [
        'Commons:Featured pictures/chronological/%s-A' % YEAR,
        'Commons:Featured pictures/chronological/%s-B' % YEAR,
    ]:
        page = pywikibot.Page(SITE, title)

        ingallery = False
        curmonth = None
        for line in page.text.split('\n'):
            line = line.strip()

            if not line:
                continue

            if header_template_re.match(line):
                assert not ingallery
            elif line == '<gallery>':
                assert not ingallery
                ingallery = True
            elif line == '</gallery>':
                assert ingallery
                ingallery = False
            elif ingallery:
                # A non-zero MONTH restricts the output to that month.
                if MONTH and curmonth != MONTH:
                    continue

                # For some reason, neither 2015 POTY nor 2014 have special
                # consideration for sets. We do the same here.
                reobj = entry_re.match(line)
                if not reobj:
                    # Remarks about demoted/delisted pictures are skipped.
                    if any(kw in line.lower()
                           for kw in ['demote', 'delist']):
                        continue
                    raise RuntimeError('What is this line? ' + line)
                if not reobj.group(1):
                    # Delinker... please remove the whole line when it's in
                    # a gallery
                    continue
                yield (
                    curmonth,
                    int(reobj.group(2)),
                    if_redirct_get_target(
                        pywikibot.FilePage(SITE, reobj.group(1)))
                )
            else:
                reobj = month_heading_re.match(line)
                if not reobj:
                    # Report the offending line instead of failing with an
                    # AttributeError on the missing match object.
                    raise RuntimeError('What is this line? ' + line)
                curmonth = MONTHS.index(reobj.group(1))


# Template page that get_category() compares against the templates used on
# nomination pages; its 'featured=' and 'category=' parameters are read there.
fpc_res_template = pywikibot.Page(SITE, 'Template:FPC-results-reviewed')


def get_category(filepage, month):
    """Return the category recorded on the file's featured nomination page.

    Collects the pages using ``filepage`` whose title starts with
    'Commons:Featured picture candidates/' and whose ``fpc_res_template``
    carries 'featured=yes', settles on one of them, and returns the value
    of that template's 'category=' parameter with underscores replaced by
    spaces.

    ``month`` is accepted but not used by this function.

    Returns ``None`` when no such nomination page is found, or when the
    chosen page's template has no 'category=' parameter.

    Any exception prints a traceback, opens an interactive console on the
    local variables, and is re-raised once that console is left.
    """
    try:
        # Nomination pages on which this file was marked as featured.
        fpcs = set()

        for fpc_page in filepage.usingPages():
            if not fpc_page.title().startswith(
                    'Commons:Featured picture candidates/'):
                continue

            for template, params in fpc_page.templatesWithParams():
                if template == fpc_res_template:
                    if 'featured=yes' in params:
                        fpcs.add(fpc_page)
                    # Only the first results template on a page is checked.
                    break

        if len(fpcs) == 1:
            fpc_page = fpcs.pop()
        elif len(fpcs) == 0:
            return None  # gotta handle these manually
        else:  # Argh!
            # HACK: several candidates; scrape the file page HTML for the
            # single nomination link inside the '#assessments' element.
            html = filepage.getImagePageHtml()
            d = pq(html)
            e = d('#assessments '
                  'a[title^="Commons:Featured picture candidates/"]')
            assert len(e) == 1

            title = e.attr('title')
            fpc_page = if_redirct_get_target(pywikibot.Page(SITE, title))
            # The scraped page must be one of the candidates found above.
            assert fpc_page in fpcs

        for template, params in fpc_page.templatesWithParams():
            if template == fpc_res_template:
                for param in params:
                    if param.startswith('category='):
                        return param[len('category='):].replace('_', ' ')

        # Falling through to here returns None implicitly.
        # return fpc_cat  # .replace('#', '/')
    except Exception as ex:
        # Debugging aid: show the traceback and open an interactive console
        # with this function's locals before propagating the error.
        __import__('traceback').print_exc()
        __import__('code').interact(local=locals())
        raise


def singular(words):
    """Return ``words`` with each space-separated word naively singularised.

    A word ending in 'glass' or 'ous' is left alone.  Otherwise a trailing
    'ies' becomes 'y', and any other trailing 's' is dropped.
    """
    untouched_endings = ('glass', 'ous')

    def _one(word):
        # Words with a protected ending are never modified.
        if word.endswith(untouched_endings):
            return word
        if word.endswith('ies'):
            return word[:-3] + 'y'
        if word.endswith('s'):
            return word[:-1]
        return word

    return ' '.join(_one(word) for word in words.split(' '))


# Tokens that poty_tokenizer() replaces with another token as its last
# rewriting step, e.g. 'natural' is treated as 'nature'.
token_overrides = {
    'natural': 'nature',
    'animated': 'animation',
}


def poty_tokenizer(catstr):
    """Break a category string into a frozenset of normalised tokens.

    The string is lower-cased and split on ', ' and then on ' and '.  Each
    piece is stripped and passed through ``singular``; a leading 'other '
    and a trailing ' view' are removed, ``token_overrides`` is applied, and
    pieces that end up empty are dropped.
    """
    pieces = [catstr.lower()]
    for separator in (', ', ' and '):
        pieces = [part
                  for piece in pieces
                  for part in piece.split(separator)]

    tokens = set()
    for piece in pieces:
        token = singular(piece.strip())
        if token.startswith('other '):
            token = token[len('other '):]
        if token.endswith(' view'):
            token = token[:-len(' view')]
        token = token_overrides.get(token, token)
        if token:
            tokens.add(token)

    return frozenset(tokens)


# Page whose text contains the links to the round-1 galleries for YEAR.
candidates_page = pywikibot.Page(
    SITE, 'Commons:Picture of the Year/%s/Candidates' % YEAR)
# Maps a category token to the (page name, display name) of a gallery.
poty_tokens = {}
# Every wikilink of the form
# [[Commons:Picture of the Year/<YEAR>/R1/Gallery/<page name>|<display name>]]
reiter = re.finditer(r'\[\[Commons:Picture of the Year/%s/R1/Gallery/'
                     r'([^\|]+)\|([^\]]+)\]\]' % YEAR, candidates_page.text)
# Gallery lines collected per (page name, display name), kept in the order
# in which the links appear on the candidates page.
final_cats = collections.OrderedDict()
for reobj in reiter:
    pgname, displayname = reobj.group(1), reobj.group(2)
    final_cats[(pgname, displayname)] = []
    # Tokens are taken from both the page name and the display name; a token
    # produced again by a later link overwrites the earlier mapping.
    for token in poty_tokenizer(', '.join((pgname, displayname))):
        poty_tokens[token] = pgname, displayname


def fp_tokenizer(catstr):
    """Tokenise each level of an FP category path, last level first.

    '#' is treated like '/', the string is split on '/', and
    ``poty_tokenizer`` is mapped over the levels in reverse order.
    """
    levels = catstr.replace('#', '/').split('/')
    levels.reverse()
    return map(poty_tokenizer, levels)


def match(catstr):
    """Find the POTY gallery for an FP category string.

    Returns ``(token, (page name, display name))`` for the first token from
    ``fp_tokenizer(catstr)`` that is a key of ``poty_tokens``.  When
    ``catstr`` is ``None`` or no token is known, returns
    ``(None, ('dummy', 'dummy'))``.
    """
    if catstr is None:
        return None, ('dummy', 'dummy')

    for token_set in fp_tokenizer(catstr):
        for candidate in token_set:
            if candidate in poty_tokens:
                return candidate, poty_tokens[candidate]

    return None, ('dummy', 'dummy')


# Assign every featured picture to a gallery and build its gallery line.
for month, order, page in get_FPs():
    cat = get_category(page, month)
    token, target = match(cat)
    # Embedded in an HTML comment on the gallery line so the category
    # decision can be reviewed afterwards.
    debug = '%s => %s => %s' % (cat, token, target)
    filestr = ('%s|%s-%02d/%s <!--%s-->' % (
        page.title(),
        YEAR,
        month,
        order,
        debug
    )).encode('utf-8')
    # Progress output goes to stderr; stdout carries the final wikitext.
    print >> sys.stderr, filestr
    # match() answers ('dummy', 'dummy') for pictures it cannot place. That
    # key is not created while reading the candidates page, so create the
    # bucket on demand instead of failing with KeyError.
    final_cats.setdefault(target, []).append(filestr)

# Emit the resulting wikitext on stdout, starting with a table that holds
# the table of contents.
print '''{| cellspacing="0" cellpadding="0" style="clear:right; margin-bottom: .5em; float: right; padding: .5em 0 .8em 1.4em; background: none;"
| __TOC__
|}'''

# One section per gallery: a heading linking to the gallery page, followed
# by a <gallery> block with the lines collected for it.
for (pgname, displayname), value in final_cats.items():
    print '== [[Commons:Picture of the Year/%s/R1/Gallery/%s|%s]] ==' % (
        YEAR, pgname, displayname)
    print '<gallery>'
    for item in value:
        print item
    print '</gallery>'


# FIXME: Check for demotes/delists