User:Zhuyifei1999/poty/potylist.py
Jump to navigation
Jump to search
#! /usr/bin/python
# -*- coding: utf-8 -*-
# LICENSE: WTFPL <http://www.wtfpl.net/txt/copying/>
# Script is really ugly and hacky; please don't [[:zh:吐槽]] ;)
from __future__ import absolute_import, unicode_literals
import calendar
import collections
import datetime
import locale
import re
import sys
from pyquery import PyQuery as pq
import pywikibot
# locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

# Month to process, taken from the first command-line argument. 0 means
# "every month": get_FPs only filters by month when MONTH is non-zero.
# A missing argument defaults to 0 instead of dying with an IndexError.
MONTH = int(sys.argv[1]) if len(sys.argv) > 1 else 0
# The listing is always built for the previous calendar year.
YEAR = datetime.datetime.now().year - 1
# Copy of calendar.month_name as a plain list; index 0 is the empty string,
# so MONTHS.index(name) is the month number of that name.
MONTHS = calendar.month_name[:]
SITE = pywikibot.Site('commons', 'commons')
def if_redirct_get_target(page):
    """Return *page*, or the page it redirects to when it is a redirect.

    A redirect target in namespace 6 is wrapped in ``pywikibot.FilePage``;
    any other target is returned as ``getRedirectTarget()`` produced it.
    Pages that are not redirects are returned unchanged.
    """
    if not page.isRedirectPage():
        return page

    target = page.getRedirectTarget()
    if target.namespace() == 6:
        return pywikibot.FilePage(target)
    return target
def get_FPs():
    """Yield the entries listed on the chronological featured-picture pages.

    Reads ``Commons:Featured pictures/chronological/<YEAR>-A`` and ``-B``
    line by line. Outside a ``<gallery>`` a non-blank line must be either a
    ``{{Fp-log-chron-header|...}}`` template or a month heading of the form
    ``== <Month name> <YEAR> ==``; inside a gallery each line is expected to
    look like ``<file title>|<number> <rest>``.

    When ``MONTH`` is non-zero, only entries under that month are yielded.

    Yields:
        ``(month, number, filepage)`` tuples: the month index of the most
        recent heading, the integer following the first ``|`` of the gallery
        line, and the file page (redirects resolved).

    Raises:
        RuntimeError: for a line that fits none of the expected shapes.
        AssertionError: if ``<gallery>`` tags are nested or unbalanced.
    """
    header_re = re.compile(r'^{{Fp-log-chron-header\|[^{}]+}}$')
    entry_re = re.compile(r'([^|]*?)\|(\d+) .+')
    # Depends only on MONTHS and YEAR, so build it once rather than per line.
    heading_re = re.compile(r'^== +(%s) %s +==$' % (
        '|'.join(filter(None, MONTHS)), YEAR))

    for title in [
        'Commons:Featured pictures/chronological/%s-A' % YEAR,
        'Commons:Featured pictures/chronological/%s-B' % YEAR,
    ]:
        page = pywikibot.Page(SITE, title)
        ingallery = False
        curmonth = None

        for line in page.text.split('\n'):
            line = line.strip()
            if not line:
                continue

            if header_re.match(line):
                assert not ingallery
            elif line == '<gallery>':
                assert not ingallery
                ingallery = True
            elif line == '</gallery>':
                assert ingallery
                ingallery = False
            elif ingallery:
                # For some reason, neither 2015 POTY nor 2014 have special
                # consideration for sets. We do the same here.

                # Entries of other months are skipped without being
                # validated; MONTH == 0 disables the filter.
                if curmonth != MONTH and MONTH:
                    continue

                reobj = entry_re.match(line)
                if not reobj:
                    # Lines mentioning a demotion/delisting are tolerated.
                    if any(kw in line.lower()
                           for kw in ['demote', 'delist']):
                        continue
                    raise RuntimeError('What is this line? ' + line)

                if not reobj.group(1):
                    # Delinker... please remove the whole line when it's in
                    # a gallery
                    continue

                yield (
                    curmonth,
                    int(reobj.group(2)),
                    if_redirct_get_target(
                        pywikibot.FilePage(SITE, reobj.group(1)))
                )
            else:
                reobj = heading_re.match(line)
                if not reobj:
                    # Previously this fell through to reobj.group(1) on None
                    # and died with an opaque AttributeError.
                    raise RuntimeError('What is this line? ' + line)
                curmonth = MONTHS.index(reobj.group(1))
# Page object for 'Template:FPC-results-reviewed'; get_category compares the
# templates transcluded on candidate pages against it.
fpc_res_template = pywikibot.Page(SITE, 'Template:FPC-results-reviewed')
def get_category(filepage, month):
    """Return the ``category=`` value of the FPC page that featured a file.

    Candidate pages are the pages using *filepage* whose title starts with
    'Commons:Featured picture candidates/' and which carry
    ``fpc_res_template`` with a ``featured=yes`` parameter.

    * exactly one candidate: it is used;
    * none: ``None`` is returned;
    * several: the file page HTML is scraped for the single FPC link inside
      the ``#assessments`` element, which must be one of the candidates.

    The ``category=`` parameter of the results template on the chosen page is
    returned with underscores replaced by spaces. If no such parameter is
    found the function falls off the end and returns ``None``.

    *month* is accepted but not used.

    On any exception the traceback is printed, an interactive console is
    opened with this function's locals, and the exception is re-raised.
    """
    try:
        # Collect every FPC page that promoted this file.
        fpcs = set()
        for fpc_page in filepage.usingPages():
            if not fpc_page.title().startswith(
                    'Commons:Featured picture candidates/'):
                continue
            for template, params in fpc_page.templatesWithParams():
                if template == fpc_res_template:
                    if 'featured=yes' in params:
                        fpcs.add(fpc_page)
                        break

        if len(fpcs) == 1:
            fpc_page = fpcs.pop()
        elif len(fpcs) == 0:
            return None  # gotta handle these manually
        else:  # Argh!
            # HACK: HTML scraping the FPC link
            html = filepage.getImagePageHtml()
            d = pq(html)
            e = d('#assessments '
                  'a[title^="Commons:Featured picture candidates/"]')
            assert len(e) == 1
            title = e.attr('title')
            fpc_page = if_redirct_get_target(pywikibot.Page(SITE, title))
            # The scraped link must be one of the pages found above.
            assert fpc_page in fpcs

        # Pull the category out of the results template on the chosen page.
        for template, params in fpc_page.templatesWithParams():
            if template == fpc_res_template:
                for param in params:
                    if param.startswith('category='):
                        return param[len('category='):].replace('_', ' ')
        # return fpc_cat  # .replace('#', '/')
    except Exception as ex:
        # Debugging aid: show the traceback and open a console with the
        # local variables before letting the error propagate.
        __import__('traceback').print_exc()
        __import__('code').interact(local=locals())
        raise
def singular(words):
    """Return *words* with each space-separated word crudely singularized.

    A word ending in 'glass' or 'ous' is left alone. Otherwise a trailing
    'ies' becomes 'y', and any other trailing 's' is dropped. Words are split
    and re-joined on single spaces, so spacing is preserved.
    """
    untouched_suffixes = ('glass', 'ous')

    def _singularize(word):
        if word.endswith(untouched_suffixes):
            return word
        if word.endswith('ies'):
            return word[:-3] + 'y'
        if word.endswith('s'):
            return word[:-1]
        return word

    return ' '.join(_singularize(word) for word in words.split(' '))
# Token replacements applied by poty_tokenizer as its last rewriting step:
# a token equal to a key here is replaced by the corresponding value.
token_overrides = {
    'natural': 'nature',
    'animated': 'animation',
}
def poty_tokenizer(catstr):
    """Break a category string into a frozenset of normalized tokens.

    The lower-cased string is split on ', ' and then on ' and '. Each piece
    is stripped and passed through ``singular``; a leading 'other ' and a
    trailing ' view' are removed; ``token_overrides`` is applied; empty
    results are discarded.
    """
    pieces = [catstr.lower()]
    for separator in (', ', ' and '):
        pieces = [part
                  for piece in pieces
                  for part in piece.split(separator)]

    tokens = set()
    for piece in pieces:
        token = singular(piece.strip())
        if token.startswith('other '):
            token = token[len('other '):]
        if token.endswith(' view'):
            token = token[:-len(' view')]
        token = token_overrides.get(token, token)
        if token:
            tokens.add(token)
    return frozenset(tokens)
# Page that links to every round-1 POTY gallery of the year.
candidates_page = pywikibot.Page(
    SITE, 'Commons:Picture of the Year/%s/Candidates' % YEAR)
# token -> (gallery page name, display name) of the gallery it points to.
poty_tokens = {}
# Every [[Commons:Picture of the Year/<YEAR>/R1/Gallery/<page>|<label>]]
# link on the candidates page: group 1 is <page>, group 2 is <label>.
reiter = re.finditer(r'\[\[Commons:Picture of the Year/%s/R1/Gallery/'
                     r'([^\|]+)\|([^\]]+)\]\]' % YEAR, candidates_page.text)
# (gallery page name, display name) -> gallery lines, in link order.
final_cats = collections.OrderedDict()
for reobj in reiter:
    pgname, displayname = reobj.group(1), reobj.group(2)
    final_cats[(pgname, displayname)] = []
    # Tokens come from both the page name and the label; a token produced by
    # several galleries ends up pointing at the last one seen.
    for token in poty_tokenizer(', '.join((pgname, displayname))):
        poty_tokens[token] = pgname, displayname
def fp_tokenizer(catstr):
    """Tokenize each component of an FP category path, last component first.

    '#' is treated as another '/' separator. Returns one ``poty_tokenizer``
    result per path component, in reverse path order.
    """
    components = catstr.replace('#', '/').split('/')
    return [poty_tokenizer(component) for component in reversed(components)]
def match(catstr):
    """Find the POTY gallery for an FP category string.

    Walks the token sets from ``fp_tokenizer`` in order and returns
    ``(token, poty_tokens[token])`` for the first token known to
    ``poty_tokens``. Returns ``(None, ('dummy', 'dummy'))`` when *catstr* is
    ``None`` or nothing matches.
    """
    no_match = (None, ('dummy', 'dummy'))
    if catstr is None:
        return no_match

    for level_tokens in fp_tokenizer(catstr):
        # Tokens are non-empty strings, so None is a safe "not found" marker.
        hit = next((tok for tok in level_tokens if tok in poty_tokens), None)
        if hit is not None:
            return hit, poty_tokens[hit]
    return no_match
for month, order, page in get_FPs():
cat = get_category(page, month)
token, target = match(cat)
debug = '%s => %s => %s' % (cat, token, target)
filestr = ('%s|%s-%02d/%s <!--%s-->' % (
page.title(),
YEAR,
month,
order,
debug
)).encode('utf-8')
print >> sys.stderr, filestr
final_cats[target].append(filestr)
print '''{| cellspacing="0" cellpadding="0" style="clear:right; margin-bottom: .5em; float: right; padding: .5em 0 .8em 1.4em; background: none;"
| __TOC__
|}'''
for (pgname, displayname), value in final_cats.items():
print '== [[Commons:Picture of the Year/%s/R1/Gallery/%s|%s]] ==' % (
YEAR, pgname, displayname)
print '<gallery>'
for item in value:
print item
print '</gallery>'
# FIXME: Check for demotes/delists