Created
May 14, 2016 11:40
-
-
Save frankgeerlings/bf9e8931558145451d5557a6f0bc4182 to your computer and use it in GitHub Desktop.
List the most popular articles on a Wikimedia instance over a given number of past days
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import functools  # For Python 3 compatibility
import itertools
from collections import defaultdict

from mwviews.api import PageviewsClient
# Number of past days whose daily top lists are aggregated.
days = 100
# Passed as `limit` when requesting each day's top list.
topmost = 10000
# Number of rows printed in the final table.
showtop = 100
# Shared Pageviews client used for every request.
# NOTE(review): the meaning of the positional 10 depends on the installed
# mwviews version (presumably the request parallelism) -- confirm.
p = PageviewsClient(10)
def cleanArticleName(article):
    """Return *article* formatted as a wikilink.

    Underscores in the title are replaced by spaces and the result is wrapped
    in double square brackets, e.g. ``Foo_bar`` becomes ``[[Foo bar]]``.

    On Python 2 the link is returned UTF-8 encoded so that ``str()`` on it
    cannot raise ``UnicodeEncodeError``. On Python 3 the text is returned
    as-is: encoding there would produce ``bytes``, which ``str()`` renders as
    ``b'[[...]]'``.
    """
    link = "[[" + article.replace('_', ' ') + "]]"
    if str is bytes:
        # Only true on Python 2, where the native str type is a byte string.
        return link.encode('utf-8')
    return link
def dagart(d):
    """Fetch the top list of nl.wikipedia for the date *d*.

    Uses the module-level client ``p`` and requests ``topmost`` entries.

    Returns:
        A dict mapping each entry's ``'article'`` value to a
        ``(rank, views)`` tuple taken from the same entry.
    """
    dag = p.top_articles('nl.wikipedia', limit=topmost, year=d.year, month=d.month, day=d.day)
    return {entry['article']: (entry['rank'], entry['views']) for entry in dag}


def stats(scores):
    """Turn an iterable of (rank, views) tuples into a statistics tuple.

    Args:
        scores: iterable of ``(rank, views)`` pairs, one per day on which the
            article was listed. Any iterable is accepted, including one-shot
            iterators.

    Returns:
        ``(score, minrank, maxrank, views, dayslisted)`` where ``score`` is
        the sum of ``1 / rank`` over all pairs, ``views`` is the sum of all
        view counts and ``dayslisted`` is the number of pairs.

    Raises:
        ValueError: if *scores* is empty (there is no minimum/maximum rank).
    """
    # Materialize once: the data is traversed several times below, which
    # would silently yield wrong totals for a one-shot iterator.
    scores = list(scores)
    ranks = [rank for rank, _ in scores]
    views = sum(v for _, v in scores)
    # Sum of inverse of rank
    score = sum(1.0 / rank for rank in ranks)
    minrank = min(ranks)
    maxrank = max(ranks)
    dayslisted = len(ranks)
    return (score, minrank, maxrank, views, dayslisted)
from datetime import date, timedelta


def dagen(vandaag, aantal):
    """Yield one ``dagart`` result per day for the *aantal* days before *vandaag*.

    Days are produced oldest first: ``vandaag - aantal`` days up to and
    including ``vandaag - 1`` day. *vandaag* itself is not fetched. Nothing is
    yielded when *aantal* is zero or negative. Being a generator, each day is
    only fetched when the consumer asks for it.
    """
    for dagen_terug in range(aantal, 0, -1):
        yield dagart(vandaag - timedelta(days=dagen_terug))
# Generator over the daily top lists: `days` days counted back from yesterday
# (dagen does not fetch the reference date itself). Nothing is requested until
# the generator is iterated.
a = dagen(date.today() - timedelta(days=1), days)
def toegestaan(artikel):
    """Return True when *artikel* should be included in the statistics.

    Titles starting with ``Special:``, ``User:`` or ``Speciaal:`` and the
    placeholder title ``-`` are rejected; everything else is allowed.
    """
    if artikel == '-':
        return False
    # str.startswith accepts a tuple, so one call covers all excluded prefixes.
    return not artikel.startswith(('Special:', 'User:', 'Speciaal:'))
# Collect, per allowed article title, the (rank, views) tuple of every day on
# which it appeared: {title: [(rank, views), ...]}.
# defaultdict(list) replaces the dict.has_key() check, which no longer exists
# on Python 3.
resultaat = defaultdict(list)
for dag in a:
    for artikel, rang_views in dag.items():
        if toegestaan(artikel):
            resultaat[artikel].append(rang_views)
class Wikitable:
    """Render rows of values as a sortable MediaWiki table.

    Args:
        items: iterable of rows, each row an iterable of cell values.
        columns: sequence of header strings.
    """

    def __init__(self, items, columns):
        # Store a list so the table can be rendered more than once even when
        # a one-shot iterator (e.g. itertools.islice) is passed in.
        self.items = list(items)
        self.columns = columns
        self.columncount = len(columns)

    @staticmethod
    def _cell(value):
        """Return the text for one cell."""
        # On Python 3, str(b'x') gives "b'x'"; decode byte strings instead.
        # On Python 2 bytes is str, so this branch is never taken.
        if isinstance(value, bytes) and not isinstance(value, str):
            return value.decode('utf-8')
        return str(value)

    def lines(self):
        """Yield the wikitext of the table line by line."""
        yield "{| class=\"wikitable sortable\""
        yield "|-"
        yield "! " + " !! ".join(self.columns)
        for row in self.items:
            yield "|-"
            yield "| " + " || ".join(self._cell(x) for x in row)
        yield "|}"

    def __str__(self):
        """Return the complete wikitext, lines joined by newlines."""
        return "\n".join(self.lines())

    # repr() has always returned the wikitext as well; keep that behaviour.
    __repr__ = __str__
# We now have a dict of article name: [(rank, views), ...]
# Build one row per article: (wikilink, score, minrank, maxrank, views, dayslisted).
rijen = [(cleanArticleName(k),) + stats(v) for k, v in resultaat.items()]
# Highest score first. Sort ascending and then reverse (instead of
# reverse=True) so that rows with equal scores keep the order the original
# one-liner produced.
rijen.sort(key=lambda rij: rij[1])
rijen.reverse()
# Prefix every row with its 1-based position in the ranking.
statistiek = [(rang,) + rij for rang, rij in enumerate(rijen, 1)]
# print(...) with a single argument behaves the same on Python 2 and 3.
print(Wikitable(itertools.islice(statistiek, showtop), ["Rang", "Artikel", "Score", "Hoogste", "Laagste", "Views totaal", "Dagen in top %s" % topmost]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment