Skip to content

Instantly share code, notes, and snippets.

@koehlma
Created May 7, 2013 20:42
Show Gist options
  • Save koehlma/5535942 to your computer and use it in GitHub Desktop.
Save koehlma/5535942 to your computer and use it in GitHub Desktop.
# -*- coding:utf-8 -*-
#
# Copyright (C) 2013, Maximilian Köhl <linuxmaxi@googlemail.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import re
import shelve
import urllib.request
import bs4
# Tokenizer: matches maximal runs of word characters (used on lower-cased text).
WORD = re.compile(r'\w+')
# Persistent word statistics. Judging from like()/dislike() below, each entry
# appears to map word -> (likes, dislikes, total_likes, total_dislikes) — verify.
DATABASE = shelve.open('bayesian_filter.db')
class Article():
    """A single golem.de article, fetched over HTTP with all of its pages.

    Attributes:
        url: the URL of the article's first page.
        soups: one BeautifulSoup document per fetched page.
        pages: the ``<article>`` element of each page.
        words: set of unique lower-cased words found in the headings and
            paragraphs of all pages.
    """

    def __init__(self, url):
        self.url = url
        content = urllib.request.urlopen(self.url).read().decode('utf-8')
        # Name the parser explicitly: omitting it makes bs4 warn and pick
        # whichever parser happens to be installed, which can vary by machine.
        self.soups = [bs4.BeautifulSoup(content, 'html.parser')]
        self.pages = [self.soups[0].find('article')]
        # Follow the "next page" pagination link until the final page.
        while True:
            page = self.pages[-1].find('a', attrs={'id': 'atoc_next'})
            if page is None:
                break
            url = 'http://www.golem.de{}'.format(page.attrs['href'])
            content = urllib.request.urlopen(url).read().decode('utf-8')
            self.soups.append(bs4.BeautifulSoup(content, 'html.parser'))
            self.pages.append(self.soups[-1].find('article'))
        # Collect the vocabulary from paragraphs and all heading levels.
        self.words = set()
        for page in self.pages:
            for tag in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
                for element in page.find_all(tag):
                    self.words.update(WORD.findall(element.text.lower()))
class Archive():
    """Index of article URLs from one month of the golem.de archive.

    Args:
        year: two-digit year (e.g. ``13`` for 2013).
        month: month number (1-12).

    Attributes:
        url: archive page URL for the given month.
        articles: list of article URLs scraped from the page's ticker lists.
    """

    def __init__(self, year=13, month=5):
        self.year = year
        self.month = month
        self.url = 'http://www.golem.de/aa-{:02}{:02}.html'.format(year, month)
        content = urllib.request.urlopen(self.url).read().decode('utf-8')
        self.articles = []
        # Explicit parser avoids bs4's "no parser specified" warning and
        # machine-dependent parser selection.
        self.soup = bs4.BeautifulSoup(content, 'html.parser')
        for ol in self.soup.find_all('ol', {'class' : 'list-tickers'}):
            for li in ol.find_all('li'):
                self.articles.append(li.find('a').attrs['href'])
def like(article, database=None):
    """Record a positive rating for *article* in the word statistics.

    Each word of the article gets its per-word like counter (index 0)
    incremented; afterwards every word known to the database gets its
    total-likes counter (index 2) incremented, so index 2 counts the like
    events seen since that word entered the database.

    Args:
        article: object exposing a ``words`` set of strings.
        database: optional mutable mapping to update instead of the
            module-level shelve ``DATABASE`` (added for testability;
            default behavior is unchanged).
    """
    db = database if database is not None else DATABASE
    for word in article.words:
        if word in db:
            likes, dislikes, total_likes, total_dislikes = db[word]
            db[word] = (likes + 1, dislikes, total_likes, total_dislikes)
        else:
            db[word] = (1, 0, 0, 0)
    # Count this like event for every word the database knows about.
    for word in db:
        likes, dislikes, total_likes, total_dislikes = db[word]
        db[word] = (likes, dislikes, total_likes + 1, total_dislikes)
def dislike(article, database=None):
    """Record a negative rating for *article* in the word statistics.

    Mirror image of ``like``: each word of the article gets its per-word
    dislike counter (index 1) incremented; afterwards every word known to
    the database gets its total-dislikes counter (index 3) incremented.

    Args:
        article: object exposing a ``words`` set of strings.
        database: optional mutable mapping to update instead of the
            module-level shelve ``DATABASE`` (added for testability;
            default behavior is unchanged).
    """
    db = database if database is not None else DATABASE
    for word in article.words:
        if word in db:
            likes, dislikes, total_likes, total_dislikes = db[word]
            db[word] = (likes, dislikes + 1, total_likes, total_dislikes)
        else:
            db[word] = (0, 1, 0, 0)
    # Count this dislike event for every word the database knows about.
    for word in db:
        likes, dislikes, total_likes, total_dislikes = db[word]
        db[word] = (likes, dislikes, total_likes, total_dislikes + 1)
def calc(article, p=0.5, database=None):
    """Estimate the probability that *article* would be liked.

    Iterates over the article's words and chains a Bayes-style update of
    ``p`` using each word's like/dislike statistics. Words that are unknown
    or lack rating totals are skipped.

    NOTE(review): since ``p_like`` and ``p_dislike`` both carry the same
    factor ``p``, the prior cancels in the update — each word effectively
    sets ``p`` to its own like ratio. Kept as-is to preserve behavior.

    Args:
        article: object exposing a ``words`` set of strings.
        p: prior probability, defaults to 0.5.
        database: optional mapping to read instead of the module-level
            shelve ``DATABASE`` (added for testability).

    Returns:
        The resulting probability in [0, 1].
    """
    db = database if database is not None else DATABASE
    for word in article.words:
        if word not in db:
            continue
        likes, dislikes, total_likes, total_dislikes = db[word]
        # Skip words without rating totals to avoid dividing by zero.
        if total_dislikes < 1 or total_likes < 1:
            continue
        p_like = p * likes / total_likes
        p_dislike = p * dislikes / total_dislikes
        p = p_like / (p_like + p_dislike)
        # Once p reaches zero it can never recover (and the next update
        # would divide by zero), so bail out early.
        if p == 0:
            return p
    return p
def _main():
    """Interactive training loop: score and rate every article of the archive."""
    archive = Archive()
    for url in archive.articles:
        article = Article(url)
        # Show the current estimate before asking for the user's verdict.
        print(article.url, calc(article))
        answer = input('Like (y/n)? ')
        while answer not in ['y', 'n']:
            print('Please give a valid answer...')
            answer = input('Like (y/n)? ')
        if answer == 'y':
            like(article)
        else:
            dislike(article)


if __name__ == '__main__':
    try:
        _main()
    finally:
        # Persist and release the shelve database, even on Ctrl-C.
        DATABASE.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment