Skip to content

Instantly share code, notes, and snippets.

@koehlma
Created May 7, 2013 20:42
Show Gist options
  • Save koehlma/5535942 to your computer and use it in GitHub Desktop.
Save koehlma/5535942 to your computer and use it in GitHub Desktop.
# -*- coding:utf-8 -*-
#
# Copyright (C) 2013, Maximilian Köhl <linuxmaxi@googlemail.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import re
import shelve
import urllib.request
import bs4
# Tokenizer: matches maximal runs of word characters (used on lower-cased text).
WORD = re.compile(r'\w+')
# Persistent word statistics. Judging from like()/dislike() below, each entry
# appears to map word -> (likes, dislikes, total_likes, total_dislikes) — verify.
DATABASE = shelve.open('bayesian_filter.db')
class Article():
    """A single golem.de article, fetched over HTTP with all of its pages.

    Attributes:
        url: the URL of the article's first page.
        soups: one BeautifulSoup document per fetched page.
        pages: the ``<article>`` element of each page.
        words: set of unique lower-cased words found in the headings and
            paragraphs of all pages.
    """

    def __init__(self, url):
        self.url = url
        content = urllib.request.urlopen(self.url).read().decode('utf-8')
        # Name the parser explicitly: omitting it makes bs4 warn and pick
        # whichever parser happens to be installed, which can vary by machine.
        self.soups = [bs4.BeautifulSoup(content, 'html.parser')]
        self.pages = [self.soups[0].find('article')]
        # Follow the "next page" pagination link until the final page.
        while True:
            page = self.pages[-1].find('a', attrs={'id': 'atoc_next'})
            if page is None:
                break
            url = 'http://www.golem.de{}'.format(page.attrs['href'])
            content = urllib.request.urlopen(url).read().decode('utf-8')
            self.soups.append(bs4.BeautifulSoup(content, 'html.parser'))
            self.pages.append(self.soups[-1].find('article'))
        # Collect the vocabulary from paragraphs and all heading levels.
        self.words = set()
        for page in self.pages:
            for tag in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
                for element in page.find_all(tag):
                    self.words.update(WORD.findall(element.text.lower()))
class Archive():
    """Index of article URLs from one month of the golem.de archive.

    Args:
        year: two-digit year (e.g. ``13`` for 2013).
        month: month number (1-12).

    Attributes:
        url: archive page URL for the given month.
        articles: list of article URLs scraped from the page's ticker lists.
    """

    def __init__(self, year=13, month=5):
        self.year = year
        self.month = month
        self.url = 'http://www.golem.de/aa-{:02}{:02}.html'.format(year, month)
        content = urllib.request.urlopen(self.url).read().decode('utf-8')
        self.articles = []
        # Explicit parser avoids bs4's "no parser specified" warning and
        # machine-dependent parser selection.
        self.soup = bs4.BeautifulSoup(content, 'html.parser')
        for ol in self.soup.find_all('ol', {'class' : 'list-tickers'}):
            for li in ol.find_all('li'):
                self.articles.append(li.find('a').attrs['href'])
def like(article, database=None):
    """Record a positive rating for *article* in the word statistics.

    Each word of the article gets its per-word like counter (index 0)
    incremented; afterwards every word known to the database gets its
    total-likes counter (index 2) incremented, so index 2 counts the like
    events seen since that word entered the database.

    Args:
        article: object exposing a ``words`` set of strings.
        database: optional mutable mapping to update instead of the
            module-level shelve ``DATABASE`` (added for testability;
            default behavior is unchanged).
    """
    db = database if database is not None else DATABASE
    for word in article.words:
        if word in db:
            likes, dislikes, total_likes, total_dislikes = db[word]
            db[word] = (likes + 1, dislikes, total_likes, total_dislikes)
        else:
            db[word] = (1, 0, 0, 0)
    # Count this like event for every word the database knows about.
    for word in db:
        likes, dislikes, total_likes, total_dislikes = db[word]
        db[word] = (likes, dislikes, total_likes + 1, total_dislikes)
def dislike(article, database=None):
    """Record a negative rating for *article* in the word statistics.

    Mirror image of ``like``: each word of the article gets its per-word
    dislike counter (index 1) incremented; afterwards every word known to
    the database gets its total-dislikes counter (index 3) incremented.

    Args:
        article: object exposing a ``words`` set of strings.
        database: optional mutable mapping to update instead of the
            module-level shelve ``DATABASE`` (added for testability;
            default behavior is unchanged).
    """
    db = database if database is not None else DATABASE
    for word in article.words:
        if word in db:
            likes, dislikes, total_likes, total_dislikes = db[word]
            db[word] = (likes, dislikes + 1, total_likes, total_dislikes)
        else:
            db[word] = (0, 1, 0, 0)
    # Count this dislike event for every word the database knows about.
    for word in db:
        likes, dislikes, total_likes, total_dislikes = db[word]
        db[word] = (likes, dislikes, total_likes, total_dislikes + 1)
def calc(article, p=0.5, database=None):
    """Estimate the probability that *article* would be liked.

    Iterates over the article's words and chains a Bayes-style update of
    ``p`` using each word's like/dislike statistics. Words that are unknown
    or lack rating totals are skipped.

    NOTE(review): since ``p_like`` and ``p_dislike`` both carry the same
    factor ``p``, the prior cancels in the update — each word effectively
    sets ``p`` to its own like ratio. Kept as-is to preserve behavior.

    Args:
        article: object exposing a ``words`` set of strings.
        p: prior probability, defaults to 0.5.
        database: optional mapping to read instead of the module-level
            shelve ``DATABASE`` (added for testability).

    Returns:
        The resulting probability in [0, 1].
    """
    db = database if database is not None else DATABASE
    for word in article.words:
        if word not in db:
            continue
        likes, dislikes, total_likes, total_dislikes = db[word]
        # Skip words without rating totals to avoid dividing by zero.
        if total_dislikes < 1 or total_likes < 1:
            continue
        p_like = p * likes / total_likes
        p_dislike = p * dislikes / total_dislikes
        p = p_like / (p_like + p_dislike)
        # Once p reaches zero it can never recover (and the next update
        # would divide by zero), so bail out early.
        if p == 0:
            return p
    return p
def _main():
    """Interactive training loop: score and rate every article of the archive."""
    archive = Archive()
    for url in archive.articles:
        article = Article(url)
        # Show the current estimate before asking for the user's verdict.
        print(article.url, calc(article))
        answer = input('Like (y/n)? ')
        while answer not in ['y', 'n']:
            print('Please give a valid answer...')
            answer = input('Like (y/n)? ')
        if answer == 'y':
            like(article)
        else:
            dislike(article)


if __name__ == '__main__':
    try:
        _main()
    finally:
        # Persist and release the shelve database, even on Ctrl-C.
        DATABASE.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment