Created
November 8, 2019 11:45
-
-
Save pataiadam/7a4217e82ecc943f68acebe2c4a5bbaf to your computer and use it in GitHub Desktop.
k means tfidf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const kmeans = require('node-kmeans'); | |
// Toy corpus: short Hungarian joke openers, each tagged with the category
// the clustering step is expected to rediscover.
const data = [
  { category: 'szoke', content: 'ket szoke no beszelget egy masik szoke novel' },
  { category: 'szoke', content: 'Egy szoke no bemegy az egyetemre' },
  { category: 'szoke', content: 'Egy szoke no bemegy az boltba' },
  { category: 'allat', content: 'A kutya elmegy setalni' },
  { category: 'allat', content: 'A medve meg a roka setal az erdoben' },
  { category: 'allat', content: 'Ket hernyo hernyoskodik' },
  { category: 'zsido', content: 'Ket zsido vesz egy hazat es egy autot' },
  { category: 'zsido', content: 'Egy zsido bemegy a bankba' },
  { category: 'zsido', content: 'Egy szoke zsido elmegy nyaralni' },
];

/**
 * Lowercases `text` and splits it into tokens on runs of whitespace.
 * Empty tokens (from leading, trailing or repeated whitespace) are dropped so
 * they never become a vocabulary entry.
 *
 * @param {string} text - Raw document text.
 * @returns {string[]} Lowercased tokens in order of appearance.
 */
const tokenize = (text) => text.toLowerCase().split(/\s+/).filter(Boolean);

// Pass 1: per-document raw counts (`wordCount`) and term frequency (`tf`),
// both stored on the document object itself; the vocabulary is collected on
// the way in first-seen order.
const vocabulary = new Set();
for (const doc of data) {
  const words = tokenize(doc.content);

  // Null-prototype dictionaries: a token such as "constructor" must not
  // resolve to an inherited Object.prototype member.
  const wordCount = Object.create(null);
  for (const word of words) {
    wordCount[word] = (wordCount[word] || 0) + 1;
    vocabulary.add(word);
  }

  // tf = occurrences of the word / total number of tokens in the document.
  const tf = Object.create(null);
  for (const word in wordCount) {
    tf[word] = wordCount[word] / words.length;
  }

  doc.wordCount = wordCount;
  doc.tf = tf;
}

// Fixed column order shared by every vector built below.
const allWords = [...vocabulary];

// Pass 2: document frequency (number of documents containing each word),
// then idf = ln(total documents / document frequency).
const documentFrequency = Object.create(null);
for (const doc of data) {
  for (const word in doc.wordCount) {
    documentFrequency[word] = (documentFrequency[word] || 0) + 1;
  }
}
const idf = Object.create(null);
for (const word in documentFrequency) {
  idf[word] = Math.log(data.length / documentFrequency[word]);
}

// Pass 3: tf-idf weight per (document, word), stored on the document.
for (const doc of data) {
  doc.tfIdf = Object.create(null);
  for (const word in doc.tf) {
    doc.tfIdf[word] = doc.tf[word] * idf[word];
  }
}

// One dense row per document, one column per vocabulary word; words that do
// not occur in a document get weight 0.
const vectors = data.map((doc) => allWords.map((word) => doc.tfIdf[word] || 0));
// Partition the TF-IDF vectors into k = 3 clusters (the sample data has three
// categories) and print, for every cluster, its `clusterInd` value.
// NOTE(review): per the node-kmeans README, `clusterInd` holds the indexes of
// the input vectors assigned to that cluster — confirm against the installed
// version.
kmeans.clusterize(vectors, { k: 3 }, (err, res) => {
  if (err) {
    console.error(err);
    return;
  }
  console.log('%o', res.map((cluster) => cluster.clusterInd));
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment