Created
February 4, 2017 21:45
-
-
Save matthewr6/56300fdb1d4c17a1e811d1eed8213732 to your computer and use it in GitHub Desktop.
Markov chain data generator I made a while ago
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fs = require('fs'); | |
/**
 * Generate a sentence by walking a word-transition table.
 *
 * Starting from `initial`, the next word is repeatedly drawn from the current
 * word's `probabilities` map by accumulating the probabilities in key order
 * and taking the first key whose running total exceeds a random draw.
 * A transition from a word to itself is only accepted once the running total
 * has reached 1, i.e. when no other candidate was picked first.
 *
 * Generation stops when any of the following happens:
 *   - the end-of-post marker (the string key 'null') is drawn,
 *   - the sentence reaches `maxWords` words,
 *   - the current word has no entry in `data`,
 *   - no candidate is accepted for the draw (for example the probabilities
 *     sum to less than the draw, or the only candidate is a self-transition
 *     with probability below 1). Stopping here prevents an endless retry loop.
 *
 * @param {Object} data - Map of word -> { probabilities: { nextWord: number } },
 *   e.g. the return value of learnFromPosts (possibly after a JSON round trip).
 * @param {string} initial - Word to start the sentence with.
 * @param {number} [maxWords=25] - Upper bound on the number of generated words.
 * @param {function(): number} [random=Math.random] - Source of draws in [0, 1).
 * @returns {string|undefined} The generated words joined by single spaces, or
 *   undefined when `initial` has no entry in `data`.
 */
function simulate(data, initial, maxWords = 25, random = Math.random) {
  // JSON object keys are always strings, so the end-of-post marker that was
  // written with a `null` key is read back as the string 'null'.
  const END_OF_POST = 'null';

  // Own-property check so inherited names such as 'constructor' or
  // 'toString' are never mistaken for learned words.
  const hasEntry = (word) =>
    Object.prototype.hasOwnProperty.call(data, word) && Boolean(data[word]);

  if (!hasEntry(initial)) {
    return undefined;
  }

  const generatedWords = [initial];

  while (generatedWords.length < maxWords) {
    const prevWord = generatedWords[generatedWords.length - 1];
    if (!hasEntry(prevWord)) {
      break;
    }

    const nextProbs = data[prevWord].probabilities || {};
    const differentiator = random();
    let totalProb = 0;
    let nextWord = null;

    for (const word of Object.keys(nextProbs)) {
      totalProb += nextProbs[word];
      const isRepeat = word === prevWord;
      if (totalProb > differentiator && (!isRepeat || totalProb >= 1)) {
        nextWord = word;
        break;
      }
    }

    // Either nothing could be selected or the end-of-post marker was drawn.
    if (nextWord === null || nextWord === END_OF_POST) {
      break;
    }

    generatedWords.push(nextWord);
  }

  return generatedWords.join(' ');
}
/**
 * Build a word-transition table from a collection of posts.
 *
 * Each post is reduced to its ASCII letters, split on whitespace, and every
 * word is recorded together with the word that follows it. The last word of
 * a post records a transition to the end-of-post marker, stored under the
 * key 'null' (the same key a literal `null` property name produces).
 *
 * Known limitation: a post containing the literal word "null" is
 * indistinguishable from the end-of-post marker in the resulting table.
 *
 * @param {Array<string|{content: string}>} posts - Posts to learn from; each
 *   item is either the post text itself or an object whose `content`
 *   property holds the text.
 * @returns {Object} Map of word -> {
 *     frequencies:   { nextWord: count of times nextWord followed word },
 *     total:         number of occurrences of word,
 *     probabilities: { nextWord: frequencies[nextWord] / total }
 *   }
 */
function learnFromPosts(posts) {
  const END_OF_POST = 'null';

  // Own-property check so words such as "constructor" or "toString" are not
  // confused with members inherited from Object.prototype.
  const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);

  const words = {};

  posts.forEach((post) => {
    const text = typeof post === 'string' ? post : post.content;

    // Keep letters and whitespace only, then split on any whitespace run so
    // newlines/tabs still separate words and no empty tokens are produced.
    const postWords = text
      .replace(/[^a-zA-Z\s]/g, '')
      .split(/\s+/)
      .filter((token) => token !== '');

    postWords.forEach((word, index) => {
      if (!hasOwn(words, word)) {
        words[word] = {
          frequencies: {},
          total: 0,
          probabilities: {},
        };
      }

      const entry = words[word];
      entry.total++;

      const isLastWord = index + 1 >= postWords.length;
      const follower = isLastWord ? END_OF_POST : postWords[index + 1];

      // Count every observation, including repeated end-of-post hits, so the
      // frequencies of a word always add up to its total.
      entry.frequencies[follower] = hasOwn(entry.frequencies, follower)
        ? entry.frequencies[follower] + 1
        : 1;
    });
  });

  for (const word of Object.keys(words)) {
    const entry = words[word];
    for (const nextWord of Object.keys(entry.frequencies)) {
      entry.probabilities[nextWord] = entry.frequencies[nextWord] / entry.total;
    }
  }

  return words;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
🤔