Last active
October 26, 2023 10:59
-
-
Save willwade/091ba471fe3897fb4f0593c58928e751 to your computer and use it in GitHub Desktop.
PPM in Swift, courtesy of GPT. Here, the generateCandidates(word:) method generates candidate words by swapping adjacent characters. This is a very simplistic way to generate candidates; in a real-world application, you might use more sophisticated techniques like Damerau-Levenshtein distance. The autocorrect(word:context:topN:) method takes a mis…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 Ideally the training text would be updated continually too, but that requires a way for people to correct this training text. That was easy enough with Dasher, although undocumented: you literally edit the text file.
 Also note this for autocorrection - although it is not yet clear how we would implement this.
 */
extension PPM {
    /// Generates candidate spellings by transposing each pair of adjacent characters.
    ///
    /// Only single adjacent swaps are produced; insertions, deletions and
    /// substitutions are not considered. Candidates are returned in left-to-right
    /// order of the swapped pair and may contain duplicates (or the input itself)
    /// when neighbouring characters are equal.
    ///
    /// - Parameter word: The (possibly misspelled) word to permute.
    /// - Returns: One string per adjacent character pair, or an empty array when
    ///   `word` has fewer than two characters.
    func generateCandidates(word: String) -> [String] {
        let chars = Array(word)
        // `0..<(count - 1)` would trap for an empty word, so bail out early.
        guard chars.count > 1 else { return [] }

        return (0..<(chars.count - 1)).map { index in
            var swapped = chars
            swapped.swapAt(index, index + 1)
            return String(swapped)
        }
    }

    /// Suggests corrections for a misspelled word.
    ///
    /// Each candidate from `generateCandidates(word:)` is passed to
    /// `predict(context:topN:)`. A candidate is kept only when the candidate
    /// itself appears among the predicted words, and that word's likelihood
    /// becomes the candidate's score.
    ///
    /// - Parameters:
    ///   - word: The misspelled word.
    ///   - context: Currently unused; kept so existing call sites keep compiling.
    ///   - topN: Maximum number of suggestions to return. Values below 1 yield
    ///     an empty result.
    /// - Returns: Up to `topN` `(candidate, likelihood)` pairs, highest
    ///   likelihood first.
    func autocorrect(word: String, context: String, topN: Int) -> [(String, Double)] {
        // `prefix(_:)` traps on a negative length.
        guard topN > 0 else { return [] }

        var scoredCandidates: [(String, Double)] = []
        for candidate in generateCandidates(word: word) {
            // Ask for every predicted word so the candidate is found even when it
            // is not the single most likely entry.
            let predictedWords = predict(context: candidate, topN: Int.max).words
            if let match = predictedWords.first(where: { $0.0 == candidate }) {
                scoredCandidates.append((candidate, match.1))
            }
        }

        // `prefix` returns a slice; convert back to the declared array type.
        return Array(scoredCandidates.sorted { $0.1 > $1.1 }.prefix(topN))
    }
}
// Usage: train a small model in memory, then request autocorrect suggestions.
let ppm = PPM()
ppm.train(text: "hello world hello everyone")

// Autocorrect the misspelled word "helo" given the context "hel"
let corrections = ppm.autocorrect(word: "helo", context: "hel", topN: 3)

print("Autocorrection suggestions:")
corrections.forEach { suggestion in
    print("Word: \(suggestion.0), Likelihood: \(suggestion.1)")
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Foundation | |
/// A single node of the character trie used by `PPM`.
class Node {
    /// Child nodes, keyed by the next character of a word.
    var children = [Character: Node]()
    /// Counter that training increments each time a word passes through or ends at this node.
    var frequency = 0
    /// Occurrence counts of the words recorded at this node during training.
    var words = [String: Int]()
}
/// A character-level trie over whitespace-separated words that predicts the
/// next letter and likely word completions for a typed prefix.
class PPM {
    /// Root of the trie; represents the empty context.
    let root = Node()

    /// Trains the model from a UTF-8 encoded text file.
    ///
    /// Read failures are reported with `print` and otherwise ignored, so the
    /// model is simply left unchanged when the file cannot be loaded.
    ///
    /// - Parameter fileURL: Location of the training text.
    func train(fromFile fileURL: URL) {
        do {
            let text = try String(contentsOf: fileURL, encoding: .utf8)
            train(text: text)
        } catch {
            print("Error reading file: \(error)")
        }
    }

    /// Trains the model from an in-memory string.
    ///
    /// The text is split on any whitespace (spaces, tabs, newlines); each word
    /// is inserted into the trie character by character.
    ///
    /// - Parameter text: Training text.
    func train(text: String) {
        // Splitting only on " " would glue together words separated by newlines.
        for word in text.split(whereSeparator: { $0.isWhitespace }) {
            var currentNode = root
            for char in word {
                // Count this word as passing through the node before descending.
                currentNode.frequency += 1
                let nextNode = currentNode.children[char] ?? Node()
                currentNode.children[char] = nextNode
                currentNode = nextNode
            }
            // The terminal node counts the word and records its spelling.
            currentNode.frequency += 1
            currentNode.words[String(word), default: 0] += 1
        }
    }

    /// Predicts what follows the given prefix.
    ///
    /// - Parameters:
    ///   - context: The characters typed so far, matched from the start of a word.
    ///   - topN: Maximum number of letters and of words to return.
    /// - Returns: `letters` holds the most frequent next characters and `words`
    ///   the most frequent trained words starting with `context`, each paired
    ///   with its share of the context node's frequency. Both are empty when
    ///   `context` was never seen as a prefix.
    func predict(context: String, topN: Int) -> (letters: [(Character, Double)], words: [(String, Double)]) {
        var currentNode = root
        // Walk front-to-back: training inserts words in reading order, so the
        // context must be followed in the same direction.
        for char in context {
            guard let nextNode = currentNode.children[char] else {
                return (letters: [], words: [])
            }
            currentNode = nextNode
        }
        let topLetters = mostFrequentChildren(of: currentNode, topN: topN)
        let topWords = mostFrequentWords(of: currentNode, topN: topN)
        return (letters: topLetters, words: topWords)
    }

    /// Returns up to `topN` child characters of `node`, most frequent first,
    /// each with its frequency divided by the node's own frequency.
    private func mostFrequentChildren(of node: Node, topN: Int) -> [(Character, Double)] {
        let totalFrequency = Double(node.frequency)
        // Avoid dividing by zero on an untrained node and trapping in `prefix`
        // on a negative count.
        guard topN > 0, totalFrequency > 0 else { return [] }

        return node.children
            .sorted { lhs, rhs in
                // Break ties by character so results do not depend on dictionary order.
                lhs.value.frequency != rhs.value.frequency
                    ? lhs.value.frequency > rhs.value.frequency
                    : lhs.key < rhs.key
            }
            .prefix(topN)
            .map { ($0.key, Double($0.value.frequency) / totalFrequency) }
    }

    /// Returns up to `topN` words recorded at `node` or anywhere below it, most
    /// frequent first, each with its count divided by the node's frequency.
    private func mostFrequentWords(of node: Node, topN: Int) -> [(String, Double)] {
        let totalFrequency = Double(node.frequency)
        guard topN > 0, totalFrequency > 0 else { return [] }

        // Words are stored on the node where they end, so completions of a
        // prefix live in the subtree rather than on the prefix node itself.
        var counts: [String: Int] = [:]
        var pending: [Node] = [node]
        while let current = pending.popLast() {
            for (word, count) in current.words {
                counts[word, default: 0] += count
            }
            pending.append(contentsOf: current.children.values)
        }

        return counts
            .sorted { lhs, rhs in
                // Break ties alphabetically for deterministic output.
                lhs.value != rhs.value ? lhs.value > rhs.value : lhs.key < rhs.key
            }
            .prefix(topN)
            .map { ($0.key, Double($0.value) / totalFrequency) }
    }
}
// Usage
let ppm = PPM()

// Replace this path with the actual location of your training text.
// `URL(fileURLWithPath:)` is required here: `URL(string:)` on a bare path
// produces a URL without the `file` scheme, which cannot be read from disk.
ppm.train(fromFile: URL(fileURLWithPath: "path/to/your/text/file.txt"))

let (topLetters, topWords) = ppm.predict(context: "hell", topN: 6)

print("Top letter predictions:")
for (char, likelihood) in topLetters {
    print("Next letter: \(char), Likelihood: \(likelihood)")
}

print("\nTop word predictions:")
for (word, likelihood) in topWords {
    print("Next word: \(word), Likelihood: \(likelihood)")
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment