Created
November 9, 2021 21:09
-
-
Save profh/479d63b1dfcb5a002c19b74daf323676 to your computer and use it in GitHub Desktop.
Contents of playground for NLP example (parts of speech and language recognition)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Playground using NSLinguisticTagger to analyze and tag a block of text | |
// https://developer.apple.com/documentation/foundation/nslinguistictagger/identifying_parts_of_speech | |
import Foundation | |
// Sample passage: excerpt from an article on Democratic candidate preferences (Nov 2018).
let text = "The Democratic frontrunner, according to Politico's poll: Joe Biden, former Vice President and Senator from Delaware, who managed to grab just over a quarter (26%) of the Democrats' vote for who they'd most like to see facing off against Trump in two years for control of the White House. The runner-up is Vermont Senator Bernie Sanders, who ran a close primary campaign against Hillary Clinton in 2016, managing to get about a fifth of the votes (19%). The third-place candidate is Rep. Beto O’Rourke from Texas, who built national name-recognition through his losing Senate bid last week, with 8 percent. Following O’Rourke are three senators, all thought to be likely candidates: Sens. Elizabeth Warren (Mass.) at 5 percent, Kamala Harris (Calif.) at 4 percent and Cory Booker (N.J.) at 3 percent."
// Carve out the first sentence: everything up to and including the first period.
// The force-unwrap is acceptable only because the literal above contains a ".".
let endOfSentence = text.firstIndex(where: { $0 == "." })!
let sentence = text.prefix(through: endOfSentence)
// Create a tagger that only needs the lexical-class scheme (noun, verb, adjective, ...).
let tagger = NSLinguisticTagger(tagSchemes: [.lexicalClass], options: 0)
// The tagger is handed the whole passage; `range` below limits what gets enumerated.
tagger.string = text
// Restrict the analysis to the first sentence. The range is derived from `sentence`
// instead of a hand-counted length, so it stays correct if `text` is edited and is
// expressed in the UTF-16 offsets that NSRange requires.
let range = NSRange(sentence.startIndex..<sentence.endIndex, in: text)
// let range = NSRange(text.startIndex..<text.endIndex, in: text) // the entire block
// Tagger options (pretty typical choices): skip punctuation and whitespace tokens,
// and join multi-word names into a single token.
let options: NSLinguisticTagger.Options = [.omitPunctuation, .omitWhitespace, .joinNames]
// The lexical classes of interest, plus one results array per class.
// Other tag options: https://developer.apple.com/documentation/foundation/nslinguistictag
let tags: [NSLinguisticTag] = [.noun, .verb, .adjective]
var nouns: [String] = []
var verbs: [String] = []
var adjs: [String] = []
// Core of the example: visit each word in `range` and file it under its lexical class.
tagger.enumerateTags(in: range, unit: .word, scheme: .lexicalClass, options: options) { lexicalClass, wordRange, _ in
    // Tokens that come back without a tag are skipped.
    guard let lexicalClass = lexicalClass else { return }
    // `wordRange` is an NSRange, so bridge to NSString to slice out the word.
    let word = (text as NSString).substring(with: wordRange)
    // Only nouns, verbs and adjectives are collected; every other class is ignored.
    if lexicalClass == .noun {
        nouns.append(word)
    } else if lexicalClass == .verb {
        verbs.append(word)
    } else if lexicalClass == .adjective {
        adjs.append(word)
    }
}
// Print each results array under its heading, with a divider between sections.
print("NOUNS:")
nouns.forEach { print($0) }
print("--------")
print("VERBS:")
verbs.forEach { print($0) }
print("--------")
print("ADJECTIVES:")
adjs.forEach { print($0) }
//** Adding language detection... **//
print("--------")
// Check the dominant language of the text currently set on the tagger (expected: English).
// `dominantLanguage` is optional, so fall back to "und" (the BCP-47 tag for an
// undetermined language) rather than force-unwrapping and risking a crash.
let language = tagger.dominantLanguage
print("The language is \(language ?? "und")")
/// Prints the dominant language detected for `text`, so the check can be reused.
///
/// Side effect: replaces the string held by the shared `tagger`.
///
/// - Parameter text: The text whose language should be identified.
func determineLanguage(for text: String) {
    tagger.string = text
    // `dominantLanguage` is optional; report "und" (BCP-47 for an undetermined
    // language) instead of crashing on a force-unwrap when nothing is returned.
    let language = tagger.dominantLanguage ?? "und"
    print("The language is \(language)")
}
// Sample phrases in several languages, plus a few inputs that are not real languages.
let frenchQuote = "La science n'a pas de patrie."  // Science has no homeland -- Pasteur
let germanQuote = "Das ist nicht mein Bier."  // That's not my beer
let italianQuote = "L’amore è cieco"  // Love is blind
let spanishQuote = "El amor todo lo puede."  // Love will find a way
let klingonQuote = "Heghlu'meH QaQ jajvam"  // It is a good day to die
let loremQuote = "Lorem ipsum dolor sit amet"
let gibberish = "asdf plmjus qawsedrf"

// Run every sample through the detector, in order; observed results are noted inline.
for sample in [
    frenchQuote,
    germanQuote,
    italianQuote,
    spanishQuote,
    klingonQuote,  // thinks it's Croatian, not Klingon
    loremQuote,  // thinks it's Romanian
    gibberish,  // that's English too...
    "x",  // this is undefined
] {
    determineLanguage(for: sample)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment