Created
September 12, 2018 13:01
-
-
Save ironhouzi/77c8a44edf7533dce77263b1e255c673 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
pub mod table; | |
use std::collections::HashMap; | |
use std::char; | |
#[cfg(test)]
mod tests {
    use super::{default_parse, get_root, maybe_sanskrit, tokenize};
    // use super::to_unicode;
    // use super::vowel_indices;
    // use super::analyze_root;
    // use super::Letter;
    use super::{LetterType, Slice, Word};
    use table;

    // #[test]
    // fn test_root_analyzer() {
    //     let parts = tokenize("a", &table::W_SORTED_ALPHABET);
    //     assert_eq!(analyze_root("a", parts, 0), vec![Syllable::Root])
    // }

    /// Each Wylie syllable must be classified into the expected letter roles.
    #[test]
    fn test_get_root() {
        let cases = vec![
            ("a", vec![LetterType::Root]),
            ("ba", vec![LetterType::Root, LetterType::Vowel]),
            (
                "bya",
                vec![LetterType::Root, LetterType::Subjoined, LetterType::Vowel],
            ),
            (
                "rja",
                vec![LetterType::Super, LetterType::Root, LetterType::Vowel],
            ),
            (
                "g.ya",
                vec![LetterType::Prefix, LetterType::Root, LetterType::Vowel],
            ),
            (
                "dba",
                vec![LetterType::Prefix, LetterType::Root, LetterType::Vowel],
            ),
            (
                "srwa",
                vec![
                    LetterType::Root,
                    LetterType::Subjoined,
                    LetterType::Subjoined2,
                    LetterType::Vowel,
                ],
            ),
            (
                "bsnga",
                vec![
                    LetterType::Prefix,
                    LetterType::Super,
                    LetterType::Root,
                    LetterType::Vowel,
                ],
            ),
            (
                "dbya",
                vec![
                    LetterType::Prefix,
                    LetterType::Root,
                    LetterType::Subjoined,
                    LetterType::Vowel,
                ],
            ),
            (
                "skya",
                vec![
                    LetterType::Super,
                    LetterType::Root,
                    LetterType::Subjoined,
                    LetterType::Vowel,
                ],
            ),
            (
                "bskya",
                vec![
                    LetterType::Prefix,
                    LetterType::Super,
                    LetterType::Root,
                    LetterType::Subjoined,
                    LetterType::Vowel,
                ],
            ),
        ];
        for (syllable, expected) in cases {
            assert_eq!(get_root(&default_parse(syllable)), expected);
        }
    }

    /// `Word::letter` must hand back the text of each tokenized letter.
    #[test]
    fn test_letter_slice() {
        let word = tokenize("sangs", &table::W_SORTED_ALPHABET);
        assert_eq!(word.vowels[0], 1);
        let expected = ["s", "a", "ng", "s"];
        for (index, letter) in expected.iter().enumerate() {
            assert_eq!(word.letter(index), *letter);
        }
    }

    /// Tokenizing must yield the expected spans and vowel positions.
    #[test]
    fn test_letter_partition() {
        // Builds `Slice` values from `(start, length)` pairs.
        fn spans(pairs: &[(usize, usize)]) -> Vec<Slice> {
            pairs
                .iter()
                .map(|&(start, length)| Slice { i: start, len: length })
                .collect()
        }

        let cases = vec![
            ("sangs", vec![1], spans(&[(0, 1), (1, 1), (2, 2), (4, 1)])),
            // vec!["'", "tsh", "a", "g", "s"]
            (
                "'tshags",
                vec![2],
                spans(&[(0, 1), (1, 3), (4, 1), (5, 1), (6, 1)]),
            ),
            ("g.yag", vec![2], spans(&[(0, 2), (2, 1), (3, 1), (4, 1)])),
        ];
        for (text, vowels, letters) in cases {
            assert_eq!(
                tokenize(text, &table::W_SORTED_ALPHABET),
                Word {
                    string: text,
                    vowels: vowels,
                    letters: letters,
                }
            );
        }
    }

    /// Every sample below must be flagged as possibly Sanskrit.
    #[test]
    fn test_quickcheck() {
        let sanskrit: [&'static str; 6] = ["sarva", "ai", "au", "akṣye", "vajra", "kyai"];
        assert!(sanskrit.iter().all(|sample| maybe_sanskrit(sample)));
    }

    // #[test]
    // fn test_unicode() {
    //     // assert_eq!(to_unicode("e", LetterType::Root), Some('a'));
    //     let s = to_unicode("s", LetterType::Root);
    //     assert_eq!(s, Some('a'));
    // }
}
// // #[derive(Copy, Clone, Debug, PartialEq)] | |
// #[derive(Debug, PartialEq)] | |
// enum Letter { | |
// Consonant, | |
// Vowel | |
// } | |
// #[derive(Copy, Clone, Debug, PartialEq)] | |
/// Role a letter plays inside the leading (root) part of a syllable.
///
/// The enum carries no data, so it is `Copy`; values can be compared and
/// reused freely without moving them.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum LetterType {
    /// The vowel that terminates the root cluster.
    Vowel,
    /// A letter written before the root stack.
    Prefix,
    /// A letter stacked above the root.
    Super,
    /// The root letter itself.
    Root,
    /// A letter stacked below the root.
    Subjoined,
    /// A second letter stacked below the root.
    Subjoined2,
    // Not implemented yet:
    // Suffix,
    // Suffix2,
    // Genitive,
    // GenVowel
}
/// A span inside a string: `len` bytes starting at byte offset `i`.
///
/// It is just two `usize` values, so it is `Copy` and cheap to pass by value.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct Slice {
    /// Byte offset of the first byte of the span.
    i: usize,
    /// Length of the span in bytes.
    len: usize,
}
/// A letter span paired with the role assigned to it.
///
/// NOTE(review): nothing in the visible code constructs this type yet.
#[derive(Debug, PartialEq)]
struct Letter {
    // Location of the letter's text.
    slice: Slice,
    // Role of the letter.
    category: LetterType
}
/// Bookkeeping for a word: a list of indices plus the letter spans.
///
/// NOTE(review): nothing in the visible code constructs this type, so the
/// exact meaning of `root` is unconfirmed.
#[derive(Debug, PartialEq)]
struct WordInfo {
    // Presumably positions of the letters that make up the root; verify.
    root: Vec<usize>,
    // Spans of the individual letters.
    letters: Vec<Slice>
}
/// A tokenized word: the borrowed source text plus the spans of its letters.
#[derive(Debug, PartialEq)]
struct Word<'a> {
    // The original text that `letters` indexes into.
    string: &'a str,
    // Positions within `letters` (not byte offsets) of the letters that
    // `tokenize` classified as vowels, in order of appearance.
    vowels: Vec<usize>,
    // Byte spans of the letters, in order.
    letters: Vec<Slice>
}
/// A tokenized word together with the roles computed for its leading letters.
#[derive(Debug, PartialEq)]
struct ParsedWord<'a> {
    // The tokenized word.
    word: Word<'a>,
    // Letter roles as returned by `get_root` for `word`.
    structure: Vec<LetterType>
}
// impl Letter { | |
// pub fn str(self) -> &str { | |
// self | |
// } | |
// } | |
impl<'a> Word<'a> { | |
pub fn letter(&'a self, index: usize) -> &'a str { | |
w_letter(self.string, &self.letters[index]) | |
} | |
// pub fn to_unicode(self) -> String { | |
// // self.letters.map(|l| l.to_unicode()).collect(); | |
// "".to_string() | |
// } | |
} | |
impl<'a> IntoIterator for Word<'a> { | |
type Item = &'a str; | |
type IntoIter = ::std::vec::IntoIter<&'a str>; | |
fn into_iter(self) -> Self::IntoIter { | |
self.letters.iter().enumerate().map(|(i, _)| self.letter(i)) | |
} | |
} | |
// impl<'a> Iterator for ParsedWord<'a> { | |
// type Item = (&'a str, LetterType); | |
// fn next(&mut self) -> Option<(&'a str, LetterType)> { | |
// Some(("s", LetterType::Root)) | |
// // word.word.letters.iter().zip(word.structure) | |
// } | |
// } | |
// // TODO: conjoin neighbouring vowels to count as one vowel.. | |
// fn vowel_indices(string: &str, vowels: &[char]) -> Vec<usize> { | |
// let indices: Vec<usize> = string.chars() | |
// .enumerate() | |
// .filter(|&(_, c)| vowels.contains(&c)) | |
// .map(|(i, _)| i) | |
// .collect(); | |
// indices | |
// } | |
// fn letter(string: &str, slice: (usize, usize)) -> &str { | |
// &string[slice.0..slice.0+slice.1] | |
// } | |
// fn analyze_root<'a>(string: &str, parts: &'a Vec<Letter>) -> Vec<Letter> { | |
// } | |
// fn get_root(string: &str, vowel_indices: Vec<usize>, slices: Vec<Slice>) -> Vec<LetterType> { | |
fn get_root<'a>(word: &'a Word) -> Vec<LetterType> { | |
let mut result: Vec<LetterType> = Vec::new(); | |
if word.vowels[0] == 0 { | |
result.push(LetterType::Root); | |
return result; | |
} else if word.vowels[0] == 1 { | |
if table::W_CONSONANTS.contains(&word.letter(0)) { | |
result.push(LetterType::Root); | |
} // TODO: raise error on else | |
} else if word.vowels[0] == 2 { | |
if is_subscribed(&word) { | |
result.push(LetterType::Root); | |
result.push(LetterType::Subjoined); | |
} else if is_superscribed(&word) { | |
result.push(LetterType::Super); | |
result.push(LetterType::Root); | |
} else if table::PREFIXES.contains(&word.letter(0)) | |
&& table::W_CONSONANTS.contains(&word.letter(1)) { | |
result.push(LetterType::Prefix); | |
result.push(LetterType::Root); | |
} | |
} else if word.vowels[0] == 3 { | |
if word.letter(2) == "w" && word.letter(1) == "r" { | |
result.push(LetterType::Root); | |
result.push(LetterType::Subjoined); | |
result.push(LetterType::Subjoined2); | |
} else if is_superscribed(&word) { | |
result.push(LetterType::Prefix); | |
result.push(LetterType::Super); | |
result.push(LetterType::Root); | |
} else if is_subscribed(&word) { | |
result.push(LetterType::Prefix); | |
result.push(LetterType::Root); | |
result.push(LetterType::Subjoined); | |
} else if table::SUPERJOINED.contains(&word.letter(0)) | |
&& table::W_CONSONANTS.contains(&word.letter(1)) | |
&& table::SUBJOINED.contains(&word.letter(2)) { | |
result.push(LetterType::Super); | |
result.push(LetterType::Root); | |
result.push(LetterType::Subjoined); | |
} | |
} else if word.vowels[0] == 4 { | |
if !(table::PREFIXES.contains(&word.letter(0)) | |
&& table::SUPERJOINED.contains(&word.letter(1)) | |
&& table::W_CONSONANTS.contains(&word.letter(2)) | |
&& table::SUBJOINED.contains(&word.letter(3))) { | |
// TODO raise error! | |
} | |
result.push(LetterType::Prefix); | |
result.push(LetterType::Super); | |
result.push(LetterType::Root); | |
result.push(LetterType::Subjoined); | |
} | |
result.push(LetterType::Vowel); | |
result | |
} | |
fn generate_lookup<'a, 'b>() -> HashMap<&'b&'a str, &'b&'a str> { | |
let mut lookup = HashMap::new(); | |
for (l, u) in table::W_CONSONANTS.iter().chain(table::W_VOWELSS.iter()).zip( | |
table::U_CONSONANTS.iter().chain(table::U_VOWELS.iter())) { | |
lookup.insert(l, u); | |
} | |
lookup | |
} | |
fn foo<'a, 'b>() -> HashMap<&'b &'a str, &'b u32> { | |
let mut lookup = HashMap::new(); | |
for (l, u) in table::W_CONSONANTS.iter().chain(table::W_VOWELSS.iter()).zip( | |
table::U_CONSONANTSI.iter().chain(table::U_VOWELSI.iter())) { | |
lookup.insert(l, u); | |
} | |
lookup | |
} | |
fn create_parsed_word<'a>(string: &'a str) -> ParsedWord<'a> { | |
let word = default_parse(string); | |
let structure = get_root(&word); | |
ParsedWord {word: word, structure: structure} | |
} | |
/// Shifts a code point up by `0x50` and converts the result to a `char`.
///
/// Returns `None` when no code point is given, when adding the offset would
/// overflow `u32` (this used to panic in debug builds), or when the shifted
/// value is not a valid Unicode scalar value.
fn subjoin_unicode<'a, 'b>(codepoint: Option<&'a &'b u32>) -> Option<char>{
    codepoint
        .and_then(|cp| cp.checked_add(0x50))
        .and_then(char::from_u32)
}
// fn to_unicode<'a>(word: ParsedWord) -> Option<String> { | |
// // let lookup = generate_lookup(); | |
// let lookup = foo(); | |
// let mut result: Vec<char> = Vec::new(); | |
// for (letter, letter_type) in word.word.letters.iter().zip(word.structure) { | |
// if letter == "a" && letter_type == LetterType::Root { | |
// continue | |
// } | |
// if table::W_VOWELSS.contains(letter) && letter_type == LetterType::Root { | |
// match lookup.get(letter) { | |
// Some(l) => result.push(l), | |
// None => return None | |
// } | |
// } | |
// } | |
// Some("a".to_string()) | |
// } | |
fn w_letter<'a>(string: &'a str, slice: &'a Slice) -> &'a str { | |
&string[slice.i..slice.i+slice.len] | |
} | |
fn default_parse<'a>(string: &'a str) -> Word<'a> { | |
tokenize(&string, &table::W_SORTED_ALPHABET) | |
} | |
fn tokenize<'a>(string: &'a str, alphabet: &[&'static str]) -> Word<'a> { | |
let mut result: Vec<Slice> = Vec::new(); | |
let mut vowel_indices: Vec<usize> = Vec::new(); | |
let mut progress = 0; | |
while progress < string.len() { | |
for (i, letter) in alphabet.iter().enumerate() { | |
let slice = &string[progress..]; | |
let g_prefix_edge_case = slice.starts_with("g."); | |
if !(g_prefix_edge_case || slice.starts_with(letter)) { | |
if i == alphabet.len() - 1 { | |
// TODO: raise exception invalid tibetan character! | |
progress = string.len(); | |
} | |
continue; | |
} | |
let letter_length = | |
if g_prefix_edge_case { | |
2 | |
} else { | |
letter.len() | |
}; | |
result.push(Slice{i: progress, len: letter_length}); | |
if i >= table::W_SORTED_ALPHABET.len() - table::TIBETAN_VOWELS.len() { | |
vowel_indices.push(result.len() - 1) | |
} | |
progress += letter_length; | |
break; | |
} | |
} | |
Word {string: string, vowels: vowel_indices, letters: result} | |
} | |
fn maybe_sanskrit(string: &str) -> bool { | |
if string.len() == 3 && table::S_DOUBLE_CONSONANTS.contains(&&string[0..2]) { | |
return true; | |
} | |
for r in &table::S_BASIC_RULES { | |
if string.starts_with(r) { | |
return true; | |
} | |
} | |
if string.contains("ai") || string.contains("au") { | |
return true; | |
} | |
let mut vowel_count = 0; | |
for v in &table::TIBETAN_VOWELS { | |
let m: Vec<&str> = string.matches(&v.to_string()).collect(); | |
vowel_count += m.len(); | |
} | |
// achung | |
!string.contains(table::W_CONSONANTS[22]) && vowel_count > 1 | |
} | |
// fn is_subscribed(string: &str, vowel_index: usize, slices: &Vec<Slice>) -> bool { | |
fn is_subscribed(word: &Word) -> bool { | |
if word.vowels[0] == 2 { | |
!valid_superscribe(word.letter(0), word.letter(1)) | |
&& valid_subscribe(word.letter(0), word.letter(1)) | |
} else { // vowel_index == 3 | |
table::PREFIXES.contains(&word.letter(0)) | |
&& !valid_superscribe(word.letter(1), word.letter(2)) | |
&& valid_subscribe(word.letter(1), word.letter(2)) | |
} | |
} | |
// fn is_superscribed(string: &str, vowel_index: usize, slices: &Vec<Slice>) -> bool { | |
fn is_superscribed(word: &Word) -> bool { | |
if word.vowels[0] == 2 { | |
valid_superscribe(word.letter(0), word.letter(1)) | |
&& !valid_subscribe(word.letter(0), word.letter(1)) | |
} else { // vowel_index == 3 | |
table::PREFIXES.contains(&word.letter(0)) | |
&& valid_superscribe(word.letter(1), word.letter(2)) | |
&& !valid_subscribe(word.letter(1), word.letter(2)) | |
} | |
} | |
fn valid_superscribe(head_letter: &str, root_letter: &str) -> bool { | |
table::SUPERJOINED.contains(&head_letter) | |
&& table::SUPERJOINABLE.contains(&root_letter) | |
} | |
fn valid_subscribe(root_letter: &str, subjoined_letter: &str) -> bool { | |
table::SUBJOINED.contains(&subjoined_letter) | |
&& table::SUBJOINABLE.contains(&root_letter) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment