Created
September 15, 2018 11:44
-
-
Save ironhouzi/195b04a20218720487c6215961a001c3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
pub mod table; | |
use std::collections::HashMap; | |
use std::char; | |
#[cfg(test)]
mod tests {
    // `super::table` resolves the same way in every edition and no matter
    // where the parent module is mounted; the bare `use table;` only worked
    // for 2015-edition code living at the crate root.
    use super::table;
    use super::{default_parse, get_root, maybe_sanskrit, tokenize};
    use super::{LetterType, Slice, Word};

    // TODO: add tests for `to_unicode` and `analyze_root` once they exist.

    /// Builds the `Word` that `tokenize` is expected to return, from
    /// `(offset, length)` pairs.
    fn expected_word<'a>(
        string: &'a str,
        vowels: Vec<usize>,
        slices: &[(usize, usize)],
    ) -> Word<'a> {
        Word {
            string: string,
            vowels: vowels,
            letters: slices
                .iter()
                .map(|&(i, len)| Slice { i: i, len: len })
                .collect(),
            index: 0,
        }
    }

    /// Root structures for stacks of increasing complexity.
    #[test]
    fn test_get_root() {
        let cases: Vec<(&str, Vec<LetterType>)> = vec![
            ("a", vec![LetterType::Root]),
            ("ba", vec![LetterType::Root, LetterType::Vowel]),
            (
                "bya",
                vec![LetterType::Root, LetterType::Subjoined, LetterType::Vowel],
            ),
            (
                "rja",
                vec![LetterType::Super, LetterType::Root, LetterType::Vowel],
            ),
            (
                "g.ya",
                vec![LetterType::Prefix, LetterType::Root, LetterType::Vowel],
            ),
            (
                "dba",
                vec![LetterType::Prefix, LetterType::Root, LetterType::Vowel],
            ),
            (
                "srwa",
                vec![
                    LetterType::Root,
                    LetterType::Subjoined,
                    LetterType::Subjoined2,
                    LetterType::Vowel,
                ],
            ),
            (
                "bsnga",
                vec![
                    LetterType::Prefix,
                    LetterType::Super,
                    LetterType::Root,
                    LetterType::Vowel,
                ],
            ),
            (
                "dbya",
                vec![
                    LetterType::Prefix,
                    LetterType::Root,
                    LetterType::Subjoined,
                    LetterType::Vowel,
                ],
            ),
            (
                "skya",
                vec![
                    LetterType::Super,
                    LetterType::Root,
                    LetterType::Subjoined,
                    LetterType::Vowel,
                ],
            ),
            (
                "bskya",
                vec![
                    LetterType::Prefix,
                    LetterType::Super,
                    LetterType::Root,
                    LetterType::Subjoined,
                    LetterType::Vowel,
                ],
            ),
        ];

        for (input, expected) in cases {
            assert_eq!(
                get_root(&default_parse(input)),
                expected,
                "root structure of `{}`",
                input
            );
        }
    }

    /// `Word::letter` resolves slices back to their text.
    #[test]
    fn test_letter_slice() {
        let s = "sangs";
        let w = tokenize(s, &table::W_SORTED_ALPHABET);
        assert_eq!(w.vowels[0], 1);
        assert_eq!(w.letter(0), "s");
        assert_eq!(w.letter(1), "a");
        assert_eq!(w.letter(2), "ng");
        assert_eq!(w.letter(3), "s");
    }

    /// Multi-character letters and the `g.` prefix are cut as single letters.
    #[test]
    fn test_letter_partition() {
        let w = "sangs";
        assert_eq!(
            tokenize(w, &table::W_SORTED_ALPHABET),
            expected_word(w, vec![1], &[(0, 1), (1, 1), (2, 2), (4, 1)])
        );

        // "'", "tsh", "a", "g", "s"
        let w = "'tshags";
        assert_eq!(
            tokenize(w, &table::W_SORTED_ALPHABET),
            expected_word(w, vec![2], &[(0, 1), (1, 3), (4, 1), (5, 1), (6, 1)])
        );

        // "g.", "y", "a", "g"
        let w = "g.yag";
        assert_eq!(
            tokenize(w, &table::W_SORTED_ALPHABET),
            expected_word(w, vec![2], &[(0, 2), (2, 1), (3, 1), (4, 1)])
        );
    }

    /// Words that the heuristic must classify as Sanskrit.
    #[test]
    fn test_quickcheck() {
        let sanskrit: [&'static str; 6] = ["sarva", "ai", "au", "akṣye", "vajra", "kyai"];
        for s in &sanskrit {
            assert!(maybe_sanskrit(s), "`{}` should look like Sanskrit", s);
        }
    }

    /// Iterating a `Word` yields the text of each letter in order.
    #[test]
    fn test_word_iterates_letters() {
        let letters: Vec<&str> = tokenize("g.yag", &table::W_SORTED_ALPHABET).collect();
        assert_eq!(letters, vec!["g.", "y", "a", "g"]);
    }
}
/// Role a letter plays inside a syllable's root stack.
///
/// `get_root` emits these in the order prefix, superscript, root,
/// subscript(s), vowel. The enum is a plain tag, so it is `Copy`: that is
/// what lets `ParsedWord`'s iterator hand a value out of its
/// `Vec<LetterType>` by index.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum LetterType {
    Vowel,
    Prefix,
    Super,
    Root,
    Subjoined,
    Subjoined2,
    // Planned but not produced yet: Suffix, Suffix2, Genitive, GenVowel.
}
/// Byte range of a single letter inside the string being parsed.
#[derive(PartialEq, Debug)]
struct Slice {
    /// Byte offset at which the letter starts.
    i: usize,

    /// Length of the letter in bytes.
    len: usize,
}
#[derive(Debug, PartialEq)] | |
struct Letter { | |
slice: Slice, | |
category: LetterType | |
} | |
#[derive(Debug, PartialEq)] | |
struct WordInfo { | |
root: Vec<usize>, | |
letters: Vec<Slice> | |
} | |
#[derive(Debug, PartialEq)] | |
struct Word<'a> { | |
string: &'a str, | |
vowels: Vec<usize>, | |
letters: Vec<Slice>, | |
index: usize | |
} | |
#[derive(Debug, PartialEq)] | |
struct ParsedWord<'a> { | |
letters: Vec<&'a str>, | |
structure: Vec<LetterType>, | |
index: usize | |
} | |
impl<'a> Word<'a> { | |
pub fn letter(&'a self, index: usize) -> &'a str { | |
w_letter(self.string, &self.letters[index]) | |
} | |
// pub fn to_unicode(self) -> String { | |
// // self.letters.map(|l| l.to_unicode()).collect(); | |
// "".to_string() | |
// } | |
} | |
impl<'a> Iterator for ParsedWord<'a> { | |
type Item = (&'a str, LetterType); | |
fn next(&mut self) -> Option<Self::Item> { | |
if self.index >= self.letters.len() { | |
return None | |
} | |
let result = (self.letters[self.index], self.structure[self.index]); | |
self.index += 1; | |
Some(result) | |
} | |
} | |
impl<'a> Iterator for Word<'a> { | |
type Item = &'a str; | |
fn next(&mut self) -> Option<Self::Item> { | |
if self.index >= self.letters.len() { | |
return None | |
} | |
let slice = &self.letters[self.index]; | |
let result = &self.string[slice.i..slice.i+slice.len]; | |
self.index += 1; | |
Some(result) | |
} | |
} | |
// impl<'a> Iterator for Word<'a> { | |
// type Item = &'a str; | |
// fn next(&mut self) -> Option<Self::Item> { | |
// if self.index >= self.letters.len() { | |
// return None | |
// } | |
// let result = self.letter(self.index); | |
// self.index += 1; | |
// Some(result) | |
// } | |
// } | |
// impl<'a> Iterator for ParsedWord<'a> { | |
// type Item = (&'a str, LetterType); | |
// fn next(&mut self) -> Option<(&'a str, LetterType)> { | |
// Some(("s", LetterType::Root)) | |
// // word.word.letters.iter().zip(word.structure) | |
// } | |
// } | |
// // TODO: conjoin neighbouring vowels to count as one vowel.. | |
// fn vowel_indices(string: &str, vowels: &[char]) -> Vec<usize> { | |
// let indices: Vec<usize> = string.chars() | |
// .enumerate() | |
// .filter(|&(_, c)| vowels.contains(&c)) | |
// .map(|(i, _)| i) | |
// .collect(); | |
// indices | |
// } | |
// fn letter(string: &str, slice: (usize, usize)) -> &str { | |
// &string[slice.0..slice.0+slice.1] | |
// } | |
// fn analyze_root<'a>(string: &str, parts: &'a Vec<Letter>) -> Vec<Letter> { | |
// } | |
// fn get_root(string: &str, vowel_indices: Vec<usize>, slices: Vec<Slice>) -> Vec<LetterType> { | |
fn get_root<'a>(word: &'a Word) -> Vec<LetterType> { | |
let mut result: Vec<LetterType> = Vec::new(); | |
if word.vowels[0] == 0 { | |
result.push(LetterType::Root); | |
return result; | |
} else if word.vowels[0] == 1 { | |
if table::W_CONSONANTS.contains(&word.letter(0)) { | |
result.push(LetterType::Root); | |
} // TODO: raise error on else | |
} else if word.vowels[0] == 2 { | |
if is_subscribed(&word) { | |
result.push(LetterType::Root); | |
result.push(LetterType::Subjoined); | |
} else if is_superscribed(&word) { | |
result.push(LetterType::Super); | |
result.push(LetterType::Root); | |
} else if table::PREFIXES.contains(&word.letter(0)) | |
&& table::W_CONSONANTS.contains(&word.letter(1)) { | |
result.push(LetterType::Prefix); | |
result.push(LetterType::Root); | |
} | |
} else if word.vowels[0] == 3 { | |
if word.letter(2) == "w" && word.letter(1) == "r" { | |
result.push(LetterType::Root); | |
result.push(LetterType::Subjoined); | |
result.push(LetterType::Subjoined2); | |
} else if is_superscribed(&word) { | |
result.push(LetterType::Prefix); | |
result.push(LetterType::Super); | |
result.push(LetterType::Root); | |
} else if is_subscribed(&word) { | |
result.push(LetterType::Prefix); | |
result.push(LetterType::Root); | |
result.push(LetterType::Subjoined); | |
} else if table::SUPERJOINED.contains(&word.letter(0)) | |
&& table::W_CONSONANTS.contains(&word.letter(1)) | |
&& table::SUBJOINED.contains(&word.letter(2)) { | |
result.push(LetterType::Super); | |
result.push(LetterType::Root); | |
result.push(LetterType::Subjoined); | |
} | |
} else if word.vowels[0] == 4 { | |
if !(table::PREFIXES.contains(&word.letter(0)) | |
&& table::SUPERJOINED.contains(&word.letter(1)) | |
&& table::W_CONSONANTS.contains(&word.letter(2)) | |
&& table::SUBJOINED.contains(&word.letter(3))) { | |
// TODO raise error! | |
} | |
result.push(LetterType::Prefix); | |
result.push(LetterType::Super); | |
result.push(LetterType::Root); | |
result.push(LetterType::Subjoined); | |
} | |
result.push(LetterType::Vowel); | |
result | |
} | |
fn generate_lookup<'a, 'b>() -> HashMap<&'b&'a str, &'b&'a str> { | |
let mut lookup = HashMap::new(); | |
for (l, u) in table::W_CONSONANTS.iter().chain(table::W_VOWELSS.iter()).zip( | |
table::U_CONSONANTS.iter().chain(table::U_VOWELS.iter())) { | |
lookup.insert(l, u); | |
} | |
lookup | |
} | |
fn foo<'a, 'b>() -> HashMap<&'b &'a str, &'b u32> { | |
let mut lookup = HashMap::new(); | |
for (l, u) in table::W_CONSONANTS.iter().chain(table::W_VOWELSS.iter()).zip( | |
table::U_CONSONANTSI.iter().chain(table::U_VOWELSI.iter())) { | |
lookup.insert(l, u); | |
} | |
lookup | |
} | |
fn create_parsed_word<'a>(string: &'a str) -> ParsedWord<'a> { | |
let word = default_parse(string); | |
let structure = get_root(&word); | |
let letters = word.collect(); | |
ParsedWord {letters: letters, structure: structure, index: 0} | |
} | |
/// Shifts a code point up by `0x50` and converts it to a `char`.
///
/// `0x50` is presumably the distance between a base consonant and its
/// subjoined form in the Unicode Tibetan block (e.g. U+0F40 -> U+0F90).
///
/// Returns `None` when `codepoint` is `None`, when the addition would
/// overflow `u32`, or when the shifted value is not a valid `char`.
fn subjoin_unicode<'a, 'b>(codepoint: Option<&'a &'b u32>) -> Option<char> {
    codepoint
        // `checked_add` keeps an out-of-range input from panicking in debug
        // builds (or silently wrapping in release builds).
        .and_then(|cp| (**cp).checked_add(0x50))
        .and_then(char::from_u32)
}
// fn to_unicode<'a>(word: ParsedWord) -> Option<String> { | |
// // let lookup = generate_lookup(); | |
// let lookup = foo(); | |
// let mut result: Vec<char> = Vec::new(); | |
// for (letter, letter_type) in word.word.letters.iter().zip(word.structure) { | |
// if letter == "a" && letter_type == LetterType::Root { | |
// continue | |
// } | |
// if table::W_VOWELSS.contains(letter) && letter_type == LetterType::Root { | |
// match lookup.get(letter) { | |
// Some(l) => result.push(l), | |
// None => return None | |
// } | |
// } | |
// } | |
// Some("a".to_string()) | |
// } | |
fn w_letter<'a>(string: &'a str, slice: &'a Slice) -> &'a str { | |
&string[slice.i..slice.i+slice.len] | |
} | |
fn default_parse<'a>(string: &'a str) -> Word<'a> { | |
tokenize(&string, &table::W_SORTED_ALPHABET) | |
} | |
fn tokenize<'a>(string: &'a str, alphabet: &[&'static str]) -> Word<'a> { | |
let mut result: Vec<Slice> = Vec::new(); | |
let mut vowel_indices: Vec<usize> = Vec::new(); | |
let mut progress = 0; | |
while progress < string.len() { | |
for (i, letter) in alphabet.iter().enumerate() { | |
let slice = &string[progress..]; | |
let g_prefix_edge_case = slice.starts_with("g."); | |
if !(g_prefix_edge_case || slice.starts_with(letter)) { | |
if i == alphabet.len() - 1 { | |
// TODO: raise exception invalid tibetan character! | |
progress = string.len(); | |
} | |
continue; | |
} | |
let letter_length = | |
if g_prefix_edge_case { | |
2 | |
} else { | |
letter.len() | |
}; | |
result.push(Slice{i: progress, len: letter_length}); | |
if i >= table::W_SORTED_ALPHABET.len() - table::TIBETAN_VOWELS.len() { | |
vowel_indices.push(result.len() - 1) | |
} | |
progress += letter_length; | |
break; | |
} | |
} | |
Word {string: string, vowels: vowel_indices, letters: result, index: 0} | |
} | |
fn maybe_sanskrit(string: &str) -> bool { | |
if string.len() == 3 && table::S_DOUBLE_CONSONANTS.contains(&&string[0..2]) { | |
return true; | |
} | |
for r in &table::S_BASIC_RULES { | |
if string.starts_with(r) { | |
return true; | |
} | |
} | |
if string.contains("ai") || string.contains("au") { | |
return true; | |
} | |
let mut vowel_count = 0; | |
for v in &table::TIBETAN_VOWELS { | |
let m: Vec<&str> = string.matches(&v.to_string()).collect(); | |
vowel_count += m.len(); | |
} | |
// achung | |
!string.contains(table::W_CONSONANTS[22]) && vowel_count > 1 | |
} | |
// fn is_subscribed(string: &str, vowel_index: usize, slices: &Vec<Slice>) -> bool { | |
fn is_subscribed(word: &Word) -> bool { | |
if word.vowels[0] == 2 { | |
!valid_superscribe(word.letter(0), word.letter(1)) | |
&& valid_subscribe(word.letter(0), word.letter(1)) | |
} else { // vowel_index == 3 | |
table::PREFIXES.contains(&word.letter(0)) | |
&& !valid_superscribe(word.letter(1), word.letter(2)) | |
&& valid_subscribe(word.letter(1), word.letter(2)) | |
} | |
} | |
// fn is_superscribed(string: &str, vowel_index: usize, slices: &Vec<Slice>) -> bool { | |
fn is_superscribed(word: &Word) -> bool { | |
if word.vowels[0] == 2 { | |
valid_superscribe(word.letter(0), word.letter(1)) | |
&& !valid_subscribe(word.letter(0), word.letter(1)) | |
} else { // vowel_index == 3 | |
table::PREFIXES.contains(&word.letter(0)) | |
&& valid_superscribe(word.letter(1), word.letter(2)) | |
&& !valid_subscribe(word.letter(1), word.letter(2)) | |
} | |
} | |
fn valid_superscribe(head_letter: &str, root_letter: &str) -> bool { | |
table::SUPERJOINED.contains(&head_letter) | |
&& table::SUPERJOINABLE.contains(&root_letter) | |
} | |
fn valid_subscribe(root_letter: &str, subjoined_letter: &str) -> bool { | |
table::SUBJOINED.contains(&subjoined_letter) | |
&& table::SUBJOINABLE.contains(&root_letter) | |
} |
Author
ironhouzi
commented
Sep 15, 2018
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment