Last active
July 9, 2024 02:13
-
-
Save lancejpollard/e2b75cc8eec01fee84086bfddf2eb21a to your computer and use it in GitHub Desktop.
Parse Tibetan Syllables (attempt)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** Letters that splitSyllable accepts as a prefix in front of a root letter. */
const PREFIXES = ['ག', 'ད', 'བ', 'མ', 'འ']

/** Letters that splitSyllable accepts between a prefix and the root letter. */
const SUPERSCRIPTS = ['ར', 'ལ', 'ས']

/**
 * Additional root letters, appended to ROOT_LETTERS.
 *
 * NOTE(review): splitSyllable compares one code point at a time, so any entry
 * here that is stored as more than one code point (for example a letter
 * followed by U+0F39) can never match. Confirm whether that is intended.
 */
const EXTENDED_ROOT_LETTERS = [
  'ཫ',
  'ཬ',
  'ཁ༹',
  'ག༹',
  'ཕ༹',
  'བ༹',
  'གྷ',
  'ཛྷ',
  'ཊ',
  'ཋ',
  'ཌ',
  'ཌྷ',
  'ཎ',
  'དྷ',
  'བྷ',
  'ཥ',
  'ཀྵ',
]

/** Inputs that splitSyllable returns unchanged as a single unit. */
const SPECIAL = ['ༀ']

/** Letters that may start (or be the core of) a unit in splitSyllable. */
const ROOT_LETTERS = [
  'ཀ',
  'ཁ',
  'ག',
  'ང',
  'ཅ',
  'ཆ',
  'ཇ',
  'ཉ',
  'ཏ',
  'ཐ',
  'ད',
  'ན',
  'པ',
  'ཕ',
  'བ',
  'མ',
  'ཙ',
  'ཚ',
  'ཛ',
  'ཝ',
  'ཞ',
  'ཟ',
  'འ',
  'ཡ',
  'ར',
  'ལ',
  'ཤ',
  'ས',
  'ཧ',
  'ཨ',
].concat(EXTENDED_ROOT_LETTERS)

/** Subjoined letters accepted directly after the root part of a unit. */
const SUBSCRIPTS = [
  'ྐ',
  'ྑ',
  'ྒ',
  'ྒྷ',
  'ྔ',
  'ྕ',
  'ྖ',
  'ྗ',
  'ྙ',
  'ྚ',
  'ྛ',
  'ྜ',
  'ྜྷ',
  'ྞ',
  'ྟ',
  'ྠ',
  'ྡ',
  'ྡྷ',
  'ྣ',
  'ྤ',
  'ྥ',
  'ྦ',
  'ྦྷ',
  'ྨ',
  'ྩ',
  'ྪ',
  'ྫ',
  'ྫྷ',
  'ྭ',
  'ྮ',
  'ྯ',
  'ྰ',
  'ྱ',
  'ྲ',
  'ླ',
  'ྴ',
  'ྵ',
  'ྶ',
  'ྷ',
  'ྸ',
  'ྐྵ',
  'ྺ',
  'ྻ',
  'ྼ',
]

/** Vowel signs accepted after the root part or after a subscript. */
const VOWEL_SIGNS = [
  'ཱ',
  'ི',
  'ཱི',
  'ུ',
  'ཱུ',
  'ྲྀ',
  'ཷ',
  'ླྀ',
  'ཹ',
  'ྀ',
  'ཱྀ',
]

/** Letters accepted as a suffix after the root part, a subscript or a vowel. */
const SUFFIXES = ['ག', 'ན', 'བ', 'ད', 'མ', 'འ', 'ར', 'ང', 'ས', 'ལ']

/** Letters accepted as a second suffix directly after a suffix. */
const POST_SUFFIXES = ['ད', 'ས']

/**
 * Code points that are dropped from the output; each one also ends the unit
 * that is currently being collected.
 */
const IGNORED = [
  '༄',
  'ེ',
  'ཻ',
  'ོ',
  'ཽ',
  'ཾ',
  'ཿ',
  'ྂ',
  'ྃ',
  '྄',
  '྅',
  '྆',
  '྇',
  'ྈ',
  'ྉ',
  'ྊ',
  'ྋ',
  'ྌ',
  'ྍ',
  'ྎ',
  'ྏ',
  '\u0f89',
]

/** Sequence number printed in front of each diagnostic logged by splitSyllable. */
let errorI = 1

/** Position of the parser inside the unit that is currently being collected. */
type SyllableState = 'new' | 'root' | 'subscript' | 'vowel' | 'suffix'

/**
 * Splits a string into units of the shape
 * `[prefix][superscript]root [subscript] [vowel] [suffix [post-suffix]]`,
 * using the lookup tables above and walking the input one code point at a time.
 *
 * - A string listed in SPECIAL is returned as a single unit.
 * - Code points listed in IGNORED are dropped and terminate the current unit.
 * - If a unit would have to start with a code point that is neither a prefix
 *   nor a root letter, a numbered diagnostic (the input plus its code points in
 *   hex) is logged once and an empty array is returned, discarding any units
 *   collected so far.
 *
 * @param candidate Text to split; the empty string yields an empty array.
 * @returns The units in input order, or `[]` when the input cannot be parsed.
 */
function splitSyllable(candidate: string): string[] {
  if (SPECIAL.includes(candidate)) {
    return [candidate]
  }

  // Spreading iterates by code point, so astral characters stay intact.
  const parts = [...candidate]
  const output: string[] = []
  let current: string[] = []
  let state: SyllableState = 'new'
  let index = 0

  // Moves the unit collected so far (if any) into the output.
  const flush = (): void => {
    if (current.length) {
      output.push(current.join(''))
    }
    current = []
  }

  while (index < parts.length) {
    const next = parts[index++]
    if (next === undefined) {
      // Not reachable while index < parts.length; keeps strict index checks happy.
      break
    }

    if (IGNORED.includes(next)) {
      flush()
      state = 'new'
      continue
    }

    switch (state) {
      case 'new': {
        if (PREFIXES.includes(next)) {
          // Look ahead so that prefix (+ superscript) + root end up in one piece.
          const peek = parts[index]
          const peek2 = parts[index + 1]
          if (peek !== undefined && SUPERSCRIPTS.includes(peek)) {
            if (peek2 !== undefined && ROOT_LETTERS.includes(peek2)) {
              index += 2
              current.push(`${next}${peek}${peek2}`)
            } else {
              index += 1
              current.push(`${next}${peek}`)
            }
          } else if (peek !== undefined && ROOT_LETTERS.includes(peek)) {
            index += 1
            current.push(`${next}${peek}`)
          } else {
            current.push(next)
          }
          state = 'root'
        } else if (ROOT_LETTERS.includes(next)) {
          current.push(next)
          state = 'root'
        } else {
          console.log(
            errorI++,
            candidate +
              ' ' +
              JSON.stringify(
                parts.map(x =>
                  (x.codePointAt(0) ?? 0).toString(16).padStart(4, '0'),
                ),
              ),
          )
          return []
        }
        break
      }
      case 'root': {
        if (SUBSCRIPTS.includes(next)) {
          current.push(next)
          state = 'subscript'
        } else if (VOWEL_SIGNS.includes(next)) {
          current.push(next)
          state = 'vowel'
        } else if (SUFFIXES.includes(next)) {
          current.push(next)
          state = 'suffix'
        } else {
          // Not part of this unit: close it and re-read the code point as a new start.
          index -= 1
          flush()
          state = 'new'
        }
        break
      }
      case 'subscript': {
        if (VOWEL_SIGNS.includes(next)) {
          current.push(next)
          state = 'vowel'
        } else if (SUFFIXES.includes(next)) {
          current.push(next)
          state = 'suffix'
        } else {
          index -= 1
          flush()
          state = 'new'
        }
        break
      }
      case 'vowel': {
        if (SUFFIXES.includes(next)) {
          current.push(next)
          state = 'suffix'
        } else {
          index -= 1
          flush()
          state = 'new'
        }
        break
      }
      case 'suffix': {
        if (POST_SUFFIXES.includes(next)) {
          current.push(next)
        } else {
          index -= 1
        }
        // A unit always ends after its suffix, with or without a post-suffix.
        flush()
        state = 'new'
        break
      }
    }
  }

  flush()
  return output
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment