Last active
August 25, 2022 13:38
-
-
Save s5unty/08c3a6d7429c65a37b4b4aaaf3f9bed5 to your computer and use it in GitHub Desktop.
带权重的超强快码的单字字典
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env elvish | |
##
# Generate a weighted, single-character edition of the 超强快码 dictionary.
#
# [1]: (Elvish shell) https://elv.sh/get/
# [2]: (single-character frequencies) https://lingua.mtsu.edu/chinese-computing/statistics/char/list.php?Which=MO
# [3]: (Wubi dictionary) https://github.com/rime/rime-wubi/ (wubi86.dict.yaml)
####
use str | |
# Weights taken from wubi86.dict.yaml, keyed by entry text and then by code
# length (stored as a string), e.g.
#   $order[三][2] = 159129581
#   $order[上][1] = 783204383
#   $order[上][4] = 4250000000
var order = [&]
var wubi = [(cat wubi86.dict.yaml | from-lines)]
for entry $wubi {
  # Lines that contain no tab are skipped.
  if (str:contains-any $entry "\t") {
    # eawk passes the whole line first, then its fields; fields after the
    # weight are ignored.
    echo $entry | eawk {|_ text code weight @_|
      # The same text may appear under codes of different lengths, each with
      # its own weight:
      #   三 dg   159129581
      #   三 dgg  2230000000
      #   上 h    783204383 hh
      #   上 hhgg 4250000000
      var len = (- (str:count $code '') 1)
      set len = (to-string $len)
      # First sighting: this weight fills every length slot from 1 to 4.
      var by-length = [&1=$weight &2=$weight &3=$weight &4=$weight]
      if (has-key $order $text) {
        # Seen before: only the slot for this code length is overwritten.
        var known = $order[$text]
        set by-length = (assoc $known $len $weight)
      }
      set order = (assoc $order $text $by-length)
    }
  }
}
# Codes taken from cqkm_cm.dict.yaml: entry text -> list of codes, in file order.
var dict = [&]
var cqkm = [(cat cqkm_cm.dict.yaml | from-lines)]
for entry $cqkm {
  # Lines that contain no tab are skipped.
  if (str:contains-any $entry "\t") {
    # eawk passes the whole line first, then its fields; fields after the
    # code are ignored.
    echo $entry | eawk {|_ text code @_|
      # One text can carry several codes (for example a character with more
      # than one reading), so codes are accumulated in a list.
      var codes = [$code]
      if (has-key $dict $text) {
        var seen = $dict[$text]
        set codes = [$@seen $code]
      }
      set dict = (assoc $dict $text $codes)
    }
  }
}
# Print the Rime dictionary YAML header one line at a time. The final empty
# element produces the blank line that follows the `...` marker.
for row [
  '###'
  '# Rime dictionary'
  '# encoding:utf-8'
  '---'
  'name: cqkm.single'
  'version: 4.10'
  'sort: by_weight'
  'use_preset_vocabulary: false'
  'columns:'
  '- text'
  '- code'
  '- weight'
  '...'
  ''
] {
  echo $row
}
# Emit the weighted entries: one "<text>\t<code>\t<weight>" line for every code
# that is kept, walking the keys of $dict in sorted order.
keys $dict | order | each {|char|
  # Skip entries that have no weights in $order, and entries longer than a
  # single character (str:count with an empty substring yields the number of
  # characters plus one, so a single character counts as 2).
  if (or (not (has-key $order $char)) (> (str:count $char "") 2)) {
    continue
  }
  var codes = $dict[$char]
  var weights = $order[$char]

  # Decide whether the fuzzy-sound codes (those starting with "i") can be
  # dropped: they are kept whenever the bare code "i" is present, and purged
  # once some code starts with a different letter.
  var purge-i = $false
  for code $codes {
    if (==s $code "i") {
      set purge-i = $false
      break
    } elif (!=s $code[0] "i") {
      set purge-i = $true
    }
  }

  for code $codes {
    if (and $purge-i (==s $code[0] "i")) {
      continue
    }
    var len = (to-string (- (str:count $code '') 1))
    # $order only holds weights for the code lengths that were recorded for
    # this entry; indexing a missing length raises an exception and aborts the
    # script halfway through the output. Report the entry on stderr and move on.
    if (not (has-key $weights $len)) {
      echo "skipping "$char" "$code": no weight for code length "$len >&2
      continue
    }
    var weight = $weights[$len]
    echo $char"\t"$code"\t"$weight
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment