Skip to content

Instantly share code, notes, and snippets.

@sci-42ver
Last active April 9, 2024 04:24
Show Gist options
  • Save sci-42ver/23574c8de29f829d81e5386625923592 to your computer and use it in GitHub Desktop.
Save sci-42ver/23574c8de29f829d81e5386625923592 to your computer and use it in GitHub Desktop.
Rime里 保持更新用户词典的同时 自动添加空格

在Rime里使用Lua实现保持更新用户词典的同时自动添加空格 | Lua implementation of adding spaces among punctuation, English words and Chinese words in Rime while keeping the user dictionary updated

Kw to help searching in Github Gist: Rime rime 空格 更新 词典 用户词典 更新用户词典

This is based on the rime-ice merge from rime-fast-xhup. However, that version does not seem to update the user dictionary (user_dict), because it manipulates keys too early, before the translator runs. Since I only want to add a space, and a segmentor cannot produce a candidate, I implemented it in a filter instead (rather than producing one whole new word). I then followed this issue comment. The following is a combination of the above two links with some small additions.

Since rime-fast-xhup probably has no need to update the user dictionary because of its big dictionaries, I didn't open a PR. Anyone who wants the user-dictionary update feature can search manually and end up here.

Hope this can help you.


Here preedit and word are both treated as English words; null means that no space is added.

modify in schema.yaml

engine:
  processors:   
    - lua_processor@non_en_cn_checker_processor_1
    ...
  ...
  filters:
    ...
    - lua_filter@cn_en_checker_filter_1
    ...
    - uniquifier                                    # 去重

To reuse the lua scripts, modify in rime.lua

cn_en_checker_filter_1 = require("cn_en_checker")
-- used in the 2nd yaml
cn_en_checker_filter_2 = require("cn_en_checker")
--[[
notice the userdb has the format 'word ⇥word⇥c=...' where one space after the syllables.
]]
-- Shared helpers (property reset/sync, script_path, try) that this filter uses.
local cn_en_space_common_module = require "cn_en_space_common_module"
-- "<source>-L<line>" tag returned by script_path(2); used below as the prefix of this file's log messages.
local FILE=cn_en_space_common_module.script_path(2)
-- pcall-based try/catch helper: try(f, catch_f) calls catch_f(err) when f raises.
local try=cn_en_space_common_module.try
--- Report whether a candidate contains at least one CJK Unified Ideograph
-- in the range U+4E00..U+9FFF.
--
-- Lua patterns operate on bytes, so a set such as `[\u{4e00}-\u{9FFF}]`
-- degenerates into "any byte in 0x80..0xE9" and therefore matches almost
-- every non-ASCII string (accented Latin letters, full-width punctuation,
-- ...). The range is spelled out here as UTF-8 byte sequences instead:
--   E4 B8..BF 80..BF      -> U+4E00..U+4FFF
--   E5..E9 80..BF 80..BF  -> U+5000..U+9FFF
-- @param cand_text string candidate text (UTF-8)
-- @return boolean true when an ideograph of that range occurs anywhere
local function check_cn(cand_text)
  return string.find(cand_text, '\228[\184-\191][\128-\191]') ~= nil
    or string.find(cand_text, '[\229-\233][\128-\191][\128-\191]') ~= nil
end
--- Report whether a candidate starts with an English word.
-- One optional leading space is tolerated, after which at least one ASCII
-- letter (lower or upper case) must follow.
-- @param cand_text string candidate text
-- @return boolean
local function check_en(cand_text)
  local first_index = string.find(cand_text, '^ ?[%l%u]+')
  return first_index ~= nil
end
local F={}
--- Filter initialiser.
-- Creates the commit history (capped at env.size entries), connects the
-- commit and property-update notifiers, and resets all prev_cand_* state.
-- @param env filter environment supplied by librime-lua
function F.init(env)
  env.size = 20
  log.warning("init filter")
  env.history = {}

  local context = env.engine.context

  -- Since env is not global across the scripts, so we need the notifier. https://github.com/hchunhui/librime-lua/issues/239#issuecomment-1501424179
  local function on_commit(ctx)
    -- Trim the tail first so that, after the insert below, the history
    -- never holds more than env.size entries.
    for slot = env.size, #env.history do
      env.history[slot] = nil
    end

    local picked = ctx:get_selected_candidate()
    if not picked then
      return
    end

    local committed_text = picked.text
    if check_cn(committed_text) then
      cn_en_space_common_module.reset_cand_property(env)
      ctx:set_property('prev_cand_is_hanzi', '1')
      log.error(FILE .. ": en IME uses prev_cand_is_hanzi")
    elseif check_en(committed_text) then
      cn_en_space_common_module.reset_cand_property(env)
      ctx:set_property('prev_cand_is_aword', '1')
      log.warning(FILE .. ": set prev_cand_is_aword")
    else
      log.error(FILE .. "one candidate is neither english nor chinese:" .. committed_text)
    end
    -- Newest commit goes to the front of the history.
    table.insert(env.history, 1, committed_text)
  end

  -- Mirror the context properties into env whenever any of them changes.
  local function on_property_update(ctx, name)
    cn_en_space_common_module.sync_prev_cands(env, ctx)
    cn_en_space_common_module.check_unique_true_property(env)
  end

  env.notifier = context.commit_notifier:connect(on_commit)
  env.property_notifier = context.property_update_notifier:connect(on_property_update)

  -- Author's note: this filter must be the last one so that all previous
  -- 'property_update_notifier' connections see the reset below.
  cn_en_space_common_module.reset_cand_property(env)
end
--- Filter finaliser.
-- Disconnects both notifier connections made in F.init so the callbacks
-- stop firing once this filter instance goes out of scope.
-- @param env filter environment supplied by librime-lua
function F.fini(env)
  local commit_connection = env.notifier
  local property_connection = env.property_notifier
  commit_connection:disconnect()
  property_connection:disconnect()
  log.warning("end filter")
end
--- Segment predicate of the filter: currently accepts every segment.
-- @param seg segment being considered (unused)
-- @param env filter environment (unused)
-- @return boolean always true
function F.tag_match(seg, env)
-- An on/off switch could be added here, e.g.:
-- return env.engine.context:get_option("auto_space")
return true
end
--- Filter body: prefixes a candidate with one space when the previously
-- committed text and the candidate call for a separator.
-- @param inp translation whose candidates are iterated
-- @param env filter environment (history and prev_cand_* mirrors)
function F.func(inp,env)
  log.warning("history:")
  for rank, committed in ipairs(env.history) do
    log.warning(rank .. ":" .. committed)
  end

  -- Snapshot taken once per invocation, before candidates are iterated.
  local after_hanzi = env.prev_cand_is_hanziv == '1'
  local after_word = env.prev_cand_is_awordv == '1'
  local after_word_or_preedit = after_word or env.prev_cand_is_preeditv == '1'

  -- The concatenation raises when a mirror is nil; the handler then logs
  -- which one instead of aborting the filter.
  try(function()
    log.warning("current env: hanzi-" .. tostring(after_hanzi) .. "; aword-" .. tostring(after_word) .. "; preeditv-" .. env.prev_cand_is_preeditv .. "; nullv-" .. env.prev_cand_is_nullv..";punctv-"..env.prev_cand_is_punctv)
  end, function(e)
    cn_en_space_common_module.prev_cand_exception_handle(e, env)
  end)

  --[[
  1. An English candidate needs a preceding space after everything except
     the "null" state, so only `env.prev_cand_is_nullv == '0'` is checked.
  2. A Chinese candidate needs no preceding space after punctuation, so the
     punct state is not consulted for the CN/EN decision itself.
  3. The first test is a short circuit: a space is only considered once
     something has been typed (this covers the ascii_mode=1 situation).
  ]]
  local function wants_leading_space(text)
    local something_typed = #env.history > 0
      or env.prev_cand_is_preeditv == '1'
      or env.prev_cand_is_punctv == '1'
    if not something_typed then
      return false
    end
    if env.prev_cand_is_nullv == '0' and check_en(text) then
      return true
    end
    return after_word_or_preedit and check_cn(text)
  end

  for cand in inp:iter() do
    log.warning("update cand from:" .. cand.text .. ";")
    local out = cand
    if wants_leading_space(cand.text) then
      out = cand:to_shadow_candidate(cand.type, " " .. cand.text, cand.comment)
    end
    log.warning("to:" .. out.text .. ";")
    yield(out)
  end
end
return F
-- use `require` to load this module instead of dofile https://stackoverflow.com/a/31149198/21294350 https://www.lua.org/manual/2.4/node37.html
-- if mod_table then return end -- needed if using dofile
local mod_table={}
-- borrow from rime-fast-xhup
--- Best-effort detection of the host operating system.
-- A distribution code name containing "weasel" is reported as "Windows";
-- otherwise the first output line of `uname -s` is returned.
-- @return string|nil OS name, or nil when `uname` cannot be run or prints nothing
local function detect_os()
  local user_distribute_name = rime_api:get_distribution_code_name()
  if user_distribute_name:lower():match("weasel") then
    return "Windows"
  end
  -- io.popen may be unavailable (raises) or fail (returns nil) on some hosts.
  local ok, handle = pcall(io.popen, "uname -s")
  if not ok or not handle then
    return nil
  end
  local system = handle:read("*l")
  -- Close the pipe explicitly instead of leaving it to the garbage collector.
  handle:close()
  return system
end
--- Build a "<source>-L<line>" tag for a stack level, used as a log prefix.
-- Callers pass level 2 to describe the code calling script_path (level 1 is
-- script_path itself) and level 3 to describe that caller's caller.
-- @param level number stack level handed to debug.getinfo
-- @return string source name (first character stripped) .. "-L" .. current line,
--   or "?-L?" when the requested level does not exist
function mod_table.script_path(level)
  local debug_info = debug.getinfo(level, "Sl")
  if not debug_info then
    -- debug.getinfo returns nil for a level beyond the stack; logging helpers
    -- must not crash because of that.
    return "?-L?"
  end
  -- Drop the leading marker character of the source field ('@' for files).
  local str = debug_info.source:sub(2)
  local line = debug_info.currentline
  return str .. "-L" .. line
end
-- TODO use an array for prev_cand_..., although it only simplifies this module: callers would then need something like env.prev_cand['nullv'] instead of env.prev_cand_..., which doesn't simplify much.
--[[
Note: env is a per-component member, not a global
https://github.com/shewer/librime-lua/blob/e587aef46c75cab00f73e7cea49011e0b1bc4d43/src/lua_gears.h#L76
(as explained in one issue), so every component keeps its own mirror.
]]
--- Set all five prev_cand_* context properties to "0" and refresh the
-- env.prev_cand_*v mirrors from the context.
-- @param env component environment; env.engine.context is written and read
function mod_table.reset_cand_property (env)
  local context = env.engine.context

  -- 'prev_cand_is_null' means: don't append/precede a space.
  local reset_order = {
    'prev_cand_is_null',
    'prev_cand_is_aword',
    'prev_cand_is_hanzi',
    'prev_cand_is_preedit',
    'prev_cand_is_punct',
  }
  for _, property_name in ipairs(reset_order) do
    context:set_property(property_name, "0")
  end

  -- Author's note: property_notifier:disconnect() seems to take effect with a
  -- delay, so the last IME's filter/processor may still receive the current
  -- IME's notification. The filter wraps its log in try/catch to ignore that.
  local mirror_order = {
    'prev_cand_is_null',
    'prev_cand_is_preedit',
    'prev_cand_is_hanzi',
    'prev_cand_is_aword',
    'prev_cand_is_punct',
  }
  for _, property_name in ipairs(mirror_order) do
    env[property_name .. "v"] = context:get_property(property_name)
  end

  log.warning(mod_table.script_path(3) .. ": reset_cand_property")
end
--- Copy the five prev_cand_* context properties into env.prev_cand_*v and
-- log the resulting values.
-- @param env component environment receiving the mirrors
-- @param ctx context whose properties are read
function mod_table.sync_prev_cands (env,ctx)
  local property_names = {
    'prev_cand_is_null',
    'prev_cand_is_preedit',
    'prev_cand_is_hanzi',
    'prev_cand_is_aword',
    'prev_cand_is_punct',
  }
  for _, property_name in ipairs(property_names) do
    env[property_name .. "v"] = ctx:get_property(property_name)
  end
  -- TODO here some are empty after reset_cand_property
  local summary = ": update prev_cand to: nullv-" .. env.prev_cand_is_nullv
    .. "; preeditv-" .. env.prev_cand_is_preeditv
    .. "; hanziv-" .. env.prev_cand_is_hanziv
    .. "; awordv-" .. env.prev_cand_is_awordv
    .. "; prev_cand_is_punct-" .. env.prev_cand_is_punctv
  log.warning(mod_table.script_path(3) .. summary)
end
--- Log an error when more than one prev_cand_* mirror is "1" at once.
-- @param env component environment holding the env.prev_cand_*v mirrors
function mod_table.check_unique_true_property (env)
  -- Kept local: the original assignment leaked `property_str` into _G.
  local property_str = env.prev_cand_is_nullv..env.prev_cand_is_preeditv..env.prev_cand_is_hanziv..env.prev_cand_is_awordv..env.prev_cand_is_punctv
  -- Count the occurrences of '1' via gsub's second return value.
  -- https://stackoverflow.com/a/11158158/21294350
  local _, count = string.gsub(property_str,'1','0')
  if count>1 then
    log.error(mod_table.script_path(3) .. ": Wrong prev_cands: nullv-" .. env.prev_cand_is_nullv .. "; preeditv-" .. env.prev_cand_is_preeditv .. "; hanziv-" .. env.prev_cand_is_hanziv .. "; awordv-" .. env.prev_cand_is_awordv .. "; prev_cand_is_punct-" .. env.prev_cand_is_punctv)
  end
end
-- https://www.lua.org/wshop06/Belmonte.pdf
--- Minimal try/catch: run `f` protected and hand any raised error to `catch_f`.
-- @param f function called without arguments under pcall
-- @param catch_f function called with the error value when `f` raises
function mod_table.try(f, catch_f)
  local succeeded, raised = pcall(f)
  if succeeded then
    return
  end
  catch_f(raised)
end
--- Error handler for the filter's debug log: reports the error and which of
-- the env.prev_cand_*v mirrors are nil.
-- Exported on the module table because the filter calls it as
-- `cn_en_space_common_module.prev_cand_exception_handle`; it used to be
-- defined only as a global, so that call hit a nil field.
-- @param e error value caught by try/pcall
-- @param env component environment holding the env.prev_cand_*v mirrors
function mod_table.prev_cand_exception_handle(e,env)
  log.warning("error:"..tostring(e))
  -- Walk a fixed list of field names: `#` is unreliable on a table that
  -- contains nil holes, which is exactly the case being diagnosed here.
  local mirror_names = {
    "prev_cand_is_hanziv",
    "prev_cand_is_awordv",
    "prev_cand_is_preeditv",
    "prev_cand_is_nullv",
    "prev_cand_is_punctv",
  }
  local nil_positions = {}
  for i, mirror_name in ipairs(mirror_names) do
    if env[mirror_name] == nil then
      nil_positions[#nil_positions + 1] = i
    end
  end
  if #nil_positions == #mirror_names then
    log.warning("all prev_cand checks are nil")
  end
  log.warning("to check error src with error cnt:"..#nil_positions)
  for _, i in ipairs(nil_positions) do
    log.warning(i.."th element is nil")
  end
end
-- Keep the former global name working for any code that relied on it.
_G.prev_cand_exception_handle = mod_table.prev_cand_exception_handle
return mod_table
-- Add a space when Chinese and English output alternate.
-- Automatically add spaces inside mixed Chinese/English entries (cn_en.dict.yaml).
-- Example: `VIP中P` → `VIP 中 P`
-- local puts = require("tools/debugtool")
-- Starts as true; auto_append_space_processor switches it to false when the last engine/processors entry is "express_editor".
local use_fluency_editor=true
-- Since we may need space to split between words, so use "Shift+Return"
local fluency_commit_key="Shift+Return"
-- Enables the debug log written when a preedit-commit key is handled.
local LOG=true
-- here must assign to one local variable https://stackoverflow.com/a/63461673/21294350
local cn_en_space_common_module = require "cn_en_space_common_module"
--- Processor initialiser: connects the property-update notifier that keeps
-- the env.prev_cand_*v mirrors current, then resets all prev_cand_* state.
-- @param env processor environment supplied by librime-lua
local function init(env)
  log.warning("init auto_append_space_processor")

  local function on_property_update(ctx, name)
    cn_en_space_common_module.sync_prev_cands(env, ctx)
    cn_en_space_common_module.check_unique_true_property(env)
  end

  local update_signal = env.engine.context.property_update_notifier
  env.property_notifier = update_signal:connect(on_property_update)

  cn_en_space_common_module.reset_cand_property(env)
end
--- Processor finaliser: disconnects the notifier connection made in init.
-- @param env processor environment supplied by librime-lua
local function fini(env)
  local connection = env.property_notifier
  connection:disconnect()
  log.warning("end processor")
end
-- Keys that, when pressed with an empty input, put the state machine into
-- the "null" state (no space is added before the next commit).
-- Built once instead of on every key event.
local spec_keys = {
  -- ['equal'] = true,
  -- ['apostrophe'] = true,
  ['grave'] = true,
  ['minus'] = true,
  -- ['slash'] = true,
  ['Shift+at'] = true,
  ['Shift+plus'] = true,
  ['Shift+dollar'] = true,
  ['Shift+quotedbl'] = true,
  ['Shift+asterisk'] = true,
  ['Shift+underscore'] = true,
  ['Shift+parenleft'] = true,
  ['Shift+parenright'] = true,
  -- ['Return'] = true,
  ['Control+Return'] = true,
  ['Alt+Return'] = true
}

-- Keys that, when pressed with an empty input, mark the previous output as
-- punctuation.
local punct_keys = {
  ['Shift+exclam'] = true,
  ['Shift+question'] = true,
  ['comma'] = true,
  ['period'] = true
  -- ['semicolon'] = true, -- may conflict with the self-defined keybinding
}

-- True when the last entry of engine/processors is "express_editor".
-- Tolerates a missing list or entry instead of raising.
local function last_processor_is_express_editor(config)
  local procs_list = config:get_list("engine/processors")
  if not procs_list or procs_list.size < 1 then
    return false
  end
  local last_entry = procs_list:get_value_at(procs_list.size - 1)
  return last_entry ~= nil and last_entry:get_string() == "express_editor"
end

--- Key processor that commits the raw input (preedit) with a leading space
-- when needed, and tracks "null"/"punct" states for keys typed on an empty
-- input.
--
-- Changes compared with the previous version: no `Memory` object is built per
-- key event any more (it was never used), the config lookup and the debug log
-- no longer raise on nil values, and the constant key tables are built once.
-- @param key key event being processed
-- @param env processor environment (env.prev_cand_*v mirrors are read)
-- @return number 1 (kAccepted) when the preedit was committed here, else 2 (kNoop)
local function auto_append_space_processor(key, env)
  local kAccepted, kNoop = 1, 2

  local engine = env.engine
  local context = engine.context
  local input_code = context.input
  local key_repr = key:repr()
  local has_input = #input_code >= 1

  if last_processor_is_express_editor(engine.schema.config) then
    use_fluency_editor = false
  end
  -- With fluency_editor the raw input is committed by fluency_commit_key,
  -- with express_editor by plain Return.
  local commit_key = use_fluency_editor and fluency_commit_key or "Return"
  local whether_preedit = key_repr == commit_key

  if has_input and context.composition:empty() then
    log.error("composition:empty() when having the input_code")
  end

  if not has_input then
    if spec_keys[key_repr] then
      cn_en_space_common_module.reset_cand_property(env)
      context:set_property('prev_cand_is_null', '1')
    end
    if punct_keys[key_repr] then
      cn_en_space_common_module.reset_cand_property(env)
      context:set_property('prev_cand_is_punct', '1')
    end
    return kNoop
  end

  if not whether_preedit then
    return kNoop
  end

  local prev_cand_is_nullv = env.prev_cand_is_nullv
  local prev_cand_is_hanziv = env.prev_cand_is_hanziv
  local prev_cand_is_awordv = env.prev_cand_is_awordv
  local prev_cand_is_preeditv = env.prev_cand_is_preeditv
  local prev_cand_is_punctv = env.prev_cand_is_punctv

  if LOG then
    -- tostring keeps the log from raising when a mirror is still nil.
    log.warning("get preedit key " .. key_repr .. " property " .. tostring(prev_cand_is_nullv) .. "; prev_cand_is_hanzi-" .. tostring(prev_cand_is_hanziv) .. ";" .. tostring(prev_cand_is_awordv) .. "; preeditv-" .. tostring(prev_cand_is_preeditv) .. ";prev_cand_is_punct-" .. tostring(prev_cand_is_punctv) .. " ")
  end

  -- A leading space is needed after a Chinese candidate, an English word or
  -- an earlier preedit commit, unless the "null" state forbids it.
  local follows_text = prev_cand_is_hanziv == '1'
    or prev_cand_is_awordv == '1'
    or prev_cand_is_preeditv == '1'
  local cand_text = input_code
  if prev_cand_is_nullv ~= '1' and follows_text then
    cand_text = " " .. input_code
  end
  engine:commit_text(cand_text)

  cn_en_space_common_module.reset_cand_property(env)
  context:set_property('prev_cand_is_preedit', "1")
  context:clear()
  return kAccepted
end
--- Insert a single space at every boundary between a Chinese character and
-- an ASCII alphanumeric/punctuation character, in both directions.
-- @param s string candidate text (UTF-8)
-- @param env unused
-- @return string text with the spaces inserted
local function add_spaces(s,env)
  -- Lead byte 228..233 followed lazily by continuation bytes 128..191.
  local cn_then_ascii = "([\228-\233][\128-\191]-)([%w%p])"
  local ascii_then_cn = "([%w%p])([\228-\233][\128-\191]-)"
  local spaced_pair = "%1 %2"

  -- Space after a Chinese character that is followed by an ASCII one.
  local after_first_pass = string.gsub(s, cn_then_ascii, spaced_pair)
  -- Space after an ASCII character that is followed by a Chinese one.
  local after_second_pass = string.gsub(after_first_pass, ascii_then_cn, spaced_pair)
  -- Only the text is returned; gsub's replacement count is discarded.
  return after_second_pass
end
--- Test whether the text contains both a Chinese character (a byte in
-- 228..233) and an ASCII letter.
-- @param s string candidate text (UTF-8)
-- @return number|nil index of the first letter when both are present, else nil
local function is_mixed_cn_en_num(s)
  local cn_at = s:find("([\228-\233][\128-\191]-)")
  if not cn_at then
    return nil
  end
  -- Parentheses keep exactly one return value (the start index or nil).
  return (s:find("[%a]"))
end
--- Filter that re-emits every candidate, replacing mixed Chinese/English
-- ones with a shadow candidate whose text has spaces inserted.
-- @param input translation whose candidates are iterated
-- @param env filter environment (passed through to add_spaces)
local function cn_en_spacer(input, env)
  for cand in input:iter() do
    local emitted = cand
    if is_mixed_cn_en_num(cand.text) then
      emitted = cand:to_shadow_candidate(cand.type, add_spaces(cand.text, env), cand.comment)
    end
    yield(emitted)
  end
end
-- Module value: `processor` bundles init/func/fini of auto_append_space_processor; `filter` is the cn_en_spacer function.
return {processor = {init=init, func=auto_append_space_processor, fini=fini}, filter = cn_en_spacer}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment