for I3S Schema Search Engine
Token
字段表
create table Token
(
words nvarchar(100) not null,
tid int identity(1,1) not null,
interfreq int not null default(0),
intrafreq int not null default(1),
constraint PK_Token primary key clustered (words),
constraint UQ_Token_TID unique(tid)
)
InvIndex
反索引表
create table InvIndex
(
tid int not null,
oid int not null,
[weight] int not null default(1),
[field] binary(8),
constraint PK_InvIndex primary key clustered (tid,oid),
constraint FK_InvIndex_TID foreign key(tid) references Token(tid)
)
RawIndex
正索引表
create table RawIndex
(
oid int not null,
tid int not null,
constraint PK_RawIndex primary key clustered (oid,tid),
constraint FK_RawIndex_TID foreign key(tid) references Token(tid)
)
Trg_Index_insert
新增到反索引表的觸發,會自動新增到正索引表,並自動計算 weight
與 interfreq
create trigger Trg_Index_insert on InvIndex INSTEAD OF INSERT
as
declare @oid int
declare @tid int
declare @field int
declare index_cursor cursor for select oid,tid,field from inserted
open index_cursor fetch next from index_cursor into @oid,@tid,@field
while @@FETCH_STATUS=0
begin
if exists(select * from InvIndex where tid=@tid and oid=@oid)
update InvIndex set [weight]=[weight]+1,field=field|@field
where tid = @tid and oid = @oid
else
begin
insert into InvIndex values(@tid,@oid,1,@field)
insert into RawIndex values(@oid,@tid)
update Token set interfreq=interfreq+1 where tid = @tid
end
fetch next from index_cursor into @oid,@tid,@field
end
close index_cursor
deallocate index_cursor
fn_toInt
將二進位表示的字串轉成數字,方便 field 使用
create function fn_toInt (@binary nvarchar(31)) returns int
as
begin
declare @res int = 0
declare @i int=len(@binary)
declare @pow int = 0
declare @c char(1)
while(@i>0)
begin
set @c = substring(@binary,@i,1)
if @c = '1'
set @res += power(2, @pow)
set @pow += 1
set @i -= 1
end
return(@res)
end
fn_fulltoken
進行 sliding window 並以 table 的形式回傳,這裡的 ngram
寫死是 5
create function fn_fulltoken(@text nvarchar(4000))
returns @token table(
words nvarchar(4000)
)
as
begin
declare @i int = 1
declare @len int = len(@text)
declare @gram int
declare @gram int = 5
while(@gram>0)
begin
set @i = 1
while((@i+@gram-1)<=@len)
begin
insert into @token values(SUBSTRING(@text,@i,@gram))
set @i=@i+1
end
set @gram = @gram-1
end
return;
end
xp_insertToken
將字段寫入 Token
表中,並自動建立其索引 InvIndex
create procedure xp_insertToken
@words nvarchar(4000),
@oid int,
@field nvarchar(31)
as
begin
declare @tid int
declare @tids table(tid int)
select @tid=tid from token where words=@words
if @tid is not null
update token set intrafreq = intrafreq+1 where words=@words
else
begin
insert into token(words) output inserted.tid into @tids values(@words)
select @tid=tid from @tids
end
insert into InvIndex(tid,oid,field) values(@tid,@oid,dbo.fn_toInt(@field))
end
xp_insertFullToken
將字段先進行 sliding window (fn_fulltoken
),再藉由 xp_insertToken
寫入 Token
create procedure xp_insertFullToken
@segment nvarchar(4000),
@oid int,
@field nvarchar(31)
as
begin
declare @tid int
declare @words nvarchar(4000)
declare token_cursor cursor for select words from dbo.fn_fulltoken(@segment)
open token_cursor fetch next from token_cursor into @words
while @@FETCH_STATUS = 0
begin
exec dbo.xp_insertToken @words,@oid,@field
fetch next from token_cursor into @words
end
close token_cursor
deallocate token_cursor
end
xp_tokenize
進行索引處理,將字串去除特殊符號後,做中、英數分離。
中文部分做 slideing window後逐一進入索引,
英數部分直接進入索引
create procedure xp_tokenize
@oid int,
@text nvarchar(4000),
@field nvarchar(31)
as
begin
declare @len int = len(@text)
declare @i int = 1
declare @mode int
declare @prevmode int
declare @tmpChar nvarchar(2)
declare @tmpStr nvarchar(4000)
while(@i<=@len)
begin
set @tmpChar=lower(substring(@text,@i,1))
set @i+=1
-- mode:1 英數
if @tmpChar like '%[-a-Z0-9_'']%' set @mode=1
-- mode:2 漢字
else if unicode(@tmpChar) between 19968 and 40917 set @mode=2
-- mode:0 其他
else set @mode=0
-- 跟上一個字一樣 mode
if @prevmode = @mode
set @tmpStr+=@tmpChar
else
-- 跟上一個字不同 mode
begin
-- 暫存的是 mode 1 or 2 才是要的
if @prevmode=1
exec dbo.xp_insertToken @tmpStr,@oid,@field
if @prevmode=2
exec dbo.xp_insertFullToken @tmpStr,@oid,@field
set @tmpStr = @tmpChar
set @prevmode = @mode
end
end
-- 暫存裡面清出
if len(@tmpStr)>0
begin
if @prevmode=1
exec dbo.xp_insertToken @tmpStr,@oid,@field
if @prevmode=2
exec dbo.xp_insertFullToken @tmpStr,@oid,@field
end
return;
end
從 xp_tokenize
作為起點
exec xp_tokenize @oid,@text,@field
-- e.g. exec xp_tokenize 1,'好棒棒好棒棒我好怕','00001'
如此便能全自動建立索引
Unindexed
xp_unindexed
移除某一個
oid
的索引