函数已改进,请使用新版本函数,参看PostgreSQL 黑科技-递规二分法切分汉字
1 模糊查询时,大多数情况下首先想到的是like '%关键字%'或基于gin索引的正则表达式,gin至少需要三个字符才会使用索引,对于汉语来说十分不方便;
2 在海量数据的情况下,基于like和gin索引的正则表达式均不理想时,一般采用分词后进行查询.
3 分词存在两个弊端
3.1 词库的维护是比较繁重的工作.当词库中没有的关键词会导致查询结果不正确.
3.2 历史数据的维护工作不好处理.新增关键词时,历史数据并不包含些新的关键词,使用新关键词查询时无法查询到历史数据.
4 不使用like/不使用正则/不使用分词并保证查询快捷准确的另一种方法
此方法的缺点是比较浪费空间,不过在当前相比较下来以空间换取时间的方法是值得的.
4.1 首先清除文本中的标点符号
drop function if exists clear_punctuation(text);create or replace function clear_punctuation(text) returns textas $$select regexp_replace($1,'[\ |\~|\`|\!|\@|\#|\$|\%|\^|\&|\*|\(|\)|\-|\_|\+|\=|\||\\|\[|\]|\{|\}|\;|\:|\"|\''|\,|\<|\.|\>|\/|\?|\:|\。|\;|\,|\:|\“|\”|\(|\)|\、|\?|\《|\》]','','g');$$ language sql strict immutable;
4.2 每两个字符做为一个词处理,将字符串转换为tsvector
drop function if exists str_to_tsvector(text);create or replace function str_to_tsvector(text) returns tsvectoras $$declarev_count integer;v_txt text;v_txts text[];v_result tsvector;beginv_txt := clear_punctuation($1);--数组大小为字符数量-1v_count := length(v_txt)-1;if( v_count < 1 ) thenraise exception '输入参数("%")去除标点符号后至少需要2个字符',$1;end if;for i in 1..v_count loopv_txts := array_append(v_txts, substring(v_txt,i,2));end loop;--tsvector类型要求去除重复并排序with cte1 as(select f from unnest(v_txts) as f group by f),cte2 as(select f from cte1 order by f)select array_to_tsvector(array_agg(f)) into v_result from cte2;return v_result;end;$$ language plpgsql strict immutable;
4.3 创建测试表
drop table if exists test_cond;drop table if exists test;create table test(objectid bigserial not null,--唯一编号name text not null,--名称newtime timestamptz default(now()) not null,--首次添加时间lastime timestamptz,--最后一次修改时间constraint pk_test_objectid primary key (objectid));--rum需要大量的计算和写入,保存速度非常慢,因此创建表时设置unlogged标记--unlogged标记不是绝对安全的,因此和主表分开--虽然浪费了一定的磁盘空间,但可以最大程度保证写入速度--test_cond表损坏后,通过主表可以很容易的重建,不过机率非常小--test_cond中的数据通过触发器更新,不需要操作这个表create unlogged table test_cond(objectid bigserial not null,--唯一编号keys tsvector not null,--关键字constraint pk_test_cond_objectid primary key (objectid),constraint fk_test_cond_objectid foreign key(objectid) references test(objectid) on delete cascade);create index idx_test_cond_keys on test_cond using rum(keys rum_tsvector_ops);
4.4 创建关联触发器
--test_cond表设置了级联删除,所以不需要delete触发drop trigger if exists tri_test_change on test;drop function if exists tri_test_trigger();create or replace function tri_test_trigger() returns trigger as $$ begin if (TG_OP = 'INSERT') theninsert into test_cond(objectid,keys) values(NEW.objectid,str_to_tsvector(NEW.name));return NEW;elsif (TG_OP = 'UPDATE') thenupdate test_cond set keys=str_to_tsvector(NEW.name) where objectid=NEW.objectid;return NEW;end if;RETURN NULL; end; $$ language 'plpgsql' SECURITY DEFINER;--test_cond依赖test表,因此只能在test成功后再触好,只能设置为aftercreate trigger tri_test_change after INSERT or UPDATE on test for each ROW EXECUTE PROCEDURE tri_test_trigger();
新版本函数,请参看PostgreSQL 黑科技-递规二分法切分汉字
--test_cond表设置了级联删除,所以不需要delete触发drop trigger if exists tri_test_change on test;drop function if exists tri_test_trigger();create or replace function tri_test_trigger() returns trigger as $$ begin if (TG_OP = 'INSERT') theninsert into test_cond(objectid,keys) values(NEW.objectid,dichotomy_split_tsv(NEW.name));return NEW;elsif (TG_OP = 'UPDATE') thenupdate test_cond set keys=dichotomy_split_tsv(NEW.name) where objectid=NEW.objectid;return NEW;end if;RETURN NULL; end; $$ language 'plpgsql' SECURITY DEFINER;--test_cond依赖test表,因此只能在test成功后再触好,只能设置为aftercreate trigger tri_test_change after INSERT or UPDATE on test for each ROW EXECUTE PROCEDURE tri_test_trigger();
4.5 创建随机生成汉字
drop function if exists gen_random_zh(int,int);create or replace function gen_random_zh(int,int)returns textas $$select string_agg(chr((random()*(20901-19968)+19968 )::integer) , '') from generate_series(1,(random()*($2-$1)+$1)::integer);$$ language sql;
4.6 生成测试数据
每调一次ins_test插入100万数据,可以同时调用ins_test插入更多数据,以便验证模糊查询性能
drop function if exists ins_test();create or replace function ins_test() returns void as $$declare v_index integer;beginv_index := 0;for i in 1..1000 loopv_index := v_index + 1;insert into test(name)select gen_random_zh(8,32) as name from generate_series(1,1000);raise notice '%', v_index;end loop;end;$$ language plpgsql;--每调一次ins_test插入100万数据select ins_test();
4.7 验证触发器的update功能
update test set name='哈哈,我来验证了' where objectid=10000;
4.7 查询数据量
select count(*) from test;select count(*) from test_cond;
5 模糊查询测试
5.1 创建查询转换函数
drop function if exists str_to_tsquery(text,boolean);create or replace function str_to_tsquery(text,boolean default true) returns tsqueryas $$declarev_count integer;v_txt text;v_txts text[];v_result tsquery;beginv_txt := clear_punctuation($1);--数组大小为字符数量-1v_count := length(v_txt)-1;if( v_count < 1 ) thenraise exception '输入参数("%")去除标点符号后至少需要2个字符',$1;end if;for i in 1..v_count loopv_txts := array_append(v_txts, substring(v_txt,i,2));end loop;--tsquery类型要求去除重复并排序with cte1 as(select f from unnest(v_txts) as f group by f),cte2 as(select f from cte1 order by f)select string_agg(f, (case when $2 then '&' else '|' end ) )::tsquery into v_result from cte2;return v_result;end;$$ language plpgsql strict immutable;
5.2 模糊数据
关键字字数越多,查询越准确,并且查询速度
--因优先级问题,此语句可能会不走rum索引explain (analyze,verbose,costs,buffers,timing) select f.* from test as f left join test_cond as son f.objectid=s.objectidwhere name @@ (str_to_tsquery('価仴'))
建议采用下面的方式,保证查询使用rum索引
explain (analyze,verbose,costs,buffers,timing) select f.* from test as f left join test_cond as son f.objectid=s.objectidwhere s.keys @@ (select str_to_tsquery('哈哈'));explain (analyze,verbose,costs,buffers,timing) select f.* from test as f left join test_cond as son f.objectid=s.objectidwhere s.keys @@ (select str_to_tsquery('哈我'));explain (analyze,verbose,costs,buffers,timing) select f.* from test as f left join test_cond as son f.objectid=s.objectidwhere s.keys @@ (select str_to_tsquery('我来'));explain (analyze,verbose,costs,buffers,timing) select f.* from test as f left join test_cond as son f.objectid=s.objectidwhere s.keys @@ (select str_to_tsquery('来验'));explain (analyze,verbose,costs,buffers,timing) select f.* from test as f left join test_cond as son f.objectid=s.objectidwhere s.keys @@ (select str_to_tsquery('验证'));explain (analyze,verbose,costs,buffers,timing) select f.* from test as f left join test_cond as son f.objectid=s.objectidwhere s.keys @@ (select str_to_tsquery('证了'));explain (analyze,verbose,costs,buffers,timing) select f.* from test as f left join test_cond as son f.objectid=s.objectidwhere s.keys @@ (select str_to_tsquery('哈哈,我来验证了'));explain (analyze,verbose,costs,buffers,timing) select f.* from test as f left join test_cond as son f.objectid=s.objectidwhere s.keys @@ (select str_to_tsquery('侒亩'));
新版本函数,请参看PostgreSQL 黑科技-递规二分法切分汉字
explain (analyze,verbose,costs,buffers,timing) select f.* from test as f left join test_cond as son f.objectid=s.objectidwhere s.keys @@ (select dichotomy_split_tsq('哈哈'));explain (analyze,verbose,costs,buffers,timing) select f.* from test as f left join test_cond as son f.objectid=s.objectidwhere s.keys @@ (select dichotomy_split_tsq('哈我'));explain (analyze,verbose,costs,buffers,timing) select f.* from test as f left join test_cond as son f.objectid=s.objectidwhere s.keys @@ (select dichotomy_split_tsq('我来'));explain (analyze,verbose,costs,buffers,timing) select f.* from test as f left join test_cond as son f.objectid=s.objectidwhere s.keys @@ (select dichotomy_split_tsq('来验'));explain (analyze,verbose,costs,buffers,timing) select f.* from test as f left join test_cond as son f.objectid=s.objectidwhere s.keys @@ (select dichotomy_split_tsq('验证'));explain (analyze,verbose,costs,buffers,timing) select f.* from test as f left join test_cond as son f.objectid=s.objectidwhere s.keys @@ (select dichotomy_split_tsq('证了'));explain (analyze,verbose,costs,buffers,timing) select f.* from test as f left join test_cond as son f.objectid=s.objectidwhere s.keys @@ (select dichotomy_split_tsq('哈哈,我来验证了'));explain (analyze,verbose,costs,buffers,timing) select f.* from test as f left join test_cond as son f.objectid=s.objectidwhere s.keys @@ (select dichotomy_split_tsq('侒亩'));
5.3 验证级联删除功能
删除后再执行上面的sql查询均查不到数据
delete from test where objectid=10000;``