android sqlite 分词,sqlite fts3自定义分词器 zz
returnsqlite3_finalize(pStmt);}要想实现自定义的分词器,最关键的时是得到指向sqlite3_tokenizer_module结构的一个指针,sqlite3_tokenizer_module结构体定义如下:struct sqlite3_tokenizer_module {int iVersion; //版本号,必须设置为0int (*xCreate)( //创建虚表时自
return
sqlite3_finalize(pStmt);
}
要想实现自定义的分词器,最关键的是得到指向sqlite3_tokenizer_module结构的一个指针。sqlite3_tokenizer_module结构体定义如下:
// Table of callbacks that an FTS3 tokenizer implementation hands to SQLite.
struct sqlite3_tokenizer_module {
  // Version number; must be set to 0.
  int iVersion;

  // Called automatically when the virtual table is created; creates the
  // tokenizer.
  int (*xCreate)(
    int argc, const char *const *argv,
    sqlite3_tokenizer **ppTokenizer
  );

  // Called automatically when the database connection is closed; used to
  // release resources.
  int (*xDestroy)(sqlite3_tokenizer *pTokenizer);

  // Called automatically when data is inserted or searched, to start
  // tokenizing the given input.
  int (*xOpen)(
    sqlite3_tokenizer *pTokenizer,
    const char *pInput, int nBytes,
    sqlite3_tokenizer_cursor **ppCursor
  );

  // Called automatically once all tokens have been extracted.
  int (*xClose)(sqlite3_tokenizer_cursor *pCursor);

  // Extracts the tokenization results one token at a time.
  int (*xNext)(
    sqlite3_tokenizer_cursor *pCursor,
    const char **ppToken, int *pnBytes,
    int *piStartOffset,
    int *piEndOffset,
    int *piPosition
  );
};
有几点需要注意的是:
1
分词引擎使用sql语句注册意味着每建立一个sqlite连接都必须注册一次分词器,对于需要使用词库的中文分词器来说也意味着巨大的内存消耗。
2
在检索时,分词结果的提取和语义的解析是交替进行的。例如我们搜索"kanif OR sqlite"的时候,引擎先将全部内容传入到分词器,在调用一次xNext获取到词kanif后,再将词sqlite传入到分词器,直到全部解析完毕。
3
由于中文分词本身的特殊性,例如"北京市"很有可能被视为一个完整的词,这样在搜索"北京"的时候就无法获取到结果。如果分词器支持将"北京市"切分为"北京市"和"北京",或者将"十一月"切分为"11月"和"十一",那么需注意(*xNext)函数中的piStartOffset和piEndOffset参数。经测试,在插入数据的时候这两个参数无实际用途,但在查询的时候这两个参数决定了下一次的输入串。
附:
// NOTE(review): the six system header names were lost when this listing was
// extracted (everything inside <...> was stripped). The headers below cover
// what the visible code uses; confirm against the original source.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sqlite3.h>
#include "fts3_tokenizer.h"
#include "mmseg/mmseg.cpp"
// Starts out true; initmmseg() returns immediately once this flag is false,
// so it acts as a "dictionaries still need loading" marker.
static bool loadDic = true;
// Tokenizer object created by cusCreate(). It holds nothing beyond the base
// object; cusCreate() publishes &t->base and cusDestroy() frees that same
// pointer, so `base` has to remain the first member.
typedef struct cus_tokenizer {
sqlite3_tokenizer base;
} cus_tokenizer;
// Cursor state for one tokenization pass; allocated in cusOpen(), advanced by
// cusNext() and released in cusClose().
typedef struct cus_tokenizer_cursor {
// Base object; cusOpen() returns &c->base, so this must stay the first member.
sqlite3_tokenizer_cursor base;
// Only ever set to NULL by cusOpen(); cusClose() frees it if non-NULL.
char *pInput;
// Input length in bytes as resolved by cusOpen().
int nBytes;
// Running token counter; cusNext() reports it as the position and increments it.
int iToken;
// sqlite3_malloc'ed, NUL-terminated copy of the most recent token; replaced on
// each cusNext() call and freed in cusClose().
char *pToken;
// Segmenter handle obtained from mmseg_algor_create() and released with
// mmseg_algor_destroy().
rmmseg::Algorithm *pAlgor;
} cus_tokenizer_cursor;
// Loads the mmseg dictionaries "chars.dic" and "words.dic" the first time it
// is called; later calls return immediately because loadDic has been cleared.
//
// NOTE(review): the results of mmseg_load_words() are not checked here, and
// the dictionary paths are given without a directory -- confirm how
// mmseg_load_words() reports failures and resolves relative paths.
void initmmseg(void){
  if( !loadDic ){
    return;
  }
  mmseg_load_words("chars.dic");
  mmseg_load_words("words.dic");
  // The listing had `False`, which is not a defined identifier.
  loadDic = false;
}
static int cusCreate(
int argc, const char * const *argv,
sqlite3_tokenizer **ppTokenizer
){
cus_tokenizer *t;
t = (cus_tokenizer *) sqlite3_malloc(sizeof(*t));
if( t==NULL ) return SQLITE_NOMEM;
memset(t, 0, sizeof(*t));
initmmseg();
*ppTokenizer = &t->base;
return SQLITE_OK;
}
// xDestroy callback: frees the tokenizer object with sqlite3_free().
// Always returns SQLITE_OK.
static int cusDestroy(sqlite3_tokenizer *pTokenizer){
  // `base` is the first member of cus_tokenizer, so both pointers are the
  // same address.
  cus_tokenizer *pOwned = (cus_tokenizer *)pTokenizer;

  sqlite3_free(pOwned);
  return SQLITE_OK;
}
// xOpen callback: creates a cursor that tokenizes pInput.
//
// pTokenizer  tokenizer the cursor belongs to (not used here).
// pInput      text to tokenize; NULL is treated as zero-length input.
// nBytes      length of pInput in bytes; a negative value means "use
//             strlen(pInput)".
// ppCursor    receives the new cursor on success.
//
// Returns SQLITE_OK on success, or SQLITE_NOMEM when the cursor or the mmseg
// algorithm object cannot be created.
static int cusOpen(
  sqlite3_tokenizer *pTokenizer,
  const char *pInput, int nBytes,
  sqlite3_tokenizer_cursor **ppCursor
){
  cus_tokenizer_cursor *c;

  (void)pTokenizer;

  if( pInput==NULL ){
    nBytes = 0;
  }else if( nBytes<0 ){
    nBytes = (int)strlen(pInput);
  }

  c = (cus_tokenizer_cursor *)sqlite3_malloc(sizeof(*c));
  if( c==NULL ){
    return SQLITE_NOMEM;
  }
  // Zero the whole cursor, including the base object, the same way
  // cusCreate() zeroes the tokenizer.
  memset(c, 0, sizeof(*c));
  c->nBytes = nBytes;

  c->pAlgor = mmseg_algor_create(pInput, nBytes);
  if( c->pAlgor==NULL ){
    // cusNext() passes pAlgor to mmseg_next_token() without a check, so a
    // cursor without an algorithm object must not be handed out.
    // NOTE(review): assumes mmseg_algor_create() signals failure by returning
    // NULL -- confirm against the mmseg sources.
    sqlite3_free(c);
    return SQLITE_NOMEM;
  }

  *ppCursor = &c->base;
  return SQLITE_OK;
}
// xClose callback: releases everything owned by the cursor -- the pInput and
// pToken buffers, the mmseg algorithm object (if one is attached) and finally
// the cursor itself. Always returns SQLITE_OK.
static int cusClose(sqlite3_tokenizer_cursor *pCursor){
  cus_tokenizer_cursor *pCsr = (cus_tokenizer_cursor *)pCursor;

  // sqlite3_free() is a no-op for NULL, so the buffers need no guard.
  sqlite3_free(pCsr->pInput);
  sqlite3_free(pCsr->pToken);

  if( pCsr->pAlgor!=NULL ){
    mmseg_algor_destroy(pCsr->pAlgor);
  }

  sqlite3_free(pCsr);
  return SQLITE_OK;
}
// xNext callback: fetches the next token from the mmseg algorithm object.
//
// On success the token is copied into a NUL-terminated buffer owned by the
// cursor (released on the next call or in cusClose()) and reported through
// the output parameters:
//   *ppToken        the copied token text
//   *pnBytes        token length in bytes
//   *piStartOffset  token.offset as reported by mmseg
//   *piEndOffset    token.offset + token.length
//   *piPosition     running token counter of this cursor
//
// Returns SQLITE_OK when a token was produced, SQLITE_DONE when mmseg returns
// a zero-length token, and SQLITE_NOMEM when the copy cannot be allocated.
static int cusNext(
  sqlite3_tokenizer_cursor *pCursor,
  const char **ppToken, int *pnBytes,
  int *piStartOffset, int *piEndOffset, int *piPosition
){
  cus_tokenizer_cursor *c = (cus_tokenizer_cursor *)pCursor;

  // Drop the previous token copy; sqlite3_free(NULL) is a harmless no-op.
  sqlite3_free(c->pToken);
  c->pToken = NULL;

  struct Token token = mmseg_next_token(c->pAlgor);
  if( token.length==0 ){
    // Generally this point is only reached when inserting data.
    return SQLITE_DONE;
  }

  int nToken = token.length;
  c->pToken = (char *)sqlite3_malloc(nToken+1);
  if( c->pToken==NULL ){
    return SQLITE_NOMEM;
  }
  memcpy(c->pToken, token.text, nToken);
  c->pToken[nToken] = 0;

  *ppToken = c->pToken;
  *pnBytes = nToken;
  *piStartOffset = token.offset;
  *piEndOffset = token.offset + token.length;
  *piPosition = c->iToken++;
  return SQLITE_OK;
}
// Callback table for the custom tokenizer; main() passes its address to
// registerTokenizer(). Entries are positional and follow the member order of
// sqlite3_tokenizer_module.
static const sqlite3_tokenizer_module cusTokenizerModule = {
// iVersion
0,
// xCreate
cusCreate,
// xDestroy
cusDestroy,
// xOpen
cusOpen,
// xClose
cusClose,
// xNext
cusNext,
};
// Registers tokenizer module `p` under the name `zName` on connection `db`
// by running "SELECT fts3_tokenizer(?, ?)". The first parameter is bound to
// the name; the second is bound to a blob holding the module pointer itself
// (the bytes of `p`, not the structure it points to).
//
// Returns the sqlite3_prepare_v2() error code if the statement cannot be
// prepared, otherwise the result of sqlite3_finalize().
int registerTokenizer(
  sqlite3 *db,
  char *zName,
  const sqlite3_tokenizer_module *p
){
  static const char kRegisterSql[] = "SELECT fts3_tokenizer(?, ?)";
  sqlite3_stmt *pRegister = NULL;

  int status = sqlite3_prepare_v2(db, kRegisterSql, -1, &pRegister, 0);
  if( status==SQLITE_OK ){
    sqlite3_bind_text(pRegister, 1, zName, -1, SQLITE_STATIC);
    sqlite3_bind_blob(pRegister, 2, &p, sizeof(p), SQLITE_STATIC);
    sqlite3_step(pRegister);
    status = sqlite3_finalize(pRegister);
  }
  return status;
}
int main(){
const
sqlite3_tokenizer_module *ptr =
&cusTokenizerModule;
sqlite3
*pDB;
sqlite3_stmt
* stmt;
char *
errMsg = NULL;
const char
*zTail;
int rc =
sqlite3_open("test.sqlite3", &pDB);
if(rc){
printf("create error. %s\n",sqlite3_errmsg(pDB));
return rc;
}
char
token_name[] = "custoken";
registerTokenizer(pDB, token_name, ptr);
rc =
sqlite3_exec(pDB, "CREATE VIRTUAL TABLE foo USING
fts3(tokenize=custoken)", 0, 0, &errMsg);
if(rc !=
SQLITE_OK){
printf("create virtual error, %s\n", errMsg);
if(rc !=
SQLITE_OK){
printf("create virtual error, %s\n", errMsg);
return rc;
}
rc =
sqlite3_exec(pDB, "INSERT INTO foo
VALUES('\xe5\x8c\x97\xe4\xba\xac\xe5\xb8\x82')", 0, 0,
&errMsg);
if(rc !=
SQLITE_OK){
printf("insert value error, %s\n", errMsg);
return rc;
}
int nrow =
0, ncolumn = 0;
char
**azResult; //二维数组存放结果
sqlite3_get_table(pDB , "SELECT * FROM foo WHERE content MATCH
'\xe5\x8c\x97\xe4\xba\xac\xe5\xb8\x82'" , &azResult
, &nrow , &ncolumn ,
&errMsg );
int i = 0
;
printf(
"row:%d column=%d \n" , nrow , ncolumn );
printf(
"\nThe result of querying is : \n" );
for( i=0 ;
i
printf( "azResult[%d] = %s\n", i , azResult[i] );
sqlite3_free_table( azResult );
sqlite3_close(pDB);
return
0;
}
更多推荐
所有评论(0)