12.9. Example of Creating a Rule-Based Dictionary

The motivation for this example dictionary is to control the indexing of integers (signed and unsigned) and, consequently, to minimize the number of unique words, which greatly affects search performance.

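The dictionary accepts two options:

The MAXLEN parameter specifies the maximum length of a number considered as a 'good' integer. The default value is 6.

The REJECTLONG parameter specifies whether a 'long' integer should be indexed or treated as a stop word. If REJECTLONG=FALSE (default), the dictionary returns the leading MAXLEN digits of the integer. If REJECTLONG=TRUE, the dictionary treats a long integer as a stop word.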

A similar idea can be applied to the indexing of decimal numbers, for example, in the DecDict dictionary. That dictionary accepts two options: the MAXLENFRAC parameter specifies the maximum length of the fractional part considered as a 'good' decimal. The default value is 3. The REJECTLONG parameter controls whether a decimal number with a 'long' fractional part should be indexed or treated as a stop word. If REJECTLONG=FALSE (default), the dictionary returns the decimal number with its fractional part truncated to MAXLENFRAC digits. If REJECTLONG=TRUE, the dictionary treats the number as a stop word. Notice that REJECTLONG=FALSE allows the indexing of 'shortened' numbers, so search results will contain documents with shortened numbers.

Examples:

SELECT ts_lexize('intdict', 11234567890);
 ts_lexize
-----------
 {112345}

Now, we want to ignore long integers:


ALTER TEXT SEARCH DICTIONARY intdict (
    MAXLEN = 6, REJECTLONG = TRUE
);

SELECT ts_lexize('intdict', 11234567890);
 ts_lexize
-----------
 {}

Create a contrib/dict_intdict directory containing the files dict_tmpl.c, Makefile, and dict_intdict.sql.in, then build, install, and load the dictionary:

$ make && make install
$ psql DBNAME < dict_intdict.sql

This is a dict_tmpl.c file:

#include "postgres.h"
#include "utils/builtins.h"
#include "fmgr.h"

#ifdef PG_MODULE_MAGIC
PG_MODULE_MAGIC;
#endif

#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "tsearch/ts_utils.h"

/*
 * Per-dictionary settings for the integer dictionary.
 */
typedef struct DictInt
{
    /* longest token length (in bytes) that is passed through unchanged */
    int         maxlen;
    /* true: over-long tokens become stop words; false: they are truncated */
    bool        rejectlong;
} DictInt;


PG_FUNCTION_INFO_V1(dinit_intdict);
Datum dinit_intdict(PG_FUNCTION_ARGS);

Datum
dinit_intdict(PG_FUNCTION_ARGS) {
    DictInt *d = (DictInt*)malloc( sizeof(DictInt) );
    Map *cfg, *pcfg;
    text *in;

    if (!d)
        elog(ERROR, "No memory");
    memset(d, 0, sizeof(DictInt));

    /* Your INIT code */
    /* defaults */
    d->maxlen = 6;
    d->rejectlong = false;

    if (PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL) /* no options */
        PG_RETURN_POINTER(d);

    in = PG_GETARG_TEXT_P(0);
    parse_keyvalpairs(in, &cfg);
    PG_FREE_IF_COPY(in, 0);
    pcfg=cfg;

    while (pcfg->key)
    {
        if (strcasecmp("MAXLEN", pcfg->key) == 0)
                d->maxlen=atoi(pcfg->value);
        else if ( strcasecmp("REJECTLONG", pcfg->key) == 0)
        {
           if ( strcasecmp("true", pcfg->value) == 0 )
               d->rejectlong=true;
           else if ( strcasecmp("false", pcfg->value) == 0)
               d->rejectlong=false;
           else
               elog(ERROR,"Unknown value: %s => %s", pcfg->key, pcfg->value);
        }
        else
            elog(ERROR,"Unknown option: %s => %s", pcfg->key, pcfg->value);

        pfree(pcfg->key);
        pfree(pcfg->value);
        pcfg++;
    }
    pfree(cfg);

    PG_RETURN_POINTER(d);
 }

PG_FUNCTION_INFO_V1(dlexize_intdict);
Datum dlexize_intdict(PG_FUNCTION_ARGS);

/*
 * dlexize_intdict - lexize one integer token.
 *
 * Arguments: 0 = DictInt settings, 1 = pointer to the token bytes (not
 * necessarily NUL-terminated), 2 = token length in bytes.
 *
 * Returns a palloc'd, two-element TSLexeme array terminated by an entry
 * whose lexeme is NULL:
 *   - token no longer than maxlen: one lexeme holding a copy of the token;
 *   - longer token, rejectlong set: an empty array (token is a stop word);
 *   - longer token, rejectlong unset: one lexeme holding the first maxlen
 *     bytes of the token.
 */
Datum
dlexize_intdict(PG_FUNCTION_ARGS)
{
    DictInt    *d = (DictInt *) PG_GETARG_POINTER(0);
    char       *in = (char *) PG_GETARG_POINTER(1);
    int         len = PG_GETARG_INT32(2);
    /* never index with a negative length, whatever the settings contain */
    int         maxlen = (d->maxlen < 0) ? 0 : d->maxlen;
    /*
     * palloc0, not palloc: every field of both entries (not just .lexeme)
     * must be zeroed, otherwise the caller reads uninitialised memory in
     * the fields this function does not set explicitly.
     */
    TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);

    if (len > maxlen)
    {
        if (d->rejectlong)
        {
            /* stop word: res[0].lexeme is already NULL, i.e. empty array */
            PG_RETURN_POINTER(res);
        }

        /* cut integer: keep only the leading maxlen bytes */
        len = maxlen;
    }

    res[0].lexeme = pnstrdup(in, len);

    PG_RETURN_POINTER(res);
}

This is the Makefile:

# Makefile for the example integer dictionary.
#
# Location of this directory relative to the top of the build tree; the
# two variables must be set before Makefile.global is included.
subdir = contrib/dict_intdict
top_builddir = ../..
include $(top_builddir)/src/Makefile.global

# Name of the module to build and the object file(s) that make it up.
MODULE_big = dict_intdict
OBJS =  dict_tmpl.o
# Installation script; listed as "built", presumably generated from
# dict_intdict.sql.in by the shared contrib rules -- confirm.
DATA_built = dict_intdict.sql
# No documentation files are installed.
DOCS =

# Shared contrib build rules; these consume the variables set above, so this
# include must stay last.  NOTE(review): top_srcdir is not set in this file
# and is assumed to come from Makefile.global.
include $(top_srcdir)/contrib/contrib-global.mk

This is a dict_intdict.sql.in:

-- Installation script for the example integer dictionary.

SET default_text_search_config = 'english';

BEGIN;

-- Initialisation function: receives the dictionary options.
-- Deliberately not STRICT, so it is still called when no options are given.
CREATE OR REPLACE FUNCTION dinit_intdict(internal)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C;

-- Lexize function: converts one token into an array of lexemes.
CREATE OR REPLACE FUNCTION dlexize_intdict(internal, internal, internal, internal)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;

-- Template tying the two C functions together.
CREATE TEXT SEARCH TEMPLATE intdict_template (
    INIT = dinit_intdict,
    LEXIZE = dlexize_intdict
);

-- The dictionary itself, created with the default option values.
CREATE TEXT SEARCH DICTIONARY intdict (
    TEMPLATE = intdict_template,
    MAXLEN = 6,
    REJECTLONG = false
);

COMMENT ON TEXT SEARCH DICTIONARY intdict IS 'Dictionary for Integers';

COMMIT;