LCOV - code coverage report
Current view: top level - src/backend/snowball - dict_snowball.c (source / functions) Hit Total Coverage
Test: PostgreSQL Lines: 47 66 71.2 %
Date: 2017-09-29 13:40:31 Functions: 6 6 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * dict_snowball.c
       4             :  *      Snowball dictionary
       5             :  *
       6             :  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
       7             :  *
       8             :  * IDENTIFICATION
       9             :  *    src/backend/snowball/dict_snowball.c
      10             :  *
      11             :  *-------------------------------------------------------------------------
      12             :  */
      13             : #include "postgres.h"
      14             : 
      15             : #include "commands/defrem.h"
      16             : #include "tsearch/ts_locale.h"
      17             : #include "tsearch/ts_utils.h"
      18             : 
      19             : /* Some platforms define MAXINT and/or MININT, causing conflicts */
      20             : #ifdef MAXINT
      21             : #undef MAXINT
      22             : #endif
      23             : #ifdef MININT
      24             : #undef MININT
      25             : #endif
      26             : 
      27             : /* Now we can include the original Snowball header.h */
      28             : #include "snowball/libstemmer/header.h"
      29             : #include "snowball/libstemmer/stem_ISO_8859_1_danish.h"
      30             : #include "snowball/libstemmer/stem_ISO_8859_1_dutch.h"
      31             : #include "snowball/libstemmer/stem_ISO_8859_1_english.h"
      32             : #include "snowball/libstemmer/stem_ISO_8859_1_finnish.h"
      33             : #include "snowball/libstemmer/stem_ISO_8859_1_french.h"
      34             : #include "snowball/libstemmer/stem_ISO_8859_1_german.h"
      35             : #include "snowball/libstemmer/stem_ISO_8859_1_hungarian.h"
      36             : #include "snowball/libstemmer/stem_ISO_8859_1_italian.h"
      37             : #include "snowball/libstemmer/stem_ISO_8859_1_norwegian.h"
      38             : #include "snowball/libstemmer/stem_ISO_8859_1_porter.h"
      39             : #include "snowball/libstemmer/stem_ISO_8859_1_portuguese.h"
      40             : #include "snowball/libstemmer/stem_ISO_8859_1_spanish.h"
      41             : #include "snowball/libstemmer/stem_ISO_8859_1_swedish.h"
      42             : #include "snowball/libstemmer/stem_ISO_8859_2_romanian.h"
      43             : #include "snowball/libstemmer/stem_KOI8_R_russian.h"
      44             : #include "snowball/libstemmer/stem_UTF_8_danish.h"
      45             : #include "snowball/libstemmer/stem_UTF_8_dutch.h"
      46             : #include "snowball/libstemmer/stem_UTF_8_english.h"
      47             : #include "snowball/libstemmer/stem_UTF_8_finnish.h"
      48             : #include "snowball/libstemmer/stem_UTF_8_french.h"
      49             : #include "snowball/libstemmer/stem_UTF_8_german.h"
      50             : #include "snowball/libstemmer/stem_UTF_8_hungarian.h"
      51             : #include "snowball/libstemmer/stem_UTF_8_italian.h"
      52             : #include "snowball/libstemmer/stem_UTF_8_norwegian.h"
      53             : #include "snowball/libstemmer/stem_UTF_8_porter.h"
      54             : #include "snowball/libstemmer/stem_UTF_8_portuguese.h"
      55             : #include "snowball/libstemmer/stem_UTF_8_romanian.h"
      56             : #include "snowball/libstemmer/stem_UTF_8_russian.h"
      57             : #include "snowball/libstemmer/stem_UTF_8_spanish.h"
      58             : #include "snowball/libstemmer/stem_UTF_8_swedish.h"
      59             : #include "snowball/libstemmer/stem_UTF_8_turkish.h"
      60             : 
      61           6 : PG_MODULE_MAGIC;
      62             : 
      63           6 : PG_FUNCTION_INFO_V1(dsnowball_init);
      64             : 
      65           6 : PG_FUNCTION_INFO_V1(dsnowball_lexize);
      66             : 
      67             : /* List of supported modules */
      68             : typedef struct stemmer_module
      69             : {
      70             :     const char *name;
      71             :     pg_enc      enc;
      72             :     struct SN_env *(*create) (void);
      73             :     void        (*close) (struct SN_env *);
      74             :     int         (*stem) (struct SN_env *);
      75             : } stemmer_module;
      76             : 
      77             : static const stemmer_module stemmer_modules[] =
      78             : {
      79             :     /*
      80             :      * Stemmers list from Snowball distribution
      81             :      */
      82             :     {"danish", PG_LATIN1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
      83             :     {"dutch", PG_LATIN1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
      84             :     {"english", PG_LATIN1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
      85             :     {"finnish", PG_LATIN1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
      86             :     {"french", PG_LATIN1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
      87             :     {"german", PG_LATIN1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
      88             :     {"hungarian", PG_LATIN1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
      89             :     {"italian", PG_LATIN1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
      90             :     {"norwegian", PG_LATIN1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
      91             :     {"porter", PG_LATIN1, porter_ISO_8859_1_create_env, porter_ISO_8859_1_close_env, porter_ISO_8859_1_stem},
      92             :     {"portuguese", PG_LATIN1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
      93             :     {"spanish", PG_LATIN1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
      94             :     {"swedish", PG_LATIN1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
      95             :     {"romanian", PG_LATIN2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
      96             :     {"russian", PG_KOI8R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
      97             :     {"danish", PG_UTF8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
      98             :     {"dutch", PG_UTF8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
      99             :     {"english", PG_UTF8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
     100             :     {"finnish", PG_UTF8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
     101             :     {"french", PG_UTF8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
     102             :     {"german", PG_UTF8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
     103             :     {"hungarian", PG_UTF8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
     104             :     {"italian", PG_UTF8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
     105             :     {"norwegian", PG_UTF8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
     106             :     {"porter", PG_UTF8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem},
     107             :     {"portuguese", PG_UTF8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
     108             :     {"romanian", PG_UTF8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
     109             :     {"russian", PG_UTF8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
     110             :     {"spanish", PG_UTF8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
     111             :     {"swedish", PG_UTF8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
     112             :     {"turkish", PG_UTF8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
     113             : 
     114             :     /*
     115             :      * Stemmer with PG_SQL_ASCII encoding should be valid for any server
     116             :      * encoding
     117             :      */
     118             :     {"english", PG_SQL_ASCII, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
     119             : 
     120             :     {NULL, 0, NULL, NULL, NULL} /* list end marker */
     121             : };
     122             : 
     123             : 
     124             : typedef struct DictSnowball
     125             : {
     126             :     struct SN_env *z;
     127             :     StopList    stoplist;
     128             :     bool        needrecode;     /* needs recoding before/after call stem */
     129             :     int         (*stem) (struct SN_env *z);
     130             : 
     131             :     /*
     132             :      * Snowball keeps allocated memory between calls, so we must run it in our
     133             :      * private memory context. Note that the init function is executed in a
     134             :      * long-lived context, so we just remember CurrentMemoryContext.
     135             :      */
     136             :     MemoryContext dictCtx;
     137             : } DictSnowball;
     138             : 
     139             : 
     140             : static void
     141           6 : locate_stem_module(DictSnowball *d, char *lang)
     142             : {
     143             :     const stemmer_module *m;
     144             : 
     145             :     /*
     146             :      * First, try to find an exact match for the stemmer module. A stemmer
     147             :      * with PG_SQL_ASCII encoding is treated as working with any server encoding.
     148             :      */
     149         108 :     for (m = stemmer_modules; m->name; m++)
     150             :     {
     151         126 :         if ((m->enc == PG_SQL_ASCII || m->enc == GetDatabaseEncoding()) &&
     152          18 :             pg_strcasecmp(m->name, lang) == 0)
     153             :         {
     154           6 :             d->stem = m->stem;
     155           6 :             d->z = m->create();
     156           6 :             d->needrecode = false;
     157           6 :             return;
     158             :         }
     159             :     }
     160             : 
     161             :     /*
     162             :      * Second, try to find a stemmer for the needed language in UTF8 encoding.
     163             :      */
     164           0 :     for (m = stemmer_modules; m->name; m++)
     165             :     {
     166           0 :         if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
     167             :         {
     168           0 :             d->stem = m->stem;
     169           0 :             d->z = m->create();
     170           0 :             d->needrecode = true;
     171           0 :             return;
     172             :         }
     173             :     }
     174             : 
     175           0 :     ereport(ERROR,
     176             :             (errcode(ERRCODE_UNDEFINED_OBJECT),
     177             :              errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
     178             :                     lang, GetDatabaseEncodingName())));
     179             : }
     180             : 
     181             : Datum
     182           6 : dsnowball_init(PG_FUNCTION_ARGS)
     183             : {
     184           6 :     List       *dictoptions = (List *) PG_GETARG_POINTER(0);
     185             :     DictSnowball *d;
     186           6 :     bool        stoploaded = false;
     187             :     ListCell   *l;
     188             : 
     189           6 :     d = (DictSnowball *) palloc0(sizeof(DictSnowball));
     190             : 
     191          18 :     foreach(l, dictoptions)
     192             :     {
     193          12 :         DefElem    *defel = (DefElem *) lfirst(l);
     194             : 
     195          12 :         if (pg_strcasecmp("StopWords", defel->defname) == 0)
     196             :         {
     197           6 :             if (stoploaded)
     198           0 :                 ereport(ERROR,
     199             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     200             :                          errmsg("multiple StopWords parameters")));
     201           6 :             readstoplist(defGetString(defel), &d->stoplist, lowerstr);
     202           6 :             stoploaded = true;
     203             :         }
     204           6 :         else if (pg_strcasecmp("Language", defel->defname) == 0)
     205             :         {
     206           6 :             if (d->stem)
     207           0 :                 ereport(ERROR,
     208             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     209             :                          errmsg("multiple Language parameters")));
     210           6 :             locate_stem_module(d, defGetString(defel));
     211             :         }
     212             :         else
     213             :         {
     214           0 :             ereport(ERROR,
     215             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     216             :                      errmsg("unrecognized Snowball parameter: \"%s\"",
     217             :                             defel->defname)));
     218             :         }
     219             :     }
     220             : 
     221           6 :     if (!d->stem)
     222           0 :         ereport(ERROR,
     223             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     224             :                  errmsg("missing Language parameter")));
     225             : 
     226           6 :     d->dictCtx = CurrentMemoryContext;
     227             : 
     228           6 :     PG_RETURN_POINTER(d);
     229             : }
     230             : 
     231             : Datum
     232        1142 : dsnowball_lexize(PG_FUNCTION_ARGS)
     233             : {
     234        1142 :     DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
     235        1142 :     char       *in = (char *) PG_GETARG_POINTER(1);
     236        1142 :     int32       len = PG_GETARG_INT32(2);
     237        1142 :     char       *txt = lowerstr_with_len(in, len);
     238        1142 :     TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);
     239             : 
     240        1142 :     if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
     241             :     {
     242         397 :         pfree(txt);
     243             :     }
     244             :     else
     245             :     {
     246             :         MemoryContext saveCtx;
     247             : 
     248             :         /*
     249             :          * recode to UTF8 if the stemmer is UTF8 and doesn't match the server encoding
     250             :          */
     251         745 :         if (d->needrecode)
     252             :         {
     253             :             char       *recoded;
     254             : 
     255           0 :             recoded = pg_server_to_any(txt, strlen(txt), PG_UTF8);
     256           0 :             if (recoded != txt)
     257             :             {
     258           0 :                 pfree(txt);
     259           0 :                 txt = recoded;
     260             :             }
     261             :         }
     262             : 
     263             :         /* see comment about d->dictCtx */
     264         745 :         saveCtx = MemoryContextSwitchTo(d->dictCtx);
     265         745 :         SN_set_current(d->z, strlen(txt), (symbol *) txt);
     266         745 :         d->stem(d->z);
     267         745 :         MemoryContextSwitchTo(saveCtx);
     268             : 
     269         745 :         if (d->z->p && d->z->l)
     270             :         {
     271         745 :             txt = repalloc(txt, d->z->l + 1);
     272         745 :             memcpy(txt, d->z->p, d->z->l);
     273         745 :             txt[d->z->l] = '\0';
     274             :         }
     275             : 
     276             :         /* recode back to the server encoding if needed */
     277         745 :         if (d->needrecode)
     278             :         {
     279             :             char       *recoded;
     280             : 
     281           0 :             recoded = pg_any_to_server(txt, strlen(txt), PG_UTF8);
     282           0 :             if (recoded != txt)
     283             :             {
     284           0 :                 pfree(txt);
     285           0 :                 txt = recoded;
     286             :             }
     287             :         }
     288             : 
     289         745 :         res->lexeme = txt;
     290             :     }
     291             : 
     292        1142 :     PG_RETURN_POINTER(res);
     293             : }

Generated by: LCOV version 1.11