Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * dict_snowball.c
4 : * Snowball dictionary
5 : *
6 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 : *
8 : * IDENTIFICATION
9 : * src/backend/snowball/dict_snowball.c
10 : *
11 : *-------------------------------------------------------------------------
12 : */
13 : #include "postgres.h"
14 :
15 : #include "commands/defrem.h"
16 : #include "tsearch/ts_locale.h"
17 : #include "tsearch/ts_utils.h"
18 :
19 : /* Some platforms define MAXINT and/or MININT, causing conflicts */
20 : #ifdef MAXINT
21 : #undef MAXINT
22 : #endif
23 : #ifdef MININT
24 : #undef MININT
25 : #endif
26 :
27 : /* Now we can include the original Snowball header.h */
28 : #include "snowball/libstemmer/header.h"
29 : #include "snowball/libstemmer/stem_ISO_8859_1_danish.h"
30 : #include "snowball/libstemmer/stem_ISO_8859_1_dutch.h"
31 : #include "snowball/libstemmer/stem_ISO_8859_1_english.h"
32 : #include "snowball/libstemmer/stem_ISO_8859_1_finnish.h"
33 : #include "snowball/libstemmer/stem_ISO_8859_1_french.h"
34 : #include "snowball/libstemmer/stem_ISO_8859_1_german.h"
35 : #include "snowball/libstemmer/stem_ISO_8859_1_hungarian.h"
36 : #include "snowball/libstemmer/stem_ISO_8859_1_italian.h"
37 : #include "snowball/libstemmer/stem_ISO_8859_1_norwegian.h"
38 : #include "snowball/libstemmer/stem_ISO_8859_1_porter.h"
39 : #include "snowball/libstemmer/stem_ISO_8859_1_portuguese.h"
40 : #include "snowball/libstemmer/stem_ISO_8859_1_spanish.h"
41 : #include "snowball/libstemmer/stem_ISO_8859_1_swedish.h"
42 : #include "snowball/libstemmer/stem_ISO_8859_2_romanian.h"
43 : #include "snowball/libstemmer/stem_KOI8_R_russian.h"
44 : #include "snowball/libstemmer/stem_UTF_8_danish.h"
45 : #include "snowball/libstemmer/stem_UTF_8_dutch.h"
46 : #include "snowball/libstemmer/stem_UTF_8_english.h"
47 : #include "snowball/libstemmer/stem_UTF_8_finnish.h"
48 : #include "snowball/libstemmer/stem_UTF_8_french.h"
49 : #include "snowball/libstemmer/stem_UTF_8_german.h"
50 : #include "snowball/libstemmer/stem_UTF_8_hungarian.h"
51 : #include "snowball/libstemmer/stem_UTF_8_italian.h"
52 : #include "snowball/libstemmer/stem_UTF_8_norwegian.h"
53 : #include "snowball/libstemmer/stem_UTF_8_porter.h"
54 : #include "snowball/libstemmer/stem_UTF_8_portuguese.h"
55 : #include "snowball/libstemmer/stem_UTF_8_romanian.h"
56 : #include "snowball/libstemmer/stem_UTF_8_russian.h"
57 : #include "snowball/libstemmer/stem_UTF_8_spanish.h"
58 : #include "snowball/libstemmer/stem_UTF_8_swedish.h"
59 : #include "snowball/libstemmer/stem_UTF_8_turkish.h"
60 :
/* Magic block identifying this file as a PostgreSQL loadable module. */
PG_MODULE_MAGIC;

/* Version-1 calling-convention declarations for the entry points below. */
PG_FUNCTION_INFO_V1(dsnowball_init);

PG_FUNCTION_INFO_V1(dsnowball_lexize);
66 :
67 : /* List of supported modules */
68 : typedef struct stemmer_module
69 : {
70 : const char *name;
71 : pg_enc enc;
72 : struct SN_env *(*create) (void);
73 : void (*close) (struct SN_env *);
74 : int (*stem) (struct SN_env *);
75 : } stemmer_module;
76 :
77 : static const stemmer_module stemmer_modules[] =
78 : {
79 : /*
80 : * Stemmers list from Snowball distribution
81 : */
82 : {"danish", PG_LATIN1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
83 : {"dutch", PG_LATIN1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
84 : {"english", PG_LATIN1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
85 : {"finnish", PG_LATIN1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
86 : {"french", PG_LATIN1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
87 : {"german", PG_LATIN1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
88 : {"hungarian", PG_LATIN1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
89 : {"italian", PG_LATIN1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
90 : {"norwegian", PG_LATIN1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
91 : {"porter", PG_LATIN1, porter_ISO_8859_1_create_env, porter_ISO_8859_1_close_env, porter_ISO_8859_1_stem},
92 : {"portuguese", PG_LATIN1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
93 : {"spanish", PG_LATIN1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
94 : {"swedish", PG_LATIN1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
95 : {"romanian", PG_LATIN2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
96 : {"russian", PG_KOI8R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
97 : {"danish", PG_UTF8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
98 : {"dutch", PG_UTF8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
99 : {"english", PG_UTF8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
100 : {"finnish", PG_UTF8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
101 : {"french", PG_UTF8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
102 : {"german", PG_UTF8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
103 : {"hungarian", PG_UTF8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
104 : {"italian", PG_UTF8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
105 : {"norwegian", PG_UTF8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
106 : {"porter", PG_UTF8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem},
107 : {"portuguese", PG_UTF8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
108 : {"romanian", PG_UTF8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
109 : {"russian", PG_UTF8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
110 : {"spanish", PG_UTF8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
111 : {"swedish", PG_UTF8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
112 : {"turkish", PG_UTF8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
113 :
114 : /*
115 : * Stemmer with PG_SQL_ASCII encoding should be valid for any server
116 : * encoding
117 : */
118 : {"english", PG_SQL_ASCII, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
119 :
120 : {NULL, 0, NULL, NULL, NULL} /* list end marker */
121 : };
122 :
123 :
124 : typedef struct DictSnowball
125 : {
126 : struct SN_env *z;
127 : StopList stoplist;
128 : bool needrecode; /* needs recoding before/after call stem */
129 : int (*stem) (struct SN_env *z);
130 :
131 : /*
132 : * snowball saves alloced memory between calls, so we should run it in our
133 : * private memory context. Note, init function is executed in long lived
134 : * context, so we just remember CurrentMemoryContext
135 : */
136 : MemoryContext dictCtx;
137 : } DictSnowball;
138 :
139 :
140 : static void
141 12 : locate_stem_module(DictSnowball *d, char *lang)
142 : {
143 : const stemmer_module *m;
144 :
145 : /*
146 : * First, try to find exact match of stemmer module. Stemmer with
147 : * PG_SQL_ASCII encoding is treated as working with any server encoding
148 : */
149 216 : for (m = stemmer_modules; m->name; m++)
150 : {
151 252 : if ((m->enc == PG_SQL_ASCII || m->enc == GetDatabaseEncoding()) &&
152 36 : pg_strcasecmp(m->name, lang) == 0)
153 : {
154 12 : d->stem = m->stem;
155 12 : d->z = m->create();
156 12 : d->needrecode = false;
157 12 : return;
158 : }
159 : }
160 :
161 : /*
162 : * Second, try to find stemmer for needed language for UTF8 encoding.
163 : */
164 0 : for (m = stemmer_modules; m->name; m++)
165 : {
166 0 : if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
167 : {
168 0 : d->stem = m->stem;
169 0 : d->z = m->create();
170 0 : d->needrecode = true;
171 0 : return;
172 : }
173 : }
174 :
175 0 : ereport(ERROR,
176 : (errcode(ERRCODE_UNDEFINED_OBJECT),
177 : errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
178 : lang, GetDatabaseEncodingName())));
179 : }
180 :
181 : Datum
182 12 : dsnowball_init(PG_FUNCTION_ARGS)
183 : {
184 12 : List *dictoptions = (List *) PG_GETARG_POINTER(0);
185 : DictSnowball *d;
186 12 : bool stoploaded = false;
187 : ListCell *l;
188 :
189 12 : d = (DictSnowball *) palloc0(sizeof(DictSnowball));
190 :
191 36 : foreach(l, dictoptions)
192 : {
193 24 : DefElem *defel = (DefElem *) lfirst(l);
194 :
195 24 : if (pg_strcasecmp("StopWords", defel->defname) == 0)
196 : {
197 12 : if (stoploaded)
198 0 : ereport(ERROR,
199 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
200 : errmsg("multiple StopWords parameters")));
201 12 : readstoplist(defGetString(defel), &d->stoplist, lowerstr);
202 12 : stoploaded = true;
203 : }
204 12 : else if (pg_strcasecmp("Language", defel->defname) == 0)
205 : {
206 12 : if (d->stem)
207 0 : ereport(ERROR,
208 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
209 : errmsg("multiple Language parameters")));
210 12 : locate_stem_module(d, defGetString(defel));
211 : }
212 : else
213 : {
214 0 : ereport(ERROR,
215 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
216 : errmsg("unrecognized Snowball parameter: \"%s\"",
217 : defel->defname)));
218 : }
219 : }
220 :
221 12 : if (!d->stem)
222 0 : ereport(ERROR,
223 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
224 : errmsg("missing Language parameter")));
225 :
226 12 : d->dictCtx = CurrentMemoryContext;
227 :
228 12 : PG_RETURN_POINTER(d);
229 : }
230 :
231 : Datum
232 1142 : dsnowball_lexize(PG_FUNCTION_ARGS)
233 : {
234 1142 : DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
235 1142 : char *in = (char *) PG_GETARG_POINTER(1);
236 1142 : int32 len = PG_GETARG_INT32(2);
237 1142 : char *txt = lowerstr_with_len(in, len);
238 1142 : TSLexeme *res = palloc0(sizeof(TSLexeme) * 2);
239 :
240 1142 : if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
241 : {
242 397 : pfree(txt);
243 : }
244 : else
245 : {
246 : MemoryContext saveCtx;
247 :
248 : /*
249 : * recode to utf8 if stemmer is utf8 and doesn't match server encoding
250 : */
251 745 : if (d->needrecode)
252 : {
253 : char *recoded;
254 :
255 0 : recoded = pg_server_to_any(txt, strlen(txt), PG_UTF8);
256 0 : if (recoded != txt)
257 : {
258 0 : pfree(txt);
259 0 : txt = recoded;
260 : }
261 : }
262 :
263 : /* see comment about d->dictCtx */
264 745 : saveCtx = MemoryContextSwitchTo(d->dictCtx);
265 745 : SN_set_current(d->z, strlen(txt), (symbol *) txt);
266 745 : d->stem(d->z);
267 745 : MemoryContextSwitchTo(saveCtx);
268 :
269 745 : if (d->z->p && d->z->l)
270 : {
271 745 : txt = repalloc(txt, d->z->l + 1);
272 745 : memcpy(txt, d->z->p, d->z->l);
273 745 : txt[d->z->l] = '\0';
274 : }
275 :
276 : /* back recode if needed */
277 745 : if (d->needrecode)
278 : {
279 : char *recoded;
280 :
281 0 : recoded = pg_any_to_server(txt, strlen(txt), PG_UTF8);
282 0 : if (recoded != txt)
283 : {
284 0 : pfree(txt);
285 0 : txt = recoded;
286 : }
287 : }
288 :
289 745 : res->lexeme = txt;
290 : }
291 :
292 1142 : PG_RETURN_POINTER(res);
293 : }
|