Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * wparser_def.c
4 : * Default text search parser
5 : *
6 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 : *
8 : *
9 : * IDENTIFICATION
10 : * src/backend/tsearch/wparser_def.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 :
15 : #include "postgres.h"
16 :
17 : #include <limits.h>
18 :
19 : #include "catalog/pg_collation.h"
20 : #include "commands/defrem.h"
21 : #include "tsearch/ts_locale.h"
22 : #include "tsearch/ts_public.h"
23 : #include "tsearch/ts_type.h"
24 : #include "tsearch/ts_utils.h"
25 : #include "utils/builtins.h"
26 :
27 :
28 : /* Define me to enable tracing of parser behavior */
29 : /* #define WPARSER_TRACE */
30 :
31 :
32 : /* Output token categories */
33 :
/*
 * Numeric codes for the kinds of token the parser can report.  The codes
 * start at 1 and follow the same order as the entries of tok_alias[] and
 * lex_descr[] below, whose slot 0 is an empty placeholder.
 */
34 : #define ASCIIWORD 1
35 : #define WORD_T 2
36 : #define NUMWORD 3
37 : #define EMAIL 4
38 : #define URL_T 5
39 : #define HOST 6
40 : #define SCIENTIFIC 7
41 : #define VERSIONNUMBER 8
42 : #define NUMPARTHWORD 9
43 : #define PARTHWORD 10
44 : #define ASCIIPARTHWORD 11
45 : #define SPACE 12
46 : #define TAG_T 13
47 : #define PROTOCOL 14
48 : #define NUMHWORD 15
49 : #define ASCIIHWORD 16
50 : #define HWORD 17
51 : #define URLPATH 18
52 : #define FILEPATH 19
53 : #define DECIMAL_T 20
54 : #define SIGNEDINT 21
55 : #define UNSIGNEDINT 22
56 : #define XMLENTITY 23
57 :
/* Highest token code defined above; keep in step when adding a category. */
58 : #define LASTNUM 23
59 :
/*
 * Short alias of each token category.  Entries are listed in token-code
 * order (ASCIIWORD = 1 ... XMLENTITY = 23); entry 0 is an empty placeholder
 * because no category has code 0.
 * NOTE(review): presumably indexed directly by token code elsewhere in the
 * file -- confirm against the lextype code.
 */
60 : static const char *const tok_alias[] = {
61 : "",
62 : "asciiword",
63 : "word",
64 : "numword",
65 : "email",
66 : "url",
67 : "host",
68 : "sfloat",
69 : "version",
70 : "hword_numpart",
71 : "hword_part",
72 : "hword_asciipart",
73 : "blank",
74 : "tag",
75 : "protocol",
76 : "numhword",
77 : "asciihword",
78 : "hword",
79 : "url_path",
80 : "file",
81 : "float",
82 : "int",
83 : "uint",
84 : "entity"
85 : };
86 :
/*
 * Human-readable description of each token category, in the same order as
 * tok_alias[] (token codes 1 ... 23); entry 0 is an empty placeholder.
 */
87 : static const char *const lex_descr[] = {
88 : "",
89 : "Word, all ASCII",
90 : "Word, all letters",
91 : "Word, letters and digits",
92 : "Email address",
93 : "URL",
94 : "Host",
95 : "Scientific notation",
96 : "Version number",
97 : "Hyphenated word part, letters and digits",
98 : "Hyphenated word part, all letters",
99 : "Hyphenated word part, all ASCII",
100 : "Space symbols",
101 : "XML tag",
102 : "Protocol head",
103 : "Hyphenated word, letters and digits",
104 : "Hyphenated word, all ASCII",
105 : "Hyphenated word, all letters",
106 : "URL path",
107 : "File or path name",
108 : "Decimal notation",
109 : "Signed integer",
110 : "Unsigned integer",
111 : "XML entity"
112 : };
113 :
114 :
115 : /* Parser states */
116 :
/*
 * States of the parser's state machine.  TPS_Base (value 0) is the state a
 * freshly initialized parser starts in.  TPS_Null is not a real state: it is
 * the last enumerator and so also equals the number of real states.
 * NOTE(review): the action tables also use TPS_Null as a "next state" value,
 * presumably meaning "stay in the current state" -- confirm in TParserGet.
 */
117 : typedef enum
118 : {
119 : TPS_Base = 0,
120 : TPS_InNumWord,
121 : TPS_InAsciiWord,
122 : TPS_InWord,
123 : TPS_InUnsignedInt,
124 : TPS_InSignedIntFirst,
125 : TPS_InSignedInt,
126 : TPS_InSpace,
127 : TPS_InUDecimalFirst,
128 : TPS_InUDecimal,
129 : TPS_InDecimalFirst,
130 : TPS_InDecimal,
131 : TPS_InVerVersion,
132 : TPS_InSVerVersion,
133 : TPS_InVersionFirst,
134 : TPS_InVersion,
135 : TPS_InMantissaFirst,
136 : TPS_InMantissaSign,
137 : TPS_InMantissa,
138 : TPS_InXMLEntityFirst,
139 : TPS_InXMLEntity,
140 : TPS_InXMLEntityNumFirst,
141 : TPS_InXMLEntityNum,
142 : TPS_InXMLEntityHexNumFirst,
143 : TPS_InXMLEntityHexNum,
144 : TPS_InXMLEntityEnd,
145 : TPS_InTagFirst,
146 : TPS_InXMLBegin,
147 : TPS_InTagCloseFirst,
148 : TPS_InTagName,
149 : TPS_InTagBeginEnd,
150 : TPS_InTag,
151 : TPS_InTagEscapeK,
152 : TPS_InTagEscapeKK,
153 : TPS_InTagBackSleshed,
154 : TPS_InTagEnd,
155 : TPS_InCommentFirst,
156 : TPS_InCommentLast,
157 : TPS_InComment,
158 : TPS_InCloseCommentFirst,
159 : TPS_InCloseCommentLast,
160 : TPS_InCommentEnd,
161 : TPS_InHostFirstDomain,
162 : TPS_InHostDomainSecond,
163 : TPS_InHostDomain,
164 : TPS_InPortFirst,
165 : TPS_InPort,
166 : TPS_InHostFirstAN,
167 : TPS_InHost,
168 : TPS_InEmail,
169 : TPS_InFileFirst,
170 : TPS_InFileTwiddle,
171 : TPS_InPathFirst,
172 : TPS_InPathFirstFirst,
173 : TPS_InPathSecond,
174 : TPS_InFile,
175 : TPS_InFileNext,
176 : TPS_InURLPathFirst,
177 : TPS_InURLPathStart,
178 : TPS_InURLPath,
179 : TPS_InFURL,
180 : TPS_InProtocolFirst,
181 : TPS_InProtocolSecond,
182 : TPS_InProtocolEnd,
183 : TPS_InHyphenAsciiWordFirst,
184 : TPS_InHyphenAsciiWord,
185 : TPS_InHyphenWordFirst,
186 : TPS_InHyphenWord,
187 : TPS_InHyphenNumWordFirst,
188 : TPS_InHyphenNumWord,
189 : TPS_InHyphenDigitLookahead,
190 : TPS_InParseHyphen,
191 : TPS_InParseHyphenHyphen,
192 : TPS_InHyphenWordPart,
193 : TPS_InHyphenAsciiWordPart,
194 : TPS_InHyphenNumWordPart,
195 : TPS_InHyphenUnsignedInt,
196 : TPS_Null /* last state (fake value) */
197 : } TParserState;
198 :
199 : /* forward declaration */
/* TParser is defined further down; the callback typedefs only need a pointer. */
200 : struct TParser;
201 :
/* Character test callback: receives the parser, returns nonzero on a match. */
202 : typedef int (*TParserCharTest) (struct TParser *); /* any p_is* functions
203 : * except p_iseq */
/* Side-effect callback attached to an action row; returns nothing. */
204 : typedef void (*TParserSpecial) (struct TParser *); /* special handler for
205 : * special cases... */
206 :
/*
 * One row of a state's action table.
 *
 *	isclass  test to apply to the current character; the tables end each
 *			 state's list with a row whose isclass is NULL
 *	c		 character argument for tests such as p_iseqC, which compare the
 *			 current character with TParser.c
 *			 (NOTE(review): presumably copied into TParser.c before isclass
 *			 is called -- confirm in TParserGet)
 *	flags	 bitwise OR of the A_* flags
 *	tostate  state named as the target of the row
 *	type	 token category code (ASCIIWORD etc.), or 0
 *	special  optional TParserSpecial handler, or NULL
 */
207 : typedef struct
208 : {
209 : TParserCharTest isclass;
210 : char c;
211 : uint16 flags;
212 : TParserState tostate;
213 : int type;
214 : TParserSpecial special;
215 : } TParserStateActionItem;
216 :
217 : /* Flag bits in TParserStateActionItem.flags */
/*
 * A_NEXT is zero, i.e. "no flag bits set"; every other flag is a distinct
 * bit, so flags can be OR'ed together.
 * NOTE(review): the meaning of each bit is implemented in TParserGet, which
 * is outside this part of the file -- consult it before relying on the names.
 */
218 : #define A_NEXT 0x0000
219 : #define A_BINGO 0x0001
220 : #define A_POP 0x0002
221 : #define A_PUSH 0x0004
222 : #define A_RERUN 0x0008
223 : #define A_CLEAR 0x0010
224 : #define A_MERGE 0x0020
225 : #define A_CLRALL 0x0040
226 :
/*
 * One saved parser position.  Positions form a singly linked stack through
 * "prev": newTParserPosition() chains a new entry onto an existing one, and
 * the close functions free the chain by following prev until it is NULL.
 */
227 : typedef struct TParserPosition
228 : {
229 : int posbyte; /* position of parser in bytes */
230 : int poschar; /* position of parser in characters */
231 : int charlen; /* length of current char */
232 : int lenbytetoken; /* length of token-so-far in bytes */
233 : int lenchartoken; /* and in chars */
/* state machine state at this position */
234 : TParserState state;
/* next older position on the stack, or NULL for the bottom entry */
235 : struct TParserPosition *prev;
/* reset to NULL by newTParserPosition(); set elsewhere in the file */
236 : const TParserStateActionItem *pushedAtAction;
237 : } TParserPosition;
238 :
/*
 * Complete parser state: the input string (plus an optional wide-character
 * copy), the stack of positions, a few mode flags, and the most recently
 * produced token.
 */
239 : typedef struct TParser
240 : {
241 : /* string and position information */
242 : char *str; /* multibyte string */
243 : int lenstr; /* length of mbstring */
244 : #ifdef USE_WIDE_UPPER_LOWER
/* at most one of wstr/pgwstr is filled in by TParserInit */
245 : wchar_t *wstr; /* wide character string */
246 : pg_wchar *pgwstr; /* wide character string for C-locale */
/* true when charmaxlen > 1, i.e. the wide copies are in use */
247 : bool usewide;
248 : #endif
249 :
250 : /* State of parse */
/* result of pg_database_encoding_max_length() */
251 : int charmaxlen;
/* top of the position stack */
252 : TParserPosition *state;
/* toggled by SpecialTags() on script/style tags; tested by p_isignore() */
253 : bool ignore;
/* one-shot request for a host token; consumed by p_isstophost() */
254 : bool wanthost;
255 :
256 : /* silly char */
/* character that p_iseqC()/p_isneC() compare the input against */
257 : char c;
258 :
259 : /* out */
260 : char *token;
261 : int lenbytetoken;
262 : int lenchartoken;
263 : int type;
264 : } TParser;
265 :
266 :
267 : /* forward decls here */
/*
 * Defined later in the file; declared here because p_ishost() and
 * p_isURLPath() call it on a temporary parser copy.
 */
268 : static bool TParserGet(TParser *prs);
269 :
270 :
271 : static TParserPosition *
272 1235 : newTParserPosition(TParserPosition *prev)
273 : {
274 1235 : TParserPosition *res = (TParserPosition *) palloc(sizeof(TParserPosition));
275 :
276 1235 : if (prev)
277 741 : memcpy(res, prev, sizeof(TParserPosition));
278 : else
279 494 : memset(res, 0, sizeof(TParserPosition));
280 :
281 1235 : res->prev = prev;
282 :
283 1235 : res->pushedAtAction = NULL;
284 :
285 1235 : return res;
286 : }
287 :
288 : static TParser *
289 454 : TParserInit(char *str, int len)
290 : {
291 454 : TParser *prs = (TParser *) palloc0(sizeof(TParser));
292 :
293 454 : prs->charmaxlen = pg_database_encoding_max_length();
294 454 : prs->str = str;
295 454 : prs->lenstr = len;
296 :
297 : #ifdef USE_WIDE_UPPER_LOWER
298 :
299 : /*
300 : * Use wide char code only when max encoding length > 1.
301 : */
302 454 : if (prs->charmaxlen > 1)
303 : {
304 454 : Oid collation = DEFAULT_COLLATION_OID; /* TODO */
305 454 : pg_locale_t mylocale = 0; /* TODO */
306 :
307 454 : prs->usewide = true;
308 454 : if (lc_ctype_is_c(collation))
309 : {
310 : /*
311 : * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
312 : * be different from sizeof(wchar_t)
313 : */
314 0 : prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
315 0 : pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
316 : }
317 : else
318 : {
319 454 : prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
320 454 : char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
321 : mylocale);
322 : }
323 : }
324 : else
325 0 : prs->usewide = false;
326 : #endif
327 :
328 454 : prs->state = newTParserPosition(NULL);
329 454 : prs->state->state = TPS_Base;
330 :
331 : #ifdef WPARSER_TRACE
332 :
333 : /*
334 : * Use of %.*s here is a bit risky since it can misbehave if the data is
335 : * not in what libc thinks is the prevailing encoding. However, since
336 : * this is just a debugging aid, we choose to live with that.
337 : */
338 : fprintf(stderr, "parsing \"%.*s\"\n", len, str);
339 : #endif
340 :
341 454 : return prs;
342 : }
343 :
344 : /*
345 : * As an alternative to a full TParserInit one can create a
346 : * TParserCopy which basically is a regular TParser without a private
347 : * copy of the string - instead it uses the one from another TParser.
348 : * This is useful because at some places TParsers are created
349 : * recursively and the repeated copying around of the strings can
350 : * cause major inefficiency if the source string is long.
351 : * The new parser starts parsing at the original's current position.
352 : *
353 : * Obviously one must not close the original TParser before the copy.
354 : */
355 : static TParser *
356 40 : TParserCopyInit(const TParser *orig)
357 : {
358 40 : TParser *prs = (TParser *) palloc0(sizeof(TParser));
359 :
360 40 : prs->charmaxlen = orig->charmaxlen;
361 40 : prs->str = orig->str + orig->state->posbyte;
362 40 : prs->lenstr = orig->lenstr - orig->state->posbyte;
363 :
364 : #ifdef USE_WIDE_UPPER_LOWER
365 40 : prs->usewide = orig->usewide;
366 :
367 40 : if (orig->pgwstr)
368 0 : prs->pgwstr = orig->pgwstr + orig->state->poschar;
369 40 : if (orig->wstr)
370 40 : prs->wstr = orig->wstr + orig->state->poschar;
371 : #endif
372 :
373 40 : prs->state = newTParserPosition(NULL);
374 40 : prs->state->state = TPS_Base;
375 :
376 : #ifdef WPARSER_TRACE
377 : /* See note above about %.*s */
378 : fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str);
379 : #endif
380 :
381 40 : return prs;
382 : }
383 :
384 :
385 : static void
386 454 : TParserClose(TParser *prs)
387 : {
388 1362 : while (prs->state)
389 : {
390 454 : TParserPosition *ptr = prs->state->prev;
391 :
392 454 : pfree(prs->state);
393 454 : prs->state = ptr;
394 : }
395 :
396 : #ifdef USE_WIDE_UPPER_LOWER
397 454 : if (prs->wstr)
398 454 : pfree(prs->wstr);
399 454 : if (prs->pgwstr)
400 0 : pfree(prs->pgwstr);
401 : #endif
402 :
403 : #ifdef WPARSER_TRACE
404 : fprintf(stderr, "closing parser\n");
405 : #endif
406 454 : pfree(prs);
407 454 : }
408 :
409 : /*
410 : * Close a parser created with TParserCopyInit
411 : */
412 : static void
413 40 : TParserCopyClose(TParser *prs)
414 : {
415 142 : while (prs->state)
416 : {
417 62 : TParserPosition *ptr = prs->state->prev;
418 :
419 62 : pfree(prs->state);
420 62 : prs->state = ptr;
421 : }
422 :
423 : #ifdef WPARSER_TRACE
424 : fprintf(stderr, "closing parser copy\n");
425 : #endif
426 40 : pfree(prs);
427 40 : }
428 :
429 :
430 : /*
431 : * Character-type support functions, equivalent to is* macros, but
432 : * working with any possible encodings and locales. Notes:
433 : * - with multibyte encoding and C-locale isw* function may fail
434 : * or give wrong result.
435 : * - multibyte encoding and C-locale often are used for
436 : * Asian languages.
437 : * - if locale is C then we use pgwstr instead of wstr.
438 : */
439 :
440 : #ifdef USE_WIDE_UPPER_LOWER
441 :
/*
 * p_iswhat(type) defines two tests on the parser's current character:
 * p_is<type>() and its negation p_isnot<type>().
 *
 * Without wide characters the current byte is passed to is<type>().  With
 * wide characters, the C-locale buffer (pgwstr) is used if present: codes
 * above 0x7f never match and the rest go to is<type>(); otherwise the
 * wchar_t buffer is tested with isw<type>().
 */
#define p_iswhat(type)	\
static int	\
p_is##type(TParser *prs)	\
{	\
	Assert(prs->state);	\
	if (!prs->usewide)	\
		return is##type(*(unsigned char *) (prs->str + prs->state->posbyte));	\
	if (prs->pgwstr)	\
	{	\
		unsigned int wc = *(prs->pgwstr + prs->state->poschar);	\
		return (wc > 0x7f) ? 0 : is##type(wc);	\
	}	\
	return isw##type(*(prs->wstr + prs->state->poschar));	\
}	\
	\
static int	\
p_isnot##type(TParser *prs)	\
{	\
	return !p_is##type(prs);	\
}
465 :
466 : static int
467 1562 : p_isalnum(TParser *prs)
468 : {
469 1562 : Assert(prs->state);
470 :
471 1562 : if (prs->usewide)
472 : {
473 1562 : if (prs->pgwstr)
474 : {
475 0 : unsigned int c = *(prs->pgwstr + prs->state->poschar);
476 :
477 : /*
478 : * any non-ascii symbol with multibyte encoding with C-locale is
479 : * an alpha character
480 : */
481 0 : if (c > 0x7f)
482 0 : return 1;
483 :
484 0 : return isalnum(c);
485 : }
486 :
487 1562 : return iswalnum(*(prs->wstr + prs->state->poschar));
488 : }
489 :
490 0 : return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte));
491 : }
492 : static int
493 1440 : p_isnotalnum(TParser *prs)
494 : {
495 1440 : return !p_isalnum(prs);
496 : }
497 :
498 : static int
499 11347 : p_isalpha(TParser *prs)
500 : {
501 11347 : Assert(prs->state);
502 :
503 11347 : if (prs->usewide)
504 : {
505 11347 : if (prs->pgwstr)
506 : {
507 0 : unsigned int c = *(prs->pgwstr + prs->state->poschar);
508 :
509 : /*
510 : * any non-ascii symbol with multibyte encoding with C-locale is
511 : * an alpha character
512 : */
513 0 : if (c > 0x7f)
514 0 : return 1;
515 :
516 0 : return isalpha(c);
517 : }
518 :
519 11347 : return iswalpha(*(prs->wstr + prs->state->poschar));
520 : }
521 :
522 0 : return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte));
523 : }
524 :
525 : static int
526 0 : p_isnotalpha(TParser *prs)
527 : {
528 0 : return !p_isalpha(prs);
529 : }
530 :
531 : /* p_iseq should be used only for ascii symbols */
532 :
533 : static int
534 28051 : p_iseq(TParser *prs, char c)
535 : {
536 28051 : Assert(prs->state);
537 28051 : return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
538 : }
539 : #else /* USE_WIDE_UPPER_LOWER */
540 :
/*
 * Build without wide-character support: p_iswhat(type) defines p_is<type>(),
 * which passes the current byte to is<type>(), and its negation
 * p_isnot<type>().
 */
#define p_iswhat(type)	\
static int	\
p_is##type(TParser *prs)	\
{	\
	Assert(prs->state);	\
	return is##type((unsigned char) *(prs->str + prs->state->posbyte));	\
}	\
	\
static int	\
p_isnot##type(TParser *prs)	\
{	\
	return !p_is##type(prs);	\
}
552 :
553 :
554 : static int
555 : p_iseq(TParser *prs, char c)
556 : {
557 : Assert(prs->state);
558 : return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
559 : }
560 :
/*
 * Without wide-character support p_isalnum/p_isalpha (and their negations)
 * come from the generic macro; the wide-character build defines them by hand
 * above.
 */
561 : p_iswhat(alnum)
562 : p_iswhat(alpha)
563 : #endif /* USE_WIDE_UPPER_LOWER */
564 :
/*
 * Instantiate p_is<type>() and p_isnot<type>() for the remaining character
 * classes, using whichever p_iswhat() definition is active for this build.
 */
565 4586 : p_iswhat(digit)
566 0 : p_iswhat(lower)
567 0 : p_iswhat(print)
568 0 : p_iswhat(punct)
569 113 : p_iswhat(space)
570 0 : p_iswhat(upper)
571 3 : p_iswhat(xdigit)
572 :
573 : static int
574 12212 : p_isEOF(TParser *prs)
575 : {
576 12212 : Assert(prs->state);
577 12212 : return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
578 : }
579 :
580 : static int
581 28051 : p_iseqC(TParser *prs)
582 : {
583 28051 : return p_iseq(prs, prs->c);
584 : }
585 :
586 : static int
587 0 : p_isneC(TParser *prs)
588 : {
589 0 : return !p_iseq(prs, prs->c);
590 : }
591 :
592 : static int
593 8889 : p_isascii(TParser *prs)
594 : {
595 8889 : return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
596 : }
597 :
598 : static int
599 8889 : p_isasclet(TParser *prs)
600 : {
601 8889 : return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
602 : }
603 :
604 : static int
605 443 : p_isurlchar(TParser *prs)
606 : {
607 : char ch;
608 :
609 : /* no non-ASCII need apply */
610 443 : if (prs->state->charlen != 1)
611 0 : return 0;
612 443 : ch = *(prs->str + prs->state->posbyte);
613 : /* no spaces or control characters */
614 443 : if (ch <= 0x20 || ch >= 0x7F)
615 39 : return 0;
616 : /* reject characters disallowed by RFC 3986 */
617 404 : switch (ch)
618 : {
619 : case '"':
620 : case '<':
621 : case '>':
622 : case '\\':
623 : case '^':
624 : case '`':
625 : case '{':
626 : case '|':
627 : case '}':
628 4 : return 0;
629 : }
630 400 : return 1;
631 : }
632 :
633 :
634 : /* deliberately suppress unused-function complaints for the above */
/*
 * Dummy function whose only job is to reference every generated p_is*()
 * helper so that none of them is reported as unused.  It has external
 * linkage (hence the separate prototype).  Each call passes NULL, which the
 * helpers dereference, so this function must never actually be called.
 */
635 : void _make_compiler_happy(void);
636 : void
637 0 : _make_compiler_happy(void)
638 : {
639 0 : p_isalnum(NULL);
640 0 : p_isnotalnum(NULL);
641 0 : p_isalpha(NULL);
642 0 : p_isnotalpha(NULL);
643 0 : p_isdigit(NULL);
644 0 : p_isnotdigit(NULL);
645 0 : p_islower(NULL);
646 0 : p_isnotlower(NULL);
647 0 : p_isprint(NULL);
648 0 : p_isnotprint(NULL);
649 0 : p_ispunct(NULL);
650 0 : p_isnotpunct(NULL);
651 0 : p_isspace(NULL);
652 0 : p_isnotspace(NULL);
653 0 : p_isupper(NULL);
654 0 : p_isnotupper(NULL);
655 0 : p_isxdigit(NULL);
656 0 : p_isnotxdigit(NULL);
657 0 : p_isEOF(NULL);
658 0 : p_iseqC(NULL);
659 0 : p_isneC(NULL);
660 0 : }
661 :
662 :
663 : static void
664 42 : SpecialTags(TParser *prs)
665 : {
666 42 : switch (prs->state->lenchartoken)
667 : {
668 : case 8: /* </script */
669 1 : if (pg_strncasecmp(prs->token, "</script", 8) == 0)
670 1 : prs->ignore = false;
671 1 : break;
672 : case 7: /* <script || </style */
673 4 : if (pg_strncasecmp(prs->token, "</style", 7) == 0)
674 0 : prs->ignore = false;
675 4 : else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
676 1 : prs->ignore = true;
677 4 : break;
678 : case 6: /* <style */
679 3 : if (pg_strncasecmp(prs->token, "<style", 6) == 0)
680 0 : prs->ignore = true;
681 3 : break;
682 : default:
683 34 : break;
684 : }
685 42 : }
686 :
687 : static void
688 22 : SpecialFURL(TParser *prs)
689 : {
690 22 : prs->wanthost = true;
691 22 : prs->state->posbyte -= prs->state->lenbytetoken;
692 22 : prs->state->poschar -= prs->state->lenchartoken;
693 22 : }
694 :
695 : static void
696 4 : SpecialHyphen(TParser *prs)
697 : {
698 4 : prs->state->posbyte -= prs->state->lenbytetoken;
699 4 : prs->state->poschar -= prs->state->lenchartoken;
700 4 : }
701 :
702 : static void
703 0 : SpecialVerVersion(TParser *prs)
704 : {
705 0 : prs->state->posbyte -= prs->state->lenbytetoken;
706 0 : prs->state->poschar -= prs->state->lenchartoken;
707 0 : prs->state->lenbytetoken = 0;
708 0 : prs->state->lenchartoken = 0;
709 0 : }
710 :
711 : static int
712 80 : p_isstophost(TParser *prs)
713 : {
714 80 : if (prs->wanthost)
715 : {
716 34 : prs->wanthost = false;
717 34 : return 1;
718 : }
719 46 : return 0;
720 : }
721 :
722 : static int
723 4221 : p_isignore(TParser *prs)
724 : {
725 4221 : return (prs->ignore) ? 1 : 0;
726 : }
727 :
728 : static int
729 15 : p_ishost(TParser *prs)
730 : {
731 15 : TParser *tmpprs = TParserCopyInit(prs);
732 15 : int res = 0;
733 :
734 15 : tmpprs->wanthost = true;
735 :
736 15 : if (TParserGet(tmpprs) && tmpprs->type == HOST)
737 : {
738 12 : prs->state->posbyte += tmpprs->lenbytetoken;
739 12 : prs->state->poschar += tmpprs->lenchartoken;
740 12 : prs->state->lenbytetoken += tmpprs->lenbytetoken;
741 12 : prs->state->lenchartoken += tmpprs->lenchartoken;
742 12 : prs->state->charlen = tmpprs->state->charlen;
743 12 : res = 1;
744 : }
745 15 : TParserCopyClose(tmpprs);
746 :
747 15 : return res;
748 : }
749 :
750 : static int
751 25 : p_isURLPath(TParser *prs)
752 : {
753 25 : TParser *tmpprs = TParserCopyInit(prs);
754 25 : int res = 0;
755 :
756 25 : tmpprs->state = newTParserPosition(tmpprs->state);
757 25 : tmpprs->state->state = TPS_InURLPathFirst;
758 :
759 25 : if (TParserGet(tmpprs) && tmpprs->type == URLPATH)
760 : {
761 22 : prs->state->posbyte += tmpprs->lenbytetoken;
762 22 : prs->state->poschar += tmpprs->lenchartoken;
763 22 : prs->state->lenbytetoken += tmpprs->lenbytetoken;
764 22 : prs->state->lenchartoken += tmpprs->lenchartoken;
765 22 : prs->state->charlen = tmpprs->state->charlen;
766 22 : res = 1;
767 : }
768 25 : TParserCopyClose(tmpprs);
769 :
770 25 : return res;
771 : }
772 :
773 : /*
774 : * returns true if current character has zero display length or
775 : * it's a special sign in several languages. Such characters
776 : * aren't a word-breaker although they aren't an isalpha.
777 : * In beginning of word they aren't a part of it.
778 : */
779 : static int
780 1027 : p_isspecial(TParser *prs)
781 : {
782 : /*
783 : * pg_dsplen could return -1 which means error or control character
784 : */
785 1027 : if (pg_dsplen(prs->str + prs->state->posbyte) == 0)
786 0 : return 1;
787 :
788 : #ifdef USE_WIDE_UPPER_LOWER
789 :
790 : /*
791 : * Unicode Characters in the 'Mark, Spacing Combining' Category That
792 : * characters are not alpha although they are not breakers of word too.
793 : * Check that only in utf encoding, because other encodings aren't
794 : * supported by postgres or even exists.
795 : */
796 1027 : if (GetDatabaseEncoding() == PG_UTF8 && prs->usewide)
797 : {
798 : static const pg_wchar strange_letter[] = {
799 : /*
800 : * use binary search, so elements should be ordered
801 : */
802 : 0x0903, /* DEVANAGARI SIGN VISARGA */
803 : 0x093E, /* DEVANAGARI VOWEL SIGN AA */
804 : 0x093F, /* DEVANAGARI VOWEL SIGN I */
805 : 0x0940, /* DEVANAGARI VOWEL SIGN II */
806 : 0x0949, /* DEVANAGARI VOWEL SIGN CANDRA O */
807 : 0x094A, /* DEVANAGARI VOWEL SIGN SHORT O */
808 : 0x094B, /* DEVANAGARI VOWEL SIGN O */
809 : 0x094C, /* DEVANAGARI VOWEL SIGN AU */
810 : 0x0982, /* BENGALI SIGN ANUSVARA */
811 : 0x0983, /* BENGALI SIGN VISARGA */
812 : 0x09BE, /* BENGALI VOWEL SIGN AA */
813 : 0x09BF, /* BENGALI VOWEL SIGN I */
814 : 0x09C0, /* BENGALI VOWEL SIGN II */
815 : 0x09C7, /* BENGALI VOWEL SIGN E */
816 : 0x09C8, /* BENGALI VOWEL SIGN AI */
817 : 0x09CB, /* BENGALI VOWEL SIGN O */
818 : 0x09CC, /* BENGALI VOWEL SIGN AU */
819 : 0x09D7, /* BENGALI AU LENGTH MARK */
820 : 0x0A03, /* GURMUKHI SIGN VISARGA */
821 : 0x0A3E, /* GURMUKHI VOWEL SIGN AA */
822 : 0x0A3F, /* GURMUKHI VOWEL SIGN I */
823 : 0x0A40, /* GURMUKHI VOWEL SIGN II */
824 : 0x0A83, /* GUJARATI SIGN VISARGA */
825 : 0x0ABE, /* GUJARATI VOWEL SIGN AA */
826 : 0x0ABF, /* GUJARATI VOWEL SIGN I */
827 : 0x0AC0, /* GUJARATI VOWEL SIGN II */
828 : 0x0AC9, /* GUJARATI VOWEL SIGN CANDRA O */
829 : 0x0ACB, /* GUJARATI VOWEL SIGN O */
830 : 0x0ACC, /* GUJARATI VOWEL SIGN AU */
831 : 0x0B02, /* ORIYA SIGN ANUSVARA */
832 : 0x0B03, /* ORIYA SIGN VISARGA */
833 : 0x0B3E, /* ORIYA VOWEL SIGN AA */
834 : 0x0B40, /* ORIYA VOWEL SIGN II */
835 : 0x0B47, /* ORIYA VOWEL SIGN E */
836 : 0x0B48, /* ORIYA VOWEL SIGN AI */
837 : 0x0B4B, /* ORIYA VOWEL SIGN O */
838 : 0x0B4C, /* ORIYA VOWEL SIGN AU */
839 : 0x0B57, /* ORIYA AU LENGTH MARK */
840 : 0x0BBE, /* TAMIL VOWEL SIGN AA */
841 : 0x0BBF, /* TAMIL VOWEL SIGN I */
842 : 0x0BC1, /* TAMIL VOWEL SIGN U */
843 : 0x0BC2, /* TAMIL VOWEL SIGN UU */
844 : 0x0BC6, /* TAMIL VOWEL SIGN E */
845 : 0x0BC7, /* TAMIL VOWEL SIGN EE */
846 : 0x0BC8, /* TAMIL VOWEL SIGN AI */
847 : 0x0BCA, /* TAMIL VOWEL SIGN O */
848 : 0x0BCB, /* TAMIL VOWEL SIGN OO */
849 : 0x0BCC, /* TAMIL VOWEL SIGN AU */
850 : 0x0BD7, /* TAMIL AU LENGTH MARK */
851 : 0x0C01, /* TELUGU SIGN CANDRABINDU */
852 : 0x0C02, /* TELUGU SIGN ANUSVARA */
853 : 0x0C03, /* TELUGU SIGN VISARGA */
854 : 0x0C41, /* TELUGU VOWEL SIGN U */
855 : 0x0C42, /* TELUGU VOWEL SIGN UU */
856 : 0x0C43, /* TELUGU VOWEL SIGN VOCALIC R */
857 : 0x0C44, /* TELUGU VOWEL SIGN VOCALIC RR */
858 : 0x0C82, /* KANNADA SIGN ANUSVARA */
859 : 0x0C83, /* KANNADA SIGN VISARGA */
860 : 0x0CBE, /* KANNADA VOWEL SIGN AA */
861 : 0x0CC0, /* KANNADA VOWEL SIGN II */
862 : 0x0CC1, /* KANNADA VOWEL SIGN U */
863 : 0x0CC2, /* KANNADA VOWEL SIGN UU */
864 : 0x0CC3, /* KANNADA VOWEL SIGN VOCALIC R */
865 : 0x0CC4, /* KANNADA VOWEL SIGN VOCALIC RR */
866 : 0x0CC7, /* KANNADA VOWEL SIGN EE */
867 : 0x0CC8, /* KANNADA VOWEL SIGN AI */
868 : 0x0CCA, /* KANNADA VOWEL SIGN O */
869 : 0x0CCB, /* KANNADA VOWEL SIGN OO */
870 : 0x0CD5, /* KANNADA LENGTH MARK */
871 : 0x0CD6, /* KANNADA AI LENGTH MARK */
872 : 0x0D02, /* MALAYALAM SIGN ANUSVARA */
873 : 0x0D03, /* MALAYALAM SIGN VISARGA */
874 : 0x0D3E, /* MALAYALAM VOWEL SIGN AA */
875 : 0x0D3F, /* MALAYALAM VOWEL SIGN I */
876 : 0x0D40, /* MALAYALAM VOWEL SIGN II */
877 : 0x0D46, /* MALAYALAM VOWEL SIGN E */
878 : 0x0D47, /* MALAYALAM VOWEL SIGN EE */
879 : 0x0D48, /* MALAYALAM VOWEL SIGN AI */
880 : 0x0D4A, /* MALAYALAM VOWEL SIGN O */
881 : 0x0D4B, /* MALAYALAM VOWEL SIGN OO */
882 : 0x0D4C, /* MALAYALAM VOWEL SIGN AU */
883 : 0x0D57, /* MALAYALAM AU LENGTH MARK */
884 : 0x0D82, /* SINHALA SIGN ANUSVARAYA */
885 : 0x0D83, /* SINHALA SIGN VISARGAYA */
886 : 0x0DCF, /* SINHALA VOWEL SIGN AELA-PILLA */
887 : 0x0DD0, /* SINHALA VOWEL SIGN KETTI AEDA-PILLA */
888 : 0x0DD1, /* SINHALA VOWEL SIGN DIGA AEDA-PILLA */
889 : 0x0DD8, /* SINHALA VOWEL SIGN GAETTA-PILLA */
890 : 0x0DD9, /* SINHALA VOWEL SIGN KOMBUVA */
891 : 0x0DDA, /* SINHALA VOWEL SIGN DIGA KOMBUVA */
892 : 0x0DDB, /* SINHALA VOWEL SIGN KOMBU DEKA */
893 : 0x0DDC, /* SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA */
894 : 0x0DDD, /* SINHALA VOWEL SIGN KOMBUVA HAA DIGA
895 : * AELA-PILLA */
896 : 0x0DDE, /* SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA */
897 : 0x0DDF, /* SINHALA VOWEL SIGN GAYANUKITTA */
898 : 0x0DF2, /* SINHALA VOWEL SIGN DIGA GAETTA-PILLA */
899 : 0x0DF3, /* SINHALA VOWEL SIGN DIGA GAYANUKITTA */
900 : 0x0F3E, /* TIBETAN SIGN YAR TSHES */
901 : 0x0F3F, /* TIBETAN SIGN MAR TSHES */
902 : 0x0F7F, /* TIBETAN SIGN RNAM BCAD */
903 : 0x102B, /* MYANMAR VOWEL SIGN TALL AA */
904 : 0x102C, /* MYANMAR VOWEL SIGN AA */
905 : 0x1031, /* MYANMAR VOWEL SIGN E */
906 : 0x1038, /* MYANMAR SIGN VISARGA */
907 : 0x103B, /* MYANMAR CONSONANT SIGN MEDIAL YA */
908 : 0x103C, /* MYANMAR CONSONANT SIGN MEDIAL RA */
909 : 0x1056, /* MYANMAR VOWEL SIGN VOCALIC R */
910 : 0x1057, /* MYANMAR VOWEL SIGN VOCALIC RR */
911 : 0x1062, /* MYANMAR VOWEL SIGN SGAW KAREN EU */
912 : 0x1063, /* MYANMAR TONE MARK SGAW KAREN HATHI */
913 : 0x1064, /* MYANMAR TONE MARK SGAW KAREN KE PHO */
914 : 0x1067, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN EU */
915 : 0x1068, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN UE */
916 : 0x1069, /* MYANMAR SIGN WESTERN PWO KAREN TONE-1 */
917 : 0x106A, /* MYANMAR SIGN WESTERN PWO KAREN TONE-2 */
918 : 0x106B, /* MYANMAR SIGN WESTERN PWO KAREN TONE-3 */
919 : 0x106C, /* MYANMAR SIGN WESTERN PWO KAREN TONE-4 */
920 : 0x106D, /* MYANMAR SIGN WESTERN PWO KAREN TONE-5 */
921 : 0x1083, /* MYANMAR VOWEL SIGN SHAN AA */
922 : 0x1084, /* MYANMAR VOWEL SIGN SHAN E */
923 : 0x1087, /* MYANMAR SIGN SHAN TONE-2 */
924 : 0x1088, /* MYANMAR SIGN SHAN TONE-3 */
925 : 0x1089, /* MYANMAR SIGN SHAN TONE-5 */
926 : 0x108A, /* MYANMAR SIGN SHAN TONE-6 */
927 : 0x108B, /* MYANMAR SIGN SHAN COUNCIL TONE-2 */
928 : 0x108C, /* MYANMAR SIGN SHAN COUNCIL TONE-3 */
929 : 0x108F, /* MYANMAR SIGN RUMAI PALAUNG TONE-5 */
930 : 0x17B6, /* KHMER VOWEL SIGN AA */
931 : 0x17BE, /* KHMER VOWEL SIGN OE */
932 : 0x17BF, /* KHMER VOWEL SIGN YA */
933 : 0x17C0, /* KHMER VOWEL SIGN IE */
934 : 0x17C1, /* KHMER VOWEL SIGN E */
935 : 0x17C2, /* KHMER VOWEL SIGN AE */
936 : 0x17C3, /* KHMER VOWEL SIGN AI */
937 : 0x17C4, /* KHMER VOWEL SIGN OO */
938 : 0x17C5, /* KHMER VOWEL SIGN AU */
939 : 0x17C7, /* KHMER SIGN REAHMUK */
940 : 0x17C8, /* KHMER SIGN YUUKALEAPINTU */
941 : 0x1923, /* LIMBU VOWEL SIGN EE */
942 : 0x1924, /* LIMBU VOWEL SIGN AI */
943 : 0x1925, /* LIMBU VOWEL SIGN OO */
944 : 0x1926, /* LIMBU VOWEL SIGN AU */
945 : 0x1929, /* LIMBU SUBJOINED LETTER YA */
946 : 0x192A, /* LIMBU SUBJOINED LETTER RA */
947 : 0x192B, /* LIMBU SUBJOINED LETTER WA */
948 : 0x1930, /* LIMBU SMALL LETTER KA */
949 : 0x1931, /* LIMBU SMALL LETTER NGA */
950 : 0x1933, /* LIMBU SMALL LETTER TA */
951 : 0x1934, /* LIMBU SMALL LETTER NA */
952 : 0x1935, /* LIMBU SMALL LETTER PA */
953 : 0x1936, /* LIMBU SMALL LETTER MA */
954 : 0x1937, /* LIMBU SMALL LETTER RA */
955 : 0x1938, /* LIMBU SMALL LETTER LA */
956 : 0x19B0, /* NEW TAI LUE VOWEL SIGN VOWEL SHORTENER */
957 : 0x19B1, /* NEW TAI LUE VOWEL SIGN AA */
958 : 0x19B2, /* NEW TAI LUE VOWEL SIGN II */
959 : 0x19B3, /* NEW TAI LUE VOWEL SIGN U */
960 : 0x19B4, /* NEW TAI LUE VOWEL SIGN UU */
961 : 0x19B5, /* NEW TAI LUE VOWEL SIGN E */
962 : 0x19B6, /* NEW TAI LUE VOWEL SIGN AE */
963 : 0x19B7, /* NEW TAI LUE VOWEL SIGN O */
964 : 0x19B8, /* NEW TAI LUE VOWEL SIGN OA */
965 : 0x19B9, /* NEW TAI LUE VOWEL SIGN UE */
966 : 0x19BA, /* NEW TAI LUE VOWEL SIGN AY */
967 : 0x19BB, /* NEW TAI LUE VOWEL SIGN AAY */
968 : 0x19BC, /* NEW TAI LUE VOWEL SIGN UY */
969 : 0x19BD, /* NEW TAI LUE VOWEL SIGN OY */
970 : 0x19BE, /* NEW TAI LUE VOWEL SIGN OAY */
971 : 0x19BF, /* NEW TAI LUE VOWEL SIGN UEY */
972 : 0x19C0, /* NEW TAI LUE VOWEL SIGN IY */
973 : 0x19C8, /* NEW TAI LUE TONE MARK-1 */
974 : 0x19C9, /* NEW TAI LUE TONE MARK-2 */
975 : 0x1A19, /* BUGINESE VOWEL SIGN E */
976 : 0x1A1A, /* BUGINESE VOWEL SIGN O */
977 : 0x1A1B, /* BUGINESE VOWEL SIGN AE */
978 : 0x1B04, /* BALINESE SIGN BISAH */
979 : 0x1B35, /* BALINESE VOWEL SIGN TEDUNG */
980 : 0x1B3B, /* BALINESE VOWEL SIGN RA REPA TEDUNG */
981 : 0x1B3D, /* BALINESE VOWEL SIGN LA LENGA TEDUNG */
982 : 0x1B3E, /* BALINESE VOWEL SIGN TALING */
983 : 0x1B3F, /* BALINESE VOWEL SIGN TALING REPA */
984 : 0x1B40, /* BALINESE VOWEL SIGN TALING TEDUNG */
985 : 0x1B41, /* BALINESE VOWEL SIGN TALING REPA TEDUNG */
986 : 0x1B43, /* BALINESE VOWEL SIGN PEPET TEDUNG */
987 : 0x1B44, /* BALINESE ADEG ADEG */
988 : 0x1B82, /* SUNDANESE SIGN PANGWISAD */
989 : 0x1BA1, /* SUNDANESE CONSONANT SIGN PAMINGKAL */
990 : 0x1BA6, /* SUNDANESE VOWEL SIGN PANAELAENG */
991 : 0x1BA7, /* SUNDANESE VOWEL SIGN PANOLONG */
992 : 0x1BAA, /* SUNDANESE SIGN PAMAAEH */
993 : 0x1C24, /* LEPCHA SUBJOINED LETTER YA */
994 : 0x1C25, /* LEPCHA SUBJOINED LETTER RA */
995 : 0x1C26, /* LEPCHA VOWEL SIGN AA */
996 : 0x1C27, /* LEPCHA VOWEL SIGN I */
997 : 0x1C28, /* LEPCHA VOWEL SIGN O */
998 : 0x1C29, /* LEPCHA VOWEL SIGN OO */
999 : 0x1C2A, /* LEPCHA VOWEL SIGN U */
1000 : 0x1C2B, /* LEPCHA VOWEL SIGN UU */
1001 : 0x1C34, /* LEPCHA CONSONANT SIGN NYIN-DO */
1002 : 0x1C35, /* LEPCHA CONSONANT SIGN KANG */
1003 : 0xA823, /* SYLOTI NAGRI VOWEL SIGN A */
1004 : 0xA824, /* SYLOTI NAGRI VOWEL SIGN I */
1005 : 0xA827, /* SYLOTI NAGRI VOWEL SIGN OO */
1006 : 0xA880, /* SAURASHTRA SIGN ANUSVARA */
1007 : 0xA881, /* SAURASHTRA SIGN VISARGA */
1008 : 0xA8B4, /* SAURASHTRA CONSONANT SIGN HAARU */
1009 : 0xA8B5, /* SAURASHTRA VOWEL SIGN AA */
1010 : 0xA8B6, /* SAURASHTRA VOWEL SIGN I */
1011 : 0xA8B7, /* SAURASHTRA VOWEL SIGN II */
1012 : 0xA8B8, /* SAURASHTRA VOWEL SIGN U */
1013 : 0xA8B9, /* SAURASHTRA VOWEL SIGN UU */
1014 : 0xA8BA, /* SAURASHTRA VOWEL SIGN VOCALIC R */
1015 : 0xA8BB, /* SAURASHTRA VOWEL SIGN VOCALIC RR */
1016 : 0xA8BC, /* SAURASHTRA VOWEL SIGN VOCALIC L */
1017 : 0xA8BD, /* SAURASHTRA VOWEL SIGN VOCALIC LL */
1018 : 0xA8BE, /* SAURASHTRA VOWEL SIGN E */
1019 : 0xA8BF, /* SAURASHTRA VOWEL SIGN EE */
1020 : 0xA8C0, /* SAURASHTRA VOWEL SIGN AI */
1021 : 0xA8C1, /* SAURASHTRA VOWEL SIGN O */
1022 : 0xA8C2, /* SAURASHTRA VOWEL SIGN OO */
1023 : 0xA8C3, /* SAURASHTRA VOWEL SIGN AU */
1024 : 0xA952, /* REJANG CONSONANT SIGN H */
1025 : 0xA953, /* REJANG VIRAMA */
1026 : 0xAA2F, /* CHAM VOWEL SIGN O */
1027 : 0xAA30, /* CHAM VOWEL SIGN AI */
1028 : 0xAA33, /* CHAM CONSONANT SIGN YA */
1029 : 0xAA34, /* CHAM CONSONANT SIGN RA */
1030 : 0xAA4D /* CHAM CONSONANT SIGN FINAL H */
1031 : };
1032 1027 : const pg_wchar *StopLow = strange_letter,
1033 1027 : *StopHigh = strange_letter + lengthof(strange_letter),
1034 : *StopMiddle;
1035 : pg_wchar c;
1036 :
1037 1027 : if (prs->pgwstr)
1038 0 : c = *(prs->pgwstr + prs->state->poschar);
1039 : else
1040 1027 : c = (pg_wchar) *(prs->wstr + prs->state->poschar);
1041 :
1042 10270 : while (StopLow < StopHigh)
1043 : {
1044 8216 : StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
1045 8216 : if (*StopMiddle == c)
1046 0 : return 1;
1047 8216 : else if (*StopMiddle < c)
1048 0 : StopLow = StopMiddle + 1;
1049 : else
1050 8216 : StopHigh = StopMiddle;
1051 : }
1052 : }
1053 : #endif
1054 :
1055 1027 : return 0;
1056 : }
1057 :
1058 : /*
1059 : * Table of state/action of parser
1060 : */
1061 :
/*
 * Actions for state TPS_Base.  Each row is
 *	{test, character for p_iseqC, A_* flags, target state, token type,
 *	 special handler}
 * and the list ends with a row whose test is NULL.
 * NOTE(review): rows are presumably tried top to bottom with the NULL-test
 * row acting as the default -- confirm in TParserGet.  Row order matters
 * either way (e.g. p_isasclet is listed before the broader p_isalpha).
 */
1062 : static const TParserStateActionItem actionTPS_Base[] = {
1063 : {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
1064 : {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
1065 : {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
1066 : {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
1067 : {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
1068 : {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
1069 : {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
1070 : {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
1071 : {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
1072 : {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1073 : {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1074 : {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
1075 : {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
1076 : };
1077 :
1078 :
/*
 * Rows for state TPS_InNumWord.  p_isalnum and p_isspecial loop on this
 * state; '@', '/', '.' and '-' use A_PUSH towards TPS_InEmail,
 * TPS_InFileFirst, TPS_InFileNext and TPS_InHyphenNumWordFirst.  The EOF
 * row and the final NULL-test row both carry A_BINGO with token type
 * NUMWORD and target TPS_Base.
 */
1079 : static const TParserStateActionItem actionTPS_InNumWord[] = {
1080 : {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
1081 : {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1082 : {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1083 : {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1084 : {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1085 : {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1086 : {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
1087 : {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
1088 : };
1089 :
/*
 * Rows for state TPS_InAsciiWord.  '.', '-' and p_isdigit each appear
 * twice with different targets; the first row of each pair uses A_PUSH.
 * NOTE(review): presumably the later row of a pair is reached only when
 * the pushed attempt is abandoned -- confirm in TParserGet; do not merge
 * or reorder these rows.
 * The EOF row and the final NULL-test row carry A_BINGO with token type
 * ASCIIWORD and target TPS_Base.
 */
1090 : static const TParserStateActionItem actionTPS_InAsciiWord[] = {
1091 : {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
1092 : {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
1093 : {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1094 : {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1095 : {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1096 : {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1097 : {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1098 : {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1099 : {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
1100 : {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1101 : {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1102 : {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1103 : {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
1104 : {p_isspecial, 0, A_NEXT, TPS_InWord, 0, NULL},
1105 : {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
1106 : };
1107 :
/*
 * Rows for state TPS_InWord.  p_isalpha and p_isspecial use A_NEXT with
 * target TPS_Null (presumably "stay in this state" -- TODO confirm);
 * a digit goes to TPS_InNumWord; '-' uses A_PUSH towards
 * TPS_InHyphenWordFirst.  EOF and the final NULL-test row carry A_BINGO
 * with token type WORD_T and target TPS_Base.
 */
1108 : static const TParserStateActionItem actionTPS_InWord[] = {
1109 : {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
1110 : {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
1111 : {p_isspecial, 0, A_NEXT, TPS_Null, 0, NULL},
1112 : {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1113 : {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1114 : {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
1115 : };
1116 :
/*
 * Rows for state TPS_InUnsignedInt.  '.' appears twice (first towards
 * TPS_InHostFirstDomain, then TPS_InUDecimalFirst), both with A_PUSH.
 * 'e'/'E' use A_PUSH towards TPS_InMantissaFirst and are listed before the
 * generic p_isasclet row.  EOF and the final NULL-test row carry A_BINGO
 * with token type UNSIGNEDINT and target TPS_Base.
 * NOTE(review): ordering of the duplicated and overlapping tests looks
 * deliberate -- confirm in TParserGet before changing it.
 */
1117 : static const TParserStateActionItem actionTPS_InUnsignedInt[] = {
1118 : {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
1119 : {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1120 : {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1121 : {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
1122 : {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1123 : {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1124 : {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1125 : {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1126 : {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1127 : {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
1128 : {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1129 : {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1130 : {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1131 : {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
1132 : };
1133 :
/*
 * Rows for state TPS_InSignedIntFirst (entered from TPS_Base on '-' or
 * '+').  A digit uses A_NEXT | A_CLEAR towards TPS_InSignedInt; EOF and
 * the final NULL-test row use A_POP.
 */
1134 : static const TParserStateActionItem actionTPS_InSignedIntFirst[] = {
1135 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1136 : {p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
1137 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1138 : };
1139 :
/*
 * Rows for state TPS_InSignedInt.  Digits use A_NEXT with target
 * TPS_Null; '.' uses A_PUSH towards TPS_InDecimalFirst; 'e'/'E' use A_PUSH
 * towards TPS_InMantissaFirst.  EOF and the final NULL-test row carry
 * A_BINGO with token type SIGNEDINT and target TPS_Base.
 */
1140 : static const TParserStateActionItem actionTPS_InSignedInt[] = {
1141 : {p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
1142 : {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1143 : {p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
1144 : {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1145 : {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1146 : {NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
1147 : };
1148 :
/*
 * Rows for state TPS_InSpace.  EOF, '<', '-', '+', '&', '/' and the final
 * NULL-test row all carry A_BINGO with token type SPACE and target
 * TPS_Base.  p_isignore uses A_NEXT with target TPS_Null and p_isnotalnum
 * uses A_NEXT with target TPS_InSpace.
 * NOTE(review): '~' and '.' have A_PUSH rows in actionTPS_Base but no
 * A_BINGO row here -- confirm that is intentional.
 */
1149 : static const TParserStateActionItem actionTPS_InSpace[] = {
1150 : {p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
1151 : {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
1152 : {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
1153 : {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
1154 : {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
1155 : {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
1156 : {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
1157 : {p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
1158 : {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
1159 : };
1160 :
/*
 * Rows for state TPS_InUDecimalFirst (entered from TPS_InUnsignedInt on
 * '.').  A digit uses A_CLEAR towards TPS_InUDecimal; EOF and the final
 * NULL-test row use A_POP.
 */
1161 : static const TParserStateActionItem actionTPS_InUDecimalFirst[] = {
1162 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1163 : {p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
1164 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1165 : };
1166 :
/*
 * Rows for state TPS_InUDecimal.  Digits loop on this state; '.' uses
 * A_PUSH towards TPS_InVersionFirst; 'e'/'E' use A_PUSH towards
 * TPS_InMantissaFirst.  EOF and the final NULL-test row carry A_BINGO with
 * token type DECIMAL_T and target TPS_Base.
 */
1167 : static const TParserStateActionItem actionTPS_InUDecimal[] = {
1168 : {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1169 : {p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
1170 : {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1171 : {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1172 : {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1173 : {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1174 : };
1175 :
/*
 * Rows for state TPS_InDecimalFirst (entered from TPS_InSignedInt on
 * '.').  A digit uses A_CLEAR towards TPS_InDecimal; EOF and the final
 * NULL-test row use A_POP.
 */
1176 : static const TParserStateActionItem actionTPS_InDecimalFirst[] = {
1177 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1178 : {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
1179 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1180 : };
1181 :
/*
 * Rows for state TPS_InDecimal.  Digits loop on this state; '.' uses
 * A_PUSH towards TPS_InVerVersion (the unsigned variant, TPS_InUDecimal,
 * goes to TPS_InVersionFirst instead); 'e'/'E' use A_PUSH towards
 * TPS_InMantissaFirst.  EOF and the final NULL-test row carry A_BINGO with
 * token type DECIMAL_T and target TPS_Base.
 */
1182 : static const TParserStateActionItem actionTPS_InDecimal[] = {
1183 : {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1184 : {p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
1185 : {p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
1186 : {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1187 : {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1188 : {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1189 : };
1190 :
/*
 * Rows for state TPS_InVerVersion (entered from TPS_InDecimal on '.').
 * A digit uses A_RERUN towards TPS_InSVerVersion and is the only row in
 * this table that names a special handler (SpecialVerVersion).  EOF and
 * the final NULL-test row use A_POP.
 */
1191 : static const TParserStateActionItem actionTPS_InVerVersion[] = {
1192 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1193 : {p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
1194 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1195 : };
1196 :
/*
 * Rows for state TPS_InSVerVersion (reached from TPS_InVerVersion).  A
 * digit carries A_BINGO | A_CLRALL with token type SPACE and target
 * TPS_InUnsignedInt; EOF uses A_POP; the final NULL-test row uses A_NEXT
 * with target TPS_Null.
 * NOTE(review): how this interacts with SpecialVerVersion is not visible
 * here -- see that handler before editing.
 */
1197 : static const TParserStateActionItem actionTPS_InSVerVersion[] = {
1198 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1199 : {p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
1200 : {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1201 : };
1202 :
1203 :
/*
 * Rows for state TPS_InVersionFirst (entered on '.' from TPS_InUDecimal
 * and TPS_InVersion).  A digit uses A_CLEAR towards TPS_InVersion; EOF and
 * the final NULL-test row use A_POP.
 */
1204 : static const TParserStateActionItem actionTPS_InVersionFirst[] = {
1205 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1206 : {p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
1207 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1208 : };
1209 :
/*
 * Rows for state TPS_InVersion.  Digits loop on this state; '.' uses
 * A_PUSH towards TPS_InVersionFirst.  EOF and the final NULL-test row
 * carry A_BINGO with token type VERSIONNUMBER and target TPS_Base.
 */
1210 : static const TParserStateActionItem actionTPS_InVersion[] = {
1211 : {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
1212 : {p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
1213 : {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1214 : {NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
1215 : };
1216 :
/*
 * Rows for state TPS_InMantissaFirst (entered on 'e'/'E' from the integer
 * and decimal states).  A digit uses A_CLEAR towards TPS_InMantissa; '+'
 * and '-' use A_NEXT towards TPS_InMantissaSign; EOF and the final
 * NULL-test row use A_POP.
 */
1217 : static const TParserStateActionItem actionTPS_InMantissaFirst[] = {
1218 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1219 : {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1220 : {p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
1221 : {p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
1222 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1223 : };
1224 :
/*
 * Rows for state TPS_InMantissaSign (entered from TPS_InMantissaFirst on
 * '+' or '-').  A digit uses A_CLEAR towards TPS_InMantissa; EOF and the
 * final NULL-test row use A_POP.
 */
1225 : static const TParserStateActionItem actionTPS_InMantissaSign[] = {
1226 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1227 : {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1228 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1229 : };
1230 :
/*
 * Rows for state TPS_InMantissa.  Digits loop on this state; EOF and the
 * final NULL-test row carry A_BINGO with token type SCIENTIFIC and target
 * TPS_Base.
 */
1231 : static const TParserStateActionItem actionTPS_InMantissa[] = {
1232 : {p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
1233 : {p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
1234 : {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
1235 : };
1236 :
/*
 * Rows for state TPS_InXMLEntityFirst (entered from TPS_Base on '&').
 * '#' goes to TPS_InXMLEntityNumFirst; an ASCII letter, ':' or '_' goes to
 * TPS_InXMLEntity; EOF and the final NULL-test row use A_POP.
 */
1237 : static const TParserStateActionItem actionTPS_InXMLEntityFirst[] = {
1238 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1239 : {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
1240 : {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1241 : {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1242 : {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1243 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1244 : };
1245 :
/*
 * Rows for state TPS_InXMLEntity.  p_isalnum, ':', '_', '.' and '-' loop
 * on this state; ';' goes to TPS_InXMLEntityEnd; EOF and the final
 * NULL-test row use A_POP.
 */
1246 : static const TParserStateActionItem actionTPS_InXMLEntity[] = {
1247 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1248 : {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1249 : {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1250 : {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1251 : {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
1252 : {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
1253 : {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1254 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1255 : };
1256 :
/*
 * Rows for state TPS_InXMLEntityNumFirst (entered after "&#").  'x' or
 * 'X' goes to TPS_InXMLEntityHexNumFirst; a digit goes to
 * TPS_InXMLEntityNum; EOF and the final NULL-test row use A_POP.
 */
1257 : static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[] = {
1258 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1259 : {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1260 : {p_iseqC, 'X', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1261 : {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1262 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1263 : };
1264 :
/*
 * Rows for state TPS_InXMLEntityHexNumFirst.  A p_isxdigit character goes
 * to TPS_InXMLEntityHexNum; EOF and the final NULL-test row use A_POP.
 */
1265 : static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[] = {
1266 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1267 : {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1268 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1269 : };
1270 :
/*
 * Rows for state TPS_InXMLEntityNum.  Digits loop on this state; ';' goes
 * to TPS_InXMLEntityEnd; EOF and the final NULL-test row use A_POP.
 */
1271 : static const TParserStateActionItem actionTPS_InXMLEntityNum[] = {
1272 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1273 : {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1274 : {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1275 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1276 : };
1277 :
/*
 * Rows for state TPS_InXMLEntityHexNum.  p_isxdigit characters loop on
 * this state; ';' goes to TPS_InXMLEntityEnd; EOF and the final NULL-test
 * row use A_POP.
 */
1278 : static const TParserStateActionItem actionTPS_InXMLEntityHexNum[] = {
1279 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1280 : {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1281 : {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1282 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1283 : };
1284 :
/*
 * Rows for state TPS_InXMLEntityEnd (reached after the closing ';').  The
 * single NULL-test row carries A_BINGO | A_CLEAR with token type XMLENTITY
 * and target TPS_Base; there is no separate EOF row.
 */
1285 : static const TParserStateActionItem actionTPS_InXMLEntityEnd[] = {
1286 : {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL}
1287 : };
1288 :
/*
 * Rows for state TPS_InTagFirst (entered from TPS_Base on '<').  All
 * matching rows use A_PUSH: '/' towards TPS_InTagCloseFirst, '!' towards
 * TPS_InCommentFirst, '?' towards TPS_InXMLBegin, and an ASCII letter, ':'
 * or '_' towards TPS_InTagName.  EOF and the final NULL-test row use A_POP.
 */
1289 : static const TParserStateActionItem actionTPS_InTagFirst[] = {
1290 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1291 : {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
1292 : {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
1293 : {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
1294 : {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
1295 : {p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
1296 : {p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
1297 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1298 : };
1299 :
/*
 * Rows for state TPS_InXMLBegin (entered after "<?").  Only 'x' is
 * accepted, going to TPS_InTag; EOF and the final NULL-test row use A_POP.
 */
1300 : static const TParserStateActionItem actionTPS_InXMLBegin[] = {
1301 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1302 : /* <?xml ... */
1303 : /* XXX do we want states for the 'm' and 'l'?  Right now this accepts <?xZ */
1304 : {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
1305 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1306 : };
1307 :
/*
 * Rows for state TPS_InTagCloseFirst (entered after "</").  An ASCII
 * letter goes to TPS_InTagName; EOF and the final NULL-test row use A_POP.
 * Unlike TPS_InTagFirst, ':' and '_' are not accepted here.
 */
1308 : static const TParserStateActionItem actionTPS_InTagCloseFirst[] = {
1309 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1310 : {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
1311 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1312 : };
1313 :
/*
 * Rows for state TPS_InTagName.  '/' goes to TPS_InTagBeginEnd; '>' goes
 * to TPS_InTagEnd and whitespace goes to TPS_InTag, both naming the
 * SpecialTags handler.  p_isalnum, ':', '_', '.' and '-' use A_NEXT with
 * target TPS_Null.  EOF and the final NULL-test row use A_POP.
 */
1314 : static const TParserStateActionItem actionTPS_InTagName[] = {
1315 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1316 : /* <br/> case */
1317 : {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
1318 : {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1319 : {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
1320 : {p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},
1321 : {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1322 : {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1323 : {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1324 : {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1325 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1326 : };
1327 :
/*
 * Rows for state TPS_InTagBeginEnd (entered from TPS_InTagName on '/').
 * '>' goes to TPS_InTagEnd; EOF and the final NULL-test row use A_POP.
 */
1328 : static const TParserStateActionItem actionTPS_InTagBeginEnd[] = {
1329 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1330 : {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
1331 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1332 : };
1333 :
/*
 * Rows for state TPS_InTag.  '>' goes to TPS_InTagEnd (with SpecialTags);
 * a single quote goes to TPS_InTagEscapeK and a double quote to
 * TPS_InTagEscapeKK.  ASCII letters, digits and the characters
 * = - _ # / : . & ? % ~ use A_NEXT with target TPS_Null; whitespace does
 * the same but also names SpecialTags.  EOF and the final NULL-test row
 * use A_POP, so any character not listed here ends up on an A_POP row.
 */
1334 : static const TParserStateActionItem actionTPS_InTag[] = {
1335 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1336 : {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1337 : {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
1338 : {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
1339 : {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
1340 : {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1341 : {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
1342 : {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1343 : {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1344 : {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
1345 : {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
1346 : {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1347 : {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1348 : {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
1349 : {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
1350 : {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
1351 : {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
1352 : {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
1353 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1354 : };
1355 :
/*
 * Rows for state TPS_InTagEscapeK (entered from TPS_InTag on a single
 * quote).  A backslash uses A_PUSH towards TPS_InTagBackSleshed; a single
 * quote goes back to TPS_InTag; the final NULL-test row uses A_NEXT and
 * loops on this state; EOF uses A_POP.
 */
1356 : static const TParserStateActionItem actionTPS_InTagEscapeK[] = {
1357 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1358 : {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1359 : {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
1360 : {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
1361 : };
1362 :
/*
 * Rows for state TPS_InTagEscapeKK (entered from TPS_InTag on a double
 * quote).  A backslash uses A_PUSH towards TPS_InTagBackSleshed; a double
 * quote goes back to TPS_InTag; the final NULL-test row uses A_NEXT and
 * loops on this state; EOF uses A_POP.
 */
1363 : static const TParserStateActionItem actionTPS_InTagEscapeKK[] = {
1364 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1365 : {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1366 : {p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
1367 : {NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
1368 : };
1369 :
/*
 * Rows for state TPS_InTagBackSleshed (pushed from the two quoted-string
 * states on a backslash).  EOF uses A_POP; the final NULL-test row is the
 * only row in this chunk that uses A_MERGE (target TPS_Null).
 * NOTE(review): A_MERGE semantics are defined outside this chunk.
 */
1370 : static const TParserStateActionItem actionTPS_InTagBackSleshed[] = {
1371 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1372 : {NULL, 0, A_MERGE, TPS_Null, 0, NULL}
1373 : };
1374 :
/*
 * Rows for state TPS_InTagEnd (reached on '>').  The single NULL-test row
 * carries A_BINGO | A_CLRALL with token type TAG_T and target TPS_Base.
 */
1375 : static const TParserStateActionItem actionTPS_InTagEnd[] = {
1376 : {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1377 : };
1378 :
/*
 * Rows for state TPS_InCommentFirst (entered after "<!").  '-' goes to
 * TPS_InCommentLast; 'D' or 'd' goes to TPS_InTag; EOF and the final
 * NULL-test row use A_POP.
 */
1379 : static const TParserStateActionItem actionTPS_InCommentFirst[] = {
1380 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1381 : {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
1382 : /* <!DOCTYPE ...> */
1383 : {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
1384 : {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
1385 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1386 : };
1387 :
/*
 * Rows for state TPS_InCommentLast (entered after "<!-").  A second '-'
 * goes to TPS_InComment; EOF and the final NULL-test row use A_POP.
 */
1388 : static const TParserStateActionItem actionTPS_InCommentLast[] = {
1389 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1390 : {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
1391 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1392 : };
1393 :
/*
 * Rows for state TPS_InComment.  '-' goes to TPS_InCloseCommentFirst; the
 * final NULL-test row uses A_NEXT with target TPS_Null; EOF uses A_POP.
 */
1394 : static const TParserStateActionItem actionTPS_InComment[] = {
1395 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1396 : {p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
1397 : {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1398 : };
1399 :
/*
 * Rows for state TPS_InCloseCommentFirst (one '-' seen inside a comment).
 * A second '-' goes to TPS_InCloseCommentLast; the final NULL-test row
 * uses A_NEXT back to TPS_InComment; EOF uses A_POP.
 */
1400 : static const TParserStateActionItem actionTPS_InCloseCommentFirst[] = {
1401 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1402 : {p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
1403 : {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1404 : };
1405 :
/*
 * Rows for state TPS_InCloseCommentLast ("--" seen inside a comment).
 * A further '-' uses A_NEXT with target TPS_Null; '>' goes to
 * TPS_InCommentEnd; the final NULL-test row uses A_NEXT back to
 * TPS_InComment; EOF uses A_POP.
 */
1406 : static const TParserStateActionItem actionTPS_InCloseCommentLast[] = {
1407 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1408 : {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1409 : {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
1410 : {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1411 : };
1412 :
/*
 * Rows for state TPS_InCommentEnd.  The single NULL-test row carries
 * A_BINGO | A_CLRALL with token type TAG_T (the same type used by
 * TPS_InTagEnd) and target TPS_Base.
 */
1413 : static const TParserStateActionItem actionTPS_InCommentEnd[] = {
1414 : {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1415 : };
1416 :
/*
 * Rows for state TPS_InHostFirstDomain (pushed on '.' from the word,
 * integer and host states).  An ASCII letter goes to
 * TPS_InHostDomainSecond; a digit goes to TPS_InHost; EOF and the final
 * NULL-test row use A_POP.
 */
1417 : static const TParserStateActionItem actionTPS_InHostFirstDomain[] = {
1418 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1419 : {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
1420 : {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1421 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1422 : };
1423 :
/*
 * Rows for state TPS_InHostDomainSecond.  An ASCII letter goes to
 * TPS_InHostDomain (A_NEXT).  A digit, '-', '_', '.' and '@' all use
 * A_PUSH, towards TPS_InHost, TPS_InHostFirstAN (for both '-' and '_'),
 * TPS_InHostFirstDomain and TPS_InEmail.  EOF and the final NULL-test row
 * use A_POP, so no token type is carried by this table.
 */
1424 : static const TParserStateActionItem actionTPS_InHostDomainSecond[] = {
1425 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1426 : {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1427 : {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1428 : {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1429 : {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1430 : {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1431 : {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1432 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1433 : };
1434 :
/*
 * Rows for state TPS_InHostDomain.  ASCII letters loop on this state.
 * p_isdigit appears twice: first with A_PUSH towards TPS_InHost, later
 * with A_POP.  ':' pushes towards TPS_InPortFirst; '-', '_', '.', '@'
 * push towards the host and email states.  p_isstophost carries
 * A_BINGO | A_CLRALL with token type HOST and target TPS_InURLPathStart,
 * and is listed before the '/' row that pushes towards TPS_InFURL.
 * EOF and the final NULL-test row carry A_BINGO | A_CLRALL with token type
 * HOST and target TPS_Base.
 * NOTE(review): the duplicate p_isdigit rows and the p_isstophost-before-
 * '/' ordering look deliberate -- confirm in TParserGet before reordering.
 */
1435 : static const TParserStateActionItem actionTPS_InHostDomain[] = {
1436 : {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1437 : {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1438 : {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1439 : {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
1440 : {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1441 : {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1442 : {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1443 : {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1444 : {p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
1445 : {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1446 : {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1447 : {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1448 : };
1449 :
/*
 * Rows for state TPS_InPortFirst (pushed from TPS_InHostDomain on ':').
 * A digit goes to TPS_InPort; EOF and the final NULL-test row use A_POP.
 */
1450 : static const TParserStateActionItem actionTPS_InPortFirst[] = {
1451 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1452 : {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1453 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1454 : };
1455 :
/*
 * Rows for state TPS_InPort.  Digits loop on this state.  p_isstophost
 * carries A_BINGO | A_CLRALL with token type HOST and target
 * TPS_InURLPathStart; '/' uses A_PUSH towards TPS_InFURL.  EOF and the
 * final NULL-test row carry A_BINGO | A_CLRALL with token type HOST and
 * target TPS_Base.
 */
1456 : static const TParserStateActionItem actionTPS_InPort[] = {
1457 : {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1458 : {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1459 : {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1460 : {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1461 : {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1462 : };
1463 :
/*
 * Rows for state TPS_InHostFirstAN (pushed on '-' or '_' from the word,
 * integer and host states).  A digit or an ASCII letter goes to
 * TPS_InHost; EOF and the final NULL-test row use A_POP.
 */
1464 : static const TParserStateActionItem actionTPS_InHostFirstAN[] = {
1465 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1466 : {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1467 : {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1468 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1469 : };
1470 :
/*
 * Rows for state TPS_InHost.  Digits and ASCII letters loop on this
 * state; '@' pushes towards TPS_InEmail, '.' towards
 * TPS_InHostFirstDomain, '-' and '_' towards TPS_InHostFirstAN.  EOF and
 * the final NULL-test row use A_POP, so no token type is carried by this
 * table.
 */
1471 : static const TParserStateActionItem actionTPS_InHost[] = {
1472 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1473 : {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1474 : {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1475 : {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1476 : {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1477 : {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1478 : {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1479 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1480 : };
1481 :
/*
 * Rows for state TPS_InEmail (pushed on '@').  p_isstophost uses A_POP;
 * p_ishost carries A_BINGO | A_CLRALL with token type EMAIL and target
 * TPS_Base; the final NULL-test row uses A_POP.  Unlike most tables here
 * there is no p_isEOF row.
 * NOTE(review): the p_isstophost row precedes p_ishost -- presumably order
 * matters; confirm before reordering.
 */
1482 : static const TParserStateActionItem actionTPS_InEmail[] = {
1483 : {p_isstophost, 0, A_POP, TPS_Null, 0, NULL},
1484 : {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
1485 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1486 : };
1487 :
/*
 * Rows for state TPS_InFileFirst (entered on '/').  An ASCII letter,
 * digit or '_' goes to TPS_InFile; '.' goes to TPS_InPathFirst; '~' uses
 * A_PUSH towards TPS_InFileTwiddle; EOF and the final NULL-test row use
 * A_POP.
 */
1488 : static const TParserStateActionItem actionTPS_InFileFirst[] = {
1489 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1490 : {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1491 : {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1492 : {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
1493 : {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1494 : {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1495 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1496 : };
1497 :
/*
 * Rows for state TPS_InFileTwiddle (entered on '~').  An ASCII letter,
 * digit or '_' goes to TPS_InFile; '/' goes to TPS_InFileFirst; EOF and
 * the final NULL-test row use A_POP.
 */
1498 : static const TParserStateActionItem actionTPS_InFileTwiddle[] = {
1499 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1500 : {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1501 : {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1502 : {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1503 : {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1504 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1505 : };
1506 :
/*
 * Rows for state TPS_InPathFirst (entered from TPS_InFileFirst on '.').
 * An ASCII letter, digit or '_' goes to TPS_InFile; a second '.' goes to
 * TPS_InPathSecond; '/' goes to TPS_InFileFirst; EOF and the final
 * NULL-test row use A_POP.
 */
1507 : static const TParserStateActionItem actionTPS_InPathFirst[] = {
1508 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1509 : {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1510 : {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1511 : {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1512 : {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1513 : {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1514 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1515 : };
1516 :
/*
 * Rows for state TPS_InPathFirstFirst (pushed from TPS_Base on '.').
 * A second '.' goes to TPS_InPathSecond; '/' goes to TPS_InFileFirst; EOF
 * and the final NULL-test row use A_POP.
 */
1517 : static const TParserStateActionItem actionTPS_InPathFirstFirst[] = {
1518 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1519 : {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1520 : {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1521 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1522 : };
1523 :
/*
 * Rows for state TPS_InPathSecond (reached after ".." ).  '/' appears
 * twice: first with A_NEXT | A_PUSH towards TPS_InFileFirst, then with
 * A_BINGO | A_CLEAR and token type FILEPATH.  EOF and whitespace also
 * carry A_BINGO | A_CLEAR with token type FILEPATH and target TPS_Base;
 * the final NULL-test row uses A_POP.
 * NOTE(review): presumably the second '/' row applies when the pushed
 * attempt is abandoned -- confirm in TParserGet; keep the row order.
 */
1524 : static const TParserStateActionItem actionTPS_InPathSecond[] = {
1525 : {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1526 : {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
1527 : {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1528 : {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1529 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1530 : };
1531 :
/*
 * Rows for state TPS_InFile.  ASCII letters, digits, '_' and '-' loop on
 * this state; '.' uses A_PUSH towards TPS_InFileNext; '/' uses A_PUSH
 * towards TPS_InFileFirst.  EOF and the final NULL-test row carry A_BINGO
 * with token type FILEPATH and target TPS_Base.
 */
1532 : static const TParserStateActionItem actionTPS_InFile[] = {
1533 : {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
1534 : {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1535 : {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1536 : {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1537 : {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1538 : {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
1539 : {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1540 : {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
1541 : };
1542 :
/*
 * Rows for state TPS_InFileNext (pushed on '.' from the word and file
 * states).  An ASCII letter, digit or '_' uses A_CLEAR towards TPS_InFile;
 * EOF and the final NULL-test row use A_POP.
 */
1543 : static const TParserStateActionItem actionTPS_InFileNext[] = {
1544 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1545 : {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
1546 : {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
1547 : {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
1548 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1549 : };
1550 :
/*
 * Rows for state TPS_InURLPathFirst.  A p_isurlchar character goes to
 * TPS_InURLPath; EOF and the final NULL-test row use A_POP.
 * NOTE(review): no table in this chunk names TPS_InURLPathFirst as a
 * target -- presumably it is entered from code elsewhere in the file.
 */
1551 : static const TParserStateActionItem actionTPS_InURLPathFirst[] = {
1552 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1553 : {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1554 : {NULL, 0, A_POP, TPS_Null, 0, NULL},
1555 : };
1556 :
/*
 * Rows for state TPS_InURLPathStart (the target of the p_isstophost rows
 * in TPS_InHostDomain and TPS_InPort).  The single NULL-test row uses
 * A_NEXT towards TPS_InURLPath.
 */
1557 : static const TParserStateActionItem actionTPS_InURLPathStart[] = {
1558 : {NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
1559 : };
1560 :
/*
 * Rows for state TPS_InURLPath.  p_isurlchar characters loop on this
 * state; EOF and the final NULL-test row carry A_BINGO with token type
 * URLPATH and target TPS_Base.
 */
1561 : static const TParserStateActionItem actionTPS_InURLPath[] = {
1562 : {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
1563 : {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1564 : {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
1565 : };
1566 :
/*
 * Rows for state TPS_InFURL (pushed on '/' from TPS_InHostDomain and
 * TPS_InPort).  p_isURLPath carries A_BINGO | A_CLRALL with token type
 * URL_T and target TPS_Base, and names the SpecialFURL handler; EOF and
 * the final NULL-test row use A_POP.
 */
1567 : static const TParserStateActionItem actionTPS_InFURL[] = {
1568 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1569 : {p_isURLPath, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL},
1570 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1571 : };
1572 :
/*
 * Rows for state TPS_InProtocolFirst (pushed from TPS_InAsciiWord on
 * ':').  '/' goes to TPS_InProtocolSecond; EOF and the final NULL-test row
 * use A_POP.
 */
1573 : static const TParserStateActionItem actionTPS_InProtocolFirst[] = {
1574 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1575 : {p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
1576 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1577 : };
1578 :
/*
 * Rows for state TPS_InProtocolSecond.  A second '/' goes to
 * TPS_InProtocolEnd; EOF and the final NULL-test row use A_POP.
 */
1579 : static const TParserStateActionItem actionTPS_InProtocolSecond[] = {
1580 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1581 : {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
1582 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1583 : };
1584 :
/*
 * Rows for state TPS_InProtocolEnd (reached after "://").  The single
 * NULL-test row carries A_BINGO | A_CLRALL with token type PROTOCOL and
 * target TPS_Base.
 */
1585 : static const TParserStateActionItem actionTPS_InProtocolEnd[] = {
1586 : {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
1587 : };
1588 :
/*
 * Rows for state TPS_InHyphenAsciiWordFirst (pushed on '-' from the
 * ASCII-word states).  An ASCII letter goes to TPS_InHyphenAsciiWord, any
 * other letter to TPS_InHyphenWord, a digit to
 * TPS_InHyphenDigitLookahead; EOF and the final NULL-test row use A_POP.
 */
1589 : static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
1590 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1591 : {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1592 : {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1593 : {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1594 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1595 : };
1596 :
/*
 * Rows for state TPS_InHyphenAsciiWord.  ASCII letters loop on this
 * state; p_isalpha and p_isspecial go to TPS_InHyphenWord; a digit goes to
 * TPS_InHyphenNumWord; '-' uses A_PUSH towards
 * TPS_InHyphenAsciiWordFirst.  EOF and the final NULL-test row carry
 * A_BINGO | A_CLRALL with token type ASCIIHWORD, target TPS_InParseHyphen
 * and the SpecialHyphen handler.
 */
1597 : static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
1598 : {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
1599 : {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1600 : {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1601 : {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1602 : {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1603 : {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1604 : {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
1605 : };
1606 :
/*
 * Rows for state TPS_InHyphenWordFirst (pushed on '-' from the word
 * states).  A letter goes to TPS_InHyphenWord; a digit goes to
 * TPS_InHyphenDigitLookahead; EOF and the final NULL-test row use A_POP.
 */
1607 : static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
1608 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1609 : {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1610 : {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1611 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1612 : };
1613 :
/*
 * Rows for state TPS_InHyphenWord.  p_isalpha and p_isspecial loop on
 * this state; a digit goes to TPS_InHyphenNumWord; '-' uses A_PUSH towards
 * TPS_InHyphenWordFirst.  EOF and the final NULL-test row carry
 * A_BINGO | A_CLRALL with token type HWORD, target TPS_InParseHyphen and
 * the SpecialHyphen handler.
 */
1614 : static const TParserStateActionItem actionTPS_InHyphenWord[] = {
1615 : {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
1616 : {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1617 : {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1618 : {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1619 : {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1620 : {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
1621 : };
1622 :
/*
 * Rows for state TPS_InHyphenNumWordFirst (pushed on '-' from the
 * num-word states).  A letter goes to TPS_InHyphenNumWord; a digit goes to
 * TPS_InHyphenDigitLookahead; EOF and the final NULL-test row use A_POP.
 */
1623 : static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
1624 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1625 : {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1626 : {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1627 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1628 : };
1629 :
/*
 * Rows for state TPS_InHyphenNumWord.  p_isalnum and p_isspecial loop on
 * this state; '-' uses A_PUSH towards TPS_InHyphenNumWordFirst.  EOF and
 * the final NULL-test row carry A_BINGO | A_CLRALL with token type
 * NUMHWORD, target TPS_InParseHyphen and the SpecialHyphen handler.
 */
1630 : static const TParserStateActionItem actionTPS_InHyphenNumWord[] = {
1631 : {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
1632 : {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1633 : {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1634 : {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
1635 : {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
1636 : };
1637 :
/*
 * Rows for state TPS_InHyphenDigitLookahead (entered when a digit follows
 * the '-' in the three Hyphen*First states).  Digits loop on this state;
 * p_isalpha and p_isspecial go to TPS_InHyphenNumWord; EOF and the final
 * NULL-test row use A_POP.
 */
1638 : static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = {
1639 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1640 : {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1641 : {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1642 : {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1643 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1644 : };
1645 :
/*
 * Rows for state TPS_InParseHyphen (the target of the A_BINGO rows of the
 * hyphenated-word states).  An ASCII letter goes to
 * TPS_InHyphenAsciiWordPart, any other letter to TPS_InHyphenWordPart; a
 * digit uses A_PUSH towards TPS_InHyphenUnsignedInt; '-' uses A_PUSH
 * towards TPS_InParseHyphenHyphen.  EOF and the final NULL-test row use
 * A_RERUN with target TPS_Base.
 */
1646 : static const TParserStateActionItem actionTPS_InParseHyphen[] = {
1647 : {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
1648 : {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1649 : {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1650 : {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
1651 : {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
1652 : {NULL, 0, A_RERUN, TPS_Base, 0, NULL}
1653 : };
1654 :
/*
 * Rows for state TPS_InParseHyphenHyphen (pushed from TPS_InParseHyphen
 * on '-').  p_isalnum and p_isspecial carry A_BINGO | A_CLEAR with token
 * type SPACE and target TPS_InParseHyphen; EOF and the final NULL-test row
 * use A_POP.
 */
1655 : static const TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
1656 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1657 : {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1658 : {p_isspecial, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1659 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1660 : };
1661 :
/*
 * Rows for state TPS_InHyphenWordPart.  p_isalpha and p_isspecial loop on
 * this state; a digit goes to TPS_InHyphenNumWordPart.  Both A_BINGO rows
 * carry token type PARTHWORD but differ in target: EOF targets TPS_Base,
 * the final NULL-test row targets TPS_InParseHyphen.
 */
1662 : static const TParserStateActionItem actionTPS_InHyphenWordPart[] = {
1663 : {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
1664 : {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1665 : {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1666 : {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1667 : {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
1668 : };
1669 :
/*
 * Rows for state TPS_InHyphenAsciiWordPart.  ASCII letters loop on this
 * state; p_isalpha and p_isspecial go to TPS_InHyphenWordPart; a digit
 * goes to TPS_InHyphenNumWordPart.  Both A_BINGO rows carry token type
 * ASCIIPARTHWORD: EOF targets TPS_Base, the final NULL-test row targets
 * TPS_InParseHyphen.
 */
1670 : static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
1671 : {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
1672 : {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1673 : {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1674 : {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1675 : {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1676 : {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
1677 : };
1678 :
/*
 * Rows for state TPS_InHyphenNumWordPart.  p_isalnum and p_isspecial loop
 * on this state.  Both A_BINGO rows carry token type NUMPARTHWORD: EOF
 * targets TPS_Base, the final NULL-test row targets TPS_InParseHyphen.
 */
1679 : static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
1680 : {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
1681 : {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1682 : {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1683 : {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
1684 : };
1685 :
/*
 * Rows for state TPS_InHyphenUnsignedInt (pushed from TPS_InParseHyphen
 * on a digit).  Digits use A_NEXT with target TPS_Null; p_isalpha and
 * p_isspecial use A_CLEAR towards TPS_InHyphenNumWordPart; EOF and the
 * final NULL-test row use A_POP, so this table carries no token type of
 * its own.
 */
1686 : static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
1687 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1688 : {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1689 : {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1690 : {p_isspecial, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1691 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1692 : };
1693 :
1694 :
1695 : /*
1696 : * main table of per-state parser actions
1697 : */
/*
 * TParserStateAction is the element type of Actions[]: it pairs one
 * state's row table with the state value itself.  The members are filled
 * positionally by the TPARSERSTATEACTION macro, so their order must not
 * change without updating that macro.
 */
1698 : typedef struct
1699 : {
1700 : const TParserStateActionItem *action; /* the actual state info */
1701 : TParserState state; /* only for Assert crosscheck */
1702 : #ifdef WPARSER_TRACE
1703 : const char *state_name; /* only for debug printout */
1704 : #endif
1705 : } TParserStateAction;
1706 :
/*
 * TPARSERSTATEACTION(state) expands to one TParserStateAction initializer.
 * CppConcat(action,state) names the row table for that state (the tables
 * are all called action<STATE>, e.g. actionTPS_Base), followed by the
 * state value.  With WPARSER_TRACE defined, CppAsString(state) supplies
 * the extra state_name member.
 * NOTE(review): CppConcat/CppAsString are defined outside this file chunk;
 * presumably token pasting and stringification.
 */
1707 : #ifdef WPARSER_TRACE
1708 : #define TPARSERSTATEACTION(state) \
1709 : { CppConcat(action,state), state, CppAsString(state) }
1710 : #else
1711 : #define TPARSERSTATEACTION(state) \
1712 : { CppConcat(action,state), state }
1713 : #endif
1714 :
1715 : /*
1716 : * order must be the same as in typedef enum {} TParserState!!
1717 : */
/*
 * Actions[] is indexed by the numeric state value: TParserGet asserts
 * Actions[prs->state->state].state == prs->state->state.  When adding a
 * state, add its enum member, its action<STATE> table and its entry here
 * at the matching position.
 */
1718 :
1719 : static const TParserStateAction Actions[] = {
1720 : TPARSERSTATEACTION(TPS_Base),
1721 : TPARSERSTATEACTION(TPS_InNumWord),
1722 : TPARSERSTATEACTION(TPS_InAsciiWord),
1723 : TPARSERSTATEACTION(TPS_InWord),
1724 : TPARSERSTATEACTION(TPS_InUnsignedInt),
1725 : TPARSERSTATEACTION(TPS_InSignedIntFirst),
1726 : TPARSERSTATEACTION(TPS_InSignedInt),
1727 : TPARSERSTATEACTION(TPS_InSpace),
1728 : TPARSERSTATEACTION(TPS_InUDecimalFirst),
1729 : TPARSERSTATEACTION(TPS_InUDecimal),
1730 : TPARSERSTATEACTION(TPS_InDecimalFirst),
1731 : TPARSERSTATEACTION(TPS_InDecimal),
1732 : TPARSERSTATEACTION(TPS_InVerVersion),
1733 : TPARSERSTATEACTION(TPS_InSVerVersion),
1734 : TPARSERSTATEACTION(TPS_InVersionFirst),
1735 : TPARSERSTATEACTION(TPS_InVersion),
1736 : TPARSERSTATEACTION(TPS_InMantissaFirst),
1737 : TPARSERSTATEACTION(TPS_InMantissaSign),
1738 : TPARSERSTATEACTION(TPS_InMantissa),
1739 : TPARSERSTATEACTION(TPS_InXMLEntityFirst),
1740 : TPARSERSTATEACTION(TPS_InXMLEntity),
1741 : TPARSERSTATEACTION(TPS_InXMLEntityNumFirst),
1742 : TPARSERSTATEACTION(TPS_InXMLEntityNum),
1743 : TPARSERSTATEACTION(TPS_InXMLEntityHexNumFirst),
1744 : TPARSERSTATEACTION(TPS_InXMLEntityHexNum),
1745 : TPARSERSTATEACTION(TPS_InXMLEntityEnd),
1746 : TPARSERSTATEACTION(TPS_InTagFirst),
1747 : TPARSERSTATEACTION(TPS_InXMLBegin),
1748 : TPARSERSTATEACTION(TPS_InTagCloseFirst),
1749 : TPARSERSTATEACTION(TPS_InTagName),
1750 : TPARSERSTATEACTION(TPS_InTagBeginEnd),
1751 : TPARSERSTATEACTION(TPS_InTag),
1752 : TPARSERSTATEACTION(TPS_InTagEscapeK),
1753 : TPARSERSTATEACTION(TPS_InTagEscapeKK),
1754 : TPARSERSTATEACTION(TPS_InTagBackSleshed),
1755 : TPARSERSTATEACTION(TPS_InTagEnd),
1756 : TPARSERSTATEACTION(TPS_InCommentFirst),
1757 : TPARSERSTATEACTION(TPS_InCommentLast),
1758 : TPARSERSTATEACTION(TPS_InComment),
1759 : TPARSERSTATEACTION(TPS_InCloseCommentFirst),
1760 : TPARSERSTATEACTION(TPS_InCloseCommentLast),
1761 : TPARSERSTATEACTION(TPS_InCommentEnd),
1762 : TPARSERSTATEACTION(TPS_InHostFirstDomain),
1763 : TPARSERSTATEACTION(TPS_InHostDomainSecond),
1764 : TPARSERSTATEACTION(TPS_InHostDomain),
1765 : TPARSERSTATEACTION(TPS_InPortFirst),
1766 : TPARSERSTATEACTION(TPS_InPort),
1767 : TPARSERSTATEACTION(TPS_InHostFirstAN),
1768 : TPARSERSTATEACTION(TPS_InHost),
1769 : TPARSERSTATEACTION(TPS_InEmail),
1770 : TPARSERSTATEACTION(TPS_InFileFirst),
1771 : TPARSERSTATEACTION(TPS_InFileTwiddle),
1772 : TPARSERSTATEACTION(TPS_InPathFirst),
1773 : TPARSERSTATEACTION(TPS_InPathFirstFirst),
1774 : TPARSERSTATEACTION(TPS_InPathSecond),
1775 : TPARSERSTATEACTION(TPS_InFile),
1776 : TPARSERSTATEACTION(TPS_InFileNext),
1777 : TPARSERSTATEACTION(TPS_InURLPathFirst),
1778 : TPARSERSTATEACTION(TPS_InURLPathStart),
1779 : TPARSERSTATEACTION(TPS_InURLPath),
1780 : TPARSERSTATEACTION(TPS_InFURL),
1781 : TPARSERSTATEACTION(TPS_InProtocolFirst),
1782 : TPARSERSTATEACTION(TPS_InProtocolSecond),
1783 : TPARSERSTATEACTION(TPS_InProtocolEnd),
1784 : TPARSERSTATEACTION(TPS_InHyphenAsciiWordFirst),
1785 : TPARSERSTATEACTION(TPS_InHyphenAsciiWord),
1786 : TPARSERSTATEACTION(TPS_InHyphenWordFirst),
1787 : TPARSERSTATEACTION(TPS_InHyphenWord),
1788 : TPARSERSTATEACTION(TPS_InHyphenNumWordFirst),
1789 : TPARSERSTATEACTION(TPS_InHyphenNumWord),
1790 : TPARSERSTATEACTION(TPS_InHyphenDigitLookahead),
1791 : TPARSERSTATEACTION(TPS_InParseHyphen),
1792 : TPARSERSTATEACTION(TPS_InParseHyphenHyphen),
1793 : TPARSERSTATEACTION(TPS_InHyphenWordPart),
1794 : TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart),
1795 : TPARSERSTATEACTION(TPS_InHyphenNumWordPart),
1796 : TPARSERSTATEACTION(TPS_InHyphenUnsignedInt)
1797 : };
1798 :
1799 :
1800 : static bool
1801 3278 : TParserGet(TParser *prs)
1802 : {
1803 3278 : const TParserStateActionItem *item = NULL;
1804 :
1805 3278 : Assert(prs->state);
1806 :
1807 3278 : if (prs->state->posbyte >= prs->lenstr)
1808 454 : return false;
1809 :
1810 2824 : prs->token = prs->str + prs->state->posbyte;
1811 2824 : prs->state->pushedAtAction = NULL;
1812 :
1813 : /* look at string */
1814 15434 : while (prs->state->posbyte <= prs->lenstr)
1815 : {
1816 12610 : if (prs->state->posbyte == prs->lenstr)
1817 462 : prs->state->charlen = 0;
1818 : else
1819 24296 : prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
1820 12148 : pg_mblen(prs->str + prs->state->posbyte);
1821 :
1822 12610 : Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
1823 12610 : Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
1824 12610 : Assert(Actions[prs->state->state].state == prs->state->state);
1825 :
1826 12610 : if (prs->state->pushedAtAction)
1827 : {
1828 : /* After a POP, pick up at the next test */
1829 310 : item = prs->state->pushedAtAction + 1;
1830 310 : prs->state->pushedAtAction = NULL;
1831 : }
1832 : else
1833 : {
1834 12300 : item = Actions[prs->state->state].action;
1835 12300 : Assert(item != NULL);
1836 : }
1837 :
1838 : /* find action by character class */
1839 80094 : while (item->isclass)
1840 : {
1841 63685 : prs->c = item->c;
1842 63685 : if (item->isclass(prs) != 0)
1843 8811 : break;
1844 54874 : item++;
1845 : }
1846 :
1847 : #ifdef WPARSER_TRACE
1848 : {
1849 : TParserPosition *ptr;
1850 :
1851 : fprintf(stderr, "state ");
1852 : /* indent according to stack depth */
1853 : for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
1854 : fprintf(stderr, " ");
1855 : fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
1856 : if (prs->state->posbyte < prs->lenstr)
1857 : fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
1858 : else
1859 : fprintf(stderr, "at EOF");
1860 : fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
1861 : (int) (item - Actions[prs->state->state].action),
1862 : (item->flags & A_BINGO) ? " BINGO" : "",
1863 : (item->flags & A_POP) ? " POP" : "",
1864 : (item->flags & A_PUSH) ? " PUSH" : "",
1865 : (item->flags & A_RERUN) ? " RERUN" : "",
1866 : (item->flags & A_CLEAR) ? " CLEAR" : "",
1867 : (item->flags & A_MERGE) ? " MERGE" : "",
1868 : (item->flags & A_CLRALL) ? " CLRALL" : "",
1869 : (item->tostate != TPS_Null) ? " tostate " : "",
1870 : (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
1871 : (item->type > 0) ? " type " : "",
1872 : tok_alias[item->type]);
1873 : }
1874 : #endif
1875 :
1876 : /* call special handler if exists */
1877 12610 : if (item->special)
1878 68 : item->special(prs);
1879 :
1880 : /* BINGO, token is found */
1881 12610 : if (item->flags & A_BINGO)
1882 : {
1883 2824 : Assert(item->type > 0);
1884 2824 : prs->lenbytetoken = prs->state->lenbytetoken;
1885 2824 : prs->lenchartoken = prs->state->lenchartoken;
1886 2824 : prs->state->lenbytetoken = prs->state->lenchartoken = 0;
1887 2824 : prs->type = item->type;
1888 : }
1889 :
1890 : /* do various actions by flags */
1891 12610 : if (item->flags & A_POP)
1892 : { /* pop stored state in stack */
1893 313 : TParserPosition *ptr = prs->state->prev;
1894 :
1895 313 : pfree(prs->state);
1896 313 : prs->state = ptr;
1897 313 : Assert(prs->state);
1898 : }
1899 12297 : else if (item->flags & A_PUSH)
1900 : { /* push (store) state in stack */
1901 716 : prs->state->pushedAtAction = item; /* remember where we push */
1902 716 : prs->state = newTParserPosition(prs->state);
1903 : }
1904 11581 : else if (item->flags & A_CLEAR)
1905 : { /* clear previous pushed state */
1906 : TParserPosition *ptr;
1907 :
1908 75 : Assert(prs->state->prev);
1909 75 : ptr = prs->state->prev->prev;
1910 75 : pfree(prs->state->prev);
1911 75 : prs->state->prev = ptr;
1912 : }
1913 11506 : else if (item->flags & A_CLRALL)
1914 : { /* clear all previous pushed state */
1915 : TParserPosition *ptr;
1916 :
1917 587 : while (prs->state->prev)
1918 : {
1919 331 : ptr = prs->state->prev->prev;
1920 331 : pfree(prs->state->prev);
1921 331 : prs->state->prev = ptr;
1922 : }
1923 : }
1924 11378 : else if (item->flags & A_MERGE)
1925 : { /* merge posinfo with current and pushed state */
1926 0 : TParserPosition *ptr = prs->state;
1927 :
1928 0 : Assert(prs->state->prev);
1929 0 : prs->state = prs->state->prev;
1930 :
1931 0 : prs->state->posbyte = ptr->posbyte;
1932 0 : prs->state->poschar = ptr->poschar;
1933 0 : prs->state->charlen = ptr->charlen;
1934 0 : prs->state->lenbytetoken = ptr->lenbytetoken;
1935 0 : prs->state->lenchartoken = ptr->lenchartoken;
1936 0 : pfree(ptr);
1937 : }
1938 :
1939 : /* set new state if pointed */
1940 12610 : if (item->tostate != TPS_Null)
1941 8236 : prs->state->state = item->tostate;
1942 :
1943 : /* check for go away */
1944 22396 : if ((item->flags & A_BINGO) ||
1945 9786 : (prs->state->posbyte >= prs->lenstr &&
1946 0 : (item->flags & A_RERUN) == 0))
1947 : break;
1948 :
1949 : /* go to beginning of loop if we should rerun or we just restore state */
1950 9786 : if (item->flags & (A_RERUN | A_POP))
1951 317 : continue;
1952 :
1953 : /* move forward */
1954 9469 : if (prs->state->charlen)
1955 : {
1956 9469 : prs->state->posbyte += prs->state->charlen;
1957 9469 : prs->state->lenbytetoken += prs->state->charlen;
1958 9469 : prs->state->poschar++;
1959 9469 : prs->state->lenchartoken++;
1960 : }
1961 : }
1962 :
1963 2824 : return (item && (item->flags & A_BINGO)) ? true : false;
1964 : }
1965 :
1966 : Datum
1967 74 : prsd_lextype(PG_FUNCTION_ARGS)
1968 : {
1969 74 : LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
1970 : int i;
1971 :
1972 1776 : for (i = 1; i <= LASTNUM; i++)
1973 : {
1974 1702 : descr[i - 1].lexid = i;
1975 1702 : descr[i - 1].alias = pstrdup(tok_alias[i]);
1976 1702 : descr[i - 1].descr = pstrdup(lex_descr[i]);
1977 : }
1978 :
1979 74 : descr[LASTNUM].lexid = 0;
1980 :
1981 74 : PG_RETURN_POINTER(descr);
1982 : }
1983 :
1984 : Datum
1985 454 : prsd_start(PG_FUNCTION_ARGS)
1986 : {
1987 454 : PG_RETURN_POINTER(TParserInit((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
1988 : }
1989 :
1990 : Datum
1991 3238 : prsd_nexttoken(PG_FUNCTION_ARGS)
1992 : {
1993 3238 : TParser *p = (TParser *) PG_GETARG_POINTER(0);
1994 3238 : char **t = (char **) PG_GETARG_POINTER(1);
1995 3238 : int *tlen = (int *) PG_GETARG_POINTER(2);
1996 :
1997 3238 : if (!TParserGet(p))
1998 454 : PG_RETURN_INT32(0);
1999 :
2000 2784 : *t = p->token;
2001 2784 : *tlen = p->lenbytetoken;
2002 :
2003 2784 : PG_RETURN_INT32(p->type);
2004 : }
2005 :
2006 : Datum
2007 454 : prsd_end(PG_FUNCTION_ARGS)
2008 : {
2009 454 : TParser *p = (TParser *) PG_GETARG_POINTER(0);
2010 :
2011 454 : TParserClose(p);
2012 454 : PG_RETURN_VOID();
2013 : }
2014 :
/*
 * Token-type classification macros used by the headline code.  Each takes a
 * token type code (one of the "Output token categories" defined near the top
 * of this file) and yields a boolean.  The argument may be evaluated more
 * than once, so it must not have side effects.
 */
#define LEAVETOKEN(x)	( (x)==SPACE )
#define COMPLEXTOKEN(x) \
	( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
#define ENDPUNCTOKEN(x) ( (x)==SPACE )

#define TS_IDIGNORE(x) \
	( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )

/* token types that get the "replace" flag when highlight is 0 */
#define HLIDREPLACE(x)	( (x)==TAG_T )

/* token types that get the "skip" flag when highlight is 0 */
#define HLIDSKIP(x) \
	( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )

/* token types that get the "skip" flag when highlight is nonzero */
#define XMLHLIDSKIP(x) \
	( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )

/* token types that are not counted as words when measuring a headline */
#define NONWORDTOKEN(x) \
	( (x)==SPACE || HLIDREPLACE(x) || HLIDSKIP(x) )

/* token types at which a headline or fragment is not allowed to end */
#define NOENDTOKEN(x) \
	( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || \
	  (x)==DECIMAL_T || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
2025 :
2026 : typedef struct
2027 : {
2028 : HeadlineWordEntry *words;
2029 : int len;
2030 : } hlCheck;
2031 :
2032 : static bool
2033 88 : checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
2034 : {
2035 : int i;
2036 88 : hlCheck *checkval = (hlCheck *) opaque;
2037 :
2038 424 : for (i = 0; i < checkval->len; i++)
2039 : {
2040 364 : if (checkval->words[i].item == val)
2041 : {
2042 : /* don't need to find all positions */
2043 45 : if (!data)
2044 28 : return true;
2045 :
2046 17 : if (!data->pos)
2047 : {
2048 17 : data->pos = palloc(sizeof(WordEntryPos) * checkval->len);
2049 17 : data->allocated = true;
2050 17 : data->npos = 1;
2051 17 : data->pos[0] = checkval->words[i].pos;
2052 : }
2053 0 : else if (data->pos[data->npos - 1] < checkval->words[i].pos)
2054 : {
2055 0 : data->pos[data->npos++] = checkval->words[i].pos;
2056 : }
2057 : }
2058 : }
2059 :
2060 60 : if (data && data->npos > 0)
2061 17 : return true;
2062 :
2063 43 : return false;
2064 : }
2065 :
2066 :
2067 : static bool
2068 110 : hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
2069 : {
2070 : int i,
2071 : j;
2072 110 : QueryItem *item = GETQUERY(query);
2073 110 : int pos = *p;
2074 :
2075 110 : *q = -1;
2076 110 : *p = INT_MAX;
2077 :
2078 580 : for (j = 0; j < query->size; j++)
2079 : {
2080 470 : if (item->type != QI_VAL)
2081 : {
2082 180 : item++;
2083 180 : continue;
2084 : }
2085 3380 : for (i = pos; i < prs->curwords; i++)
2086 : {
2087 3170 : if (prs->words[i].item == &item->qoperand)
2088 : {
2089 80 : if (i > *q)
2090 66 : *q = i;
2091 80 : break;
2092 : }
2093 : }
2094 290 : item++;
2095 : }
2096 :
2097 110 : if (*q < 0)
2098 50 : return false;
2099 :
2100 60 : item = GETQUERY(query);
2101 304 : for (j = 0; j < query->size; j++)
2102 : {
2103 244 : if (item->type != QI_VAL)
2104 : {
2105 92 : item++;
2106 92 : continue;
2107 : }
2108 922 : for (i = *q; i >= pos; i--)
2109 : {
2110 850 : if (prs->words[i].item == &item->qoperand)
2111 : {
2112 80 : if (i < *p)
2113 74 : *p = i;
2114 80 : break;
2115 : }
2116 : }
2117 152 : item++;
2118 : }
2119 :
2120 60 : if (*p <= *q)
2121 : {
2122 : hlCheck ch;
2123 :
2124 60 : ch.words = &(prs->words[*p]);
2125 60 : ch.len = *q - *p + 1;
2126 60 : if (TS_execute(GETQUERY(query), &ch, TS_EXEC_EMPTY, checkcondition_HL))
2127 13 : return true;
2128 : else
2129 : {
2130 47 : (*p)++;
2131 47 : return hlCover(prs, query, p, q);
2132 : }
2133 : }
2134 :
2135 0 : return false;
2136 : }
2137 :
2138 : static void
2139 6 : mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
2140 : {
2141 : int i;
2142 :
2143 265 : for (i = startpos; i <= endpos; i++)
2144 : {
2145 259 : if (prs->words[i].item)
2146 5 : prs->words[i].selected = 1;
2147 259 : if (highlight == 0)
2148 : {
2149 259 : if (HLIDREPLACE(prs->words[i].type))
2150 0 : prs->words[i].replace = 1;
2151 259 : else if (HLIDSKIP(prs->words[i].type))
2152 0 : prs->words[i].skip = 1;
2153 : }
2154 : else
2155 : {
2156 0 : if (XMLHLIDSKIP(prs->words[i].type))
2157 0 : prs->words[i].skip = 1;
2158 : }
2159 :
2160 259 : prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
2161 : }
2162 6 : }
2163 :
2164 : typedef struct
2165 : {
2166 : int32 startpos;
2167 : int32 endpos;
2168 : int32 poslen;
2169 : int32 curlen;
2170 : int16 in;
2171 : int16 excluded;
2172 : } CoverPos;
2173 :
2174 : static void
2175 5 : get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
2176 : int *curlen, int *poslen, int max_words)
2177 : {
2178 : int i;
2179 :
2180 : /*
2181 : * Objective: Generate a fragment of words between startpos and endpos
2182 : * such that it has at most max_words and both ends has query words. If
2183 : * the startpos and endpos are the endpoints of the cover and the cover
2184 : * has fewer words than max_words, then this function should just return
2185 : * the cover
2186 : */
2187 : /* first move startpos to an item */
2188 147 : for (i = *startpos; i <= *endpos; i++)
2189 : {
2190 147 : *startpos = i;
2191 147 : if (prs->words[i].item && !prs->words[i].repeated)
2192 5 : break;
2193 : }
2194 : /* cut endpos to have only max_words */
2195 5 : *curlen = 0;
2196 5 : *poslen = 0;
2197 146 : for (i = *startpos; i <= *endpos && *curlen < max_words; i++)
2198 : {
2199 141 : if (!NONWORDTOKEN(prs->words[i].type))
2200 73 : *curlen += 1;
2201 141 : if (prs->words[i].item && !prs->words[i].repeated)
2202 5 : *poslen += 1;
2203 : }
2204 : /* if the cover was cut then move back endpos to a query item */
2205 5 : if (*endpos > i)
2206 : {
2207 2 : *endpos = i;
2208 140 : for (i = *endpos; i >= *startpos; i--)
2209 : {
2210 140 : *endpos = i;
2211 140 : if (prs->words[i].item && !prs->words[i].repeated)
2212 2 : break;
2213 138 : if (!NONWORDTOKEN(prs->words[i].type))
2214 68 : *curlen -= 1;
2215 : }
2216 : }
2217 5 : }
2218 :
2219 : static void
2220 4 : mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
2221 : int shortword, int min_words,
2222 : int max_words, int max_fragments)
2223 : {
2224 : int32 poslen,
2225 : curlen,
2226 : i,
2227 : f,
2228 4 : num_f = 0;
2229 : int32 stretch,
2230 : maxstretch,
2231 : posmarker;
2232 :
2233 4 : int32 startpos = 0,
2234 4 : endpos = 0,
2235 4 : p = 0,
2236 4 : q = 0;
2237 :
2238 4 : int32 numcovers = 0,
2239 4 : maxcovers = 32;
2240 :
2241 : int32 minI,
2242 : minwords,
2243 : maxitems;
2244 : CoverPos *covers;
2245 :
2246 4 : covers = palloc(maxcovers * sizeof(CoverPos));
2247 :
2248 : /* get all covers */
2249 11 : while (hlCover(prs, query, &p, &q))
2250 : {
2251 3 : startpos = p;
2252 3 : endpos = q;
2253 :
2254 : /*
2255 : * Break the cover into smaller fragments such that each fragment has
2256 : * at most max_words. Also ensure that each end of the fragment is a
2257 : * query word. This will allow us to stretch the fragment in either
2258 : * direction
2259 : */
2260 :
2261 11 : while (startpos <= endpos)
2262 : {
2263 5 : get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
2264 5 : if (numcovers >= maxcovers)
2265 : {
2266 0 : maxcovers *= 2;
2267 0 : covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
2268 : }
2269 5 : covers[numcovers].startpos = startpos;
2270 5 : covers[numcovers].endpos = endpos;
2271 5 : covers[numcovers].curlen = curlen;
2272 5 : covers[numcovers].poslen = poslen;
2273 5 : covers[numcovers].in = 0;
2274 5 : covers[numcovers].excluded = 0;
2275 5 : numcovers++;
2276 5 : startpos = endpos + 1;
2277 5 : endpos = q;
2278 : }
2279 : /* move p to generate the next cover */
2280 3 : p++;
2281 : }
2282 :
2283 : /* choose best covers */
2284 9 : for (f = 0; f < max_fragments; f++)
2285 : {
2286 6 : maxitems = 0;
2287 6 : minwords = PG_INT32_MAX;
2288 6 : minI = -1;
2289 :
2290 : /*
2291 : * Choose the cover that contains max items. In case of tie choose the
2292 : * one with smaller number of words.
2293 : */
2294 15 : for (i = 0; i < numcovers; i++)
2295 : {
2296 16 : if (!covers[i].in && !covers[i].excluded &&
2297 9 : (maxitems < covers[i].poslen || (maxitems == covers[i].poslen
2298 2 : && minwords > covers[i].curlen)))
2299 : {
2300 5 : maxitems = covers[i].poslen;
2301 5 : minwords = covers[i].curlen;
2302 5 : minI = i;
2303 : }
2304 : }
2305 : /* if a cover was found mark it */
2306 6 : if (minI >= 0)
2307 : {
2308 5 : covers[minI].in = 1;
2309 : /* adjust the size of cover */
2310 5 : startpos = covers[minI].startpos;
2311 5 : endpos = covers[minI].endpos;
2312 5 : curlen = covers[minI].curlen;
2313 : /* stretch the cover if cover size is lower than max_words */
2314 5 : if (curlen < max_words)
2315 : {
2316 : /* divide the stretch on both sides of cover */
2317 5 : maxstretch = (max_words - curlen) / 2;
2318 :
2319 : /*
2320 : * first stretch the startpos stop stretching if 1. we hit the
2321 : * beginning of document 2. exceed maxstretch 3. we hit an
2322 : * already marked fragment
2323 : */
2324 5 : stretch = 0;
2325 5 : posmarker = startpos;
2326 99 : for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
2327 : {
2328 94 : if (!NONWORDTOKEN(prs->words[i].type))
2329 : {
2330 45 : curlen++;
2331 45 : stretch++;
2332 : }
2333 94 : posmarker = i;
2334 : }
2335 : /* cut back startpos till we find a non short token */
2336 21 : for (i = posmarker; i < startpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i++)
2337 : {
2338 16 : if (!NONWORDTOKEN(prs->words[i].type))
2339 6 : curlen--;
2340 : }
2341 5 : startpos = i;
2342 : /* now stretch the endpos as much as possible */
2343 5 : posmarker = endpos;
2344 159 : for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
2345 : {
2346 154 : if (!NONWORDTOKEN(prs->words[i].type))
2347 77 : curlen++;
2348 154 : posmarker = i;
2349 : }
2350 : /* cut back endpos till we find a non-short token */
2351 13 : for (i = posmarker; i > endpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i--)
2352 : {
2353 8 : if (!NONWORDTOKEN(prs->words[i].type))
2354 4 : curlen--;
2355 : }
2356 5 : endpos = i;
2357 : }
2358 5 : covers[minI].startpos = startpos;
2359 5 : covers[minI].endpos = endpos;
2360 5 : covers[minI].curlen = curlen;
2361 : /* Mark the chosen fragments (covers) */
2362 5 : mark_fragment(prs, highlight, startpos, endpos);
2363 5 : num_f++;
2364 : /* exclude overlapping covers */
2365 14 : for (i = 0; i < numcovers; i++)
2366 : {
2367 9 : if (i != minI && ((covers[i].startpos >= covers[minI].startpos && covers[i].startpos <= covers[minI].endpos) || (covers[i].endpos >= covers[minI].startpos && covers[i].endpos <= covers[minI].endpos)))
2368 0 : covers[i].excluded = 1;
2369 : }
2370 : }
2371 : else
2372 1 : break;
2373 : }
2374 :
2375 : /* show at least min_words we have not marked anything */
2376 4 : if (num_f <= 0)
2377 : {
2378 1 : startpos = endpos = curlen = 0;
2379 31 : for (i = 0; i < prs->curwords && curlen < min_words; i++)
2380 : {
2381 30 : if (!NONWORDTOKEN(prs->words[i].type))
2382 15 : curlen++;
2383 30 : endpos = i;
2384 : }
2385 1 : mark_fragment(prs, highlight, startpos, endpos);
2386 : }
2387 4 : pfree(covers);
2388 4 : }
2389 :
2390 : static void
2391 47 : mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
2392 : int shortword, int min_words, int max_words)
2393 : {
2394 47 : int p = 0,
2395 47 : q = 0;
2396 47 : int bestb = -1,
2397 47 : beste = -1;
2398 47 : int bestlen = -1;
2399 47 : int pose = 0,
2400 : posb,
2401 : poslen,
2402 : curlen;
2403 :
2404 : int i;
2405 :
2406 47 : if (highlight == 0)
2407 : {
2408 102 : while (hlCover(prs, query, &p, &q))
2409 : {
2410 : /* find cover len in words */
2411 10 : curlen = 0;
2412 10 : poslen = 0;
2413 72 : for (i = p; i <= q && curlen < max_words; i++)
2414 : {
2415 62 : if (!NONWORDTOKEN(prs->words[i].type))
2416 36 : curlen++;
2417 62 : if (prs->words[i].item && !prs->words[i].repeated)
2418 21 : poslen++;
2419 62 : pose = i;
2420 : }
2421 :
2422 10 : if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
2423 : {
2424 : /* best already found, so try one more cover */
2425 0 : p++;
2426 0 : continue;
2427 : }
2428 :
2429 10 : posb = p;
2430 10 : if (curlen < max_words)
2431 : { /* find good end */
2432 121 : for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
2433 : {
2434 117 : if (i != q)
2435 : {
2436 108 : if (!NONWORDTOKEN(prs->words[i].type))
2437 54 : curlen++;
2438 108 : if (prs->words[i].item && !prs->words[i].repeated)
2439 7 : poslen++;
2440 : }
2441 117 : pose = i;
2442 117 : if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
2443 76 : continue;
2444 41 : if (curlen >= min_words)
2445 5 : break;
2446 : }
2447 9 : if (curlen < min_words && i >= prs->curwords)
2448 : { /* got end of text and our cover is shorter
2449 : * than min_words */
2450 0 : for (i = p - 1; i >= 0; i--)
2451 : {
2452 0 : if (!NONWORDTOKEN(prs->words[i].type))
2453 0 : curlen++;
2454 0 : if (prs->words[i].item && !prs->words[i].repeated)
2455 0 : poslen++;
2456 0 : if (curlen >= max_words)
2457 0 : break;
2458 0 : if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
2459 0 : continue;
2460 0 : if (curlen >= min_words)
2461 0 : break;
2462 : }
2463 0 : posb = (i >= 0) ? i : 0;
2464 : }
2465 : }
2466 : else
2467 : { /* shorter cover :((( */
2468 1 : if (i > q)
2469 1 : i = q;
2470 4 : for (; curlen > min_words; i--)
2471 : {
2472 1 : if (!NONWORDTOKEN(prs->words[i].type))
2473 1 : curlen--;
2474 1 : if (prs->words[i].item && !prs->words[i].repeated)
2475 1 : poslen--;
2476 1 : pose = i;
2477 1 : if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
2478 1 : continue;
2479 0 : break;
2480 : }
2481 : }
2482 :
2483 10 : if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
2484 2 : (bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
2485 0 : (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
2486 : {
2487 8 : bestb = posb;
2488 8 : beste = pose;
2489 8 : bestlen = poslen;
2490 : }
2491 :
2492 10 : p++;
2493 : }
2494 :
2495 46 : if (bestlen < 0)
2496 : {
2497 38 : curlen = 0;
2498 168 : for (i = 0; i < prs->curwords && curlen < min_words; i++)
2499 : {
2500 130 : if (!NONWORDTOKEN(prs->words[i].type))
2501 84 : curlen++;
2502 130 : pose = i;
2503 : }
2504 38 : bestb = 0;
2505 38 : beste = pose;
2506 : }
2507 : }
2508 : else
2509 : {
2510 1 : bestb = 0;
2511 1 : beste = prs->curwords - 1;
2512 : }
2513 :
2514 380 : for (i = bestb; i <= beste; i++)
2515 : {
2516 333 : if (prs->words[i].item)
2517 49 : prs->words[i].selected = 1;
2518 333 : if (highlight == 0)
2519 : {
2520 292 : if (HLIDREPLACE(prs->words[i].type))
2521 0 : prs->words[i].replace = 1;
2522 292 : else if (HLIDSKIP(prs->words[i].type))
2523 0 : prs->words[i].skip = 1;
2524 : }
2525 : else
2526 : {
2527 41 : if (XMLHLIDSKIP(prs->words[i].type))
2528 1 : prs->words[i].skip = 1;
2529 : }
2530 :
2531 333 : prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
2532 : }
2533 :
2534 47 : }
2535 :
2536 : Datum
2537 51 : prsd_headline(PG_FUNCTION_ARGS)
2538 : {
2539 51 : HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
2540 51 : List *prsoptions = (List *) PG_GETARG_POINTER(1);
2541 51 : TSQuery query = PG_GETARG_TSQUERY(2);
2542 :
2543 : /* from opt + start and end tag */
2544 51 : int min_words = 15;
2545 51 : int max_words = 35;
2546 51 : int shortword = 3;
2547 51 : int max_fragments = 0;
2548 51 : int highlight = 0;
2549 : ListCell *l;
2550 :
2551 : /* config */
2552 51 : prs->startsel = NULL;
2553 51 : prs->stopsel = NULL;
2554 103 : foreach(l, prsoptions)
2555 : {
2556 52 : DefElem *defel = (DefElem *) lfirst(l);
2557 52 : char *val = defGetString(defel);
2558 :
2559 52 : if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
2560 3 : max_words = pg_atoi(val, sizeof(int32), 0);
2561 49 : else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
2562 3 : min_words = pg_atoi(val, sizeof(int32), 0);
2563 46 : else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
2564 0 : shortword = pg_atoi(val, sizeof(int32), 0);
2565 46 : else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
2566 4 : max_fragments = pg_atoi(val, sizeof(int32), 0);
2567 42 : else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
2568 20 : prs->startsel = pstrdup(val);
2569 22 : else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
2570 20 : prs->stopsel = pstrdup(val);
2571 2 : else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
2572 1 : prs->fragdelim = pstrdup(val);
2573 1 : else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
2574 3 : highlight = (pg_strcasecmp(val, "1") == 0 ||
2575 2 : pg_strcasecmp(val, "on") == 0 ||
2576 1 : pg_strcasecmp(val, "true") == 0 ||
2577 0 : pg_strcasecmp(val, "t") == 0 ||
2578 1 : pg_strcasecmp(val, "y") == 0 ||
2579 0 : pg_strcasecmp(val, "yes") == 0);
2580 : else
2581 0 : ereport(ERROR,
2582 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2583 : errmsg("unrecognized headline parameter: \"%s\"",
2584 : defel->defname)));
2585 : }
2586 :
2587 51 : if (highlight == 0)
2588 : {
2589 50 : if (min_words >= max_words)
2590 0 : ereport(ERROR,
2591 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2592 : errmsg("MinWords should be less than MaxWords")));
2593 50 : if (min_words <= 0)
2594 0 : ereport(ERROR,
2595 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2596 : errmsg("MinWords should be positive")));
2597 50 : if (shortword < 0)
2598 0 : ereport(ERROR,
2599 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2600 : errmsg("ShortWord should be >= 0")));
2601 50 : if (max_fragments < 0)
2602 0 : ereport(ERROR,
2603 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2604 : errmsg("MaxFragments should be >= 0")));
2605 : }
2606 :
2607 51 : if (max_fragments == 0)
2608 : /* call the default headline generator */
2609 47 : mark_hl_words(prs, query, highlight, shortword, min_words, max_words);
2610 : else
2611 4 : mark_hl_fragments(prs, query, highlight, shortword, min_words, max_words, max_fragments);
2612 :
2613 51 : if (!prs->startsel)
2614 31 : prs->startsel = pstrdup("<b>");
2615 51 : if (!prs->stopsel)
2616 31 : prs->stopsel = pstrdup("</b>");
2617 51 : if (!prs->fragdelim)
2618 20 : prs->fragdelim = pstrdup(" ... ");
2619 51 : prs->startsellen = strlen(prs->startsel);
2620 51 : prs->stopsellen = strlen(prs->stopsel);
2621 51 : prs->fragdelimlen = strlen(prs->fragdelim);
2622 :
2623 51 : PG_RETURN_POINTER(prs);
2624 : }
|