LCOV - code coverage report
Current view: top level - src/backend/tsearch - wparser_def.c (source / functions) Hit Total Coverage
Test: PostgreSQL Lines: 520 625 83.2 %
Date: 2017-09-29 15:12:54 Functions: 37 52 71.2 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * wparser_def.c
       4             :  *      Default text search parser
       5             :  *
       6             :  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
       7             :  *
       8             :  *
       9             :  * IDENTIFICATION
      10             :  *    src/backend/tsearch/wparser_def.c
      11             :  *
      12             :  *-------------------------------------------------------------------------
      13             :  */
      14             : 
      15             : #include "postgres.h"
      16             : 
      17             : #include <limits.h>
      18             : 
      19             : #include "catalog/pg_collation.h"
      20             : #include "commands/defrem.h"
      21             : #include "tsearch/ts_locale.h"
      22             : #include "tsearch/ts_public.h"
      23             : #include "tsearch/ts_type.h"
      24             : #include "tsearch/ts_utils.h"
      25             : #include "utils/builtins.h"
      26             : 
      27             : 
      28             : /* Define me to enable tracing of parser behavior */
      29             : /* #define WPARSER_TRACE */
      30             : 
      31             : 
      32             : /* Output token categories: numeric codes 1..23, one per kind of token */
      33             : 
      34             : #define ASCIIWORD       1
      35             : #define WORD_T          2
      36             : #define NUMWORD         3
      37             : #define EMAIL           4
      38             : #define URL_T           5
      39             : #define HOST            6
      40             : #define SCIENTIFIC      7
      41             : #define VERSIONNUMBER   8
      42             : #define NUMPARTHWORD    9
      43             : #define PARTHWORD       10
      44             : #define ASCIIPARTHWORD  11
      45             : #define SPACE           12
      46             : #define TAG_T           13
      47             : #define PROTOCOL        14
      48             : #define NUMHWORD        15
      49             : #define ASCIIHWORD      16
      50             : #define HWORD           17
      51             : #define URLPATH         18
      52             : #define FILEPATH        19
      53             : #define DECIMAL_T       20
      54             : #define SIGNEDINT       21
      55             : #define UNSIGNEDINT     22
      56             : #define XMLENTITY       23
      57             : 
      58             : #define LASTNUM         23  /* highest category code (same value as XMLENTITY) */
      59             : 
      60             : static const char *const tok_alias[] = {    /* short names, listed in token-category order */
      61             :     "",                     /* slot 0: no category has code 0 */
      62             :     "asciiword",
      63             :     "word",
      64             :     "numword",
      65             :     "email",
      66             :     "url",
      67             :     "host",
      68             :     "sfloat",
      69             :     "version",
      70             :     "hword_numpart",
      71             :     "hword_part",
      72             :     "hword_asciipart",
      73             :     "blank",
      74             :     "tag",
      75             :     "protocol",
      76             :     "numhword",
      77             :     "asciihword",
      78             :     "hword",
      79             :     "url_path",
      80             :     "file",
      81             :     "float",
      82             :     "int",
      83             :     "uint",
      84             :     "entity"
      85             : };
      86             : 
      87             : static const char *const lex_descr[] = {    /* descriptions, listed in token-category order */
      88             :     "",                     /* slot 0: no category has code 0 */
      89             :     "Word, all ASCII",
      90             :     "Word, all letters",
      91             :     "Word, letters and digits",
      92             :     "Email address",
      93             :     "URL",
      94             :     "Host",
      95             :     "Scientific notation",
      96             :     "Version number",
      97             :     "Hyphenated word part, letters and digits",
      98             :     "Hyphenated word part, all letters",
      99             :     "Hyphenated word part, all ASCII",
     100             :     "Space symbols",
     101             :     "XML tag",
     102             :     "Protocol head",
     103             :     "Hyphenated word, letters and digits",
     104             :     "Hyphenated word, all ASCII",
     105             :     "Hyphenated word, all letters",
     106             :     "URL path",
     107             :     "File or path name",
     108             :     "Decimal notation",
     109             :     "Signed integer",
     110             :     "Unsigned integer",
     111             :     "XML entity"
     112             : };
     113             : 
     114             : 
     115             : /* Parser states; TPS_Base is 0 and TPS_Null is a sentinel that follows the last real state */
     116             : 
     117             : typedef enum
     118             : {
     119             :     TPS_Base = 0,
     120             :     TPS_InNumWord,
     121             :     TPS_InAsciiWord,
     122             :     TPS_InWord,
     123             :     TPS_InUnsignedInt,
     124             :     TPS_InSignedIntFirst,
     125             :     TPS_InSignedInt,
     126             :     TPS_InSpace,
     127             :     TPS_InUDecimalFirst,
     128             :     TPS_InUDecimal,
     129             :     TPS_InDecimalFirst,
     130             :     TPS_InDecimal,
     131             :     TPS_InVerVersion,
     132             :     TPS_InSVerVersion,
     133             :     TPS_InVersionFirst,
     134             :     TPS_InVersion,
     135             :     TPS_InMantissaFirst,
     136             :     TPS_InMantissaSign,
     137             :     TPS_InMantissa,
     138             :     TPS_InXMLEntityFirst,
     139             :     TPS_InXMLEntity,
     140             :     TPS_InXMLEntityNumFirst,
     141             :     TPS_InXMLEntityNum,
     142             :     TPS_InXMLEntityHexNumFirst,
     143             :     TPS_InXMLEntityHexNum,
     144             :     TPS_InXMLEntityEnd,
     145             :     TPS_InTagFirst,
     146             :     TPS_InXMLBegin,
     147             :     TPS_InTagCloseFirst,
     148             :     TPS_InTagName,
     149             :     TPS_InTagBeginEnd,
     150             :     TPS_InTag,
     151             :     TPS_InTagEscapeK,
     152             :     TPS_InTagEscapeKK,
     153             :     TPS_InTagBackSleshed,
     154             :     TPS_InTagEnd,
     155             :     TPS_InCommentFirst,
     156             :     TPS_InCommentLast,
     157             :     TPS_InComment,
     158             :     TPS_InCloseCommentFirst,
     159             :     TPS_InCloseCommentLast,
     160             :     TPS_InCommentEnd,
     161             :     TPS_InHostFirstDomain,
     162             :     TPS_InHostDomainSecond,
     163             :     TPS_InHostDomain,
     164             :     TPS_InPortFirst,
     165             :     TPS_InPort,
     166             :     TPS_InHostFirstAN,
     167             :     TPS_InHost,
     168             :     TPS_InEmail,
     169             :     TPS_InFileFirst,
     170             :     TPS_InFileTwiddle,
     171             :     TPS_InPathFirst,
     172             :     TPS_InPathFirstFirst,
     173             :     TPS_InPathSecond,
     174             :     TPS_InFile,
     175             :     TPS_InFileNext,
     176             :     TPS_InURLPathFirst,
     177             :     TPS_InURLPathStart,
     178             :     TPS_InURLPath,
     179             :     TPS_InFURL,
     180             :     TPS_InProtocolFirst,
     181             :     TPS_InProtocolSecond,
     182             :     TPS_InProtocolEnd,
     183             :     TPS_InHyphenAsciiWordFirst,
     184             :     TPS_InHyphenAsciiWord,
     185             :     TPS_InHyphenWordFirst,
     186             :     TPS_InHyphenWord,
     187             :     TPS_InHyphenNumWordFirst,
     188             :     TPS_InHyphenNumWord,
     189             :     TPS_InHyphenDigitLookahead,
     190             :     TPS_InParseHyphen,
     191             :     TPS_InParseHyphenHyphen,
     192             :     TPS_InHyphenWordPart,
     193             :     TPS_InHyphenAsciiWordPart,
     194             :     TPS_InHyphenNumWordPart,
     195             :     TPS_InHyphenUnsignedInt,
     196             :     TPS_Null                    /* last state (fake value) */
     197             : } TParserState;
     198             : 
     199             : /* forward declaration, so the callback typedefs below can take a TParser pointer */
     200             : struct TParser;
     201             : 
     202             : typedef int (*TParserCharTest) (struct TParser *);  /* any p_is* function except
     203             :                                                      * p_iseq (it takes an extra char) */
     204             : typedef void (*TParserSpecial) (struct TParser *);  /* special handler for
     205             :                                                      * special cases... */
     206             : 
     207             : typedef struct
     208             : {
     209             :     TParserCharTest isclass;    /* character test callback (a p_is* function) */
     210             :     char        c;              /* NOTE(review): presumably the char handed to p_iseqC/p_isneC via TParser.c -- confirm */
     211             :     uint16      flags;          /* A_* flag bits */
     212             :     TParserState tostate;       /* NOTE(review): presumably the state to switch to -- confirm */
     213             :     int         type;           /* NOTE(review): presumably a token category code -- confirm */
     214             :     TParserSpecial special;     /* special-case handler callback */
     215             : } TParserStateActionItem;
     216             : 
     217             : /* Flag bits in TParserStateActionItem.flags; A_NEXT is zero (no bits set), the others are distinct single bits */
     218             : #define A_NEXT      0x0000
     219             : #define A_BINGO     0x0001
     220             : #define A_POP       0x0002
     221             : #define A_PUSH      0x0004
     222             : #define A_RERUN     0x0008
     223             : #define A_CLEAR     0x0010
     224             : #define A_MERGE     0x0020
     225             : #define A_CLRALL    0x0040
     226             : 
     227             : typedef struct TParserPosition
     228             : {
     229             :     int         posbyte;        /* position of parser in bytes */
     230             :     int         poschar;        /* position of parser in characters */
     231             :     int         charlen;        /* length of current char */
     232             :     int         lenbytetoken;   /* length of token-so-far in bytes */
     233             :     int         lenchartoken;   /* and in chars */
     234             :     TParserState state;         /* parser state at this position */
     235             :     struct TParserPosition *prev;   /* previous position in the chain; followed when the chain is freed */
     236             :     const TParserStateActionItem *pushedAtAction;   /* reset to NULL for every new position; NOTE(review): presumably the action that pushed it -- confirm */
     237             : } TParserPosition;
     238             : 
     239             : typedef struct TParser
     240             : {
     241             :     /* string and position information */
     242             :     char       *str;            /* multibyte string */
     243             :     int         lenstr;         /* length of mbstring, in bytes */
     244             : #ifdef USE_WIDE_UPPER_LOWER
     245             :     wchar_t    *wstr;           /* wide character string */
     246             :     pg_wchar   *pgwstr;         /* wide character string for C-locale */
     247             :     bool        usewide;        /* true when the max encoding length is > 1 */
     248             : #endif
     249             : 
     250             :     /* State of parse */
     251             :     int         charmaxlen;     /* max encoding length of the database encoding */
     252             :     TParserPosition *state;     /* current position, head of the position chain */
     253             :     bool        ignore;         /* switched on/off by SpecialTags at script and style tags */
     254             :     bool        wanthost;       /* set by SpecialFURL */
     255             : 
     256             :     /* silly char: the character that p_iseqC and p_isneC compare against */
     257             :     char        c;
     258             : 
     259             :     /* out */
     260             :     char       *token;
     261             :     int         lenbytetoken;
     262             :     int         lenchartoken;
     263             :     int         type;
     264             : } TParser;
     265             : 
     266             : 
     267             : /* forward decls here */
     268             : static bool TParserGet(TParser *prs);
     269             : 
     270             : /* Allocate a parser position, cloned from prev when one is given, and link it back to prev. */
     271             : static TParserPosition *
     272        1235 : newTParserPosition(TParserPosition *prev)
     273             : {
     274        1235 :     TParserPosition *res = (TParserPosition *) palloc(sizeof(TParserPosition));
     275             :     /* palloc'd memory is not pre-initialized here: copy prev if present, else zero-fill */
     276        1235 :     if (prev)
     277         741 :         memcpy(res, prev, sizeof(TParserPosition));
     278             :     else
     279         494 :         memset(res, 0, sizeof(TParserPosition));
     280             : 
     281        1235 :     res->prev = prev;
     282             :     /* never inherited from prev, even when everything else was copied */
     283        1235 :     res->pushedAtAction = NULL;
     284             : 
     285        1235 :     return res;
     286             : }
     287             : 
     288             : static TParser *
     289         454 : TParserInit(char *str, int len) /* build a parser over str, which is len bytes long */
     290             : {
     291         454 :     TParser    *prs = (TParser *) palloc0(sizeof(TParser));
     292             :     /* str is referenced, not copied, so it has to stay valid while the parser is in use */
     293         454 :     prs->charmaxlen = pg_database_encoding_max_length();
     294         454 :     prs->str = str;
     295         454 :     prs->lenstr = len;
     296             : 
     297             : #ifdef USE_WIDE_UPPER_LOWER
     298             : 
     299             :     /*
     300             :      * Use wide char code only when max encoding length > 1.
     301             :      */
     302         454 :     if (prs->charmaxlen > 1)
     303             :     {
     304         454 :         Oid         collation = DEFAULT_COLLATION_OID;  /* TODO */
     305         454 :         pg_locale_t mylocale = 0;   /* TODO */
     306             : 
     307         454 :         prs->usewide = true;
     308         454 :         if (lc_ctype_is_c(collation))
     309             :         {
     310             :             /*
     311             :              * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
     312             :              * be different from sizeof(wchar_t)
     313             :              */
     314           0 :             prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
     315           0 :             pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
     316             :         }
     317             :         else
     318             :         {
     319         454 :             prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
     320         454 :             char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
     321             :                        mylocale);
     322             :         }
     323             :     }
     324             :     else
     325           0 :         prs->usewide = false;
     326             : #endif
     327             :     /* the position chain starts as one zero-filled position in the base state */
     328         454 :     prs->state = newTParserPosition(NULL);
     329         454 :     prs->state->state = TPS_Base;
     330             : 
     331             : #ifdef WPARSER_TRACE
     332             : 
     333             :     /*
     334             :      * Use of %.*s here is a bit risky since it can misbehave if the data is
     335             :      * not in what libc thinks is the prevailing encoding.  However, since
     336             :      * this is just a debugging aid, we choose to live with that.
     337             :      */
     338             :     fprintf(stderr, "parsing \"%.*s\"\n", len, str);
     339             : #endif
     340             : 
     341         454 :     return prs;
     342             : }
     343             : 
     344             : /*
     345             :  * As an alternative to a full TParserInit one can create a
     346             :  * TParserCopy which basically is a regular TParser without a private
     347             :  * copy of the string - instead it uses the one from another TParser.
     348             :  * This is useful because at some places TParsers are created
     349             :  * recursively and the repeated copying around of the strings can
     350             :  * cause major inefficiency if the source string is long.
     351             :  * The new parser starts parsing at the original's current position.
     352             :  * A copy must be released with TParserCopyClose, not TParserClose.
     353             :  * Obviously one must not close the original TParser before the copy.
     354             :  */
     355             : static TParser *
     356          40 : TParserCopyInit(const TParser *orig)
     357             : {
     358          40 :     TParser    *prs = (TParser *) palloc0(sizeof(TParser));
     359             :     /* alias the original's buffers, offset to its current position; nothing is copied */
     360          40 :     prs->charmaxlen = orig->charmaxlen;
     361          40 :     prs->str = orig->str + orig->state->posbyte;
     362          40 :     prs->lenstr = orig->lenstr - orig->state->posbyte;
     363             : 
     364             : #ifdef USE_WIDE_UPPER_LOWER
     365          40 :     prs->usewide = orig->usewide;
     366             :     /* the wide strings are indexed by character, so they advance by poschar rather than posbyte */
     367          40 :     if (orig->pgwstr)
     368           0 :         prs->pgwstr = orig->pgwstr + orig->state->poschar;
     369          40 :     if (orig->wstr)
     370          40 :         prs->wstr = orig->wstr + orig->state->poschar;
     371             : #endif
     372             :     /* the copy gets its own fresh position chain, starting in the base state */
     373          40 :     prs->state = newTParserPosition(NULL);
     374          40 :     prs->state->state = TPS_Base;
     375             : 
     376             : #ifdef WPARSER_TRACE
     377             :     /* See note above about %.*s */
     378             :     fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str);
     379             : #endif
     380             : 
     381          40 :     return prs;
     382             : }
     383             : 
     384             : /* Free a parser's position chain, its wide-character strings (if any) and the parser itself; prs->str is not freed. */
     385             : static void
     386         454 : TParserClose(TParser *prs)
     387             : {
     388        1362 :     while (prs->state)
     389             :     {
     390         454 :         TParserPosition *ptr = prs->state->prev;
     391             :         /* the link was saved above, so the node can be freed before stepping to it */
     392         454 :         pfree(prs->state);
     393         454 :         prs->state = ptr;
     394             :     }
     395             : 
     396             : #ifdef USE_WIDE_UPPER_LOWER
     397         454 :     if (prs->wstr)
     398         454 :         pfree(prs->wstr);
     399         454 :     if (prs->pgwstr)
     400           0 :         pfree(prs->pgwstr);
     401             : #endif
     402             : 
     403             : #ifdef WPARSER_TRACE
     404             :     fprintf(stderr, "closing parser\n");
     405             : #endif
     406         454 :     pfree(prs);
     407         454 : }
     408             : 
     409             : /*
     410             :  * Close a parser created with TParserCopyInit.  Only the position chain and the parser struct are freed; the string pointers are left alone.
     411             :  */
     412             : static void
     413          40 : TParserCopyClose(TParser *prs)
     414             : {
     415         142 :     while (prs->state)
     416             :     {
     417          62 :         TParserPosition *ptr = prs->state->prev;
     418             :         /* the link was saved above, so the node can be freed before stepping to it */
     419          62 :         pfree(prs->state);
     420          62 :         prs->state = ptr;
     421             :     }
     422             : 
     423             : #ifdef WPARSER_TRACE
     424             :     fprintf(stderr, "closing parser copy\n");
     425             : #endif
     426          40 :     pfree(prs);
     427          40 : }
     428             : 
     429             : 
     430             : /*
     431             :  * Character-type support functions, equivalent to the is* macros, but
     432             :  * working with any possible encoding and locale. Notes:
     433             :  *  - with a multibyte encoding and the C locale, the isw* functions may
     434             :  *    fail or give wrong results.
     435             :  *  - a multibyte encoding with the C locale is often used for
     436             :  *    Asian languages.
     437             :  *  - if the locale is C then we use pgwstr instead of wstr.
     438             :  */
     439             : 
     440             : #ifdef USE_WIDE_UPPER_LOWER
     441             : /* p_iswhat(type) defines p_is##type() and its negation p_isnot##type(); on the pgwstr path any character above 0x7f tests as 0 */
     442             : #define p_iswhat(type)                                                      \
     443             : static int                                                                  \
     444             : p_is##type(TParser *prs) {                                                  \
     445             :     Assert( prs->state );                                                    \
     446             :     if ( prs->usewide )                                                      \
     447             :     {                                                                       \
     448             :         if ( prs->pgwstr )                                                   \
     449             :         {                                                                   \
     450             :             unsigned int c = *(prs->pgwstr + prs->state->poschar);         \
     451             :             if ( c > 0x7f )                                                  \
     452             :                 return 0;                                                   \
     453             :             return is##type( c );                                           \
     454             :         }                                                                   \
     455             :         return isw##type( *( prs->wstr + prs->state->poschar ) );          \
     456             :     }                                                                       \
     457             :                                                                             \
     458             :     return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \
     459             : }   \
     460             :                                                                             \
     461             : static int                                                                  \
     462             : p_isnot##type(TParser *prs) {                                               \
     463             :     return !p_is##type(prs);                                                \
     464             : }
     465             : 
     466             : static int
     467        1562 : p_isalnum(TParser *prs) /* nonzero if the current character is alphanumeric */
     468             : {
     469        1562 :     Assert(prs->state);
     470             : 
     471        1562 :     if (prs->usewide)
     472             :     {
     473        1562 :         if (prs->pgwstr)
     474             :         {
     475           0 :             unsigned int c = *(prs->pgwstr + prs->state->poschar);
     476             : 
     477             :             /*
     478             :              * with a multibyte encoding and the C locale, every non-ASCII
     479             :              * character is accepted as alphanumeric
     480             :              */
     481           0 :             if (c > 0x7f)
     482           0 :                 return 1;
     483             : 
     484           0 :             return isalnum(c);
     485             :         }
     486             :         /* wide string without pgwstr: test the wchar_t at the character position */
     487        1562 :         return iswalnum(*(prs->wstr + prs->state->poschar));
     488             :     }
     489             :     /* not in wide mode: test the byte at the byte position */
     490           0 :     return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte));
     491             : }
     492             : static int
     493        1440 : p_isnotalnum(TParser *prs)  /* logical negation of p_isalnum, so exactly 0 or 1 */
     494             : {
     495        1440 :     return !p_isalnum(prs);
     496             : }
     497             : 
     498             : static int
     499       11347 : p_isalpha(TParser *prs) /* nonzero if the current character is alphabetic */
     500             : {
     501       11347 :     Assert(prs->state);
     502             : 
     503       11347 :     if (prs->usewide)
     504             :     {
     505       11347 :         if (prs->pgwstr)
     506             :         {
     507           0 :             unsigned int c = *(prs->pgwstr + prs->state->poschar);
     508             : 
     509             :             /*
     510             :              * with a multibyte encoding and the C locale, every non-ASCII
     511             :              * character is accepted as an alpha character
     512             :              */
     513           0 :             if (c > 0x7f)
     514           0 :                 return 1;
     515             : 
     516           0 :             return isalpha(c);
     517             :         }
     518             :         /* wide string without pgwstr: test the wchar_t at the character position */
     519       11347 :         return iswalpha(*(prs->wstr + prs->state->poschar));
     520             :     }
     521             :     /* not in wide mode: test the byte at the byte position */
     522           0 :     return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte));
     523             : }
     524             : 
     525             : static int
     526           0 : p_isnotalpha(TParser *prs)  /* logical negation of p_isalpha, so exactly 0 or 1 */
     527             : {
     528           0 :     return !p_isalpha(prs);
     529             : }
     530             : 
     531             : /* p_iseq should be used only for ascii symbols: it matches c only against a character that is one byte long */
     532             : 
     533             : static int
     534       28051 : p_iseq(TParser *prs, char c)
     535             : {
     536       28051 :     Assert(prs->state);
     537       28051 :     return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
     538             : }
     539             : #else                           /* USE_WIDE_UPPER_LOWER */
     540             : /* p_iswhat(type) defines p_is##type(), a plain is##type() test of the byte at posbyte, and its negation p_isnot##type() */
     541             : #define p_iswhat(type)                                                      \
     542             : static int                                                                  \
     543             : p_is##type(TParser *prs) {                                                  \
     544             :     Assert( prs->state );                                                    \
     545             :     return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \
     546             : }   \
     547             :                                                                             \
     548             : static int                                                                  \
     549             : p_isnot##type(TParser *prs) {                                               \
     550             :     return !p_is##type(prs);                                                \
     551             : }
     552             : 
     553             : /* Compare the byte at the current position with c; unlike the wide-mode variant there is no charlen check. */
     554             : static int
     555             : p_iseq(TParser *prs, char c)
     556             : {
     557             :     Assert(prs->state);
     558             :     return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
     559             : }
     560             : /* in this branch p_isalnum/p_isalpha and their p_isnot* twins come from the macro instead of being hand-written */
     561             : p_iswhat(alnum)
     562             : p_iswhat(alpha)
     563             : #endif                          /* USE_WIDE_UPPER_LOWER */
     564             : /* macro-generated tests used by both branches: p_is##type and p_isnot##type for each type listed */
     565        4586 : p_iswhat(digit)
     566           0 : p_iswhat(lower)
     567           0 : p_iswhat(print)
     568           0 : p_iswhat(punct)
     569         113 : p_iswhat(space)
     570           0 : p_iswhat(upper)
     571           3 : p_iswhat(xdigit)
     572             : 
     573             : static int
     574       12212 : p_isEOF(TParser *prs)   /* 1 at end of input, else 0 */
     575             : {
     576       12212 :     Assert(prs->state);
     577       12212 :     return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;    /* all bytes consumed, or current char has zero length */
     578             : }
     579             : 
     580             : static int
     581       28051 : p_iseqC(TParser *prs)   /* p_iseq against the character stored in prs->c */
     582             : {
     583       28051 :     return p_iseq(prs, prs->c);
     584             : }
     585             : 
     586             : static int
     587           0 : p_isneC(TParser *prs)   /* negation of p_iseq against prs->c */
     588             : {
     589           0 :     return !p_iseq(prs, prs->c);
     590             : }
     591             : 
     592             : static int
     593        8889 : p_isascii(TParser *prs) /* 1 if the current character is one byte long and passes isascii(), else 0 */
     594             : {
     595        8889 :     return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
     596             : }
     597             : 
     598             : static int
     599        8889 : p_isasclet(TParser *prs)    /* 1 if both p_isascii and p_isalpha hold, else 0 */
     600             : {
     601        8889 :     return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
     602             : }
     603             : 
     604             : static int
     605         443 : p_isurlchar(TParser *prs)   /* 1 if the current character passes every check below, else 0 */
     606             : {
     607             :     char        ch;
     608             : 
     609             :     /* no non-ASCII need apply */
     610         443 :     if (prs->state->charlen != 1)
     611           0 :         return 0;
     612         443 :     ch = *(prs->str + prs->state->posbyte);
     613             :     /* no spaces or control characters: only 0x21 through 0x7E get past this test */
     614         443 :     if (ch <= 0x20 || ch >= 0x7F)
     615          39 :         return 0;
     616             :     /* reject characters disallowed by RFC 3986 */
     617         404 :     switch (ch)
     618             :     {
     619             :         case '"':
     620             :         case '<':
     621             :         case '>':
     622             :         case '\\':
     623             :         case '^':
     624             :         case '`':
     625             :         case '{':
     626             :         case '|':
     627             :         case '}':
     628           4 :             return 0;
     629             :     }
     630         400 :     return 1;
     631             : }
     632             : 
     633             : 
     634             : /* deliberately suppress unused-function complaints for the above; it only references them and is not meant to be executed (every argument is NULL) */
     635             : void        _make_compiler_happy(void);
     636             : void
     637           0 : _make_compiler_happy(void)
     638             : {
     639           0 :     p_isalnum(NULL);
     640           0 :     p_isnotalnum(NULL);
     641           0 :     p_isalpha(NULL);
     642           0 :     p_isnotalpha(NULL);
     643           0 :     p_isdigit(NULL);
     644           0 :     p_isnotdigit(NULL);
     645           0 :     p_islower(NULL);
     646           0 :     p_isnotlower(NULL);
     647           0 :     p_isprint(NULL);
     648           0 :     p_isnotprint(NULL);
     649           0 :     p_ispunct(NULL);
     650           0 :     p_isnotpunct(NULL);
     651           0 :     p_isspace(NULL);
     652           0 :     p_isnotspace(NULL);
     653           0 :     p_isupper(NULL);
     654           0 :     p_isnotupper(NULL);
     655           0 :     p_isxdigit(NULL);
     656           0 :     p_isnotxdigit(NULL);
     657           0 :     p_isEOF(NULL);
     658           0 :     p_iseqC(NULL);
     659           0 :     p_isneC(NULL);
     660           0 : }
     661             : 
     662             : /* Track <script>/<style> tags: an opening tag sets prs->ignore, a closing tag clears it. */
     663             : static void
     664          42 : SpecialTags(TParser *prs)
     665             : {
     666          42 :     switch (prs->state->lenchartoken)    /* token length selects which tag names can match */
     667             :     {
     668             :         case 8:                 /* </script */
     669           1 :             if (pg_strncasecmp(prs->token, "</script", 8) == 0)
     670           1 :                 prs->ignore = false;
     671           1 :             break;
     672             :         case 7:                 /* <script or </style */
     673           4 :             if (pg_strncasecmp(prs->token, "</style", 7) == 0)
     674           0 :                 prs->ignore = false;
     675           4 :             else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
     676           1 :                 prs->ignore = true;
     677           4 :             break;
     678             :         case 6:                 /* <style */
     679           3 :             if (pg_strncasecmp(prs->token, "<style", 6) == 0)
     680           0 :                 prs->ignore = true;
     681           3 :             break;
     682             :         default:
     683          34 :             break;              /* any other length: prs->ignore is left unchanged */
     684             :     }
     685          42 : }
     686             : /* Set prs->wanthost and move the position back by the current token's length. */
     687             : static void
     688          22 : SpecialFURL(TParser *prs)
     689             : {
     690          22 :     prs->wanthost = true;    /* consumed (and cleared) by p_isstophost */
     691          22 :     prs->state->posbyte -= prs->state->lenbytetoken;     /* rewind the byte offset */
     692          22 :     prs->state->poschar -= prs->state->lenchartoken;     /* and the character offset alongside it */
     693          22 : }
     694             : /* Move the position back by the current token's length, in bytes and in characters. */
     695             : static void
     696           4 : SpecialHyphen(TParser *prs)
     697             : {
     698           4 :     prs->state->posbyte -= prs->state->lenbytetoken;
     699           4 :     prs->state->poschar -= prs->state->lenchartoken;
     700           4 : }
     701             : /* Move the position back by the current token's length, then reset that length to zero. */
     702             : static void
     703           0 : SpecialVerVersion(TParser *prs)
     704             : {
     705           0 :     prs->state->posbyte -= prs->state->lenbytetoken;
     706           0 :     prs->state->poschar -= prs->state->lenchartoken;
     707           0 :     prs->state->lenbytetoken = 0;    /* unlike SpecialHyphen, the token lengths are discarded too */
     708           0 :     prs->state->lenchartoken = 0;
     709           0 : }
     710             : /* One-shot test of prs->wanthost: returns 1 and clears the flag if it was set, otherwise returns 0. */
     711             : static int
     712          80 : p_isstophost(TParser *prs)
     713             : {
     714          80 :     if (prs->wanthost)
     715             :     {
     716          34 :         prs->wanthost = false;   /* the flag is good for a single positive answer */
     717          34 :         return 1;
     718             :     }
     719          46 :     return 0;
     720             : }
     721             : /* Returns 1 while prs->ignore is set (SpecialTags toggles it), otherwise 0. */
     722             : static int
     723        4221 : p_isignore(TParser *prs)
     724             : {
     725        4221 :     return (prs->ignore) ? 1 : 0;
     726             : }
     727             : /* Lookahead: returns 1 and advances prs if a temporary parser reads a HOST token here, else returns 0. */
     728             : static int
     729          15 : p_ishost(TParser *prs)
     730             : {
     731          15 :     TParser    *tmpprs = TParserCopyInit(prs);   /* presumably a copy positioned like prs; closed below */
     732          15 :     int         res = 0;
     733             : 
     734          15 :     tmpprs->wanthost = true;
     735             : 
     736          15 :     if (TParserGet(tmpprs) && tmpprs->type == HOST)
     737             :     {
     738          12 :         prs->state->posbyte += tmpprs->lenbytetoken;     /* step prs past the token the copy found */
     739          12 :         prs->state->poschar += tmpprs->lenchartoken;
     740          12 :         prs->state->lenbytetoken += tmpprs->lenbytetoken;    /* and grow prs's current token by its length */
     741          12 :         prs->state->lenchartoken += tmpprs->lenchartoken;
     742          12 :         prs->state->charlen = tmpprs->state->charlen;
     743          12 :         res = 1;
     744             :     }
     745          15 :     TParserCopyClose(tmpprs);    /* reached on both the success and the failure path */
     746             : 
     747          15 :     return res;
     748             : }
     749             : /* Lookahead: returns 1 and advances prs if a temporary parser started in TPS_InURLPathFirst reads a URLPATH token. */
     750             : static int
     751          25 : p_isURLPath(TParser *prs)
     752             : {
     753          25 :     TParser    *tmpprs = TParserCopyInit(prs);   /* presumably a copy positioned like prs; closed below */
     754          25 :     int         res = 0;
     755             : 
     756          25 :     tmpprs->state = newTParserPosition(tmpprs->state);   /* NOTE(review): looks like a new position record derived from the copied one -- confirm */
     757          25 :     tmpprs->state->state = TPS_InURLPathFirst;   /* force the copy to start scanning as a URL path */
     758             : 
     759          25 :     if (TParserGet(tmpprs) && tmpprs->type == URLPATH)
     760             :     {
     761          22 :         prs->state->posbyte += tmpprs->lenbytetoken;     /* step prs past the token the copy found */
     762          22 :         prs->state->poschar += tmpprs->lenchartoken;
     763          22 :         prs->state->lenbytetoken += tmpprs->lenbytetoken;    /* and grow prs's current token by its length */
     764          22 :         prs->state->lenchartoken += tmpprs->lenchartoken;
     765          22 :         prs->state->charlen = tmpprs->state->charlen;
     766          22 :         res = 1;
     767             :     }
     768          25 :     TParserCopyClose(tmpprs);    /* reached on both the success and the failure path */
     769             : 
     770          25 :     return res;
     771             : }
     772             : 
     773             : /*
     774             :  * Returns 1 if the current character has zero display length or is
     775             :  * one of the special signs listed below, used in several languages.
     776             :  * Such characters do not break a word even though they are not
     777             :  * alphabetic. At the beginning of a word they are not part of it.
     778             :  */
     779             : static int
     780        1027 : p_isspecial(TParser *prs)
     781             : {
     782             :     /*
     783             :      * Only an exact 0 counts: pg_dsplen can also return -1 (error or control character)
     784             :      */
     785        1027 :     if (pg_dsplen(prs->str + prs->state->posbyte) == 0)
     786           0 :         return 1;
     787             : 
     788             : #ifdef USE_WIDE_UPPER_LOWER
     789             : 
     790             :     /*
     791             :      * Unicode characters in the 'Mark, Spacing Combining' category. These
     792             :      * characters are not alphabetic, but they do not break words either.
     793             :      * Check this only for the UTF8 encoding, because other encodings of
     794             :      * these characters aren't supported by postgres or don't even exist.
     795             :      */
     796        1027 :     if (GetDatabaseEncoding() == PG_UTF8 && prs->usewide)
     797             :     {
     798             :         static const pg_wchar strange_letter[] = {
     799             :             /*
     800             :              * searched by binary search below, so keep elements in ascending order
     801             :              */
     802             :             0x0903,             /* DEVANAGARI SIGN VISARGA */
     803             :             0x093E,             /* DEVANAGARI VOWEL SIGN AA */
     804             :             0x093F,             /* DEVANAGARI VOWEL SIGN I */
     805             :             0x0940,             /* DEVANAGARI VOWEL SIGN II */
     806             :             0x0949,             /* DEVANAGARI VOWEL SIGN CANDRA O */
     807             :             0x094A,             /* DEVANAGARI VOWEL SIGN SHORT O */
     808             :             0x094B,             /* DEVANAGARI VOWEL SIGN O */
     809             :             0x094C,             /* DEVANAGARI VOWEL SIGN AU */
     810             :             0x0982,             /* BENGALI SIGN ANUSVARA */
     811             :             0x0983,             /* BENGALI SIGN VISARGA */
     812             :             0x09BE,             /* BENGALI VOWEL SIGN AA */
     813             :             0x09BF,             /* BENGALI VOWEL SIGN I */
     814             :             0x09C0,             /* BENGALI VOWEL SIGN II */
     815             :             0x09C7,             /* BENGALI VOWEL SIGN E */
     816             :             0x09C8,             /* BENGALI VOWEL SIGN AI */
     817             :             0x09CB,             /* BENGALI VOWEL SIGN O */
     818             :             0x09CC,             /* BENGALI VOWEL SIGN AU */
     819             :             0x09D7,             /* BENGALI AU LENGTH MARK */
     820             :             0x0A03,             /* GURMUKHI SIGN VISARGA */
     821             :             0x0A3E,             /* GURMUKHI VOWEL SIGN AA */
     822             :             0x0A3F,             /* GURMUKHI VOWEL SIGN I */
     823             :             0x0A40,             /* GURMUKHI VOWEL SIGN II */
     824             :             0x0A83,             /* GUJARATI SIGN VISARGA */
     825             :             0x0ABE,             /* GUJARATI VOWEL SIGN AA */
     826             :             0x0ABF,             /* GUJARATI VOWEL SIGN I */
     827             :             0x0AC0,             /* GUJARATI VOWEL SIGN II */
     828             :             0x0AC9,             /* GUJARATI VOWEL SIGN CANDRA O */
     829             :             0x0ACB,             /* GUJARATI VOWEL SIGN O */
     830             :             0x0ACC,             /* GUJARATI VOWEL SIGN AU */
     831             :             0x0B02,             /* ORIYA SIGN ANUSVARA */
     832             :             0x0B03,             /* ORIYA SIGN VISARGA */
     833             :             0x0B3E,             /* ORIYA VOWEL SIGN AA */
     834             :             0x0B40,             /* ORIYA VOWEL SIGN II */
     835             :             0x0B47,             /* ORIYA VOWEL SIGN E */
     836             :             0x0B48,             /* ORIYA VOWEL SIGN AI */
     837             :             0x0B4B,             /* ORIYA VOWEL SIGN O */
     838             :             0x0B4C,             /* ORIYA VOWEL SIGN AU */
     839             :             0x0B57,             /* ORIYA AU LENGTH MARK */
     840             :             0x0BBE,             /* TAMIL VOWEL SIGN AA */
     841             :             0x0BBF,             /* TAMIL VOWEL SIGN I */
     842             :             0x0BC1,             /* TAMIL VOWEL SIGN U */
     843             :             0x0BC2,             /* TAMIL VOWEL SIGN UU */
     844             :             0x0BC6,             /* TAMIL VOWEL SIGN E */
     845             :             0x0BC7,             /* TAMIL VOWEL SIGN EE */
     846             :             0x0BC8,             /* TAMIL VOWEL SIGN AI */
     847             :             0x0BCA,             /* TAMIL VOWEL SIGN O */
     848             :             0x0BCB,             /* TAMIL VOWEL SIGN OO */
     849             :             0x0BCC,             /* TAMIL VOWEL SIGN AU */
     850             :             0x0BD7,             /* TAMIL AU LENGTH MARK */
     851             :             0x0C01,             /* TELUGU SIGN CANDRABINDU */
     852             :             0x0C02,             /* TELUGU SIGN ANUSVARA */
     853             :             0x0C03,             /* TELUGU SIGN VISARGA */
     854             :             0x0C41,             /* TELUGU VOWEL SIGN U */
     855             :             0x0C42,             /* TELUGU VOWEL SIGN UU */
     856             :             0x0C43,             /* TELUGU VOWEL SIGN VOCALIC R */
     857             :             0x0C44,             /* TELUGU VOWEL SIGN VOCALIC RR */
     858             :             0x0C82,             /* KANNADA SIGN ANUSVARA */
     859             :             0x0C83,             /* KANNADA SIGN VISARGA */
     860             :             0x0CBE,             /* KANNADA VOWEL SIGN AA */
     861             :             0x0CC0,             /* KANNADA VOWEL SIGN II */
     862             :             0x0CC1,             /* KANNADA VOWEL SIGN U */
     863             :             0x0CC2,             /* KANNADA VOWEL SIGN UU */
     864             :             0x0CC3,             /* KANNADA VOWEL SIGN VOCALIC R */
     865             :             0x0CC4,             /* KANNADA VOWEL SIGN VOCALIC RR */
     866             :             0x0CC7,             /* KANNADA VOWEL SIGN EE */
     867             :             0x0CC8,             /* KANNADA VOWEL SIGN AI */
     868             :             0x0CCA,             /* KANNADA VOWEL SIGN O */
     869             :             0x0CCB,             /* KANNADA VOWEL SIGN OO */
     870             :             0x0CD5,             /* KANNADA LENGTH MARK */
     871             :             0x0CD6,             /* KANNADA AI LENGTH MARK */
     872             :             0x0D02,             /* MALAYALAM SIGN ANUSVARA */
     873             :             0x0D03,             /* MALAYALAM SIGN VISARGA */
     874             :             0x0D3E,             /* MALAYALAM VOWEL SIGN AA */
     875             :             0x0D3F,             /* MALAYALAM VOWEL SIGN I */
     876             :             0x0D40,             /* MALAYALAM VOWEL SIGN II */
     877             :             0x0D46,             /* MALAYALAM VOWEL SIGN E */
     878             :             0x0D47,             /* MALAYALAM VOWEL SIGN EE */
     879             :             0x0D48,             /* MALAYALAM VOWEL SIGN AI */
     880             :             0x0D4A,             /* MALAYALAM VOWEL SIGN O */
     881             :             0x0D4B,             /* MALAYALAM VOWEL SIGN OO */
     882             :             0x0D4C,             /* MALAYALAM VOWEL SIGN AU */
     883             :             0x0D57,             /* MALAYALAM AU LENGTH MARK */
     884             :             0x0D82,             /* SINHALA SIGN ANUSVARAYA */
     885             :             0x0D83,             /* SINHALA SIGN VISARGAYA */
     886             :             0x0DCF,             /* SINHALA VOWEL SIGN AELA-PILLA */
     887             :             0x0DD0,             /* SINHALA VOWEL SIGN KETTI AEDA-PILLA */
     888             :             0x0DD1,             /* SINHALA VOWEL SIGN DIGA AEDA-PILLA */
     889             :             0x0DD8,             /* SINHALA VOWEL SIGN GAETTA-PILLA */
     890             :             0x0DD9,             /* SINHALA VOWEL SIGN KOMBUVA */
     891             :             0x0DDA,             /* SINHALA VOWEL SIGN DIGA KOMBUVA */
     892             :             0x0DDB,             /* SINHALA VOWEL SIGN KOMBU DEKA */
     893             :             0x0DDC,             /* SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA */
     894             :             0x0DDD,             /* SINHALA VOWEL SIGN KOMBUVA HAA DIGA
     895             :                                  * AELA-PILLA */
     896             :             0x0DDE,             /* SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA */
     897             :             0x0DDF,             /* SINHALA VOWEL SIGN GAYANUKITTA */
     898             :             0x0DF2,             /* SINHALA VOWEL SIGN DIGA GAETTA-PILLA */
     899             :             0x0DF3,             /* SINHALA VOWEL SIGN DIGA GAYANUKITTA */
     900             :             0x0F3E,             /* TIBETAN SIGN YAR TSHES */
     901             :             0x0F3F,             /* TIBETAN SIGN MAR TSHES */
     902             :             0x0F7F,             /* TIBETAN SIGN RNAM BCAD */
     903             :             0x102B,             /* MYANMAR VOWEL SIGN TALL AA */
     904             :             0x102C,             /* MYANMAR VOWEL SIGN AA */
     905             :             0x1031,             /* MYANMAR VOWEL SIGN E */
     906             :             0x1038,             /* MYANMAR SIGN VISARGA */
     907             :             0x103B,             /* MYANMAR CONSONANT SIGN MEDIAL YA */
     908             :             0x103C,             /* MYANMAR CONSONANT SIGN MEDIAL RA */
     909             :             0x1056,             /* MYANMAR VOWEL SIGN VOCALIC R */
     910             :             0x1057,             /* MYANMAR VOWEL SIGN VOCALIC RR */
     911             :             0x1062,             /* MYANMAR VOWEL SIGN SGAW KAREN EU */
     912             :             0x1063,             /* MYANMAR TONE MARK SGAW KAREN HATHI */
     913             :             0x1064,             /* MYANMAR TONE MARK SGAW KAREN KE PHO */
     914             :             0x1067,             /* MYANMAR VOWEL SIGN WESTERN PWO KAREN EU */
     915             :             0x1068,             /* MYANMAR VOWEL SIGN WESTERN PWO KAREN UE */
     916             :             0x1069,             /* MYANMAR SIGN WESTERN PWO KAREN TONE-1 */
     917             :             0x106A,             /* MYANMAR SIGN WESTERN PWO KAREN TONE-2 */
     918             :             0x106B,             /* MYANMAR SIGN WESTERN PWO KAREN TONE-3 */
     919             :             0x106C,             /* MYANMAR SIGN WESTERN PWO KAREN TONE-4 */
     920             :             0x106D,             /* MYANMAR SIGN WESTERN PWO KAREN TONE-5 */
     921             :             0x1083,             /* MYANMAR VOWEL SIGN SHAN AA */
     922             :             0x1084,             /* MYANMAR VOWEL SIGN SHAN E */
     923             :             0x1087,             /* MYANMAR SIGN SHAN TONE-2 */
     924             :             0x1088,             /* MYANMAR SIGN SHAN TONE-3 */
     925             :             0x1089,             /* MYANMAR SIGN SHAN TONE-5 */
     926             :             0x108A,             /* MYANMAR SIGN SHAN TONE-6 */
     927             :             0x108B,             /* MYANMAR SIGN SHAN COUNCIL TONE-2 */
     928             :             0x108C,             /* MYANMAR SIGN SHAN COUNCIL TONE-3 */
     929             :             0x108F,             /* MYANMAR SIGN RUMAI PALAUNG TONE-5 */
     930             :             0x17B6,             /* KHMER VOWEL SIGN AA */
     931             :             0x17BE,             /* KHMER VOWEL SIGN OE */
     932             :             0x17BF,             /* KHMER VOWEL SIGN YA */
     933             :             0x17C0,             /* KHMER VOWEL SIGN IE */
     934             :             0x17C1,             /* KHMER VOWEL SIGN E */
     935             :             0x17C2,             /* KHMER VOWEL SIGN AE */
     936             :             0x17C3,             /* KHMER VOWEL SIGN AI */
     937             :             0x17C4,             /* KHMER VOWEL SIGN OO */
     938             :             0x17C5,             /* KHMER VOWEL SIGN AU */
     939             :             0x17C7,             /* KHMER SIGN REAHMUK */
     940             :             0x17C8,             /* KHMER SIGN YUUKALEAPINTU */
     941             :             0x1923,             /* LIMBU VOWEL SIGN EE */
     942             :             0x1924,             /* LIMBU VOWEL SIGN AI */
     943             :             0x1925,             /* LIMBU VOWEL SIGN OO */
     944             :             0x1926,             /* LIMBU VOWEL SIGN AU */
     945             :             0x1929,             /* LIMBU SUBJOINED LETTER YA */
     946             :             0x192A,             /* LIMBU SUBJOINED LETTER RA */
     947             :             0x192B,             /* LIMBU SUBJOINED LETTER WA */
     948             :             0x1930,             /* LIMBU SMALL LETTER KA */
     949             :             0x1931,             /* LIMBU SMALL LETTER NGA */
     950             :             0x1933,             /* LIMBU SMALL LETTER TA */
     951             :             0x1934,             /* LIMBU SMALL LETTER NA */
     952             :             0x1935,             /* LIMBU SMALL LETTER PA */
     953             :             0x1936,             /* LIMBU SMALL LETTER MA */
     954             :             0x1937,             /* LIMBU SMALL LETTER RA */
     955             :             0x1938,             /* LIMBU SMALL LETTER LA */
     956             :             0x19B0,             /* NEW TAI LUE VOWEL SIGN VOWEL SHORTENER */
     957             :             0x19B1,             /* NEW TAI LUE VOWEL SIGN AA */
     958             :             0x19B2,             /* NEW TAI LUE VOWEL SIGN II */
     959             :             0x19B3,             /* NEW TAI LUE VOWEL SIGN U */
     960             :             0x19B4,             /* NEW TAI LUE VOWEL SIGN UU */
     961             :             0x19B5,             /* NEW TAI LUE VOWEL SIGN E */
     962             :             0x19B6,             /* NEW TAI LUE VOWEL SIGN AE */
     963             :             0x19B7,             /* NEW TAI LUE VOWEL SIGN O */
     964             :             0x19B8,             /* NEW TAI LUE VOWEL SIGN OA */
     965             :             0x19B9,             /* NEW TAI LUE VOWEL SIGN UE */
     966             :             0x19BA,             /* NEW TAI LUE VOWEL SIGN AY */
     967             :             0x19BB,             /* NEW TAI LUE VOWEL SIGN AAY */
     968             :             0x19BC,             /* NEW TAI LUE VOWEL SIGN UY */
     969             :             0x19BD,             /* NEW TAI LUE VOWEL SIGN OY */
     970             :             0x19BE,             /* NEW TAI LUE VOWEL SIGN OAY */
     971             :             0x19BF,             /* NEW TAI LUE VOWEL SIGN UEY */
     972             :             0x19C0,             /* NEW TAI LUE VOWEL SIGN IY */
     973             :             0x19C8,             /* NEW TAI LUE TONE MARK-1 */
     974             :             0x19C9,             /* NEW TAI LUE TONE MARK-2 */
     975             :             0x1A19,             /* BUGINESE VOWEL SIGN E */
     976             :             0x1A1A,             /* BUGINESE VOWEL SIGN O */
     977             :             0x1A1B,             /* BUGINESE VOWEL SIGN AE */
     978             :             0x1B04,             /* BALINESE SIGN BISAH */
     979             :             0x1B35,             /* BALINESE VOWEL SIGN TEDUNG */
     980             :             0x1B3B,             /* BALINESE VOWEL SIGN RA REPA TEDUNG */
     981             :             0x1B3D,             /* BALINESE VOWEL SIGN LA LENGA TEDUNG */
     982             :             0x1B3E,             /* BALINESE VOWEL SIGN TALING */
     983             :             0x1B3F,             /* BALINESE VOWEL SIGN TALING REPA */
     984             :             0x1B40,             /* BALINESE VOWEL SIGN TALING TEDUNG */
     985             :             0x1B41,             /* BALINESE VOWEL SIGN TALING REPA TEDUNG */
     986             :             0x1B43,             /* BALINESE VOWEL SIGN PEPET TEDUNG */
     987             :             0x1B44,             /* BALINESE ADEG ADEG */
     988             :             0x1B82,             /* SUNDANESE SIGN PANGWISAD */
     989             :             0x1BA1,             /* SUNDANESE CONSONANT SIGN PAMINGKAL */
     990             :             0x1BA6,             /* SUNDANESE VOWEL SIGN PANAELAENG */
     991             :             0x1BA7,             /* SUNDANESE VOWEL SIGN PANOLONG */
     992             :             0x1BAA,             /* SUNDANESE SIGN PAMAAEH */
     993             :             0x1C24,             /* LEPCHA SUBJOINED LETTER YA */
     994             :             0x1C25,             /* LEPCHA SUBJOINED LETTER RA */
     995             :             0x1C26,             /* LEPCHA VOWEL SIGN AA */
     996             :             0x1C27,             /* LEPCHA VOWEL SIGN I */
     997             :             0x1C28,             /* LEPCHA VOWEL SIGN O */
     998             :             0x1C29,             /* LEPCHA VOWEL SIGN OO */
     999             :             0x1C2A,             /* LEPCHA VOWEL SIGN U */
    1000             :             0x1C2B,             /* LEPCHA VOWEL SIGN UU */
    1001             :             0x1C34,             /* LEPCHA CONSONANT SIGN NYIN-DO */
    1002             :             0x1C35,             /* LEPCHA CONSONANT SIGN KANG */
    1003             :             0xA823,             /* SYLOTI NAGRI VOWEL SIGN A */
    1004             :             0xA824,             /* SYLOTI NAGRI VOWEL SIGN I */
    1005             :             0xA827,             /* SYLOTI NAGRI VOWEL SIGN OO */
    1006             :             0xA880,             /* SAURASHTRA SIGN ANUSVARA */
    1007             :             0xA881,             /* SAURASHTRA SIGN VISARGA */
    1008             :             0xA8B4,             /* SAURASHTRA CONSONANT SIGN HAARU */
    1009             :             0xA8B5,             /* SAURASHTRA VOWEL SIGN AA */
    1010             :             0xA8B6,             /* SAURASHTRA VOWEL SIGN I */
    1011             :             0xA8B7,             /* SAURASHTRA VOWEL SIGN II */
    1012             :             0xA8B8,             /* SAURASHTRA VOWEL SIGN U */
    1013             :             0xA8B9,             /* SAURASHTRA VOWEL SIGN UU */
    1014             :             0xA8BA,             /* SAURASHTRA VOWEL SIGN VOCALIC R */
    1015             :             0xA8BB,             /* SAURASHTRA VOWEL SIGN VOCALIC RR */
    1016             :             0xA8BC,             /* SAURASHTRA VOWEL SIGN VOCALIC L */
    1017             :             0xA8BD,             /* SAURASHTRA VOWEL SIGN VOCALIC LL */
    1018             :             0xA8BE,             /* SAURASHTRA VOWEL SIGN E */
    1019             :             0xA8BF,             /* SAURASHTRA VOWEL SIGN EE */
    1020             :             0xA8C0,             /* SAURASHTRA VOWEL SIGN AI */
    1021             :             0xA8C1,             /* SAURASHTRA VOWEL SIGN O */
    1022             :             0xA8C2,             /* SAURASHTRA VOWEL SIGN OO */
    1023             :             0xA8C3,             /* SAURASHTRA VOWEL SIGN AU */
    1024             :             0xA952,             /* REJANG CONSONANT SIGN H */
    1025             :             0xA953,             /* REJANG VIRAMA */
    1026             :             0xAA2F,             /* CHAM VOWEL SIGN O */
    1027             :             0xAA30,             /* CHAM VOWEL SIGN AI */
    1028             :             0xAA33,             /* CHAM CONSONANT SIGN YA */
    1029             :             0xAA34,             /* CHAM CONSONANT SIGN RA */
    1030             :             0xAA4D              /* CHAM CONSONANT SIGN FINAL H */
    1031             :         };
    1032        1027 :         const pg_wchar *StopLow = strange_letter,
    1033        1027 :                    *StopHigh = strange_letter + lengthof(strange_letter),
    1034             :                    *StopMiddle;
    1035             :         pg_wchar    c;
    1036             : 
    1037        1027 :         if (prs->pgwstr)     /* read the current character from whichever wide string is present */
    1038           0 :             c = *(prs->pgwstr + prs->state->poschar);
    1039             :         else
    1040        1027 :             c = (pg_wchar) *(prs->wstr + prs->state->poschar);
    1041             : 
    1042       10270 :         while (StopLow < StopHigh)   /* binary search over the half-open range [StopLow, StopHigh) */
    1043             :         {
    1044        8216 :             StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
    1045        8216 :             if (*StopMiddle == c)
    1046           0 :                 return 1;
    1047        8216 :             else if (*StopMiddle < c)
    1048           0 :                 StopLow = StopMiddle + 1;
    1049             :             else
    1050        8216 :                 StopHigh = StopMiddle;
    1051             :         }
    1052             :     }
    1053             : #endif
    1054             : 
    1055        1027 :     return 0;    /* neither zero-width nor present in the table */
    1056             : }
    1057             : 
    1058             : /*
    1059             :  * State/action tables of the parser
    1060             :  */
    1061             : /* Actions for TPS_Base, the state that the A_BINGO entries of the tables below return to. */
    1062             : static const TParserStateActionItem actionTPS_Base[] = {
    1063             :     {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},     /* NOTE(review): item layout looks like {test, char arg, action flags, next state, token type, special} -- confirm against TParserStateActionItem */
    1064             :     {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
    1065             :     {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},   /* while prs->ignore is set, everything except EOF and '<' goes to TPS_InSpace */
    1066             :     {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
    1067             :     {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
    1068             :     {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
    1069             :     {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
    1070             :     {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
    1071             :     {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
    1072             :     {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
    1073             :     {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
    1074             :     {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
    1075             :     {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}      /* NULL test: presumably the catch-all for anything not matched above */
    1076             : };
    1077             : 
    1078             : /* Actions for TPS_InNumWord; the EOF entry and the final NULL-test entry are A_BINGO entries of type NUMWORD. */
    1079             : static const TParserStateActionItem actionTPS_InNumWord[] = {
    1080             :     {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
    1081             :     {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
    1082             :     {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},    /* zero-width and combining signs keep the word going */
    1083             :     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
    1084             :     {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
    1085             :     {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
    1086             :     {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
    1087             :     {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
    1088             : };
    1089             : /* Actions for TPS_InAsciiWord; the EOF entry and the final NULL-test entry are A_BINGO entries of type ASCIIWORD. */
    1090             : static const TParserStateActionItem actionTPS_InAsciiWord[] = {
    1091             :     {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
    1092             :     {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},      /* NOTE(review): TPS_Null with A_NEXT presumably means "stay in this state" -- confirm */
    1093             :     {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
    1094             :     {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},     /* NOTE(review): repeated test; looks like an alternative tried if the pushed attempt above fails -- confirm */
    1095             :     {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
    1096             :     {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
    1097             :     {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
    1098             :     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
    1099             :     {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
    1100             :     {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
    1101             :     {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
    1102             :     {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
    1103             :     {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},     /* a letter that failed the p_isasclet test above moves to TPS_InWord */
    1104             :     {p_isspecial, 0, A_NEXT, TPS_InWord, 0, NULL},
    1105             :     {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
    1106             : };
    1107             : /* Actions for TPS_InWord; the EOF entry and the final NULL-test entry are A_BINGO entries of type WORD_T. */
    1108             : static const TParserStateActionItem actionTPS_InWord[] = {
    1109             :     {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
    1110             :     {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
    1111             :     {p_isspecial, 0, A_NEXT, TPS_Null, 0, NULL},     /* zero-width and combining signs do not end the word */
    1112             :     {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},  /* a digit moves on to TPS_InNumWord */
    1113             :     {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
    1114             :     {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
    1115             : };
    1116             : /* Actions for TPS_InUnsignedInt; the EOF entry and the final NULL-test entry are A_BINGO entries of type UNSIGNEDINT. */
    1117             : static const TParserStateActionItem actionTPS_InUnsignedInt[] = {
    1118             :     {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
    1119             :     {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
    1120             :     {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
    1121             :     {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},    /* NOTE(review): repeated '.' test; looks like an alternative tried if the pushed attempt above fails -- confirm */
    1122             :     {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
    1123             :     {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
    1124             :     {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
    1125             :     {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
    1126             :     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
    1127             :     {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
    1128             :     {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},      /* a letter after digits moves to TPS_InNumWord */
    1129             :     {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
    1130             :     {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
    1131             :     {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
    1132             : };
    1133             : 
    1134             : static const TParserStateActionItem actionTPS_InSignedIntFirst[] = {  /* Action entries for TPS_InSignedIntFirst; only p_isdigit leads on (to TPS_InSignedInt), EOF and the fallback are A_POP */
    1135             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1136             :     {p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
    1137             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1138             : };
    1139             : 
    1140             : static const TParserStateActionItem actionTPS_InSignedInt[] = {  /* Action entries for TPS_InSignedInt; EOF and the NULL-test fallback both use A_BINGO with SIGNEDINT */
    1141             :     {p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
    1142             :     {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
    1143             :     {p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
    1144             :     {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
    1145             :     {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
    1146             :     {NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
    1147             : };
    1148             : 
    1149             : static const TParserStateActionItem actionTPS_InSpace[] = {  /* Action entries for TPS_InSpace; every A_BINGO entry in this table carries SPACE and targets TPS_Base */
    1150             :     {p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
    1151             :     {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
    1152             :     {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
    1153             :     {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},  /* '-', '+', '&' and '/' are listed ahead of the p_isnotalnum entry below */
    1154             :     {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
    1155             :     {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
    1156             :     {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
    1157             :     {p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
    1158             :     {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
    1159             : };
    1160             : 
    1161             : static const TParserStateActionItem actionTPS_InUDecimalFirst[] = {  /* Action entries for TPS_InUDecimalFirst; p_isdigit uses A_CLEAR with target TPS_InUDecimal, EOF and the fallback are A_POP */
    1162             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1163             :     {p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
    1164             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1165             : };
    1166             : 
    1167             : static const TParserStateActionItem actionTPS_InUDecimal[] = {  /* Action entries for TPS_InUDecimal; EOF and the NULL-test fallback both use A_BINGO with DECIMAL_T */
    1168             :     {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
    1169             :     {p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
    1170             :     {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},  /* a further '.' is pushed to the version-number states */
    1171             :     {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
    1172             :     {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
    1173             :     {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
    1174             : };
    1175             : 
    1176             : static const TParserStateActionItem actionTPS_InDecimalFirst[] = {  /* Action entries for TPS_InDecimalFirst; p_isdigit uses A_CLEAR with target TPS_InDecimal, EOF and the fallback are A_POP */
    1177             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1178             :     {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
    1179             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1180             : };
    1181             : 
    1182             : static const TParserStateActionItem actionTPS_InDecimal[] = {  /* Action entries for TPS_InDecimal; EOF and the NULL-test fallback both use A_BINGO with DECIMAL_T */
    1183             :     {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
    1184             :     {p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
    1185             :     {p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},  /* unlike TPS_InUDecimal, '.' here is pushed to TPS_InVerVersion */
    1186             :     {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
    1187             :     {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
    1188             :     {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
    1189             : };
    1190             : 
    1191             : static const TParserStateActionItem actionTPS_InVerVersion[] = {  /* Action entries for TPS_InVerVersion; the p_isdigit entry is A_RERUN to TPS_InSVerVersion and names the SpecialVerVersion hook */
    1192             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1193             :     {p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
    1194             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1195             : };
    1196             : 
    1197             : static const TParserStateActionItem actionTPS_InSVerVersion[] = {  /* Action entries for TPS_InSVerVersion */
    1198             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1199             :     {p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},  /* note: category is SPACE and the target is TPS_InUnsignedInt rather than TPS_Base */
    1200             :     {NULL, 0, A_NEXT, TPS_Null, 0, NULL}  /* fallback is A_NEXT here, not A_POP */
    1201             : };
    1202             : 
    1203             : 
    1204             : static const TParserStateActionItem actionTPS_InVersionFirst[] = {  /* Action entries for TPS_InVersionFirst; p_isdigit uses A_CLEAR with target TPS_InVersion, EOF and the fallback are A_POP */
    1205             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1206             :     {p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
    1207             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1208             : };
    1209             : 
    1210             : static const TParserStateActionItem actionTPS_InVersion[] = {  /* Action entries for TPS_InVersion; EOF and the NULL-test fallback both use A_BINGO with VERSIONNUMBER */
    1211             :     {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
    1212             :     {p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
    1213             :     {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},  /* each further '.' goes back through TPS_InVersionFirst */
    1214             :     {NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
    1215             : };
    1216             : 
    1217             : static const TParserStateActionItem actionTPS_InMantissaFirst[] = {  /* Action entries for TPS_InMantissaFirst (entered via A_PUSH on 'e'/'E' from the numeric tables in this file) */
    1218             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1219             :     {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
    1220             :     {p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},  /* an explicit sign goes through TPS_InMantissaSign */
    1221             :     {p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
    1222             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1223             : };
    1224             : 
    1225             : static const TParserStateActionItem actionTPS_InMantissaSign[] = {  /* Action entries for TPS_InMantissaSign; p_isdigit uses A_CLEAR with target TPS_InMantissa, EOF and the fallback are A_POP */
    1226             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1227             :     {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
    1228             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1229             : };
    1230             : 
    1231             : static const TParserStateActionItem actionTPS_InMantissa[] = {  /* Action entries for TPS_InMantissa; EOF and the NULL-test fallback both use A_BINGO with SCIENTIFIC */
    1232             :     {p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
    1233             :     {p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
    1234             :     {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
    1235             : };
    1236             : 
    1237             : static const TParserStateActionItem actionTPS_InXMLEntityFirst[] = {  /* Action entries for TPS_InXMLEntityFirst; '#' selects the numeric-entity states, the other accepted tests go to TPS_InXMLEntity */
    1238             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1239             :     {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
    1240             :     {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
    1241             :     {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
    1242             :     {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
    1243             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1244             : };
    1245             : 
    1246             : static const TParserStateActionItem actionTPS_InXMLEntity[] = {  /* Action entries for TPS_InXMLEntity; ';' leads to TPS_InXMLEntityEnd, EOF and the fallback are A_POP */
    1247             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1248             :     {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
    1249             :     {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
    1250             :     {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
    1251             :     {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
    1252             :     {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
    1253             :     {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
    1254             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1255             : };
    1256             : 
    1257             : static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[] = {  /* Action entries for TPS_InXMLEntityNumFirst; 'x'/'X' select the hex states, p_isdigit selects TPS_InXMLEntityNum */
    1258             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1259             :     {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
    1260             :     {p_iseqC, 'X', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
    1261             :     {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
    1262             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1263             : };
    1264             : 
    1265             : static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[] = {  /* Action entries for TPS_InXMLEntityHexNumFirst; only p_isxdigit leads on (to TPS_InXMLEntityHexNum) */
    1266             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1267             :     {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
    1268             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1269             : };
    1270             : 
    1271             : static const TParserStateActionItem actionTPS_InXMLEntityNum[] = {  /* Action entries for TPS_InXMLEntityNum; ';' leads to TPS_InXMLEntityEnd, EOF and the fallback are A_POP */
    1272             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1273             :     {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
    1274             :     {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
    1275             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1276             : };
    1277             : 
    1278             : static const TParserStateActionItem actionTPS_InXMLEntityHexNum[] = {  /* Action entries for TPS_InXMLEntityHexNum; ';' leads to TPS_InXMLEntityEnd, EOF and the fallback are A_POP */
    1279             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1280             :     {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
    1281             :     {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
    1282             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1283             : };
    1284             : 
    1285             : static const TParserStateActionItem actionTPS_InXMLEntityEnd[] = {  /* Action entries for TPS_InXMLEntityEnd; a single NULL-test entry, so no character test is involved */
    1286             :     {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL}
    1287             : };
    1288             : 
    1289             : static const TParserStateActionItem actionTPS_InTagFirst[] = {  /* Action entries for TPS_InTagFirst; every accepting entry here uses A_PUSH */
    1290             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1291             :     {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
    1292             :     {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
    1293             :     {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
    1294             :     {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
    1295             :     {p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
    1296             :     {p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
    1297             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1298             : };
    1299             : 
    1300             : static const TParserStateActionItem actionTPS_InXMLBegin[] = {  /* Action entries for TPS_InXMLBegin (pushed on '?' from TPS_InTagFirst) */
    1301             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1302             :     /* <?xml ... */
    1303             :     /* XXX do we want states for the m and l?  Right now this accepts <?xZ */
    1304             :     {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
    1305             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1306             : };
    1307             : 
    1308             : static const TParserStateActionItem actionTPS_InTagCloseFirst[] = {  /* Action entries for TPS_InTagCloseFirst (pushed on '/' from TPS_InTagFirst); only p_isasclet leads on to TPS_InTagName */
    1309             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1310             :     {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
    1311             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1312             : };
    1313             : 
    1314             : static const TParserStateActionItem actionTPS_InTagName[] = {  /* Action entries for TPS_InTagName; the '>' and p_isspace entries name the SpecialTags hook */
    1315             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1316             :     /* <br/> case */
    1317             :     {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
    1318             :     {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
    1319             :     {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
    1320             :     {p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},
    1321             :     {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
    1322             :     {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
    1323             :     {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
    1324             :     {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
    1325             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1326             : };
    1327             : 
    1328             : static const TParserStateActionItem actionTPS_InTagBeginEnd[] = {  /* Action entries for TPS_InTagBeginEnd (reached on '/' from TPS_InTagName); only '>' leads on to TPS_InTagEnd */
    1329             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1330             :     {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
    1331             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1332             : };
    1333             : 
    1334             : static const TParserStateActionItem actionTPS_InTag[] = {  /* Action entries for TPS_InTag; anything not listed below falls to the final A_POP entry */
    1335             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1336             :     {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
    1337             :     {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},  /* single quote: TPS_InTagEscapeK */
    1338             :     {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},  /* double quote: TPS_InTagEscapeKK */
    1339             :     {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
    1340             :     {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
    1341             :     {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
    1342             :     {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
    1343             :     {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
    1344             :     {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
    1345             :     {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
    1346             :     {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
    1347             :     {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
    1348             :     {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
    1349             :     {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
    1350             :     {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
    1351             :     {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
    1352             :     {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
    1353             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1354             : };
    1355             : 
    1356             : static const TParserStateActionItem actionTPS_InTagEscapeK[] = {  /* Action entries for TPS_InTagEscapeK (entered on a single quote from TPS_InTag) */
    1357             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1358             :     {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
    1359             :     {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},  /* matching single quote returns to TPS_InTag */
    1360             :     {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}  /* any other character stays in this state */
    1361             : };
    1362             : 
    1363             : static const TParserStateActionItem actionTPS_InTagEscapeKK[] = {  /* Action entries for TPS_InTagEscapeKK (entered on a double quote from TPS_InTag) */
    1364             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1365             :     {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
    1366             :     {p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},  /* matching double quote returns to TPS_InTag */
    1367             :     {NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}  /* any other character stays in this state */
    1368             : };
    1369             : 
    1370             : static const TParserStateActionItem actionTPS_InTagBackSleshed[] = {  /* Action entries for TPS_InTagBackSleshed (pushed on a backslash from the two quoted-value tables); the only table in this section that uses A_MERGE */
    1371             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1372             :     {NULL, 0, A_MERGE, TPS_Null, 0, NULL}
    1373             : };
    1374             : 
    1375             : static const TParserStateActionItem actionTPS_InTagEnd[] = {  /* Action entries for TPS_InTagEnd; a single NULL-test entry using A_BINGO | A_CLRALL with TAG_T */
    1376             :     {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
    1377             : };
    1378             : 
    1379             : static const TParserStateActionItem actionTPS_InCommentFirst[] = {  /* Action entries for TPS_InCommentFirst (pushed on '!' from TPS_InTagFirst) */
    1380             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1381             :     {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
    1382             :     /* <!DOCTYPE ...> */
    1383             :     {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},  /* only the first letter is tested; the rest is handled by the TPS_InTag table */
    1384             :     {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
    1385             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1386             : };
    1387             : 
    1388             : static const TParserStateActionItem actionTPS_InCommentLast[] = {  /* Action entries for TPS_InCommentLast; a second '-' leads to TPS_InComment, anything else is A_POP */
    1389             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1390             :     {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
    1391             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1392             : };
    1393             : 
    1394             : static const TParserStateActionItem actionTPS_InComment[] = {  /* Action entries for TPS_InComment; '-' leads to TPS_InCloseCommentFirst, every other non-EOF character is A_NEXT */
    1395             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1396             :     {p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
    1397             :     {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
    1398             : };
    1399             : 
    1400             : static const TParserStateActionItem actionTPS_InCloseCommentFirst[] = {  /* Action entries for TPS_InCloseCommentFirst; a second '-' leads to TPS_InCloseCommentLast, anything else returns to TPS_InComment */
    1401             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1402             :     {p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
    1403             :     {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
    1404             : };
    1405             : 
    1406             : static const TParserStateActionItem actionTPS_InCloseCommentLast[] = {  /* Action entries for TPS_InCloseCommentLast; '>' leads to TPS_InCommentEnd */
    1407             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1408             :     {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},  /* further '-' characters use the TPS_Null target */
    1409             :     {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
    1410             :     {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}  /* anything else returns to TPS_InComment */
    1411             : };
    1412             : 
    1413             : static const TParserStateActionItem actionTPS_InCommentEnd[] = {  /* Action entries for TPS_InCommentEnd; a single NULL-test entry using A_BINGO | A_CLRALL with TAG_T, the same category TPS_InTagEnd uses */
    1414             :     {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
    1415             : };
    1416             : 
    1417             : static const TParserStateActionItem actionTPS_InHostFirstDomain[] = {  /* Action entries for TPS_InHostFirstDomain (pushed on '.' from several tables in this file) */
    1418             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1419             :     {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
    1420             :     {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
    1421             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1422             : };
    1423             : 
    1424             : static const TParserStateActionItem actionTPS_InHostDomainSecond[] = {  /* Action entries for TPS_InHostDomainSecond; no A_BINGO entry here, only p_isasclet moves on to TPS_InHostDomain via A_NEXT */
    1425             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1426             :     {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
    1427             :     {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
    1428             :     {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
    1429             :     {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
    1430             :     {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
    1431             :     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
    1432             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1433             : };
    1434             : 
    1435             : static const TParserStateActionItem actionTPS_InHostDomain[] = {  /* Action entries for TPS_InHostDomain; every A_BINGO entry here is combined with A_CLRALL and carries HOST */
    1436             :     {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
    1437             :     {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
    1438             :     {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
    1439             :     {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
    1440             :     {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
    1441             :     {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
    1442             :     {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
    1443             :     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
    1444             :     {p_isdigit, 0, A_POP, TPS_Null, 0, NULL},  /* second p_isdigit entry, A_POP; presumably reached only if the A_PUSH attempt above fails -- confirm against the engine */
    1445             :     {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},  /* HOST entry whose target is TPS_InURLPathStart rather than TPS_Base */
    1446             :     {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
    1447             :     {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
    1448             : };
    1449             : 
    1450             : static const TParserStateActionItem actionTPS_InPortFirst[] = {  /* Action entries for TPS_InPortFirst (pushed on ':' from TPS_InHostDomain); only p_isdigit leads on to TPS_InPort */
    1451             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1452             :     {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
    1453             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1454             : };
    1455             : 
    1456             : static const TParserStateActionItem actionTPS_InPort[] = {  /* Action entries for TPS_InPort; the A_BINGO entries carry HOST, mirroring the tail of the TPS_InHostDomain table */
    1457             :     {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
    1458             :     {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
    1459             :     {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
    1460             :     {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
    1461             :     {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
    1462             : };
    1463             : 
    1464             : static const TParserStateActionItem actionTPS_InHostFirstAN[] = {  /* Action entries for TPS_InHostFirstAN (pushed on '-' or '_' from several tables); p_isdigit or p_isasclet leads on to TPS_InHost */
    1465             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1466             :     {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
    1467             :     {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
    1468             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1469             : };
    1470             : 
    1471             : static const TParserStateActionItem actionTPS_InHost[] = {  /* Action entries for TPS_InHost; this table has no A_BINGO entry, EOF and the fallback are A_POP */
    1472             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1473             :     {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
    1474             :     {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
    1475             :     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
    1476             :     {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
    1477             :     {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
    1478             :     {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
    1479             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1480             : };
    1481             : 
    1482             : static const TParserStateActionItem actionTPS_InEmail[] = {  /* Action entries for TPS_InEmail (pushed on '@'); unlike most tables here it has no p_isEOF entry */
    1483             :     {p_isstophost, 0, A_POP, TPS_Null, 0, NULL},  /* checked before p_ishost */
    1484             :     {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
    1485             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1486             : };
    1487             : 
    1488             : static const TParserStateActionItem actionTPS_InFileFirst[] = {  /* Action entries for TPS_InFileFirst (reached on '/' from several tables in this file) */
    1489             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1490             :     {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
    1491             :     {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
    1492             :     {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
    1493             :     {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
    1494             :     {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},  /* '~' is the only A_PUSH entry in this table */
    1495             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1496             : };
    1497             : 
    1498             : static const TParserStateActionItem actionTPS_InFileTwiddle[] = {  /* Action entries for TPS_InFileTwiddle (pushed on '~' from TPS_InFileFirst) */
    1499             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1500             :     {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
    1501             :     {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
    1502             :     {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
    1503             :     {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
    1504             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1505             : };
    1506             : 
    1507             : static const TParserStateActionItem actionTPS_InPathFirst[] = {  /* Action entries for TPS_InPathFirst (reached on '.' from TPS_InFileFirst) */
    1508             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1509             :     {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
    1510             :     {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
    1511             :     {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
    1512             :     {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},  /* a second '.' leads to TPS_InPathSecond */
    1513             :     {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
    1514             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1515             : };
    1516             : 
    1517             : static const TParserStateActionItem actionTPS_InPathFirstFirst[] = {  /* Action entries for TPS_InPathFirstFirst; accepts only '.' (to TPS_InPathSecond) or '/' (to TPS_InFileFirst) */
    1518             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1519             :     {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
    1520             :     {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
    1521             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1522             : };
    1523             : 
    1524             : static const TParserStateActionItem actionTPS_InPathSecond[] = {  /* Action entries for TPS_InPathSecond; the A_BINGO entries use A_CLEAR and carry FILEPATH */
    1525             :     {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
    1526             :     {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
    1527             :     {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},  /* same '/' test as the entry above, this time as A_BINGO */
    1528             :     {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
    1529             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1530             : };
    1531             : 
    1532             : static const TParserStateActionItem actionTPS_InFile[] = {  /* Action entries for TPS_InFile; EOF and the NULL-test fallback both use A_BINGO with FILEPATH */
    1533             :     {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
    1534             :     {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
    1535             :     {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
    1536             :     {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
    1537             :     {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
    1538             :     {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
    1539             :     {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
    1540             :     {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
    1541             : };
    1542             : 
    1543             : static const TParserStateActionItem actionTPS_InFileNext[] = {  /* Action entries for TPS_InFileNext (pushed on '.'); accepting entries use A_CLEAR with target TPS_InFile */
    1544             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1545             :     {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
    1546             :     {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
    1547             :     {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
    1548             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1549             : };
    1550             : 
    1551             : static const TParserStateActionItem actionTPS_InURLPathFirst[] = {  /* Action entries for TPS_InURLPathFirst; only p_isurlchar leads on to TPS_InURLPath */
    1552             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1553             :     {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
    1554             :     {NULL, 0, A_POP, TPS_Null, 0, NULL},
    1555             : };
    1556             : 
    1557             : static const TParserStateActionItem actionTPS_InURLPathStart[] = {  /* Action entries for TPS_InURLPathStart (target of the p_isstophost HOST entries); one NULL-test A_NEXT entry to TPS_InURLPath */
    1558             :     {NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
    1559             : };
    1560             : 
    1561             : static const TParserStateActionItem actionTPS_InURLPath[] = {  /* Action entries for TPS_InURLPath; EOF and the NULL-test fallback both use A_BINGO with URLPATH */
    1562             :     {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
    1563             :     {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
    1564             :     {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
    1565             : };
    1566             : 
    1567             : static const TParserStateActionItem actionTPS_InFURL[] = {  /* Action entries for TPS_InFURL (pushed on '/' from TPS_InHostDomain and TPS_InPort) */
    1568             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1569             :     {p_isURLPath, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL},  /* the only URL_T entry in this section; names the SpecialFURL hook */
    1570             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1571             : };
    1572             : 
    1573             : static const TParserStateActionItem actionTPS_InProtocolFirst[] = {  /* Action entries for TPS_InProtocolFirst (pushed on ':' from TPS_InAsciiWord); requires '/' to continue */
    1574             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1575             :     {p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
    1576             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1577             : };
    1578             : 
    1579             : static const TParserStateActionItem actionTPS_InProtocolSecond[] = {  /* Action entries for TPS_InProtocolSecond; requires a second '/' to reach TPS_InProtocolEnd */
    1580             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1581             :     {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
    1582             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1583             : };
    1584             : 
    1585             : static const TParserStateActionItem actionTPS_InProtocolEnd[] = {  /* Action entries for TPS_InProtocolEnd; a single NULL-test entry using A_BINGO | A_CLRALL with PROTOCOL */
    1586             :     {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
    1587             : };
    1588             : 
    1589             : static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {  /* Action entries for TPS_InHyphenAsciiWordFirst (pushed on '-'); the character class that follows selects the next hyphen state */
    1590             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1591             :     {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
    1592             :     {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
    1593             :     {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
    1594             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1595             : };
    1596             : 
    1597             : static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {  /* Action entries for TPS_InHyphenAsciiWord; the A_BINGO entries carry ASCIIHWORD, target TPS_InParseHyphen and name SpecialHyphen */
    1598             :     {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
    1599             :     {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
    1600             :     {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
    1601             :     {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
    1602             :     {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
    1603             :     {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
    1604             :     {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
    1605             : };
    1606             : 
    1607             : static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = {  /* Action entries for TPS_InHyphenWordFirst (pushed on '-' from TPS_InWord and TPS_InHyphenWord) */
    1608             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1609             :     {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
    1610             :     {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
    1611             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1612             : };
    1613             : 
    1614             : static const TParserStateActionItem actionTPS_InHyphenWord[] = {  /* Action entries for TPS_InHyphenWord; the A_BINGO entries carry HWORD, target TPS_InParseHyphen and name SpecialHyphen */
    1615             :     {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
    1616             :     {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
    1617             :     {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
    1618             :     {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
    1619             :     {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
    1620             :     {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
    1621             : };
    1622             : 
    1623             : static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {  /* Action entries for TPS_InHyphenNumWordFirst (pushed on '-' from TPS_InHyphenNumWord) */
    1624             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1625             :     {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
    1626             :     {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
    1627             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1628             : };
    1629             : 
    1630             : static const TParserStateActionItem actionTPS_InHyphenNumWord[] = {
    1631             :     {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
    1632             :     {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
    1633             :     {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
    1634             :     {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
    1635             :     {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
    1636             : };
    1637             : 
    1638             : static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = {
    1639             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1640             :     {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
    1641             :     {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
    1642             :     {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
    1643             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1644             : };
    1645             : 
    1646             : static const TParserStateActionItem actionTPS_InParseHyphen[] = {
    1647             :     {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
    1648             :     {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
    1649             :     {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
    1650             :     {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
    1651             :     {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
    1652             :     {NULL, 0, A_RERUN, TPS_Base, 0, NULL}
    1653             : };
    1654             : 
    1655             : static const TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
    1656             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1657             :     {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
    1658             :     {p_isspecial, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
    1659             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1660             : };
    1661             : 
    1662             : static const TParserStateActionItem actionTPS_InHyphenWordPart[] = {
    1663             :     {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
    1664             :     {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
    1665             :     {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
    1666             :     {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
    1667             :     {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
    1668             : };
    1669             : 
    1670             : static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
    1671             :     {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
    1672             :     {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
    1673             :     {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
    1674             :     {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
    1675             :     {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
    1676             :     {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
    1677             : };
    1678             : 
    1679             : static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
    1680             :     {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
    1681             :     {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
    1682             :     {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
    1683             :     {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
    1684             : };
    1685             : 
    1686             : static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
    1687             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1688             :     {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
    1689             :     {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
    1690             :     {p_isspecial, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
    1691             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1692             : };
    1693             : 
    1694             : 
    1695             : /*
    1696             :  * main table of per-state parser actions
    1697             :  */
    1698             : typedef struct
    1699             : {
    1700             :     const TParserStateActionItem *action;   /* the actual state info */
    1701             :     TParserState state;         /* only for Assert crosscheck */
    1702             : #ifdef WPARSER_TRACE
    1703             :     const char *state_name;     /* only for debug printout */
    1704             : #endif
    1705             : } TParserStateAction;
    1706             : 
    1707             : #ifdef WPARSER_TRACE
    1708             : #define TPARSERSTATEACTION(state) \
    1709             :     { CppConcat(action,state), state, CppAsString(state) }
    1710             : #else
    1711             : #define TPARSERSTATEACTION(state) \
    1712             :     { CppConcat(action,state), state }
    1713             : #endif
    1714             : 
    1715             : /*
    1716             :  * order must be the same as in typedef enum {} TParserState!!
    1717             :  */
    1718             : 
    1719             : static const TParserStateAction Actions[] = {
    1720             :     TPARSERSTATEACTION(TPS_Base),
    1721             :     TPARSERSTATEACTION(TPS_InNumWord),
    1722             :     TPARSERSTATEACTION(TPS_InAsciiWord),
    1723             :     TPARSERSTATEACTION(TPS_InWord),
    1724             :     TPARSERSTATEACTION(TPS_InUnsignedInt),
    1725             :     TPARSERSTATEACTION(TPS_InSignedIntFirst),
    1726             :     TPARSERSTATEACTION(TPS_InSignedInt),
    1727             :     TPARSERSTATEACTION(TPS_InSpace),
    1728             :     TPARSERSTATEACTION(TPS_InUDecimalFirst),
    1729             :     TPARSERSTATEACTION(TPS_InUDecimal),
    1730             :     TPARSERSTATEACTION(TPS_InDecimalFirst),
    1731             :     TPARSERSTATEACTION(TPS_InDecimal),
    1732             :     TPARSERSTATEACTION(TPS_InVerVersion),
    1733             :     TPARSERSTATEACTION(TPS_InSVerVersion),
    1734             :     TPARSERSTATEACTION(TPS_InVersionFirst),
    1735             :     TPARSERSTATEACTION(TPS_InVersion),
    1736             :     TPARSERSTATEACTION(TPS_InMantissaFirst),
    1737             :     TPARSERSTATEACTION(TPS_InMantissaSign),
    1738             :     TPARSERSTATEACTION(TPS_InMantissa),
    1739             :     TPARSERSTATEACTION(TPS_InXMLEntityFirst),
    1740             :     TPARSERSTATEACTION(TPS_InXMLEntity),
    1741             :     TPARSERSTATEACTION(TPS_InXMLEntityNumFirst),
    1742             :     TPARSERSTATEACTION(TPS_InXMLEntityNum),
    1743             :     TPARSERSTATEACTION(TPS_InXMLEntityHexNumFirst),
    1744             :     TPARSERSTATEACTION(TPS_InXMLEntityHexNum),
    1745             :     TPARSERSTATEACTION(TPS_InXMLEntityEnd),
    1746             :     TPARSERSTATEACTION(TPS_InTagFirst),
    1747             :     TPARSERSTATEACTION(TPS_InXMLBegin),
    1748             :     TPARSERSTATEACTION(TPS_InTagCloseFirst),
    1749             :     TPARSERSTATEACTION(TPS_InTagName),
    1750             :     TPARSERSTATEACTION(TPS_InTagBeginEnd),
    1751             :     TPARSERSTATEACTION(TPS_InTag),
    1752             :     TPARSERSTATEACTION(TPS_InTagEscapeK),
    1753             :     TPARSERSTATEACTION(TPS_InTagEscapeKK),
    1754             :     TPARSERSTATEACTION(TPS_InTagBackSleshed),
    1755             :     TPARSERSTATEACTION(TPS_InTagEnd),
    1756             :     TPARSERSTATEACTION(TPS_InCommentFirst),
    1757             :     TPARSERSTATEACTION(TPS_InCommentLast),
    1758             :     TPARSERSTATEACTION(TPS_InComment),
    1759             :     TPARSERSTATEACTION(TPS_InCloseCommentFirst),
    1760             :     TPARSERSTATEACTION(TPS_InCloseCommentLast),
    1761             :     TPARSERSTATEACTION(TPS_InCommentEnd),
    1762             :     TPARSERSTATEACTION(TPS_InHostFirstDomain),
    1763             :     TPARSERSTATEACTION(TPS_InHostDomainSecond),
    1764             :     TPARSERSTATEACTION(TPS_InHostDomain),
    1765             :     TPARSERSTATEACTION(TPS_InPortFirst),
    1766             :     TPARSERSTATEACTION(TPS_InPort),
    1767             :     TPARSERSTATEACTION(TPS_InHostFirstAN),
    1768             :     TPARSERSTATEACTION(TPS_InHost),
    1769             :     TPARSERSTATEACTION(TPS_InEmail),
    1770             :     TPARSERSTATEACTION(TPS_InFileFirst),
    1771             :     TPARSERSTATEACTION(TPS_InFileTwiddle),
    1772             :     TPARSERSTATEACTION(TPS_InPathFirst),
    1773             :     TPARSERSTATEACTION(TPS_InPathFirstFirst),
    1774             :     TPARSERSTATEACTION(TPS_InPathSecond),
    1775             :     TPARSERSTATEACTION(TPS_InFile),
    1776             :     TPARSERSTATEACTION(TPS_InFileNext),
    1777             :     TPARSERSTATEACTION(TPS_InURLPathFirst),
    1778             :     TPARSERSTATEACTION(TPS_InURLPathStart),
    1779             :     TPARSERSTATEACTION(TPS_InURLPath),
    1780             :     TPARSERSTATEACTION(TPS_InFURL),
    1781             :     TPARSERSTATEACTION(TPS_InProtocolFirst),
    1782             :     TPARSERSTATEACTION(TPS_InProtocolSecond),
    1783             :     TPARSERSTATEACTION(TPS_InProtocolEnd),
    1784             :     TPARSERSTATEACTION(TPS_InHyphenAsciiWordFirst),
    1785             :     TPARSERSTATEACTION(TPS_InHyphenAsciiWord),
    1786             :     TPARSERSTATEACTION(TPS_InHyphenWordFirst),
    1787             :     TPARSERSTATEACTION(TPS_InHyphenWord),
    1788             :     TPARSERSTATEACTION(TPS_InHyphenNumWordFirst),
    1789             :     TPARSERSTATEACTION(TPS_InHyphenNumWord),
    1790             :     TPARSERSTATEACTION(TPS_InHyphenDigitLookahead),
    1791             :     TPARSERSTATEACTION(TPS_InParseHyphen),
    1792             :     TPARSERSTATEACTION(TPS_InParseHyphenHyphen),
    1793             :     TPARSERSTATEACTION(TPS_InHyphenWordPart),
    1794             :     TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart),
    1795             :     TPARSERSTATEACTION(TPS_InHyphenNumWordPart),
    1796             :     TPARSERSTATEACTION(TPS_InHyphenUnsignedInt)
    1797             : };
    1798             : 
    1799             : 
    1800             : static bool
    1801        3278 : TParserGet(TParser *prs)
    1802             : {
    1803        3278 :     const TParserStateActionItem *item = NULL;
    1804             : 
    1805        3278 :     Assert(prs->state);
    1806             : 
    1807        3278 :     if (prs->state->posbyte >= prs->lenstr)
    1808         454 :         return false;
    1809             : 
    1810        2824 :     prs->token = prs->str + prs->state->posbyte;
    1811        2824 :     prs->state->pushedAtAction = NULL;
    1812             : 
    1813             :     /* look at string */
    1814       15434 :     while (prs->state->posbyte <= prs->lenstr)
    1815             :     {
    1816       12610 :         if (prs->state->posbyte == prs->lenstr)
    1817         462 :             prs->state->charlen = 0;
    1818             :         else
    1819       24296 :             prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
    1820       12148 :                 pg_mblen(prs->str + prs->state->posbyte);
    1821             : 
    1822       12610 :         Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
    1823       12610 :         Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
    1824       12610 :         Assert(Actions[prs->state->state].state == prs->state->state);
    1825             : 
    1826       12610 :         if (prs->state->pushedAtAction)
    1827             :         {
    1828             :             /* After a POP, pick up at the next test */
    1829         310 :             item = prs->state->pushedAtAction + 1;
    1830         310 :             prs->state->pushedAtAction = NULL;
    1831             :         }
    1832             :         else
    1833             :         {
    1834       12300 :             item = Actions[prs->state->state].action;
    1835       12300 :             Assert(item != NULL);
    1836             :         }
    1837             : 
    1838             :         /* find action by character class */
    1839       80094 :         while (item->isclass)
    1840             :         {
    1841       63685 :             prs->c = item->c;
    1842       63685 :             if (item->isclass(prs) != 0)
    1843        8811 :                 break;
    1844       54874 :             item++;
    1845             :         }
    1846             : 
    1847             : #ifdef WPARSER_TRACE
    1848             :         {
    1849             :             TParserPosition *ptr;
    1850             : 
    1851             :             fprintf(stderr, "state ");
    1852             :             /* indent according to stack depth */
    1853             :             for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
    1854             :                 fprintf(stderr, "  ");
    1855             :             fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
    1856             :             if (prs->state->posbyte < prs->lenstr)
    1857             :                 fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
    1858             :             else
    1859             :                 fprintf(stderr, "at EOF");
    1860             :             fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
    1861             :                     (int) (item - Actions[prs->state->state].action),
    1862             :                     (item->flags & A_BINGO) ? " BINGO" : "",
    1863             :                     (item->flags & A_POP) ? " POP" : "",
    1864             :                     (item->flags & A_PUSH) ? " PUSH" : "",
    1865             :                     (item->flags & A_RERUN) ? " RERUN" : "",
    1866             :                     (item->flags & A_CLEAR) ? " CLEAR" : "",
    1867             :                     (item->flags & A_MERGE) ? " MERGE" : "",
    1868             :                     (item->flags & A_CLRALL) ? " CLRALL" : "",
    1869             :                     (item->tostate != TPS_Null) ? " tostate " : "",
    1870             :                     (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
    1871             :                     (item->type > 0) ? " type " : "",
    1872             :                     tok_alias[item->type]);
    1873             :         }
    1874             : #endif
    1875             : 
    1876             :         /* call special handler if exists */
    1877       12610 :         if (item->special)
    1878          68 :             item->special(prs);
    1879             : 
    1880             :         /* BINGO, token is found */
    1881       12610 :         if (item->flags & A_BINGO)
    1882             :         {
    1883        2824 :             Assert(item->type > 0);
    1884        2824 :             prs->lenbytetoken = prs->state->lenbytetoken;
    1885        2824 :             prs->lenchartoken = prs->state->lenchartoken;
    1886        2824 :             prs->state->lenbytetoken = prs->state->lenchartoken = 0;
    1887        2824 :             prs->type = item->type;
    1888             :         }
    1889             : 
    1890             :         /* do various actions by flags */
    1891       12610 :         if (item->flags & A_POP)
    1892             :         {                       /* pop stored state in stack */
    1893         313 :             TParserPosition *ptr = prs->state->prev;
    1894             : 
    1895         313 :             pfree(prs->state);
    1896         313 :             prs->state = ptr;
    1897         313 :             Assert(prs->state);
    1898             :         }
    1899       12297 :         else if (item->flags & A_PUSH)
    1900             :         {                       /* push (store) state in stack */
    1901         716 :             prs->state->pushedAtAction = item;    /* remember where we push */
    1902         716 :             prs->state = newTParserPosition(prs->state);
    1903             :         }
    1904       11581 :         else if (item->flags & A_CLEAR)
    1905             :         {                       /* clear previous pushed state */
    1906             :             TParserPosition *ptr;
    1907             : 
    1908          75 :             Assert(prs->state->prev);
    1909          75 :             ptr = prs->state->prev->prev;
    1910          75 :             pfree(prs->state->prev);
    1911          75 :             prs->state->prev = ptr;
    1912             :         }
    1913       11506 :         else if (item->flags & A_CLRALL)
    1914             :         {                       /* clear all previous pushed state */
    1915             :             TParserPosition *ptr;
    1916             : 
    1917         587 :             while (prs->state->prev)
    1918             :             {
    1919         331 :                 ptr = prs->state->prev->prev;
    1920         331 :                 pfree(prs->state->prev);
    1921         331 :                 prs->state->prev = ptr;
    1922             :             }
    1923             :         }
    1924       11378 :         else if (item->flags & A_MERGE)
    1925             :         {                       /* merge posinfo with current and pushed state */
    1926           0 :             TParserPosition *ptr = prs->state;
    1927             : 
    1928           0 :             Assert(prs->state->prev);
    1929           0 :             prs->state = prs->state->prev;
    1930             : 
    1931           0 :             prs->state->posbyte = ptr->posbyte;
    1932           0 :             prs->state->poschar = ptr->poschar;
    1933           0 :             prs->state->charlen = ptr->charlen;
    1934           0 :             prs->state->lenbytetoken = ptr->lenbytetoken;
    1935           0 :             prs->state->lenchartoken = ptr->lenchartoken;
    1936           0 :             pfree(ptr);
    1937             :         }
    1938             : 
    1939             :         /* set new state if pointed */
    1940       12610 :         if (item->tostate != TPS_Null)
    1941        8236 :             prs->state->state = item->tostate;
    1942             : 
    1943             :         /* check for go away */
    1944       22396 :         if ((item->flags & A_BINGO) ||
    1945        9786 :             (prs->state->posbyte >= prs->lenstr &&
    1946           0 :              (item->flags & A_RERUN) == 0))
    1947             :             break;
    1948             : 
    1949             :         /* go to beginning of loop if we should rerun or we just restore state */
    1950        9786 :         if (item->flags & (A_RERUN | A_POP))
    1951         317 :             continue;
    1952             : 
    1953             :         /* move forward */
    1954        9469 :         if (prs->state->charlen)
    1955             :         {
    1956        9469 :             prs->state->posbyte += prs->state->charlen;
    1957        9469 :             prs->state->lenbytetoken += prs->state->charlen;
    1958        9469 :             prs->state->poschar++;
    1959        9469 :             prs->state->lenchartoken++;
    1960             :         }
    1961             :     }
    1962             : 
    1963        2824 :     return (item && (item->flags & A_BINGO)) ? true : false;
    1964             : }
    1965             : 
    1966             : Datum
    1967          70 : prsd_lextype(PG_FUNCTION_ARGS)
    1968             : {
    1969          70 :     LexDescr   *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
    1970             :     int         i;
    1971             : 
    1972        1680 :     for (i = 1; i <= LASTNUM; i++)
    1973             :     {
    1974        1610 :         descr[i - 1].lexid = i;
    1975        1610 :         descr[i - 1].alias = pstrdup(tok_alias[i]);
    1976        1610 :         descr[i - 1].descr = pstrdup(lex_descr[i]);
    1977             :     }
    1978             : 
    1979          70 :     descr[LASTNUM].lexid = 0;
    1980             : 
    1981          70 :     PG_RETURN_POINTER(descr);
    1982             : }
    1983             : 
    1984             : Datum
    1985         454 : prsd_start(PG_FUNCTION_ARGS)
    1986             : {
    1987         454 :     PG_RETURN_POINTER(TParserInit((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
    1988             : }
    1989             : 
    1990             : Datum
    1991        3238 : prsd_nexttoken(PG_FUNCTION_ARGS)
    1992             : {
    1993        3238 :     TParser    *p = (TParser *) PG_GETARG_POINTER(0);
    1994        3238 :     char      **t = (char **) PG_GETARG_POINTER(1);
    1995        3238 :     int        *tlen = (int *) PG_GETARG_POINTER(2);
    1996             : 
    1997        3238 :     if (!TParserGet(p))
    1998         454 :         PG_RETURN_INT32(0);
    1999             : 
    2000        2784 :     *t = p->token;
    2001        2784 :     *tlen = p->lenbytetoken;
    2002             : 
    2003        2784 :     PG_RETURN_INT32(p->type);
    2004             : }
    2005             : 
    2006             : Datum
    2007         454 : prsd_end(PG_FUNCTION_ARGS)
    2008             : {
    2009         454 :     TParser    *p = (TParser *) PG_GETARG_POINTER(0);
    2010             : 
    2011         454 :     TParserClose(p);
    2012         454 :     PG_RETURN_VOID();
    2013             : }
    2014             : 
    2015             : #define LEAVETOKEN(x)   ( (x)==SPACE )
    2016             : #define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
    2017             : #define ENDPUNCTOKEN(x) ( (x)==SPACE )
    2018             : 
    2019             : #define TS_IDIGNORE(x)  ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )
    2020             : #define HLIDREPLACE(x)  ( (x)==TAG_T )
    2021             : #define HLIDSKIP(x)     ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
    2022             : #define XMLHLIDSKIP(x)  ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
    2023             : #define NONWORDTOKEN(x) ( (x)==SPACE || HLIDREPLACE(x) || HLIDSKIP(x) )
    2024             : #define NOENDTOKEN(x)   ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL_T || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
    2025             : 
    2026             : typedef struct
    2027             : {
    2028             :     HeadlineWordEntry *words;
    2029             :     int         len;
    2030             : } hlCheck;
    2031             : 
    2032             : static bool
    2033          88 : checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
    2034             : {
    2035             :     int         i;
    2036          88 :     hlCheck    *checkval = (hlCheck *) opaque;
    2037             : 
    2038         424 :     for (i = 0; i < checkval->len; i++)
    2039             :     {
    2040         364 :         if (checkval->words[i].item == val)
    2041             :         {
    2042             :             /* don't need to find all positions */
    2043          45 :             if (!data)
    2044          28 :                 return true;
    2045             : 
    2046          17 :             if (!data->pos)
    2047             :             {
    2048          17 :                 data->pos = palloc(sizeof(WordEntryPos) * checkval->len);
    2049          17 :                 data->allocated = true;
    2050          17 :                 data->npos = 1;
    2051          17 :                 data->pos[0] = checkval->words[i].pos;
    2052             :             }
    2053           0 :             else if (data->pos[data->npos - 1] < checkval->words[i].pos)
    2054             :             {
    2055           0 :                 data->pos[data->npos++] = checkval->words[i].pos;
    2056             :             }
    2057             :         }
    2058             :     }
    2059             : 
    2060          60 :     if (data && data->npos > 0)
    2061          17 :         return true;
    2062             : 
    2063          43 :     return false;
    2064             : }
    2065             : 
    2066             : 
    2067             : static bool
    2068         110 : hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
    2069             : {
    2070             :     int         i,
    2071             :                 j;
    2072         110 :     QueryItem  *item = GETQUERY(query);
    2073         110 :     int         pos = *p;
    2074             : 
    2075         110 :     *q = -1;
    2076         110 :     *p = INT_MAX;
    2077             : 
    2078         580 :     for (j = 0; j < query->size; j++)
    2079             :     {
    2080         470 :         if (item->type != QI_VAL)
    2081             :         {
    2082         180 :             item++;
    2083         180 :             continue;
    2084             :         }
    2085        3380 :         for (i = pos; i < prs->curwords; i++)
    2086             :         {
    2087        3170 :             if (prs->words[i].item == &item->qoperand)
    2088             :             {
    2089          80 :                 if (i > *q)
    2090          66 :                     *q = i;
    2091          80 :                 break;
    2092             :             }
    2093             :         }
    2094         290 :         item++;
    2095             :     }
    2096             : 
    2097         110 :     if (*q < 0)
    2098          50 :         return false;
    2099             : 
    2100          60 :     item = GETQUERY(query);
    2101         304 :     for (j = 0; j < query->size; j++)
    2102             :     {
    2103         244 :         if (item->type != QI_VAL)
    2104             :         {
    2105          92 :             item++;
    2106          92 :             continue;
    2107             :         }
    2108         922 :         for (i = *q; i >= pos; i--)
    2109             :         {
    2110         850 :             if (prs->words[i].item == &item->qoperand)
    2111             :             {
    2112          80 :                 if (i < *p)
    2113          74 :                     *p = i;
    2114          80 :                 break;
    2115             :             }
    2116             :         }
    2117         152 :         item++;
    2118             :     }
    2119             : 
    2120          60 :     if (*p <= *q)
    2121             :     {
    2122             :         hlCheck     ch;
    2123             : 
    2124          60 :         ch.words = &(prs->words[*p]);
    2125          60 :         ch.len = *q - *p + 1;
    2126          60 :         if (TS_execute(GETQUERY(query), &ch, TS_EXEC_EMPTY, checkcondition_HL))
    2127          13 :             return true;
    2128             :         else
    2129             :         {
    2130          47 :             (*p)++;
    2131          47 :             return hlCover(prs, query, p, q);
    2132             :         }
    2133             :     }
    2134             : 
    2135           0 :     return false;
    2136             : }
    2137             : 
    2138             : static void
    2139           6 : mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
    2140             : {
    2141             :     int         i;
    2142             : 
    2143         265 :     for (i = startpos; i <= endpos; i++)
    2144             :     {
    2145         259 :         if (prs->words[i].item)
    2146           5 :             prs->words[i].selected = 1;
    2147         259 :         if (highlight == 0)
    2148             :         {
    2149         259 :             if (HLIDREPLACE(prs->words[i].type))
    2150           0 :                 prs->words[i].replace = 1;
    2151         259 :             else if (HLIDSKIP(prs->words[i].type))
    2152           0 :                 prs->words[i].skip = 1;
    2153             :         }
    2154             :         else
    2155             :         {
    2156           0 :             if (XMLHLIDSKIP(prs->words[i].type))
    2157           0 :                 prs->words[i].skip = 1;
    2158             :         }
    2159             : 
    2160         259 :         prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
    2161             :     }
    2162           6 : }
    2163             : 
    2164             : typedef struct
    2165             : {
    2166             :     int32       startpos;
    2167             :     int32       endpos;
    2168             :     int32       poslen;
    2169             :     int32       curlen;
    2170             :     int16       in;
    2171             :     int16       excluded;
    2172             : } CoverPos;
    2173             : /*
                     :  * get_next_fragment
                     :  *      Cut the next fragment out of the word range [*startpos, *endpos].
                     :  *
                     :  * prs        parsed text; prs->words[] is indexed by the positions below
                     :  * startpos   in/out: first index of the range, advanced to a
                     :  *            non-repeated query item
                     :  * endpos     in/out: last index of the range, pulled back if the range
                     :  *            had to be cut
                     :  * curlen     out: reset to 0, then counts tokens for which
                     :  *            NONWORDTOKEN() is false
                     :  * poslen     out: reset to 0, then counts non-repeated query items
                     :  * max_words  the forward scan stops once *curlen reaches this value
                     :  */
    2174             : static void
    2175           5 : get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
    2176             :                   int *curlen, int *poslen, int max_words)
    2177             : {
    2178             :     int         i;
    2179             : 
    2180             :     /*
    2181             :      * Objective: Generate a fragment of words between startpos and endpos
    2182             :      * such that it has at most max_words and both ends have query words. If
    2183             :      * the startpos and endpos are the endpoints of the cover and the cover
    2184             :      * has fewer words than max_words, then this function should just return
    2185             :      * the cover.
    2186             :      */
    2187             :     /*
                     :      * First move startpos to an item.  If the range holds no non-repeated
                     :      * item, startpos ends up equal to endpos.
                     :      */
    2188         147 :     for (i = *startpos; i <= *endpos; i++)
    2189             :     {
    2190         147 :         *startpos = i;
    2191         147 :         if (prs->words[i].item && !prs->words[i].repeated)
    2192           5 :             break;
    2193             :     }
    2194             :     /*
                     :      * Cut endpos to have only max_words.  On exit i is the first index
                     :      * that was not counted.
                     :      */
    2195           5 :     *curlen = 0;
    2196           5 :     *poslen = 0;
    2197         146 :     for (i = *startpos; i <= *endpos && *curlen < max_words; i++)
    2198             :     {
    2199         141 :         if (!NONWORDTOKEN(prs->words[i].type))
    2200          73 :             *curlen += 1;
    2201         141 :         if (prs->words[i].item && !prs->words[i].repeated)
    2202           5 :             *poslen += 1;
    2203             :     }
    2204             :     /*
                     :      * If the cover was cut then move back endpos to a query item.  Word
                     :      * tokens stepped over on the way back are removed from curlen.
                     :      *
                     :      * NOTE(review): the backward walk starts at index i, which the
                     :      * counting loop above stopped before visiting.  A word token there is
                     :      * subtracted from curlen without having been added, and an item there
                     :      * is not reflected in poslen -- confirm this is intended.
                     :      */
    2205           5 :     if (*endpos > i)
    2206             :     {
    2207           2 :         *endpos = i;
    2208         140 :         for (i = *endpos; i >= *startpos; i--)
    2209             :         {
    2210         140 :             *endpos = i;
    2211         140 :             if (prs->words[i].item && !prs->words[i].repeated)
    2212           2 :                 break;
    2213         138 :             if (!NONWORDTOKEN(prs->words[i].type))
    2214          68 :                 *curlen -= 1;
    2215             :         }
    2216             :     }
    2217           5 : }
    2218             : /*
                     :  * mark_hl_fragments
                     :  *      Mark up to max_fragments fragments of the document for the headline.
                     :  *
                     :  * Called by prsd_headline() when max_fragments is nonzero.  Works in three
                     :  * steps:
                     :  *  1. Collect every cover reported by hlCover() and split each one, via
                     :  *     get_next_fragment(), into pieces; each piece is recorded in covers[].
                     :  *  2. Up to max_fragments times, pick the unused piece with the most query
                     :  *     items (ties: fewest words), stretch it towards max_words, mark it
                     :  *     with mark_fragment(), and exclude pieces that overlap it.
                     :  *  3. If nothing was marked, mark the leading words of the document until
                     :  *     min_words of them have been counted.
                     :  *
                     :  * shortword: tokens of this length or less (and NOENDTOKEN tokens) are
                     :  * trimmed from the outer ends of the stretched part of a fragment.
                     :  */
    2219             : static void
    2220           4 : mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
    2221             :                   int shortword, int min_words,
    2222             :                   int max_words, int max_fragments)
    2223             : {
    2224             :     int32       poslen,
    2225             :                 curlen,
    2226             :                 i,
    2227             :                 f,
    2228           4 :                 num_f = 0;
    2229             :     int32       stretch,
    2230             :                 maxstretch,
    2231             :                 posmarker;
    2232             : 
    2233           4 :     int32       startpos = 0,
    2234           4 :                 endpos = 0,
    2235           4 :                 p = 0,
    2236           4 :                 q = 0;
    2237             : 
    2238           4 :     int32       numcovers = 0,
    2239           4 :                 maxcovers = 32;
    2240             : 
    2241             :     int32       minI,
    2242             :                 minwords,
    2243             :                 maxitems;
    2244             :     CoverPos   *covers;
    2245             : 
    2246           4 :     covers = palloc(maxcovers * sizeof(CoverPos));
    2247             : 
    2248             :     /* get all covers */
    2249          11 :     while (hlCover(prs, query, &p, &q))
    2250             :     {
    2251           3 :         startpos = p;
    2252           3 :         endpos = q;
    2253             : 
    2254             :         /*
    2255             :          * Break the cover into smaller fragments such that each fragment has
    2256             :          * at most max_words. Also ensure that each end of the fragment is a
    2257             :          * query word. This will allow us to stretch the fragment in either
    2258             :          * direction.
    2259             :          */
    2260             : 
    2261          11 :         while (startpos <= endpos)
    2262             :         {
    2263           5 :             get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
    2264           5 :             if (numcovers >= maxcovers) /* covers[] is full: double it */
    2265             :             {
    2266           0 :                 maxcovers *= 2;
    2267           0 :                 covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
    2268             :             }
    2269           5 :             covers[numcovers].startpos = startpos;
    2270           5 :             covers[numcovers].endpos = endpos;
    2271           5 :             covers[numcovers].curlen = curlen;
    2272           5 :             covers[numcovers].poslen = poslen;
    2273           5 :             covers[numcovers].in = 0;
    2274           5 :             covers[numcovers].excluded = 0;
    2275           5 :             numcovers++;
    2276           5 :             startpos = endpos + 1; /* continue after this fragment ... */
    2277           5 :             endpos = q; /* ... up to the end of the current cover */
    2278             :         }
    2279             :         /* move p to generate the next cover */
    2280           3 :         p++;
    2281             :     }
    2282             : 
    2283             :     /* choose best covers */
    2284           9 :     for (f = 0; f < max_fragments; f++)
    2285             :     {
    2286           6 :         maxitems = 0;
    2287           6 :         minwords = PG_INT32_MAX;
    2288           6 :         minI = -1;
    2289             : 
    2290             :         /*
    2291             :          * Choose the cover that contains max items. In case of tie choose the
    2292             :          * one with smaller number of words.  Covers already used (in) or
    2293             :          * excluded are not considered.
    2294             :          */
    2295          15 :         for (i = 0; i < numcovers; i++)
    2296             :         {
    2297          16 :             if (!covers[i].in && !covers[i].excluded &&
    2298           9 :                 (maxitems < covers[i].poslen || (maxitems == covers[i].poslen
    2299           2 :                                                  && minwords > covers[i].curlen)))
    2300             :             {
    2301           5 :                 maxitems = covers[i].poslen;
    2302           5 :                 minwords = covers[i].curlen;
    2303           5 :                 minI = i;
    2304             :             }
    2305             :         }
    2306             :         /* if a cover was found mark it */
    2307           6 :         if (minI >= 0)
    2308             :         {
    2309           5 :             covers[minI].in = 1;
    2310             :             /* adjust the size of cover */
    2311           5 :             startpos = covers[minI].startpos;
    2312           5 :             endpos = covers[minI].endpos;
    2313           5 :             curlen = covers[minI].curlen;
    2314             :             /* stretch the cover if cover size is lower than max_words */
    2315           5 :             if (curlen < max_words)
    2316             :             {
    2317             :                 /* divide the stretch on both sides of cover */
    2318           5 :                 maxstretch = (max_words - curlen) / 2;
    2319             : 
    2320             :                 /*
    2321             :                  * First stretch the startpos; stop stretching if 1. we hit
    2322             :                  * the beginning of document, 2. we reach maxstretch, or 3. we
    2323             :                  * hit an already marked fragment.
    2324             :                  */
    2325           5 :                 stretch = 0;
    2326           5 :                 posmarker = startpos;
    2327          99 :                 for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
    2328             :                 {
    2329          94 :                     if (!NONWORDTOKEN(prs->words[i].type))
    2330             :                     {
    2331          45 :                         curlen++;
    2332          45 :                         stretch++;
    2333             :                     }
    2334          94 :                     posmarker = i;
    2335             :                 }
    2336             :                 /* cut back startpos till we find a non short token */
    2337          21 :                 for (i = posmarker; i < startpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i++)
    2338             :                 {
    2339          16 :                     if (!NONWORDTOKEN(prs->words[i].type))
    2340           6 :                         curlen--;
    2341             :                 }
    2342           5 :                 startpos = i;
    2343             :                 /*
                     :                  * Now stretch the endpos as much as possible: whatever
                     :                  * the start side did not use is available here, since the
                     :                  * only limit is curlen < max_words.
                     :                  */
    2344           5 :                 posmarker = endpos;
    2345         159 :                 for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
    2346             :                 {
    2347         154 :                     if (!NONWORDTOKEN(prs->words[i].type))
    2348          77 :                         curlen++;
    2349         154 :                     posmarker = i;
    2350             :                 }
    2351             :                 /* cut back endpos till we find a non-short token */
    2352          13 :                 for (i = posmarker; i > endpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i--)
    2353             :                 {
    2354           8 :                     if (!NONWORDTOKEN(prs->words[i].type))
    2355           4 :                         curlen--;
    2356             :                 }
    2357           5 :                 endpos = i;
    2358             :             }
    2359             :             /* remember the stretched bounds; the overlap test below uses them */
    2360           5 :             covers[minI].startpos = startpos;
    2361           5 :             covers[minI].endpos = endpos;
    2362           5 :             covers[minI].curlen = curlen;
    2363             :             /* Mark the chosen fragments (covers) */
    2364           5 :             mark_fragment(prs, highlight, startpos, endpos);
    2365           5 :             num_f++;
    2366             :             /*
                     :              * Exclude overlapping covers: any other cover whose startpos
                     :              * or endpos lies inside the chosen fragment.
                     :              */
    2367          14 :             for (i = 0; i < numcovers; i++)
    2368             :             {
    2369           9 :                 if (i != minI && ((covers[i].startpos >= covers[minI].startpos && covers[i].startpos <= covers[minI].endpos) || (covers[i].endpos >= covers[minI].startpos && covers[i].endpos <= covers[minI].endpos)))
    2370           0 :                     covers[i].excluded = 1;
    2371             :             }
    2372             :         }
    2373             :         else
    2374           1 :             break;
    2375             :     }
    2376             : 
    2377             :     /* show at least min_words if we have not marked anything */
    2378           4 :     if (num_f <= 0)
    2379             :     {
    2380           1 :         startpos = endpos = curlen = 0;
    2381          31 :         for (i = 0; i < prs->curwords && curlen < min_words; i++)
    2382             :         {
    2383          30 :             if (!NONWORDTOKEN(prs->words[i].type))
    2384          15 :                 curlen++;
    2385          30 :             endpos = i;
    2386             :         }
    2387           1 :         mark_fragment(prs, highlight, startpos, endpos);
    2388             :     }
    2389           4 :     pfree(covers);
    2390           4 : }
    2389             : /*
                     :  * mark_hl_words
                     :  *      Default headline generator: mark one contiguous range of words.
                     :  *
                     :  * Called by prsd_headline() when max_fragments is zero.  With highlight
                     :  * == 0 the covers reported by hlCover() are examined one by one and the
                     :  * best candidate range [bestb, beste] is kept; if no cover is found, the
                     :  * range starts at word 0 and runs until min_words words have been counted.
                     :  * With highlight != 0 the whole document is the range.  Finally every
                     :  * word in the range gets its selected/replace/skip/in flags set.
                     :  *
                     :  * Throughout, an "acceptable end" is a token for which NOENDTOKEN() is
                     :  * false and whose len is greater than shortword.
                     :  */
    2390             : static void
    2391          47 : mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
    2392             :               int shortword, int min_words, int max_words)
    2393             : {
    2394          47 :     int         p = 0,
    2395          47 :                 q = 0;
    2396          47 :     int         bestb = -1,
    2397          47 :                 beste = -1;
    2398          47 :     int         bestlen = -1;
    2399          47 :     int         pose = 0,
    2400             :                 posb,
    2401             :                 poslen,
    2402             :                 curlen;
    2403             : 
    2404             :     int         i;
    2405             : 
    2406          47 :     if (highlight == 0)
    2407             :     {
    2408         102 :         while (hlCover(prs, query, &p, &q))
    2409             :         {
    2410             :             /* find cover len in words, counting at most max_words of them */
    2411          10 :             curlen = 0;
    2412          10 :             poslen = 0;
    2413          72 :             for (i = p; i <= q && curlen < max_words; i++)
    2414             :             {
    2415          62 :                 if (!NONWORDTOKEN(prs->words[i].type))
    2416          36 :                     curlen++;
    2417          62 :                 if (prs->words[i].item && !prs->words[i].repeated)
    2418          21 :                     poslen++;
    2419          62 :                 pose = i;
    2420             :             }
    2421             :             /*
                     :              * Skip a cover with fewer query items than the current best,
                     :              * provided the current best already has an acceptable end.
                     :              * (bestlen is -1 until a best exists, so beste is not used as
                     :              * an index before it has been set.)
                     :              */
    2422          10 :             if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
    2423             :             {
    2424             :                 /* best already found, so try one more cover */
    2425           0 :                 p++;
    2426           0 :                 continue;
    2427             :             }
    2428             : 
    2429          10 :             posb = p;
    2430          10 :             if (curlen < max_words)
    2431             :             {                   /* find good end */
    2432         121 :                 for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
    2433             :                 {
    2434         117 :                     if (i != q) /* q itself was already counted above */
    2435             :                     {
    2436         108 :                         if (!NONWORDTOKEN(prs->words[i].type))
    2437          54 :                             curlen++;
    2438         108 :                         if (prs->words[i].item && !prs->words[i].repeated)
    2439           7 :                             poslen++;
    2440             :                     }
    2441         117 :                     pose = i;
    2442         117 :                     if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
    2443          76 :                         continue;
    2444          41 :                     if (curlen >= min_words)
    2445           5 :                         break;
    2446             :                 }
    2447           9 :                 if (curlen < min_words && i >= prs->curwords)
    2448             :                 {               /* got end of text and our cover is shorter
    2449             :                                  * than min_words, so grow it backwards from
    2450             :                                  * the word before p */
    2451           0 :                     for (i = p - 1; i >= 0; i--)
    2452             :                     {
    2453           0 :                         if (!NONWORDTOKEN(prs->words[i].type))
    2454           0 :                             curlen++;
    2455           0 :                         if (prs->words[i].item && !prs->words[i].repeated)
    2456           0 :                             poslen++;
    2457           0 :                         if (curlen >= max_words)
    2458           0 :                             break;
    2459           0 :                         if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
    2460           0 :                             continue;
    2461           0 :                         if (curlen >= min_words)
    2462           0 :                             break;
    2463             :                     }
    2464           0 :                     posb = (i >= 0) ? i : 0;
    2465             :                 }
    2466             :             }
    2467             :             else
    2468             :             {                   /* cover reached max_words: shorten it from
                     :                                  * the end while curlen > min_words, stopping
                     :                                  * at the first acceptable end */
    2469           1 :                 if (i > q)
    2470           1 :                     i = q;
    2471           4 :                 for (; curlen > min_words; i--)
    2472             :                 {
    2473           1 :                     if (!NONWORDTOKEN(prs->words[i].type))
    2474           1 :                         curlen--;
    2475           1 :                     if (prs->words[i].item && !prs->words[i].repeated)
    2476           1 :                         poslen--;
    2477           1 :                     pose = i;
    2478           1 :                     if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
    2479           1 :                         continue;
    2480           0 :                     break;
    2481             :                 }
    2482             :             }
    2483             :             /*
                     :              * Adopt this candidate if it is the first one, or if it has
                     :              * more query items than the best and an acceptable end, or if
                     :              * it has an acceptable end while the best does not.
                     :              */
    2484          10 :             if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
    2485           2 :                 (bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
    2486           0 :                  (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
    2487             :             {
    2488           8 :                 bestb = posb;
    2489           8 :                 beste = pose;
    2490           8 :                 bestlen = poslen;
    2491             :             }
    2492             : 
    2493          10 :             p++;
    2494             :         }
    2495             :         /* no cover at all: fall back to the start of the document */
    2496          46 :         if (bestlen < 0)
    2497             :         {
    2498          38 :             curlen = 0;
    2499         168 :             for (i = 0; i < prs->curwords && curlen < min_words; i++)
    2500             :             {
    2501         130 :                 if (!NONWORDTOKEN(prs->words[i].type))
    2502          84 :                     curlen++;
    2503         130 :                 pose = i;
    2504             :             }
    2505          38 :             bestb = 0;
    2506          38 :             beste = pose;
    2507             :         }
    2508             :     }
    2509             :     else
    2510             :     {
                     :         /* highlight everything: the range is the whole document */
    2511           1 :         bestb = 0;
    2512           1 :         beste = prs->curwords - 1;
    2513             :     }
    2514             :     /* set the output flags on every word of the chosen range */
    2515         380 :     for (i = bestb; i <= beste; i++)
    2516             :     {
    2517         333 :         if (prs->words[i].item)
    2518          49 :             prs->words[i].selected = 1;
    2519         333 :         if (highlight == 0)
    2520             :         {
    2521         292 :             if (HLIDREPLACE(prs->words[i].type))
    2522           0 :                 prs->words[i].replace = 1;
    2523         292 :             else if (HLIDSKIP(prs->words[i].type))
    2524           0 :                 prs->words[i].skip = 1;
    2525             :         }
    2526             :         else
    2527             :         {
    2528          41 :             if (XMLHLIDSKIP(prs->words[i].type))
    2529           1 :                 prs->words[i].skip = 1;
    2530             :         }
    2531             :         /* repeated words are left out (in = 0); all others are included */
    2532         333 :         prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
    2533             :     }
    2534             : 
    2535          47 : }
    2535             : 
    2536             : Datum
    2537          51 : prsd_headline(PG_FUNCTION_ARGS)
    2538             : {
    2539          51 :     HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
    2540          51 :     List       *prsoptions = (List *) PG_GETARG_POINTER(1);
    2541          51 :     TSQuery     query = PG_GETARG_TSQUERY(2);
    2542             : 
    2543             :     /* from opt + start and end tag */
    2544          51 :     int         min_words = 15;
    2545          51 :     int         max_words = 35;
    2546          51 :     int         shortword = 3;
    2547          51 :     int         max_fragments = 0;
    2548          51 :     int         highlight = 0;
    2549             :     ListCell   *l;
    2550             : 
    2551             :     /* config */
    2552          51 :     prs->startsel = NULL;
    2553          51 :     prs->stopsel = NULL;
    2554         103 :     foreach(l, prsoptions)
    2555             :     {
    2556          52 :         DefElem    *defel = (DefElem *) lfirst(l);
    2557          52 :         char       *val = defGetString(defel);
    2558             : 
    2559          52 :         if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
    2560           3 :             max_words = pg_atoi(val, sizeof(int32), 0);
    2561          49 :         else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
    2562           3 :             min_words = pg_atoi(val, sizeof(int32), 0);
    2563          46 :         else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
    2564           0 :             shortword = pg_atoi(val, sizeof(int32), 0);
    2565          46 :         else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
    2566           4 :             max_fragments = pg_atoi(val, sizeof(int32), 0);
    2567          42 :         else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
    2568          20 :             prs->startsel = pstrdup(val);
    2569          22 :         else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
    2570          20 :             prs->stopsel = pstrdup(val);
    2571           2 :         else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
    2572           1 :             prs->fragdelim = pstrdup(val);
    2573           1 :         else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
    2574           3 :             highlight = (pg_strcasecmp(val, "1") == 0 ||
    2575           2 :                          pg_strcasecmp(val, "on") == 0 ||
    2576           1 :                          pg_strcasecmp(val, "true") == 0 ||
    2577           0 :                          pg_strcasecmp(val, "t") == 0 ||
    2578           1 :                          pg_strcasecmp(val, "y") == 0 ||
    2579           0 :                          pg_strcasecmp(val, "yes") == 0);
    2580             :         else
    2581           0 :             ereport(ERROR,
    2582             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    2583             :                      errmsg("unrecognized headline parameter: \"%s\"",
    2584             :                             defel->defname)));
    2585             :     }
    2586             : 
    2587          51 :     if (highlight == 0)
    2588             :     {
    2589          50 :         if (min_words >= max_words)
    2590           0 :             ereport(ERROR,
    2591             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    2592             :                      errmsg("MinWords should be less than MaxWords")));
    2593          50 :         if (min_words <= 0)
    2594           0 :             ereport(ERROR,
    2595             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    2596             :                      errmsg("MinWords should be positive")));
    2597          50 :         if (shortword < 0)
    2598           0 :             ereport(ERROR,
    2599             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    2600             :                      errmsg("ShortWord should be >= 0")));
    2601          50 :         if (max_fragments < 0)
    2602           0 :             ereport(ERROR,
    2603             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    2604             :                      errmsg("MaxFragments should be >= 0")));
    2605             :     }
    2606             : 
    2607          51 :     if (max_fragments == 0)
    2608             :         /* call the default headline generator */
    2609          47 :         mark_hl_words(prs, query, highlight, shortword, min_words, max_words);
    2610             :     else
    2611           4 :         mark_hl_fragments(prs, query, highlight, shortword, min_words, max_words, max_fragments);
    2612             : 
    2613          51 :     if (!prs->startsel)
    2614          31 :         prs->startsel = pstrdup("<b>");
    2615          51 :     if (!prs->stopsel)
    2616          31 :         prs->stopsel = pstrdup("</b>");
    2617          51 :     if (!prs->fragdelim)
    2618          20 :         prs->fragdelim = pstrdup(" ... ");
    2619          51 :     prs->startsellen = strlen(prs->startsel);
    2620          51 :     prs->stopsellen = strlen(prs->stopsel);
    2621          51 :     prs->fragdelimlen = strlen(prs->fragdelim);
    2622             : 
    2623          51 :     PG_RETURN_POINTER(prs);
    2624             : }

Generated by: LCOV version 1.11