Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit a13fdc4

Browse files
committed
Mop-up for commit 85feb77aa09cda9ff3e12cf95c757c499dc25343.
Adjust commentary in regc_pg_locale.c to remove mention of the possibility of not having <wctype.h> functions, since we no longer consider that. Eliminate duplicate code in wparser_def.c by generalizing the p_iswhat macro to take a parameter saying what to return for non-ASCII chars in C locale. (That's not really a consequence of the USE_WIDE_UPPER_LOWER-ectomy, but I noticed it while doing that.)
1 parent d7c4fa3, commit a13fdc4

File tree

3 files changed

+59
-125
lines changed

3 files changed

+59
-125
lines changed

‎expected/pg_tsparser.out

Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -236,3 +236,31 @@ SELECT to_tsvector('english_ts', 'test2.com');
236236
'com':3 'test2':2 'test2.com':1
237237
(1 row)
238238

239+
-- Test non-ASCII symbols
240+
SELECT * from ts_parse('tsparser', 'аб_вгд 12_абв 12-абв абв.рф абв2.рф');
241+
tokid | token
242+
-------+--------
243+
17 | аб_вгд
244+
10 | аб
245+
12 | _
246+
10 | вгд
247+
12 |
248+
15 | 12_абв
249+
9 | 12
250+
12 | _
251+
10 | абв
252+
12 |
253+
15 | 12-абв
254+
9 | 12
255+
12 | -
256+
10 | абв
257+
12 |
258+
2 | абв
259+
12 | .
260+
2 | рф
261+
12 |
262+
3 | абв2
263+
12 | .
264+
2 | рф
265+
(22 rows)
266+

‎sql/pg_tsparser.sql

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,3 +26,6 @@ SELECT to_tsvector('english_ts', '12_abc');
2626
SELECT to_tsvector('english_ts','12-abc');
2727
SELECT to_tsvector('english_ts','test.com');
2828
SELECT to_tsvector('english_ts','test2.com');
29+
30+
-- Test non-ASCII symbols
31+
SELECT*from ts_parse('tsparser','аб_вгд 12_абв 12-абв абв.рф абв2.рф');

‎tsparser.c

Lines changed: 28 additions & 125 deletions
Original file line number | Diff line number | Diff line change
@@ -249,11 +249,9 @@ typedef struct TParser
249249
/* string and position information */
250250
char*str;/* multibyte string */
251251
intlenstr;/* length of mbstring */
252-
#ifdefUSE_WIDE_UPPER_LOWER
253252
wchar_t*wstr;/* wide character string */
254253
pg_wchar*pgwstr;/* wide character string for C-locale */
255254
boolusewide;
256-
#endif
257255

258256
/* State of parse */
259257
intcharmaxlen;
@@ -302,8 +300,6 @@ TParserInit(char *str, int len)
302300
prs->str=str;
303301
prs->lenstr=len;
304302

305-
#ifdefUSE_WIDE_UPPER_LOWER
306-
307303
/*
308304
* Use wide char code only when max encoding length > 1.
309305
*/
@@ -331,7 +327,6 @@ TParserInit(char *str, int len)
331327
}
332328
else
333329
prs->usewide= false;
334-
#endif
335330

336331
prs->state=newTParserPosition(NULL);
337332
prs->state->state=TPS_Base;
@@ -368,15 +363,12 @@ TParserCopyInit(const TParser *orig)
368363
prs->charmaxlen=orig->charmaxlen;
369364
prs->str=orig->str+orig->state->posbyte;
370365
prs->lenstr=orig->lenstr-orig->state->posbyte;
371-
372-
#ifdefUSE_WIDE_UPPER_LOWER
373366
prs->usewide=orig->usewide;
374367

375368
if (orig->pgwstr)
376369
prs->pgwstr=orig->pgwstr+orig->state->poschar;
377370
if (orig->wstr)
378371
prs->wstr=orig->wstr+orig->state->poschar;
379-
#endif
380372

381373
prs->state=newTParserPosition(NULL);
382374
prs->state->state=TPS_Base;
@@ -401,12 +393,10 @@ TParserClose(TParser *prs)
401393
prs->state=ptr;
402394
}
403395

404-
#ifdefUSE_WIDE_UPPER_LOWER
405396
if (prs->wstr)
406397
pfree(prs->wstr);
407398
if (prs->pgwstr)
408399
pfree(prs->pgwstr);
409-
#endif
410400

411401
#ifdefWPARSER_TRACE
412402
fprintf(stderr,"closing parser\n");
@@ -445,96 +435,45 @@ TParserCopyClose(TParser *prs)
445435
*- if locale is C then we use pgwstr instead of wstr.
446436
*/
447437

448-
#ifdefUSE_WIDE_UPPER_LOWER
449-
450-
#definep_iswhat(type)\
438+
#definep_iswhat(type,nonascii)\
439+
\
451440
static int\
452-
p_is##type(TParser *prs) {\
453-
Assert( prs->state );\
454-
if ( prs->usewide )\
441+
p_is##type(TParser *prs)\
442+
{\
443+
Assert(prs->state);\
444+
if (prs->usewide)\
455445
{\
456-
if (prs->pgwstr)\
446+
if (prs->pgwstr)\
457447
{\
458448
unsigned int c = *(prs->pgwstr + prs->state->poschar);\
459-
if (c > 0x7f)\
460-
return0;\
461-
return is##type( c );\
449+
if (c > 0x7f)\
450+
returnnonascii;\
451+
return is##type(c);\
462452
}\
463-
return isw##type( *(prs->wstr + prs->state->poschar ) );\
453+
return isw##type(*(prs->wstr + prs->state->poschar));\
464454
}\
465-
\
466-
return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \
467-
}\
455+
return is##type(*(unsigned char *) (prs->str + prs->state->posbyte));\
456+
}\
468457
\
469458
static int\
470-
p_isnot##type(TParser *prs) {\
459+
p_isnot##type(TParser *prs)\
460+
{\
471461
return !p_is##type(prs);\
472462
}
473463

474-
staticint
475-
p_isalnum(TParser*prs)
476-
{
477-
Assert(prs->state);
478-
479-
if (prs->usewide)
480-
{
481-
if (prs->pgwstr)
482-
{
483-
unsignedintc=*(prs->pgwstr+prs->state->poschar);
484-
485-
/*
486-
* any non-ascii symbol with multibyte encoding with C-locale is
487-
* an alpha character
488-
*/
489-
if (c>0x7f)
490-
return1;
491-
492-
returnisalnum(c);
493-
}
494-
495-
returniswalnum(*(prs->wstr+prs->state->poschar));
496-
}
497-
498-
returnisalnum(*(unsignedchar*) (prs->str+prs->state->posbyte));
499-
}
500-
staticint
501-
p_isnotalnum(TParser*prs)
502-
{
503-
return !p_isalnum(prs);
504-
}
505-
506-
staticint
507-
p_isalpha(TParser*prs)
508-
{
509-
Assert(prs->state);
510-
511-
if (prs->usewide)
512-
{
513-
if (prs->pgwstr)
514-
{
515-
unsignedintc=*(prs->pgwstr+prs->state->poschar);
516-
517-
/*
518-
* any non-ascii symbol with multibyte encoding with C-locale is
519-
* an alpha character
520-
*/
521-
if (c>0x7f)
522-
return1;
523-
524-
returnisalpha(c);
525-
}
526-
527-
returniswalpha(*(prs->wstr+prs->state->poschar));
528-
}
529-
530-
returnisalpha(*(unsignedchar*) (prs->str+prs->state->posbyte));
531-
}
532-
533-
staticint
534-
p_isnotalpha(TParser*prs)
535-
{
536-
return !p_isalpha(prs);
537-
}
464+
/*
465+
* In C locale with a multibyte encoding, any non-ASCII symbol is considered
466+
* an alpha character, but not a member of other char classes.
467+
*/
468+
p_iswhat(alnum,1)
469+
p_iswhat(alpha,1)
470+
p_iswhat(digit,0)
471+
p_iswhat(lower,0)
472+
p_iswhat(print,0)
473+
p_iswhat(punct,0)
474+
p_iswhat(space,0)
475+
p_iswhat(upper,0)
476+
p_iswhat(xdigit,0)
538477

539478
/* p_iseq should be used only for ascii symbols */
540479

@@ -544,39 +483,6 @@ p_iseq(TParser *prs, char c)
544483
Assert(prs->state);
545484
return ((prs->state->charlen==1&&*(prs->str+prs->state->posbyte)==c)) ?1 :0;
546485
}
547-
#else/* USE_WIDE_UPPER_LOWER */
548-
549-
#definep_iswhat(type)\
550-
static int\
551-
p_is##type(TParser *prs) {\
552-
Assert( prs->state );\
553-
return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) );\
554-
}\
555-
\
556-
static int\
557-
p_isnot##type(TParser *prs) {\
558-
return !p_is##type(prs);\
559-
}
560-
561-
562-
staticint
563-
p_iseq(TParser*prs,charc)
564-
{
565-
Assert(prs->state);
566-
return (*(prs->str+prs->state->posbyte)==c) ?1 :0;
567-
}
568-
569-
p_iswhat(alnum)
570-
p_iswhat(alpha)
571-
#endif/* USE_WIDE_UPPER_LOWER */
572-
573-
p_iswhat(digit)
574-
p_iswhat(lower)
575-
p_iswhat(print)
576-
p_iswhat(punct)
577-
p_iswhat(space)
578-
p_iswhat(upper)
579-
p_iswhat(xdigit)
580486

581487
staticint
582488
p_isEOF(TParser*prs)
@@ -793,8 +699,6 @@ p_isspecial(TParser *prs)
793699
if (pg_dsplen(prs->str+prs->state->posbyte)==0)
794700
return1;
795701

796-
#ifdefUSE_WIDE_UPPER_LOWER
797-
798702
/*
799703
* Unicode Characters in the 'Mark, Spacing Combining' Category That
800704
* characters are not alpha although they are not breakers of word too.
@@ -1058,7 +962,6 @@ p_isspecial(TParser *prs)
1058962
StopHigh=StopMiddle;
1059963
}
1060964
}
1061-
#endif
1062965

1063966
return0;
1064967
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp