Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit b4c6d31

Browse files
committed
Fix serious performance problems in json(b) to_tsvector().
In an off-list followup to bug #14745, Bob Jones complained that to_tsvector() on a 2MB jsonb value took an unreasonable amount of time and space --- enough to draw the wrath of the OOM killer on his machine. On my machine, his example proved to require upwards of 18 seconds and 4GB, which seemed pretty bogus considering that to_tsvector() on the same data treated as text took just a couple hundred msec and 10 or so MB. On investigation, the problem is that the implementation scans each string element of the json(b) and converts it to tsvector separately, then applies tsvector_concat() to join those separate tsvectors. The unreasonable memory usage came from leaking every single one of the transient tsvectors --- but even without that mistake, this is an O(N^2) or worse algorithm, because tsvector_concat() has to repeatedly process the words coming from earlier elements. We can fix it by accumulating all the lexeme data and applying make_tsvector() just once. As a side benefit, that also makes the desired adjustment of lexeme positions far cheaper, because we can just tweak the running "pos" counter between JSON elements. In passing, try to make the explanation of that tweak more intelligible. (I didn't think that a barely-readable comment far removed from the actual code was helpful.) And do some minor other code beautification.
1 parent fb9bd4b commit b4c6d31

File tree

2 files changed

+58
-71
lines changed

2 files changed

+58
-71
lines changed

‎src/backend/tsearch/to_tsany.c

Lines changed: 58 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,11 @@ typedef struct MorphOpaque
2828
typedefstructTSVectorBuildState
2929
{
3030
ParsedText*prs;
31-
TSVectorresult;
3231
OidcfgId;
3332
}TSVectorBuildState;
3433

35-
staticvoidadd_to_tsvector(void*state,char*elem_value,intelem_len);
34+
staticvoidadd_to_tsvector(void*_state,char*elem_value,intelem_len);
35+
3636

3737
Datum
3838
get_current_ts_config(PG_FUNCTION_ARGS)
@@ -270,34 +270,33 @@ jsonb_to_tsvector_byid(PG_FUNCTION_ARGS)
270270
{
271271
OidcfgId=PG_GETARG_OID(0);
272272
Jsonb*jb=PG_GETARG_JSONB(1);
273+
TSVectorresult;
273274
TSVectorBuildStatestate;
274-
ParsedText*prs= (ParsedText*)palloc(sizeof(ParsedText));
275+
ParsedTextprs;
275276

276-
prs->words=NULL;
277-
state.result=NULL;
277+
prs.words=NULL;
278+
prs.curwords=0;
279+
state.prs=&prs;
278280
state.cfgId=cfgId;
279-
state.prs=prs;
280281

281-
iterate_jsonb_string_values(jb,&state,(JsonIterateStringValuesAction)add_to_tsvector);
282+
iterate_jsonb_string_values(jb,&state,add_to_tsvector);
282283

283-
PG_FREE_IF_COPY(jb,1);
284-
285-
if (state.result==NULL)
284+
if (prs.curwords>0)
285+
result=make_tsvector(&prs);
286+
else
286287
{
287288
/*
288-
* There weren't any string elements in jsonb, sowee need to return
289-
*anempty vector
289+
* There weren't any string elements in jsonb, so we need to return an
290+
* empty vector
290291
*/
291-
292-
if (prs->words!=NULL)
293-
pfree(prs->words);
294-
295-
state.result=palloc(CALCDATASIZE(0,0));
296-
SET_VARSIZE(state.result,CALCDATASIZE(0,0));
297-
state.result->size=0;
292+
result=palloc(CALCDATASIZE(0,0));
293+
SET_VARSIZE(result,CALCDATASIZE(0,0));
294+
result->size=0;
298295
}
299296

300-
PG_RETURN_TSVECTOR(state.result);
297+
PG_FREE_IF_COPY(jb,1);
298+
299+
PG_RETURN_TSVECTOR(result);
301300
}
302301

303302
Datum
@@ -317,33 +316,33 @@ json_to_tsvector_byid(PG_FUNCTION_ARGS)
317316
{
318317
OidcfgId=PG_GETARG_OID(0);
319318
text*json=PG_GETARG_TEXT_P(1);
319+
TSVectorresult;
320320
TSVectorBuildStatestate;
321-
ParsedText*prs= (ParsedText*)palloc(sizeof(ParsedText));
321+
ParsedTextprs;
322322

323-
prs->words=NULL;
324-
state.result=NULL;
323+
prs.words=NULL;
324+
prs.curwords=0;
325+
state.prs=&prs;
325326
state.cfgId=cfgId;
326-
state.prs=prs;
327327

328-
iterate_json_string_values(json,&state,(JsonIterateStringValuesAction)add_to_tsvector);
328+
iterate_json_string_values(json,&state,add_to_tsvector);
329329

330-
PG_FREE_IF_COPY(json,1);
331-
if (state.result==NULL)
330+
if (prs.curwords>0)
331+
result=make_tsvector(&prs);
332+
else
332333
{
333334
/*
334-
* There weren't any string elements in json, sowee need to return an
335+
* There weren't any string elements in json, so we need to return an
335336
* empty vector
336337
*/
337-
338-
if (prs->words!=NULL)
339-
pfree(prs->words);
340-
341-
state.result=palloc(CALCDATASIZE(0,0));
342-
SET_VARSIZE(state.result,CALCDATASIZE(0,0));
343-
state.result->size=0;
338+
result=palloc(CALCDATASIZE(0,0));
339+
SET_VARSIZE(result,CALCDATASIZE(0,0));
340+
result->size=0;
344341
}
345342

346-
PG_RETURN_TSVECTOR(state.result);
343+
PG_FREE_IF_COPY(json,1);
344+
345+
PG_RETURN_TSVECTOR(result);
347346
}
348347

349348
Datum
@@ -359,45 +358,42 @@ json_to_tsvector(PG_FUNCTION_ARGS)
359358
}
360359

361360
/*
362-
* Extend current TSVector from _state with a new one,
363-
* build over a json(b) element.
361+
* Parse lexemes in an element of a json(b) value, add to TSVectorBuildState.
364362
*/
365363
staticvoid
366364
add_to_tsvector(void*_state,char*elem_value,intelem_len)
367365
{
368366
TSVectorBuildState*state= (TSVectorBuildState*)_state;
369367
ParsedText*prs=state->prs;
370-
TSVectoritem_vector;
371-
inti;
368+
int32prevwords;
372369

373-
prs->lenwords=elem_len /6;
374-
if (prs->lenwords==0)
375-
prs->lenwords=2;
370+
if (prs->words==NULL)
371+
{
372+
/*
373+
* First time through: initialize words array to a reasonable size.
374+
* (parsetext() will realloc it bigger as needed.)
375+
*/
376+
prs->lenwords=Max(elem_len /6,64);
377+
prs->words= (ParsedWord*)palloc(sizeof(ParsedWord)*prs->lenwords);
378+
prs->curwords=0;
379+
prs->pos=0;
380+
}
376381

377-
prs->words= (ParsedWord*)palloc(sizeof(ParsedWord)*prs->lenwords);
378-
prs->curwords=0;
379-
prs->pos=0;
382+
prevwords=prs->curwords;
380383

381384
parsetext(state->cfgId,prs,elem_value,elem_len);
382385

383-
if (prs->curwords)
384-
{
385-
if (state->result!=NULL)
386-
{
387-
for (i=0;i<prs->curwords;i++)
388-
prs->words[i].pos.pos=prs->words[i].pos.pos+TS_JUMP;
389-
390-
item_vector=make_tsvector(prs);
391-
392-
state->result= (TSVector)DirectFunctionCall2(tsvector_concat,
393-
TSVectorGetDatum(state->result),
394-
PointerGetDatum(item_vector));
395-
}
396-
else
397-
state->result=make_tsvector(prs);
398-
}
386+
/*
387+
* If we extracted any words from this JSON element, advance pos to create
388+
* an artificial break between elements. This is because we don't want
389+
* phrase searches to think that the last word in this element is adjacent
390+
* to the first word in the next one.
391+
*/
392+
if (prs->curwords>prevwords)
393+
prs->pos+=1;
399394
}
400395

396+
401397
/*
402398
* to_tsquery
403399
*/

‎src/include/tsearch/ts_type.h

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -86,15 +86,6 @@ typedef struct
8686
#defineMAXNUMPOS(256)
8787
#defineLIMITPOS(x) ( ( (x) >= MAXENTRYPOS ) ? (MAXENTRYPOS-1) : (x) )
8888

89-
/*
90-
* In case if a TSVector contains several parts and we want to treat them as
91-
* separate, it's necessary to add an artificial increment to position of each
92-
* lexeme from every next part. It's required to avoid the situation when
93-
* tsquery can find a phrase consisting of lexemes from two of such parts.
94-
* TS_JUMP defined a value of this increment.
95-
*/
96-
#defineTS_JUMP 1
97-
9889
/* This struct represents a complete tsvector datum */
9990
typedefstruct
10091
{

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp