NotificationsYou must be signed in to change notification settings
Fork6
Star31

Commit b4c6d31

committed

Fix serious performance problems in json(b) to_tsvector().

In an off-list followup to bug #14745, Bob Jones complained that to_tsvector() on a 2MB jsonb value took an unreasonable amount of time and space --- enough to draw the wrath of the OOM killer on his machine. On my machine, his example proved to require upwards of 18 seconds and 4GB, which seemed pretty bogus considering that to_tsvector() on the same data treated as text took just a couple hundred msec and 10 or so MB.

On investigation, the problem is that the implementation scans each string element of the json(b) and converts it to tsvector separately, then applies tsvector_concat() to join those separate tsvectors. The unreasonable memory usage came from leaking every single one of the transient tsvectors --- but even without that mistake, this is an O(N^2) or worse algorithm, because tsvector_concat() has to repeatedly process the words coming from earlier elements.

We can fix it by accumulating all the lexeme data and applying make_tsvector() just once. As a side benefit, that also makes the desired adjustment of lexeme positions far cheaper, because we can just tweak the running "pos" counter between JSON elements.

In passing, try to make the explanation of that tweak more intelligible. (I didn't think that a barely-readable comment far removed from the actual code was helpful.) And do some minor other code beautification.

1 parent: fb9bd4b; commit: b4c6d31 (Copy full SHA for b4c6d31)

File tree

2 files changed

+58

-71

lines changed

src
- backend/tsearch
  - to_tsany.c
- include/tsearch
  - ts_type.h

2 files changed

+58

-71

lines changed

`‎src/backend/tsearch/to_tsany.c`

Lines changed: 58 additions & 62 deletions

Original file line number	Diff line number	Diff line change
`@@ -28,11 +28,11 @@ typedef struct MorphOpaque`
`28`	`28`	`typedefstructTSVectorBuildState`
`29`	`29`	`{`
`30`	`30`	`ParsedText*prs;`
`31`		`-TSVectorresult;`
`32`	`31`	`OidcfgId;`
`33`	`32`	`}TSVectorBuildState;`
`34`	`33`
`35`		`-staticvoidadd_to_tsvector(voidstate,charelem_value,intelem_len);`
	`34`	`+staticvoidadd_to_tsvector(void_state,charelem_value,intelem_len);`
	`35`	`+`
`36`	`36`
`37`	`37`	`Datum`
`38`	`38`	`get_current_ts_config(PG_FUNCTION_ARGS)`
`@@ -270,34 +270,33 @@ jsonb_to_tsvector_byid(PG_FUNCTION_ARGS)`
`270`	`270`	`{`
`271`	`271`	`OidcfgId=PG_GETARG_OID(0);`
`272`	`272`	`Jsonb*jb=PG_GETARG_JSONB(1);`
	`273`	`+TSVectorresult;`
`273`	`274`	`TSVectorBuildStatestate;`
`274`		`-ParsedTextprs= (ParsedText)palloc(sizeof(ParsedText));`
	`275`	`+ParsedTextprs;`
`275`	`276`
`276`		`-prs->words=NULL;`
`277`		`-state.result=NULL;`
	`277`	`+prs.words=NULL;`
	`278`	`+prs.curwords=0;`
	`279`	`+state.prs=&prs;`
`278`	`280`	`state.cfgId=cfgId;`
`279`		`-state.prs=prs;`
`280`	`281`
`281`		`-iterate_jsonb_string_values(jb,&state,(JsonIterateStringValuesAction)add_to_tsvector);`
	`282`	`+iterate_jsonb_string_values(jb,&state,add_to_tsvector);`
`282`	`283`
`283`		`-PG_FREE_IF_COPY(jb,1);`
`284`		`-`
`285`		`-if (state.result==NULL)`
	`284`	`+if (prs.curwords>0)`
	`285`	`+result=make_tsvector(&prs);`
	`286`	`+else`
`286`	`287`	`{`
`287`	`288`	`/*`
`288`		`- * There weren't any string elements in jsonb, sowee need to return`
`289`		`- *anempty vector`
	`289`	`+ * There weren't any string elements in jsonb, sowe need to return an`
	`290`	`+ * empty vector`
`290`	`291`	`*/`
`291`		`-`
`292`		`-if (prs->words!=NULL)`
`293`		`-pfree(prs->words);`
`294`		`-`
`295`		`-state.result=palloc(CALCDATASIZE(0,0));`
`296`		`-SET_VARSIZE(state.result,CALCDATASIZE(0,0));`
`297`		`-state.result->size=0;`
	`292`	`+result=palloc(CALCDATASIZE(0,0));`
	`293`	`+SET_VARSIZE(result,CALCDATASIZE(0,0));`
	`294`	`+result->size=0;`
`298`	`295`	`}`
`299`	`296`
`300`		`-PG_RETURN_TSVECTOR(state.result);`
	`297`	`+PG_FREE_IF_COPY(jb,1);`
	`298`	`+`
	`299`	`+PG_RETURN_TSVECTOR(result);`
`301`	`300`	`}`
`302`	`301`
`303`	`302`	`Datum`
`@@ -317,33 +316,33 @@ json_to_tsvector_byid(PG_FUNCTION_ARGS)`
`317`	`316`	`{`
`318`	`317`	`OidcfgId=PG_GETARG_OID(0);`
`319`	`318`	`text*json=PG_GETARG_TEXT_P(1);`
	`319`	`+TSVectorresult;`
`320`	`320`	`TSVectorBuildStatestate;`
`321`		`-ParsedTextprs= (ParsedText)palloc(sizeof(ParsedText));`
	`321`	`+ParsedTextprs;`
`322`	`322`
`323`		`-prs->words=NULL;`
`324`		`-state.result=NULL;`
	`323`	`+prs.words=NULL;`
	`324`	`+prs.curwords=0;`
	`325`	`+state.prs=&prs;`
`325`	`326`	`state.cfgId=cfgId;`
`326`		`-state.prs=prs;`
`327`	`327`
`328`		`-iterate_json_string_values(json,&state,(JsonIterateStringValuesAction)add_to_tsvector);`
	`328`	`+iterate_json_string_values(json,&state,add_to_tsvector);`
`329`	`329`
`330`		`-PG_FREE_IF_COPY(json,1);`
`331`		`-if (state.result==NULL)`
	`330`	`+if (prs.curwords>0)`
	`331`	`+result=make_tsvector(&prs);`
	`332`	`+else`
`332`	`333`	`{`
`333`	`334`	`/*`
`334`		`- * There weren't any string elements in json, sowee need to return an`
	`335`	`+ * There weren't any string elements in json, sowe need to return an`
`335`	`336`	`* empty vector`
`336`	`337`	`*/`
`337`		`-`
`338`		`-if (prs->words!=NULL)`
`339`		`-pfree(prs->words);`
`340`		`-`
`341`		`-state.result=palloc(CALCDATASIZE(0,0));`
`342`		`-SET_VARSIZE(state.result,CALCDATASIZE(0,0));`
`343`		`-state.result->size=0;`
	`338`	`+result=palloc(CALCDATASIZE(0,0));`
	`339`	`+SET_VARSIZE(result,CALCDATASIZE(0,0));`
	`340`	`+result->size=0;`
`344`	`341`	`}`
`345`	`342`
`346`		`-PG_RETURN_TSVECTOR(state.result);`
	`343`	`+PG_FREE_IF_COPY(json,1);`
	`344`	`+`
	`345`	`+PG_RETURN_TSVECTOR(result);`
`347`	`346`	`}`
`348`	`347`
`349`	`348`	`Datum`
`@@ -359,45 +358,42 @@ json_to_tsvector(PG_FUNCTION_ARGS)`
`359`	`358`	`}`
`360`	`359`
`361`	`360`	`/*`
`362`		`- * Extend current TSVector from _state with a new one,`
`363`		`- * build over a json(b) element.`
	`361`	`+ * Parse lexemes in an element of a json(b) value, add to TSVectorBuildState.`
`364`	`362`	`*/`
`365`	`363`	`staticvoid`
`366`	`364`	`add_to_tsvector(void_state,charelem_value,intelem_len)`
`367`	`365`	`{`
`368`	`366`	`TSVectorBuildStatestate= (TSVectorBuildState)_state;`
`369`	`367`	`ParsedText*prs=state->prs;`
`370`		`-TSVectoritem_vector;`
`371`		`-inti;`
	`368`	`+int32prevwords;`
`372`	`369`
`373`		`-prs->lenwords=elem_len /6;`
`374`		`-if (prs->lenwords==0)`
`375`		`-prs->lenwords=2;`
	`370`	`+if (prs->words==NULL)`
	`371`	`+{`
	`372`	`+/*`
	`373`	`+ * First time through: initialize words array to a reasonable size.`
	`374`	`+ * (parsetext() will realloc it bigger as needed.)`
	`375`	`+ */`
	`376`	`+prs->lenwords=Max(elem_len /6,64);`
	`377`	`+prs->words= (ParsedWord)palloc(sizeof(ParsedWord)prs->lenwords);`
	`378`	`+prs->curwords=0;`
	`379`	`+prs->pos=0;`
	`380`	`+}`
`376`	`381`
`377`		`-prs->words= (ParsedWord)palloc(sizeof(ParsedWord)prs->lenwords);`
`378`		`-prs->curwords=0;`
`379`		`-prs->pos=0;`
	`382`	`+prevwords=prs->curwords;`
`380`	`383`
`381`	`384`	`parsetext(state->cfgId,prs,elem_value,elem_len);`
`382`	`385`
`383`		`-if (prs->curwords)`
`384`		`-{`
`385`		`-if (state->result!=NULL)`
`386`		`-{`
`387`		`-for (i=0;i<prs->curwords;i++)`
`388`		`-prs->words[i].pos.pos=prs->words[i].pos.pos+TS_JUMP;`
`389`		`-`
`390`		`-item_vector=make_tsvector(prs);`
`391`		`-`
`392`		`-state->result= (TSVector)DirectFunctionCall2(tsvector_concat,`
`393`		`-TSVectorGetDatum(state->result),`
`394`		`-PointerGetDatum(item_vector));`
`395`		`-}`
`396`		`-else`
`397`		`-state->result=make_tsvector(prs);`
`398`		`-}`
	`386`	`+/*`
	`387`	`+ * If we extracted any words from this JSON element, advance pos to create`
	`388`	`+ * an artificial break between elements. This is because we don't want`
	`389`	`+ * phrase searches to think that the last word in this element is adjacent`
	`390`	`+ * to the first word in the next one.`
	`391`	`+ */`
	`392`	`+if (prs->curwords>prevwords)`
	`393`	`+prs->pos+=1;`
`399`	`394`	`}`
`400`	`395`
	`396`	`+`
`401`	`397`	`/*`
`402`	`398`	`* to_tsquery`
`403`	`399`	`*/`

`‎src/include/tsearch/ts_type.h`

Lines changed: 0 additions & 9 deletions

Original file line number	Diff line number	Diff line change
`@@ -86,15 +86,6 @@ typedef struct`
`86`	`86`	`#defineMAXNUMPOS(256)`
`87`	`87`	`#defineLIMITPOS(x) ( ( (x) >= MAXENTRYPOS ) ? (MAXENTRYPOS-1) : (x) )`
`88`	`88`
`89`		`-/*`
`90`		`- * In case if a TSVector contains several parts and we want to treat them as`
`91`		`- * separate, it's necessary to add an artificial increment to position of each`
`92`		`- * lexeme from every next part. It's required to avoid the situation when`
`93`		`- * tsquery can find a phrase consisting of lexemes from two of such parts.`
`94`		`- * TS_JUMP defined a value of this increment.`
`95`		`- */`
`96`		`-#defineTS_JUMP 1`
`97`		`-`
`98`	`89`	`/* This struct represents a complete tsvector datum */`
`99`	`90`	`typedefstruct`
`100`	`91`	`{`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit b4c6d31

File tree

2 files changed

2 files changed

`‎src/backend/tsearch/to_tsany.c`

`‎src/include/tsearch/ts_type.h`

0 commit comments