NotificationsYou must be signed in to change notification settings
Fork4.9k
Star17.7k

Commit 51e78ab

committed

Avoid use of sscanf() to parse ispell dictionary files.

It turns out that on FreeBSD-derived platforms (including OS X), the *scanf() family of functions is pretty much brain-dead about multibyte characters. In particular it will apply isspace() to individual bytes of input even when those bytes are part of a multibyte character, thus allowing false recognition of a field-terminating space. We appear to have little alternative other than instituting a coding rule that *scanf() is not to be used if the input string might contain multibyte characters. (There was some discussion of relying on "%ls", but that probably just moves the portability problem somewhere else, and besides it doesn't fully prevent BSD *scanf() from using isspace().) This patch is a down payment on that: it gets rid of use of sscanf() to parse ispell dictionary files, which are certainly at great risk of having a problem. The code is cleaner this way anyway, though a bit longer. In passing, improve a few comments. Report and patch by Artur Zakirov, reviewed and somewhat tweaked by me. Back-patch to all supported branches.

1 parent c5e9b77 commit 51e78ab Copy full SHA for 51e78ab

File tree

1 file changed

+153

-13

lines changed

src/backend/tsearch
- spell.c

1 file changed

+153

-13

lines changed

`‎src/backend/tsearch/spell.c`

Lines changed: 153 additions & 13 deletions

Original file line number	Diff line number	Diff line change
`@@ -457,13 +457,149 @@ NIAddAffix(IspellDict Conf, int flag, char flagflags, const char mask, const c`
`457`	`457`	`Conf->naffixes++;`
`458`	`458`	`}`
`459`	`459`
	`460`	`+`
	`461`	`+/* Parsing states for parse_affentry() and friends */`
`460`	`462`	`#definePAE_WAIT_MASK0`
`461`		`-#definePAE_INMASK1`
	`463`	`+#definePAE_INMASK1`
`462`	`464`	`#definePAE_WAIT_FIND2`
`463`		`-#definePAE_INFIND3`
	`465`	`+#definePAE_INFIND3`
`464`	`466`	`#definePAE_WAIT_REPL4`
`465`		`-#definePAE_INREPL5`
	`467`	`+#definePAE_INREPL5`
	`468`	`+#definePAE_WAIT_TYPE6`
	`469`	`+#definePAE_WAIT_FLAG7`
`466`	`470`
	`471`	`+/*`
	`472`	`+ * Parse next space-separated field of an .affix file line.`
	`473`	`+ *`
	`474`	`+ * *str is the input pointer (will be advanced past field)`
	`475`	`+ * next is where to copy the field value to, with null termination`
	`476`	`+ *`
	`477`	`+ * The buffer at "next" must be of size BUFSIZ; we truncate the input to fit.`
	`478`	`+ *`
	`479`	`+ * Returns TRUE if we found a field, FALSE if not.`
	`480`	`+ */`
	`481`	`+staticbool`
	`482`	`+get_nextfield(char*str,charnext)`
	`483`	`+{`
	`484`	`+intstate=PAE_WAIT_MASK;`
	`485`	`+intavail=BUFSIZ;`
	`486`	`+`
	`487`	`+while (**str)`
	`488`	`+{`
	`489`	`+if (state==PAE_WAIT_MASK)`
	`490`	`+{`
	`491`	`+if (t_iseq(*str,'#'))`
	`492`	`+return false;`
	`493`	`+elseif (!t_isspace(*str))`
	`494`	`+{`
	`495`	`+intclen=pg_mblen(*str);`
	`496`	`+`
	`497`	`+if (clen<avail)`
	`498`	`+{`
	`499`	`+COPYCHAR(next,*str);`
	`500`	`+next+=clen;`
	`501`	`+avail-=clen;`
	`502`	`+}`
	`503`	`+state=PAE_INMASK;`
	`504`	`+}`
	`505`	`+}`
	`506`	`+else/* state == PAE_INMASK */`
	`507`	`+{`
	`508`	`+if (t_isspace(*str))`
	`509`	`+{`
	`510`	`+*next='\0';`
	`511`	`+return true;`
	`512`	`+}`
	`513`	`+else`
	`514`	`+{`
	`515`	`+intclen=pg_mblen(*str);`
	`516`	`+`
	`517`	`+if (clen<avail)`
	`518`	`+{`
	`519`	`+COPYCHAR(next,*str);`
	`520`	`+next+=clen;`
	`521`	`+avail-=clen;`
	`522`	`+}`
	`523`	`+}`
	`524`	`+}`
	`525`	`+str+=pg_mblen(str);`
	`526`	`+}`
	`527`	`+`
	`528`	`+*next='\0';`
	`529`	`+`
	`530`	`+return (state==PAE_INMASK);/* OK if we got a nonempty field */`
	`531`	`+}`
	`532`	`+`
	`533`	`+/*`
	`534`	`+ * Parses entry of an .affix file of MySpell or Hunspell format.`
	`535`	`+ *`
	`536`	`+ * An .affix file entry has the following format:`
	`537`	`+ * - header`
	`538`	`+ * <type> <flag> <cross_flag> <flag_count>`
	`539`	`+ * - fields after header:`
	`540`	`+ * <type> <flag> <find> <replace> <mask>`
	`541`	`+ *`
	`542`	`+ * str is the input line`
	`543`	`+ * field values are returned to type etc, which must be buffers of size BUFSIZ.`
	`544`	`+ *`
	`545`	`+ * Returns number of fields found; any omitted fields are set to empty strings.`
	`546`	`+ */`
	`547`	`+staticint`
	`548`	`+parse_ooaffentry(charstr,chartype,charflag,charfind,`
	`549`	`+charrepl,charmask)`
	`550`	`+{`
	`551`	`+intstate=PAE_WAIT_TYPE;`
	`552`	`+intfields_read=0;`
	`553`	`+boolvalid= false;`
	`554`	`+`
	`555`	`+type=flag=find=repl=*mask='\0';`
	`556`	`+`
	`557`	`+while (*str)`
	`558`	`+{`
	`559`	`+switch (state)`
	`560`	`+{`
	`561`	`+casePAE_WAIT_TYPE:`
	`562`	`+valid=get_nextfield(&str,type);`
	`563`	`+state=PAE_WAIT_FLAG;`
	`564`	`+break;`
	`565`	`+casePAE_WAIT_FLAG:`
	`566`	`+valid=get_nextfield(&str,flag);`
	`567`	`+state=PAE_WAIT_FIND;`
	`568`	`+break;`
	`569`	`+casePAE_WAIT_FIND:`
	`570`	`+valid=get_nextfield(&str,find);`
	`571`	`+state=PAE_WAIT_REPL;`
	`572`	`+break;`
	`573`	`+casePAE_WAIT_REPL:`
	`574`	`+valid=get_nextfield(&str,repl);`
	`575`	`+state=PAE_WAIT_MASK;`
	`576`	`+break;`
	`577`	`+casePAE_WAIT_MASK:`
	`578`	`+valid=get_nextfield(&str,mask);`
	`579`	`+state=-1;/* force loop exit */`
	`580`	`+break;`
	`581`	`+default:`
	`582`	`+elog(ERROR,"unrecognized state in parse_ooaffentry: %d",`
	`583`	`+state);`
	`584`	`+break;`
	`585`	`+}`
	`586`	`+if (valid)`
	`587`	`+fields_read++;`
	`588`	`+else`
	`589`	`+break;/* early EOL */`
	`590`	`+if (state<0)`
	`591`	`+break;/* got all fields */`
	`592`	`+}`
	`593`	`+`
	`594`	`+returnfields_read;`
	`595`	`+}`
	`596`	`+`
	`597`	`+/*`
	`598`	`+ * Parses entry of an .affix file of Ispell format`
	`599`	`+ *`
	`600`	`+ * An .affix file entry has the following format:`
	`601`	`+ * <mask> > [-<find>,]<replace>`
	`602`	`+ */`
`467`	`603`	`staticbool`
`468`	`604`	`parse_affentry(charstr,charmask,charfind,charrepl)`
`469`	`605`	`{`
`@@ -618,8 +754,6 @@ NIImportOOAffixes(IspellDict Conf, const char filename)`
`618`	`754`	`intflag=0;`
`619`	`755`	`charflagflags=0;`
`620`	`756`	`tsearch_readline_statetrst;`
`621`		`-intscanread=0;`
`622`		`-charscanbuf[BUFSIZ];`
`623`	`757`	`char*recoded;`
`624`	`758`
`625`	`759`	`/* read file to find any flag */`
`@@ -682,8 +816,6 @@ NIImportOOAffixes(IspellDict Conf, const char filename)`
`682`	`816`	`}`
`683`	`817`	`tsearch_readline_end(&trst);`
`684`	`818`
`685`		`-sprintf(scanbuf,"%%6s %%%ds %%%ds %%%ds %%%ds",BUFSIZ /5,BUFSIZ /5,BUFSIZ /5,BUFSIZ /5);`
`686`		`-`
`687`	`819`	`if (!tsearch_readline_begin(&trst,filename))`
`688`	`820`	`ereport(ERROR,`
`689`	`821`	`(errcode(ERRCODE_CONFIG_FILE_ERROR),`
`@@ -692,18 +824,21 @@ NIImportOOAffixes(IspellDict Conf, const char filename)`
`692`	`824`
`693`	`825`	`while ((recoded=tsearch_readline(&trst))!=NULL)`
`694`	`826`	`{`
	`827`	`+intfields_read;`
	`828`	`+`
`695`	`829`	`if (*recoded=='\0'\|\|t_isspace(recoded)\|\|t_iseq(recoded,'#'))`
`696`	`830`	`gotonextline;`
`697`	`831`
`698`		`-scanread=sscanf(recoded,scanbuf,type,sflag,find,repl,mask);`
	`832`	`+fields_read=parse_ooaffentry(recoded,type,sflag,find,repl,mask);`
`699`	`833`
`700`	`834`	`if (ptype)`
`701`	`835`	`pfree(ptype);`
`702`	`836`	`ptype=lowerstr_ctx(Conf,type);`
`703`		`-if (scanread<4\|\| (STRNCMP(ptype,"sfx")&&STRNCMP(ptype,"pfx")))`
	`837`	`+if (fields_read<4\|\|`
	`838`	`+(STRNCMP(ptype,"sfx")!=0&&STRNCMP(ptype,"pfx")!=0))`
`704`	`839`	`gotonextline;`
`705`	`840`
`706`		`-if (scanread==4)`
	`841`	`+if (fields_read==4)`
`707`	`842`	`{`
`708`	`843`	`if (strlen(sflag)!=1)`
`709`	`844`	`gotonextline;`
`@@ -722,9 +857,13 @@ NIImportOOAffixes(IspellDict Conf, const char filename)`
`722`	`857`	`if (strlen(sflag)!=1\|\|flag!=*sflag\|\|flag==0)`
`723`	`858`	`gotonextline;`
`724`	`859`	`prepl=lowerstr_ctx(Conf,repl);`
`725`		`-/* affix flag */`
	`860`	`+/* Find position of '/' in lowercased string "prepl" */`
`726`	`861`	`if ((ptr=strchr(prepl,'/'))!=NULL)`
`727`	`862`	`{`
	`863`	`+/*`
	`864`	`+ * Here we use non-lowercased string "repl". We need position`
	`865`	`+ * of '/' in "repl".`
	`866`	`+ */`
`728`	`867`	`*ptr='\0';`
`729`	`868`	`ptr=repl+ (ptr-prepl)+1;`
`730`	`869`	`while (*ptr)`
`@@ -800,11 +939,12 @@ NIImportAffixes(IspellDict Conf, const char filename)`
`800`	`939`
`801`	`940`	`if (STRNCMP(pstr,"compoundwords")==0)`
`802`	`941`	`{`
	`942`	`+/* Find position in lowercased string "pstr" */`
`803`	`943`	`s=findchar(pstr,'l');`
`804`	`944`	`if (s)`
`805`	`945`	`{`
`806`		`-s=recoded+ (s-pstr);/* we need non-lowercased`
`807`		`- * string */`
	`946`	`+/* Here we use non-lowercased string "recoded" */`
	`947`	`+s=recoded+ (s-pstr);`
`808`	`948`	`while (*s&& !t_isspace(s))`
`809`	`949`	`s+=pg_mblen(s);`
`810`	`950`	`while (*s&&t_isspace(s))`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit51e78ab

File tree

1 file changed

1 file changed

`‎src/backend/tsearch/spell.c`

0 commit comments