|
22 | 22 | #include"catalog/pg_type.h"
|
23 | 23 | #include"common/hashfn.h"
|
24 | 24 | #include"common/int.h"
|
| 25 | +#include"common/unicode_norm.h" |
25 | 26 | #include"lib/hyperloglog.h"
|
26 | 27 | #include"libpq/pqformat.h"
|
27 | 28 | #include"miscadmin.h"
|
@@ -5976,3 +5977,152 @@ rest_of_char_same(const char *s1, const char *s2, int len)
|
5976 | 5977 | #include"levenshtein.c"
|
5977 | 5978 | #defineLEVENSHTEIN_LESS_EQUAL
|
5978 | 5979 | #include"levenshtein.c"
|
| 5980 | + |
| 5981 | + |
| 5982 | +/* |
| 5983 | + * Unicode support |
| 5984 | + */ |
| 5985 | + |
| 5986 | +staticUnicodeNormalizationForm |
| 5987 | +unicode_norm_form_from_string(constchar*formstr) |
| 5988 | +{ |
| 5989 | +UnicodeNormalizationFormform=-1; |
| 5990 | + |
| 5991 | +/* |
| 5992 | + * Might as well check this while we're here. |
| 5993 | + */ |
| 5994 | +if (GetDatabaseEncoding()!=PG_UTF8) |
| 5995 | +ereport(ERROR, |
| 5996 | +(errcode(ERRCODE_SYNTAX_ERROR), |
| 5997 | +errmsg("Unicode normalization can only be performed if server encoding is UTF8"))); |
| 5998 | + |
| 5999 | +if (pg_strcasecmp(formstr,"NFC")==0) |
| 6000 | +form=UNICODE_NFC; |
| 6001 | +elseif (pg_strcasecmp(formstr,"NFD")==0) |
| 6002 | +form=UNICODE_NFD; |
| 6003 | +elseif (pg_strcasecmp(formstr,"NFKC")==0) |
| 6004 | +form=UNICODE_NFKC; |
| 6005 | +elseif (pg_strcasecmp(formstr,"NFKD")==0) |
| 6006 | +form=UNICODE_NFKD; |
| 6007 | +else |
| 6008 | +ereport(ERROR, |
| 6009 | +(errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
| 6010 | +errmsg("invalid normalization form: %s",formstr))); |
| 6011 | + |
| 6012 | +returnform; |
| 6013 | +} |
| 6014 | + |
| 6015 | +Datum |
| 6016 | +unicode_normalize_func(PG_FUNCTION_ARGS) |
| 6017 | +{ |
| 6018 | +text*input=PG_GETARG_TEXT_PP(0); |
| 6019 | +char*formstr=text_to_cstring(PG_GETARG_TEXT_PP(1)); |
| 6020 | +UnicodeNormalizationFormform; |
| 6021 | +intsize; |
| 6022 | +pg_wchar*input_chars; |
| 6023 | +pg_wchar*output_chars; |
| 6024 | +unsignedchar*p; |
| 6025 | +text*result; |
| 6026 | +inti; |
| 6027 | + |
| 6028 | +form=unicode_norm_form_from_string(formstr); |
| 6029 | + |
| 6030 | +/* convert to pg_wchar */ |
| 6031 | +size=pg_mbstrlen_with_len(VARDATA_ANY(input),VARSIZE_ANY_EXHDR(input)); |
| 6032 | +input_chars=palloc((size+1)*sizeof(pg_wchar)); |
| 6033 | +p= (unsignedchar*)VARDATA_ANY(input); |
| 6034 | +for (i=0;i<size;i++) |
| 6035 | +{ |
| 6036 | +input_chars[i]=utf8_to_unicode(p); |
| 6037 | +p+=pg_utf_mblen(p); |
| 6038 | +} |
| 6039 | +input_chars[i]= (pg_wchar)'\0'; |
| 6040 | +Assert((char*)p==VARDATA_ANY(input)+VARSIZE_ANY_EXHDR(input)); |
| 6041 | + |
| 6042 | +/* action */ |
| 6043 | +output_chars=unicode_normalize(form,input_chars); |
| 6044 | + |
| 6045 | +/* convert back to UTF-8 string */ |
| 6046 | +size=0; |
| 6047 | +for (pg_wchar*wp=output_chars;*wp;wp++) |
| 6048 | +{ |
| 6049 | +unsignedcharbuf[4]; |
| 6050 | + |
| 6051 | +unicode_to_utf8(*wp,buf); |
| 6052 | +size+=pg_utf_mblen(buf); |
| 6053 | +} |
| 6054 | + |
| 6055 | +result=palloc(size+VARHDRSZ); |
| 6056 | +SET_VARSIZE(result,size+VARHDRSZ); |
| 6057 | + |
| 6058 | +p= (unsignedchar*)VARDATA_ANY(result); |
| 6059 | +for (pg_wchar*wp=output_chars;*wp;wp++) |
| 6060 | +{ |
| 6061 | +unicode_to_utf8(*wp,p); |
| 6062 | +p+=pg_utf_mblen(p); |
| 6063 | +} |
| 6064 | +Assert((char*)p== (char*)result+size+VARHDRSZ); |
| 6065 | + |
| 6066 | +PG_RETURN_TEXT_P(result); |
| 6067 | +} |
| 6068 | + |
| 6069 | +/* |
| 6070 | + * Check whether the string is in the specified Unicode normalization form. |
| 6071 | + * |
| 6072 | + * This is done by convering the string to the specified normal form and then |
| 6073 | + * comparing that to the original string. To speed that up, we also apply the |
| 6074 | + * "quick check" algorithm specified in UAX #15, which can give a yes or no |
| 6075 | + * answer for many strings by just scanning the string once. |
| 6076 | + * |
| 6077 | + * This function should generally be optimized for the case where the string |
| 6078 | + * is in fact normalized. In that case, we'll end up looking at the entire |
| 6079 | + * string, so it's probably not worth doing any incremental conversion etc. |
| 6080 | + */ |
| 6081 | +Datum |
| 6082 | +unicode_is_normalized(PG_FUNCTION_ARGS) |
| 6083 | +{ |
| 6084 | +text*input=PG_GETARG_TEXT_PP(0); |
| 6085 | +char*formstr=text_to_cstring(PG_GETARG_TEXT_PP(1)); |
| 6086 | +UnicodeNormalizationFormform; |
| 6087 | +intsize; |
| 6088 | +pg_wchar*input_chars; |
| 6089 | +pg_wchar*output_chars; |
| 6090 | +unsignedchar*p; |
| 6091 | +inti; |
| 6092 | +UnicodeNormalizationQCquickcheck; |
| 6093 | +intoutput_size; |
| 6094 | +boolresult; |
| 6095 | + |
| 6096 | +form=unicode_norm_form_from_string(formstr); |
| 6097 | + |
| 6098 | +/* convert to pg_wchar */ |
| 6099 | +size=pg_mbstrlen_with_len(VARDATA_ANY(input),VARSIZE_ANY_EXHDR(input)); |
| 6100 | +input_chars=palloc((size+1)*sizeof(pg_wchar)); |
| 6101 | +p= (unsignedchar*)VARDATA_ANY(input); |
| 6102 | +for (i=0;i<size;i++) |
| 6103 | +{ |
| 6104 | +input_chars[i]=utf8_to_unicode(p); |
| 6105 | +p+=pg_utf_mblen(p); |
| 6106 | +} |
| 6107 | +input_chars[i]= (pg_wchar)'\0'; |
| 6108 | +Assert((char*)p==VARDATA_ANY(input)+VARSIZE_ANY_EXHDR(input)); |
| 6109 | + |
| 6110 | +/* quick check (see UAX #15) */ |
| 6111 | +quickcheck=unicode_is_normalized_quickcheck(form,input_chars); |
| 6112 | +if (quickcheck==UNICODE_NORM_QC_YES) |
| 6113 | +PG_RETURN_BOOL(true); |
| 6114 | +elseif (quickcheck==UNICODE_NORM_QC_NO) |
| 6115 | +PG_RETURN_BOOL(false); |
| 6116 | + |
| 6117 | +/* normalize and compare with original */ |
| 6118 | +output_chars=unicode_normalize(form,input_chars); |
| 6119 | + |
| 6120 | +output_size=0; |
| 6121 | +for (pg_wchar*wp=output_chars;*wp;wp++) |
| 6122 | +output_size++; |
| 6123 | + |
| 6124 | +result= (size==output_size)&& |
| 6125 | +(memcmp(input_chars,output_chars,size*sizeof(pg_wchar))==0); |
| 6126 | + |
| 6127 | +PG_RETURN_BOOL(result); |
| 6128 | +} |