88 *
99 *
1010 * IDENTIFICATION
11- * $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.236 2004/12/31 21:59:41 pgsql Exp $
11+ * $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.237 2005/03/12 05:41:34 momjian Exp $
1212 *
1313 *-------------------------------------------------------------------------
1414 */
@@ -98,7 +98,6 @@ static bool fe_eof;/* true if detected end of copy data */
9898static EolType eol_type ;/* EOL type of input */
9999static int client_encoding ;/* remote side's character encoding */
100100static int server_encoding ;/* local encoding */
101- static bool embedded_line_warning ;
102101
103102/* these are just for error messages, see copy_in_error_callback */
104103static bool copy_binary ;/* is it a binary copy? */
@@ -139,7 +138,7 @@ static void CopyTo(Relation rel, List *attnumlist, bool binary, bool oids,
139138static void CopyFrom (Relation rel ,List * attnumlist ,bool binary ,bool oids ,
140139char * delim ,char * null_print ,bool csv_mode ,char * quote ,char * escape ,
141140List * force_notnull_atts );
142- static bool CopyReadLine (void );
141+ static bool CopyReadLine (char * quote , char * escape );
143142static char * CopyReadAttribute (const char * delim ,const char * null_print ,
144143CopyReadResult * result ,bool * isnull );
145144static char * CopyReadAttributeCSV (const char * delim ,const char * null_print ,
@@ -1191,7 +1190,6 @@ CopyTo(Relation rel, List *attnumlist, bool binary, bool oids,
11911190attr = tupDesc -> attrs ;
11921191num_phys_attrs = tupDesc -> natts ;
11931192attr_count = list_length (attnumlist );
1194- embedded_line_warning = false;
11951193
11961194/*
11971195 * Get info about the columns we need to process.
@@ -1718,7 +1716,8 @@ CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
17181716ListCell * cur ;
17191717
17201718/* Actually read the line into memory here */
1721- done = CopyReadLine ();
1719+ done = csv_mode ?
1720+ CopyReadLine (quote ,escape ) :CopyReadLine (NULL ,NULL );
17221721
17231722/*
17241723 * EOF at start of line means we're done. If we see EOF after
@@ -2006,7 +2005,7 @@ CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
20062005 * by newline.
20072006 */
20082007static bool
2009- CopyReadLine (void )
2008+ CopyReadLine (char * quote , char * escape )
20102009{
20112010bool result ;
20122011bool change_encoding = (client_encoding != server_encoding );
@@ -2015,6 +2014,19 @@ CopyReadLine(void)
20152014int j ;
20162015unsignedchar s [2 ];
20172016char * cvt ;
2017+ bool in_quote = false,last_was_esc = false,csv_mode = false;
2018+ char quotec = '\0' ,escapec = '\0' ;
2019+
2020+ if (quote )
2021+ {
2022+ csv_mode = true;
2023+ quotec = quote [0 ];
2024+ escapec = escape [0 ];
2025+ /* ignore special escape processing if it's the same as quotec */
2026+ if (quotec == escapec )
2027+ escapec = '\0' ;
2028+ }
2029+
20182030
20192031s [1 ]= 0 ;
20202032
@@ -2031,11 +2043,20 @@ CopyReadLine(void)
20312043
20322044/*
20332045 * In this loop we only care for detecting newlines (\r and/or \n) and
2034- * the end-of-copy marker (\.). For backwards compatibility we allow
2046+ * the end-of-copy marker (\.).
2047+ *
2048+ * In Text mode, for backwards compatibility we allow
20352049 * backslashes to escape newline characters. Backslashes other than
20362050 * the end marker get put into the line_buf, since CopyReadAttribute
2037- * does its own escape processing.These four characters, and only
2038- * these four, are assumed the same in frontend and backend encodings.
2051+ * does its own escape processing.
2052+ *
2053+ * In CSV mode, CR and NL inside a quoted field are just part of the
2054+ * data value and are put in line_buf. We keep just enough state
2055+ * to know if we are currently in a quoted field or not.
2056+ *
2057+ * These four characters, and only these four, are assumed the same in
2058+ * frontend and backend encodings.
2059+ *
20392060 * We do not assume that second and later bytes of a frontend
20402061 * multibyte character couldn't look like ASCII characters.
20412062 */
@@ -2047,13 +2068,49 @@ CopyReadLine(void)
20472068result = true;
20482069break ;
20492070}
2050- if (c == '\r' )
2071+
2072+ if (csv_mode )
2073+ {
2074+ /*
2075+ * Dealing with quotes and escapes here is mildly tricky. If the
2076+ * quote char is also the escape char, there's no problem - we
2077+ * just use the char as a toggle. If they are different, we need
2078+ * to ensure that we only take account of an escape inside a quoted
2079+ * field and immediately preceding a quote char, and not the
2080+ * second in an escape-escape sequence.
2081+ */
2082+
2083+ if (in_quote && c == escapec )
2084+ last_was_esc = !last_was_esc ;
2085+ if (c == quotec && !last_was_esc )
2086+ in_quote = !in_quote ;
2087+ if (c != escapec )
2088+ last_was_esc = false;
2089+
2090+ /*
2091+ * updating the line count for embedded CR and/or LF chars is
2092+ * necessarily a little fragile - this test is probably about
2093+ * the best we can do.
2094+ */
2095+ if (in_quote && c == (eol_type == EOL_CR ?'\r' :'\n' ))
2096+ copy_lineno ++ ;
2097+ }
2098+
2099+ if (!in_quote && c == '\r' )
20512100{
20522101if (eol_type == EOL_NL )
2053- ereport (ERROR ,
2054- (errcode (ERRCODE_BAD_COPY_FILE_FORMAT ),
2055- errmsg ("literal carriage return found in data" ),
2056- errhint ("Use \"\\r\" to represent carriage return." )));
2102+ {
2103+ if (!csv_mode )
2104+ ereport (ERROR ,
2105+ (errcode (ERRCODE_BAD_COPY_FILE_FORMAT ),
2106+ errmsg ("literal carriage return found in data" ),
2107+ errhint ("Use \"\\r\" to represent carriage return." )));
2108+ else
2109+ ereport (ERROR ,
2110+ (errcode (ERRCODE_BAD_COPY_FILE_FORMAT ),
2111+ errmsg ("unquoted carriage return found in CSV data" ),
2112+ errhint ("Use quoted CSV field to represent carriage return." )));
2113+ }
20572114/* Check for \r\n on first line, _and_ handle \r\n. */
20582115if (eol_type == EOL_UNKNOWN || eol_type == EOL_CRNL )
20592116{
@@ -2068,10 +2125,19 @@ CopyReadLine(void)
20682125{
20692126/* found \r, but no \n */
20702127if (eol_type == EOL_CRNL )
2071- ereport (ERROR ,
2072- (errcode (ERRCODE_BAD_COPY_FILE_FORMAT ),
2073- errmsg ("literal carriage return found in data" ),
2074- errhint ("Use \"\\r\" to represent carriage return." )));
2128+ {
2129+ if (!csv_mode )
2130+ ereport (ERROR ,
2131+ (errcode (ERRCODE_BAD_COPY_FILE_FORMAT ),
2132+ errmsg ("literal carriage return found in data" ),
2133+ errhint ("Use \"\\r\" to represent carriage return." )));
2134+ else
2135+ ereport (ERROR ,
2136+ (errcode (ERRCODE_BAD_COPY_FILE_FORMAT ),
2137+ errmsg ("unquoted carriage return found in data" ),
2138+ errhint ("Use quoted CSV field to represent carriage return." )));
2139+
2140+ }
20752141
20762142/*
20772143 * if we got here, it is the first line and we didn't
@@ -2083,26 +2149,47 @@ CopyReadLine(void)
20832149}
20842150break ;
20852151}
2086- if (c == '\n' )
2152+ if (! in_quote && c == '\n' )
20872153{
20882154if (eol_type == EOL_CR || eol_type == EOL_CRNL )
2089- ereport (ERROR ,
2090- (errcode (ERRCODE_BAD_COPY_FILE_FORMAT ),
2091- errmsg ("literal newline found in data" ),
2092- errhint ("Use \"\\n\" to represent newline." )));
2155+ {
2156+ if (!csv_mode )
2157+ ereport (ERROR ,
2158+ (errcode (ERRCODE_BAD_COPY_FILE_FORMAT ),
2159+ errmsg ("literal newline found in data" ),
2160+ errhint ("Use \"\\n\" to represent newline." )));
2161+ else
2162+ ereport (ERROR ,
2163+ (errcode (ERRCODE_BAD_COPY_FILE_FORMAT ),
2164+ errmsg ("unquoted newline found in data" ),
2165+ errhint ("Use quoted CSV field to represent newline." )));
2166+
2167+ }
20932168eol_type = EOL_NL ;
20942169break ;
20952170}
2096- if (c == '\\' )
2171+
2172+ if ((line_buf .len == 0 || !csv_mode )&& c == '\\' )
20972173{
2098- c = CopyGetChar ();
2099- if (c == EOF )
2174+ int c2 ;
2175+
2176+ if (csv_mode )
2177+ c2 = CopyPeekChar ();
2178+ else
2179+ c2 = c = CopyGetChar ();
2180+
2181+ if (c2 == EOF )
21002182{
21012183result = true;
2184+ if (csv_mode )
2185+ CopyDonePeek (c2 , true);
21022186break ;
21032187}
2104- if (c == '.' )
2188+ if (c2 == '.' )
21052189{
2190+ if (csv_mode )
2191+ CopyDonePeek (c2 , true);/* allow keep calling GetChar() */
2192+
21062193if (eol_type == EOL_CRNL )
21072194{
21082195c = CopyGetChar ();
@@ -2140,8 +2227,12 @@ CopyReadLine(void)
21402227result = true;/* report EOF */
21412228break ;
21422229}
2143- /* not EOF mark, so emit \ and following char literally */
2144- appendStringInfoCharMacro (& line_buf ,'\\' );
2230+
2231+ if (csv_mode )
2232+ CopyDonePeek (c2 , false);/* not a dot, so put it back */
2233+ else
2234+ /* not EOF mark, so emit \ and following char literally */
2235+ appendStringInfoCharMacro (& line_buf ,'\\' );
21452236}
21462237
21472238appendStringInfoCharMacro (& line_buf ,c );
@@ -2369,34 +2460,6 @@ CopyReadAttributeCSV(const char *delim, const char *null_print, char *quote,
23692460
23702461for (;;)
23712462{
2372- /* handle multiline quoted fields */
2373- if (in_quote && line_buf .cursor >=line_buf .len )
2374- {
2375- bool done ;
2376-
2377- switch (eol_type )
2378- {
2379- case EOL_NL :
2380- appendStringInfoString (& attribute_buf ,"\n" );
2381- break ;
2382- case EOL_CR :
2383- appendStringInfoString (& attribute_buf ,"\r" );
2384- break ;
2385- case EOL_CRNL :
2386- appendStringInfoString (& attribute_buf ,"\r\n" );
2387- break ;
2388- case EOL_UNKNOWN :
2389- /* shouldn't happen - just keep going */
2390- break ;
2391- }
2392-
2393- copy_lineno ++ ;
2394- done = CopyReadLine ();
2395- if (done && line_buf .len == 0 )
2396- break ;
2397- start_cursor = line_buf .cursor ;
2398- }
2399-
24002463end_cursor = line_buf .cursor ;
24012464if (line_buf .cursor >=line_buf .len )
24022465break ;
@@ -2629,25 +2692,6 @@ CopyAttributeOutCSV(char *server_string, char *delim, char *quote,
26292692 !use_quote && (c = * test_string )!= '\0' ;
26302693test_string += mblen )
26312694{
2632- /*
2633- * We don't know here what the surrounding line end characters
2634- * might be. It might not even be under postgres' control. So
2635- * we simple warn on ANY embedded line ending character.
2636- *
2637- * This warning will disappear when we make line parsing field-aware,
2638- * so that we can reliably read in embedded line ending characters
2639- * regardless of the file's line-end context.
2640- *
2641- */
2642-
2643- if (!embedded_line_warning && (c == '\n' || c == '\r' ) )
2644- {
2645- embedded_line_warning = true;
2646- elog (WARNING ,
2647- "CSV fields with embedded linefeed or carriage return "
2648- "characters might not be able to be reimported" );
2649- }
2650-
26512695if (c == delimc || c == quotec || c == '\n' || c == '\r' )
26522696use_quote = true;
26532697if (!same_encoding )