@@ -2042,7 +2042,6 @@ def tobytes_first(x, conv):
20422042strcolidx = [i for (i ,v )in enumerate (column_types )
20432043if v == np .unicode_ ]
20442044
2045- type_str = np .unicode_
20462045if byte_converters and strcolidx :
20472046# convert strings back to bytes for backward compatibility
20482047warnings .warn (
@@ -2058,33 +2057,37 @@ def encode_unicode_cols(row_tup):
20582057
20592058try :
20602059data = [encode_unicode_cols (r )for r in data ]
2061- type_str = np .bytes_
20622060except UnicodeEncodeError :
20632061pass
2062+ else :
2063+ for i in strcolidx :
2064+ column_types [i ]= np .bytes_
20642065
2066+ # Update string types to be the right length
2067+ sized_column_types = column_types [:]
2068+ for i ,col_type in enumerate (column_types ):
2069+ if np .issubdtype (col_type ,np .character ):
2070+ n_chars = max (len (row [i ])for row in data )
2071+ sized_column_types [i ]= (col_type ,n_chars )
20652072
2066- # ... and take the largest number of chars.
2067- for i in strcolidx :
2068- max_line_length = max (len (row [i ])for row in data )
2069- column_types [i ]= np .dtype ((type_str ,max_line_length ))
2070- #
20712073if names is None :
2072- # If the dtype is uniform, don't define names, else use ''
2073- base = set ([c .type for c in converters if c ._checked ])
2074+ # If the dtype is uniform (before sizing strings), use it as is
2075+ base = set ([
2076+ c_type
2077+ for c ,c_type in zip (converters ,column_types )
2078+ if c ._checked ])
20742079if len (base )== 1 :
2075- if strcolidx :
2076- (ddtype ,mdtype )= (type_str ,bool )
2077- else :
2078- (ddtype ,mdtype )= (list (base )[0 ],bool )
2080+ uniform_type ,= base
2081+ (ddtype ,mdtype )= (uniform_type ,bool )
20792082else :
20802083ddtype = [(defaultfmt % i ,dt )
2081- for (i ,dt )in enumerate (column_types )]
2084+ for (i ,dt )in enumerate (sized_column_types )]
20822085if usemask :
20832086mdtype = [(defaultfmt % i ,bool )
2084- for (i ,dt )in enumerate (column_types )]
2087+ for (i ,dt )in enumerate (sized_column_types )]
20852088else :
2086- ddtype = list (zip (names ,column_types ))
2087- mdtype = list (zip (names , [bool ]* len (column_types )))
2089+ ddtype = list (zip (names ,sized_column_types ))
2090+ mdtype = list (zip (names , [bool ]* len (sized_column_types )))
20882091output = np .array (data ,dtype = ddtype )
20892092if usemask :
20902093outputmask = np .array (masks ,dtype = mdtype )