Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitf9f2f45

Browse files
authored
fix: omitNaN values when uploading frominsert_rows_from_dataframe (#170)
* fix: omit `NaN` values when uploading from `insert_rows_from_dataframe`NaN values are most often used to indicate a NULL value in pandas. Also,even when a column is a floating point column, the BigQuery streamingAPI JSON parser doesn't seem to be able to handle NaN literals.* doc: update docstring to indicate missing NaNs
1 parent08c12c5 commitf9f2f45

File tree

4 files changed

+110
-9
lines changed

4 files changed

+110
-9
lines changed

‎google/cloud/bigquery/_pandas_helpers.py‎

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
importlogging
2020
importwarnings
2121

22+
importsix
2223
fromsix.movesimportqueue
2324

2425
try:
@@ -780,3 +781,14 @@ def download_dataframe_bqstorage(
780781
selected_fields=selected_fields,
781782
page_to_item=page_to_item,
782783
)
784+
785+
786+
defdataframe_to_json_generator(dataframe):
787+
forrowindataframe.itertuples(index=False,name=None):
788+
output= {}
789+
forcolumn,valueinsix.moves.zip(dataframe.columns,row):
790+
# Omit NaN values.
791+
ifvalue!=value:
792+
continue
793+
output[column]=value
794+
yieldoutput

‎google/cloud/bigquery/client.py‎

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2535,7 +2535,9 @@ def insert_rows_from_dataframe(
25352535
]):
25362536
The destination table for the row data, or a reference to it.
25372537
dataframe (pandas.DataFrame):
2538-
A :class:`~pandas.DataFrame` containing the data to load.
2538+
A :class:`~pandas.DataFrame` containing the data to load. Any
2539+
``NaN`` values present in the dataframe are omitted from the
2540+
streaming API request(s).
25392541
selected_fields (Sequence[google.cloud.bigquery.schema.SchemaField]):
25402542
The fields to return. Required if ``table`` is a
25412543
:class:`~google.cloud.bigquery.table.TableReference`.
@@ -2559,10 +2561,7 @@ def insert_rows_from_dataframe(
25592561
insert_results= []
25602562

25612563
chunk_count=int(math.ceil(len(dataframe)/chunk_size))
2562-
rows_iter= (
2563-
dict(six.moves.zip(dataframe.columns,row))
2564-
forrowindataframe.itertuples(index=False,name=None)
2565-
)
2564+
rows_iter=_pandas_helpers.dataframe_to_json_generator(dataframe)
25662565

25672566
for_inrange(chunk_count):
25682567
rows_chunk=itertools.islice(rows_iter,chunk_size)

‎tests/system.py‎

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2335,6 +2335,14 @@ def test_insert_rows_from_dataframe(self):
23352335
"string_col":"another string",
23362336
"int_col":50,
23372337
},
2338+
{
2339+
"float_col":6.66,
2340+
"bool_col":True,
2341+
# Include a NaN value, because pandas often uses NaN as a
2342+
# NULL value indicator.
2343+
"string_col":float("NaN"),
2344+
"int_col":60,
2345+
},
23382346
]
23392347
)
23402348

@@ -2344,14 +2352,28 @@ def test_insert_rows_from_dataframe(self):
23442352
table=retry_403(Config.CLIENT.create_table)(table_arg)
23452353
self.to_delete.insert(0,table)
23462354

2347-
Config.CLIENT.insert_rows_from_dataframe(table,dataframe,chunk_size=3)
2355+
chunk_errors=Config.CLIENT.insert_rows_from_dataframe(
2356+
table,dataframe,chunk_size=3
2357+
)
2358+
forerrorsinchunk_errors:
2359+
assertnoterrors
23482360

2349-
retry=RetryResult(_has_rows,max_tries=8)
2350-
rows=retry(self._fetch_single_page)(table)
2361+
# Use query to fetch rows instead of listing directly from the table so
2362+
# that we get values from the streaming buffer.
2363+
rows=list(
2364+
Config.CLIENT.query(
2365+
"SELECT * FROM `{}.{}.{}`".format(
2366+
table.project,table.dataset_id,table.table_id
2367+
)
2368+
)
2369+
)
23512370

23522371
sorted_rows=sorted(rows,key=operator.attrgetter("int_col"))
23532372
row_tuples= [r.values()forrinsorted_rows]
2354-
expected= [tuple(data_row)fordata_rowindataframe.itertuples(index=False)]
2373+
expected= [
2374+
tuple(Noneifcol!=colelsecolforcolindata_row)
2375+
fordata_rowindataframe.itertuples(index=False)
2376+
]
23552377

23562378
assertlen(row_tuples)==len(expected)
23572379

‎tests/unit/test_client.py‎

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5582,6 +5582,74 @@ def test_insert_rows_from_dataframe(self):
55825582
)
55835583
assertcall==expected_call
55845584

5585+
@unittest.skipIf(pandasisNone,"Requires `pandas`")
5586+
deftest_insert_rows_from_dataframe_nan(self):
5587+
fromgoogle.cloud.bigquery.schemaimportSchemaField
5588+
fromgoogle.cloud.bigquery.tableimportTable
5589+
5590+
API_PATH="/projects/{}/datasets/{}/tables/{}/insertAll".format(
5591+
self.PROJECT,self.DS_ID,self.TABLE_REF.table_id
5592+
)
5593+
5594+
dataframe=pandas.DataFrame(
5595+
{
5596+
"str_col": ["abc","def",float("NaN"),"jkl"],
5597+
"int_col": [1,float("NaN"),3,4],
5598+
"float_col": [float("NaN"),0.25,0.5,0.125],
5599+
}
5600+
)
5601+
5602+
# create client
5603+
creds=_make_credentials()
5604+
http=object()
5605+
client=self._make_one(project=self.PROJECT,credentials=creds,_http=http)
5606+
conn=client._connection=make_connection({}, {})
5607+
5608+
# create table
5609+
schema= [
5610+
SchemaField("str_col","STRING"),
5611+
SchemaField("int_col","INTEGER"),
5612+
SchemaField("float_col","FLOAT"),
5613+
]
5614+
table=Table(self.TABLE_REF,schema=schema)
5615+
5616+
withmock.patch("uuid.uuid4",side_effect=map(str,range(len(dataframe)))):
5617+
error_info=client.insert_rows_from_dataframe(
5618+
table,dataframe,chunk_size=3,timeout=7.5
5619+
)
5620+
5621+
self.assertEqual(len(error_info),2)
5622+
forchunk_errorsinerror_info:
5623+
assertchunk_errors== []
5624+
5625+
EXPECTED_SENT_DATA= [
5626+
{
5627+
"rows": [
5628+
{"insertId":"0","json": {"str_col":"abc","int_col":1}},
5629+
{"insertId":"1","json": {"str_col":"def","float_col":0.25}},
5630+
{"insertId":"2","json": {"int_col":3,"float_col":0.5}},
5631+
]
5632+
},
5633+
{
5634+
"rows": [
5635+
{
5636+
"insertId":"3",
5637+
"json": {"str_col":"jkl","int_col":4,"float_col":0.125},
5638+
}
5639+
]
5640+
},
5641+
]
5642+
5643+
actual_calls=conn.api_request.call_args_list
5644+
5645+
forcall,expected_datainsix.moves.zip_longest(
5646+
actual_calls,EXPECTED_SENT_DATA
5647+
):
5648+
expected_call=mock.call(
5649+
method="POST",path=API_PATH,data=expected_data,timeout=7.5
5650+
)
5651+
assertcall==expected_call
5652+
55855653
@unittest.skipIf(pandasisNone,"Requires `pandas`")
55865654
deftest_insert_rows_from_dataframe_many_columns(self):
55875655
fromgoogle.cloud.bigquery.schemaimportSchemaField

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp