Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 3d8b17f

Browse files
authored
fix: support results with STRUCT and ARRAY columns containing JSON subfields in `to_pandas_batches()` (#2216)
* Correctly display DataFrames with JSON columns in anywidget
* Improve JSON type handling for to_gbq and to_pandas_batches
* Revert "Correctly display DataFrames with JSON columns in anywidget". This reverts commit 8c34512.
* Remove unnecessary comment
* code refactor
* testcase update
* Fix testcase
* function call updated in bigframes/core/blocks.py, unused function removed from bigframes/dtypes.py
* revert the code refactor in loader.py, I will use a separate PR for this refactor
* replace the manual construction of the empty DataFrame with the more robust try...except block that leverages to_pyarrow and empty_table
* fix testcase
* existing arrow_to_pandas() helper that properly handles dtype conversion
* testcase update
* refactor testcase
* Add pyarrow id to comments
1 parent 94c8b3c, commit 3d8b17f

File tree

2 files changed

+96
-6
lines changed

2 files changed

+96
-6
lines changed

‎bigframes/core/blocks.py‎

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@
6868
importbigframes.operations.aggregationsasagg_ops
6969
frombigframes.sessionimportdry_runs,execution_spec
7070
frombigframes.sessionimportexecutorasexecutors
71+
frombigframes.session._ioimportpandasasio_pandas
7172

7273
# Type constraint for wherever column labels are used
7374
Label=typing.Hashable
@@ -711,12 +712,15 @@ def to_pandas_batches(
711712
# To reduce the number of edge cases to consider when working with the
712713
# results of this, always return at least one DataFrame. See:
713714
# b/428918844.
714-
empty_val=pd.DataFrame(
715-
{
716-
col:pd.Series([],dtype=self.expr.get_column_type(col))
717-
forcolinitertools.chain(self.value_columns,self.index_columns)
718-
}
719-
)
715+
try:
716+
empty_arrow_table=self.expr.schema.to_pyarrow().empty_table()
717+
exceptpa.ArrowNotImplementedError:
718+
# Bug with some pyarrow versions(https://github.com/apache/arrow/issues/45262),
719+
# empty_table only supports base storage types, not extension types.
720+
empty_arrow_table=self.expr.schema.to_pyarrow(
721+
use_storage_types=True
722+
).empty_table()
723+
empty_val=io_pandas.arrow_to_pandas(empty_arrow_table,self.expr.schema)
720724
dfs=map(
721725
lambdaa:a[0],
722726
itertools.zip_longest(

‎tests/system/small/test_dataframe_io.py‎

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -376,6 +376,92 @@ def test_to_pandas_batches_w_empty_dataframe(session):
376376
pandas.testing.assert_series_equal(results[0].dtypes,empty.dtypes)
377377

378378

379+
@pytest.mark.skipif(
380+
bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable,
381+
reason="Test for pandas 1.x behavior only",
382+
)
383+
deftest_to_pandas_batches_preserves_dtypes_for_populated_nested_json_pandas1(session):
384+
"""Verifies to_pandas_batches() preserves dtypes for nested JSON in pandas 1.x."""
385+
sql="""
386+
SELECT
387+
0 AS id,
388+
[JSON '{"a":1}', JSON '{"b":2}'] AS json_array,
389+
STRUCT(JSON '{"x":1}' AS json_field, 'test' AS str_field) AS json_struct
390+
"""
391+
df=session.read_gbq(sql,index_col="id")
392+
batches=list(df.to_pandas_batches())
393+
394+
assertbatches[0].dtypes["json_array"]=="object"
395+
assertisinstance(batches[0].dtypes["json_struct"],pd.ArrowDtype)
396+
397+
398+
@pytest.mark.skipif(
399+
notbigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable,
400+
reason="Test for pandas 2.x behavior only",
401+
)
402+
deftest_to_pandas_batches_preserves_dtypes_for_populated_nested_json_pandas2(session):
403+
"""Verifies to_pandas_batches() preserves dtypes for nested JSON in pandas 2.x."""
404+
sql="""
405+
SELECT
406+
0 AS id,
407+
[JSON '{"a":1}', JSON '{"b":2}'] AS json_array,
408+
STRUCT(JSON '{"x":1}' AS json_field, 'test' AS str_field) AS json_struct
409+
"""
410+
df=session.read_gbq(sql,index_col="id")
411+
batches=list(df.to_pandas_batches())
412+
413+
assertisinstance(batches[0].dtypes["json_array"],pd.ArrowDtype)
414+
assertisinstance(batches[0].dtypes["json_array"].pyarrow_dtype,pa.ListType)
415+
assertisinstance(batches[0].dtypes["json_struct"],pd.ArrowDtype)
416+
417+
418+
@pytest.mark.skipif(
419+
bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable,
420+
reason="Test for pandas 1.x behavior only",
421+
)
422+
deftest_to_pandas_batches_should_not_error_on_empty_nested_json_pandas1(session):
423+
"""Verify to_pandas_batches() works with empty nested JSON types in pandas 1.x."""
424+
425+
sql="""
426+
SELECT
427+
1 AS id,
428+
[] AS json_array,
429+
STRUCT(NULL AS json_field, 'test2' AS str_field) AS json_struct
430+
"""
431+
df=session.read_gbq(sql,index_col="id")
432+
433+
# The main point: this should not raise an error
434+
batches=list(df.to_pandas_batches())
435+
assertsum(len(b)forbinbatches)==1
436+
437+
assertbatches[0].dtypes["json_array"]=="object"
438+
assertisinstance(batches[0].dtypes["json_struct"],pd.ArrowDtype)
439+
440+
441+
@pytest.mark.skipif(
442+
notbigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable,
443+
reason="Test for pandas 2.x behavior only",
444+
)
445+
deftest_to_pandas_batches_should_not_error_on_empty_nested_json_pandas2(session):
446+
"""Verify to_pandas_batches() works with empty nested JSON types in pandas 2.x."""
447+
448+
sql="""
449+
SELECT
450+
1 AS id,
451+
[] AS json_array,
452+
STRUCT(NULL AS json_field, 'test2' AS str_field) AS json_struct
453+
"""
454+
df=session.read_gbq(sql,index_col="id")
455+
456+
# The main point: this should not raise an error
457+
batches=list(df.to_pandas_batches())
458+
assertsum(len(b)forbinbatches)==1
459+
460+
assertisinstance(batches[0].dtypes["json_array"],pd.ArrowDtype)
461+
assertisinstance(batches[0].dtypes["json_struct"],pd.ArrowDtype)
462+
assertisinstance(batches[0].dtypes["json_struct"].pyarrow_dtype,pa.StructType)
463+
464+
379465
@pytest.mark.parametrize("allow_large_results", (True,False))
380466
deftest_to_pandas_batches_w_page_size_and_max_results(session,allow_large_results):
381467
"""Verify to_pandas_batches() APIs returns the expected page size.

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp