Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 5e4465d

Browse files
authored
feat: add bool, int, float, string dtype to to_dataframe (#1529)
1 parent a2520ca commit 5e4465d

File tree

5 files changed

+294
-12
lines changed

5 files changed

+294
-12
lines changed

‎google/cloud/bigquery/_pandas_helpers.py‎

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
importlogging
2222
importqueue
2323
importwarnings
24+
fromtypingimportAny,Union
2425

2526
frompackagingimportversion
2627

@@ -283,7 +284,13 @@ def bq_to_arrow_schema(bq_schema):
283284
returnpyarrow.schema(arrow_fields)
284285

285286

286-
defdefault_types_mapper(date_as_object:bool=False):
287+
defdefault_types_mapper(
288+
date_as_object:bool=False,
289+
bool_dtype:Union[Any,None]=None,
290+
int_dtype:Union[Any,None]=None,
291+
float_dtype:Union[Any,None]=None,
292+
string_dtype:Union[Any,None]=None,
293+
):
287294
"""Create a mapping from pyarrow types to pandas types.
288295
289296
This overrides the pandas defaults to use null-safe extension types where
@@ -299,8 +306,17 @@ def default_types_mapper(date_as_object: bool = False):
299306
"""
300307

301308
deftypes_mapper(arrow_data_type):
302-
ifpyarrow.types.is_boolean(arrow_data_type):
303-
returnpandas.BooleanDtype()
309+
ifbool_dtypeisnotNoneandpyarrow.types.is_boolean(arrow_data_type):
310+
returnbool_dtype
311+
312+
elifint_dtypeisnotNoneandpyarrow.types.is_integer(arrow_data_type):
313+
returnint_dtype
314+
315+
eliffloat_dtypeisnotNoneandpyarrow.types.is_floating(arrow_data_type):
316+
returnfloat_dtype
317+
318+
elifstring_dtypeisnotNoneandpyarrow.types.is_string(arrow_data_type):
319+
returnstring_dtype
304320

305321
elif (
306322
# If date_as_object is True, we know some DATE columns are
@@ -310,9 +326,6 @@ def types_mapper(arrow_data_type):
310326
):
311327
returndb_dtypes.DateDtype()
312328

313-
elifpyarrow.types.is_integer(arrow_data_type):
314-
returnpandas.Int64Dtype()
315-
316329
elifpyarrow.types.is_time(arrow_data_type):
317330
returndb_dtypes.TimeDtype()
318331

‎google/cloud/bigquery/enums.py‎

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,20 @@ class CreateDisposition(object):
7777
returned in the job result."""
7878

7979

80+
classDefaultPandasDTypes(enum.Enum):
81+
"""Default pandas DataFrame dtypes used to convert BigQuery data. These
82+
sentinel values are used instead of None to maintain backward compatibility
83+
and to allow for the pandas package not being available. For more information:
84+
https://stackoverflow.com/a/60605919/101923
85+
"""
86+
87+
BOOL_DTYPE=object()
88+
"""Specifies default bool dtype"""
89+
90+
INT_DTYPE=object()
91+
"""Specifies default integer dtype"""
92+
93+
8094
classDestinationFormat(object):
8195
"""The exported file format. The default value is :attr:`CSV`.
8296

‎google/cloud/bigquery/job/query.py‎

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
fromgoogle.cloud.bigquery.datasetimportDatasetListItem
2929
fromgoogle.cloud.bigquery.datasetimportDatasetReference
3030
fromgoogle.cloud.bigquery.encryption_configurationimportEncryptionConfiguration
31-
fromgoogle.cloud.bigquery.enumsimportKeyResultStatementKind
31+
fromgoogle.cloud.bigquery.enumsimportKeyResultStatementKind,DefaultPandasDTypes
3232
fromgoogle.cloud.bigquery.external_configimportExternalConfig
3333
fromgoogle.cloud.bigqueryimport_helpers
3434
fromgoogle.cloud.bigquery.queryimport (
@@ -53,6 +53,11 @@
5353
fromgoogle.cloud.bigquery.job.baseimport_JobConfig
5454
fromgoogle.cloud.bigquery.job.baseimport_JobReference
5555

56+
try:
57+
importpandas# type: ignore
58+
exceptImportError:# pragma: NO COVER
59+
pandas=None
60+
5661
iftyping.TYPE_CHECKING:# pragma: NO COVER
5762
# Assumption: type checks are only used by library developers and CI environments
5863
# that have all optional dependencies installed, thus no conditional imports.
@@ -1620,6 +1625,10 @@ def to_dataframe(
16201625
create_bqstorage_client:bool=True,
16211626
max_results:Optional[int]=None,
16221627
geography_as_object:bool=False,
1628+
bool_dtype:Union[Any,None]=DefaultPandasDTypes.BOOL_DTYPE,
1629+
int_dtype:Union[Any,None]=DefaultPandasDTypes.INT_DTYPE,
1630+
float_dtype:Union[Any,None]=None,
1631+
string_dtype:Union[Any,None]=None,
16231632
)->"pandas.DataFrame":
16241633
"""Return a pandas DataFrame from a QueryJob
16251634
@@ -1672,6 +1681,46 @@ def to_dataframe(
16721681
16731682
.. versionadded:: 2.24.0
16741683
1684+
bool_dtype (Optional[pandas.Series.dtype, None]):
1685+
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.BooleanDtype()``)
1686+
to convert BigQuery Boolean type, instead of relying on the default
1687+
``pandas.BooleanDtype()``. If you explicitly set the value to ``None``,
1688+
then the data type will be ``numpy.dtype("bool")``. BigQuery Boolean
1689+
type can be found at:
1690+
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type
1691+
1692+
.. versionadded:: 3.7.1
1693+
1694+
int_dtype (Optional[pandas.Series.dtype, None]):
1695+
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``)
1696+
to convert BigQuery Integer types, instead of relying on the default
1697+
``pandas.Int64Dtype()``. If you explicitly set the value to ``None``,
1698+
then the data type will be ``numpy.dtype("int64")``. A list of BigQuery
1699+
Integer types can be found at:
1700+
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types
1701+
1702+
.. versionadded:: 3.7.1
1703+
1704+
float_dtype (Optional[pandas.Series.dtype, None]):
1705+
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``)
1706+
to convert BigQuery Float type, instead of relying on the default
1707+
``numpy.dtype("float64")``. If you explicitly set the value to ``None``,
1708+
then the data type will be ``numpy.dtype("float64")``. BigQuery Float
1709+
type can be found at:
1710+
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types
1711+
1712+
.. versionadded:: 3.7.1
1713+
1714+
string_dtype (Optional[pandas.Series.dtype, None]):
1715+
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to
1716+
convert BigQuery String type, instead of relying on the default
1717+
``numpy.dtype("object")``. If you explicitly set the value to ``None``,
1718+
then the data type will be ``numpy.dtype("object")``. BigQuery String
1719+
type can be found at:
1720+
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type
1721+
1722+
.. versionadded:: 3.7.1
1723+
16751724
Returns:
16761725
pandas.DataFrame:
16771726
A :class:`~pandas.DataFrame` populated with row data
@@ -1694,6 +1743,10 @@ def to_dataframe(
16941743
progress_bar_type=progress_bar_type,
16951744
create_bqstorage_client=create_bqstorage_client,
16961745
geography_as_object=geography_as_object,
1746+
bool_dtype=bool_dtype,
1747+
int_dtype=int_dtype,
1748+
float_dtype=float_dtype,
1749+
string_dtype=string_dtype,
16971750
)
16981751

16991752
# If changing the signature of this method, make sure to apply the same

‎google/cloud/bigquery/table.py‎

Lines changed: 94 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,11 @@
3434
exceptImportError:# pragma: NO COVER
3535
pyarrow=None
3636

37+
try:
38+
importdb_dtypes# type: ignore
39+
exceptImportError:# pragma: NO COVER
40+
db_dtypes=None
41+
3742
try:
3843
importgeopandas# type: ignore
3944
exceptImportError:
@@ -55,6 +60,7 @@
5560
importgoogle.cloud._helpers# type: ignore
5661
fromgoogle.cloud.bigqueryimport_helpers
5762
fromgoogle.cloud.bigqueryimport_pandas_helpers
63+
fromgoogle.cloud.bigquery.enumsimportDefaultPandasDTypes
5864
fromgoogle.cloud.bigquery.exceptionsimportLegacyBigQueryStorageError
5965
fromgoogle.cloud.bigquery.schemaimport_build_schema_resource
6066
fromgoogle.cloud.bigquery.schemaimport_parse_schema_resource
@@ -88,6 +94,11 @@
8894

8995
_TABLE_HAS_NO_SCHEMA='Table has no schema: call "client.get_table()"'
9096

97+
_NO_SUPPORTED_DTYPE= (
98+
"The dtype cannot to be converted to a pandas ExtensionArray "
99+
"because the necessary `__from_arrow__` attribute is missing."
100+
)
101+
91102

92103
def_reference_getter(table):
93104
"""A :class:`~google.cloud.bigquery.table.TableReference` pointing to
@@ -1920,6 +1931,10 @@ def to_dataframe(
19201931
progress_bar_type:str=None,
19211932
create_bqstorage_client:bool=True,
19221933
geography_as_object:bool=False,
1934+
bool_dtype:Union[Any,None]=DefaultPandasDTypes.BOOL_DTYPE,
1935+
int_dtype:Union[Any,None]=DefaultPandasDTypes.INT_DTYPE,
1936+
float_dtype:Union[Any,None]=None,
1937+
string_dtype:Union[Any,None]=None,
19231938
)->"pandas.DataFrame":
19241939
"""Create a pandas DataFrame by loading all pages of a query.
19251940
@@ -1958,6 +1973,7 @@ def to_dataframe(
19581973
progress bar as a graphical dialog box.
19591974
19601975
.. versionadded:: 1.11.0
1976+
19611977
create_bqstorage_client (Optional[bool]):
19621978
If ``True`` (default), create a BigQuery Storage API client
19631979
using the default API settings. The BigQuery Storage API
@@ -1975,6 +1991,46 @@ def to_dataframe(
19751991
19761992
.. versionadded:: 2.24.0
19771993
1994+
bool_dtype (Optional[pandas.Series.dtype, None]):
1995+
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.BooleanDtype()``)
1996+
to convert BigQuery Boolean type, instead of relying on the default
1997+
``pandas.BooleanDtype()``. If you explicitly set the value to ``None``,
1998+
then the data type will be ``numpy.dtype("bool")``. BigQuery Boolean
1999+
type can be found at:
2000+
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type
2001+
2002+
.. versionadded:: 3.7.1
2003+
2004+
int_dtype (Optional[pandas.Series.dtype, None]):
2005+
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``)
2006+
to convert BigQuery Integer types, instead of relying on the default
2007+
``pandas.Int64Dtype()``. If you explicitly set the value to ``None``,
2008+
then the data type will be ``numpy.dtype("int64")``. A list of BigQuery
2009+
Integer types can be found at:
2010+
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types
2011+
2012+
.. versionadded:: 3.7.1
2013+
2014+
float_dtype (Optional[pandas.Series.dtype, None]):
2015+
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``)
2016+
to convert BigQuery Float type, instead of relying on the default
2017+
``numpy.dtype("float64")``. If you explicitly set the value to ``None``,
2018+
then the data type will be ``numpy.dtype("float64")``. BigQuery Float
2019+
type can be found at:
2020+
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types
2021+
2022+
.. versionadded:: 3.7.1
2023+
2024+
string_dtype (Optional[pandas.Series.dtype, None]):
2025+
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to
2026+
convert BigQuery String type, instead of relying on the default
2027+
``numpy.dtype("object")``. If you explicitly set the value to ``None``,
2028+
then the data type will be ``numpy.dtype("object")``. BigQuery String
2029+
type can be found at:
2030+
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type
2031+
2032+
.. versionadded:: 3.7.1
2033+
19782034
Returns:
19792035
pandas.DataFrame:
19802036
A :class:`~pandas.DataFrame` populated with row data and column
@@ -1987,14 +2043,34 @@ def to_dataframe(
19872043
the :mod:`google.cloud.bigquery_storage_v1` module is
19882044
required but cannot be imported. Also if
19892045
`geography_as_object` is `True`, but the
1990-
:mod:`shapely` library cannot be imported.
2046+
:mod:`shapely` library cannot be imported. Also if
2047+
`bool_dtype`, `int_dtype` or any other dtype parameter
2048+
is not a supported dtype.
19912049
19922050
"""
19932051
_pandas_helpers.verify_pandas_imports()
19942052

19952053
ifgeography_as_objectandshapelyisNone:
19962054
raiseValueError(_NO_SHAPELY_ERROR)
19972055

2056+
ifbool_dtypeisDefaultPandasDTypes.BOOL_DTYPE:
2057+
bool_dtype=pandas.BooleanDtype()
2058+
2059+
ifint_dtypeisDefaultPandasDTypes.INT_DTYPE:
2060+
int_dtype=pandas.Int64Dtype()
2061+
2062+
ifbool_dtypeisnotNoneandnothasattr(bool_dtype,"__from_arrow__"):
2063+
raiseValueError("bool_dtype",_NO_SUPPORTED_DTYPE)
2064+
2065+
ifint_dtypeisnotNoneandnothasattr(int_dtype,"__from_arrow__"):
2066+
raiseValueError("int_dtype",_NO_SUPPORTED_DTYPE)
2067+
2068+
iffloat_dtypeisnotNoneandnothasattr(float_dtype,"__from_arrow__"):
2069+
raiseValueError("float_dtype",_NO_SUPPORTED_DTYPE)
2070+
2071+
ifstring_dtypeisnotNoneandnothasattr(string_dtype,"__from_arrow__"):
2072+
raiseValueError("string_dtype",_NO_SUPPORTED_DTYPE)
2073+
19982074
ifdtypesisNone:
19992075
dtypes= {}
20002076

@@ -2019,15 +2095,15 @@ def to_dataframe(
20192095
forcolinrecord_batch
20202096
# Type can be date32 or date64 (plus units).
20212097
# See: https://arrow.apache.org/docs/python/api/datatypes.html
2022-
ifstr(col.type).startswith("date")
2098+
ifpyarrow.types.is_date(col.type)
20232099
)
20242100

20252101
timestamp_as_object=notall(
20262102
self.__can_cast_timestamp_ns(col)
20272103
forcolinrecord_batch
2028-
# Type can be timestamp (plus units and time zone).
2104+
# Type can be datetime and timestamp (plus units and time zone).
20292105
# See: https://arrow.apache.org/docs/python/api/datatypes.html
2030-
ifstr(col.type).startswith("timestamp")
2106+
ifpyarrow.types.is_timestamp(col.type)
20312107
)
20322108

20332109
iflen(record_batch)>0:
@@ -2036,7 +2112,11 @@ def to_dataframe(
20362112
timestamp_as_object=timestamp_as_object,
20372113
integer_object_nulls=True,
20382114
types_mapper=_pandas_helpers.default_types_mapper(
2039-
date_as_object=date_as_object
2115+
date_as_object=date_as_object,
2116+
bool_dtype=bool_dtype,
2117+
int_dtype=int_dtype,
2118+
float_dtype=float_dtype,
2119+
string_dtype=string_dtype,
20402120
),
20412121
)
20422122
else:
@@ -2233,6 +2313,10 @@ def to_dataframe(
22332313
progress_bar_type=None,
22342314
create_bqstorage_client=True,
22352315
geography_as_object=False,
2316+
bool_dtype=None,
2317+
int_dtype=None,
2318+
float_dtype=None,
2319+
string_dtype=None,
22362320
)->"pandas.DataFrame":
22372321
"""Create an empty dataframe.
22382322
@@ -2241,6 +2325,11 @@ def to_dataframe(
22412325
dtypes (Any): Ignored. Added for compatibility with RowIterator.
22422326
progress_bar_type (Any): Ignored. Added for compatibility with RowIterator.
22432327
create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator.
2328+
geography_as_object (bool): Ignored. Added for compatibility with RowIterator.
2329+
bool_dtype (Any): Ignored. Added for compatibility with RowIterator.
2330+
int_dtype (Any): Ignored. Added for compatibility with RowIterator.
2331+
float_dtype (Any): Ignored. Added for compatibility with RowIterator.
2332+
string_dtype (Any): Ignored. Added for compatibility with RowIterator.
22442333
22452334
Returns:
22462335
pandas.DataFrame: An empty :class:`~pandas.DataFrame`.

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp