Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit16f65e6

Browse files
author
Jim Fulton
authored
feat: Support using GeoPandas for GEOGRAPHY columns (#848)
1 parent5c5b4b8 commit16f65e6

File tree

16 files changed

+1102
-29
lines changed

16 files changed

+1102
-29
lines changed

‎docs/conf.py‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,8 @@
366366
"grpc": ("https://grpc.github.io/grpc/python/",None),
367367
"proto-plus": ("https://proto-plus-python.readthedocs.io/en/latest/",None),
368368
"protobuf": ("https://googleapis.dev/python/protobuf/latest/",None),
369+
"pandas": ("http://pandas.pydata.org/pandas-docs/dev",None),
370+
"geopandas": ("https://geopandas.org/",None),
369371
}
370372

371373

‎docs/usage/pandas.rst‎

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,21 @@ To retrieve table rows as a :class:`pandas.DataFrame`:
3737
:start-after: [START bigquery_list_rows_dataframe]
3838
:end-before: [END bigquery_list_rows_dataframe]
3939

40+
41+
Retrieve BigQuery GEOGRAPHY data as a GeoPandas GeoDataFrame
42+
------------------------------------------------------------
43+
44+
`GeoPandas<https://geopandas.org/>`_ adds geospatial analytics
45+
capabilities to Pandas. To retrieve query results containing
46+
GEOGRAPHY data as a:class:`geopandas.GeoDataFrame`:
47+
48+
..literalinclude::../samples/geography/to_geodataframe.py
49+
:language: python
50+
:dedent: 4
51+
:start-after: [START bigquery_query_results_geodataframe]
52+
:end-before: [END bigquery_query_results_geodataframe]
53+
54+
4055
Load a Pandas DataFrame to a BigQuery Table
4156
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
4257

‎google/cloud/bigquery/_pandas_helpers.py‎

Lines changed: 67 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,36 @@
2424
importpandas
2525
exceptImportError:# pragma: NO COVER
2626
pandas=None
27+
else:
28+
importnumpy
29+
30+
try:
31+
# _BaseGeometry is used to detect shapely objevys in `bq_to_arrow_array`
32+
fromshapely.geometry.baseimportBaseGeometryas_BaseGeometry
33+
exceptImportError:# pragma: NO COVER
34+
# No shapely, use NoneType for _BaseGeometry as a placeholder.
35+
_BaseGeometry=type(None)
36+
else:
37+
ifpandasisnotNone:# pragma: NO COVER
38+
39+
def_to_wkb():
40+
# Create a closure that:
41+
# - Adds a not-null check. This allows the returned function to
42+
# be used directly with apply, unlike `shapely.wkb.dumps`.
43+
# - Avoid extra work done by `shapely.wkb.dumps` that we don't need.
44+
# - Caches the WKBWriter (and write method lookup :) )
45+
# - Avoids adding WKBWriter, lgeos, and notnull to the module namespace.
46+
fromshapely.geosimportWKBWriter,lgeos
47+
48+
write=WKBWriter(lgeos).write
49+
notnull=pandas.notnull
50+
51+
def_to_wkb(v):
52+
returnwrite(v)ifnotnull(v)elsev
53+
54+
return_to_wkb
55+
56+
_to_wkb=_to_wkb()
2757

2858
try:
2959
importpyarrow
@@ -69,6 +99,7 @@
6999
"uint8":"INTEGER",
70100
"uint16":"INTEGER",
71101
"uint32":"INTEGER",
102+
"geometry":"GEOGRAPHY",
72103
}
73104

74105

@@ -193,14 +224,16 @@ def bq_to_arrow_data_type(field):
193224
returndata_type_constructor()
194225

195226

196-
defbq_to_arrow_field(bq_field):
227+
defbq_to_arrow_field(bq_field,array_type=None):
197228
"""Return the Arrow field, corresponding to a given BigQuery column.
198229
199230
Returns:
200231
None: if the Arrow type cannot be determined.
201232
"""
202233
arrow_type=bq_to_arrow_data_type(bq_field)
203-
ifarrow_type:
234+
ifarrow_typeisnotNone:
235+
ifarray_typeisnotNone:
236+
arrow_type=array_type# For GEOGRAPHY, at least initially
204237
is_nullable=bq_field.mode.upper()=="NULLABLE"
205238
returnpyarrow.field(bq_field.name,arrow_type,nullable=is_nullable)
206239

@@ -225,7 +258,24 @@ def bq_to_arrow_schema(bq_schema):
225258

226259

227260
defbq_to_arrow_array(series,bq_field):
228-
arrow_type=bq_to_arrow_data_type(bq_field)
261+
ifbq_field.field_type.upper()=="GEOGRAPHY":
262+
arrow_type=None
263+
first=_first_valid(series)
264+
iffirstisnotNone:
265+
ifseries.dtype.name=="geometry"orisinstance(first,_BaseGeometry):
266+
arrow_type=pyarrow.binary()
267+
# Convert shapey geometry to WKB binary format:
268+
series=series.apply(_to_wkb)
269+
elifisinstance(first,bytes):
270+
arrow_type=pyarrow.binary()
271+
elifseries.dtype.name=="geometry":
272+
# We have a GeoSeries containing all nulls, convert it to a pandas series
273+
series=pandas.Series(numpy.array(series))
274+
275+
ifarrow_typeisNone:
276+
arrow_type=bq_to_arrow_data_type(bq_field)
277+
else:
278+
arrow_type=bq_to_arrow_data_type(bq_field)
229279

230280
field_type_upper=bq_field.field_type.upper()ifbq_field.field_typeelse""
231281

@@ -279,6 +329,12 @@ def list_columns_and_indexes(dataframe):
279329
returncolumns_and_indexes
280330

281331

332+
def_first_valid(series):
333+
first_valid_index=series.first_valid_index()
334+
iffirst_valid_indexisnotNone:
335+
returnseries.at[first_valid_index]
336+
337+
282338
defdataframe_to_bq_schema(dataframe,bq_schema):
283339
"""Convert a pandas DataFrame schema to a BigQuery schema.
284340
@@ -319,6 +375,13 @@ def dataframe_to_bq_schema(dataframe, bq_schema):
319375
# Otherwise, try to automatically determine the type based on the
320376
# pandas dtype.
321377
bq_type=_PANDAS_DTYPE_TO_BQ.get(dtype.name)
378+
ifbq_typeisNone:
379+
sample_data=_first_valid(dataframe[column])
380+
if (
381+
isinstance(sample_data,_BaseGeometry)
382+
andsample_dataisnotNone# Paranoia
383+
):
384+
bq_type="GEOGRAPHY"
322385
bq_field=schema.SchemaField(column,bq_type)
323386
bq_schema_out.append(bq_field)
324387

@@ -450,11 +513,11 @@ def dataframe_to_arrow(dataframe, bq_schema):
450513
arrow_names= []
451514
arrow_fields= []
452515
forbq_fieldinbq_schema:
453-
arrow_fields.append(bq_to_arrow_field(bq_field))
454516
arrow_names.append(bq_field.name)
455517
arrow_arrays.append(
456518
bq_to_arrow_array(get_column_or_index(dataframe,bq_field.name),bq_field)
457519
)
520+
arrow_fields.append(bq_to_arrow_field(bq_field,arrow_arrays[-1].type))
458521

459522
ifall((fieldisnotNoneforfieldinarrow_fields)):
460523
returnpyarrow.Table.from_arrays(

‎google/cloud/bigquery/job/query.py‎

Lines changed: 115 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
# Assumption: type checks are only used by library developers and CI environments
5454
# that have all optional dependencies installed, thus no conditional imports.
5555
importpandas
56+
importgeopandas
5657
importpyarrow
5758
fromgoogle.api_coreimportretryasretries
5859
fromgoogle.cloudimportbigquery_storage
@@ -1487,6 +1488,7 @@ def to_dataframe(
14871488
create_bqstorage_client:bool=True,
14881489
date_as_object:bool=True,
14891490
max_results:Optional[int]=None,
1491+
geography_as_object:bool=False,
14901492
)->"pandas.DataFrame":
14911493
"""Return a pandas DataFrame from a QueryJob
14921494
@@ -1538,13 +1540,27 @@ def to_dataframe(
15381540
15391541
.. versionadded:: 2.21.0
15401542
1543+
geography_as_object (Optional[bool]):
1544+
If ``True``, convert GEOGRAPHY data to :mod:`shapely`
1545+
geometry objects. If ``False`` (default), don't cast
1546+
geography data to :mod:`shapely` geometry objects.
1547+
1548+
.. versionadded:: 2.24.0
1549+
15411550
Returns:
1542-
A :class:`~pandas.DataFrame` populated with row data and column
1543-
headers from the query results. The column headers are derived
1544-
from the destination table's schema.
1551+
pandas.DataFrame:
1552+
A :class:`~pandas.DataFrame` populated with row data
1553+
and column headers from the query results. The column
1554+
headers are derived from the destination table's
1555+
schema.
15451556
15461557
Raises:
1547-
ValueError: If the `pandas` library cannot be imported.
1558+
ValueError:
1559+
If the :mod:`pandas` library cannot be imported, or
1560+
the :mod:`google.cloud.bigquery_storage_v1` module is
1561+
required but cannot be imported. Also if
1562+
`geography_as_object` is `True`, but the
1563+
:mod:`shapely` library cannot be imported.
15481564
"""
15491565
query_result=wait_for_query(self,progress_bar_type,max_results=max_results)
15501566
returnquery_result.to_dataframe(
@@ -1553,6 +1569,101 @@ def to_dataframe(
15531569
progress_bar_type=progress_bar_type,
15541570
create_bqstorage_client=create_bqstorage_client,
15551571
date_as_object=date_as_object,
1572+
geography_as_object=geography_as_object,
1573+
)
1574+
1575+
# If changing the signature of this method, make sure to apply the same
1576+
# changes to table.RowIterator.to_dataframe(), except for the max_results parameter
1577+
# that should only exist here in the QueryJob method.
1578+
defto_geodataframe(
1579+
self,
1580+
bqstorage_client:"bigquery_storage.BigQueryReadClient"=None,
1581+
dtypes:Dict[str,Any]=None,
1582+
progress_bar_type:str=None,
1583+
create_bqstorage_client:bool=True,
1584+
date_as_object:bool=True,
1585+
max_results:Optional[int]=None,
1586+
geography_column:Optional[str]=None,
1587+
)->"geopandas.GeoDataFrame":
1588+
"""Return a GeoPandas GeoDataFrame from a QueryJob
1589+
1590+
Args:
1591+
bqstorage_client (Optional[google.cloud.bigquery_storage_v1.BigQueryReadClient]):
1592+
A BigQuery Storage API client. If supplied, use the faster
1593+
BigQuery Storage API to fetch rows from BigQuery. This
1594+
API is a billable API.
1595+
1596+
This method requires the ``fastavro`` and
1597+
``google-cloud-bigquery-storage`` libraries.
1598+
1599+
Reading from a specific partition or snapshot is not
1600+
currently supported by this method.
1601+
1602+
dtypes (Optional[Map[str, Union[str, pandas.Series.dtype]]]):
1603+
A dictionary of column names pandas ``dtype``s. The provided
1604+
``dtype`` is used when constructing the series for the column
1605+
specified. Otherwise, the default pandas behavior is used.
1606+
1607+
progress_bar_type (Optional[str]):
1608+
If set, use the `tqdm <https://tqdm.github.io/>`_ library to
1609+
display a progress bar while the data downloads. Install the
1610+
``tqdm`` package to use this feature.
1611+
1612+
See
1613+
:func:`~google.cloud.bigquery.table.RowIterator.to_dataframe`
1614+
for details.
1615+
1616+
.. versionadded:: 1.11.0
1617+
create_bqstorage_client (Optional[bool]):
1618+
If ``True`` (default), create a BigQuery Storage API client
1619+
using the default API settings. The BigQuery Storage API
1620+
is a faster way to fetch rows from BigQuery. See the
1621+
``bqstorage_client`` parameter for more information.
1622+
1623+
This argument does nothing if ``bqstorage_client`` is supplied.
1624+
1625+
.. versionadded:: 1.24.0
1626+
1627+
date_as_object (Optional[bool]):
1628+
If ``True`` (default), cast dates to objects. If ``False``, convert
1629+
to datetime64[ns] dtype.
1630+
1631+
.. versionadded:: 1.26.0
1632+
1633+
max_results (Optional[int]):
1634+
Maximum number of rows to include in the result. No limit by default.
1635+
1636+
.. versionadded:: 2.21.0
1637+
1638+
geography_column (Optional[str]):
1639+
If there are more than one GEOGRAPHY column,
1640+
identifies which one to use to construct a GeoPandas
1641+
GeoDataFrame. This option can be ommitted if there's
1642+
only one GEOGRAPHY column.
1643+
1644+
Returns:
1645+
geopandas.GeoDataFrame:
1646+
A :class:`geopandas.GeoDataFrame` populated with row
1647+
data and column headers from the query results. The
1648+
column headers are derived from the destination
1649+
table's schema.
1650+
1651+
Raises:
1652+
ValueError:
1653+
If the :mod:`geopandas` library cannot be imported, or the
1654+
:mod:`google.cloud.bigquery_storage_v1` module is
1655+
required but cannot be imported.
1656+
1657+
.. versionadded:: 2.24.0
1658+
"""
1659+
query_result=wait_for_query(self,progress_bar_type,max_results=max_results)
1660+
returnquery_result.to_geodataframe(
1661+
bqstorage_client=bqstorage_client,
1662+
dtypes=dtypes,
1663+
progress_bar_type=progress_bar_type,
1664+
create_bqstorage_client=create_bqstorage_client,
1665+
date_as_object=date_as_object,
1666+
geography_column=geography_column,
15561667
)
15571668

15581669
def__iter__(self):

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp