Commit 1e59083

fix: support ARRAY data type when loading from DataFrame with Parquet (#980)

Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:

- [x] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea
- [x] Ensure the tests and linter pass
- [x] Code coverage does not decrease (if any source code was changed)
- [x] Appropriate docs were updated (if necessary)

Fixes #19 🦕
1 parent aacc521 commit 1e59083

File tree

5 files changed: +483 −45 lines changed


google/cloud/bigquery/_helpers.py

Lines changed: 11 additions & 0 deletions
```diff
@@ -107,6 +107,9 @@ def verify_version(self):
 class PyarrowVersions:
     """Version comparisons for pyarrow package."""
 
+    # https://github.com/googleapis/python-bigquery/issues/781#issuecomment-883497414
+    _PYARROW_BAD_VERSIONS = frozenset([packaging.version.Version("2.0.0")])
+
     def __init__(self):
         self._installed_version = None
 
@@ -126,6 +129,14 @@ def installed_version(self) -> packaging.version.Version:
 
         return self._installed_version
 
+    @property
+    def is_bad_version(self) -> bool:
+        return self.installed_version in self._PYARROW_BAD_VERSIONS
+
+    @property
+    def use_compliant_nested_type(self) -> bool:
+        return self.installed_version.major >= 4
+
     def try_import(self, raise_if_error: bool = False) -> Any:
         """Verify that a recent enough version of pyarrow extra is
         installed.
```
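
These properties centralize pyarrow version gating that `client.py` previously did inline. A minimal sketch of how they might be consulted (assuming the `pyarrow` extra is installed; `PYARROW_VERSIONS` is the module-level `PyarrowVersions` instance that the other diffs in this commit reference):

```python
from google.cloud.bigquery import _helpers

# Module-level PyarrowVersions instance referenced throughout this commit.
versions = _helpers.PYARROW_VERSIONS

# try_import() hands back the pyarrow module; per its use in client.py
# below, it returns None instead of raising when the extra is unusable.
pyarrow = versions.try_import()
if pyarrow is not None:
    print(versions.installed_version)          # e.g. 4.0.1
    print(versions.is_bad_version)             # True only for pyarrow 2.0.0
    print(versions.use_compliant_nested_type)  # True for pyarrow >= 4.0.0
```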

google/cloud/bigquery/_pandas_helpers.py

Lines changed: 39 additions & 11 deletions
```diff
@@ -79,8 +79,8 @@ def _to_wkb(v):
 _PANDAS_DTYPE_TO_BQ = {
     "bool": "BOOLEAN",
     "datetime64[ns, UTC]": "TIMESTAMP",
-    # BigQuery does not support uploading DATETIME values from Parquet files.
-    # See: https://github.com/googleapis/google-cloud-python/issues/9996
+    # TODO: Update to DATETIME in V3
+    # https://github.com/googleapis/python-bigquery/issues/985
     "datetime64[ns]": "TIMESTAMP",
     "float32": "FLOAT",
     "float64": "FLOAT",
@@ -396,7 +396,7 @@ def dataframe_to_bq_schema(dataframe, bq_schema):
     # column, but it was not found.
     if bq_schema_unused:
         raise ValueError(
-            u"bq_schema contains fields not present in dataframe: {}".format(
+            "bq_schema contains fields not present in dataframe: {}".format(
                 bq_schema_unused
             )
         )
@@ -405,7 +405,7 @@ def dataframe_to_bq_schema(dataframe, bq_schema):
     # pyarrow, if available.
     if unknown_type_fields:
         if not pyarrow:
-            msg = u"Could not determine the type of columns: {}".format(
+            msg = "Could not determine the type of columns: {}".format(
                 ", ".join(field.name for field in unknown_type_fields)
             )
             warnings.warn(msg)
@@ -444,7 +444,14 @@ def augment_schema(dataframe, current_bq_schema):
             continue
 
         arrow_table = pyarrow.array(dataframe[field.name])
-        detected_type = ARROW_SCALAR_IDS_TO_BQ.get(arrow_table.type.id)
+
+        if pyarrow.types.is_list(arrow_table.type):
+            # `pyarrow.ListType`
+            detected_mode = "REPEATED"
+            detected_type = ARROW_SCALAR_IDS_TO_BQ.get(arrow_table.values.type.id)
+        else:
+            detected_mode = field.mode
+            detected_type = ARROW_SCALAR_IDS_TO_BQ.get(arrow_table.type.id)
 
         if detected_type is None:
             unknown_type_fields.append(field)
@@ -453,15 +460,15 @@
         new_field = schema.SchemaField(
             name=field.name,
             field_type=detected_type,
-            mode=field.mode,
+            mode=detected_mode,
             description=field.description,
             fields=field.fields,
         )
         augmented_schema.append(new_field)
 
     if unknown_type_fields:
         warnings.warn(
-            u"Pyarrow could not determine the type of columns: {}.".format(
+            "Pyarrow could not determine the type of columns: {}.".format(
                 ", ".join(field.name for field in unknown_type_fields)
             )
         )
```
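
The new list branch in `augment_schema` is what maps a DataFrame column of Python lists to a REPEATED field. A standalone sketch of the same detection logic (the column name and data are made up for illustration):

```python
import pandas as pd
import pyarrow

df = pd.DataFrame({"tags": [["a", "b"], ["c"]]})

# Same conversion the helper applies to a column with an unknown type.
arrow_array = pyarrow.array(df["tags"])

if pyarrow.types.is_list(arrow_array.type):
    # List columns become mode="REPEATED"; the BigQuery type is looked up
    # from the list's value type rather than from the list type itself.
    print(arrow_array.type)         # list<item: string>
    print(arrow_array.values.type)  # string -> STRING in BigQuery
```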
```diff
@@ -500,7 +507,7 @@ def dataframe_to_arrow(dataframe, bq_schema):
     extra_fields = bq_field_names - column_and_index_names
     if extra_fields:
         raise ValueError(
-            u"bq_schema contains fields not present in dataframe: {}".format(
+            "bq_schema contains fields not present in dataframe: {}".format(
                 extra_fields
             )
         )
@@ -510,7 +517,7 @@
     missing_fields = column_names - bq_field_names
     if missing_fields:
         raise ValueError(
-            u"bq_schema is missing fields from dataframe: {}".format(missing_fields)
+            "bq_schema is missing fields from dataframe: {}".format(missing_fields)
         )
 
     arrow_arrays = []
@@ -530,7 +537,13 @@
     return pyarrow.Table.from_arrays(arrow_arrays, names=arrow_names)
 
 
-def dataframe_to_parquet(dataframe, bq_schema, filepath, parquet_compression="SNAPPY"):
+def dataframe_to_parquet(
+    dataframe,
+    bq_schema,
+    filepath,
+    parquet_compression="SNAPPY",
+    parquet_use_compliant_nested_type=True,
+):
     """Write dataframe as a Parquet file, according to the desired BQ schema.
 
     This function requires the :mod:`pyarrow` package. Arrow is used as an
@@ -551,14 +564,29 @@ def dataframe_to_parquet(dataframe, bq_schema, filepath, parquet_compression="SNAPPY"):
             The compression codec to use by the ``pyarrow.parquet.write_table``
             serializing method. Defaults to "SNAPPY".
             https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table
+        parquet_use_compliant_nested_type (bool):
+            Whether the ``pyarrow.parquet.write_table`` serializing method should write
+            compliant Parquet nested type (lists). Defaults to ``True``.
+            https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#nested-types
+            https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table
+
+            This argument is ignored for ``pyarrow`` versions earlier than ``4.0.0``.
     """
     pyarrow = _helpers.PYARROW_VERSIONS.try_import(raise_if_error=True)
 
     import pyarrow.parquet
 
+    kwargs = (
+        {"use_compliant_nested_type": parquet_use_compliant_nested_type}
+        if _helpers.PYARROW_VERSIONS.use_compliant_nested_type
+        else {}
+    )
+
     bq_schema = schema._to_schema_fields(bq_schema)
     arrow_table = dataframe_to_arrow(dataframe, bq_schema)
-    pyarrow.parquet.write_table(arrow_table, filepath, compression=parquet_compression)
+    pyarrow.parquet.write_table(
+        arrow_table, filepath, compression=parquet_compression, **kwargs,
+    )
 
 
 def _row_iterator_page_to_arrow(page, column_names, arrow_types):
```
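
For pyarrow 4.0.0 and later, the forwarded flag changes how list columns are laid out on disk. A sketch of the effect (the output path is a placeholder):

```python
import pandas as pd
import pyarrow
import pyarrow.parquet

df = pd.DataFrame({"values": [[1, 2], [3]]})
table = pyarrow.Table.from_pandas(df)

# With compliant nested types, list columns use the Parquet spec's
# three-level list layout, which BigQuery's list inference can read
# back as a REPEATED column rather than a nested STRUCT wrapper.
pyarrow.parquet.write_table(
    table, "/tmp/example.parquet", use_compliant_nested_type=True
)
```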

google/cloud/bigquery/client.py

Lines changed: 43 additions & 29 deletions
```diff
@@ -27,19 +27,11 @@
 import json
 import math
 import os
-import packaging.version
 import tempfile
 from typing import Any, BinaryIO, Dict, Iterable, Optional, Sequence, Tuple, Union
 import uuid
 import warnings
 
-try:
-    import pyarrow
-
-    _PYARROW_VERSION = packaging.version.parse(pyarrow.__version__)
-except ImportError:  # pragma: NO COVER
-    pyarrow = None
-
 from google import resumable_media  # type: ignore
 from google.resumable_media.requests import MultipartUpload
 from google.resumable_media.requests import ResumableUpload
@@ -103,6 +95,10 @@
 from google.cloud.bigquery.table import TableListItem
 from google.cloud.bigquery.table import TableReference
 from google.cloud.bigquery.table import RowIterator
+from google.cloud.bigquery.format_options import ParquetOptions
+from google.cloud.bigquery import _helpers
+
+pyarrow = _helpers.PYARROW_VERSIONS.try_import()
 
 
 _DEFAULT_CHUNKSIZE = 100 * 1024 * 1024  # 100 MB
@@ -128,8 +124,6 @@
 # https://github.com/googleapis/python-bigquery/issues/438
 _MIN_GET_QUERY_RESULTS_TIMEOUT = 120
 
-# https://github.com/googleapis/python-bigquery/issues/781#issuecomment-883497414
-_PYARROW_BAD_VERSIONS = frozenset([packaging.version.Version("2.0.0")])
 
 TIMEOUT_HEADER = "X-Server-Timeout"
 
@@ -2469,10 +2463,10 @@ def load_table_from_dataframe(
             They are supported when using the PARQUET source format, but
             due to the way they are encoded in the ``parquet`` file,
             a mismatch with the existing table schema can occur, so
-            100% compatibility cannot be guaranteed for REPEATED fields when
+            REPEATED fields are not properly supported when using ``pyarrow<4.0.0``
             using the parquet format.
 
-            https://github.com/googleapis/python-bigquery/issues/17
+            https://github.com/googleapis/python-bigquery/issues/19
 
         Args:
             dataframe (pandas.DataFrame):
@@ -2519,18 +2513,18 @@
                 :attr:`~google.cloud.bigquery.job.SourceFormat.PARQUET` are
                 supported.
             parquet_compression (Optional[str]):
-                [Beta] The compression method to use if intermittently
-                serializing ``dataframe`` to a parquet file.
-
-                The argument is directly passed as the ``compression``
-                argument to the underlying ``pyarrow.parquet.write_table()``
-                method (the default value "snappy" gets converted to uppercase).
-                https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table
-
-                If the job config schema is missing, the argument is directly
-                passed as the ``compression`` argument to the underlying
-                ``DataFrame.to_parquet()`` method.
-                https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_parquet.html#pandas.DataFrame.to_parquet
+                [Beta] The compression method to use if intermittently
+                serializing ``dataframe`` to a parquet file.
+
+                The argument is directly passed as the ``compression``
+                argument to the underlying ``pyarrow.parquet.write_table()``
+                method (the default value "snappy" gets converted to uppercase).
+                https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table
+
+                If the job config schema is missing, the argument is directly
+                passed as the ``compression`` argument to the underlying
+                ``DataFrame.to_parquet()`` method.
+                https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_parquet.html#pandas.DataFrame.to_parquet
             timeout (Optional[float]):
                 The number of seconds to wait for the underlying HTTP transport
                 before using ``retry``.
@@ -2562,6 +2556,16 @@
         if job_config.source_format is None:
             # default value
             job_config.source_format = job.SourceFormat.PARQUET
+
+        if (
+            job_config.source_format == job.SourceFormat.PARQUET
+            and job_config.parquet_options is None
+        ):
+            parquet_options = ParquetOptions()
+            # default value
+            parquet_options.enable_list_inference = True
+            job_config.parquet_options = parquet_options
+
         if job_config.source_format not in supported_formats:
             raise ValueError(
                 "Got unexpected source_format: '{}'. Currently, only PARQUET and CSV are supported".format(
```
```diff
@@ -2628,12 +2632,12 @@
         try:
 
             if job_config.source_format == job.SourceFormat.PARQUET:
-                if _PYARROW_VERSION in _PYARROW_BAD_VERSIONS:
+                if _helpers.PYARROW_VERSIONS.is_bad_version:
                     msg = (
                         "Loading dataframe data in PARQUET format with pyarrow "
-                        f"{_PYARROW_VERSION} can result in data corruption. It is "
-                        "therefore *strongly* advised to use a different pyarrow "
-                        "version or a different source format. "
+                        f"{_helpers.PYARROW_VERSIONS.installed_version} can result in data "
+                        "corruption. It is therefore *strongly* advised to use a "
+                        "different pyarrow version or a different source format. "
                         "See: https://github.com/googleapis/python-bigquery/issues/781"
                     )
                     warnings.warn(msg, category=RuntimeWarning)
@@ -2647,9 +2651,19 @@
                         job_config.schema,
                         tmppath,
                         parquet_compression=parquet_compression,
+                        parquet_use_compliant_nested_type=True,
                     )
                 else:
-                    dataframe.to_parquet(tmppath, compression=parquet_compression)
+                    dataframe.to_parquet(
+                        tmppath,
+                        engine="pyarrow",
+                        compression=parquet_compression,
+                        **(
+                            {"use_compliant_nested_type": True}
+                            if _helpers.PYARROW_VERSIONS.use_compliant_nested_type
+                            else {}
+                        ),
+                    )
 
             else:
```
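
Taken together, the changes let a DataFrame column of Python lists load as an ARRAY (REPEATED) column. An end-to-end sketch (the table ID is a placeholder; full REPEATED support assumes pyarrow >= 4.0.0):

```python
import pandas as pd
from google.cloud import bigquery

client = bigquery.Client()

# "scores" holds Python lists; with this fix it loads as a REPEATED FLOAT column.
df = pd.DataFrame({"id": [1, 2], "scores": [[0.5, 0.9], [1.0]]})

load_job = client.load_table_from_dataframe(
    df, "my-project.my_dataset.my_table"  # placeholder table ID
)
load_job.result()  # wait for the load job to finish
```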