Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit d792ce0

Browse files
authored
feat: add support for Parquet options (#679)
* feat: add support for Parquet options. For load jobs and external tables config.
* Simplify ParquetOptions.to_api_repr(). Co-authored by Tres Seaver.
* Expose ParquetOptions in top level namespace.
* Parquet options should be reflected in options.
1 parent a0a9fa2, commit d792ce0

File tree

7 files changed

+306
-1
lines changed

7 files changed

+306
-1
lines changed

‎google/cloud/bigquery/__init__.py‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
fromgoogle.cloud.bigquery.external_configimportCSVOptions
4848
fromgoogle.cloud.bigquery.external_configimportGoogleSheetsOptions
4949
fromgoogle.cloud.bigquery.external_configimportExternalSourceFormat
50+
fromgoogle.cloud.bigquery.format_optionsimportParquetOptions
5051
fromgoogle.cloud.bigquery.jobimportCompression
5152
fromgoogle.cloud.bigquery.jobimportCopyJob
5253
fromgoogle.cloud.bigquery.jobimportCopyJobConfig
@@ -136,6 +137,7 @@
136137
"BigtableColumn",
137138
"CSVOptions",
138139
"GoogleSheetsOptions",
140+
"ParquetOptions",
139141
"DEFAULT_RETRY",
140142
# Enum Constants
141143
"enums",

‎google/cloud/bigquery/external_config.py‎

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
fromgoogle.cloud.bigquery._helpersimport_bytes_to_json
2828
fromgoogle.cloud.bigquery._helpersimport_int_or_none
2929
fromgoogle.cloud.bigquery._helpersimport_str_or_none
30+
fromgoogle.cloud.bigquery.format_optionsimportParquetOptions
3031
fromgoogle.cloud.bigquery.schemaimportSchemaField
3132

3233

@@ -53,6 +54,12 @@ class ExternalSourceFormat(object):
5354
DATASTORE_BACKUP="DATASTORE_BACKUP"
5455
"""Specifies datastore backup format"""
5556

57+
ORC="ORC"
58+
"""Specifies ORC format."""
59+
60+
PARQUET="PARQUET"
61+
"""Specifies Parquet format."""
62+
5663
BIGTABLE="BIGTABLE"
5764
"""Specifies Bigtable format."""
5865

@@ -540,7 +547,7 @@ def from_api_repr(cls, resource: dict) -> "GoogleSheetsOptions":
540547
returnconfig
541548

542549

543-
_OPTION_CLASSES= (BigtableOptions,CSVOptions,GoogleSheetsOptions)
550+
_OPTION_CLASSES= (BigtableOptions,CSVOptions,GoogleSheetsOptions,ParquetOptions)
544551

545552

546553
classHivePartitioningOptions(object):
@@ -784,6 +791,25 @@ def schema(self, value):
784791
prop= {"fields": [field.to_api_repr()forfieldinvalue]}
785792
self._properties["schema"]=prop
786793

794+
@property
def parquet_options(self):
    """Optional[google.cloud.bigquery.format_options.ParquetOptions]: Additional
    properties to set if ``sourceFormat`` is set to PARQUET.

    Returns ``None`` when the source format of this config is not PARQUET.

    See:
    https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.parquet_options
    """
    if self.source_format != ExternalSourceFormat.PARQUET:
        return None
    return self._options

@parquet_options.setter
def parquet_options(self, value):
    """Set the Parquet options; they are stored in the generic ``_options`` slot.

    Raises:
        TypeError: If the source format of this config is not PARQUET.
    """
    if self.source_format != ExternalSourceFormat.PARQUET:
        # The space before the placeholder matters: callers/tests match on
        # "source format is <FORMAT>".
        msg = f"Cannot set Parquet options, source format is {self.source_format}"
        raise TypeError(msg)
    self._options = value
812+
787813
defto_api_repr(self)->dict:
788814
"""Build an API representation of this object.
789815
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# Copyright 2021 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
importcopy
16+
fromtypingimportDict
17+
18+
19+
class ParquetOptions:
    """Additional options if the PARQUET source format is used."""

    # Source format these options apply to, and the key under which they are
    # stored in an API resource.
    _SOURCE_FORMAT = "PARQUET"
    _RESOURCE_NAME = "parquetOptions"

    def __init__(self):
        # API-shaped (camelCase keys) backing store for all option values.
        self._properties = {}

    @property
    def enum_as_string(self) -> bool:
        """Indicates whether to infer Parquet ENUM logical type as STRING instead of
        BYTES by default.

        ``None`` is returned if the option has not been set.

        See
        https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ParquetOptions.FIELDS.enum_as_string
        """
        return self._properties.get("enumAsString")

    @enum_as_string.setter
    def enum_as_string(self, value: bool) -> None:
        self._properties["enumAsString"] = value

    @property
    def enable_list_inference(self) -> bool:
        """Indicates whether to use schema inference specifically for Parquet LIST
        logical type.

        ``None`` is returned if the option has not been set.

        See
        https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ParquetOptions.FIELDS.enable_list_inference
        """
        return self._properties.get("enableListInference")

    @enable_list_inference.setter
    def enable_list_inference(self, value: bool) -> None:
        self._properties["enableListInference"] = value

    @classmethod
    def from_api_repr(cls, resource: Dict[str, bool]) -> "ParquetOptions":
        """Factory: construct an instance from a resource dict.

        Args:
            resource (Dict[str, bool]):
                Definition of a :class:`~.format_options.ParquetOptions` instance in
                the same representation as is returned from the API.

        Returns:
            :class:`~.format_options.ParquetOptions`:
                Configuration parsed from ``resource``.
        """
        config = cls()
        # Deep copy so later mutation of the instance never touches the caller's dict.
        config._properties = copy.deepcopy(resource)
        return config

    def to_api_repr(self) -> dict:
        """Build an API representation of this object.

        Returns:
            Dict[str, bool]:
                A dictionary in the format used by the BigQuery API.
        """
        # Deep copy so callers cannot mutate this object's internal state.
        return copy.deepcopy(self._properties)

‎google/cloud/bigquery/job/load.py‎

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
fromgoogle.cloud.bigquery.encryption_configurationimportEncryptionConfiguration
1818
fromgoogle.cloud.bigquery.external_configimportHivePartitioningOptions
19+
fromgoogle.cloud.bigquery.format_optionsimportParquetOptions
1920
fromgoogle.cloud.bigqueryimport_helpers
2021
fromgoogle.cloud.bigquery.schemaimportSchemaField
2122
fromgoogle.cloud.bigquery.schemaimport_to_schema_fields
@@ -439,6 +440,26 @@ def write_disposition(self):
439440
defwrite_disposition(self,value):
440441
self._set_sub_prop("writeDisposition",value)
441442

443+
@property
def parquet_options(self):
    """Optional[google.cloud.bigquery.format_options.ParquetOptions]: Additional
    properties to set if ``sourceFormat`` is set to PARQUET.

    Returns ``None`` when no ``parquetOptions`` sub-property is present.

    See:
    https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.parquet_options
    """
    prop = self._get_sub_prop("parquetOptions")
    if prop is not None:
        # Wrap the stored API dict in the typed options object.
        prop = ParquetOptions.from_api_repr(prop)
    return prop

@parquet_options.setter
def parquet_options(self, value):
    """Store the API representation of ``value``; ``None`` removes the option."""
    if value is not None:
        self._set_sub_prop("parquetOptions", value.to_api_repr())
    else:
        self._del_sub_prop("parquetOptions")
462+
442463

443464
classLoadJob(_AsyncJob):
444465
"""Asynchronous job for loading data into a table.

‎tests/unit/job/test_load_config.py‎

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -700,3 +700,38 @@ def test_write_disposition_setter(self):
700700
self.assertEqual(
701701
config._properties["load"]["writeDisposition"],write_disposition
702702
)
703+
704+
def test_parquet_options_missing(self):
    """A fresh config reports no Parquet options."""
    config = self._get_target_class()()
    self.assertIsNone(config.parquet_options)
707+
708+
def test_parquet_options_hit(self):
    """Options already present in the raw properties are exposed by the getter."""
    config = self._get_target_class()()
    config._properties["load"]["parquetOptions"] = dict(
        enumAsString=True, enableListInference=False
    )
    self.assertTrue(config.parquet_options.enum_as_string)
    self.assertFalse(config.parquet_options.enable_list_inference)
715+
716+
def test_parquet_options_setter(self):
    """Assigning options writes their API representation into the load properties."""
    from google.cloud.bigquery.format_options import ParquetOptions

    parquet_options = ParquetOptions.from_api_repr(
        dict(enumAsString=False, enableListInference=True)
    )
    config = self._get_target_class()()

    config.parquet_options = parquet_options
    self.assertEqual(
        config._properties["load"]["parquetOptions"],
        {"enumAsString": False, "enableListInference": True},
    )
729+
730+
def test_parquet_options_setter_clearing(self):
    """Assigning ``None`` removes the ``parquetOptions`` key entirely."""
    config = self._get_target_class()()
    config._properties["load"]["parquetOptions"] = dict(
        enumAsString=False, enableListInference=True
    )

    config.parquet_options = None
    self.assertNotIn("parquetOptions", config._properties["load"])

‎tests/unit/test_external_config.py‎

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,106 @@ def test_to_api_repr_bigtable(self):
425425

426426
self.assertEqual(got_resource,exp_resource)
427427

428+
def test_parquet_options_getter(self):
    """The getter returns the same object as the generic ``options`` attribute."""
    from google.cloud.bigquery.format_options import ParquetOptions

    parquet_options = ParquetOptions.from_api_repr(
        {"enumAsString": True, "enableListInference": False}
    )
    ec = external_config.ExternalConfig(
        external_config.ExternalSourceFormat.PARQUET
    )

    # Before any options are assigned, both fields read as unset.
    self.assertIsNone(ec.parquet_options.enum_as_string)
    self.assertIsNone(ec.parquet_options.enable_list_inference)

    ec._options = parquet_options

    self.assertTrue(ec.parquet_options.enum_as_string)
    self.assertFalse(ec.parquet_options.enable_list_inference)

    self.assertIs(ec.parquet_options, ec.options)
447+
448+
def test_parquet_options_getter_non_parquet_format(self):
    """For a non-PARQUET source format the getter yields ``None``."""
    ec = external_config.ExternalConfig(external_config.ExternalSourceFormat.CSV)
    self.assertIsNone(ec.parquet_options)
451+
452+
def test_parquet_options_setter(self):
    """Options assigned through ``parquet_options`` show up in ``options``."""
    from google.cloud.bigquery.format_options import ParquetOptions

    parquet_options = ParquetOptions.from_api_repr(
        {"enumAsString": False, "enableListInference": True}
    )
    ec = external_config.ExternalConfig(
        external_config.ExternalSourceFormat.PARQUET
    )

    ec.parquet_options = parquet_options

    # Setting Parquet options should be reflected in the generic options attribute.
    self.assertFalse(ec.options.enum_as_string)
    self.assertTrue(ec.options.enable_list_inference)
467+
468+
def test_parquet_options_setter_non_parquet_format(self):
    """Assigning Parquet options on a CSV config raises ``TypeError``."""
    from google.cloud.bigquery.format_options import ParquetOptions

    parquet_options = ParquetOptions.from_api_repr(
        {"enumAsString": False, "enableListInference": True}
    )
    ec = external_config.ExternalConfig(external_config.ExternalSourceFormat.CSV)

    with self.assertRaisesRegex(TypeError, "Cannot set.*source format is CSV"):
        ec.parquet_options = parquet_options
478+
479+
def test_from_api_repr_parquet(self):
    """A PARQUET resource round-trips through ``from_api_repr``/``to_api_repr``."""
    from google.cloud.bigquery.format_options import ParquetOptions

    resource = _copy_and_update(
        self.BASE_RESOURCE,
        {
            "sourceFormat": "PARQUET",
            "parquetOptions": {"enumAsString": True, "enableListInference": False},
        },
    )

    ec = external_config.ExternalConfig.from_api_repr(resource)

    self._verify_base(ec)
    self.assertEqual(ec.source_format, external_config.ExternalSourceFormat.PARQUET)
    self.assertIsInstance(ec.options, ParquetOptions)
    self.assertTrue(ec.parquet_options.enum_as_string)
    self.assertFalse(ec.parquet_options.enable_list_inference)

    got_resource = ec.to_api_repr()

    self.assertEqual(got_resource, resource)

    # A missing option key must read as None and must not be re-added on output.
    del resource["parquetOptions"]["enableListInference"]
    ec = external_config.ExternalConfig.from_api_repr(resource)
    self.assertIsNone(ec.options.enable_list_inference)
    got_resource = ec.to_api_repr()
    self.assertEqual(got_resource, resource)
507+
508+
def test_to_api_repr_parquet(self):
    """Parquet options are serialized under the ``parquetOptions`` key."""
    from google.cloud.bigquery.format_options import ParquetOptions

    ec = external_config.ExternalConfig(
        external_config.ExternalSourceFormat.PARQUET
    )
    options = ParquetOptions.from_api_repr(
        dict(enumAsString=False, enableListInference=True)
    )
    ec._options = options

    exp_resource = {
        "sourceFormat": external_config.ExternalSourceFormat.PARQUET,
        "parquetOptions": {"enumAsString": False, "enableListInference": True},
    }

    got_resource = ec.to_api_repr()

    self.assertEqual(got_resource, exp_resource)
527+
428528

429529
def_copy_and_update(d,u):
430530
d=copy.deepcopy(d)

‎tests/unit/test_format_options.py‎

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Copyright 2021 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
class TestParquetOptions:
    """Unit tests for ``google.cloud.bigquery.format_options.ParquetOptions``."""

    @staticmethod
    def _get_target_class():
        # Imported lazily, inside the helper, rather than at module import time.
        from google.cloud.bigquery.format_options import ParquetOptions

        return ParquetOptions

    def test_ctor(self):
        """A freshly constructed instance has both options unset."""
        config = self._get_target_class()()
        assert config.enum_as_string is None
        assert config.enable_list_inference is None

    def test_from_api_repr(self):
        """Values from an API resource are exposed through the properties."""
        config = self._get_target_class().from_api_repr(
            {"enumAsString": False, "enableListInference": True}
        )
        assert not config.enum_as_string
        assert config.enable_list_inference

    def test_to_api_repr(self):
        """Values set through the properties appear under their API keys."""
        config = self._get_target_class()()
        config.enum_as_string = True
        config.enable_list_inference = False

        result = config.to_api_repr()
        assert result == {"enumAsString": True, "enableListInference": False}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp