Commit e8fc63b

Authored by Matthew Kim (mattdeekay)
Cloud fetch queue and integration (#151)
* Cloud fetch queue and integration
* Enable cloudfetch with direct results
* Typing and style changes
* Client-settable max_download_threads
* Docstrings and comments
* Increase default buffer size bytes to 104857600
* Move max_download_threads to kwargs of ThriftBackend, fix unit tests
* Fix tests: staticmethod make_arrow_table mock not callable
* cancel_futures in shutdown() only available in Python >= 3.9.0
* Black linting
* Fix typing errors

Signed-off-by: Matthew Kim <11141331+mattdeekay@users.noreply.github.com>
1 parent 061c763, commit e8fc63b

6 files changed (+596, -136 lines)
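The feature is opt-in: as the diffs below show, use_cloud_fetch is read from the Connection kwargs (default False) and max_download_threads from the ThriftBackend kwargs (default 10). A minimal usage sketch, assuming connection kwargs are forwarded to the Thrift backend as elsewhere in this connector; the hostname, HTTP path, token, and table name are placeholders:

from databricks import sql

connection = sql.connect(
    server_hostname="example.cloud.databricks.com",  # placeholder
    http_path="/sql/1.0/warehouses/abc123",          # placeholder
    access_token="dapi-example-token",               # placeholder
    use_cloud_fetch=True,        # added by this commit; defaults to False
    max_download_threads=10,     # forwarded to ThriftBackend; defaults to 10
)

cursor = connection.cursor()
cursor.execute("SELECT * FROM some_large_table")  # placeholder query
rows = cursor.fetchmany(10000)
cursor.close()
connection.close()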


src/databricks/sql/client.py

Lines changed: 6 additions & 1 deletion

@@ -17,7 +17,7 @@

 logger = logging.getLogger(__name__)

-DEFAULT_RESULT_BUFFER_SIZE_BYTES = 10485760
+DEFAULT_RESULT_BUFFER_SIZE_BYTES = 104857600
 DEFAULT_ARRAY_SIZE = 100000


@@ -153,6 +153,8 @@ def read(self) -> Optional[OAuthToken]:
         # _use_arrow_native_timestamps
         #   Databricks runtime will return native Arrow types for timestamps instead of Arrow strings
         #   (True by default)
+        # use_cloud_fetch
+        #   Enable use of cloud fetch to extract large query results in parallel via cloud storage

         if access_token:
             access_token_kv = {"access_token": access_token}
@@ -189,6 +191,7 @@ def read(self) -> Optional[OAuthToken]:
         self._session_handle = self.thrift_backend.open_session(
             session_configuration, catalog, schema
         )
+        self.use_cloud_fetch = kwargs.get("use_cloud_fetch", False)
         self.open = True
         logger.info("Successfully opened session " + str(self.get_session_id_hex()))
         self._cursors = []  # type: List[Cursor]
@@ -497,6 +500,7 @@ def execute(
             max_bytes=self.buffer_size_bytes,
             lz4_compression=self.connection.lz4_compression,
             cursor=self,
+            use_cloud_fetch=self.connection.use_cloud_fetch,
         )
         self.active_result_set = ResultSet(
             self.connection,
@@ -822,6 +826,7 @@ def __iter__(self):
                 break

     def _fill_results_buffer(self):
+        # At initialization or if the server does not have cloud fetch result links available
         results, has_more_rows = self.thrift_backend.fetch_results(
             op_handle=self.command_id,
             max_rows=self.arraysize,
src/databricks/sql/cloudfetch/download_manager.py

Lines changed: 2 additions & 2 deletions

@@ -161,6 +161,6 @@ def _check_if_download_successful(self, handler: ResultSetDownloadHandler):
         return True

     def _shutdown_manager(self):
-        # Clear download handlers and shutdown the thread pool to cancel pending futures
+        # Clear download handlers and shutdown the thread pool
         self.download_handlers = []
-        self.thread_pool.shutdown(wait=False, cancel_futures=True)
+        self.thread_pool.shutdown(wait=False)
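The cancel_futures argument is dropped here because ThreadPoolExecutor.shutdown() only accepts it on Python 3.9 and later (see the commit message above). If cancelling queued downloads on newer interpreters were still desired, a version-guarded call would be one option; a minimal sketch, not part of this commit:

import sys
from concurrent.futures import ThreadPoolExecutor


def shutdown_pool(pool: ThreadPoolExecutor) -> None:
    # cancel_futures was added to ThreadPoolExecutor.shutdown() in Python 3.9;
    # passing it on older interpreters raises a TypeError.
    if sys.version_info >= (3, 9):
        pool.shutdown(wait=False, cancel_futures=True)
    else:
        pool.shutdown(wait=False)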

src/databricks/sql/thrift_backend.py

Lines changed: 39 additions & 112 deletions

@@ -5,7 +5,6 @@
 import time
 import uuid
 import threading
-import lz4.frame
 from ssl import CERT_NONE, CERT_REQUIRED, create_default_context
 from typing import List, Union

@@ -26,11 +25,14 @@
 )

 from databricks.sql.utils import (
-    ArrowQueue,
     ExecuteResponse,
     _bound,
     RequestErrorInfo,
     NoRetryReason,
+    ResultSetQueueFactory,
+    convert_arrow_based_set_to_arrow_table,
+    convert_decimals_in_arrow_table,
+    convert_column_based_set_to_arrow_table,
 )

 logger = logging.getLogger(__name__)
@@ -67,7 +69,6 @@
 class ThriftBackend:
     CLOSED_OP_STATE = ttypes.TOperationState.CLOSED_STATE
     ERROR_OP_STATE = ttypes.TOperationState.ERROR_STATE
-    BIT_MASKS = [1, 2, 4, 8, 16, 32, 64, 128]

     def __init__(
         self,
@@ -115,6 +116,8 @@ def __init__(
         # _socket_timeout
         #   The timeout in seconds for socket send, recv and connect operations. Should be a positive float or integer.
         #   (defaults to 900)
+        # max_download_threads
+        #   Number of threads for handling cloud fetch downloads. Defaults to 10

         port = port or 443
         if kwargs.get("_connection_uri"):
@@ -136,6 +139,9 @@ def __init__(
             "_use_arrow_native_timestamps", True
         )

+        # Cloud fetch
+        self.max_download_threads = kwargs.get("max_download_threads", 10)
+
         # Configure tls context
         ssl_context = create_default_context(cafile=kwargs.get("_tls_trusted_ca_file"))
         if kwargs.get("_tls_no_verify") is True:
@@ -558,108 +564,14 @@ def _create_arrow_table(self, t_row_set, lz4_compressed, schema_bytes, description):
             (
                 arrow_table,
                 num_rows,
-            ) = ThriftBackend._convert_column_based_set_to_arrow_table(
-                t_row_set.columns, description
-            )
+            ) = convert_column_based_set_to_arrow_table(t_row_set.columns, description)
         elif t_row_set.arrowBatches is not None:
-            (
-                arrow_table,
-                num_rows,
-            ) = ThriftBackend._convert_arrow_based_set_to_arrow_table(
+            (arrow_table, num_rows,) = convert_arrow_based_set_to_arrow_table(
                 t_row_set.arrowBatches, lz4_compressed, schema_bytes
             )
         else:
             raise OperationalError("Unsupported TRowSet instance {}".format(t_row_set))
-        return self._convert_decimals_in_arrow_table(arrow_table, description), num_rows
-
-    @staticmethod
-    def _convert_decimals_in_arrow_table(table, description):
-        for (i, col) in enumerate(table.itercolumns()):
-            if description[i][1] == "decimal":
-                decimal_col = col.to_pandas().apply(
-                    lambda v: v if v is None else Decimal(v)
-                )
-                precision, scale = description[i][4], description[i][5]
-                assert scale is not None
-                assert precision is not None
-                # Spark limits decimal to a maximum scale of 38,
-                # so 128 is guaranteed to be big enough
-                dtype = pyarrow.decimal128(precision, scale)
-                col_data = pyarrow.array(decimal_col, type=dtype)
-                field = table.field(i).with_type(dtype)
-                table = table.set_column(i, field, col_data)
-        return table
-
-    @staticmethod
-    def _convert_arrow_based_set_to_arrow_table(
-        arrow_batches, lz4_compressed, schema_bytes
-    ):
-        ba = bytearray()
-        ba += schema_bytes
-        n_rows = 0
-        if lz4_compressed:
-            for arrow_batch in arrow_batches:
-                n_rows += arrow_batch.rowCount
-                ba += lz4.frame.decompress(arrow_batch.batch)
-        else:
-            for arrow_batch in arrow_batches:
-                n_rows += arrow_batch.rowCount
-                ba += arrow_batch.batch
-        arrow_table = pyarrow.ipc.open_stream(ba).read_all()
-        return arrow_table, n_rows
-
-    @staticmethod
-    def _convert_column_based_set_to_arrow_table(columns, description):
-        arrow_table = pyarrow.Table.from_arrays(
-            [ThriftBackend._convert_column_to_arrow_array(c) for c in columns],
-            # Only use the column names from the schema, the types are determined by the
-            # physical types used in column based set, as they can differ from the
-            # mapping used in _hive_schema_to_arrow_schema.
-            names=[c[0] for c in description],
-        )
-        return arrow_table, arrow_table.num_rows
-
-    @staticmethod
-    def _convert_column_to_arrow_array(t_col):
-        """
-        Return a pyarrow array from the values in a TColumn instance.
-        Note that ColumnBasedSet has no native support for complex types, so they will be converted
-        to strings server-side.
-        """
-        field_name_to_arrow_type = {
-            "boolVal": pyarrow.bool_(),
-            "byteVal": pyarrow.int8(),
-            "i16Val": pyarrow.int16(),
-            "i32Val": pyarrow.int32(),
-            "i64Val": pyarrow.int64(),
-            "doubleVal": pyarrow.float64(),
-            "stringVal": pyarrow.string(),
-            "binaryVal": pyarrow.binary(),
-        }
-        for field in field_name_to_arrow_type.keys():
-            wrapper = getattr(t_col, field)
-            if wrapper:
-                return ThriftBackend._create_arrow_array(
-                    wrapper, field_name_to_arrow_type[field]
-                )
-
-        raise OperationalError("Empty TColumn instance {}".format(t_col))
-
-    @staticmethod
-    def _create_arrow_array(t_col_value_wrapper, arrow_type):
-        result = t_col_value_wrapper.values
-        nulls = t_col_value_wrapper.nulls  # bitfield describing which values are null
-        assert isinstance(nulls, bytes)
-
-        # The number of bits in nulls can be both larger or smaller than the number of
-        # elements in result, so take the minimum of both to iterate over.
-        length = min(len(result), len(nulls) * 8)
-
-        for i in range(length):
-            if nulls[i >> 3] & ThriftBackend.BIT_MASKS[i & 0x7]:
-                result[i] = None
-
-        return pyarrow.array(result, type=arrow_type)
+        return convert_decimals_in_arrow_table(arrow_table, description), num_rows

     def _get_metadata_resp(self, op_handle):
         req = ttypes.TGetResultSetMetadataReq(operationHandle=op_handle)
@@ -752,6 +664,7 @@ def _results_message_to_execute_response(self, resp, operation_state):
         if t_result_set_metadata_resp.resultFormat not in [
             ttypes.TSparkRowSetType.ARROW_BASED_SET,
             ttypes.TSparkRowSetType.COLUMN_BASED_SET,
+            ttypes.TSparkRowSetType.URL_BASED_SET,
         ]:
             raise OperationalError(
                 "Expected results to be in Arrow or column based format, "
@@ -783,13 +696,14 @@ def _results_message_to_execute_response(self, resp, operation_state):
             assert direct_results.resultSet.results.startRowOffset == 0
             assert direct_results.resultSetMetadata

-            arrow_results, n_rows = self._create_arrow_table(
-                direct_results.resultSet.results,
-                lz4_compressed,
-                schema_bytes,
-                description,
+            arrow_queue_opt = ResultSetQueueFactory.build_queue(
+                row_set_type=t_result_set_metadata_resp.resultFormat,
+                t_row_set=direct_results.resultSet.results,
+                arrow_schema_bytes=schema_bytes,
+                max_download_threads=self.max_download_threads,
+                lz4_compressed=lz4_compressed,
+                description=description,
             )
-            arrow_queue_opt = ArrowQueue(arrow_results, n_rows, 0)
         else:
             arrow_queue_opt = None
         return ExecuteResponse(
@@ -843,7 +757,14 @@ def _check_direct_results_for_error(t_spark_direct_results):
             )

     def execute_command(
-        self, operation, session_handle, max_rows, max_bytes, lz4_compression, cursor
+        self,
+        operation,
+        session_handle,
+        max_rows,
+        max_bytes,
+        lz4_compression,
+        cursor,
+        use_cloud_fetch=False,
     ):
         assert session_handle is not None

@@ -864,7 +785,7 @@ def execute_command(
             ),
             canReadArrowResult=True,
             canDecompressLZ4Result=lz4_compression,
-            canDownloadResult=False,
+            canDownloadResult=use_cloud_fetch,
             confOverlay={
                 # We want to receive proper Timestamp arrow types.
                 "spark.thriftserver.arrowBasedRowSet.timestampAsString": "false"
@@ -993,6 +914,7 @@ def fetch_results(
             maxRows=max_rows,
             maxBytes=max_bytes,
             orientation=ttypes.TFetchOrientation.FETCH_NEXT,
+            includeResultSetMetadata=True,
         )

         resp = self.make_request(self._client.FetchResults, req)
@@ -1002,12 +924,17 @@ def fetch_results(
                     expected_row_start_offset, resp.results.startRowOffset
                 )
             )
-        arrow_results, n_rows = self._create_arrow_table(
-            resp.results, lz4_compressed, arrow_schema_bytes, description
+
+        queue = ResultSetQueueFactory.build_queue(
+            row_set_type=resp.resultSetMetadata.resultFormat,
+            t_row_set=resp.results,
+            arrow_schema_bytes=arrow_schema_bytes,
+            max_download_threads=self.max_download_threads,
+            lz4_compressed=lz4_compressed,
+            description=description,
         )
-        arrow_queue = ArrowQueue(arrow_results, n_rows)

-        return arrow_queue, resp.hasMoreRows
+        return queue, resp.hasMoreRows

     def close_command(self, op_handle):
         req = ttypes.TCloseOperationReq(operationHandle=op_handle)
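Both call sites above (direct results in _results_message_to_execute_response and incremental fetches in fetch_results) now delegate queue construction to ResultSetQueueFactory.build_queue, which dispatches on the Thrift result format. The factory itself lives in databricks/sql/utils.py and is not part of the diffs shown on this page; the skeleton below only illustrates the dispatch implied by the three accepted TSparkRowSetType values, with the branch bodies intentionally left unimplemented and the ttypes import path taken from thrift_backend.py:

# Illustrative skeleton only; the real implementation is in databricks/sql/utils.py.
from databricks.sql.thrift_api.TCLIService import ttypes


def build_queue_sketch(
    row_set_type, t_row_set, arrow_schema_bytes, max_download_threads, lz4_compressed, description
):
    if row_set_type == ttypes.TSparkRowSetType.ARROW_BASED_SET:
        # Inline Arrow batches: decompress (if LZ4-compressed) and assemble an
        # in-memory Arrow queue, much as the removed _create_arrow_table path did.
        raise NotImplementedError("assemble queue from t_row_set.arrowBatches")
    elif row_set_type == ttypes.TSparkRowSetType.COLUMN_BASED_SET:
        # Column-based rows: convert the columns to an Arrow table, then queue it.
        raise NotImplementedError("assemble queue from t_row_set.columns")
    elif row_set_type == ttypes.TSparkRowSetType.URL_BASED_SET:
        # Cloud fetch: the row set carries links to result files in cloud storage,
        # downloaded in parallel by up to max_download_threads worker threads.
        raise NotImplementedError("build a cloud fetch queue via the download manager")
    else:
        raise AssertionError("Row set type {} is not supported".format(row_set_type))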

0 commit comments
