- Notifications
You must be signed in to change notification settings - Fork 126
[ PECO-2065 ] Create the async execution flow for the PySQL Connector#463
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Uh oh!
There was an error while loading. Please reload this page.
Changes fromall commits
e637408a174370925b2a3756ac17beffa2f8bf4442b44b29869b32e90511690File filter
Filter by extension
Conversations
Uh oh!
There was an error while loading.Please reload this page.
Jump to
Uh oh!
There was an error while loading.Please reload this page.
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,4 @@ | ||
| import time | ||
| from typing import Dict, Tuple, List, Optional, Any, Union, Sequence | ||
| import pandas | ||
| @@ -47,6 +48,7 @@ | ||
| from databricks.sql.thrift_api.TCLIService.ttypes import ( | ||
| TSparkParameter, | ||
| TOperationState, | ||
| ) | ||
| @@ -430,6 +432,8 @@ def __init__( | ||
| self.escaper = ParamEscaper() | ||
| self.lastrowid = None | ||
| self.ASYNC_DEFAULT_POLLING_INTERVAL = 2 | ||
| # The ideal return type for this method is perhaps Self, but that was not added until 3.11, and we support pre-3.11 pythons, currently. | ||
| def __enter__(self) -> "Cursor": | ||
| return self | ||
| @@ -796,6 +800,7 @@ def execute( | ||
| cursor=self, | ||
| use_cloud_fetch=self.connection.use_cloud_fetch, | ||
| parameters=prepared_params, | ||
| async_op=False, | ||
| ) | ||
| self.active_result_set = ResultSet( | ||
Collaborator There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. result set is not ready yet when async_op is ContributorAuthor There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. The result set that is returned over here is empty and does not have any data. Collaborator There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. I know, but this will make the code confusing and I do not think it is is necessary. ContributorAuthor There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. I did this to keep the same logical flow for both execute_async and execute. Like in execute the active_result_set has data and in execute_async since there is no data so it is none. Once data is available the active_result_set will again have data, so logically I felt it made sense Collaborator There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. the return result comes from the returned value from execute_command in the sync flow, which means it is not ready until the sync completes, this is why I said it is confusing as it should be set in the completion of the async operation, this is standard practice/ways for most of the async code ( There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. I got it what Jacky meant here. This is about confusing the users regarding the interface. We can keep the logic internally same, but don't need to keep the interface same for async and sync. 
For async what matters is the operationHandle. We can have different interface for both, but internally can reuse the code if possible. ContributorAuthor There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. @gopalldb@jackyhu-db I have changed the code, based on these suggestions. | ||
| self.connection, | ||
| @@ -812,6 +817,106 @@ def execute( | ||
| return self | ||
def execute_async(
    self,
    operation: str,
    parameters: Optional[TParameterCollection] = None,
) -> "Cursor":
    """
    Submit a query for execution without waiting for it to complete.

    No result set is attached to the cursor yet; callers poll progress via
    ``get_query_state()`` and retrieve rows later with
    ``get_async_execution_result()``.

    :param operation: SQL text to execute; may contain parameter markers.
    :param parameters: optional parameter collection, bound inline or
        natively depending on the determined parameter approach.
    :return: this cursor (with ``active_op_handle`` set by the backend).
    """
    # Fail fast: reject a closed cursor before doing any parameter-
    # preparation work (the original checked only after preparation).
    self._check_not_closed()

    param_approach = self._determine_parameter_approach(parameters)
    if param_approach == ParameterApproach.NONE:
        prepared_params = NO_NATIVE_PARAMS
        prepared_operation = operation
    elif param_approach == ParameterApproach.INLINE:
        prepared_operation, prepared_params = self._prepare_inline_parameters(
            operation, parameters
        )
    elif param_approach == ParameterApproach.NATIVE:
        normalized_parameters = self._normalize_tparametercollection(parameters)
        param_structure = self._determine_parameter_structure(normalized_parameters)
        transformed_operation = transform_paramstyle(
            operation, normalized_parameters, param_structure
        )
        prepared_operation, prepared_params = self._prepare_native_parameters(
            transformed_operation, normalized_parameters, param_structure
        )

    self._close_and_clear_active_result_set()
    # async_op=True: the backend records the operation handle on this cursor
    # instead of blocking and returning an ExecuteResponse.
    self.thrift_backend.execute_command(
        operation=prepared_operation,
        session_handle=self.connection._session_handle,
        max_rows=self.arraysize,
        max_bytes=self.buffer_size_bytes,
        lz4_compression=self.connection.lz4_compression,
        cursor=self,
        use_cloud_fetch=self.connection.use_cloud_fetch,
        parameters=prepared_params,
        async_op=True,
    )
    return self
def get_query_state(self) -> "TOperationState":
    """
    Poll the server once for the current state of the async query.

    :return: the ``TOperationState`` reported for the active operation handle.
    """
    self._check_not_closed()
    state = self.thrift_backend.get_query_state(self.active_op_handle)
    return state
def get_async_execution_result(self):
    """
    Block until the asynchronously submitted query leaves the
    PENDING/RUNNING states, then fetch and attach its result set.

    Polls every ``ASYNC_DEFAULT_POLLING_INTERVAL`` seconds. On
    FINISHED_STATE the cursor's ``active_result_set`` is populated and
    ``self`` is returned; any other terminal state raises ``Error``.

    :return: this cursor, with the result set attached.
    :raises Error: if the query terminated in a non-FINISHED state.
    """
    self._check_not_closed()

    def _is_executing(state) -> bool:
        # A missing state (None) is treated as still-executing.
        return not state or state in (
            ttypes.TOperationState.RUNNING_STATE,
            ttypes.TOperationState.PENDING_STATE,
        )

    # Remember the last observed state so we do not issue an extra status
    # RPC after the loop exits. The original re-polled after the loop,
    # which wasted a round trip and could act on a different state than
    # the one that actually ended the loop.
    operation_state = self.get_query_state()
    while _is_executing(operation_state):
        time.sleep(self.ASYNC_DEFAULT_POLLING_INTERVAL)
        operation_state = self.get_query_state()

    if operation_state == ttypes.TOperationState.FINISHED_STATE:
        execute_response = self.thrift_backend.get_execution_result(
            self.active_op_handle, self
        )
        self.active_result_set = ResultSet(
            self.connection,
            execute_response,
            self.thrift_backend,
            self.buffer_size_bytes,
            self.arraysize,
        )
        # Staging operations (PUT/GET/REMOVE) need local file handling.
        if execute_response.is_staging_operation:
            self._handle_staging_operation(
                staging_allowed_local_path=self.thrift_backend.staging_allowed_local_path
            )
        return self
    else:
        raise Error(
            f"get_execution_result failed with Operation status {operation_state}"
        )
| def executemany(self, operation, seq_of_parameters): | ||
| """ | ||
| Execute the operation once for every set of passed in parameters. | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -7,6 +7,8 @@ | ||
| import threading | ||
| from typing import List, Union | ||
| from databricks.sql.thrift_api.TCLIService.ttypes import TOperationState | ||
| try: | ||
| import pyarrow | ||
| except ImportError: | ||
| @@ -769,6 +771,63 @@ def _results_message_to_execute_response(self, resp, operation_state): | ||
| arrow_schema_bytes=schema_bytes, | ||
| ) | ||
def get_execution_result(self, op_handle, cursor):
    """
    Fetch the first batch of results for a completed async operation and
    package them as an ``ExecuteResponse``.

    :param op_handle: thrift operation handle of the finished query.
    :param cursor: cursor supplying ``arraysize`` / ``buffer_size_bytes``.
    """
    assert op_handle is not None

    fetch_req = ttypes.TFetchResultsReq(
        operationHandle=ttypes.TOperationHandle(
            op_handle.operationId,
            op_handle.operationType,
            False,
            op_handle.modifiedRowCount,
        ),
        maxRows=cursor.arraysize,
        maxBytes=cursor.buffer_size_bytes,
        orientation=ttypes.TFetchOrientation.FETCH_NEXT,
        includeResultSetMetadata=True,
    )
    resp = self.make_request(self._client.FetchResults, fetch_req)

    metadata = resp.resultSetMetadata
    is_compressed = metadata.lz4Compressed
    staging_op = metadata.isStagingOperation
    more_rows = resp.hasMoreRows
    description = self._hive_schema_to_description(metadata.schema)

    # Prefer the server-provided Arrow schema; otherwise derive one from
    # the Hive schema and serialize it.
    if metadata.arrowSchema:
        schema_bytes = metadata.arrowSchema
    else:
        schema_bytes = (
            self._hive_schema_to_arrow_schema(metadata.schema)
            .serialize()
            .to_pybytes()
        )

    queue = ResultSetQueueFactory.build_queue(
        row_set_type=metadata.resultFormat,
        t_row_set=resp.results,
        arrow_schema_bytes=schema_bytes,
        max_download_threads=self.max_download_threads,
        lz4_compressed=is_compressed,
        description=description,
        ssl_options=self._ssl_options,
    )

    return ExecuteResponse(
        arrow_queue=queue,
        status=resp.status,
        has_been_closed_server_side=False,
        has_more_rows=more_rows,
        lz4_compressed=is_compressed,
        is_staging_operation=staging_op,
        command_handle=op_handle,
        description=description,
        arrow_schema_bytes=schema_bytes,
    )
| def _wait_until_command_done(self, op_handle, initial_operation_status_resp): | ||
| if initial_operation_status_resp: | ||
| self._check_command_not_in_error_or_closed_state( | ||
| @@ -787,6 +846,12 @@ def _wait_until_command_done(self, op_handle, initial_operation_status_resp): | ||
| self._check_command_not_in_error_or_closed_state(op_handle, poll_resp) | ||
| return operation_state | ||
def get_query_state(self, op_handle) -> "TOperationState":
    """
    Perform a single status poll for an operation; raises if the operation
    is in an error or closed state.
    """
    resp = self._poll_for_status(op_handle)
    state = resp.operationState
    self._check_command_not_in_error_or_closed_state(op_handle, resp)
    return state
| @staticmethod | ||
| def _check_direct_results_for_error(t_spark_direct_results): | ||
| if t_spark_direct_results: | ||
| @@ -817,6 +882,7 @@ def execute_command( | ||
| cursor, | ||
| use_cloud_fetch=True, | ||
| parameters=[], | ||
| async_op=False, | ||
| ): | ||
| assert session_handle is not None | ||
| @@ -846,7 +912,11 @@ def execute_command( | ||
| parameters=parameters, | ||
| ) | ||
| resp = self.make_request(self._client.ExecuteStatement, req) | ||
| if async_op: | ||
| self._handle_execute_response_async(resp, cursor) | ||
| else: | ||
| return self._handle_execute_response(resp, cursor) | ||
| def get_catalogs(self, session_handle, max_rows, max_bytes, cursor): | ||
| assert session_handle is not None | ||
| @@ -945,6 +1015,10 @@ def _handle_execute_response(self, resp, cursor): | ||
| return self._results_message_to_execute_response(resp, final_operation_state) | ||
def _handle_execute_response_async(self, resp, cursor):
    # Async path: instead of waiting for completion and building an
    # ExecuteResponse, record the operation handle on the cursor so the
    # caller can poll it later (get_query_state / get_async_execution_result).
    cursor.active_op_handle = resp.operationHandle
    # Direct results may already carry a server-side error; surface it now.
    self._check_direct_results_for_error(resp.directResults)
| def fetch_results( | ||
| self, | ||
| op_handle, | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -36,6 +36,7 @@ | ||
| compare_dbr_versions, | ||
| is_thrift_v5_plus, | ||
| ) | ||
| from databricks.sql.thrift_api.TCLIService import ttypes | ||
| from tests.e2e.common.core_tests import CoreTestMixin, SmokeTestMixin | ||
| from tests.e2e.common.large_queries_mixin import LargeQueriesMixin | ||
| from tests.e2e.common.timestamp_tests import TimestampTestsMixin | ||
| @@ -78,6 +79,7 @@ class PySQLPytestTestCase: | ||
| } | ||
| arraysize = 1000 | ||
| buffer_size_bytes = 104857600 | ||
| POLLING_INTERVAL = 2 | ||
| @pytest.fixture(autouse=True) | ||
| def get_details(self, connection_details): | ||
| @@ -175,6 +177,27 @@ def test_cloud_fetch(self): | ||
| for i in range(len(cf_result)): | ||
| assert cf_result[i] == noop_result[i] | ||
def test_execute_async(self):
    """End-to-end check of the execute_async -> poll -> fetch flow."""

    def still_running(state):
        # A missing state (None) is treated as still-executing.
        return not state or state in (
            ttypes.TOperationState.RUNNING_STATE,
            ttypes.TOperationState.PENDING_STATE,
        )

    long_running_query = "SELECT COUNT(*) FROM RANGE(10000 * 16) x JOIN RANGE(10000) y ON FROM_UNIXTIME(x.id * y.id, 'yyyy-MM-dd') LIKE '%not%a%date%'"
    with self.cursor() as cursor:
        cursor.execute_async(long_running_query)
        # Poll every POLLING_INTERVAL seconds until the query leaves the
        # pending/running states.
        while still_running(cursor.get_query_state()):
            time.sleep(self.POLLING_INTERVAL)
            log.info("Polling the status in test_execute_async")
        cursor.get_async_execution_result()
        result = cursor.fetchall()
        assert result[0].asDict() == {"count(1)": 0}
| # Exclude Retry tests because they require specific setups, and LargeQueries too slow for core | ||
| # tests | ||
Uh oh!
There was an error while loading.Please reload this page.