Commit 1e6c2e9

authored

Add more debug logging for CloudFetch (#395)

Signed-off-by: Levko Kravets <levko.ne@gmail.com>

1 parent 6c16b70 · commit 1e6c2e9 · Copy full SHA for 1e6c2e9

File tree

2 files changed

+84

-2

lines changed

src/databricks/sql
- cloudfetch
  - download_manager.py
- utils.py

2 files changed

+84

-2

lines changed

`‎src/databricks/sql/cloudfetch/download_manager.py‎`

Lines changed: 49 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -49,6 +49,11 @@ def add_file_links(`
`49`	`49`	`forlinkint_spark_arrow_result_links:`
`50`	`50`	`iflink.rowCount<=0:`
`51`	`51`	`continue`
	`52`	`+logger.debug(`
	`53`	`+"ResultFileDownloadManager.add_file_links: start offset {}, row count: {}".format(`
	`54`	`+link.startRowOffset,link.rowCount`
	`55`	`+ )`
	`56`	`+ )`
`52`	`57`	`self.download_handlers.append(`
`53`	`58`	`ResultSetDownloadHandler(self.downloadable_result_settings,link)`
`54`	`59`	`)`
`@@ -88,6 +93,12 @@ def get_next_downloaded_file(`
`88`	`93`
`89`	`94`	`# Check (and wait) for download status`
`90`	`95`	`ifself._check_if_download_successful(handler):`
	`96`	`+link=handler.result_link`
	`97`	`+logger.debug(`
	`98`	`+"ResultFileDownloadManager: file found for row index {}: start {}, row count: {}".format(`
	`99`	`+next_row_offset,link.startRowOffset,link.rowCount`
	`100`	`+ )`
	`101`	`+ )`
`91`	`102`	`# Buffer should be empty so set buffer to new ArrowQueue with result_file`
`92`	`103`	`result=DownloadedFile(`
`93`	`104`	`handler.result_file,`
`@@ -97,40 +108,78 @@ def get_next_downloaded_file(`
`97`	`108`	`self.download_handlers.pop(idx)`
`98`	`109`	`# Return True upon successful download to continue loop and not force a retry`
`99`	`110`	`returnresult`
	`111`	`+else:`
	`112`	`+logger.debug(`
	`113`	`+"ResultFileDownloadManager: cannot find file for row index {}".format(`
	`114`	`+next_row_offset`
	`115`	`+ )`
	`116`	`+ )`
	`117`	`+`
`100`	`118`	`# Download was not successful for next download item, force a retry`
`101`	`119`	`self._shutdown_manager()`
`102`	`120`	`returnNone`
`103`	`121`
`104`	`122`	`def_remove_past_handlers(self,next_row_offset:int):`
	`123`	`+logger.debug(`
	`124`	`+"ResultFileDownloadManager: removing past handlers, current offset: {}".format(`
	`125`	`+next_row_offset`
	`126`	`+ )`
	`127`	`+ )`
`105`	`128`	`# Any link in which its start to end range doesn't include the next row to be fetched does not need downloading`
`106`	`129`	`i=0`
`107`	`130`	`whilei<len(self.download_handlers):`
`108`	`131`	`result_link=self.download_handlers[i].result_link`
	`132`	`+logger.debug(`
	`133`	`+"- checking result link: start {}, row count: {}, current offset: {}".format(`
	`134`	`+result_link.startRowOffset,result_link.rowCount,next_row_offset`
	`135`	`+ )`
	`136`	`+ )`
`109`	`137`	`ifresult_link.startRowOffset+result_link.rowCount>next_row_offset:`
`110`	`138`	`i+=1`
`111`	`139`	`continue`
`112`	`140`	`self.download_handlers.pop(i)`
`113`	`141`
`114`	`142`	`def_schedule_downloads(self):`
`115`	`143`	`# Schedule downloads for all download handlers if not already scheduled.`
	`144`	`+logger.debug("ResultFileDownloadManager: schedule downloads")`
`116`	`145`	`forhandlerinself.download_handlers:`
`117`	`146`	`ifhandler.is_download_scheduled:`
`118`	`147`	`continue`
`119`	`148`	`try:`
	`149`	`+logger.debug(`
	`150`	`+"- start: {}, row count: {}".format(`
	`151`	`+handler.result_link.startRowOffset,handler.result_link.rowCount`
	`152`	`+ )`
	`153`	`+ )`
`120`	`154`	`self.thread_pool.submit(handler.run)`
`121`	`155`	`exceptExceptionase:`
`122`	`156`	`logger.error(e)`
`123`	`157`	`break`
`124`	`158`	`handler.is_download_scheduled=True`
`125`	`159`
`126`	`160`	`def_find_next_file_index(self,next_row_offset:int):`
	`161`	`+logger.debug(`
	`162`	`+"ResultFileDownloadManager: trying to find file for row {}".format(`
	`163`	`+next_row_offset`
	`164`	`+ )`
	`165`	`+ )`
`127`	`166`	`# Get the handler index of the next file in order`
`128`	`167`	`next_indices= [`
`129`	`168`	`i`
`130`	`169`	`fori,handlerinenumerate(self.download_handlers)`
`131`	`170`	`ifhandler.is_download_scheduled`
	`171`	+# TODO: shouldn't `next_row_offset` be tested against the range, not just start row offset?
`132`	`172`	`andhandler.result_link.startRowOffset==next_row_offset`
`133`	`173`	`]`
	`174`	`+`
	`175`	`+foriinnext_indices:`
	`176`	`+link=self.download_handlers[i].result_link`
	`177`	`+logger.debug(`
	`178`	`+"- found file: start {}, row count {}".format(`
	`179`	`+link.startRowOffset,link.rowCount`
	`180`	`+ )`
	`181`	`+ )`
	`182`	`+`
`134`	`183`	`returnnext_indices[0]iflen(next_indices)>0elseNone`
`135`	`184`
`136`	`185`	`def_check_if_download_successful(self,handler:ResultSetDownloadHandler):`

`‎src/databricks/sql/utils.py‎`

Lines changed: 35 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -156,6 +156,19 @@ def __init__(`
`156`	`156`	`self.lz4_compressed=lz4_compressed`
`157`	`157`	`self.description=description`
`158`	`158`
	`159`	`+logger.debug(`
	`160`	`+"Initialize CloudFetch loader, row set start offset: {}, file list:".format(`
	`161`	`+start_row_offset`
	`162`	`+ )`
	`163`	`+ )`
	`164`	`+ifresult_linksisnotNone:`
	`165`	`+forresult_linkinresult_links:`
	`166`	`+logger.debug(`
	`167`	`+"- start row offset: {}, row count: {}".format(`
	`168`	`+result_link.startRowOffset,result_link.rowCount`
	`169`	`+ )`
	`170`	`+ )`
	`171`	`+`
`159`	`172`	`self.download_manager=ResultFileDownloadManager(`
`160`	`173`	`self.max_download_threads,self.lz4_compressed`
`161`	`174`	`)`
`@@ -175,8 +188,10 @@ def next_n_rows(self, num_rows: int) -> pyarrow.Table:`
`175`	`188`	`pyarrow.Table`
`176`	`189`	`"""`
`177`	`190`	`ifnotself.table:`
	`191`	`+logger.debug("CloudFetchQueue: no more rows available")`
`178`	`192`	`# Return empty pyarrow table to cause retry of fetch`
`179`	`193`	`returnself._create_empty_table()`
	`194`	`+logger.debug("CloudFetchQueue: trying to get {} next rows".format(num_rows))`
`180`	`195`	`results=self.table.slice(0,0)`
`181`	`196`	`whilenum_rows>0andself.table:`
`182`	`197`	`# Get remaining of num_rows or the rest of the current table, whichever is smaller`
`@@ -190,6 +205,8 @@ def next_n_rows(self, num_rows: int) -> pyarrow.Table:`
`190`	`205`	`self.table=self._create_next_table()`
`191`	`206`	`self.table_row_index=0`
`192`	`207`	`num_rows-=table_slice.num_rows`
	`208`	`+`
	`209`	`+logger.debug("CloudFetchQueue: collected {} next rows".format(results.num_rows))`
`193`	`210`	`returnresults`
`194`	`211`
`195`	`212`	`defremaining_rows(self)->pyarrow.Table:`
`@@ -214,11 +231,21 @@ def remaining_rows(self) -> pyarrow.Table:`
`214`	`231`	`returnresults`
`215`	`232`
`216`	`233`	`def_create_next_table(self)->Union[pyarrow.Table,None]:`
	`234`	`+logger.debug(`
	`235`	`+"CloudFetchQueue: Trying to get downloaded file for row {}".format(`
	`236`	`+self.start_row_index`
	`237`	`+ )`
	`238`	`+ )`
`217`	`239`	`# Create next table by retrieving the logical next downloaded file, or return None to signal end of queue`
`218`	`240`	`downloaded_file=self.download_manager.get_next_downloaded_file(`
`219`	`241`	`self.start_row_index`
`220`	`242`	`)`
`221`	`243`	`ifnotdownloaded_file:`
	`244`	`+logger.debug(`
	`245`	`+"CloudFetchQueue: Cannot find downloaded file for row {}".format(`
	`246`	`+self.start_row_index`
	`247`	`+ )`
	`248`	`+ )`
`222`	`249`	`# None signals no more Arrow tables can be built from the remaining handlers if any remain`
`223`	`250`	`returnNone`
`224`	`251`	`arrow_table=create_arrow_table_from_arrow_file(`
`@@ -228,12 +255,18 @@ def _create_next_table(self) -> Union[pyarrow.Table, None]:`
`228`	`255`	`# The server rarely prepares the exact number of rows requested by the client in cloud fetch.`
`229`	`256`	`# Subsequently, we drop the extraneous rows in the last file if more rows are retrieved than requested`
`230`	`257`	`ifarrow_table.num_rows>downloaded_file.row_count:`
`231`		`-self.start_row_index+=downloaded_file.row_count`
`232`		`-returnarrow_table.slice(0,downloaded_file.row_count)`
	`258`	`+arrow_table=arrow_table.slice(0,downloaded_file.row_count)`
`233`	`259`
`234`	`260`	`# At this point, whether the file has extraneous rows or not, the arrow table should have the correct num rows`
`235`	`261`	`assertdownloaded_file.row_count==arrow_table.num_rows`
`236`	`262`	`self.start_row_index+=arrow_table.num_rows`
	`263`	`+`
	`264`	`+logger.debug(`
	`265`	`+"CloudFetchQueue: Found downloaded file, row count: {}, new start offset: {}".format(`
	`266`	`+arrow_table.num_rows,self.start_row_index`
	`267`	`+ )`
	`268`	`+ )`
	`269`	`+`
`237`	`270`	`returnarrow_table`
`238`	`271`
`239`	`272`	`def_create_empty_table(self)->pyarrow.Table:`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 1e6c2e9

File tree

2 files changed

2 files changed

`‎src/databricks/sql/cloudfetch/download_manager.py‎`

`‎src/databricks/sql/utils.py‎`

0 commit comments