Commit 12fa9fb

fix: retry query job after ambiguous failures

Parent: 6a43cfd

2 files changed: 189 additions, 5 deletions

google/cloud/bigquery/retry.py

Lines changed: 24 additions & 2 deletions
@@ -39,7 +39,7 @@
 # Allow for a few retries after the API request times out. This is relevant for
 # rateLimitExceeded errors, which can be raised either by the Google load
 # balancer or the BigQuery job server.
-_DEFAULT_JOB_DEADLINE = 3.0 * _DEFAULT_RETRY_DEADLINE
+_DEFAULT_JOB_DEADLINE = 4.0 * _DEFAULT_RETRY_DEADLINE


 def _should_retry(exc):
@@ -73,10 +73,32 @@ def _should_retry(exc):
 deadline on the retry object.
 """

-job_retry_reasons = "rateLimitExceeded", "backendError", "jobRateLimitExceeded"
+job_retry_reasons = (
+    "rateLimitExceeded",
+    "backendError",
+    "internalError",
+    "jobRateLimitExceeded",
+)


 def _job_should_retry(exc):
+    # Sometimes we have ambiguous errors, such as 'backendError' which could
+    # be due to an API problem or a job problem. For these, make sure we retry
+    # our is_job_done function.
+    #
+    # Note: This won't restart the job unless we know for sure it's because of
+    # the job status and set restart_query_job = True in that loop. This means
+    # that we might end up calling this predicate twice for the same job
+    # but from different paths: (1) from jobs.getQueryResults RetryError and
+    # (2) from translating the job error from the body of a jobs.get response.
+    #
+    # Note: If we start retrying job types other than queries where we don't
+    # call the problematic getQueryResults API to check the status, we need
+    # to provide a different predicate, as there shouldn't be ambiguous
+    # errors in those cases.
+    if isinstance(exc, exceptions.RetryError):
+        exc = exc.cause
+
     if not hasattr(exc, "errors") or len(exc.errors) == 0:
         return False
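
For reference, here is a minimal sketch (not part of this commit) of how the updated predicate handles an ambiguous error. It assumes google-cloud-bigquery with this change and google-api-core are installed, and it imports the private _job_should_retry helper purely for illustration:

import google.api_core.exceptions
from google.cloud.bigquery.retry import _job_should_retry

# An ambiguous error of the kind described above: it may indicate an API-level
# problem, or a job that actually failed for a transient reason.
ambiguous = google.api_core.exceptions.InternalServerError(
    "Job failed just because...",
    errors=[{"reason": "internalError"}],
)

# Raised directly by an API call: "internalError" is now in job_retry_reasons,
# so the job-level retry treats it as retriable.
assert _job_should_retry(ambiguous)

# Raised indirectly, after the API-level retry (for example around
# jobs.getQueryResults) gives up and wraps the last exception in a RetryError:
# the predicate now unwraps exc.cause before checking the error reasons.
wrapped = google.api_core.exceptions.RetryError(
    "Deadline exceeded while polling", cause=ambiguous
)
assert _job_should_retry(wrapped)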

tests/unit/test_job_retry.py

Lines changed: 165 additions & 3 deletions
@@ -24,7 +24,7 @@

 from google.cloud.bigquery.client import Client
 from google.cloud.bigquery import _job_helpers
-from google.cloud.bigquery.retry import DEFAULT_JOB_RETRY
+import google.cloud.bigquery.retry

 from .helpers import make_connection

@@ -126,6 +126,168 @@ def api_request(method, path, query_params=None, data=None, **kw):
     assert job.job_id == orig_job_id


+def test_query_retry_with_default_retry_and_ambiguous_errors_only_retries_with_failed_job(
+    client, monkeypatch
+):
+    """
+    Some errors like 'rateLimitExceeded' can be ambiguous. Make sure we only
+    retry the job when we know for sure that the job has failed for a retriable
+    reason. We can only be sure after a "successful" call to jobs.get to fetch
+    the failed job status.
+    """
+    job_counter = 0
+
+    def make_job_id(*args, **kwargs):
+        nonlocal job_counter
+        job_counter += 1
+        return f"{job_counter}"
+
+    monkeypatch.setattr(_job_helpers, "make_job_id", make_job_id)
+
+    project = client.project
+    job_reference_1 = {"projectId": project, "jobId": "1", "location": "test-loc"}
+    job_reference_2 = {"projectId": project, "jobId": "2", "location": "test-loc"}
+    NUM_API_RETRIES = 2
+
+    # This error is modeled after a real customer exception in
+    # https://github.com/googleapis/python-bigquery/issues/707.
+    internal_error = google.api_core.exceptions.InternalServerError(
+        "Job failed just because...",
+        errors=[
+            {"reason": "internalError"},
+        ],
+    )
+    responses = [
+        # jobs.insert
+        {"jobReference": job_reference_1, "status": {"state": "PENDING"}},
+        # jobs.get
+        {"jobReference": job_reference_1, "status": {"state": "RUNNING"}},
+        # jobs.getQueryResults x2
+        #
+        # Note: internalError is ambiguous in jobs.getQueryResults. The
+        # problem could be at the Google Frontend level or it could be because
+        # the job has failed due to some transient issues and the BigQuery
+        # REST API is translating the job failed status into failure HTTP
+        # codes.
+        #
+        # TODO(GH#1903): We shouldn't retry nearly this many times when we get
+        # ambiguous errors from jobs.getQueryResults.
+        # See: https://github.com/googleapis/python-bigquery/issues/1903
+        internal_error,
+        internal_error,
+        # jobs.get -- the job has failed
+        {
+            "jobReference": job_reference_1,
+            "status": {"state": "DONE", "errorResult": {"reason": "internalError"}},
+        },
+        # jobs.insert
+        {"jobReference": job_reference_2, "status": {"state": "PENDING"}},
+        # jobs.get
+        {"jobReference": job_reference_2, "status": {"state": "RUNNING"}},
+        # jobs.getQueryResults
+        {"jobReference": job_reference_2, "jobComplete": True},
+        # jobs.get
+        {"jobReference": job_reference_2, "status": {"state": "DONE"}},
+    ]
+
+    conn = client._connection = make_connection()
+    conn.api_request.side_effect = responses
+
+    with freezegun.freeze_time(
+        # Note: because of exponential backoff and a bit of jitter,
+        # NUM_API_RETRIES will get less accurate the greater the value.
+        # We add 1 because we know there will be at least some additional
+        # calls to fetch the time / sleep before the retry deadline is hit.
+        auto_tick_seconds=(
+            google.cloud.bigquery.retry._DEFAULT_RETRY_DEADLINE / NUM_API_RETRIES
+        )
+        + 1,
+    ):
+        job = client.query("select 1")
+        job.result()
+
+    conn.api_request.assert_has_calls(
+        [
+            # jobs.insert
+            mock.call(
+                method="POST",
+                path="/projects/PROJECT/jobs",
+                data={
+                    "jobReference": {"jobId": "1", "projectId": "PROJECT"},
+                    "configuration": {
+                        "query": {"useLegacySql": False, "query": "select 1"}
+                    },
+                },
+                timeout=None,
+            ),
+            # jobs.get
+            mock.call(
+                method="GET",
+                path="/projects/PROJECT/jobs/1",
+                query_params={"location": "test-loc"},
+                timeout=None,
+            ),
+            # jobs.getQueryResults x2
+            mock.call(
+                method="GET",
+                path="/projects/PROJECT/queries/1",
+                query_params={"maxResults": 0, "location": "test-loc"},
+                timeout=None,
+            ),
+            mock.call(
+                method="GET",
+                path="/projects/PROJECT/queries/1",
+                query_params={"maxResults": 0, "location": "test-loc"},
+                timeout=None,
+            ),
+            # jobs.get -- verify that the job has failed
+            mock.call(
+                method="GET",
+                path="/projects/PROJECT/jobs/1",
+                query_params={"location": "test-loc"},
+                timeout=None,
+            ),
+            # jobs.insert
+            mock.call(
+                method="POST",
+                path="/projects/PROJECT/jobs",
+                data={
+                    "jobReference": {
+                        # Make sure that we generated a new job ID.
+                        "jobId": "2",
+                        "projectId": "PROJECT",
+                    },
+                    "configuration": {
+                        "query": {"useLegacySql": False, "query": "select 1"}
+                    },
+                },
+                timeout=None,
+            ),
+            # jobs.get
+            mock.call(
+                method="GET",
+                path="/projects/PROJECT/jobs/2",
+                query_params={"location": "test-loc"},
+                timeout=None,
+            ),
+            # jobs.getQueryResults
+            mock.call(
+                method="GET",
+                path="/projects/PROJECT/queries/2",
+                query_params={"maxResults": 0, "location": "test-loc"},
+                timeout=None,
+            ),
+            # jobs.get
+            mock.call(
+                method="GET",
+                path="/projects/PROJECT/jobs/2",
+                query_params={"location": "test-loc"},
+                timeout=None,
+            ),
+        ]
+    )
+
+
 # With job_retry_on_query, we're testing 4 scenarios:
 # - Pass None retry to `query`.
 # - Pass None retry to `result`.
@@ -301,8 +463,8 @@ def test_query_and_wait_retries_job_for_DDL_queries():
         job_config=None,
         page_size=None,
         max_results=None,
-        retry=DEFAULT_JOB_RETRY,
-        job_retry=DEFAULT_JOB_RETRY,
+        retry=google.cloud.bigquery.retry.DEFAULT_RETRY,
+        job_retry=google.cloud.bigquery.retry.DEFAULT_JOB_RETRY,
     )
     assert len(list(rows)) == 4
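
As an aside, the test above leans on freezegun's auto_tick_seconds to burn through the retry deadline without real sleeps. A minimal standalone sketch of that mechanism, assuming only that freezegun is installed:

import datetime

import freezegun

# Each read of the frozen clock advances it by auto_tick_seconds, so code that
# checks "has the retry deadline passed?" sees time fly by without any waiting.
with freezegun.freeze_time("2024-01-01", auto_tick_seconds=60):
    first = datetime.datetime.now()
    second = datetime.datetime.now()
    assert second - first == datetime.timedelta(seconds=60)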