Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitd676ad5

Browse files
authored
chore: log provider stack traces on text file busy (#15078)
re:#14726If we see "text file busy" in the errors while initializing terraform,attempt to query the pprof endpoint set up bycoder/terraform-provider-coder#295 and log atCRITICAL.---------Signed-off-by: Spike Curtis <spike@coder.com>
1 parenta77c907 commitd676ad5

File tree

4 files changed

+235
-5
lines changed

4 files changed

+235
-5
lines changed

‎provisioner/terraform/executor.go

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,15 @@ func versionFromBinaryPath(ctx context.Context, binaryPath string) (*version.Ver
200200
returnversion.NewVersion(vj.Version)
201201
}
202202

203+
typetextFileBusyErrorstruct {
204+
exitErr*exec.ExitError
205+
stderrstring
206+
}
207+
208+
func (e*textFileBusyError)Error()string {
209+
return"text file busy: "+e.exitErr.String()
210+
}
211+
203212
func (e*executor)init(ctx,killCtx context.Context,logrlogSink)error {
204213
ctx,span:=e.server.startTrace(ctx,tracing.FuncName())
205214
deferspan.End()
@@ -216,13 +225,24 @@ func (e *executor) init(ctx, killCtx context.Context, logr logSink) error {
216225
<-doneErr
217226
}()
218227

228+
// As a special case, we want to look for the error "text file busy" in the stderr output of
229+
// the init command, so we also take a copy of the stderr into an in memory buffer.
230+
errBuf:=newBufferedWriteCloser(errWriter)
231+
219232
args:= []string{
220233
"init",
221234
"-no-color",
222235
"-input=false",
223236
}
224237

225-
returne.execWriteOutput(ctx,killCtx,args,e.basicEnv(),outWriter,errWriter)
238+
err:=e.execWriteOutput(ctx,killCtx,args,e.basicEnv(),outWriter,errBuf)
239+
varexitErr*exec.ExitError
240+
ifxerrors.As(err,&exitErr) {
241+
ifbytes.Contains(errBuf.b.Bytes(), []byte("text file busy")) {
242+
return&textFileBusyError{exitErr:exitErr,stderr:errBuf.b.String()}
243+
}
244+
}
245+
returnerr
226246
}
227247

228248
funcgetPlanFilePath(workdirstring)string {
@@ -707,3 +727,26 @@ func (sw syncWriter) Write(p []byte) (n int, err error) {
707727
defersw.mut.Unlock()
708728
returnsw.w.Write(p)
709729
}
730+
731+
typebufferedWriteCloserstruct {
732+
wc io.WriteCloser
733+
b bytes.Buffer
734+
}
735+
736+
funcnewBufferedWriteCloser(wc io.WriteCloser)*bufferedWriteCloser {
737+
return&bufferedWriteCloser{
738+
wc:wc,
739+
}
740+
}
741+
742+
func (b*bufferedWriteCloser)Write(p []byte) (int,error) {
743+
n,err:=b.b.Write(p)
744+
iferr!=nil {
745+
returnn,err
746+
}
747+
returnb.wc.Write(p)
748+
}
749+
750+
func (b*bufferedWriteCloser)Close()error {
751+
returnb.wc.Close()
752+
}

‎provisioner/terraform/provision.go

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,11 @@ import (
44
"context"
55
"encoding/json"
66
"fmt"
7+
"io"
8+
"net"
9+
"net/http"
710
"os"
11+
"path/filepath"
812
"strings"
913
"time"
1014

@@ -109,10 +113,32 @@ func (s *server) Plan(
109113
initTimings.ingest(createInitTimingsEvent(timingInitStart))
110114

111115
err=e.init(ctx,killCtx,sess)
116+
112117
iferr!=nil {
113118
initTimings.ingest(createInitTimingsEvent(timingInitErrored))
114119

115120
s.logger.Debug(ctx,"init failed",slog.Error(err))
121+
122+
// Special handling for "text file busy" c.f. https://github.com/coder/coder/issues/14726
123+
// We believe this might be due to some race condition that prevents the
124+
// terraform-provider-coder process from exiting. When terraform tries to install the
125+
// provider during this init, it copies over the local cache. Normally this isn't an issue,
126+
// but if the terraform-provider-coder process is still running from a previous build, Linux
127+
// returns "text file busy" error when attempting to open the file.
128+
//
129+
// Capturing the stack trace from the process should help us figure out why it has not
130+
// exited. We'll drop these diagnostics in a CRITICAL log so that operators are likely to
131+
// notice, and also because it indicates this provisioner could be permanently broken and
132+
// require a restart.
133+
varerrTFB*textFileBusyError
134+
ifxerrors.As(err,&errTFB) {
135+
stacktrace:=tryGettingCoderProviderStacktrace(sess)
136+
s.logger.Critical(ctx,"init: text file busy",
137+
slog.Error(errTFB),
138+
slog.F("stderr",errTFB.stderr),
139+
slog.F("provider_coder_stacktrace",stacktrace),
140+
)
141+
}
116142
returnprovisionersdk.PlanErrorf("initialize terraform: %s",err)
117143
}
118144

@@ -280,3 +306,39 @@ func logTerraformEnvVars(sink logSink) {
280306
}
281307
}
282308
}
309+
310+
// tryGettingCoderProviderStacktrace attempts to dial a special pprof endpoint we added to
311+
// terraform-provider-coder in https://github.com/coder/terraform-provider-coder/pull/295 which
312+
// shipped in v1.0.4. It will return the stacktraces of the provider, which will hopefully allow us
313+
// to figure out why it hasn't exited.
314+
functryGettingCoderProviderStacktrace(sess*provisionersdk.Session)string {
315+
path:=filepath.Clean(filepath.Join(sess.WorkDirectory,"../.coder/pprof"))
316+
sess.Logger.Info(sess.Context(),"attempting to get stack traces",slog.F("path",path))
317+
c:= http.Client{
318+
Transport:&http.Transport{
319+
DialContext:func(ctx context.Context,_,_string) (net.Conn,error) {
320+
d:= net.Dialer{}
321+
returnd.DialContext(ctx,"unix",path)
322+
},
323+
},
324+
}
325+
req,err:=http.NewRequestWithContext(sess.Context(),http.MethodGet,
326+
"http://localhost/debug/pprof/goroutine?debug=2",nil)
327+
iferr!=nil {
328+
sess.Logger.Error(sess.Context(),"error creating GET request",slog.Error(err))
329+
return""
330+
}
331+
resp,err:=c.Do(req)
332+
iferr!=nil {
333+
// Only log at Info here, since we only added the pprof endpoint to terraform-provider-coder
334+
// in v1.0.4
335+
sess.Logger.Info(sess.Context(),"could not GET stack traces",slog.Error(err))
336+
return""
337+
}
338+
deferresp.Body.Close()
339+
stacktraces,err:=io.ReadAll(resp.Body)
340+
iferr!=nil {
341+
sess.Logger.Error(sess.Context(),"could not read stack traces",slog.Error(err))
342+
}
343+
returnstring(stacktraces)
344+
}

‎provisioner/terraform/provision_test.go

Lines changed: 89 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ import (
99
"encoding/json"
1010
"errors"
1111
"fmt"
12+
"net"
13+
"net/http"
1214
"os"
1315
"path/filepath"
1416
"runtime"
@@ -31,14 +33,22 @@ import (
3133
typeprovisionerServeOptionsstruct {
3234
binaryPathstring
3335
exitTimeout time.Duration
36+
workDirstring
37+
logger*slog.Logger
3438
}
3539

3640
funcsetupProvisioner(t*testing.T,opts*provisionerServeOptions) (context.Context, proto.DRPCProvisionerClient) {
3741
ifopts==nil {
3842
opts=&provisionerServeOptions{}
3943
}
4044
cachePath:=t.TempDir()
41-
workDir:=t.TempDir()
45+
ifopts.workDir=="" {
46+
opts.workDir=t.TempDir()
47+
}
48+
ifopts.logger==nil {
49+
logger:=slogtest.Make(t,nil).Leveled(slog.LevelDebug)
50+
opts.logger=&logger
51+
}
4252
client,server:=drpc.MemTransportPipe()
4353
ctx,cancelFunc:=context.WithCancel(context.Background())
4454
serverErr:=make(chanerror,1)
@@ -55,8 +65,8 @@ func setupProvisioner(t *testing.T, opts *provisionerServeOptions) (context.Cont
5565
serverErr<-terraform.Serve(ctx,&terraform.ServeOptions{
5666
ServeOptions:&provisionersdk.ServeOptions{
5767
Listener:server,
58-
Logger:slogtest.Make(t,nil).Leveled(slog.LevelDebug),
59-
WorkDirectory:workDir,
68+
Logger:*opts.logger,
69+
WorkDirectory:opts.workDir,
6070
},
6171
BinaryPath:opts.binaryPath,
6272
CachePath:cachePath,
@@ -236,7 +246,7 @@ func TestProvision_CancelTimeout(t *testing.T) {
236246
dir:=t.TempDir()
237247
binPath:=filepath.Join(dir,"terraform")
238248

239-
// Example: exec /path/to/terrafork_fake_cancel.sh 1.2.1 apply "$@"
249+
// Example: exec /path/to/terraform_fake_cancel.sh 1.2.1 apply "$@"
240250
content:=fmt.Sprintf("#!/bin/sh\nexec %q %s\"$@\"\n",fakeBin,terraform.TerraformVersion.String())
241251
err=os.WriteFile(binPath, []byte(content),0o755)//#nosec
242252
require.NoError(t,err)
@@ -282,6 +292,81 @@ func TestProvision_CancelTimeout(t *testing.T) {
282292
}
283293
}
284294

295+
// below we exec fake_text_file_busy.sh, which causes the kernel to execute it, and if more than
296+
// one process tries to do this, it can cause "text file busy" to be returned to us. In this test
297+
// we want to simulate "text file busy" getting logged by terraform, due to an issue with the
298+
// terraform-provider-coder
299+
// nolint: paralleltest
300+
funcTestProvision_TextFileBusy(t*testing.T) {
301+
ifruntime.GOOS=="windows" {
302+
t.Skip("This test uses unix sockets and is not supported on Windows")
303+
}
304+
305+
cwd,err:=os.Getwd()
306+
require.NoError(t,err)
307+
fakeBin:=filepath.Join(cwd,"testdata","fake_text_file_busy.sh")
308+
309+
dir:=t.TempDir()
310+
binPath:=filepath.Join(dir,"terraform")
311+
312+
// Example: exec /path/to/terraform_fake_cancel.sh 1.2.1 apply "$@"
313+
content:=fmt.Sprintf("#!/bin/sh\nexec %q %s\"$@\"\n",fakeBin,terraform.TerraformVersion.String())
314+
err=os.WriteFile(binPath, []byte(content),0o755)//#nosec
315+
require.NoError(t,err)
316+
317+
workDir:=t.TempDir()
318+
319+
err=os.Mkdir(filepath.Join(workDir,".coder"),0o700)
320+
require.NoError(t,err)
321+
l,err:=net.Listen("unix",filepath.Join(workDir,".coder","pprof"))
322+
require.NoError(t,err)
323+
deferl.Close()
324+
handlerCalled:=0
325+
// nolint: gosec
326+
srv:=&http.Server{
327+
Handler:http.HandlerFunc(func(w http.ResponseWriter,r*http.Request) {
328+
assert.Equal(t,"/debug/pprof/goroutine",r.URL.Path)
329+
w.WriteHeader(http.StatusOK)
330+
_,err:=w.Write([]byte("thestacks\n"))
331+
assert.NoError(t,err)
332+
handlerCalled++
333+
}),
334+
}
335+
srvErr:=make(chanerror,1)
336+
gofunc() {
337+
srvErr<-srv.Serve(l)
338+
}()
339+
340+
logger:=slogtest.Make(t,&slogtest.Options{IgnoreErrors:true})
341+
ctx,api:=setupProvisioner(t,&provisionerServeOptions{
342+
binaryPath:binPath,
343+
exitTimeout:time.Second,
344+
workDir:workDir,
345+
logger:&logger,
346+
})
347+
348+
sess:=configure(ctx,t,api,&proto.Config{
349+
TemplateSourceArchive:makeTar(t,nil),
350+
})
351+
352+
err=sendPlan(sess,proto.WorkspaceTransition_START)
353+
require.NoError(t,err)
354+
355+
found:=false
356+
for {
357+
msg,err:=sess.Recv()
358+
require.NoError(t,err)
359+
360+
ifc:=msg.GetPlan();c!=nil {
361+
require.Contains(t,c.Error,"exit status 1")
362+
found=true
363+
break
364+
}
365+
}
366+
require.True(t,found)
367+
require.EqualValues(t,1,handlerCalled)
368+
}
369+
285370
funcTestProvision(t*testing.T) {
286371
t.Parallel()
287372

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#!/bin/sh
2+
3+
VERSION=$1
4+
shift 1
5+
6+
json_print() {
7+
echo"{\"@level\":\"error\",\"@message\":\"$*\"}"
8+
}
9+
10+
case"$1"in
11+
version)
12+
cat<<-EOF
13+
{
14+
"terraform_version": "${VERSION}",
15+
"platform": "linux_amd64",
16+
"provider_selections": {},
17+
"terraform_outdated": false
18+
}
19+
EOF
20+
exit 0
21+
;;
22+
init)
23+
echo"init"
24+
echo>&2"Error: Failed to install provider"
25+
echo>&2" Error while installing coder/coder v1.0.4: open"
26+
echo>&2" /home/coder/.cache/coder/provisioner-0/tf/registry.terraform.io/coder/coder/1.0.3/linux_amd64/terraform-provider-coder_v1.0.4:"
27+
echo>&2" text file busy"
28+
exit 1
29+
;;
30+
plan)
31+
echo"plan not supported"
32+
exit 1
33+
;;
34+
apply)
35+
echo"apply not supported"
36+
exit 1
37+
;;
38+
esac
39+
40+
exit 10

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp