name: Merge Queue Checks
run-name: Merge Queue Checks for ${{ github.ref }}

# This workflow is called from 'general.yml' (so that it can depend on artifacts from other jobs).
# It is *not* invoked directly via a 'merge_group' event, so checking github.event_name == 'merge_group' will not work.
on:
  workflow_dispatch:
  workflow_call:
  schedule:
    - cron: "0 0 * * *" # Runs at 00:00 UTC every day

# When triggered from the merge queue, cancel any existing workflow runs for the same PR branch.
# Otherwise, use the unique run id for the concurrency group, to prevent anything from getting cancelled.
# Note that the event will be 'merge_group' when general.yml calls this workflow via a 'workflow_call' event.
concurrency:
  group: ${{ github.event_name == 'merge_group' && format('merge-queue-{0}-{1}', github.workflow, github.ref) || github.run_id }}
  cancel-in-progress: true
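
# Credentials and endpoints for the model providers and services exercised by the test jobs below.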
env:
  ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
  AWS_REGION: "us-east-1"
  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
  AZURE_AI_FOUNDRY_API_KEY: ${{ secrets.AZURE_AI_FOUNDRY_API_KEY }}
  AZURE_OPENAI_API_BASE: ${{ secrets.AZURE_OPENAI_API_BASE }}
  AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
  AZURE_OPENAI_EASTUS2_API_KEY: ${{ secrets.AZURE_OPENAI_EASTUS2_API_KEY }}
  AZURE_OPENAI_DEPLOYMENT_ID: ${{ secrets.AZURE_OPENAI_DEPLOYMENT_ID }}
  DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
  FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
  FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
  FORCE_COLOR: 1
  GCP_STORAGE_ACCESS_KEY_ID: ${{ secrets.GCP_STORAGE_ACCESS_KEY_ID }}
  GCP_STORAGE_SECRET_ACCESS_KEY: ${{ secrets.GCP_STORAGE_SECRET_ACCESS_KEY }}
  GCP_VERTEX_CREDENTIALS_PATH: ${{ github.workspace }}/gcp_jwt_key.json
  GOOGLE_AI_STUDIO_API_KEY: ${{ secrets.GOOGLE_AI_STUDIO_API_KEY }}
  GOOGLE_APPLICATION_CREDENTIALS: ${{ github.workspace }}/gcp_jwt_key.json
  GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
  HYPERBOLIC_API_KEY: ${{ secrets.HYPERBOLIC_API_KEY }}
  MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
  MODAL_KEY: ${{ secrets.MODAL_KEY }}
  MODAL_SECRET: ${{ secrets.MODAL_SECRET }}
  OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
  OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
  R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
  R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
  SGLANG_API_KEY: ${{ secrets.SGLANG_API_KEY }}
  TGI_API_KEY: ${{ secrets.TGI_API_KEY }}
  TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
  VLLM_API_BASE: ${{ secrets.VLLM_API_BASE }}
  VLLM_API_KEY: ${{ secrets.VLLM_API_KEY }}
  VLLM_MODEL_NAME: "microsoft/Phi-3.5-mini-instruct"
  VOYAGE_API_KEY: ${{ secrets.VOYAGE_API_KEY }}
  XAI_API_KEY: ${{ secrets.XAI_API_KEY }}
  OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: https://localhost:4316
  SQLX_OFFLINE: 1
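  # Presumably points at the local caching provider-proxy launched in the jobs below (./ci/run-provider-proxy.sh).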
  TENSORZERO_E2E_PROXY: http://localhost:3003
  TENSORZERO_COMMIT_TAG: sha-${{ github.sha }}
  TENSORZERO_CI: 1

jobs:
  live-tests:
    name: "live-tests (batch_writes: ${{ matrix.batch_writes }})"
    runs-on: ubuntu-latest
    if: github.repository == 'tensorzero/tensorzero'
    permissions:
      # Permission to check out the repository
      contents: read
      # Permission to fetch a GitHub OIDC token for authentication
      id-token: write
    timeout-minutes: 45
    strategy:
      matrix:
        batch_writes: [true, false]
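        # Each matrix leg presumably toggles the ClickHouse batch-write path (the client-tests
        # job below maps the same matrix value onto TENSORZERO_CLICKHOUSE_BATCH_WRITES).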
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
      - name: Install Namespace CLI
        uses: namespacelabs/nscloud-setup@d1c625762f7c926a54bd39252efff0705fd11c64
      - name: Configure Namespace-powered Buildx
        uses: namespacelabs/nscloud-setup-buildx-action@91c2e6537780e3b092cb8476406be99a8f91bd5e
        with:
          wait-for-builder: true
      - name: Cleanup disk space
        run: ./ci/free-disk-space.sh
      - name: Download provider-proxy cache
        # When running as a cron job, don't use the provider-proxy cache.
        # The cron job is used to gather information about provider flakiness.
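        # Note: the download script presumably expects standard AWS-style credentials, so the
        # R2 keys are mapped onto AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY inline below.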
        if: github.event_name != 'schedule'
        run: |
          AWS_ACCESS_KEY_ID=$R2_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY=$R2_SECRET_ACCESS_KEY PROVIDER_PROXY_CACHE_BUCKET=provider-proxy-cache ./ci/download-provider-proxy-cache.sh
      - name: Login to DockerHub
        uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
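      # GOOGLE_APPLICATION_CREDENTIALS and GCP_VERTEX_CREDENTIALS_PATH (set in 'env' above)
      # both point at the gcp_jwt_key.json file written here.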
      - name: Write GCP JWT key to file
        env:
          GCP_JWT_KEY: ${{ secrets.GCP_JWT_KEY }}
        run: echo "$GCP_JWT_KEY" > $GITHUB_WORKSPACE/gcp_jwt_key.json
      - name: Pull images referenced by the compose file
        run: docker compose -f tensorzero-core/tests/e2e/docker-compose.live.yml pull --ignore-pull-failures
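      # Passing the host UID/GID into the compose run presumably keeps files written to bind
      # mounts owned by the runner user rather than root.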
      - name: Run live tests container via Docker Compose
        run: |
          DOCKER_UID=$(id -u) DOCKER_GID=$(id -g) docker compose -f tensorzero-core/tests/e2e/docker-compose.live.yml run --rm -e TENSORZERO_CI=1 -e TENSORZERO_FF_WRITE_CONFIG_SNAPSHOT=1 live-tests
      - name: Print live tests logs
        if: always()
        run: docker compose -f tensorzero-core/tests/e2e/docker-compose.live.yml logs -t
      # TODO(https://github.com/tensorzero/tensorzero/issues/3989) - re-enable this check and
      # move it back to the end of the job. For now, we only check for deprecation warnings
      # after running the Rust e2e tests.
      # - name: Check e2e logs for deprecation warnings (gateway e2e tests only)
      #   run: |
      #     LOGS=$(docker compose -f tensorzero-core/tests/e2e/docker-compose.live.yml logs gateway)
      #     if [ -z "$LOGS" ]; then
      #       echo "ERROR: Gateway logs are empty"
      #       exit 1
      #     fi
      #     # Note: grep reads the logs via a herestring; passing "$LOGS" as an argument would treat it as a filename
      #     ! grep -i "Deprecation Warning" <<< "$LOGS"
      - name: Upload provider-proxy cache
        # Only upload the cache when we're running from a 'good' run
        # (running from the merge queue via `general.yml` or a cron job).
        # This prevents manual workflow runs from modifying the cache.
        if: github.event_name == 'merge_group' || github.event_name == 'schedule'
        run: |
          AWS_ACCESS_KEY_ID=$R2_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY=$R2_SECRET_ACCESS_KEY PROVIDER_PROXY_CACHE_BUCKET=provider-proxy-cache ./ci/upload-provider-proxy-cache.sh
  client-tests:
    name: "client-tests (batch_writes: ${{ matrix.batch_writes }})"
    runs-on: ubuntu-latest
    if: github.repository == 'tensorzero/tensorzero'
    permissions:
      # Permission to check out the repository
      contents: read
      # Permission to fetch a GitHub OIDC token for authentication
      id-token: write
    timeout-minutes: 45
    strategy:
      matrix:
        batch_writes: [true, false]
      # Don't fail fast for manual/cron runs, so that we get the full picture of what broke
      fail-fast: ${{ github.event_name == 'merge_group' }}
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
      - name: Install gdb
        run: sudo apt-get update && sudo apt-get install -y gdb
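      # The curls below run in the background; hitting each Modal endpoint presumably triggers
      # a cold start so the GPU-backed vLLM/SGLang services are warm before the tests reach them.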
      - name: Warm up Modal instances
        run: |
          curl -H "Modal-Key: $MODAL_KEY" -H "Modal-Secret: $MODAL_SECRET" https://tensorzero--vllm-inference-vllm-inference.modal.run/docs > vllm_modal_logs.txt &
          curl -H "Modal-Key: $MODAL_KEY" -H "Modal-Secret: $MODAL_SECRET" https://tensorzero--sglang-0-4-10-inference-sglang-inference.modal.run/ > sglang_modal_logs.txt &
          # TODO: Re-enable once we can switch to a T4 GPU
          # curl -H "Modal-Key: $MODAL_KEY" -H "Modal-Secret: $MODAL_SECRET" https://tensorzero--vllm-gpt-oss-20b-serve.modal.run/ > vllm_gpt_oss_modal_logs.txt &
      - name: Cleanup disk space
        run: ./ci/free-disk-space.sh
      - name: Download client-tests provider-proxy cache
        # When running as a cron job, don't use the provider-proxy cache.
        # The cron job is used to gather information about provider flakiness.
        if: github.event_name != 'schedule'
        run: |
          AWS_ACCESS_KEY_ID=$R2_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY=$R2_SECRET_ACCESS_KEY PROVIDER_PROXY_CACHE_BUCKET=provider-proxy-cache-client-tests ./ci/download-provider-proxy-cache.sh
      - name: Update Rust
        run: |
          for attempt in 1 2 3; do
            if rustup update stable && rustup default stable; then
              break
            fi
            if [ $attempt -eq 3 ]; then
              echo "Failed to update Rust after 3 attempts"
              exit 1
            fi
            sleep $((10 * attempt))
          done
        shell: bash
      - name: Install Namespace CLI
        uses: namespacelabs/nscloud-setup@d1c625762f7c926a54bd39252efff0705fd11c64
      - name: Configure Namespace-powered Buildx
        uses: namespacelabs/nscloud-setup-buildx-action@91c2e6537780e3b092cb8476406be99a8f91bd5e
        with:
          wait-for-builder: true
      - name: Install Rust toolchain
        run: |
          for attempt in 1 2 3; do
            if rustup toolchain install stable && rustup default stable; then
              break
            fi
            if [ $attempt -eq 3 ]; then
              echo "Failed to install Rust toolchain after 3 attempts"
              exit 1
            fi
            sleep $((10 * attempt))
          done
        shell: bash
      # Start testing workload identity federation credentials once the SDK adds support: https://github.com/googleapis/google-cloud-rust/issues/1342
      # - uses: 'google-github-actions/auth@v2'
      #   with:
      #     project_id: 'tensozero-public'
      #     workload_identity_provider: 'projects/454541351720/locations/global/workloadIdentityPools/github/providers/tensorzero'
      - name: Print Rust version
        run: rustc --version
      - name: Install uv
        uses: astral-sh/setup-uv@ed21f2f24f8dd64503750218de024bcf64c7250a
        with:
          version: "0.6.17"
      - name: Install pnpm
        run: |
          for attempt in 1 2 3; do
            if npm install -g pnpm@latest; then
              break
            fi
            if [ $attempt -eq 3 ]; then
              echo "Failed to install pnpm after 3 attempts"
              exit 1
            fi
            sleep $((10 * attempt))
          done
        shell: bash
      - name: Install JS dependencies
        run: pnpm install --frozen-lockfile
      - name: Login to DockerHub
        uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Install cargo-nextest
        uses: taiki-e/install-action@d12e869b89167df346dd0ff65da342d1fb1202fb
        with:
          tool: cargo-nextest
      - name: Write GCP JWT key to file
        env:
          GCP_JWT_KEY: ${{ secrets.GCP_JWT_KEY }}
        run: echo "$GCP_JWT_KEY" > $GITHUB_WORKSPACE/gcp_jwt_key.json
      - name: Set up database URLs for E2E tests
        run: |
          echo "TENSORZERO_CLICKHOUSE_URL=http://chuser:chpassword@localhost:8123/tensorzero_e2e_tests" >> $GITHUB_ENV
          echo "TENSORZERO_CLICKHOUSE_BATCH_WRITES=${{ matrix.batch_writes }}" >> $GITHUB_ENV
          echo "DATABASE_URL=postgresql://postgres:postgres@localhost:5432/tensorzero-e2e-tests" >> $GITHUB_ENV
          echo "TENSORZERO_POSTGRES_URL=postgresql://postgres:postgres@localhost:5432/tensorzero-e2e-tests" >> $GITHUB_ENV
          echo "TENSORZERO_SKIP_LARGE_FIXTURES=1" >> $GITHUB_ENV
      - name: Configure batch writes in tensorzero.toml
        if: matrix.batch_writes == true
        run: |
          echo "[gateway.observability.batch_writes]" >> tensorzero-core/tests/e2e/config/tensorzero.misc.toml
          echo "enabled = true" >> tensorzero-core/tests/e2e/config/tensorzero.misc.toml
          echo "flush_interval_ms = 80" >> tensorzero-core/tests/e2e/config/tensorzero.misc.toml
          echo "__force_allow_embedded_batch_writes = true" >> tensorzero-core/tests/e2e/config/tensorzero.misc.toml
      - name: Launch dependency services for E2E tests
        run: |
          docker compose -f tensorzero-core/tests/e2e/docker-compose.yml up --build -d --wait
      - name: Print ClickHouse container logs
        if: always()
        run: |
          docker compose -f tensorzero-core/tests/e2e/docker-compose.yml logs -t
      - name: Launch the provider-proxy cache for E2E tests
        run: |
          ./ci/run-provider-proxy.sh ci
      # TODO - get rid of this when the merge queue has a freshly-built gateway image available
      - name: Manually run the latest Postgres migrations
        run: cargo run-e2e --run-postgres-migrations
      - name: Launch the gateway for E2E tests
        timeout-minutes: 2
        run: |
          cargo run-e2e > e2e_logs.txt 2>&1 &
          # Capture the PID immediately, before any later background job could overwrite $!
          echo "GATEWAY_PID=$!" >> $GITHUB_ENV
          while ! curl -s -f http://localhost:3000/health >/dev/null 2>&1; do
            echo "Waiting for gateway to be healthy..."
            sleep 1
          done
      - name: Install Python for python async client tests
        run: uv python install 3.9
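      # Hang watchdog: run the test suite in the background, poll it for up to 10 minutes,
      # and on a timeout dump gdb backtraces of any matching Python processes before failing the job.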
| -name:"Python: PyO3 Client: pytest" | |
| working-directory:clients/python | |
| run:| | |
| # Start the test in background and capture its PID | |
| bash ./test.sh --verbose -n 8 & | |
| TEST_PID=$! | |
| echo "Started test.sh with PID: $TEST_PID" | |
| # Wait for 10 minutes (600 seconds) | |
| for i in {1..600}; do | |
| if ! kill -0 $TEST_PID 2>/dev/null; then | |
| echo "Test completed normally" | |
| wait $TEST_PID | |
| exit $? | |
| fi | |
| sleep 1 | |
| done | |
| echo "Test has been running for 10 minutes, capturing backtraces..." | |
| # Get all processes related to our test | |
| echo "=== Process tree ===" | |
| ps -ef | grep -E "(test\.sh|pytest|python)" | grep -v grep ||true | |
| echo "=== Capturing backtraces with gdb ===" | |
| # Find all python processes that might be related to our test | |
| PYTHON_PIDS=$(pgrep -f "tensorzero.*python" || true) | |
| if [ -n "$PYTHON_PIDS" ]; then | |
| for pid in $PYTHON_PIDS; do | |
| echo "--- Backtrace for Python process $pid ---" | |
| gdb -p $pid --batch \ | |
| -ex "set pagination off" \ | |
| -ex "thread apply all bt" \ | |
| -ex "info threads" \ | |
| -ex "detach" \ | |
| -ex "quit" 2>&1 ||true | |
| echo "" | |
| done | |
| else | |
| echo "No Python processes found" | |
| fi | |
| exit 1 | |
| -name:"Node.js: OpenAI Client: test" | |
| working-directory:clients/openai-node | |
| run:| | |
| pnpm run test | |
| -name:Install Go | |
| uses:actions/setup-go@29694d72cd5e7ef3b09496b39f28a942af47737e | |
| with: | |
| go-version:"1.24" | |
| -name:"Go: OpenAI Client: test" | |
| working-directory:clients/openai-go/tests | |
| run:go test -v | |
| -name:"Python: Recipes: pytest" | |
| working-directory:recipes | |
| run:| | |
| uv run pytest | |
      - name: Terminate the gateway and wait for it to exit
        if: always()
        run: |
          echo "Killing gateway with pid $GATEWAY_PID"
          kill $GATEWAY_PID
          # Wait for at most 30 seconds for the gateway to exit
          for i in {1..30}; do
            if ! kill -0 $GATEWAY_PID 2>/dev/null; then
              echo "Gateway exited"
              break
            fi
            sleep 1
          done
          if kill -0 $GATEWAY_PID 2>/dev/null; then
            echo "Gateway did not exit after 30 seconds!"
            exit 1
          fi
      - name: Print e2e logs
        if: always()
        run: cat e2e_logs.txt
      - name: Print provider-proxy logs
        if: always()
        run: cat provider_proxy_logs.txt
      - name: Print vLLM modal logs
        if: always()
        run: cat vllm_modal_logs.txt
      - name: Print SGLang modal logs
        if: always()
        run: cat sglang_modal_logs.txt
      - name: Print vLLM GPT-OSS modal logs
        if: always()
        continue-on-error: true
        run: cat vllm_gpt_oss_modal_logs.txt
      - name: Upload client-tests provider-proxy cache
        # Only upload the cache when we're running from a 'good' run
        # (running from the merge queue via 'workflow_call' from general.yml, or a cron job).
        # This prevents manual workflow runs from modifying the cache.
        if: github.event_name == 'merge_group' || github.event_name == 'schedule'
        run: |
          AWS_ACCESS_KEY_ID=$R2_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY=$R2_SECRET_ACCESS_KEY PROVIDER_PROXY_CACHE_BUCKET=provider-proxy-cache-client-tests ./ci/upload-provider-proxy-cache.sh
  # Test that the ui e2e tests still pass after we regenerate the model inference cache
  ui-tests-e2e-regen-model-inference-cache:
    permissions:
      contents: read
      actions: write
    if: github.repository == 'tensorzero/tensorzero'
    uses: ./.github/workflows/ui-tests-e2e-model-inference-cache.yml
    with:
      regen_cache: true
      is_merge_group: true
      force_no_auth: true
    secrets:
      S3_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      S3_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
      FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
  # See 'ci/README.md' at the repository root for more details.
  check-all-tests-passed:
    permissions: {}
    if: always() && github.repository == 'tensorzero/tensorzero'
    needs: [ui-tests-e2e-regen-model-inference-cache, client-tests, live-tests]
    runs-on: ubuntu-latest
    steps:
      # When running in the merge queue, jobs should never be skipped.
      # In a scheduled run, some jobs may be intentionally skipped, as we only care about regenerating the model inference cache.
      - if: ${{ contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') || (github.event_name != 'pull_request' && contains(needs.*.result, 'skipped')) }}
        run: exit 1