name: Merge Queue Checks
run-name: Merge Queue Checks for ${{ github.ref }}

# This workflow is called from 'general.yml' (so that it can depend on artifacts from other jobs).
# It is *not* invoked directly via a 'merge_group' event, so checking github.event_name == 'merge_group' will not work.
on:
  workflow_dispatch:
  workflow_call:
  schedule:
    - cron: "0 0 * * *" # Runs at 00:00 UTC every day

# When triggered from the merge queue, cancel any existing workflow runs for the same PR branch.
# Otherwise, use the unique run id for the concurrency group, to prevent anything from getting cancelled.
# Note that the event will be 'merge_group' when general.yml calls this workflow via a 'workflow_call' event.
concurrency:
  group: ${{ github.event_name == 'merge_group' && format('merge-queue-{0}-{1}', github.workflow, github.ref) || github.run_id }}
  cancel-in-progress: true
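
# Credentials and endpoints for the model providers and services exercised by the test jobs below.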
env:
  ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
  AWS_REGION: "us-east-1"
  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
  AZURE_AI_FOUNDRY_API_KEY: ${{ secrets.AZURE_AI_FOUNDRY_API_KEY }}
  AZURE_OPENAI_API_BASE: ${{ secrets.AZURE_OPENAI_API_BASE }}
  AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
  AZURE_OPENAI_EASTUS2_API_KEY: ${{ secrets.AZURE_OPENAI_EASTUS2_API_KEY }}
  AZURE_OPENAI_DEPLOYMENT_ID: ${{ secrets.AZURE_OPENAI_DEPLOYMENT_ID }}
  DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
  FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
  FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
  FORCE_COLOR: 1
  GCP_STORAGE_ACCESS_KEY_ID: ${{ secrets.GCP_STORAGE_ACCESS_KEY_ID }}
  GCP_STORAGE_SECRET_ACCESS_KEY: ${{ secrets.GCP_STORAGE_SECRET_ACCESS_KEY }}
  GCP_VERTEX_CREDENTIALS_PATH: ${{ github.workspace }}/gcp_jwt_key.json
  GOOGLE_AI_STUDIO_API_KEY: ${{ secrets.GOOGLE_AI_STUDIO_API_KEY }}
  GOOGLE_APPLICATION_CREDENTIALS: ${{ github.workspace }}/gcp_jwt_key.json
  GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
  HYPERBOLIC_API_KEY: ${{ secrets.HYPERBOLIC_API_KEY }}
  MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
  MODAL_KEY: ${{ secrets.MODAL_KEY }}
  MODAL_SECRET: ${{ secrets.MODAL_SECRET }}
  OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
  OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
  R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
  R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
  SGLANG_API_KEY: ${{ secrets.SGLANG_API_KEY }}
  TGI_API_KEY: ${{ secrets.TGI_API_KEY }}
  TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
  VLLM_API_BASE: ${{ secrets.VLLM_API_BASE }}
  VLLM_API_KEY: ${{ secrets.VLLM_API_KEY }}
  VLLM_MODEL_NAME: "microsoft/Phi-3.5-mini-instruct"
  VOYAGE_API_KEY: ${{ secrets.VOYAGE_API_KEY }}
  XAI_API_KEY: ${{ secrets.XAI_API_KEY }}
  OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: https://localhost:4316
  SQLX_OFFLINE: 1
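  # Presumably points at the local caching provider-proxy launched in the jobs below (./ci/run-provider-proxy.sh).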
  TENSORZERO_E2E_PROXY: http://localhost:3003
  TENSORZERO_COMMIT_TAG: sha-${{ github.sha }}
  TENSORZERO_CI: 1

jobs:
  live-tests:
    name: "live-tests (batch_writes: ${{ matrix.batch_writes }})"
    runs-on: ubuntu-latest
    if: github.repository == 'tensorzero/tensorzero'
    permissions:
      # Permission to check out the repository
      contents: read
      # Permission to fetch a GitHub OIDC token for authentication
      id-token: write
    timeout-minutes: 45
    strategy:
      matrix:
        batch_writes: [true, false]
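        # Each matrix leg presumably toggles the ClickHouse batch-write path (the client-tests
        # job below maps the same matrix value onto TENSORZERO_CLICKHOUSE_BATCH_WRITES).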
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
      - name: Install Namespace CLI
        uses: namespacelabs/nscloud-setup@d1c625762f7c926a54bd39252efff0705fd11c64
      - name: Configure Namespace-powered Buildx
        uses: namespacelabs/nscloud-setup-buildx-action@91c2e6537780e3b092cb8476406be99a8f91bd5e
        with:
          wait-for-builder: true
      - name: Cleanup disk space
        run: ./ci/free-disk-space.sh
      - name: Download provider-proxy cache
        # When running as a cron job, don't use the provider-proxy cache.
        # The cron job is used to gather information about provider flakiness.
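        # Note: the download script presumably expects standard AWS-style credentials, so the
        # R2 keys are mapped onto AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY inline below.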
        if: github.event_name != 'schedule'
        run: |
          AWS_ACCESS_KEY_ID=$R2_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY=$R2_SECRET_ACCESS_KEY PROVIDER_PROXY_CACHE_BUCKET=provider-proxy-cache ./ci/download-provider-proxy-cache.sh
      - name: Login to DockerHub
        uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
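      # GOOGLE_APPLICATION_CREDENTIALS and GCP_VERTEX_CREDENTIALS_PATH (set in 'env' above)
      # both point at the gcp_jwt_key.json file written here.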
      - name: Write GCP JWT key to file
        env:
          GCP_JWT_KEY: ${{ secrets.GCP_JWT_KEY }}
        run: echo "$GCP_JWT_KEY" > $GITHUB_WORKSPACE/gcp_jwt_key.json
      - name: Pull images referenced by the compose file
        run: docker compose -f tensorzero-core/tests/e2e/docker-compose.live.yml pull --ignore-pull-failures
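      # Passing the host UID/GID into the compose run presumably keeps files written to bind
      # mounts owned by the runner user rather than root.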
      - name: Run live tests container via Docker Compose
        run: |
          DOCKER_UID=$(id -u) DOCKER_GID=$(id -g) docker compose -f tensorzero-core/tests/e2e/docker-compose.live.yml run --rm -e TENSORZERO_CI=1 -e TENSORZERO_FF_WRITE_CONFIG_SNAPSHOT=1 live-tests
      - name: Print live tests logs
        if: always()
        run: docker compose -f tensorzero-core/tests/e2e/docker-compose.live.yml logs -t
      # TODO(https://github.com/tensorzero/tensorzero/issues/3989) - re-enable this check and
      # move it back to the end of the job. For now, we only check for deprecation warnings
      # after running the Rust e2e tests.
      # - name: Check e2e logs for deprecation warnings (gateway e2e tests only)
      #   run: |
      #     LOGS=$(docker compose -f tensorzero-core/tests/e2e/docker-compose.live.yml logs gateway)
      #     if [ -z "$LOGS" ]; then
      #       echo "ERROR: Gateway logs are empty"
      #       exit 1
      #     fi
      #     # Note: grep reads the logs via a herestring; passing "$LOGS" as an argument would treat it as a filename
      #     ! grep -i "Deprecation Warning" <<< "$LOGS"
      - name: Upload provider-proxy cache
        # Only upload the cache when we're running from a 'good' run
        # (running from the merge queue via `general.yml` or a cron job).
        # This prevents manual workflow runs from modifying the cache.
        if: github.event_name == 'merge_group' || github.event_name == 'schedule'
        run: |
          AWS_ACCESS_KEY_ID=$R2_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY=$R2_SECRET_ACCESS_KEY PROVIDER_PROXY_CACHE_BUCKET=provider-proxy-cache ./ci/upload-provider-proxy-cache.sh
  client-tests:
    name: "client-tests (batch_writes: ${{ matrix.batch_writes }})"
    runs-on: ubuntu-latest
    if: github.repository == 'tensorzero/tensorzero'
    permissions:
      # Permission to check out the repository
      contents: read
      # Permission to fetch a GitHub OIDC token for authentication
      id-token: write
    timeout-minutes: 45
    strategy:
      matrix:
        batch_writes: [true, false]
      # Don't fail fast for manual/cron runs, so that we get the full picture of what broke
      fail-fast: ${{ github.event_name == 'merge_group' }}
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
      - name: Install gdb
        run: sudo apt-get update && sudo apt-get install -y gdb
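      # The curls below run in the background; hitting each Modal endpoint presumably triggers
      # a cold start so the GPU-backed vLLM/SGLang services are warm before the tests reach them.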
      - name: Warm up Modal instances
        run: |
          curl -H "Modal-Key: $MODAL_KEY" -H "Modal-Secret: $MODAL_SECRET" https://tensorzero--vllm-inference-vllm-inference.modal.run/docs > vllm_modal_logs.txt &
          curl -H "Modal-Key: $MODAL_KEY" -H "Modal-Secret: $MODAL_SECRET" https://tensorzero--sglang-0-4-10-inference-sglang-inference.modal.run/ > sglang_modal_logs.txt &
          # TODO: Re-enable once we can switch to a T4 GPU
          # curl -H "Modal-Key: $MODAL_KEY" -H "Modal-Secret: $MODAL_SECRET" https://tensorzero--vllm-gpt-oss-20b-serve.modal.run/ > vllm_gpt_oss_modal_logs.txt &
      - name: Cleanup disk space
        run: ./ci/free-disk-space.sh
      - name: Download client-tests provider-proxy cache
        # When running as a cron job, don't use the provider-proxy cache.
        # The cron job is used to gather information about provider flakiness.
        if: github.event_name != 'schedule'
        run: |
          AWS_ACCESS_KEY_ID=$R2_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY=$R2_SECRET_ACCESS_KEY PROVIDER_PROXY_CACHE_BUCKET=provider-proxy-cache-client-tests ./ci/download-provider-proxy-cache.sh
      - name: Update Rust
        run: |
          for attempt in 1 2 3; do
            if rustup update stable && rustup default stable; then
              break
            fi
            if [ $attempt -eq 3 ]; then
              echo "Failed to update Rust after 3 attempts"
              exit 1
            fi
            sleep $((10 * attempt))
          done
        shell: bash
      - name: Install Namespace CLI
        uses: namespacelabs/nscloud-setup@d1c625762f7c926a54bd39252efff0705fd11c64
      - name: Configure Namespace-powered Buildx
        uses: namespacelabs/nscloud-setup-buildx-action@91c2e6537780e3b092cb8476406be99a8f91bd5e
        with:
          wait-for-builder: true
      - name: Install Rust toolchain
        run: |
          for attempt in 1 2 3; do
            if rustup toolchain install stable && rustup default stable; then
              break
            fi
            if [ $attempt -eq 3 ]; then
              echo "Failed to install Rust toolchain after 3 attempts"
              exit 1
            fi
            sleep $((10 * attempt))
          done
        shell: bash
      # Start testing workload identity federation credentials once the SDK adds support: https://github.com/googleapis/google-cloud-rust/issues/1342
      # - uses: 'google-github-actions/auth@v2'
      #   with:
      #     project_id: 'tensozero-public'
      #     workload_identity_provider: 'projects/454541351720/locations/global/workloadIdentityPools/github/providers/tensorzero'
      - name: Print Rust version
        run: rustc --version
      - name: Install uv
        uses: astral-sh/setup-uv@ed21f2f24f8dd64503750218de024bcf64c7250a
        with:
          version: "0.6.17"
      - name: Install pnpm
        run: |
          for attempt in 1 2 3; do
            if npm install -g pnpm@latest; then
              break
            fi
            if [ $attempt -eq 3 ]; then
              echo "Failed to install pnpm after 3 attempts"
              exit 1
            fi
            sleep $((10 * attempt))
          done
        shell: bash
      - name: Install JS dependencies
        run: pnpm install --frozen-lockfile
      - name: Login to DockerHub
        uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Install cargo-nextest
        uses: taiki-e/install-action@d12e869b89167df346dd0ff65da342d1fb1202fb
        with:
          tool: cargo-nextest
      - name: Write GCP JWT key to file
        env:
          GCP_JWT_KEY: ${{ secrets.GCP_JWT_KEY }}
        run: echo "$GCP_JWT_KEY" > $GITHUB_WORKSPACE/gcp_jwt_key.json
      - name: Set up database URLs for E2E tests
        run: |
          echo "TENSORZERO_CLICKHOUSE_URL=http://chuser:chpassword@localhost:8123/tensorzero_e2e_tests" >> $GITHUB_ENV
          echo "TENSORZERO_CLICKHOUSE_BATCH_WRITES=${{ matrix.batch_writes }}" >> $GITHUB_ENV
          echo "DATABASE_URL=postgresql://postgres:postgres@localhost:5432/tensorzero-e2e-tests" >> $GITHUB_ENV
          echo "TENSORZERO_POSTGRES_URL=postgresql://postgres:postgres@localhost:5432/tensorzero-e2e-tests" >> $GITHUB_ENV
          echo "TENSORZERO_SKIP_LARGE_FIXTURES=1" >> $GITHUB_ENV
      - name: Configure batch writes in tensorzero.toml
        if: matrix.batch_writes == true
        run: |
          echo "[gateway.observability.batch_writes]" >> tensorzero-core/tests/e2e/config/tensorzero.misc.toml
          echo "enabled = true" >> tensorzero-core/tests/e2e/config/tensorzero.misc.toml
          echo "flush_interval_ms = 80" >> tensorzero-core/tests/e2e/config/tensorzero.misc.toml
          echo "__force_allow_embedded_batch_writes = true" >> tensorzero-core/tests/e2e/config/tensorzero.misc.toml
      - name: Launch dependency services for E2E tests
        run: |
          docker compose -f tensorzero-core/tests/e2e/docker-compose.yml up --build -d --wait
      - name: Print ClickHouse container logs
        if: always()
        run: |
          docker compose -f tensorzero-core/tests/e2e/docker-compose.yml logs -t
      - name: Launch the provider-proxy cache for E2E tests
        run: |
          ./ci/run-provider-proxy.sh ci
      # TODO - get rid of this when the merge queue has a freshly-built gateway image available
      - name: Manually run the latest Postgres migrations
        run: cargo run-e2e --run-postgres-migrations
      - name: Launch the gateway for E2E tests
        timeout-minutes: 2
        run: |
          cargo run-e2e > e2e_logs.txt 2>&1 &
          # Capture the PID immediately, before any later background job could overwrite $!
          echo "GATEWAY_PID=$!" >> $GITHUB_ENV
          while ! curl -s -f http://localhost:3000/health >/dev/null 2>&1; do
            echo "Waiting for gateway to be healthy..."
            sleep 1
          done
      - name: Install Python for python async client tests
        run: uv python install 3.9
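      # Hang watchdog: run the test suite in the background, poll it for up to 10 minutes,
      # and on a timeout dump gdb backtraces of any matching Python processes before failing the job.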
| -name:"Python: PyO3 Client: pytest" | |
| working-directory:clients/python | |
| run:| | |
| # Start the test in background and capture its PID | |
| bash ./test.sh --verbose -n 8 & | |
| TEST_PID=$! | |
| echo "Started test.sh with PID: $TEST_PID" | |
| # Wait for 10 minutes (600 seconds) | |
| for i in {1..600}; do | |
| if ! kill -0 $TEST_PID 2>/dev/null; then | |
| echo "Test completed normally" | |
| wait $TEST_PID | |
| exit $? | |
| fi | |
| sleep 1 | |
| done | |
| echo "Test has been running for 10 minutes, capturing backtraces..." | |
| # Get all processes related to our test | |
| echo "=== Process tree ===" | |
| ps -ef | grep -E "(test\.sh|pytest|python)" | grep -v grep ||true | |
| echo "=== Capturing backtraces with gdb ===" | |
| # Find all python processes that might be related to our test | |
| PYTHON_PIDS=$(pgrep -f "tensorzero.*python" || true) | |
| if [ -n "$PYTHON_PIDS" ]; then | |
| for pid in $PYTHON_PIDS; do | |
| echo "--- Backtrace for Python process $pid ---" | |
| gdb -p $pid --batch \ | |
| -ex "set pagination off" \ | |
| -ex "thread apply all bt" \ | |
| -ex "info threads" \ | |
| -ex "detach" \ | |
| -ex "quit" 2>&1 ||true | |
| echo "" | |
| done | |
| else | |
| echo "No Python processes found" | |
| fi | |
| exit 1 | |
| -name:"Node.js: OpenAI Client: test" | |
| working-directory:clients/openai-node | |
| run:| | |
| pnpm run test | |
| -name:Install Go | |
| uses:actions/setup-go@29694d72cd5e7ef3b09496b39f28a942af47737e | |
| with: | |
| go-version:"1.24" | |
| -name:"Go: OpenAI Client: test" | |
| working-directory:clients/openai-go/tests | |
| run:go test -v | |
| -name:"Python: Recipes: pytest" | |
| working-directory:recipes | |
| run:| | |
| uv run pytest | |
      - name: Terminate the gateway and wait for it to exit
        if: always()
        run: |
          echo "Killing gateway with pid $GATEWAY_PID"
          kill $GATEWAY_PID
          # Wait for at most 30 seconds for the gateway to exit
          for i in {1..30}; do
            if ! kill -0 $GATEWAY_PID 2>/dev/null; then
              echo "Gateway exited"
              break
            fi
            sleep 1
          done
          if kill -0 $GATEWAY_PID 2>/dev/null; then
            echo "Gateway did not exit after 30 seconds!"
            exit 1
          fi
      - name: Print e2e logs
        if: always()
        run: cat e2e_logs.txt
      - name: Print provider-proxy logs
        if: always()
        run: cat provider_proxy_logs.txt
      - name: Print vLLM modal logs
        if: always()
        run: cat vllm_modal_logs.txt
      - name: Print SGLang modal logs
        if: always()
        run: cat sglang_modal_logs.txt
      - name: Print vLLM GPT-OSS modal logs
        if: always()
        continue-on-error: true
        run: cat vllm_gpt_oss_modal_logs.txt
      - name: Upload client-tests provider-proxy cache
        # Only upload the cache when we're running from a 'good' run
        # (running from the merge queue via 'workflow_call' from general.yml, or a cron job).
        # This prevents manual workflow runs from modifying the cache.
        if: github.event_name == 'merge_group' || github.event_name == 'schedule'
        run: |
          AWS_ACCESS_KEY_ID=$R2_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY=$R2_SECRET_ACCESS_KEY PROVIDER_PROXY_CACHE_BUCKET=provider-proxy-cache-client-tests ./ci/upload-provider-proxy-cache.sh
  # Test that the ui e2e tests still pass after we regenerate the model inference cache
  ui-tests-e2e-regen-model-inference-cache:
    permissions:
      contents: read
      actions: write
    if: github.repository == 'tensorzero/tensorzero'
    uses: ./.github/workflows/ui-tests-e2e-model-inference-cache.yml
    with:
      regen_cache: true
      is_merge_group: true
      force_no_auth: true
    secrets:
      S3_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      S3_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
      FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
  # See 'ci/README.md' at the repository root for more details.
  check-all-tests-passed:
    permissions: {}
    if: always() && github.repository == 'tensorzero/tensorzero'
    needs: [ui-tests-e2e-regen-model-inference-cache, client-tests, live-tests]
    runs-on: ubuntu-latest
    steps:
      # When running in the merge queue, jobs should never be skipped.
      # In a scheduled run, some jobs may be intentionally skipped, as we only care about regenerating the model inference cache.
      - if: ${{ contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') || (github.event_name != 'pull_request' && contains(needs.*.result, 'skipped')) }}
        run: exit 1