#!/bin/bash
# Source: opencode-databricks/test-databricks-models.sh at dev · dgokeeffe/opencode-databricks · GitHub
# Test script for Databricks models via the opencode HTTP API.
# For every model it checks a basic text response and a tool-call round trip.
# Auto-starts/restarts the dev server as needed.
#
# Optional environment overrides:
#   OPENCODE_TEST_BASE_URL  - server URL          (default: http://localhost:4096)
#   OPENCODE_TEST_PROVIDER  - provider ID to test (default: databricks)

BASE_URL="${OPENCODE_TEST_BASE_URL:-http://localhost:4096}"
PROVIDER="${OPENCODE_TEST_PROVIDER:-databricks}"

# Absolute directory of this script; the dev server is launched from here.
# Abort early instead of continuing with an empty path.
SCRIPT_DIR="$(cd -- "$(dirname -- "$0")" && pwd)" || {
  echo "Cannot resolve the script directory from '$0'" >&2
  exit 1
}

# Representative models from each family
MODELS=(
  "databricks-claude-sonnet-4-6"
  "databricks-gpt-5-3-codex"
  "databricks-gpt-5-2-codex"
  "databricks-gpt-5"
  "databricks-gpt-5-mini"
  "databricks-gemini-2-5-flash"
  "databricks-gemini-2-5-pro"
  "databricks-llama-4-maverick"
  "databricks-meta-llama-3-3-70b-instruct"
  "databricks-qwen3-next-80b-a3b-instruct"
)

# Run-wide tallies, updated by test_model and reported in the final summary.
PASSED=0
FAILED=0
WARNINGS=0
ERRORS=()    # one human-readable line per failure
WARN_LIST=() # one human-readable line per warning

# Auto-manage the dev server

#######################################
# (Re)start the dev server in the background and wait until it answers.
# Globals:
#   SCRIPT_DIR (read)    - directory `bun dev serve` is launched from
#   BASE_URL   (read)    - "$BASE_URL/session" is polled for readiness
#   SERVER_PID (written) - PID of the background job running the server
#   OPENCODE_TEST_SERVER_LOG (read, optional) - where server output goes
#                          (default: /tmp/opencode-test-server.log)
# Outputs:
#   Progress line on stdout.
# Returns:
#   0 once the session endpoint returns JSON; 1 if the background job exits
#   first or the endpoint is still not ready after 30 probes.
#######################################
start_server() {
  local log_file="${OPENCODE_TEST_SERVER_LOG:-/tmp/opencode-test-server.log}"
  local -r max_attempts=30
  local attempt

  echo -n "  Starting dev server... "
  # NOTE: this matches on the full command line, so it stops every process
  # matching "bun.*serve", not only one started by this script.
  pkill -f "bun.*serve" 2>/dev/null
  sleep 1

  # The whole `cd && bun` list is backgrounded, so the cd never changes the
  # caller's working directory and $! is the PID of that background job.
  cd "$SCRIPT_DIR" && bun dev serve > "$log_file" 2>&1 &
  SERVER_PID=$!

  # Wait for server to be ready (up to 30 probes, one second apart)
  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    if curl -s "$BASE_URL/session" | jq -e . > /dev/null 2>&1; then
      echo "OK (pid: $SERVER_PID)"
      return 0
    fi
    # Fail fast: if the job is already gone (bun missing, crash on boot)
    # there is no point in waiting out the remaining attempts.
    if ! kill -0 "$SERVER_PID" 2>/dev/null; then
      echo "FAIL (server exited; see $log_file)"
      return 1
    fi
    sleep 1
  done
  echo "FAIL (timeout; see $log_file)"
  return 1
}

# Stop the dev server by signalling every process whose full command line
# matches the pattern below. pkill's stderr is discarded and its exit status
# becomes this function's return status.
stop_server() {
  local -r server_pattern="bun.*serve"
  { pkill -f "$server_pattern"; } 2>/dev/null
}

# POST one text prompt to an existing session and print the raw response body.
# Arguments: $1 session ID, $2 model ID, $3 prompt text, $4 curl --max-time (s)
_post_prompt() {
  local session_id="$1" model_id="$2" prompt="$3" max_time="$4"
  local payload
  # Build the body with jq so every value is JSON-escaped correctly.
  payload=$(jq -n -c \
    --arg provider "$PROVIDER" \
    --arg model "$model_id" \
    --arg text "$prompt" \
    '{model: {providerID: $provider, modelID: $model}, parts: [{type: "text", text: $text}]}') \
    || return 1
  curl -s --max-time "$max_time" -X POST "$BASE_URL/session/$session_id/message" \
    -H "Content-Type: application/json" \
    -d "$payload"
}

# Print all text parts of the message response read from stdin, space-joined.
_response_text() {
  jq -r '[.parts[] | select(.type == "text") | .text] | join(" ")' 2>/dev/null
}

# Count one failure and remember its description ($1) for the final summary.
_record_failure() {
  FAILED=$((FAILED + 1))
  ERRORS+=("$1")
}

#######################################
# Run the basic-response check and the tool-call check against one model.
# Globals:
#   BASE_URL, PROVIDER (read)
#   PASSED, FAILED, WARNINGS, ERRORS, WARN_LIST (updated)
#   OPENCODE_TEST_TOOL_FILE (read, optional) - file the model is asked to
#     read in the tool-call check; defaults to the original hard-coded path
# Arguments:
#   $1 - model ID
# Outputs:
#   Progress lines on stdout.
# Returns:
#   0 when the model is counted as passed (possibly with a warning),
#   1 when it is counted as failed.
#######################################
test_model() {
  local model="$1"
  local tool_file="${OPENCODE_TEST_TOOL_FILE:-/Users/david.okeeffe/Repos/opencode/opencode.json}"
  local session_body session_resp session_id
  local resp err_name err_msg
  local text input_tokens output_tokens
  local tool_prompt tool_text history tool_count tool_names

  echo ""
  echo "============================================"
  echo "Testing: $model"
  echo "============================================"

  # 1. Create session
  echo -n "  Creating session... "
  session_body=$(jq -n -c --arg title "test-$model" '{title: $title}')
  session_resp=$(curl -s -X POST "$BASE_URL/session" \
    -H "Content-Type: application/json" \
    -d "$session_body")

  session_id=$(jq -r '.id // empty' <<<"$session_resp" 2>/dev/null)
  if [[ -z "$session_id" ]]; then
    echo "FAIL"
    _record_failure "$model: session creation failed"
    return 1
  fi
  echo "$session_id"

  # 2. Test basic response
  echo -n "  Basic response... "
  resp=$(_post_prompt "$session_id" "$model" \
    "What is 2+2? Answer with just the number." 120)

  # Check for API-level errors
  err_name=$(jq -r '.info.error.name // empty' <<<"$resp" 2>/dev/null)
  err_msg=$(jq -r '.info.error.data.message // empty' <<<"$resp" 2>/dev/null)
  if [[ -n "$err_name" ]]; then
    echo "FAIL ($err_name: ${err_msg:0:100})"
    _record_failure "$model: $err_name - ${err_msg:0:150}"
    return 1
  fi

  # Check for text parts
  text=$(_response_text <<<"$resp")
  input_tokens=$(jq '.info.tokens.input // 0' <<<"$resp" 2>/dev/null)
  output_tokens=$(jq '.info.tokens.output // 0' <<<"$resp" 2>/dev/null)
  if [[ -z "$text" || "$text" == "null" ]]; then
    echo "FAIL (no text in response)"
    _record_failure "$model: no text in basic response"
    return 1
  fi
  echo "OK (\"${text:0:50}\") [${input_tokens}/${output_tokens} tokens]"

  # 3. Test tool call - ask it to use the read tool
  echo -n "  Tool call... "
  tool_prompt="Use the read tool to read the file at $tool_file"
  tool_prompt+=" and tell me the provider name configured in it."
  resp=$(_post_prompt "$session_id" "$model" "$tool_prompt" 300)

  # Check for API-level errors
  err_name=$(jq -r '.info.error.name // empty' <<<"$resp" 2>/dev/null)
  err_msg=$(jq -r '.info.error.data.message // empty' <<<"$resp" 2>/dev/null)
  if [[ -n "$err_name" ]]; then
    echo "FAIL ($err_name: ${err_msg:0:100})"
    _record_failure "$model: tool test - $err_name - ${err_msg:0:150}"
    return 1
  fi

  tool_text=$(_response_text <<<"$resp")
  tool_text=${tool_text:0:150}

  # The prompt endpoint returns only the final assistant message.
  # Check full message history for tool calls in intermediate steps.
  history=$(curl -s "$BASE_URL/session/$session_id/message" 2>/dev/null)
  tool_count=$(jq '[.[] | .parts[] | select(.type == "tool")] | length' \
    <<<"$history" 2>/dev/null)
  # jq prints nothing when the fetch failed or the body is not the expected
  # JSON; treat that as "no tool calls" rather than feeding "" to -gt.
  [[ "$tool_count" =~ ^[0-9]+$ ]] || tool_count=0

  if [ "$tool_count" -gt 0 ]; then
    tool_names=$(jq -r \
      '[.[] | .parts[] | select(.type == "tool") | .tool] | unique | join(", ")' \
      <<<"$history" 2>/dev/null)
    echo "OK (tools: $tool_names) text: \"${tool_text:0:80}\""
    PASSED=$((PASSED + 1))
  elif [[ -n "$tool_text" ]]; then
    # Answered in text without calling a tool: pass, but flag it.
    echo "WARN (no tool call, but text: \"${tool_text:0:80}\")"
    WARNINGS=$((WARNINGS + 1))
    WARN_LIST+=("$model: responded without using tools")
    PASSED=$((PASSED + 1))
  else
    # Neither a tool call nor any text: a plain failure, not a warning.
    echo "FAIL (no tool call and no text)"
    _record_failure "$model: no tool call and no text in tool test"
    return 1
  fi
}

# Main flow: banner, preflight, ensure a server is up, test each model, report.
echo "====================================================="
echo "  Databricks Model Integration Test Suite"
echo "====================================================="
echo "  Server: $BASE_URL"
echo "  Models: ${#MODELS[@]}"
echo "====================================================="

# Preflight: without curl or jq the readiness probe below can never succeed,
# which would make the script kill and restart an already healthy server.
for required_tool in curl jq; do
  if ! command -v "$required_tool" > /dev/null 2>&1; then
    echo "  Missing required tool: $required_tool" >&2
    exit 127
  fi
done

# Auto-start server
if curl -s "$BASE_URL/session" | jq -e . > /dev/null 2>&1; then
  echo "  Server already running"
elif ! command -v bun > /dev/null 2>&1; then
  echo "  Server is not running and 'bun' is not installed; cannot start it" >&2
  exit 127
else
  start_server || exit 1
fi

# test_model returns non-zero for a failed model; keep going regardless so
# every model is exercised and tallied.
for model in "${MODELS[@]}"; do
  test_model "$model"
done

echo ""
echo "====================================================="
echo "  RESULTS"
echo "====================================================="
echo "  Passed:   $PASSED / ${#MODELS[@]}"
echo "  Failed:   $FAILED / ${#MODELS[@]}"
echo "  Warnings: $WARNINGS"
if [ "${#ERRORS[@]}" -gt 0 ]; then
  echo ""
  echo "  Failures:"
  for err in "${ERRORS[@]}"; do
    echo "    - $err"
  done
fi
if [ "${#WARN_LIST[@]}" -gt 0 ]; then
  echo ""
  echo "  Warnings:"
  for w in "${WARN_LIST[@]}"; do
    echo "    - $w"
  done
fi
echo "====================================================="

# Exit status is the failure count, capped at 255 so that a large count can
# never wrap around to 0 and be mistaken for success.
exit "$((FAILED > 255 ? 255 : FAILED))"