Add model comparison test framework and analysis report

- scripts/check_results.py: Python script to check task results
- scripts/run_security_test.sh: Interactive security test runner
- test_results/MODEL_ANALYSIS_REPORT.md: Comprehensive analysis of model selection

Key findings:
- 8/10 requested models work with the agent
- Gemini 3 thinking models require special reasoning-token handling (see the sketch after this list)
- Price-based capability estimation underestimates cheap models
- Benchmark data integration is not working properly
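
To illustrate the reasoning-token finding: a minimal sketch, assuming an OpenRouter-style chat-completions response in which thinking models return a separate "reasoning" field alongside "content". The field names and the extract_answer helper are illustrative and are not taken from the agent's code.

def extract_answer(message: dict) -> str:
    """Return the user-visible answer without the hidden reasoning tokens."""
    # Assumption: some thinking models put most of the turn in "reasoning"
    # and leave "content" empty; fall back so the agent never sees an
    # empty reply.
    content = (message.get("content") or "").strip()
    reasoning = (message.get("reasoning") or "").strip()
    return content if content else reasoning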
Thomas Marchand
2025-12-19 07:54:40 +00:00
parent 14241f92c1
commit 6177ac3a5f
3 changed files with 555 additions and 0 deletions

scripts/check_results.py Normal file

@@ -0,0 +1,135 @@
#!/usr/bin/env python3
"""Check task results from the model comparison test."""
import json
import os
import sys

import requests
API_URL = "https://agent-backend.thomas.md"
# Task IDs from the test (round 2 - with fixed default model)
TASKS = {
    "moonshotai/kimi-k2-thinking": "108bfe55-e937-4ff4-b71e-5865370c8191",
    "x-ai/grok-4.1-fast": "856703ff-f5d1-401d-9f3b-e7f965e4524d",
    "deepseek/deepseek-v3.2-speciale": "a404d71d-f22c-4c38-ac18-7332e39c8b6b",
    "mistralai/mistral-large-2512": "87972676-e4cf-4b23-8f8e-1043169bc12d",
    "anthropic/claude-sonnet-4.5": "e2e1bb84-aaab-410a-b133-68a182901576",
}

def get_token():
    """Get auth token."""
    # Try to get password from secrets.json
    secrets_path = os.path.join(os.path.dirname(__file__), "..", "secrets.json")
    password = ""
    if os.path.exists(secrets_path):
        with open(secrets_path) as f:
            secrets = json.load(f)
        # Try different possible keys
        password = (
            secrets.get("dashboard_password") or
            secrets.get("dashboard", {}).get("password") or
            secrets.get("auth", {}).get("dashboard_password") or
            ""
        )
    if not password:
        password = os.environ.get("DASHBOARD_PASSWORD", "")
    if not password:
        print("Error: No dashboard password found")
        sys.exit(1)
    resp = requests.post(f"{API_URL}/api/auth/login", json={"password": password})
    data = resp.json()
    return data.get("token")

def check_task(token, model, task_id):
    """Check a task's status."""
    headers = {"Authorization": f"Bearer {token}"}
    try:
        resp = requests.get(f"{API_URL}/api/task/{task_id}", headers=headers)
        data = resp.json()
        return {
            "model": model,
            "task_id": task_id,
            "status": data.get("status", "unknown"),
            "iterations": data.get("iterations", 0),
            "result_length": len(data.get("result", "")),
            "result_preview": data.get("result", "")[:200],
            "error": "Error:" in data.get("result", ""),
        }
    except Exception as e:
        return {
            "model": model,
            "task_id": task_id,
            "status": "error",
            "iterations": 0,
            "result_length": 0,
            "result_preview": str(e),
            "error": True,
        }

def main():
    token = get_token()
    if not token:
        print("Failed to get auth token")
        sys.exit(1)

    print("=" * 80)
    print("Quick Model Test Results")
    print("=" * 80)
    print()

    results = []
    for model, task_id in TASKS.items():
        result = check_task(token, model, task_id)
        results.append(result)

    # Print summary table
    print(f"{'Model':<45} | {'Status':<10} | {'Iters':<5} | {'Chars':<8} | {'Error'}")
    print("-" * 45 + "-+-" + "-" * 10 + "-+-" + "-" * 5 + "-+-" + "-" * 8 + "-+-------")
    for r in results:
        error_mark = "❌" if r["error"] else "✓"
        print(f"{r['model']:<45} | {r['status']:<10} | {r['iterations']:<5} | {r['result_length']:<8} | {error_mark}")
    print()

    print("=" * 80)
    print("Detailed Results")
    print("=" * 80)

    # Categorize results
    working = [r for r in results if r["status"] == "completed" and not r["error"]]
    failed = [r for r in results if r["status"] == "failed" or r["error"]]
    running = [r for r in results if r["status"] in ("pending", "running")]

    print(f"\n✓ Working models ({len(working)}):")
    for r in working:
        print(f" - {r['model']}: {r['result_preview'][:100]}...")

    print(f"\n❌ Failed models ({len(failed)}):")
    for r in failed:
        print(f" - {r['model']}: {r['result_preview'][:150]}...")

    if running:
        print(f"\n⏳ Still running ({len(running)}):")
        for r in running:
            print(f" - {r['model']}")

    # Summary
    print()
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Working: {len(working)}/{len(results)}")
    print(f"Failed: {len(failed)}/{len(results)}")
    print(f"Running: {len(running)}/{len(results)}")
    return results


if __name__ == "__main__":
    main()

scripts/run_security_test.sh Executable file

@@ -0,0 +1,223 @@
#!/bin/bash
# Run the Rabby Wallet security analysis with multiple models
# This script submits the task to each model and monitors progress
set -e
API_URL="https://agent-backend.thomas.md"
RESULTS_DIR="$(dirname "$0")/../test_results/security_$(date +%Y%m%d_%H%M%S)"
mkdir -p "$RESULTS_DIR"
echo "==========================================="
echo "Rabby Wallet Security Analysis - Model Comparison"
echo "Results: $RESULTS_DIR"
echo "==========================================="
# Models to test (prioritized)
MODELS=(
    "moonshotai/kimi-k2-thinking"
    "x-ai/grok-4.1-fast"
    "google/gemini-3-flash-preview"
    "deepseek/deepseek-v3.2-speciale"
    "anthropic/claude-sonnet-4.5"  # baseline
)
# The security analysis task
TASK='Download Rabby Wallet extension for Chrome, decompile it, and look for security vulnerabilities similar to the Permit2 transaction simulation bypass bug.
Context on the vulnerability pattern to look for:
- Rabby simulation fails to detect malicious Permit2 approval patterns
- The simulation shows a harmless transaction (e.g., spending 1 USDC) while the actual tx enables draining the user's full balance
- The key issue is that the simulation engine does not correctly model Permit2 delegation or spending flows
- The "spender" field from a Permit2 permit should be validated against known safe contract addresses
Focus areas:
1. How Rabby parses and validates Permit2 signatures
2. Whether the spender field is properly validated against known contract addresses
3. If the witness data can be manipulated to display incorrect transaction details
4. Any other transaction simulation bypass vectors
Steps:
1. Download the Rabby extension (https://rabby.io or Chrome Web Store)
2. Extract and decompile the JavaScript code
3. Search for Permit2-related code paths
4. Analyze the simulation/preview logic
5. Identify potential bypass vectors
Provide findings in a structured markdown report with:
- Vulnerability title
- Severity (Critical/High/Medium/Low)
- Description
- Affected code snippets
- Proof of concept outline
- Recommended fix'
# Get auth token
DASHBOARD_PASSWORD="${DASHBOARD_PASSWORD:-}"
if [ -z "$DASHBOARD_PASSWORD" ]; then
    # Try to get from secrets.json
    if [ -f "$(dirname "$0")/../secrets.json" ]; then
        DASHBOARD_PASSWORD=$(jq -r '.dashboard_password // empty' "$(dirname "$0")/../secrets.json")
    fi
fi
if [ -z "$DASHBOARD_PASSWORD" ]; then
    echo "Error: DASHBOARD_PASSWORD not set"
    exit 1
fi

TOKEN=$(curl -s -X POST "$API_URL/api/auth/login" \
    -H "Content-Type: application/json" \
    -d "{\"password\": \"$DASHBOARD_PASSWORD\"}" | jq -r '.token')
if [ -z "$TOKEN" ] || [ "$TOKEN" = "null" ]; then
    echo "Failed to get auth token"
    exit 1
fi
echo "Authenticated successfully"
# Function to submit a task
submit_task() {
    local model="$1"
    local safe_name=$(echo "$model" | tr '/' '_' | tr ':' '_')
    echo ""
    echo "Submitting task for: $model"

    local payload=$(jq -n \
        --arg task "$TASK" \
        --arg model "$model" \
        '{task: $task, model: $model}')
    local response=$(curl -s -X POST "$API_URL/api/task" \
        -H "Content-Type: application/json" \
        -H "Authorization: Bearer $TOKEN" \
        -d "$payload")
    local task_id=$(echo "$response" | jq -r '.id // empty')

    if [ -z "$task_id" ]; then
        echo " Failed: $response"
        return 1
    fi
    echo " Task ID: $task_id"
    echo "$task_id" > "$RESULTS_DIR/${safe_name}_task_id.txt"

    # Save initial state
    echo "{\"model\": \"$model\", \"task_id\": \"$task_id\", \"submitted_at\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" > "$RESULTS_DIR/${safe_name}_meta.json"
}
# Submit all tasks
echo ""
echo "Submitting tasks..."
for model in "${MODELS[@]}"; do
    submit_task "$model"
    sleep 1
done
echo ""
echo "All tasks submitted. Monitoring progress..."
echo "(Press Ctrl+C to stop monitoring)"
echo ""
# Monitor loop
while true; do
    all_done=true
    clear
    echo "==========================================="
    echo "Task Status ($(date))"
    echo "==========================================="
    printf "%-45s | %-10s | %8s | %s\n" "Model" "Status" "Iters" "Result"
    echo "---------------------------------------------+------------+----------+---------"

    for model in "${MODELS[@]}"; do
        safe_name=$(echo "$model" | tr '/' '_' | tr ':' '_')
        task_id_file="$RESULTS_DIR/${safe_name}_task_id.txt"
        if [ ! -f "$task_id_file" ]; then
            printf "%-45s | %-10s | %8s | %s\n" "$model" "no_task" "-" "-"
            continue
        fi
        task_id=$(cat "$task_id_file")
        status_response=$(curl -s "$API_URL/api/task/$task_id" -H "Authorization: Bearer $TOKEN")
        status=$(echo "$status_response" | jq -r '.status // "unknown"')
        iterations=$(echo "$status_response" | jq -r '.iterations // 0')
        result_preview=$(echo "$status_response" | jq -r '.result // ""' | head -c 50)

        if [ "$status" != "completed" ] && [ "$status" != "failed" ]; then
            all_done=false
        fi
        printf "%-45s | %-10s | %8s | %s\n" "$model" "$status" "$iterations" "${result_preview:0:50}"

        # Save full result if done
        if [ "$status" = "completed" ] || [ "$status" = "failed" ]; then
            echo "$status_response" | jq . > "$RESULTS_DIR/${safe_name}_result.json"
        fi
    done

    if $all_done; then
        echo ""
        echo "All tasks completed!"
        break
    fi
    sleep 10
done
# Generate summary
echo ""
echo "==========================================="
echo "Final Summary"
echo "==========================================="
{
    echo "# Model Comparison Results"
    echo ""
    echo "Generated: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
    echo ""
    echo "| Model | Status | Iterations | Result Length | Cost (cents) |"
    echo "|-------|--------|------------|---------------|--------------|"
    for model in "${MODELS[@]}"; do
        safe_name=$(echo "$model" | tr '/' '_' | tr ':' '_')
        result_file="$RESULTS_DIR/${safe_name}_result.json"
        if [ -f "$result_file" ]; then
            status=$(jq -r '.status' "$result_file")
            iterations=$(jq -r '.iterations' "$result_file")
            result=$(jq -r '.result // ""' "$result_file")
            result_len=${#result}
            # Note: cost would need to be tracked by the agent
            echo "| $model | $status | $iterations | $result_len | - |"
        else
            echo "| $model | no_result | - | - | - |"
        fi
    done
    echo ""
    echo "## Detailed Results"
    echo ""
    for model in "${MODELS[@]}"; do
        safe_name=$(echo "$model" | tr '/' '_' | tr ':' '_')
        result_file="$RESULTS_DIR/${safe_name}_result.json"
        if [ -f "$result_file" ]; then
            echo "### $model"
            echo ""
            jq -r '.result // "No result"' "$result_file"
            echo ""
            echo "---"
            echo ""
        fi
    done
} > "$RESULTS_DIR/REPORT.md"
echo "Report saved to: $RESULTS_DIR/REPORT.md"
echo ""
head -30 "$RESULTS_DIR/REPORT.md"