Add model comparison test framework and analysis report
- scripts/check_results.py: Python script to check task results
- scripts/run_security_test.sh: Interactive security test runner
- test_results/MODEL_ANALYSIS_REPORT.md: Comprehensive analysis of model selection

Key findings:
- 8/10 requested models work with the agent
- Gemini 3 thinking models require special reasoning token handling
- Price-based capability estimation underestimates cheap models
- Benchmark data integration is not working properly
This commit is contained in:
135
scripts/check_results.py
Normal file
135
scripts/check_results.py
Normal file
@@ -0,0 +1,135 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Check task results from the model comparison test."""
|
||||
|
||||
import json
|
||||
import requests
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Base URL used for the login and task-status requests in this script.
API_URL = "https://agent-backend.thomas.md"

# Task IDs from the test (round 2 - with fixed default model).
# Maps each model identifier to the ID of the task submitted for it; the
# report is printed in this insertion order.
TASKS = {
    "moonshotai/kimi-k2-thinking": "108bfe55-e937-4ff4-b71e-5865370c8191",
    "x-ai/grok-4.1-fast": "856703ff-f5d1-401d-9f3b-e7f965e4524d",
    "deepseek/deepseek-v3.2-speciale": "a404d71d-f22c-4c38-ac18-7332e39c8b6b",
    "mistralai/mistral-large-2512": "87972676-e4cf-4b23-8f8e-1043169bc12d",
    "anthropic/claude-sonnet-4.5": "e2e1bb84-aaab-410a-b133-68a182901576",
}
|
||||
|
||||
|
||||
def get_token():
    """Log in to the dashboard API and return the auth token.

    The password is looked up in ``secrets.json`` one directory above this
    script (keys ``dashboard_password``, ``dashboard.password`` or
    ``auth.dashboard_password``), then in the ``DASHBOARD_PASSWORD``
    environment variable.

    Returns:
        The ``token`` field of the login response, or ``None`` when the
        request fails or the response carries no token.

    Exits the process with status 1 when no password can be found.
    """
    secrets_path = os.path.join(os.path.dirname(__file__), "..", "secrets.json")
    password = ""
    if os.path.exists(secrets_path):
        try:
            with open(secrets_path, encoding="utf-8") as f:
                secrets = json.load(f)
        except (OSError, ValueError) as exc:
            # An unreadable secrets file should not block the env-var fallback.
            print(f"Warning: could not read {secrets_path}: {exc}")
            secrets = {}
        if not isinstance(secrets, dict):
            secrets = {}

        # Try different possible keys. Nested sections are only consulted
        # when they really are objects, so a null/string value cannot crash.
        dashboard = secrets.get("dashboard")
        auth = secrets.get("auth")
        password = (
            secrets.get("dashboard_password")
            or (dashboard.get("password") if isinstance(dashboard, dict) else None)
            or (auth.get("dashboard_password") if isinstance(auth, dict) else None)
            or ""
        )

    if not password:
        password = os.environ.get("DASHBOARD_PASSWORD", "")

    if not password:
        print("Error: No dashboard password found")
        sys.exit(1)

    try:
        resp = requests.post(
            f"{API_URL}/api/auth/login",
            json={"password": password},
            timeout=30,  # never hang forever on an unresponsive backend
        )
        resp.raise_for_status()
        data = resp.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a non-JSON body on older requests versions.
        print(f"Error: login request failed: {exc}")
        return None

    return data.get("token") if isinstance(data, dict) else None
|
||||
|
||||
|
||||
def check_task(token, model, task_id):
    """Fetch one task from the API and summarise its state.

    Args:
        token: Bearer token sent in the ``Authorization`` header.
        model: Model identifier, copied into the summary unchanged.
        task_id: ID of the task to look up.

    Returns:
        A dict with the keys ``model``, ``task_id``, ``status``,
        ``iterations``, ``result_length``, ``result_preview`` (first 200
        characters of the result) and ``error``. When the request or its
        decoding fails, ``status`` is ``"error"``, ``error`` is ``True`` and
        ``result_preview`` holds the exception text.
    """
    headers = {"Authorization": f"Bearer {token}"}
    try:
        resp = requests.get(
            f"{API_URL}/api/task/{task_id}",
            headers=headers,
            timeout=30,  # a stuck request must not stall the whole report
        )
        resp.raise_for_status()
        data = resp.json()
        if not isinstance(data, dict):
            raise ValueError(f"unexpected response payload: {type(data).__name__}")
    except (requests.RequestException, ValueError) as e:
        return {
            "model": model,
            "task_id": task_id,
            "status": "error",
            "iterations": 0,
            "result_length": 0,
            "result_preview": str(e),
            "error": True,
        }

    # The API may send explicit nulls (e.g. no result yet); `.get(key, default)`
    # would pass None through, so normalise with `or` instead.
    result = data.get("result") or ""
    if not isinstance(result, str):
        result = str(result)

    return {
        "model": model,
        "task_id": task_id,
        "status": data.get("status") or "unknown",
        "iterations": data.get("iterations") or 0,
        "result_length": len(result),
        "result_preview": result[:200],
        "error": "Error:" in result,
    }
|
||||
|
||||
|
||||
def main():
    """Print a status report for every task listed in ``TASKS``.

    Obtains a token, fetches each task, prints a summary table, lists the
    models grouped as working / failed / still running, and ends with the
    totals for each group.

    Returns:
        The list of per-task summary dicts produced by ``check_task``.

    Exits the process with status 1 when no auth token is available.
    """
    banner = "=" * 80

    token = get_token()
    if not token:
        print("Failed to get auth token")
        sys.exit(1)

    print(banner)
    print("Quick Model Test Results")
    print(banner)
    print()

    results = [
        check_task(token, model, task_id) for model, task_id in TASKS.items()
    ]

    # Summary table: header, rule, then one row per task.
    print(f"{'Model':<45} | {'Status':<10} | {'Iters':<5} | {'Chars':<8} | {'Error'}")
    print("-+-".join("-" * width for width in (45, 10, 5, 8, 6)))

    for entry in results:
        if entry["error"]:
            mark = "❌"
        else:
            mark = "✓"
        print(f"{entry['model']:<45} | {entry['status']:<10} | {entry['iterations']:<5} | {entry['result_length']:<8} | {mark}")

    print()
    print(banner)
    print("Detailed Results")
    print(banner)

    # The three groups are evaluated independently, so one task may land in
    # more than one of them (or in none).
    working, failed, running = [], [], []
    for entry in results:
        if entry["status"] == "completed" and not entry["error"]:
            working.append(entry)
        if entry["status"] == "failed" or entry["error"]:
            failed.append(entry)
        if entry["status"] in ("pending", "running"):
            running.append(entry)

    print(f"\n✓ Working models ({len(working)}):")
    for entry in working:
        print(f" - {entry['model']}: {entry['result_preview'][:100]}...")

    print(f"\n❌ Failed models ({len(failed)}):")
    for entry in failed:
        print(f" - {entry['model']}: {entry['result_preview'][:150]}...")

    if running:
        print(f"\n⏳ Still running ({len(running)}):")
        for entry in running:
            print(f" - {entry['model']}")

    # Totals
    total = len(results)
    print()
    print(banner)
    print("SUMMARY")
    print(banner)
    print(f"Working: {len(working)}/{total}")
    print(f"Failed: {len(failed)}/{total}")
    print(f"Running: {len(running)}/{total}")

    return results


if __name__ == "__main__":
    main()
|
||||
223
scripts/run_security_test.sh
Executable file
223
scripts/run_security_test.sh
Executable file
@@ -0,0 +1,223 @@
|
||||
#!/bin/bash
# Rabby Wallet security analysis, run against several models.
# The same task is submitted once per model and then monitored.

set -e

API_URL="https://agent-backend.thomas.md"

# Every run writes into its own timestamped directory under test_results/.
run_stamp=$(date +%Y%m%d_%H%M%S)
RESULTS_DIR="$(dirname "$0")/../test_results/security_${run_stamp}"
mkdir -p "$RESULTS_DIR"

banner_rule="==========================================="
printf '%s\n' \
    "$banner_rule" \
    "Rabby Wallet Security Analysis - Model Comparison" \
    "Results: $RESULTS_DIR" \
    "$banner_rule"

# Models to test (prioritized)
MODELS=(
    "moonshotai/kimi-k2-thinking"
    "x-ai/grok-4.1-fast"
    "google/gemini-3-flash-preview"
    "deepseek/deepseek-v3.2-speciale"
    "anthropic/claude-sonnet-4.5"  # baseline
)

# The security analysis task, sent verbatim to every model.
TASK='Download Rabby Wallet extension for Chrome, decompile it, and look for security vulnerabilities similar to the Permit2 transaction simulation bypass bug.

Context on the vulnerability pattern to look for:
- Rabby simulation fails to detect malicious Permit2 approval patterns
- The simulation shows a harmless transaction (e.g., spending 1 USDC) while the actual tx enables draining the user full balance
- The key issue is that the simulation engine does not correctly model Permit2 delegation or spending flows
- The "spender" field from a permit2 should be validated against known safe contract addresses

Focus areas:
1. How Rabby parses and validates Permit2 signatures
2. Whether the spender field is properly validated against known contract addresses
3. If the witness data can be manipulated to display incorrect transaction details
4. Any other transaction simulation bypass vectors

Steps:
1. Download the Rabby extension (https://rabby.io or Chrome Web Store)
2. Extract and decompile the JavaScript code
3. Search for Permit2-related code paths
4. Analyze the simulation/preview logic
5. Identify potential bypass vectors

Provide findings in a structured markdown report with:
- Vulnerability title
- Severity (Critical/High/Medium/Low)
- Description
- Affected code snippets
- Proof of concept outline
- Recommended fix'
|
||||
|
||||
# Get auth token.
# The password comes from $DASHBOARD_PASSWORD, otherwise from secrets.json one
# directory above this script. The lookup accepts the same three key layouts
# as scripts/check_results.py.
DASHBOARD_PASSWORD="${DASHBOARD_PASSWORD:-}"
SECRETS_FILE="$(dirname "$0")/../secrets.json"
if [ -z "$DASHBOARD_PASSWORD" ] && [ -f "$SECRETS_FILE" ]; then
    DASHBOARD_PASSWORD=$(jq -r \
        '.dashboard_password // .dashboard.password? // .auth.dashboard_password? // empty' \
        "$SECRETS_FILE")
fi

if [ -z "$DASHBOARD_PASSWORD" ]; then
    echo "Error: DASHBOARD_PASSWORD not set"
    exit 1
fi

# Build the request body with jq so quotes or backslashes in the password are
# escaped rather than producing malformed JSON.
LOGIN_PAYLOAD=$(jq -n --arg password "$DASHBOARD_PASSWORD" '{password: $password}')

# `|| true`: a non-JSON reply makes jq fail; without it `set -e` would abort
# here before the explicit error message below can be printed.
TOKEN=$(curl -s -X POST "$API_URL/api/auth/login" \
    -H "Content-Type: application/json" \
    -d "$LOGIN_PAYLOAD" | jq -r '.token // empty' 2>/dev/null) || true

if [ -z "$TOKEN" ] || [ "$TOKEN" = "null" ]; then
    echo "Failed to get auth token"
    exit 1
fi

echo "Authenticated successfully"
|
||||
|
||||
# Function to submit a task
# Arguments:
#   $1 - model identifier to run $TASK with
# Outputs:
#   Writes <safe_name>_task_id.txt and <safe_name>_meta.json to $RESULTS_DIR,
#   where <safe_name> is the model with "/" and ":" replaced by "_".
# Returns:
#   1 when the request fails or the response contains no task id.
submit_task() {
    local model="$1"
    # Declared separately from the assignments below: `local x=$(cmd)` would
    # hide cmd's exit status behind that of `local`.
    local safe_name payload response task_id submitted_at

    safe_name=$(echo "$model" | tr '/:' '__')

    echo ""
    echo "Submitting task for: $model"

    payload=$(jq -n \
        --arg task "$TASK" \
        --arg model "$model" \
        '{task: $task, model: $model}')

    if ! response=$(curl -s -X POST "$API_URL/api/task" \
        -H "Content-Type: application/json" \
        -H "Authorization: Bearer $TOKEN" \
        -d "$payload"); then
        echo " Failed: request to $API_URL/api/task did not complete"
        return 1
    fi

    # A non-JSON response is treated like a response without an id.
    task_id=$(echo "$response" | jq -r '.id // empty' 2>/dev/null) || task_id=""

    if [ -z "$task_id" ]; then
        echo " Failed: $response"
        return 1
    fi

    echo " Task ID: $task_id"
    echo "$task_id" > "$RESULTS_DIR/${safe_name}_task_id.txt"

    # Save initial state; jq does the quoting so the file is always valid JSON.
    submitted_at=$(date -u +%Y-%m-%dT%H:%M:%SZ)
    jq -cn \
        --arg model "$model" \
        --arg task_id "$task_id" \
        --arg submitted_at "$submitted_at" \
        '{model: $model, task_id: $task_id, submitted_at: $submitted_at}' \
        > "$RESULTS_DIR/${safe_name}_meta.json"
}
|
||||
|
||||
# Submit all tasks.
# submit_task returns 1 on failure; called bare under `set -e` that would abort
# the whole script on the first failing model. Handle it here so the remaining
# models are still submitted (the monitor shows the failed ones as "no_task").
echo ""
echo "Submitting tasks..."
for model in "${MODELS[@]}"; do
    submit_task "$model" || echo " Skipping $model: submission failed"
    sleep 1
done

echo ""
echo "All tasks submitted. Monitoring progress..."
echo "(Press Ctrl+C to stop monitoring)"
echo ""
|
||||
|
||||
# Monitor loop
# Every 10 seconds, redraw a status table for all models and stop once every
# submitted task reports "completed" or "failed". Models without a task id
# file are shown as "no_task" and do not keep the loop alive.
while true; do
    all_done=true
    # `clear` fails when TERM is unset (e.g. cron/CI); that must not trip `set -e`.
    clear 2>/dev/null || true
    echo "==========================================="
    echo "Task Status ($(date))"
    echo "==========================================="
    printf "%-45s | %-10s | %8s | %s\n" "Model" "Status" "Iters" "Result"
    echo "---------------------------------------------+------------+----------+---------"

    for model in "${MODELS[@]}"; do
        safe_name=$(echo "$model" | tr '/' '_' | tr ':' '_')
        task_id_file="$RESULTS_DIR/${safe_name}_task_id.txt"

        if [ ! -f "$task_id_file" ]; then
            printf "%-45s | %-10s | %8s | %s\n" "$model" "no_task" "-" "-"
            continue
        fi

        task_id=$(cat "$task_id_file")

        # A transient network error or non-JSON reply is shown as "unknown"
        # and retried on the next pass instead of aborting under `set -e`.
        status_response=$(curl -s "$API_URL/api/task/$task_id" -H "Authorization: Bearer $TOKEN") || status_response=""

        status=$(echo "$status_response" | jq -r '.status // "unknown"' 2>/dev/null) || status=""
        status=${status:-unknown}
        iterations=$(echo "$status_response" | jq -r '.iterations // 0' 2>/dev/null) || iterations=""
        iterations=${iterations:-0}
        # Flatten line breaks and tabs so a multi-line result cannot break the table row.
        result_preview=$(echo "$status_response" | jq -r '.result // ""' 2>/dev/null | tr '\n\r\t' '   ' | head -c 50) || result_preview=""

        if [ "$status" != "completed" ] && [ "$status" != "failed" ]; then
            all_done=false
        fi

        printf "%-45s | %-10s | %8s | %s\n" "$model" "$status" "$iterations" "${result_preview:0:50}"

        # Save full result if done
        if [ "$status" = "completed" ] || [ "$status" = "failed" ]; then
            echo "$status_response" | jq . > "$RESULTS_DIR/${safe_name}_result.json"
        fi
    done

    if $all_done; then
        echo ""
        echo "All tasks completed!"
        break
    fi

    sleep 10
done
|
||||
|
||||
# Generate summary
# Writes $RESULTS_DIR/REPORT.md (an overview table followed by each model's
# full result) from the saved *_result.json files, then prints its first 30
# lines.
echo ""
echo "==========================================="
echo "Final Summary"
echo "==========================================="

{
    echo "# Model Comparison Results"
    echo ""
    echo "Generated: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
    echo ""
    echo "| Model | Status | Iterations | Result Length | Cost (cents) |"
    echo "|-------|--------|------------|---------------|--------------|"

    for model in "${MODELS[@]}"; do
        safe_name=$(echo "$model" | tr '/' '_' | tr ':' '_')
        result_file="$RESULTS_DIR/${safe_name}_result.json"

        if [ -f "$result_file" ]; then
            # Defaults keep a missing field from showing up as the literal "null".
            status=$(jq -r '.status // "unknown"' "$result_file")
            iterations=$(jq -r '.iterations // 0' "$result_file")
            result=$(jq -r '.result // ""' "$result_file")
            result_len=${#result}
            # Note: cost would need to be tracked by the agent
            echo "| $model | $status | $iterations | $result_len | - |"
        else
            echo "| $model | no_result | - | - | - |"
        fi
    done

    echo ""
    echo "## Detailed Results"
    echo ""

    for model in "${MODELS[@]}"; do
        safe_name=$(echo "$model" | tr '/' '_' | tr ':' '_')
        result_file="$RESULTS_DIR/${safe_name}_result.json"

        if [ -f "$result_file" ]; then
            echo "### $model"
            echo ""
            jq -r '.result // "No result"' "$result_file"
            echo ""
            echo "---"
            echo ""
        fi
    done
} > "$RESULTS_DIR/REPORT.md"

echo "Report saved to: $RESULTS_DIR/REPORT.md"
echo ""
# `head -n 30 FILE` replaces the obsolescent `cat FILE | head -30` form.
head -n 30 "$RESULTS_DIR/REPORT.md"
|
||||
Reference in New Issue
Block a user