Add model comparison test framework and analysis report

- scripts/check_results.py: Python script to check task results
- scripts/run_security_test.sh: Interactive security test runner
- test_results/MODEL_ANALYSIS_REPORT.md: Comprehensive analysis of model selection

Key findings:
- 8/10 requested models work with the agent
- Gemini 3 thinking models require special reasoning-token handling (see the sketch after this list)
- Price-based capability estimation underestimates cheap models
- Benchmark data integration is not working properly
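
To illustrate the reasoning-token finding: a minimal sketch, assuming an OpenRouter-style chat-completions response in which thinking models return a separate "reasoning" field alongside "content". The field names and the extract_answer helper are illustrative and are not taken from the agent's code.

def extract_answer(message: dict) -> str:
    """Return the user-visible answer without the hidden reasoning tokens."""
    # Assumption: some thinking models put most of the turn in "reasoning"
    # and leave "content" empty; fall back so the agent never sees an
    # empty reply.
    content = (message.get("content") or "").strip()
    reasoning = (message.get("reasoning") or "").strip()
    return content if content else reasoning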
Thomas Marchand
2025-12-19 07:54:40 +00:00
parent 14241f92c1
commit 6177ac3a5f
3 changed files with 555 additions and 0 deletions

scripts/check_results.py Normal file

@@ -0,0 +1,135 @@
#!/usr/bin/env python3
"""Check task results from the model comparison test."""
import json
import os
import sys

import requests
API_URL = "https://agent-backend.thomas.md"
# Task IDs from the test (round 2 - with fixed default model)
TASKS = {
    "moonshotai/kimi-k2-thinking": "108bfe55-e937-4ff4-b71e-5865370c8191",
    "x-ai/grok-4.1-fast": "856703ff-f5d1-401d-9f3b-e7f965e4524d",
    "deepseek/deepseek-v3.2-speciale": "a404d71d-f22c-4c38-ac18-7332e39c8b6b",
    "mistralai/mistral-large-2512": "87972676-e4cf-4b23-8f8e-1043169bc12d",
    "anthropic/claude-sonnet-4.5": "e2e1bb84-aaab-410a-b133-68a182901576",
}

def get_token():
    """Get auth token."""
    # Try to get password from secrets.json
    secrets_path = os.path.join(os.path.dirname(__file__), "..", "secrets.json")
    password = ""
    if os.path.exists(secrets_path):
        with open(secrets_path) as f:
            secrets = json.load(f)
        # Try different possible keys
        password = (
            secrets.get("dashboard_password") or
            secrets.get("dashboard", {}).get("password") or
            secrets.get("auth", {}).get("dashboard_password") or
            ""
        )
    if not password:
        password = os.environ.get("DASHBOARD_PASSWORD", "")
    if not password:
        print("Error: No dashboard password found")
        sys.exit(1)
    resp = requests.post(f"{API_URL}/api/auth/login", json={"password": password})
    data = resp.json()
    return data.get("token")

def check_task(token, model, task_id):
    """Check a task's status."""
    headers = {"Authorization": f"Bearer {token}"}
    try:
        resp = requests.get(f"{API_URL}/api/task/{task_id}", headers=headers)
        data = resp.json()
        return {
            "model": model,
            "task_id": task_id,
            "status": data.get("status", "unknown"),
            "iterations": data.get("iterations", 0),
            "result_length": len(data.get("result", "")),
            "result_preview": data.get("result", "")[:200],
            "error": "Error:" in data.get("result", ""),
        }
    except Exception as e:
        return {
            "model": model,
            "task_id": task_id,
            "status": "error",
            "iterations": 0,
            "result_length": 0,
            "result_preview": str(e),
            "error": True,
        }

def main():
    token = get_token()
    if not token:
        print("Failed to get auth token")
        sys.exit(1)

    print("=" * 80)
    print("Quick Model Test Results")
    print("=" * 80)
    print()

    results = []
    for model, task_id in TASKS.items():
        result = check_task(token, model, task_id)
        results.append(result)

    # Print summary table
    print(f"{'Model':<45} | {'Status':<10} | {'Iters':<5} | {'Chars':<8} | {'Error'}")
    print("-" * 45 + "-+-" + "-" * 10 + "-+-" + "-" * 5 + "-+-" + "-" * 8 + "-+-------")
    for r in results:
        error_mark = "❌" if r["error"] else "✓"
        print(f"{r['model']:<45} | {r['status']:<10} | {r['iterations']:<5} | {r['result_length']:<8} | {error_mark}")
    print()

    print("=" * 80)
    print("Detailed Results")
    print("=" * 80)

    # Categorize results
    working = [r for r in results if r["status"] == "completed" and not r["error"]]
    failed = [r for r in results if r["status"] == "failed" or r["error"]]
    running = [r for r in results if r["status"] in ("pending", "running")]

    print(f"\n✓ Working models ({len(working)}):")
    for r in working:
        print(f" - {r['model']}: {r['result_preview'][:100]}...")

    print(f"\n❌ Failed models ({len(failed)}):")
    for r in failed:
        print(f" - {r['model']}: {r['result_preview'][:150]}...")

    if running:
        print(f"\n⏳ Still running ({len(running)}):")
        for r in running:
            print(f" - {r['model']}")

    # Summary
    print()
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Working: {len(working)}/{len(results)}")
    print(f"Failed: {len(failed)}/{len(results)}")
    print(f"Running: {len(running)}/{len(results)}")
    return results


if __name__ == "__main__":
    main()

scripts/run_security_test.sh Executable file

@@ -0,0 +1,223 @@
#!/bin/bash
# Run the Rabby Wallet security analysis with multiple models
# This script submits the task to each model and monitors progress
set -e
API_URL="https://agent-backend.thomas.md"
RESULTS_DIR="$(dirname "$0")/../test_results/security_$(date +%Y%m%d_%H%M%S)"
mkdir -p "$RESULTS_DIR"
echo "==========================================="
echo "Rabby Wallet Security Analysis - Model Comparison"
echo "Results: $RESULTS_DIR"
echo "==========================================="
# Models to test (prioritized)
MODELS=(
    "moonshotai/kimi-k2-thinking"
    "x-ai/grok-4.1-fast"
    "google/gemini-3-flash-preview"
    "deepseek/deepseek-v3.2-speciale"
    "anthropic/claude-sonnet-4.5"  # baseline
)
# The security analysis task
TASK='Download Rabby Wallet extension for Chrome, decompile it, and look for security vulnerabilities similar to the Permit2 transaction simulation bypass bug.
Context on the vulnerability pattern to look for:
- Rabby simulation fails to detect malicious Permit2 approval patterns
- The simulation shows a harmless transaction (e.g., spending 1 USDC) while the actual tx enables draining the user's full balance
- The key issue is that the simulation engine does not correctly model Permit2 delegation or spending flows
- The "spender" field from a Permit2 permit should be validated against known safe contract addresses
Focus areas:
1. How Rabby parses and validates Permit2 signatures
2. Whether the spender field is properly validated against known contract addresses
3. If the witness data can be manipulated to display incorrect transaction details
4. Any other transaction simulation bypass vectors
Steps:
1. Download the Rabby extension (https://rabby.io or Chrome Web Store)
2. Extract and decompile the JavaScript code
3. Search for Permit2-related code paths
4. Analyze the simulation/preview logic
5. Identify potential bypass vectors
Provide findings in a structured markdown report with:
- Vulnerability title
- Severity (Critical/High/Medium/Low)
- Description
- Affected code snippets
- Proof of concept outline
- Recommended fix'
# Get auth token
DASHBOARD_PASSWORD="${DASHBOARD_PASSWORD:-}"
if [ -z "$DASHBOARD_PASSWORD" ]; then
    # Try to get from secrets.json
    if [ -f "$(dirname "$0")/../secrets.json" ]; then
        DASHBOARD_PASSWORD=$(jq -r '.dashboard_password // empty' "$(dirname "$0")/../secrets.json")
    fi
fi
if [ -z "$DASHBOARD_PASSWORD" ]; then
    echo "Error: DASHBOARD_PASSWORD not set"
    exit 1
fi

TOKEN=$(curl -s -X POST "$API_URL/api/auth/login" \
    -H "Content-Type: application/json" \
    -d "{\"password\": \"$DASHBOARD_PASSWORD\"}" | jq -r '.token')
if [ -z "$TOKEN" ] || [ "$TOKEN" = "null" ]; then
    echo "Failed to get auth token"
    exit 1
fi
echo "Authenticated successfully"
# Function to submit a task
submit_task() {
    local model="$1"
    local safe_name=$(echo "$model" | tr '/' '_' | tr ':' '_')
    echo ""
    echo "Submitting task for: $model"

    local payload=$(jq -n \
        --arg task "$TASK" \
        --arg model "$model" \
        '{task: $task, model: $model}')
    local response=$(curl -s -X POST "$API_URL/api/task" \
        -H "Content-Type: application/json" \
        -H "Authorization: Bearer $TOKEN" \
        -d "$payload")
    local task_id=$(echo "$response" | jq -r '.id // empty')

    if [ -z "$task_id" ]; then
        echo " Failed: $response"
        return 1
    fi
    echo " Task ID: $task_id"
    echo "$task_id" > "$RESULTS_DIR/${safe_name}_task_id.txt"

    # Save initial state
    echo "{\"model\": \"$model\", \"task_id\": \"$task_id\", \"submitted_at\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" > "$RESULTS_DIR/${safe_name}_meta.json"
}
# Submit all tasks
echo ""
echo "Submitting tasks..."
for model in "${MODELS[@]}"; do
    submit_task "$model"
    sleep 1
done
echo ""
echo "All tasks submitted. Monitoring progress..."
echo "(Press Ctrl+C to stop monitoring)"
echo ""
# Monitor loop
while true; do
    all_done=true
    clear
    echo "==========================================="
    echo "Task Status ($(date))"
    echo "==========================================="
    printf "%-45s | %-10s | %8s | %s\n" "Model" "Status" "Iters" "Result"
    echo "---------------------------------------------+------------+----------+---------"

    for model in "${MODELS[@]}"; do
        safe_name=$(echo "$model" | tr '/' '_' | tr ':' '_')
        task_id_file="$RESULTS_DIR/${safe_name}_task_id.txt"
        if [ ! -f "$task_id_file" ]; then
            printf "%-45s | %-10s | %8s | %s\n" "$model" "no_task" "-" "-"
            continue
        fi
        task_id=$(cat "$task_id_file")
        status_response=$(curl -s "$API_URL/api/task/$task_id" -H "Authorization: Bearer $TOKEN")
        status=$(echo "$status_response" | jq -r '.status // "unknown"')
        iterations=$(echo "$status_response" | jq -r '.iterations // 0')
        result_preview=$(echo "$status_response" | jq -r '.result // ""' | head -c 50)

        if [ "$status" != "completed" ] && [ "$status" != "failed" ]; then
            all_done=false
        fi
        printf "%-45s | %-10s | %8s | %s\n" "$model" "$status" "$iterations" "${result_preview:0:50}"

        # Save full result if done
        if [ "$status" = "completed" ] || [ "$status" = "failed" ]; then
            echo "$status_response" | jq . > "$RESULTS_DIR/${safe_name}_result.json"
        fi
    done

    if $all_done; then
        echo ""
        echo "All tasks completed!"
        break
    fi
    sleep 10
done
# Generate summary
echo ""
echo "==========================================="
echo "Final Summary"
echo "==========================================="
{
    echo "# Model Comparison Results"
    echo ""
    echo "Generated: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
    echo ""
    echo "| Model | Status | Iterations | Result Length | Cost (cents) |"
    echo "|-------|--------|------------|---------------|--------------|"
    for model in "${MODELS[@]}"; do
        safe_name=$(echo "$model" | tr '/' '_' | tr ':' '_')
        result_file="$RESULTS_DIR/${safe_name}_result.json"
        if [ -f "$result_file" ]; then
            status=$(jq -r '.status' "$result_file")
            iterations=$(jq -r '.iterations' "$result_file")
            result=$(jq -r '.result // ""' "$result_file")
            result_len=${#result}
            # Note: cost would need to be tracked by the agent
            echo "| $model | $status | $iterations | $result_len | - |"
        else
            echo "| $model | no_result | - | - | - |"
        fi
    done
    echo ""
    echo "## Detailed Results"
    echo ""
    for model in "${MODELS[@]}"; do
        safe_name=$(echo "$model" | tr '/' '_' | tr ':' '_')
        result_file="$RESULTS_DIR/${safe_name}_result.json"
        if [ -f "$result_file" ]; then
            echo "### $model"
            echo ""
            jq -r '.result // "No result"' "$result_file"
            echo ""
            echo "---"
            echo ""
        fi
    done
} > "$RESULTS_DIR/REPORT.md"
echo "Report saved to: $RESULTS_DIR/REPORT.md"
echo ""
head -30 "$RESULTS_DIR/REPORT.md"