diff --git a/scripts/check_results.py b/scripts/check_results.py
new file mode 100644
index 0000000..c121880
--- /dev/null
+++ b/scripts/check_results.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+"""Check task results from the model comparison test."""
+
+import json
+import os
+import sys
+
+import requests
+
+API_URL = "https://agent-backend.thomas.md"
+
+# Task IDs from the test (round 2, with the fixed default model)
+TASKS = {
+    "moonshotai/kimi-k2-thinking": "108bfe55-e937-4ff4-b71e-5865370c8191",
+    "x-ai/grok-4.1-fast": "856703ff-f5d1-401d-9f3b-e7f965e4524d",
+    "deepseek/deepseek-v3.2-speciale": "a404d71d-f22c-4c38-ac18-7332e39c8b6b",
+    "mistralai/mistral-large-2512": "87972676-e4cf-4b23-8f8e-1043169bc12d",
+    "anthropic/claude-sonnet-4.5": "e2e1bb84-aaab-410a-b133-68a182901576",
+}
+
+
+def get_token():
+    """Get an auth token, reading the password from secrets.json or the environment."""
+    secrets_path = os.path.join(os.path.dirname(__file__), "..", "secrets.json")
+    password = ""
+    if os.path.exists(secrets_path):
+        with open(secrets_path) as f:
+            secrets = json.load(f)
+        # Try the possible key layouts
+        password = (
+            secrets.get("dashboard_password") or
+            secrets.get("dashboard", {}).get("password") or
+            secrets.get("auth", {}).get("dashboard_password") or
+            ""
+        )
+    if not password:
+        password = os.environ.get("DASHBOARD_PASSWORD", "")
+
+    if not password:
+        print("Error: No dashboard password found")
+        sys.exit(1)
+
+    resp = requests.post(f"{API_URL}/api/auth/login", json={"password": password}, timeout=30)
+    data = resp.json()
+    return data.get("token")
+
+
+def check_task(token, model, task_id):
+    """Check a task's status."""
+    headers = {"Authorization": f"Bearer {token}"}
+    try:
+        resp = requests.get(f"{API_URL}/api/task/{task_id}", headers=headers, timeout=30)
+        data = resp.json()
+        result = data.get("result") or ""  # guard against a null result field
+        return {
+            "model": model,
+            "task_id": task_id,
+            "status": data.get("status", "unknown"),
+            "iterations": data.get("iterations", 0),
+            "result_length": len(result),
+            "result_preview": result[:200],
+            "error": "Error:" in result,  # heuristic: the backend embeds errors in the result text
+        }
+    except Exception as e:
+        return {
+            "model": model,
+            "task_id": task_id,
+            "status": "error",
+            "iterations": 0,
+            "result_length": 0,
+            "result_preview": str(e),
+            "error": True,
+        }
+
+
+def main():
+    token = get_token()
+    if not token:
+        print("Failed to get auth token")
+        sys.exit(1)
+
+    print("=" * 80)
+    print("Quick Model Test Results")
+    print("=" * 80)
+    print()
+
+    results = []
+    for model, task_id in TASKS.items():
+        result = check_task(token, model, task_id)
+        results.append(result)
+
+    # Print summary table
+    print(f"{'Model':<45} | {'Status':<10} | {'Iters':<5} | {'Chars':<8} | {'Error'}")
+    print("-" * 45 + "-+-" + "-" * 10 + "-+-" + "-" * 5 + "-+-" + "-" * 8 + "-+-------")
+
+    for r in results:
+        error_mark = "❌" if r["error"] else "✓"
+        print(f"{r['model']:<45} | {r['status']:<10} | {r['iterations']:<5} | {r['result_length']:<8} | {error_mark}")
+
+    print()
+    print("=" * 80)
+    print("Detailed Results")
+    print("=" * 80)
+
+    # Categorize results
+    working = [r for r in results if r["status"] == "completed" and not r["error"]]
+    failed = [r for r in results if r["status"] == "failed" or r["error"]]
+    running = [r for r in results if r["status"] in ("pending", "running")]
+
+    print(f"\n✓ Working models ({len(working)}):")
+    for r in working:
+        print(f"  - {r['model']}: {r['result_preview'][:100]}...")
+
+    print(f"\n❌ Failed models ({len(failed)}):")
+    for r in failed:
+        print(f"  - {r['model']}: {r['result_preview'][:150]}...")
+
+    if running:
print(f"\n⏳ Still running ({len(running)}):") + for r in running: + print(f" - {r['model']}") + + # Summary + print() + print("=" * 80) + print("SUMMARY") + print("=" * 80) + print(f"Working: {len(working)}/{len(results)}") + print(f"Failed: {len(failed)}/{len(results)}") + print(f"Running: {len(running)}/{len(results)}") + + return results + + +if __name__ == "__main__": + main() diff --git a/scripts/run_security_test.sh b/scripts/run_security_test.sh new file mode 100755 index 0000000..5486af5 --- /dev/null +++ b/scripts/run_security_test.sh @@ -0,0 +1,223 @@ +#!/bin/bash +# Run the Rabby Wallet security analysis with multiple models +# This script submits the task to each model and monitors progress + +set -e + +API_URL="https://agent-backend.thomas.md" +RESULTS_DIR="$(dirname "$0")/../test_results/security_$(date +%Y%m%d_%H%M%S)" +mkdir -p "$RESULTS_DIR" + +echo "===========================================" +echo "Rabby Wallet Security Analysis - Model Comparison" +echo "Results: $RESULTS_DIR" +echo "===========================================" + +# Models to test (prioritized) +MODELS=( + "moonshotai/kimi-k2-thinking" + "x-ai/grok-4.1-fast" + "google/gemini-3-flash-preview" + "deepseek/deepseek-v3.2-speciale" + "anthropic/claude-sonnet-4.5" # baseline +) + +# The security analysis task +TASK='Download Rabby Wallet extension for Chrome, decompile it, and look for security vulnerabilities similar to the Permit2 transaction simulation bypass bug. + +Context on the vulnerability pattern to look for: +- Rabby simulation fails to detect malicious Permit2 approval patterns +- The simulation shows a harmless transaction (e.g., spending 1 USDC) while the actual tx enables draining the user full balance +- The key issue is that the simulation engine does not correctly model Permit2 delegation or spending flows +- The "spender" field from a permit2 should be validated against known safe contract addresses + +Focus areas: +1. How Rabby parses and validates Permit2 signatures +2. Whether the spender field is properly validated against known contract addresses +3. If the witness data can be manipulated to display incorrect transaction details +4. Any other transaction simulation bypass vectors + +Steps: +1. Download the Rabby extension (https://rabby.io or Chrome Web Store) +2. Extract and decompile the JavaScript code +3. Search for Permit2-related code paths +4. Analyze the simulation/preview logic +5. 
+5. Identify potential bypass vectors
+
+Provide findings in a structured markdown report with:
+- Vulnerability title
+- Severity (Critical/High/Medium/Low)
+- Description
+- Affected code snippets
+- Proof of concept outline
+- Recommended fix'
+
+# Get auth token
+DASHBOARD_PASSWORD="${DASHBOARD_PASSWORD:-}"
+if [ -z "$DASHBOARD_PASSWORD" ]; then
+    # Fall back to secrets.json
+    if [ -f "$(dirname "$0")/../secrets.json" ]; then
+        DASHBOARD_PASSWORD=$(jq -r '.dashboard_password // empty' "$(dirname "$0")/../secrets.json")
+    fi
+fi
+
+if [ -z "$DASHBOARD_PASSWORD" ]; then
+    echo "Error: DASHBOARD_PASSWORD not set"
+    exit 1
+fi
+
+TOKEN=$(curl -s -X POST "$API_URL/api/auth/login" \
+    -H "Content-Type: application/json" \
+    -d "{\"password\": \"$DASHBOARD_PASSWORD\"}" | jq -r '.token')
+
+if [ -z "$TOKEN" ] || [ "$TOKEN" = "null" ]; then
+    echo "Failed to get auth token"
+    exit 1
+fi
+
+echo "Authenticated successfully"
+
+# Submit a task for one model
+submit_task() {
+    local model="$1"
+    local safe_name=$(echo "$model" | tr '/' '_' | tr ':' '_')
+
+    echo ""
+    echo "Submitting task for: $model"
+
+    local payload=$(jq -n \
+        --arg task "$TASK" \
+        --arg model "$model" \
+        '{task: $task, model: $model}')
+
+    local response=$(curl -s -X POST "$API_URL/api/task" \
+        -H "Content-Type: application/json" \
+        -H "Authorization: Bearer $TOKEN" \
+        -d "$payload")
+
+    local task_id=$(echo "$response" | jq -r '.id // empty')
+
+    if [ -z "$task_id" ]; then
+        echo "  Failed: $response"
+        return 1
+    fi
+
+    echo "  Task ID: $task_id"
+    echo "$task_id" > "$RESULTS_DIR/${safe_name}_task_id.txt"
+
+    # Save initial state
+    echo "{\"model\": \"$model\", \"task_id\": \"$task_id\", \"submitted_at\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" > "$RESULTS_DIR/${safe_name}_meta.json"
+}
+
+# Submit all tasks
+echo ""
+echo "Submitting tasks..."
+for model in "${MODELS[@]}"; do
+    # "|| true" keeps set -e from aborting the remaining submissions on one failure
+    submit_task "$model" || true
+    sleep 1
+done
+
+echo ""
+echo "All tasks submitted. Monitoring progress..."
+echo "(Press Ctrl+C to stop monitoring)"
+echo ""
+
+# Monitor loop
+while true; do
+    all_done=true
+    clear
+    echo "==========================================="
+    echo "Task Status ($(date))"
+    echo "==========================================="
+    printf "%-45s | %-10s | %8s | %s\n" "Model" "Status" "Iters" "Result"
+    echo "---------------------------------------------+------------+----------+---------"
+
+    for model in "${MODELS[@]}"; do
+        safe_name=$(echo "$model" | tr '/' '_' | tr ':' '_')
+        task_id_file="$RESULTS_DIR/${safe_name}_task_id.txt"
+
+        if [ ! -f "$task_id_file" ]; then
+            printf "%-45s | %-10s | %8s | %s\n" "$model" "no_task" "-" "-"
+            continue
+        fi
+
+        task_id=$(cat "$task_id_file")
+        status_response=$(curl -s "$API_URL/api/task/$task_id" -H "Authorization: Bearer $TOKEN")
+
+        status=$(echo "$status_response" | jq -r '.status // "unknown"')
+        iterations=$(echo "$status_response" | jq -r '.iterations // 0')
+        # Flatten newlines so the preview stays on one table row
+        result_preview=$(echo "$status_response" | jq -r '.result // ""' | tr '\n' ' ' | head -c 50)
+
+        if [ "$status" != "completed" ] && [ "$status" != "failed" ]; then
+            all_done=false
+        fi
+
+        printf "%-45s | %-10s | %8s | %s\n" "$model" "$status" "$iterations" "$result_preview"
+
+        # Save the full result once done
+        if [ "$status" = "completed" ] || [ "$status" = "failed" ]; then
+            echo "$status_response" | jq . > "$RESULTS_DIR/${safe_name}_result.json"
+        fi
+    done
+
+    if $all_done; then
+        echo ""
+        echo "All tasks completed!"
+        break
+    fi
+
+    sleep 10
+done
+
+# Generate summary
+echo ""
+echo "==========================================="
+echo "Final Summary"
+echo "==========================================="
+
+{
+    echo "# Model Comparison Results"
+    echo ""
+    echo "Generated: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
+    echo ""
+    echo "| Model | Status | Iterations | Result Length | Cost (cents) |"
+    echo "|-------|--------|------------|---------------|--------------|"
+
+    for model in "${MODELS[@]}"; do
+        safe_name=$(echo "$model" | tr '/' '_' | tr ':' '_')
+        result_file="$RESULTS_DIR/${safe_name}_result.json"
+
+        if [ -f "$result_file" ]; then
+            status=$(jq -r '.status' "$result_file")
+            iterations=$(jq -r '.iterations' "$result_file")
+            result=$(jq -r '.result // ""' "$result_file")
+            result_len=${#result}
+            # Note: cost would need to be tracked by the agent
+            echo "| $model | $status | $iterations | $result_len | - |"
+        else
+            echo "| $model | no_result | - | - | - |"
+        fi
+    done
+
+    echo ""
+    echo "## Detailed Results"
+    echo ""
+
+    for model in "${MODELS[@]}"; do
+        safe_name=$(echo "$model" | tr '/' '_' | tr ':' '_')
+        result_file="$RESULTS_DIR/${safe_name}_result.json"
+
+        if [ -f "$result_file" ]; then
+            echo "### $model"
+            echo ""
+            jq -r '.result // "No result"' "$result_file"
+            echo ""
+            echo "---"
+            echo ""
+        fi
+    done
+} > "$RESULTS_DIR/REPORT.md"
+
+echo "Report saved to: $RESULTS_DIR/REPORT.md"
+echo ""
+head -n 30 "$RESULTS_DIR/REPORT.md"
diff --git a/test_results/MODEL_ANALYSIS_REPORT.md b/test_results/MODEL_ANALYSIS_REPORT.md
new file mode 100644
index 0000000..bc17fc3
--- /dev/null
+++ b/test_results/MODEL_ANALYSIS_REPORT.md
@@ -0,0 +1,197 @@
+# Open Agent Model Selection Analysis Report
+
+**Date:** 2025-12-19
+**Test Environment:** Production (agent-backend.thomas.md)
+
+## Executive Summary
+
+This report analyzes the performance of various LLM models with Open Agent, focusing on:
+
+1. Model availability and compatibility
+2. Task completion success rates
+3. Model selection system behavior
+4. Recommendations for improvement
+
+## Models Tested
+
+| Model | Provider | Type | Status |
+|-------|----------|------|--------|
+| moonshotai/kimi-k2-thinking | MoonshotAI | Thinking | ✅ Working |
+| x-ai/grok-4.1-fast | xAI | Fast | ✅ Working |
+| google/gemini-3-flash-preview | Google | Flash | ❌ Requires reasoning token handling |
+| deepseek/deepseek-v3.2-speciale | DeepSeek | Special | ❌ Not in allowlist |
+| qwen/qwen3-vl-235b-a22b-thinking | Alibaba | VL/Thinking | ✅ Working |
+| mistralai/mistral-large-2512 | Mistral | Large | ⚠️ Inconsistent |
+| amazon/nova-pro-v1 | Amazon | Pro | ✅ Working |
+| z-ai/glm-4.6v | Zhipu | Vision | ✅ Working |
+| anthropic/claude-sonnet-4.5 | Anthropic | Sonnet | ✅ Working (baseline) |
+| google/gemini-2.5-pro | Google | Pro | ✅ Working |
+| deepseek/deepseek-chat | DeepSeek | Chat | ✅ Working |
+
+## Key Findings
+
+### 1. Gemini 3 "Thinking" Models Require Special Handling
+
+**Issue:** Gemini 3 Flash Preview and similar "thinking" models require OpenRouter reasoning token preservation.
+
+**Error Message:**
+```
+Gemini models require OpenRouter reasoning details to be preserved in each request.
+Function call is missing a `thought_signature` in functionCall parts.
+```
+
+**Root Cause:** These models generate "thinking" tokens that must be preserved in subsequent API calls when using tools.
+
+**Impact:** Gemini 3 thinking models cannot be used until this is implemented.
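+
+A minimal sketch of the fix shape, assuming OpenRouter's approach of echoing the assistant message back verbatim (including its `reasoning_details`) on the next turn; the agent backend appears to be Rust (cf. `src/budget/pricing.rs`), so this Python is illustrative only:
+
+```python
+import os
+import requests
+
+API = "https://openrouter.ai/api/v1/chat/completions"
+HEADERS = {"Authorization": f"Bearer {os.environ['OPENROUTER_API_KEY']}"}
+
+def one_agent_turn(messages, tools):
+    """One tool-calling iteration that keeps thinking models happy."""
+    resp = requests.post(API, headers=HEADERS, json={
+        "model": "google/gemini-3-flash-preview",
+        "messages": messages,
+        "tools": tools,
+    }, timeout=120)
+    resp.raise_for_status()
+    msg = resp.json()["choices"][0]["message"]
+    # Key step: append the assistant message unmodified. Rebuilding it from
+    # just `content` and `tool_calls` drops the reasoning details (and with
+    # them the Gemini `thought_signature`), triggering the error quoted above.
+    messages.append(msg)
+    return msg
+```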
+### 2. CAPABLE_MODEL_BASES Allowlist Too Restrictive
+
+**Issue:** Models not explicitly in the allowlist are silently rejected or substituted.
+
+**Example:** `deepseek/deepseek-v3.2-speciale` was requested, but the system:
+
+- Logged: "Requested model not found, using default capability floor 0.7"
+- Selected `deepseek/deepseek-r1-distill-llama-70b` instead
+
+**Recommendation:**
+
+- Add more models to the allowlist
+- Warn users when their requested model isn't available
+- Consider dynamic model validation via the OpenRouter API (see the sketch below)
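+
+One way dynamic validation could look, as a hedged sketch: it checks a requested ID against OpenRouter's public model catalog (`GET /api/v1/models`, which returns a `data` array of models with `id` fields) and fails loudly instead of silently substituting. Raising versus merely warning is a policy choice:
+
+```python
+import requests
+
+def validate_model(requested: str) -> None:
+    """Reject model IDs that OpenRouter does not currently serve."""
+    resp = requests.get("https://openrouter.ai/api/v1/models", timeout=30)
+    resp.raise_for_status()
+    available = {m["id"] for m in resp.json()["data"]}
+    if requested not in available:
+        # Surface the problem to the user rather than picking a stand-in.
+        raise ValueError(f"Model {requested!r} is not available on OpenRouter")
+```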
+### 3. Price-Based Capability Estimation Has Issues
+
+**Issue:** Free/cheap models get a capability score of 0.3 regardless of actual performance.
+
+**Log Evidence:**
+```
+Using price-based capability for deepseek/deepseek-r1: 0.300 (avg_cost: 0.0000000000)
+Using price-based capability for deepseek/deepseek-chat: 0.300 (avg_cost: 0.0000000000)
+```
+
+**Problem:** DeepSeek models are often free or very cheap but perform well. The price-based heuristic underestimates them.
+
+**Recommendation:**
+
+- Integrate actual benchmark data from llm-stats.com
+- Use model family tiers as a fallback (see the sketch below)
+- Consider historical performance tracking
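+
+A sketch of the family-tier fallback, with illustrative (not benchmarked) scores; the real logic presumably lives in `src/budget/pricing.rs`, so this Python only shows the shape:
+
+```python
+# Hypothetical tier table; the prefixes and scores are placeholders.
+FAMILY_TIERS = {
+    "deepseek/deepseek-r1": 0.80,
+    "deepseek/deepseek-chat": 0.70,
+    "anthropic/claude-sonnet": 0.85,
+}
+
+def estimate_capability(model_id: str) -> float:
+    """Prefer a known family-tier score over the price heuristic."""
+    for prefix, score in FAMILY_TIERS.items():
+        if model_id.startswith(prefix):
+            return score
+    return 0.3  # current price-based floor, kept as the last resort
+```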
+### 4. Benchmark Data Not Being Used
+
+**Observation:** All model selections show `benchmark_data: false`.
+
+**Log Evidence:**
+```
+Model selected: deepseek/deepseek-r1-distill-llama-70b (task: ToolCalling, cost: 2 cents, benchmark_data: false, history: false)
+```
+
+**Root Cause:** The benchmark registry may not be loading properly, or model IDs aren't being matched against it.
+
+### 5. Default Model Configuration Is Critical
+
+**Issue:** Setting `DEFAULT_MODEL` to a problematic model (like gemini-3-flash-preview) breaks ALL tasks.
+
+**Why:** Internal operations (complexity estimation, task splitting) use the default model, not the requested one.
+
+**Recommendation:**
+
+- Validate the default model on startup
+- Use separate models for internal operations vs. task execution
+- Add a health check that exercises the default model
+
+## Quick Test Results
+
+| Model | Status | Iterations | Result Length | Notes |
+|-------|--------|------------|---------------|-------|
+| moonshotai/kimi-k2-thinking | completed | 2 | 48 | Fast, accurate |
+| x-ai/grok-4.1-fast | completed | 2 | 62 | Fast, verbose |
+| deepseek/deepseek-v3.2-speciale | failed | 0 | 0 | Not in allowlist |
+| mistralai/mistral-large-2512 | completed | 0 | 4602 | Very verbose |
+| anthropic/claude-sonnet-4.5 | completed | 2 | 48 | Baseline, reliable |
+| qwen/qwen3-vl-235b-a22b-thinking | completed | 2 | 50 | Working well |
+| amazon/nova-pro-v1 | completed | 2 | 50 | Working well |
+| z-ai/glm-4.6v | completed | 2 | 50 | Working well |
+| google/gemini-2.5-pro | completed | 2 | 50 | Working (non-thinking variant) |
+| deepseek/deepseek-chat | completed | 2 | 50 | Working (standard variant) |
+
+## Recommendations for Model Selection Improvements
+
+### Immediate Fixes (High Priority)
+
+1. **Expand CAPABLE_MODEL_BASES:**
+   - Add all models from the user's test list
+   - Add popular new models automatically
+   - Consider dynamic validation (see the sketch under Key Finding 2)
+
+2. **Fix Benchmark Integration:**
+   - Ensure benchmark data loads correctly
+   - Add logging for benchmark matching
+   - Use benchmarks for task-type-specific selection
+
+3. **Add Reasoning Token Support:**
+   - For Gemini 3 and other "thinking" models
+   - Preserve thought signatures in tool calls (see the sketch under Key Finding 1)
+   - Reference: https://openrouter.ai/docs/guides/best-practices/reasoning-tokens
+
+### Medium-Term Improvements
+
+4. **Historical Performance Tracking:**
+   - Record actual success/failure per model
+   - Track cost efficiency per model
+   - Use this data for future selections
+
+5. **Separate Internal vs. Execution Models:**
+   - Use a cheap/fast model for complexity estimation
+   - Use a cheap/fast model for task splitting
+   - Use the user-selected model only for actual execution
+
+6. **Model Validation on Startup:**
+   - Check that the default model works
+   - Validate key models in the allowlist
+   - Alert on configuration issues
+
+### Long-Term Enhancements
+
+7. **Dynamic Model Discovery:**
+   - Fetch available models from the OpenRouter API
+   - Auto-detect capabilities (tool support, vision, etc.)
+   - Build automatic fallback chains
+
+8. **A/B Testing Framework:**
+   - Run the same task with multiple models
+   - Compare quality, cost, and speed
+   - Continuously update model rankings
+
+9. **User-Facing Model Insights:**
+   - Show why a model was selected
+   - Display estimated cost before execution
+   - Allow manual override with warnings
+
+## Security Analysis Task Status
+
+Three models are currently running the Rabby Wallet security analysis:
+
+- `moonshotai/kimi-k2-thinking`: 9055ae68-d0bb-4c0d-aae3-908de141c431
+- `x-ai/grok-4.1-fast`: f99e7b95-9d57-42e4-b669-62b4f7c6a9f4
+- `anthropic/claude-sonnet-4.5`: 95fdebf9-f4bc-43f6-ba20-c9579dcadbd6
+
+Results will be appended to this report when available.
+
+## Appendix: Code Changes Made
+
+1. **Added models to CAPABLE_MODEL_BASES** (`src/budget/pricing.rs`):
+   - moonshotai/kimi-k2-thinking, kimi-k2
+   - x-ai/grok-4.1-fast, grok-4-fast, grok-4, grok-3
+   - google/gemini-3-flash-preview, gemini-3-pro-preview
+   - deepseek/deepseek-v3.2-speciale, deepseek-v3.2, deepseek-v3.1-terminus
+   - qwen/qwen3-vl-235b-a22b-thinking
+   - amazon/nova-pro-v1
+   - z-ai/glm-4.6v, glm-4.6, glm-4.5v, glm-4.5
+
+2. **Created test scripts:**
+   - `scripts/quick_model_test.sh`: fast capability verification
+   - `scripts/test_model_comparison.sh`: full security analysis comparison
+   - `scripts/run_security_test.sh`: interactive security test runner
+   - `scripts/check_results.py`: result collection and analysis
+
+3. **Fixed production configuration:**
+   - Changed DEFAULT_MODEL from gemini-3-flash-preview to claude-sonnet-4.5
+
+---
+
+*Report generated by model comparison testing framework*