diff --git a/scripts/check_results.py b/scripts/check_results.py
new file mode 100644
index 0000000..c121880
--- /dev/null
+++ b/scripts/check_results.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+"""Check task results from the model comparison test."""
+
+import json
+import os
+import sys
+
+import requests
+
+API_URL = "https://agent-backend.thomas.md"
+
+# Task IDs from the test (round 2, with the fixed default model)
+TASKS = {
+    "moonshotai/kimi-k2-thinking": "108bfe55-e937-4ff4-b71e-5865370c8191",
+    "x-ai/grok-4.1-fast": "856703ff-f5d1-401d-9f3b-e7f965e4524d",
+    "deepseek/deepseek-v3.2-speciale": "a404d71d-f22c-4c38-ac18-7332e39c8b6b",
+    "mistralai/mistral-large-2512": "87972676-e4cf-4b23-8f8e-1043169bc12d",
+    "anthropic/claude-sonnet-4.5": "e2e1bb84-aaab-410a-b133-68a182901576",
+}
+
+
+def get_token():
+    """Get an auth token, reading the password from secrets.json or the environment."""
+    secrets_path = os.path.join(os.path.dirname(__file__), "..", "secrets.json")
+    password = ""
+    if os.path.exists(secrets_path):
+        with open(secrets_path) as f:
+            secrets = json.load(f)
+        # Try the possible key layouts
+        password = (
+            secrets.get("dashboard_password") or
+            secrets.get("dashboard", {}).get("password") or
+            secrets.get("auth", {}).get("dashboard_password") or
+            ""
+        )
+    if not password:
+        password = os.environ.get("DASHBOARD_PASSWORD", "")
+
+    if not password:
+        print("Error: No dashboard password found")
+        sys.exit(1)
+
+    resp = requests.post(f"{API_URL}/api/auth/login", json={"password": password}, timeout=30)
+    data = resp.json()
+    return data.get("token")
+
+
+def check_task(token, model, task_id):
+    """Check a task's status."""
+    headers = {"Authorization": f"Bearer {token}"}
+    try:
+        resp = requests.get(f"{API_URL}/api/task/{task_id}", headers=headers, timeout=30)
+        data = resp.json()
+        result = data.get("result") or ""  # guard against a null result field
+        return {
+            "model": model,
+            "task_id": task_id,
+            "status": data.get("status", "unknown"),
+            "iterations": data.get("iterations", 0),
+            "result_length": len(result),
+            "result_preview": result[:200],
+            "error": "Error:" in result,  # heuristic: the backend embeds errors in the result text
+        }
+    except Exception as e:
+        return {
+            "model": model,
+            "task_id": task_id,
+            "status": "error",
+            "iterations": 0,
+            "result_length": 0,
+            "result_preview": str(e),
+            "error": True,
+        }
+
+
+def main():
+    token = get_token()
+    if not token:
+        print("Failed to get auth token")
+        sys.exit(1)
+
+    print("=" * 80)
+    print("Quick Model Test Results")
+    print("=" * 80)
+    print()
+
+    results = []
+    for model, task_id in TASKS.items():
+        result = check_task(token, model, task_id)
+        results.append(result)
+
+    # Print summary table
+    print(f"{'Model':<45} | {'Status':<10} | {'Iters':<5} | {'Chars':<8} | {'Error'}")
+    print("-" * 45 + "-+-" + "-" * 10 + "-+-" + "-" * 5 + "-+-" + "-" * 8 + "-+-------")
+
+    for r in results:
+        error_mark = "❌" if r["error"] else "✓"
+        print(f"{r['model']:<45} | {r['status']:<10} | {r['iterations']:<5} | {r['result_length']:<8} | {error_mark}")
+
+    print()
+    print("=" * 80)
+    print("Detailed Results")
+    print("=" * 80)
+
+    # Categorize results
+    working = [r for r in results if r["status"] == "completed" and not r["error"]]
+    failed = [r for r in results if r["status"] == "failed" or r["error"]]
+    running = [r for r in results if r["status"] in ("pending", "running")]
+
+    print(f"\n✓ Working models ({len(working)}):")
+    for r in working:
+        print(f"  - {r['model']}: {r['result_preview'][:100]}...")
+
+    print(f"\n❌ Failed models ({len(failed)}):")
+    for r in failed:
+        print(f"  - {r['model']}: {r['result_preview'][:150]}...")
+
+    if running:
print(f"\n⏳ Still running ({len(running)}):") + for r in running: + print(f" - {r['model']}") + + # Summary + print() + print("=" * 80) + print("SUMMARY") + print("=" * 80) + print(f"Working: {len(working)}/{len(results)}") + print(f"Failed: {len(failed)}/{len(results)}") + print(f"Running: {len(running)}/{len(results)}") + + return results + + +if __name__ == "__main__": + main() diff --git a/scripts/run_security_test.sh b/scripts/run_security_test.sh new file mode 100755 index 0000000..5486af5 --- /dev/null +++ b/scripts/run_security_test.sh @@ -0,0 +1,223 @@ +#!/bin/bash +# Run the Rabby Wallet security analysis with multiple models +# This script submits the task to each model and monitors progress + +set -e + +API_URL="https://agent-backend.thomas.md" +RESULTS_DIR="$(dirname "$0")/../test_results/security_$(date +%Y%m%d_%H%M%S)" +mkdir -p "$RESULTS_DIR" + +echo "===========================================" +echo "Rabby Wallet Security Analysis - Model Comparison" +echo "Results: $RESULTS_DIR" +echo "===========================================" + +# Models to test (prioritized) +MODELS=( + "moonshotai/kimi-k2-thinking" + "x-ai/grok-4.1-fast" + "google/gemini-3-flash-preview" + "deepseek/deepseek-v3.2-speciale" + "anthropic/claude-sonnet-4.5" # baseline +) + +# The security analysis task +TASK='Download Rabby Wallet extension for Chrome, decompile it, and look for security vulnerabilities similar to the Permit2 transaction simulation bypass bug. + +Context on the vulnerability pattern to look for: +- Rabby simulation fails to detect malicious Permit2 approval patterns +- The simulation shows a harmless transaction (e.g., spending 1 USDC) while the actual tx enables draining the user full balance +- The key issue is that the simulation engine does not correctly model Permit2 delegation or spending flows +- The "spender" field from a permit2 should be validated against known safe contract addresses + +Focus areas: +1. How Rabby parses and validates Permit2 signatures +2. Whether the spender field is properly validated against known contract addresses +3. If the witness data can be manipulated to display incorrect transaction details +4. Any other transaction simulation bypass vectors + +Steps: +1. Download the Rabby extension (https://rabby.io or Chrome Web Store) +2. Extract and decompile the JavaScript code +3. Search for Permit2-related code paths +4. Analyze the simulation/preview logic +5. 
+5. Identify potential bypass vectors
+
+Provide findings in a structured markdown report with:
+- Vulnerability title
+- Severity (Critical/High/Medium/Low)
+- Description
+- Affected code snippets
+- Proof of concept outline
+- Recommended fix'
+
+# Get auth token
+DASHBOARD_PASSWORD="${DASHBOARD_PASSWORD:-}"
+if [ -z "$DASHBOARD_PASSWORD" ]; then
+    # Fall back to secrets.json
+    if [ -f "$(dirname "$0")/../secrets.json" ]; then
+        DASHBOARD_PASSWORD=$(jq -r '.dashboard_password // empty' "$(dirname "$0")/../secrets.json")
+    fi
+fi
+
+if [ -z "$DASHBOARD_PASSWORD" ]; then
+    echo "Error: DASHBOARD_PASSWORD not set"
+    exit 1
+fi
+
+TOKEN=$(curl -s -X POST "$API_URL/api/auth/login" \
+    -H "Content-Type: application/json" \
+    -d "{\"password\": \"$DASHBOARD_PASSWORD\"}" | jq -r '.token')
+
+if [ -z "$TOKEN" ] || [ "$TOKEN" = "null" ]; then
+    echo "Failed to get auth token"
+    exit 1
+fi
+
+echo "Authenticated successfully"
+
+# Submit a task for one model
+submit_task() {
+    local model="$1"
+    local safe_name=$(echo "$model" | tr '/' '_' | tr ':' '_')
+
+    echo ""
+    echo "Submitting task for: $model"
+
+    local payload=$(jq -n \
+        --arg task "$TASK" \
+        --arg model "$model" \
+        '{task: $task, model: $model}')
+
+    local response=$(curl -s -X POST "$API_URL/api/task" \
+        -H "Content-Type: application/json" \
+        -H "Authorization: Bearer $TOKEN" \
+        -d "$payload")
+
+    local task_id=$(echo "$response" | jq -r '.id // empty')
+
+    if [ -z "$task_id" ]; then
+        echo "  Failed: $response"
+        return 1
+    fi
+
+    echo "  Task ID: $task_id"
+    echo "$task_id" > "$RESULTS_DIR/${safe_name}_task_id.txt"
+
+    # Save initial state
+    echo "{\"model\": \"$model\", \"task_id\": \"$task_id\", \"submitted_at\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" > "$RESULTS_DIR/${safe_name}_meta.json"
+}
+
+# Submit all tasks
+echo ""
+echo "Submitting tasks..."
+for model in "${MODELS[@]}"; do
+    # "|| true" keeps set -e from aborting the remaining submissions on one failure
+    submit_task "$model" || true
+    sleep 1
+done
+
+echo ""
+echo "All tasks submitted. Monitoring progress..."
+echo "(Press Ctrl+C to stop monitoring)"
+echo ""
+
+# Monitor loop
+while true; do
+    all_done=true
+    clear
+    echo "==========================================="
+    echo "Task Status ($(date))"
+    echo "==========================================="
+    printf "%-45s | %-10s | %8s | %s\n" "Model" "Status" "Iters" "Result"
+    echo "---------------------------------------------+------------+----------+---------"
+
+    for model in "${MODELS[@]}"; do
+        safe_name=$(echo "$model" | tr '/' '_' | tr ':' '_')
+        task_id_file="$RESULTS_DIR/${safe_name}_task_id.txt"
+
+        if [ ! -f "$task_id_file" ]; then
+            printf "%-45s | %-10s | %8s | %s\n" "$model" "no_task" "-" "-"
+            continue
+        fi
+
+        task_id=$(cat "$task_id_file")
+        status_response=$(curl -s "$API_URL/api/task/$task_id" -H "Authorization: Bearer $TOKEN")
+
+        status=$(echo "$status_response" | jq -r '.status // "unknown"')
+        iterations=$(echo "$status_response" | jq -r '.iterations // 0')
+        # Flatten newlines so the preview stays on one table row
+        result_preview=$(echo "$status_response" | jq -r '.result // ""' | tr '\n' ' ' | head -c 50)
+
+        if [ "$status" != "completed" ] && [ "$status" != "failed" ]; then
+            all_done=false
+        fi
+
+        printf "%-45s | %-10s | %8s | %s\n" "$model" "$status" "$iterations" "$result_preview"
+
+        # Save the full result once done
+        if [ "$status" = "completed" ] || [ "$status" = "failed" ]; then
+            echo "$status_response" | jq . > "$RESULTS_DIR/${safe_name}_result.json"
+        fi
+    done
+
+    if $all_done; then
+        echo ""
+        echo "All tasks completed!"
+        break
+    fi
+
+    sleep 10
+done
+
+# Generate summary
+echo ""
+echo "==========================================="
+echo "Final Summary"
+echo "==========================================="
+
+{
+    echo "# Model Comparison Results"
+    echo ""
+    echo "Generated: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
+    echo ""
+    echo "| Model | Status | Iterations | Result Length | Cost (cents) |"
+    echo "|-------|--------|------------|---------------|--------------|"
+
+    for model in "${MODELS[@]}"; do
+        safe_name=$(echo "$model" | tr '/' '_' | tr ':' '_')
+        result_file="$RESULTS_DIR/${safe_name}_result.json"
+
+        if [ -f "$result_file" ]; then
+            status=$(jq -r '.status' "$result_file")
+            iterations=$(jq -r '.iterations' "$result_file")
+            result=$(jq -r '.result // ""' "$result_file")
+            result_len=${#result}
+            # Note: cost would need to be tracked by the agent
+            echo "| $model | $status | $iterations | $result_len | - |"
+        else
+            echo "| $model | no_result | - | - | - |"
+        fi
+    done
+
+    echo ""
+    echo "## Detailed Results"
+    echo ""
+
+    for model in "${MODELS[@]}"; do
+        safe_name=$(echo "$model" | tr '/' '_' | tr ':' '_')
+        result_file="$RESULTS_DIR/${safe_name}_result.json"
+
+        if [ -f "$result_file" ]; then
+            echo "### $model"
+            echo ""
+            jq -r '.result // "No result"' "$result_file"
+            echo ""
+            echo "---"
+            echo ""
+        fi
+    done
+} > "$RESULTS_DIR/REPORT.md"
+
+echo "Report saved to: $RESULTS_DIR/REPORT.md"
+echo ""
+head -n 30 "$RESULTS_DIR/REPORT.md"
diff --git a/test_results/MODEL_ANALYSIS_REPORT.md b/test_results/MODEL_ANALYSIS_REPORT.md
new file mode 100644
index 0000000..bc17fc3
--- /dev/null
+++ b/test_results/MODEL_ANALYSIS_REPORT.md
@@ -0,0 +1,197 @@
+# Open Agent Model Selection Analysis Report
+
+**Date:** 2025-12-19
+**Test Environment:** Production (agent-backend.thomas.md)
+
+## Executive Summary
+
+This report analyzes the performance of various LLM models with Open Agent, focusing on:
+
+1. Model availability and compatibility
+2. Task completion success rates
+3. Model selection system behavior
+4. Recommendations for improvement
+
+## Models Tested
+
+| Model | Provider | Type | Status |
+|-------|----------|------|--------|
+| moonshotai/kimi-k2-thinking | MoonshotAI | Thinking | ✅ Working |
+| x-ai/grok-4.1-fast | xAI | Fast | ✅ Working |
+| google/gemini-3-flash-preview | Google | Flash | ❌ Requires reasoning token handling |
+| deepseek/deepseek-v3.2-speciale | DeepSeek | Special | ❌ Not in allowlist |
+| qwen/qwen3-vl-235b-a22b-thinking | Alibaba | VL/Thinking | ✅ Working |
+| mistralai/mistral-large-2512 | Mistral | Large | ⚠️ Inconsistent |
+| amazon/nova-pro-v1 | Amazon | Pro | ✅ Working |
+| z-ai/glm-4.6v | Zhipu | Vision | ✅ Working |
+| anthropic/claude-sonnet-4.5 | Anthropic | Sonnet | ✅ Working (baseline) |
+| google/gemini-2.5-pro | Google | Pro | ✅ Working |
+| deepseek/deepseek-chat | DeepSeek | Chat | ✅ Working |
+
+## Key Findings
+
+### 1. Gemini 3 "Thinking" Models Require Special Handling
+
+**Issue:** Gemini 3 Flash Preview and similar "thinking" models require OpenRouter reasoning token preservation.
+
+**Error Message:**
+```
+Gemini models require OpenRouter reasoning details to be preserved in each request.
+Function call is missing a `thought_signature` in functionCall parts.
+```
+
+**Root Cause:** These models generate "thinking" tokens that must be preserved in subsequent API calls when using tools.
+
+**Impact:** Gemini 3 thinking models cannot be used until this is implemented.
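+
+A minimal sketch of the fix shape, assuming OpenRouter's approach of echoing the assistant message back verbatim (including its `reasoning_details`) on the next turn; the agent backend appears to be Rust (cf. `src/budget/pricing.rs`), so this Python is illustrative only:
+
+```python
+import os
+import requests
+
+API = "https://openrouter.ai/api/v1/chat/completions"
+HEADERS = {"Authorization": f"Bearer {os.environ['OPENROUTER_API_KEY']}"}
+
+def one_agent_turn(messages, tools):
+    """One tool-calling iteration that keeps thinking models happy."""
+    resp = requests.post(API, headers=HEADERS, json={
+        "model": "google/gemini-3-flash-preview",
+        "messages": messages,
+        "tools": tools,
+    }, timeout=120)
+    resp.raise_for_status()
+    msg = resp.json()["choices"][0]["message"]
+    # Key step: append the assistant message unmodified. Rebuilding it from
+    # just `content` and `tool_calls` drops the reasoning details (and with
+    # them the Gemini `thought_signature`), triggering the error quoted above.
+    messages.append(msg)
+    return msg
+```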
+### 2. CAPABLE_MODEL_BASES Allowlist Too Restrictive
+
+**Issue:** Models not explicitly in the allowlist are silently rejected or substituted.
+
+**Example:** `deepseek/deepseek-v3.2-speciale` was requested, but the system:
+
+- Logged: "Requested model not found, using default capability floor 0.7"
+- Selected `deepseek/deepseek-r1-distill-llama-70b` instead
+
+**Recommendation:**
+
+- Add more models to the allowlist
+- Warn users when their requested model isn't available
+- Consider dynamic model validation via the OpenRouter API (see the sketch below)
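+
+One way dynamic validation could look, as a hedged sketch: it checks a requested ID against OpenRouter's public model catalog (`GET /api/v1/models`, which returns a `data` array of models with `id` fields) and fails loudly instead of silently substituting. Raising versus merely warning is a policy choice:
+
+```python
+import requests
+
+def validate_model(requested: str) -> None:
+    """Reject model IDs that OpenRouter does not currently serve."""
+    resp = requests.get("https://openrouter.ai/api/v1/models", timeout=30)
+    resp.raise_for_status()
+    available = {m["id"] for m in resp.json()["data"]}
+    if requested not in available:
+        # Surface the problem to the user rather than picking a stand-in.
+        raise ValueError(f"Model {requested!r} is not available on OpenRouter")
+```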
+### 3. Price-Based Capability Estimation Has Issues
+
+**Issue:** Free/cheap models get a capability score of 0.3 regardless of actual performance.
+
+**Log Evidence:**
+```
+Using price-based capability for deepseek/deepseek-r1: 0.300 (avg_cost: 0.0000000000)
+Using price-based capability for deepseek/deepseek-chat: 0.300 (avg_cost: 0.0000000000)
+```
+
+**Problem:** DeepSeek models are often free or very cheap but perform well. The price-based heuristic underestimates them.
+
+**Recommendation:**
+
+- Integrate actual benchmark data from llm-stats.com
+- Use model family tiers as a fallback (see the sketch below)
+- Consider historical performance tracking
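+
+A sketch of the family-tier fallback, with illustrative (not benchmarked) scores; the real logic presumably lives in `src/budget/pricing.rs`, so this Python only shows the shape:
+
+```python
+# Hypothetical tier table; the prefixes and scores are placeholders.
+FAMILY_TIERS = {
+    "deepseek/deepseek-r1": 0.80,
+    "deepseek/deepseek-chat": 0.70,
+    "anthropic/claude-sonnet": 0.85,
+}
+
+def estimate_capability(model_id: str) -> float:
+    """Prefer a known family-tier score over the price heuristic."""
+    for prefix, score in FAMILY_TIERS.items():
+        if model_id.startswith(prefix):
+            return score
+    return 0.3  # current price-based floor, kept as the last resort
+```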
+### 4. Benchmark Data Not Being Used
+
+**Observation:** All model selections show `benchmark_data: false`.
+
+**Log Evidence:**
+```
+Model selected: deepseek/deepseek-r1-distill-llama-70b (task: ToolCalling, cost: 2 cents, benchmark_data: false, history: false)
+```
+
+**Root Cause:** The benchmark registry may not be loading properly, or model IDs aren't being matched against it.
+
+### 5. Default Model Configuration Is Critical
+
+**Issue:** Setting `DEFAULT_MODEL` to a problematic model (like gemini-3-flash-preview) breaks ALL tasks.
+
+**Why:** Internal operations (complexity estimation, task splitting) use the default model, not the requested one.
+
+**Recommendation:**
+
+- Validate the default model on startup
+- Use separate models for internal operations vs. task execution
+- Add a health check that exercises the default model
+
+## Quick Test Results
+
+| Model | Status | Iterations | Result Length | Notes |
+|-------|--------|------------|---------------|-------|
+| moonshotai/kimi-k2-thinking | completed | 2 | 48 | Fast, accurate |
+| x-ai/grok-4.1-fast | completed | 2 | 62 | Fast, verbose |
+| deepseek/deepseek-v3.2-speciale | failed | 0 | 0 | Not in allowlist |
+| mistralai/mistral-large-2512 | completed | 0 | 4602 | Very verbose |
+| anthropic/claude-sonnet-4.5 | completed | 2 | 48 | Baseline, reliable |
+| qwen/qwen3-vl-235b-a22b-thinking | completed | 2 | 50 | Working well |
+| amazon/nova-pro-v1 | completed | 2 | 50 | Working well |
+| z-ai/glm-4.6v | completed | 2 | 50 | Working well |
+| google/gemini-2.5-pro | completed | 2 | 50 | Working (non-thinking variant) |
+| deepseek/deepseek-chat | completed | 2 | 50 | Working (standard variant) |
+
+## Recommendations for Model Selection Improvements
+
+### Immediate Fixes (High Priority)
+
+1. **Expand CAPABLE_MODEL_BASES:**
+   - Add all models from the user's test list
+   - Add popular new models automatically
+   - Consider dynamic validation (see the sketch under Key Finding 2)
+
+2. **Fix Benchmark Integration:**
+   - Ensure benchmark data loads correctly
+   - Add logging for benchmark matching
+   - Use benchmarks for task-type-specific selection
+
+3. **Add Reasoning Token Support:**
+   - For Gemini 3 and other "thinking" models
+   - Preserve thought signatures in tool calls (see the sketch under Key Finding 1)
+   - Reference: https://openrouter.ai/docs/guides/best-practices/reasoning-tokens
+
+### Medium-Term Improvements
+
+4. **Historical Performance Tracking:**
+   - Record actual success/failure per model
+   - Track cost efficiency per model
+   - Use this data for future selections
+
+5. **Separate Internal vs. Execution Models:**
+   - Use a cheap/fast model for complexity estimation
+   - Use a cheap/fast model for task splitting
+   - Use the user-selected model only for actual execution
+
+6. **Model Validation on Startup:**
+   - Check that the default model works
+   - Validate key models in the allowlist
+   - Alert on configuration issues
+
+### Long-Term Enhancements
+
+7. **Dynamic Model Discovery:**
+   - Fetch available models from the OpenRouter API
+   - Auto-detect capabilities (tool support, vision, etc.)
+   - Build automatic fallback chains
+
+8. **A/B Testing Framework:**
+   - Run the same task with multiple models
+   - Compare quality, cost, and speed
+   - Continuously update model rankings
+
+9. **User-Facing Model Insights:**
+   - Show why a model was selected
+   - Display estimated cost before execution
+   - Allow manual override with warnings
+
+## Security Analysis Task Status
+
+Three models are currently running the Rabby Wallet security analysis:
+
+- `moonshotai/kimi-k2-thinking`: 9055ae68-d0bb-4c0d-aae3-908de141c431
+- `x-ai/grok-4.1-fast`: f99e7b95-9d57-42e4-b669-62b4f7c6a9f4
+- `anthropic/claude-sonnet-4.5`: 95fdebf9-f4bc-43f6-ba20-c9579dcadbd6
+
+Results will be appended to this report when available.
+
+## Appendix: Code Changes Made
+
+1. **Added models to CAPABLE_MODEL_BASES** (`src/budget/pricing.rs`):
+   - moonshotai/kimi-k2-thinking, kimi-k2
+   - x-ai/grok-4.1-fast, grok-4-fast, grok-4, grok-3
+   - google/gemini-3-flash-preview, gemini-3-pro-preview
+   - deepseek/deepseek-v3.2-speciale, deepseek-v3.2, deepseek-v3.1-terminus
+   - qwen/qwen3-vl-235b-a22b-thinking
+   - amazon/nova-pro-v1
+   - z-ai/glm-4.6v, glm-4.6, glm-4.5v, glm-4.5
+
+2. **Created test scripts:**
+   - `scripts/quick_model_test.sh`: fast capability verification
+   - `scripts/test_model_comparison.sh`: full security analysis comparison
+   - `scripts/run_security_test.sh`: interactive security test runner
+   - `scripts/check_results.py`: result collection and analysis
+
+3. **Fixed production configuration:**
+   - Changed DEFAULT_MODEL from gemini-3-flash-preview to claude-sonnet-4.5
+
+---
+
+*Report generated by model comparison testing framework*