Enable benchmark-based model selection and fix agent execution

Key fixes:
- Fix shell path (sh -> /bin/sh) in terminal.rs for macOS compatibility
- Fix fetch_url to use /tmp instead of /root/tmp
- Add WORKING_DIR to config so benchmarks file is found
- Enable ModelSelector when benchmarks are loaded (was bypassed)

Benchmark integration:
- Add BenchmarkRegistry to load models_with_benchmarks.json
- Add TaskType enum with inference from task descriptions
- ModelSelector uses benchmark scores for task-specific capability
- Add info logging for model selection decisions

Agent improvements:
- Truncate history and tool results to prevent context overflow
- Pass SharedBenchmarkRegistry through AgentContext
- Better task type inference (math, code, reasoning, etc.)

Testing verified:
- Agent completed benchmark data aggregation task autonomously
- Agent completed Fibonacci matrix exponentiation task with self-debugging
- Model selection logs show benchmark_data: true
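
The task-type inference and context-truncation changes described above are easiest to see in miniature. The sketch below is illustrative Python only; the actual implementation is the Rust `TaskType`/`ModelSelector` code touched by this commit, and the function names and keyword lists here are assumptions, not the real API.

```python
# Hypothetical sketch of task-type inference and truncation; the real code is
# Rust, and these names and keyword lists are illustrative assumptions only.

TASK_KEYWORDS = {
    "math": ["fibonacci", "prove", "equation", "integral", "aime"],
    "code": ["implement", "debug", "refactor", "script", "compile"],
    "reasoning": ["analyze", "explain why", "compare", "plan"],
}

def infer_task_type(description: str) -> str:
    """Guess a task category from keywords in the task description."""
    text = description.lower()
    for task_type, keywords in TASK_KEYWORDS.items():
        if any(keyword in text for keyword in keywords):
            return task_type
    return "general"

def truncate_for_context(text: str, max_chars: int = 8000) -> str:
    """Clip long history entries or tool results so they fit in the model context."""
    if len(text) <= max_chars:
        return text
    return text[:max_chars] + f"\n...[truncated {len(text) - max_chars} chars]"
```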
Author: Thomas Marchand
Date:   2025-12-17 04:26:11 +00:00
Parent: 26cd541e64
Commit: 3854290982

14 changed files with 13340 additions and 70 deletions

scripts/README.md (new file, 81 lines)

@@ -0,0 +1,81 @@
# Open Agent Scripts
Reusable Python scripts for data-processing tasks that involve more data than fits in the LLM context.
## Available Scripts
### merge_benchmarks.py
Merges OpenRouter models with ZeroEval benchmark scores.
**Usage:**
```bash
python3 scripts/merge_benchmarks.py
```
**What it does:**
1. Fetches all models from the OpenRouter API (~350 models)
2. Fetches benchmark metadata from the ZeroEval API (~383 benchmarks)
3. Fetches scores for key benchmarks in each category:
- **code**: SWE-bench, HumanEval, LiveCodeBench, Aider-Polyglot, etc.
- **math**: AIME 2025/2024, MATH-500, GSM8K, etc.
- **reasoning**: GPQA, MMLU-Pro, MMLU, ARC, HellaSwag, etc.
- **tool_calling**: BFCL, Tau-Bench, ACEBench, etc.
- **long_context**: RULER, LongBench, InfiniteBench, etc.
- **general**: IFEval, Arena-Hard, MT-Bench, etc.
4. Merges models with benchmark data and averages scores per category (see the sketch below)
5. Outputs `models_with_benchmarks.json`
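
The per-category scores in the output are plain averages of whichever benchmark scores were found for that category (this is what `calculate_category_averages` in the script does); a minimal sketch:

```python
def category_averages(benchmarks: dict) -> dict:
    """e.g. {"code": {"humaneval": 0.90, "mbpp": 0.80}} -> {"code": 0.85}"""
    return {
        category: round(sum(scores.values()) / len(scores), 4)
        for category, scores in benchmarks.items()
        if scores
    }
```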
**Output files:**
- `models_with_benchmarks.json` - Main output with merged data
- `openrouter_models_raw.json` - Raw OpenRouter API response
- `llm_stats_benchmarks.json` - Benchmark metadata from ZeroEval
**Output format:**
```json
{
  "generated_at": "2025-12-17T03:37:04Z",
  "total_models": 349,
  "models_with_benchmarks": 156,
  "categories": ["code", "math", "reasoning", "tool_calling", "long_context", "general"],
  "models": [
    {
      "id": "openai/gpt-5.2",
      "name": "GPT-5.2",
      "context_length": 400000,
      "pricing": {...},
      "benchmarks": {
        "code": {"swe-bench-verified": 0.731},
        "math": {"aime-2025": 0.96},
        "reasoning": {"gpqa": 0.924}
      },
      "category_scores": {
        "code": 0.731,
        "math": 0.96,
        "reasoning": 0.924
      }
    }
  ]
}
```
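
`category_scores` is the field downstream model selection keys on: infer a task type, then rank models by their score in that category. The agent does this in Rust (`ModelSelector` reading the `BenchmarkRegistry`); the snippet below is only an illustrative Python consumer of the documented output format.

```python
import json

def best_models(task_type: str, path: str = "models_with_benchmarks.json", top_n: int = 5):
    """Rank models by their benchmark score for one category, e.g. "math"."""
    with open(path) as f:
        data = json.load(f)
    ranked = [
        (model["category_scores"][task_type], model["id"])
        for model in data["models"]
        if model.get("category_scores") and task_type in model["category_scores"]
    ]
    return sorted(ranked, reverse=True)[:top_n]

# Example: best_models("math") -> [(0.96, "openai/gpt-5.2"), ...]
```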
## Best Practices for Large Data Tasks
When dealing with data too large for the LLM context (>10KB):
1. **Use scripts**: Run Python/bash scripts with `run_command`
2. **Write to files**: Save intermediate results to files
3. **Read summaries**: Read only summaries or specific sections
4. **Process in chunks**: Break large tasks into smaller pieces (see the chunking sketch after the example below)
Example:
```bash
# Run the merge script
python3 scripts/merge_benchmarks.py
# Check summary
python3 -c "import json; d=json.load(open('models_with_benchmarks.json')); print(f'Models: {d[\"total_models\"]}, With benchmarks: {d[\"models_with_benchmarks\"]}')"
# Look up specific model
python3 -c "import json; d=json.load(open('models_with_benchmarks.json')); m=[x for x in d['models'] if 'gpt-5' in x['id'].lower()]; print(json.dumps(m[:3], indent=2))"
```
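
For practice 4 (processing in chunks), one way to apply it to this dataset is to slice the merged model list and leave small per-chunk summary files on disk, so only one small file ever has to be read at a time. A sketch (the chunk size and file names are arbitrary):

```python
import json

# Slice the merged model list into small summary files; read them one at a time
# instead of loading the full merged JSON into the LLM context.
with open("models_with_benchmarks.json") as f:
    models = json.load(f)["models"]

chunk_size = 50
for i in range(0, len(models), chunk_size):
    chunk = models[i:i + chunk_size]
    summary = [{"id": m["id"], "category_scores": m.get("category_scores")} for m in chunk]
    with open(f"model_chunk_{i // chunk_size:03d}.json", "w") as out:
        json.dump(summary, out, indent=2)
```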

scripts/merge_benchmarks.py (new file, 315 lines)

@@ -0,0 +1,315 @@
#!/usr/bin/env python3
"""
Merge OpenRouter models with ZeroEval benchmark scores.
This script:
1. Fetches all models from OpenRouter API
2. Fetches benchmark metadata from ZeroEval API
3. For key benchmarks in each category, fetches model scores
4. Creates a merged JSON with benchmark scores per category
Categories tracked:
- code: Coding benchmarks (SWE-bench, HumanEval, etc.)
- math: Math benchmarks (AIME, MATH, GSM8K, etc.)
- reasoning: Reasoning benchmarks (GPQA, MMLU, etc.)
- tool_calling: Tool/function calling benchmarks
- long_context: Long context benchmarks
"""
import json
import time
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
from collections import defaultdict
# Configuration
OPENROUTER_API = "https://openrouter.ai/api/v1/models"
ZEROEVAL_API = "https://api.zeroeval.com"
OUTPUT_DIR = Path(__file__).parent.parent  # repository root (parent of scripts/)
# Key benchmarks per category (prioritized list)
KEY_BENCHMARKS = {
    "code": [
        "swe-bench-verified", "humaneval", "livecodebench", "aider-polyglot",
        "bigcodebench", "codeforces", "mbpp"
    ],
    "math": [
        "aime-2025", "aime-2024", "math-500", "gsm8k", "minerva-math",
        "gpqa-diamond", "olympiadbench"
    ],
    "reasoning": [
        "gpqa", "mmlu-pro", "mmlu", "arc-challenge", "hellaswag",
        "winogrande", "commonsenseqa"
    ],
    "tool_calling": [
        "bfcl", "tau-bench", "acebench", "nexusraven", "gorilla-api-bench"
    ],
    "long_context": [
        "ruler", "longbench", "infinitebench", "scrolls", "loogle"
    ],
    "general": [
        "ifeval", "arena-hard", "alpaca-eval-2", "mt-bench", "chatbot-arena"
    ]
}

HEADERS = {
    "Accept": "application/json",
    "Origin": "https://llm-stats.com",
    "Referer": "https://llm-stats.com/",
    "User-Agent": "OpenAgent-BenchmarkMerger/1.0"
}
def fetch_json(url: str, retries: int = 3) -> Optional[Union[dict, list]]:
    """Fetch JSON from URL with retries."""
    for attempt in range(retries):
        try:
            req = Request(url, headers=HEADERS)
            with urlopen(req, timeout=30) as resp:
                return json.loads(resp.read().decode())
        except HTTPError as e:
            if e.code == 404:
                return None
            print(f" HTTP error {e.code} for {url}, attempt {attempt + 1}")
        except URLError as e:
            print(f" URL error for {url}: {e}, attempt {attempt + 1}")
        except Exception as e:
            print(f" Error fetching {url}: {e}, attempt {attempt + 1}")
        time.sleep(1)
    return None


def fetch_openrouter_models() -> List[dict]:
    """Fetch all models from OpenRouter."""
    print("Fetching OpenRouter models...")
    data = fetch_json(OPENROUTER_API)
    if data and "data" in data:
        models = data["data"]
        print(f" Found {len(models)} models")
        return models
    print(" Failed to fetch models!")
    return []


def fetch_all_benchmarks() -> List[dict]:
    """Fetch all benchmark metadata from ZeroEval."""
    print("Fetching ZeroEval benchmarks...")
    data = fetch_json(f"{ZEROEVAL_API}/leaderboard/benchmarks")
    if data:
        print(f" Found {len(data)} benchmarks")
        return data
    print(" Failed to fetch benchmarks!")
    return []


def fetch_benchmark_scores(benchmark_id: str) -> Optional[dict]:
    """Fetch detailed benchmark scores for a specific benchmark."""
    data = fetch_json(f"{ZEROEVAL_API}/leaderboard/benchmarks/{benchmark_id}")
    return data


def normalize_model_id(model_id: str) -> str:
    """Normalize model ID for matching."""
    # Remove common prefixes/suffixes and normalize
    normalized = model_id.lower()
    # Remove date suffixes like -20251101
    parts = normalized.split("-")
    filtered = [p for p in parts if not (len(p) == 8 and p.isdigit())]
    return "-".join(filtered)
def build_model_score_map(benchmarks_data: Dict[str, dict]) -> Dict[str, dict]:
    """
    Build a map from normalized model names to their benchmark scores.
    Returns: {normalized_model_id: {category: {benchmark_id: score}}}
    """
    model_scores = defaultdict(lambda: defaultdict(dict))
    for category, benchmarks in benchmarks_data.items():
        for benchmark_id, benchmark_info in benchmarks.items():
            if not benchmark_info or "models" not in benchmark_info:
                continue
            for model in benchmark_info["models"]:
                model_id = model.get("model_id", "")
                score = model.get("score")
                if model_id and score is not None:
                    # Store both original and normalized
                    model_scores[model_id][category][benchmark_id] = score
                    # Also store by normalized name for fuzzy matching
                    normalized = normalize_model_id(model_id)
                    if normalized != model_id:
                        model_scores[normalized][category][benchmark_id] = score
    return dict(model_scores)


def match_model(openrouter_id: str, zeroeval_scores: dict) -> Optional[dict]:
    """Try to match an OpenRouter model ID to ZeroEval scores."""
    # Try exact match first
    if openrouter_id in zeroeval_scores:
        return zeroeval_scores[openrouter_id]
    # Try normalized match
    normalized = normalize_model_id(openrouter_id)
    if normalized in zeroeval_scores:
        return zeroeval_scores[normalized]
    # Try partial match (model name without provider)
    if "/" in openrouter_id:
        model_name = openrouter_id.split("/")[-1]
        model_name_normalized = normalize_model_id(model_name)
        for ze_id, scores in zeroeval_scores.items():
            if model_name_normalized in ze_id or ze_id in model_name_normalized:
                return scores
    return None


def calculate_category_averages(scores: dict) -> dict:
    """Calculate average score per category."""
    averages = {}
    for category, benchmarks in scores.items():
        if benchmarks:
            avg = sum(benchmarks.values()) / len(benchmarks)
            averages[category] = round(avg, 4)
    return averages
def main():
    print("=" * 60)
    print("OpenRouter + ZeroEval Benchmark Merger")
    print("=" * 60)

    # Step 1: Fetch OpenRouter models
    openrouter_models = fetch_openrouter_models()
    if not openrouter_models:
        print("Failed to fetch OpenRouter models, exiting.")
        sys.exit(1)

    # Save raw OpenRouter models
    or_path = OUTPUT_DIR / "openrouter_models_raw.json"
    with open(or_path, "w") as f:
        json.dump({"data": openrouter_models}, f)
    print(f"Saved raw OpenRouter models to {or_path}")

    # Step 2: Fetch all benchmark metadata
    all_benchmarks = fetch_all_benchmarks()
    if not all_benchmarks:
        print("Failed to fetch benchmarks, exiting.")
        sys.exit(1)

    # Save benchmarks metadata
    bench_path = OUTPUT_DIR / "llm_stats_benchmarks.json"
    with open(bench_path, "w") as f:
        json.dump(all_benchmarks, f)
    print(f"Saved benchmarks metadata to {bench_path}")

    # Build benchmark ID lookup
    benchmark_lookup = {b["benchmark_id"]: b for b in all_benchmarks}

    # Step 3: Fetch scores for key benchmarks in each category
    print("\nFetching benchmark scores by category...")
    benchmarks_data = {}
    for category, benchmark_ids in KEY_BENCHMARKS.items():
        print(f"\n Category: {category}")
        benchmarks_data[category] = {}
        for bench_id in benchmark_ids:
            # Try the exact ID first
            data = fetch_benchmark_scores(bench_id)
            # If not found, try finding a matching benchmark
            if data is None:
                # Search for similar benchmark IDs
                for full_id in benchmark_lookup.keys():
                    if bench_id in full_id or full_id in bench_id:
                        data = fetch_benchmark_scores(full_id)
                        if data:
                            bench_id = full_id
                            break
            if data:
                model_count = len(data.get("models", []))
                print(f"{bench_id}: {model_count} models")
                benchmarks_data[category][bench_id] = data
            else:
                print(f"{bench_id}: not found")
            time.sleep(0.2)  # Rate limiting

    # Step 4: Build model score map
    print("\nBuilding model score map...")
    model_scores = build_model_score_map(benchmarks_data)
    print(f" Found scores for {len(model_scores)} unique model IDs")

    # Step 5: Merge with OpenRouter models
    print("\nMerging with OpenRouter models...")
    merged_models = []
    matched_count = 0
    for model in openrouter_models:
        model_id = model.get("id", "")
        # Try to find matching benchmark scores
        scores = match_model(model_id, model_scores)
        # Build merged model entry
        merged = {
            "id": model_id,
            "name": model.get("name", ""),
            "context_length": model.get("context_length"),
            "architecture": model.get("architecture", {}),
            "pricing": model.get("pricing", {}),
            "benchmarks": None,
            "category_scores": None
        }
        if scores:
            merged["benchmarks"] = scores
            merged["category_scores"] = calculate_category_averages(scores)
            matched_count += 1
        merged_models.append(merged)
    print(f" Matched {matched_count}/{len(openrouter_models)} models with benchmarks")

    # Step 6: Save merged data
    output = {
        "generated_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "total_models": len(merged_models),
        "models_with_benchmarks": matched_count,
        "categories": list(KEY_BENCHMARKS.keys()),
        "models": merged_models
    }
    output_path = OUTPUT_DIR / "models_with_benchmarks.json"
    with open(output_path, "w") as f:
        json.dump(output, f, indent=2)
    print(f"\n✓ Saved merged data to {output_path}")

    # Step 7: Create summary
    print("\n" + "=" * 60)
    print("Summary")
    print("=" * 60)
    print(f"Total OpenRouter models: {len(openrouter_models)}")
    print(f"Models with benchmark data: {matched_count}")
    print(f"Categories tracked: {', '.join(KEY_BENCHMARKS.keys())}")

    # Show some example matches
    print("\nExample matched models:")
    for m in merged_models[:10]:
        if m["benchmarks"]:
            cats = list(m["category_scores"].keys()) if m["category_scores"] else []
            print(f" - {m['id']}: {', '.join(cats)}")


if __name__ == "__main__":
    main()