diff --git a/models_with_benchmarks.json b/models_with_benchmarks.json index 2e3afc6..0b6e6f8 100644 --- a/models_with_benchmarks.json +++ b/models_with_benchmarks.json @@ -1,7 +1,7 @@ { - "generated_at": "2025-12-17T03:37:04Z", - "total_models": 349, - "models_with_benchmarks": 156, + "generated_at": "2025-12-19T04:29:38Z", + "total_models": 350, + "models_with_benchmarks": 158, "categories": [ "code", "math", @@ -10,7 +10,322 @@ "long_context", "general" ], + "families": { + "gemini-flash": { + "latest": "google/gemini-3-flash-preview", + "members": [ + "google/gemini-3-flash-preview", + "google/gemini-2.5-flash-image", + "google/gemini-2.5-flash-preview-09-2025", + "google/gemini-2.5-flash-image-preview", + "google/gemini-2.5-flash", + "google/gemini-2.0-flash-001", + "google/gemini-2.0-flash-exp:free" + ], + "tier": "fast" + }, + "mistral-small": { + "latest": "mistralai/mistral-small-24b-instruct-2501", + "members": [ + "mistralai/mistral-small-24b-instruct-2501", + "mistralai/mistral-small-3.2-24b-instruct", + "mistralai/mistral-small-3.1-24b-instruct:free", + "mistralai/mistral-small-3.1-24b-instruct", + "mistralai/mistral-small-creative" + ], + "tier": "fast" + }, + "mistral-large": { + "latest": "mistralai/mistral-large-2512", + "members": [ + "mistralai/mistral-large-2512", + "mistralai/mistral-large-2411", + "mistralai/mistral-large-2407", + "mistralai/mistral-large" + ], + "tier": "mid" + }, + "claude-opus": { + "latest": "anthropic/claude-opus-4.5", + "members": [ + "anthropic/claude-opus-4.5", + "anthropic/claude-opus-4.1", + "anthropic/claude-opus-4", + "anthropic/claude-3-opus" + ], + "tier": "flagship" + }, + "gemini-pro": { + "latest": "google/gemini-3-pro-image-preview", + "members": [ + "google/gemini-3-pro-image-preview", + "google/gemini-3-pro-preview", + "google/gemini-2.5-pro", + "google/gemini-2.5-pro-preview", + "google/gemini-2.5-pro-preview-05-06" + ], + "tier": "mid" + }, + "claude-haiku": { + "latest": "anthropic/claude-haiku-4.5", + "members": [ + "anthropic/claude-haiku-4.5", + "anthropic/claude-3.5-haiku", + "anthropic/claude-3-haiku" + ], + "tier": "fast" + }, + "claude-sonnet": { + "latest": "anthropic/claude-sonnet-4.5", + "members": [ + "anthropic/claude-sonnet-4.5", + "anthropic/claude-sonnet-4", + "anthropic/claude-3.7-sonnet", + "anthropic/claude-3.5-sonnet" + ], + "tier": "mid" + }, + "deepseek-chat": { + "latest": "deepseek/deepseek-chat-v3-0324", + "members": [ + "deepseek/deepseek-chat-v3-0324", + "deepseek/deepseek-chat-v3.1", + "deepseek/deepseek-chat" + ], + "tier": "mid" + }, + "mistral-medium": { + "latest": "mistralai/mistral-medium-3.1", + "members": [ + "mistralai/mistral-medium-3.1", + "mistralai/mistral-medium-3" + ], + "tier": "mid" + }, + "gpt-4": { + "latest": "openai/gpt-4.1", + "members": [ + "openai/gpt-4.1", + "openai/gpt-4o", + "openai/gpt-4-turbo", + "openai/gpt-4-turbo-preview" + ], + "tier": "mid" + }, + "gpt-4-mini": { + "latest": "openai/gpt-4.1-mini", + "members": [ + "openai/gpt-4.1-mini", + "openai/gpt-4o-mini" + ], + "tier": "fast" + }, + "qwq": { + "latest": "qwen/qwq-32b", + "members": [ + "qwen/qwq-32b" + ], + "tier": "mid" + }, + "o3-mini": { + "latest": "openai/o3-mini-high", + "members": [ + "openai/o3-mini-high", + "openai/o3-mini" + ], + "tier": "mid" + }, + "deepseek-r1": { + "latest": "deepseek/deepseek-r1", + "members": [ + "deepseek/deepseek-r1" + ], + "tier": "flagship" + }, + "o1": { + "latest": "openai/o1", + "members": [ + "openai/o1" + ], + "tier": "flagship" + }, + "llama-3-70b": { + "latest": 
"meta-llama/llama-3.3-70b-instruct:free", + "members": [ + "meta-llama/llama-3.3-70b-instruct:free", + "meta-llama/llama-3.3-70b-instruct" + ], + "tier": "mid" + }, + "llama-3-90b": { + "latest": "meta-llama/llama-3.2-90b-vision-instruct", + "members": [ + "meta-llama/llama-3.2-90b-vision-instruct" + ], + "tier": "mid" + }, + "qwen-72b": { + "latest": "qwen/qwen-2.5-72b-instruct", + "members": [ + "qwen/qwen-2.5-72b-instruct" + ], + "tier": "mid" + }, + "llama-3-405b": { + "latest": "meta-llama/llama-3.1-405b", + "members": [ + "meta-llama/llama-3.1-405b", + "meta-llama/llama-3.1-405b-instruct:free", + "meta-llama/llama-3.1-405b-instruct" + ], + "tier": "flagship" + } + }, + "aliases": { + "google/gemini-2.5-flash-image": "google/gemini-3-flash-preview", + "gemini-2.5-flash-image": "google/gemini-3-flash-preview", + "google/gemini-2.5-flash-preview-09-2025": "google/gemini-3-flash-preview", + "gemini-2.5-flash-preview-09-2025": "google/gemini-3-flash-preview", + "google/gemini-2.5-flash-image-preview": "google/gemini-3-flash-preview", + "gemini-2.5-flash-image-preview": "google/gemini-3-flash-preview", + "google/gemini-2.5-flash": "google/gemini-3-flash-preview", + "gemini-2.5-flash": "google/gemini-3-flash-preview", + "google/gemini-2.0-flash-001": "google/gemini-3-flash-preview", + "gemini-2.0-flash-001": "google/gemini-3-flash-preview", + "google/gemini-2.0-flash-exp:free": "google/gemini-3-flash-preview", + "gemini-2.0-flash-exp:free": "google/gemini-3-flash-preview", + "gemini-flash": "google/gemini-3-flash-preview", + "mistralai/mistral-small-3.2-24b-instruct": "mistralai/mistral-small-24b-instruct-2501", + "mistral-small-3.2-24b-instruct": "mistralai/mistral-small-24b-instruct-2501", + "mistralai/mistral-small-3.1-24b-instruct:free": "mistralai/mistral-small-24b-instruct-2501", + "mistral-small-3.1-24b-instruct:free": "mistralai/mistral-small-24b-instruct-2501", + "mistralai/mistral-small-3.1-24b-instruct": "mistralai/mistral-small-24b-instruct-2501", + "mistral-small-3.1-24b-instruct": "mistralai/mistral-small-24b-instruct-2501", + "mistralai/mistral-small-creative": "mistralai/mistral-small-24b-instruct-2501", + "mistral-small-creative": "mistralai/mistral-small-24b-instruct-2501", + "mistral-small": "mistralai/mistral-small-24b-instruct-2501", + "mistralai/mistral-large-2411": "mistralai/mistral-large-2512", + "mistral-large-2411": "mistralai/mistral-large-2512", + "mistralai/mistral-large-2407": "mistralai/mistral-large-2512", + "mistral-large-2407": "mistralai/mistral-large-2512", + "mistralai/mistral-large": "mistralai/mistral-large-2512", + "mistral-large": "mistralai/mistral-large-2512", + "anthropic/claude-opus-4.1": "anthropic/claude-opus-4.5", + "claude-opus-4.1": "anthropic/claude-opus-4.5", + "anthropic/claude-opus-4": "anthropic/claude-opus-4.5", + "claude-opus-4": "anthropic/claude-opus-4.5", + "anthropic/claude-3-opus": "anthropic/claude-opus-4.5", + "claude-3-opus": "anthropic/claude-opus-4.5", + "claude-opus": "anthropic/claude-opus-4.5", + "opus": "anthropic/claude-opus-4.5", + "claude opus": "anthropic/claude-opus-4.5", + "google/gemini-3-pro-preview": "google/gemini-3-pro-image-preview", + "gemini-3-pro-preview": "google/gemini-3-pro-image-preview", + "google/gemini-2.5-pro": "google/gemini-3-pro-image-preview", + "gemini-2.5-pro": "google/gemini-3-pro-image-preview", + "google/gemini-2.5-pro-preview": "google/gemini-3-pro-image-preview", + "gemini-2.5-pro-preview": "google/gemini-3-pro-image-preview", + "google/gemini-2.5-pro-preview-05-06": 
"google/gemini-3-pro-image-preview", + "gemini-2.5-pro-preview-05-06": "google/gemini-3-pro-image-preview", + "gemini-pro": "google/gemini-3-pro-image-preview", + "anthropic/claude-3.5-haiku": "anthropic/claude-haiku-4.5", + "claude-3.5-haiku": "anthropic/claude-haiku-4.5", + "anthropic/claude-3-haiku": "anthropic/claude-haiku-4.5", + "claude-3-haiku": "anthropic/claude-haiku-4.5", + "claude-haiku": "anthropic/claude-haiku-4.5", + "haiku": "anthropic/claude-haiku-4.5", + "claude haiku": "anthropic/claude-haiku-4.5", + "anthropic/claude-sonnet-4": "anthropic/claude-sonnet-4.5", + "claude-sonnet-4": "anthropic/claude-sonnet-4.5", + "anthropic/claude-3.7-sonnet": "anthropic/claude-sonnet-4.5", + "claude-3.7-sonnet": "anthropic/claude-sonnet-4.5", + "anthropic/claude-3.5-sonnet": "anthropic/claude-sonnet-4.5", + "claude-3.5-sonnet": "anthropic/claude-sonnet-4.5", + "claude-sonnet": "anthropic/claude-sonnet-4.5", + "sonnet": "anthropic/claude-sonnet-4.5", + "claude sonnet": "anthropic/claude-sonnet-4.5", + "deepseek/deepseek-chat-v3.1": "deepseek/deepseek-chat-v3-0324", + "deepseek-chat-v3.1": "deepseek/deepseek-chat-v3-0324", + "deepseek/deepseek-chat": "deepseek/deepseek-chat-v3-0324", + "deepseek-chat": "deepseek/deepseek-chat-v3-0324", + "mistralai/mistral-medium-3": "mistralai/mistral-medium-3.1", + "mistral-medium-3": "mistralai/mistral-medium-3.1", + "mistral-medium": "mistralai/mistral-medium-3.1", + "openai/gpt-4o": "openai/gpt-4.1", + "gpt-4o": "openai/gpt-4.1", + "openai/gpt-4-turbo": "openai/gpt-4.1", + "gpt-4-turbo": "openai/gpt-4.1", + "openai/gpt-4-turbo-preview": "openai/gpt-4.1", + "gpt-4-turbo-preview": "openai/gpt-4.1", + "gpt-4": "openai/gpt-4.1", + "gpt4": "openai/gpt-4.1", + "openai/gpt-4o-mini": "openai/gpt-4.1-mini", + "gpt-4o-mini": "openai/gpt-4.1-mini", + "gpt-4-mini": "openai/gpt-4.1-mini", + "gpt4-mini": "openai/gpt-4.1-mini", + "qwq": "qwen/qwq-32b", + "openai/o3-mini": "openai/o3-mini-high", + "o3-mini": "openai/o3-mini-high", + "deepseek-r1": "deepseek/deepseek-r1", + "o1": "openai/o1", + "meta-llama/llama-3.3-70b-instruct": "meta-llama/llama-3.3-70b-instruct:free", + "llama-3.3-70b-instruct": "meta-llama/llama-3.3-70b-instruct:free", + "llama-3-70b": "meta-llama/llama-3.3-70b-instruct:free", + "llama-3-90b": "meta-llama/llama-3.2-90b-vision-instruct", + "qwen-72b": "qwen/qwen-2.5-72b-instruct", + "meta-llama/llama-3.1-405b-instruct:free": "meta-llama/llama-3.1-405b", + "llama-3.1-405b-instruct:free": "meta-llama/llama-3.1-405b", + "meta-llama/llama-3.1-405b-instruct": "meta-llama/llama-3.1-405b", + "llama-3.1-405b-instruct": "meta-llama/llama-3.1-405b", + "llama-3-405b": "meta-llama/llama-3.1-405b" + }, "models": [ + { + "id": "google/gemini-3-flash-preview", + "name": "Google: Gemini 3 Flash Preview", + "context_length": 1048576, + "architecture": { + "modality": "text+image->text", + "input_modalities": [ + "text", + "image", + "file", + "audio", + "video" + ], + "output_modalities": [ + "text" + ], + "tokenizer": "Gemini", + "instruct_type": null + }, + "pricing": { + "prompt": "0.0000005", + "completion": "0.000003", + "request": "0", + "image": "0", + "audio": "0.000001", + "web_search": "0", + "internal_reasoning": "0", + "input_cache_read": "0.00000005" + }, + "benchmarks": { + "code": { + "swe-bench-verified": 0.78 + }, + "math": { + "aime-2025": 0.997 + }, + "reasoning": { + "gpqa": 0.904 + } + }, + "category_scores": { + "code": 0.78, + "math": 0.997, + "reasoning": 0.904 + } + }, { "id": "mistralai/mistral-small-creative", "name": "Mistral: Mistral 
Small Creative", @@ -153,6 +468,47 @@ "reasoning": 0.7665 } }, + { + "id": "nvidia/nemotron-3-nano-30b-a3b", + "name": "NVIDIA: Nemotron 3 Nano 30B A3B", + "context_length": 262144, + "architecture": { + "modality": "text->text", + "input_modalities": [ + "text" + ], + "output_modalities": [ + "text" + ], + "tokenizer": "Other", + "instruct_type": null + }, + "pricing": { + "prompt": "0.00000006", + "completion": "0.00000024", + "request": "0", + "image": "0", + "web_search": "0", + "internal_reasoning": "0" + }, + "benchmarks": { + "code": { + "swe-bench-verified": 0.388 + }, + "math": { + "aime-2025": 0.992 + }, + "reasoning": { + "gpqa": 0.75, + "mmlu-pro": 0.783 + } + }, + "category_scores": { + "code": 0.388, + "math": 0.992, + "reasoning": 0.7665 + } + }, { "id": "openai/gpt-5.2-chat", "name": "OpenAI: GPT-5.2 Chat", @@ -514,35 +870,6 @@ "code": 0.737 } }, - { - "id": "amazon/nova-2-lite-v1:free", - "name": "Amazon: Nova 2 Lite (free)", - "context_length": 1000000, - "architecture": { - "modality": "text+image->text", - "input_modalities": [ - "text", - "image", - "video", - "file" - ], - "output_modalities": [ - "text" - ], - "tokenizer": "Nova", - "instruct_type": null - }, - "pricing": { - "prompt": "0", - "completion": "0", - "request": "0", - "image": "0", - "web_search": "0", - "internal_reasoning": "0" - }, - "benchmarks": null, - "category_scores": null - }, { "id": "amazon/nova-2-lite-v1", "name": "Amazon: Nova 2 Lite", @@ -792,7 +1119,7 @@ "image": "0", "web_search": "0", "internal_reasoning": "0", - "input_cache_read": "0.00000019", + "input_cache_read": "0.00000011", "input_cache_write": "0" }, "benchmarks": { @@ -5895,8 +6222,8 @@ "instruct_type": null }, "pricing": { - "prompt": "0.00000005", - "completion": "0.0000002", + "prompt": "0.00000002", + "completion": "0.0000001", "request": "0", "image": "0", "web_search": "0", @@ -9595,47 +9922,6 @@ "benchmarks": null, "category_scores": null }, - { - "id": "mistralai/ministral-3b", - "name": "Mistral: Ministral 3B", - "context_length": 131072, - "architecture": { - "modality": "text->text", - "input_modalities": [ - "text" - ], - "output_modalities": [ - "text" - ], - "tokenizer": "Mistral", - "instruct_type": null - }, - "pricing": { - "prompt": "0.00000004", - "completion": "0.00000004", - "request": "0", - "image": "0", - "web_search": "0", - "internal_reasoning": "0" - }, - "benchmarks": { - "code": { - "livecodebench": 0.548 - }, - "math": { - "aime-2025": 0.721, - "aime-2024": 0.775 - }, - "reasoning": { - "gpqa": 0.534 - } - }, - "category_scores": { - "code": 0.548, - "math": 0.748, - "reasoning": 0.534 - } - }, { "id": "mistralai/ministral-8b", "name": "Mistral: Ministral 8B", @@ -9682,6 +9968,47 @@ "general": 0.7695 } }, + { + "id": "mistralai/ministral-3b", + "name": "Mistral: Ministral 3B", + "context_length": 131072, + "architecture": { + "modality": "text->text", + "input_modalities": [ + "text" + ], + "output_modalities": [ + "text" + ], + "tokenizer": "Mistral", + "instruct_type": null + }, + "pricing": { + "prompt": "0.00000004", + "completion": "0.00000004", + "request": "0", + "image": "0", + "web_search": "0", + "internal_reasoning": "0" + }, + "benchmarks": { + "code": { + "livecodebench": 0.548 + }, + "math": { + "aime-2025": 0.721, + "aime-2024": 0.775 + }, + "reasoning": { + "gpqa": 0.534 + } + }, + "category_scores": { + "code": 0.548, + "math": 0.748, + "reasoning": 0.534 + } + }, { "id": "qwen/qwen-2.5-7b-instruct", "name": "Qwen: Qwen2.5 7B Instruct", @@ -9775,8 +10102,8 @@ } }, { - "id": 
"inflection/inflection-3-productivity", - "name": "Inflection: Inflection 3 Productivity", + "id": "inflection/inflection-3-pi", + "name": "Inflection: Inflection 3 Pi", "context_length": 8000, "architecture": { "modality": "text->text", @@ -9801,8 +10128,8 @@ "category_scores": null }, { - "id": "inflection/inflection-3-pi", - "name": "Inflection: Inflection 3 Pi", + "id": "inflection/inflection-3-productivity", + "name": "Inflection: Inflection 3 Productivity", "context_length": 8000, "architecture": { "modality": "text->text", @@ -9879,6 +10206,59 @@ "benchmarks": null, "category_scores": null }, + { + "id": "meta-llama/llama-3.2-11b-vision-instruct", + "name": "Meta: Llama 3.2 11B Vision Instruct", + "context_length": 131072, + "architecture": { + "modality": "text+image->text", + "input_modalities": [ + "text", + "image" + ], + "output_modalities": [ + "text" + ], + "tokenizer": "Llama3", + "instruct_type": "llama3" + }, + "pricing": { + "prompt": "0.000000049", + "completion": "0.000000049", + "request": "0", + "image": "0.00007948", + "web_search": "0", + "internal_reasoning": "0" + }, + "benchmarks": null, + "category_scores": null + }, + { + "id": "meta-llama/llama-3.2-1b-instruct", + "name": "Meta: Llama 3.2 1B Instruct", + "context_length": 60000, + "architecture": { + "modality": "text->text", + "input_modalities": [ + "text" + ], + "output_modalities": [ + "text" + ], + "tokenizer": "Llama3", + "instruct_type": "llama3" + }, + "pricing": { + "prompt": "0.000000027", + "completion": "0.0000002", + "request": "0", + "image": "0", + "web_search": "0", + "internal_reasoning": "0" + }, + "benchmarks": null, + "category_scores": null + }, { "id": "meta-llama/llama-3.2-3b-instruct:free", "name": "Meta: Llama 3.2 3B Instruct (free)", @@ -9981,59 +10361,6 @@ "general": 0.774 } }, - { - "id": "meta-llama/llama-3.2-1b-instruct", - "name": "Meta: Llama 3.2 1B Instruct", - "context_length": 60000, - "architecture": { - "modality": "text->text", - "input_modalities": [ - "text" - ], - "output_modalities": [ - "text" - ], - "tokenizer": "Llama3", - "instruct_type": "llama3" - }, - "pricing": { - "prompt": "0.000000027", - "completion": "0.0000002", - "request": "0", - "image": "0", - "web_search": "0", - "internal_reasoning": "0" - }, - "benchmarks": null, - "category_scores": null - }, - { - "id": "meta-llama/llama-3.2-11b-vision-instruct", - "name": "Meta: Llama 3.2 11B Vision Instruct", - "context_length": 131072, - "architecture": { - "modality": "text+image->text", - "input_modalities": [ - "text", - "image" - ], - "output_modalities": [ - "text" - ], - "tokenizer": "Llama3", - "instruct_type": "llama3" - }, - "pricing": { - "prompt": "0.000000049", - "completion": "0.000000049", - "request": "0", - "image": "0.00007948", - "web_search": "0", - "internal_reasoning": "0" - }, - "benchmarks": null, - "category_scores": null - }, { "id": "qwen/qwen-2.5-72b-instruct", "name": "Qwen2.5 72B Instruct", @@ -10156,32 +10483,6 @@ "general": 0.6905 } }, - { - "id": "cohere/command-r-plus-08-2024", - "name": "Cohere: Command R+ (08-2024)", - "context_length": 128000, - "architecture": { - "modality": "text->text", - "input_modalities": [ - "text" - ], - "output_modalities": [ - "text" - ], - "tokenizer": "Cohere", - "instruct_type": null - }, - "pricing": { - "prompt": "0.0000025", - "completion": "0.00001", - "request": "0", - "image": "0", - "web_search": "0", - "internal_reasoning": "0" - }, - "benchmarks": null, - "category_scores": null - }, { "id": "cohere/command-r-08-2024", "name": "Cohere: 
Command R (08-2024)", @@ -10209,9 +10510,9 @@ "category_scores": null }, { - "id": "sao10k/l3.1-euryale-70b", - "name": "Sao10K: Llama 3.1 Euryale 70B v2.2", - "context_length": 32768, + "id": "cohere/command-r-plus-08-2024", + "name": "Cohere: Command R+ (08-2024)", + "context_length": 128000, "architecture": { "modality": "text->text", "input_modalities": [ @@ -10220,12 +10521,12 @@ "output_modalities": [ "text" ], - "tokenizer": "Llama3", - "instruct_type": "llama3" + "tokenizer": "Cohere", + "instruct_type": null }, "pricing": { - "prompt": "0.00000065", - "completion": "0.00000075", + "prompt": "0.0000025", + "completion": "0.00001", "request": "0", "image": "0", "web_search": "0", @@ -10288,6 +10589,32 @@ "benchmarks": null, "category_scores": null }, + { + "id": "sao10k/l3.1-euryale-70b", + "name": "Sao10K: Llama 3.1 Euryale 70B v2.2", + "context_length": 32768, + "architecture": { + "modality": "text->text", + "input_modalities": [ + "text" + ], + "output_modalities": [ + "text" + ], + "tokenizer": "Llama3", + "instruct_type": "llama3" + }, + "pricing": { + "prompt": "0.00000065", + "completion": "0.00000075", + "request": "0", + "image": "0", + "web_search": "0", + "internal_reasoning": "0" + }, + "benchmarks": null, + "category_scores": null + }, { "id": "microsoft/phi-3.5-mini-128k-instruct", "name": "Microsoft: Phi-3.5 Mini 128K Instruct", @@ -10651,53 +10978,6 @@ "general": 0.886 } }, - { - "id": "meta-llama/llama-3.1-70b-instruct", - "name": "Meta: Llama 3.1 70B Instruct", - "context_length": 131072, - "architecture": { - "modality": "text->text", - "input_modalities": [ - "text" - ], - "output_modalities": [ - "text" - ], - "tokenizer": "Llama3", - "instruct_type": "llama3" - }, - "pricing": { - "prompt": "0.0000004", - "completion": "0.0000004", - "request": "0", - "image": "0", - "web_search": "0", - "internal_reasoning": "0" - }, - "benchmarks": { - "code": { - "humaneval": 0.805 - }, - "reasoning": { - "gpqa": 0.417, - "mmlu-pro": 0.664, - "mmlu": 0.836 - }, - "tool_calling": { - "bfcl": 0.848, - "nexus": 0.567 - }, - "general": { - "ifeval": 0.875 - } - }, - "category_scores": { - "code": 0.805, - "reasoning": 0.639, - "tool_calling": 0.7075, - "general": 0.875 - } - }, { "id": "meta-llama/llama-3.1-8b-instruct", "name": "Meta: Llama 3.1 8B Instruct", @@ -10745,6 +11025,53 @@ "general": 0.804 } }, + { + "id": "meta-llama/llama-3.1-70b-instruct", + "name": "Meta: Llama 3.1 70B Instruct", + "context_length": 131072, + "architecture": { + "modality": "text->text", + "input_modalities": [ + "text" + ], + "output_modalities": [ + "text" + ], + "tokenizer": "Llama3", + "instruct_type": "llama3" + }, + "pricing": { + "prompt": "0.0000004", + "completion": "0.0000004", + "request": "0", + "image": "0", + "web_search": "0", + "internal_reasoning": "0" + }, + "benchmarks": { + "code": { + "humaneval": 0.805 + }, + "reasoning": { + "gpqa": 0.417, + "mmlu-pro": 0.664, + "mmlu": 0.836 + }, + "tool_calling": { + "bfcl": 0.848, + "nexus": 0.567 + }, + "general": { + "ifeval": 0.875 + } + }, + "category_scores": { + "code": 0.805, + "reasoning": 0.639, + "tool_calling": 0.7075, + "general": 0.875 + } + }, { "id": "mistralai/mistral-nemo", "name": "Mistral: Mistral Nemo", @@ -11036,32 +11363,6 @@ "benchmarks": null, "category_scores": null }, - { - "id": "mistralai/mistral-7b-instruct-v0.3", - "name": "Mistral: Mistral 7B Instruct v0.3", - "context_length": 32768, - "architecture": { - "modality": "text->text", - "input_modalities": [ - "text" - ], - "output_modalities": [ - "text" - ], 
- "tokenizer": "Mistral", - "instruct_type": "mistral" - }, - "pricing": { - "prompt": "0.0000002", - "completion": "0.0000002", - "request": "0", - "image": "0", - "web_search": "0", - "internal_reasoning": "0" - }, - "benchmarks": null, - "category_scores": null - }, { "id": "nousresearch/hermes-2-pro-llama-3-8b", "name": "NousResearch: Hermes 2 Pro - Llama-3 8B", @@ -11088,6 +11389,32 @@ "benchmarks": null, "category_scores": null }, + { + "id": "mistralai/mistral-7b-instruct-v0.3", + "name": "Mistral: Mistral 7B Instruct v0.3", + "context_length": 32768, + "architecture": { + "modality": "text->text", + "input_modalities": [ + "text" + ], + "output_modalities": [ + "text" + ], + "tokenizer": "Mistral", + "instruct_type": "mistral" + }, + "pricing": { + "prompt": "0.0000002", + "completion": "0.0000002", + "request": "0", + "image": "0", + "web_search": "0", + "internal_reasoning": "0" + }, + "benchmarks": null, + "category_scores": null + }, { "id": "microsoft/phi-3-mini-128k-instruct", "name": "Microsoft: Phi-3 Mini 128K Instruct", @@ -11140,6 +11467,32 @@ "benchmarks": null, "category_scores": null }, + { + "id": "meta-llama/llama-guard-2-8b", + "name": "Meta: LlamaGuard 2 8B", + "context_length": 8192, + "architecture": { + "modality": "text->text", + "input_modalities": [ + "text" + ], + "output_modalities": [ + "text" + ], + "tokenizer": "Llama3", + "instruct_type": "none" + }, + "pricing": { + "prompt": "0.0000002", + "completion": "0.0000002", + "request": "0", + "image": "0", + "web_search": "0", + "internal_reasoning": "0" + }, + "benchmarks": null, + "category_scores": null + }, { "id": "openai/gpt-4o-2024-05-13", "name": "OpenAI: GPT-4o (2024-05-13)", @@ -11184,32 +11537,6 @@ "reasoning": 0.7163 } }, - { - "id": "meta-llama/llama-guard-2-8b", - "name": "Meta: LlamaGuard 2 8B", - "context_length": 8192, - "architecture": { - "modality": "text->text", - "input_modalities": [ - "text" - ], - "output_modalities": [ - "text" - ], - "tokenizer": "Llama3", - "instruct_type": "none" - }, - "pricing": { - "prompt": "0.0000002", - "completion": "0.0000002", - "request": "0", - "image": "0", - "web_search": "0", - "internal_reasoning": "0" - }, - "benchmarks": null, - "category_scores": null - }, { "id": "openai/gpt-4o", "name": "OpenAI: GPT-4o", @@ -11288,32 +11615,6 @@ "benchmarks": null, "category_scores": null }, - { - "id": "meta-llama/llama-3-8b-instruct", - "name": "Meta: Llama 3 8B Instruct", - "context_length": 8192, - "architecture": { - "modality": "text->text", - "input_modalities": [ - "text" - ], - "output_modalities": [ - "text" - ], - "tokenizer": "Llama3", - "instruct_type": "llama3" - }, - "pricing": { - "prompt": "0.00000003", - "completion": "0.00000006", - "request": "0", - "image": "0", - "web_search": "0", - "internal_reasoning": "0" - }, - "benchmarks": null, - "category_scores": null - }, { "id": "meta-llama/llama-3-70b-instruct", "name": "Meta: Llama 3 70B Instruct", @@ -11340,6 +11641,32 @@ "benchmarks": null, "category_scores": null }, + { + "id": "meta-llama/llama-3-8b-instruct", + "name": "Meta: Llama 3 8B Instruct", + "context_length": 8192, + "architecture": { + "modality": "text->text", + "input_modalities": [ + "text" + ], + "output_modalities": [ + "text" + ], + "tokenizer": "Llama3", + "instruct_type": "llama3" + }, + "pricing": { + "prompt": "0.00000003", + "completion": "0.00000006", + "request": "0", + "image": "0", + "web_search": "0", + "internal_reasoning": "0" + }, + "benchmarks": null, + "category_scores": null + }, { "id": 
"mistralai/mixtral-8x22b-instruct", "name": "Mistral: Mixtral 8x22B Instruct", diff --git a/scripts/merge_benchmarks.py b/scripts/merge_benchmarks.py index 9bd90f5..6f9bb56 100644 --- a/scripts/merge_benchmarks.py +++ b/scripts/merge_benchmarks.py @@ -6,7 +6,8 @@ This script: 1. Fetches all models from OpenRouter API 2. Fetches benchmark metadata from ZeroEval API 3. For key benchmarks in each category, fetches model scores -4. Creates a merged JSON with benchmark scores per category +4. Auto-detects model families and tracks latest versions +5. Creates a merged JSON with benchmark scores per category Categories tracked: - code: Coding benchmarks (SWE-bench, HumanEval, etc.) @@ -14,13 +15,20 @@ Categories tracked: - reasoning: Reasoning benchmarks (GPQA, MMLU, etc.) - tool_calling: Tool/function calling benchmarks - long_context: Long context benchmarks + +Model families tracked: +- claude-sonnet, claude-haiku, claude-opus (Anthropic) +- gpt-4, gpt-4-mini (OpenAI) +- gemini-pro, gemini-flash (Google) +- And more... """ import json +import re import time import sys from pathlib import Path -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Tuple, Union from urllib.request import Request, urlopen from urllib.error import URLError, HTTPError from collections import defaultdict @@ -55,6 +63,55 @@ KEY_BENCHMARKS = { ] } +# Model family patterns with tier classification +# Format: (regex_pattern, family_name, tier) +# Tier: "flagship" (best), "mid" (balanced), "fast" (cheap/fast) +MODEL_FAMILY_PATTERNS = [ + # Anthropic Claude + (r"^anthropic/claude-opus-(\d+\.?\d*)$", "claude-opus", "flagship"), + (r"^anthropic/claude-(\d+\.?\d*)-opus$", "claude-opus", "flagship"), + (r"^anthropic/claude-sonnet-(\d+\.?\d*)$", "claude-sonnet", "mid"), + (r"^anthropic/claude-(\d+\.?\d*)-sonnet$", "claude-sonnet", "mid"), + (r"^anthropic/claude-haiku-(\d+\.?\d*)$", "claude-haiku", "fast"), + (r"^anthropic/claude-(\d+\.?\d*)-haiku$", "claude-haiku", "fast"), + + # OpenAI GPT + (r"^openai/gpt-4\.1$", "gpt-4", "mid"), + (r"^openai/gpt-4o$", "gpt-4", "mid"), + (r"^openai/gpt-4-turbo", "gpt-4", "mid"), + (r"^openai/gpt-4\.1-mini$", "gpt-4-mini", "fast"), + (r"^openai/gpt-4o-mini$", "gpt-4-mini", "fast"), + (r"^openai/o1$", "o1", "flagship"), + (r"^openai/o1-preview", "o1", "flagship"), + (r"^openai/o1-mini", "o1-mini", "mid"), + (r"^openai/o3-mini", "o3-mini", "mid"), + + # Google Gemini + (r"^google/gemini-(\d+\.?\d*)-pro", "gemini-pro", "mid"), + (r"^google/gemini-pro", "gemini-pro", "mid"), + (r"^google/gemini-(\d+\.?\d*)-flash(?!-lite)", "gemini-flash", "fast"), + (r"^google/gemini-flash", "gemini-flash", "fast"), + + # DeepSeek + (r"^deepseek/deepseek-chat", "deepseek-chat", "mid"), + (r"^deepseek/deepseek-coder", "deepseek-coder", "mid"), + (r"^deepseek/deepseek-r1$", "deepseek-r1", "flagship"), + + # Mistral + (r"^mistralai/mistral-large", "mistral-large", "mid"), + (r"^mistralai/mistral-medium", "mistral-medium", "mid"), + (r"^mistralai/mistral-small", "mistral-small", "fast"), + + # Meta Llama + (r"^meta-llama/llama-3\.3-70b", "llama-3-70b", "mid"), + (r"^meta-llama/llama-3\.2-90b", "llama-3-90b", "mid"), + (r"^meta-llama/llama-3\.1-405b", "llama-3-405b", "flagship"), + + # Qwen + (r"^qwen/qwen-2\.5-72b", "qwen-72b", "mid"), + (r"^qwen/qwq-32b", "qwq", "mid"), +] + HEADERS = { "Accept": "application/json", "Origin": "https://llm-stats.com", @@ -121,6 +178,75 @@ def normalize_model_id(model_id: str) -> str: return "-".join(filtered) +def extract_version(model_id: 
str) -> Tuple[float, str]: + """ + Extract version number from model ID for sorting. + Returns (version_float, original_id); higher version = newer model. + """ + # Try to find version patterns like 4.5, 3.7, 2.5, etc. + patterns = [ + r"-(\d+\.?\d*)-", # e.g., claude-3.5-sonnet + r"-(\d+\.?\d*)$", # e.g., claude-sonnet-4.5 + r"(\d+\.?\d*)$", # e.g., openai/o1 (no hyphen before the digit) + r"/[a-z]+-(\d+\.?\d*)", # e.g., gpt-4.1 + ] + + for pattern in patterns: + match = re.search(pattern, model_id) + if match: + try: + return (float(match.group(1)), model_id) + except ValueError: + pass + + # Fallback: no version found; 0.0 sorts this member as oldest + return (0.0, model_id) + + +def infer_model_families(models: List[dict]) -> Dict[str, dict]: + """ + Infer model families from OpenRouter model list. + + Returns a dict like: + { + "claude-sonnet": { + "latest": "anthropic/claude-sonnet-4.5", + "members": ["anthropic/claude-sonnet-4.5", ...], + "tier": "mid" + } + } + """ + families: Dict[str, List[Tuple[str, float]]] = defaultdict(list) + family_tiers: Dict[str, str] = {} + + for model in models: + model_id = model.get("id", "") + + for pattern, family_name, tier in MODEL_FAMILY_PATTERNS: + if re.match(pattern, model_id): + version, _ = extract_version(model_id) + families[family_name].append((model_id, version)) + family_tiers[family_name] = tier + break + + # Sort each family by version (descending) and build result + result = {} + for family_name, members in families.items(): + # Sort by version descending (highest first = latest) + sorted_members = sorted(members, key=lambda x: x[1], reverse=True) + member_ids = [m[0] for m in sorted_members] + + if member_ids: + result[family_name] = { + "latest": member_ids[0], + "members": member_ids, + "tier": family_tiers.get(family_name, "mid") + } + + return result + + +def build_model_score_map(benchmarks_data: Dict[str, dict]) -> Dict[str, dict]: + """ + Build a map from normalized model names to their benchmark scores. @@ -182,6 +308,52 @@ return averages +def generate_aliases(families: Dict[str, dict]) -> Dict[str, str]: + """ + Generate common aliases that map to the latest model in a family. + + This helps resolve outdated model names like "claude-3.5-sonnet" + to the latest "anthropic/claude-sonnet-4.5".
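+ + Example (taken from the generated data): aliases["claude-3.5-sonnet"] == "anthropic/claude-sonnet-4.5", and the bare family name also maps there: aliases["sonnet"] == "anthropic/claude-sonnet-4.5".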
+ """ + aliases = {} + + for family_name, family_info in families.items(): + latest = family_info["latest"] + members = family_info["members"] + + # Add all members as aliases to latest + for member in members: + if member != latest: + aliases[member] = latest + + # Also add short forms + if "/" in member: + short = member.split("/")[-1] + aliases[short] = latest + + # Add family name as alias + aliases[family_name] = latest + + # Add common variations + if family_name == "claude-sonnet": + aliases["sonnet"] = latest + aliases["claude sonnet"] = latest + elif family_name == "claude-haiku": + aliases["haiku"] = latest + aliases["claude haiku"] = latest + elif family_name == "claude-opus": + aliases["opus"] = latest + aliases["claude opus"] = latest + elif family_name == "gpt-4": + aliases["gpt4"] = latest + aliases["gpt-4o"] = latest + elif family_name == "gpt-4-mini": + aliases["gpt4-mini"] = latest + aliases["gpt-4o-mini"] = latest + + return aliases + + def main(): print("=" * 60) print("OpenRouter + ZeroEval Benchmark Merger") @@ -199,7 +371,18 @@ def main(): json.dump({"data": openrouter_models}, f) print(f"Saved raw OpenRouter models to {or_path}") - # Step 2: Fetch all benchmark metadata + # Step 2: Infer model families + print("\nInferring model families...") + families = infer_model_families(openrouter_models) + print(f" Found {len(families)} model families:") + for name, info in sorted(families.items()): + print(f" - {name}: {info['latest']} ({len(info['members'])} members, tier={info['tier']})") + + # Generate aliases + aliases = generate_aliases(families) + print(f" Generated {len(aliases)} aliases for auto-upgrade") + + # Step 3: Fetch all benchmark metadata all_benchmarks = fetch_all_benchmarks() if not all_benchmarks: print("Failed to fetch benchmarks, exiting.") @@ -214,7 +397,7 @@ def main(): # Build benchmark ID lookup benchmark_lookup = {b["benchmark_id"]: b for b in all_benchmarks} - # Step 3: Fetch scores for key benchmarks in each category + # Step 4: Fetch scores for key benchmarks in each category print("\nFetching benchmark scores by category...") benchmarks_data = {} @@ -245,12 +428,12 @@ def main(): time.sleep(0.2) # Rate limiting - # Step 4: Build model score map + # Step 5: Build model score map print("\nBuilding model score map...") model_scores = build_model_score_map(benchmarks_data) print(f" Found scores for {len(model_scores)} unique model IDs") - # Step 5: Merge with OpenRouter models + # Step 6: Merge with OpenRouter models print("\nMerging with OpenRouter models...") merged_models = [] matched_count = 0 @@ -281,12 +464,14 @@ def main(): print(f" Matched {matched_count}/{len(openrouter_models)} models with benchmarks") - # Step 6: Save merged data + # Step 7: Save merged data with families output = { "generated_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), "total_models": len(merged_models), "models_with_benchmarks": matched_count, "categories": list(KEY_BENCHMARKS.keys()), + "families": families, + "aliases": aliases, "models": merged_models } @@ -295,14 +480,21 @@ def main(): json.dump(output, f, indent=2) print(f"\n✓ Saved merged data to {output_path}") - # Step 7: Create summary + # Step 8: Create summary print("\n" + "=" * 60) print("Summary") print("=" * 60) print(f"Total OpenRouter models: {len(openrouter_models)}") print(f"Models with benchmark data: {matched_count}") + print(f"Model families detected: {len(families)}") + print(f"Aliases generated: {len(aliases)}") print(f"Categories tracked: {', '.join(KEY_BENCHMARKS.keys())}") + # 
Show family info + print("\nModel families (latest versions):") + for name, info in sorted(families.items()): + print(f" - {name}: {info['latest']}") + # Show some example matches print("\nExample matched models:") for m in merged_models[:10]: diff --git a/src/agents/leaf/executor.rs b/src/agents/leaf/executor.rs index 4babb5e..b9a1584 100644 --- a/src/agents/leaf/executor.rs +++ b/src/agents/leaf/executor.rs @@ -523,13 +523,11 @@ Use `search_memory` when you encounter a problem you might have solved before or if let Some(tool_calls) = &response.tool_calls { if !tool_calls.is_empty() { // Add assistant message with tool calls - // Preserve reasoning_details for models that require it (Gemini 3, Claude 3.7+) messages.push(ChatMessage { role: Role::Assistant, content: response.content.clone().map(MessageContent::text), tool_calls: Some(tool_calls.clone()), tool_call_id: None, - reasoning_details: response.reasoning_details.clone(), }); // Check for repetitive actions @@ -717,7 +715,6 @@ Use `search_memory` when you encounter a problem you might have solved before or content: Some(message_content), tool_calls: None, tool_call_id: Some(tool_call.id.clone()), - reasoning_details: None, }); } @@ -859,11 +856,25 @@ impl Agent for TaskExecutor { async fn execute(&self, task: &mut Task, ctx: &AgentContext) -> AgentResult { // Use model selected during planning, otherwise fall back to default. - let selected = task - .analysis() - .selected_model - .clone() - .unwrap_or_else(|| ctx.config.default_model.clone()); + // If falling back to default, resolve it to latest version first. + let selected = if let Some(model) = task.analysis().selected_model.clone() { + model + } else { + // Resolve default model to latest version + if let Some(resolver) = &ctx.resolver { + let resolver = resolver.read().await; + let resolved = resolver.resolve(&ctx.config.default_model); + if resolved.upgraded { + tracing::info!( + "Executor: default model auto-upgraded: {} → {}", + resolved.original, resolved.resolved + ); + } + resolved.resolved + } else { + ctx.config.default_model.clone() + } + }; let model = selected.as_str(); let result = self.run_loop(task, model, ctx).await; @@ -909,11 +920,26 @@ impl Agent for TaskExecutor { impl TaskExecutor { /// Execute a task and return detailed execution result for retry analysis. pub async fn execute_with_signals(&self, task: &mut Task, ctx: &AgentContext) -> (AgentResult, ExecutionSignals) { - let selected = task - .analysis() - .selected_model - .clone() - .unwrap_or_else(|| ctx.config.default_model.clone()); + // Use model selected during planning, otherwise fall back to default. + // If falling back to default, resolve it to latest version first. 
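+ // (e.g., a configured default of "claude-3.5-sonnet" resolves to "anthropic/claude-sonnet-4.5" via the alias table).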
+ let selected = if let Some(model) = task.analysis().selected_model.clone() { + model + } else { + // Resolve default model to latest version + if let Some(resolver) = &ctx.resolver { + let resolver = resolver.read().await; + let resolved = resolver.resolve(&ctx.config.default_model); + if resolved.upgraded { + tracing::info!( + "Executor: default model auto-upgraded: {} → {}", + resolved.original, resolved.resolved + ); + } + resolved.resolved + } else { + ctx.config.default_model.clone() + } + }; let model = selected.as_str(); let result = self.run_loop(task, model, ctx).await; diff --git a/src/agents/leaf/model_select.rs b/src/agents/leaf/model_select.rs index 1468a83..52710c5 100644 --- a/src/agents/leaf/model_select.rs +++ b/src/agents/leaf/model_select.rs @@ -540,8 +540,20 @@ impl Agent for ModelSelector { let models = ctx.pricing.models_by_cost_filtered(true).await; if models.is_empty() { - // Fall back to configured default model - let default_model = ctx.config.default_model.clone(); + // Fall back to configured default model (after resolving to latest) + let default_model = if let Some(resolver) = &ctx.resolver { + let resolver = resolver.read().await; + let resolved = resolver.resolve(&ctx.config.default_model); + if resolved.upgraded { + tracing::info!( + "Default model auto-upgraded: {} → {}", + resolved.original, resolved.resolved + ); + } + resolved.resolved + } else { + ctx.config.default_model.clone() + }; // Record on task analysis { @@ -565,15 +577,43 @@ impl Agent for ModelSelector { })); } - // Get user-requested model - if specified, use it directly if available + // Get user-requested model - if specified, resolve to latest version and use it let requested_model = task.analysis().requested_model.clone(); - // If user explicitly requested a model and it's available, use it directly - if let Some(ref req_model) = requested_model { + // Auto-upgrade outdated model names using the resolver + let (resolved_model, was_upgraded) = if let Some(ref req_model) = requested_model { + if let Some(resolver) = &ctx.resolver { + let resolver = resolver.read().await; + let resolved = resolver.resolve(req_model); + if resolved.upgraded { + tracing::info!( + "Model auto-upgraded: {} → {} ({})", + resolved.original, + resolved.resolved, + resolved.reason.as_deref().unwrap_or("family upgrade") + ); + } + (Some(resolved.resolved), resolved.upgraded) + } else { + (Some(req_model.clone()), false) + } + } else { + (None, false) + }; + + // If user explicitly requested a model (possibly upgraded) and it's available, use it directly + if let Some(ref req_model) = resolved_model { if models.iter().any(|m| &m.model_id == req_model) { + let upgrade_note = if was_upgraded { + format!(" (auto-upgraded from {})", requested_model.as_deref().unwrap_or("unknown")) + } else { + String::new() + }; + tracing::info!( - "Using user-requested model directly: {} (not optimizing)", - req_model + "Using requested model directly: {}{}", + req_model, + upgrade_note ); // Record selection in analysis @@ -584,17 +624,19 @@ impl Agent for ModelSelector { } return AgentResult::success( - &format!("Using user-requested model: {}", req_model), + &format!("Using requested model: {}{}", req_model, upgrade_note), 1, ) .with_data(json!({ "model_id": req_model, "expected_cost_cents": 50, "confidence": 1.0, - "reasoning": format!("User explicitly requested model: {}", req_model), + "reasoning": format!("User requested model: {}{}", req_model, upgrade_note), "fallbacks": [], "used_historical_data": false, 
"used_benchmark_data": false, + "was_upgraded": was_upgraded, + "original_model": requested_model, "task_type": format!("{:?}", task_type), })); } @@ -607,7 +649,7 @@ impl Agent for ModelSelector { budget_cents, task_type, historical_stats.as_ref(), - requested_model.as_deref(), + resolved_model.as_deref(), ctx, ).await { Some(rec) => { diff --git a/src/agents/orchestrator/node.rs b/src/agents/orchestrator/node.rs index 8721db5..d45fa1a 100644 --- a/src/agents/orchestrator/node.rs +++ b/src/agents/orchestrator/node.rs @@ -12,36 +12,36 @@ use async_trait::async_trait; use serde_json::json; use crate::agents::{ - Agent, AgentContext, AgentId, AgentRef, AgentResult, AgentType, Complexity, OrchestratorAgent, leaf::{ComplexityEstimator, ModelSelector, TaskExecutor, Verifier}, + Agent, AgentContext, AgentId, AgentRef, AgentResult, AgentType, Complexity, OrchestratorAgent, }; use crate::budget::Budget; use crate::llm::{ChatMessage, Role}; -use crate::task::{Task, Subtask, SubtaskPlan, VerificationCriteria}; +use crate::task::{Subtask, SubtaskPlan, Task, VerificationCriteria}; /// Node agent - intermediate orchestrator. -/// +/// /// # Purpose /// Handles subtasks that may still be complex enough /// to warrant further splitting. Now with full recursive /// splitting capabilities like RootAgent. -/// +/// /// # Recursive Splitting /// NodeAgent can estimate complexity of its subtasks and /// recursively split them if they're still too complex, /// respecting the `max_split_depth` in context. pub struct NodeAgent { id: AgentId, - + /// Name for identification in logs name: String, - + // Child agents - full pipeline for recursive splitting complexity_estimator: Arc, model_selector: Arc, task_executor: Arc, verifier: Arc, - + // Child node agents (for further splitting) child_nodes: Vec>, } @@ -79,22 +79,25 @@ impl NodeAgent { /// Estimate complexity of a task. async fn estimate_complexity(&self, task: &mut Task, ctx: &AgentContext) -> Complexity { let result = self.complexity_estimator.execute(task, ctx).await; - + if let Some(data) = result.data { let score = data["score"].as_f64().unwrap_or(0.5); let reasoning = data["reasoning"].as_str().unwrap_or("").to_string(); let estimated_tokens = data["estimated_tokens"].as_u64().unwrap_or(2000); let should_split = data["should_split"].as_bool().unwrap_or(false); - - Complexity::new(score, reasoning, estimated_tokens) - .with_split(should_split) + + Complexity::new(score, reasoning, estimated_tokens).with_split(should_split) } else { Complexity::moderate("Could not estimate complexity") } } /// Split a complex task into subtasks. - async fn split_task(&self, task: &Task, ctx: &AgentContext) -> Result { + async fn split_task( + &self, + task: &Task, + ctx: &AgentContext, + ) -> Result { let prompt = format!( r#"You are a task planner. Break down this task into smaller, manageable subtasks. @@ -135,11 +138,15 @@ Respond ONLY with the JSON object."#, ); let messages = vec![ - ChatMessage::new(Role::System, "You are a precise task planner. Respond only with JSON."), + ChatMessage::new( + Role::System, + "You are a precise task planner. Respond only with JSON.", + ), ChatMessage::new(Role::User, prompt), ]; - let response = ctx.llm + let response = ctx + .llm .chat_completion("openai/gpt-4.1-mini", &messages, None) .await .map_err(|e| AgentResult::failure(format!("LLM error: {}", e), 1))?; @@ -151,7 +158,7 @@ Respond ONLY with the JSON object."#, /// Extract JSON from LLM response (handles markdown code blocks). 
fn extract_json(response: &str) -> String { let trimmed = response.trim(); - + // Check for markdown code block if trimmed.starts_with("```") { // Find the end of the opening fence @@ -163,7 +170,7 @@ Respond ONLY with the JSON object."#, } } } - + // Try to find JSON object in the response if let Some(start) = trimmed.find('{') { if let Some(end) = trimmed.rfind('}') { @@ -172,7 +179,7 @@ Respond ONLY with the JSON object."#, } } } - + // Return as-is if no extraction needed trimmed.to_string() } @@ -184,8 +191,16 @@ Respond ONLY with the JSON object."#, parent_id: crate::task::TaskId, ) -> Result { let extracted = Self::extract_json(response); - let json: serde_json::Value = serde_json::from_str(&extracted) - .map_err(|e| AgentResult::failure(format!("Failed to parse subtasks: {} (raw: {}...)", e, response.chars().take(100).collect::()), 0))?; + let json: serde_json::Value = serde_json::from_str(&extracted).map_err(|e| { + AgentResult::failure( + format!( + "Failed to parse subtasks: {} (raw: {}...)", + e, + response.chars().take(100).collect::() + ), + 0, + ) + })?; let reasoning = json["reasoning"] .as_str() @@ -200,7 +215,7 @@ Respond ONLY with the JSON object."#, let desc = s["description"].as_str().unwrap_or("").to_string(); let verification = s["verification"].as_str().unwrap_or(""); let weight = s["weight"].as_f64().unwrap_or(1.0); - + // Parse dependencies array let dependencies: Vec = s["dependencies"] .as_array() @@ -210,12 +225,9 @@ Respond ONLY with the JSON object."#, .collect() }) .unwrap_or_default(); - - Subtask::new( - desc, - VerificationCriteria::llm_based(verification), - weight, - ).with_dependencies(dependencies) + + Subtask::new(desc, VerificationCriteria::llm_based(verification), weight) + .with_dependencies(dependencies) }) .collect() }) @@ -266,7 +278,7 @@ Respond ONLY with the JSON object."#, // Create a child NodeAgent for this subtask (recursive) let child_node = NodeAgent::new(format!("{}-sub", self.name)); - + // Execute through the child node (which may split further) let result = child_node.execute(task, &child_ctx).await; total_cost += result.cost_cents; @@ -278,19 +290,21 @@ Respond ONLY with the JSON object."#, let successes = results.iter().filter(|r| r.success).count(); let total = results.len(); + // Concatenate successful outputs for meaningful aggregation + let combined_output = Self::concatenate_outputs(&results); + if successes == total { - AgentResult::success( - format!("All {} subtasks completed successfully", total), - total_cost, - ) - .with_data(json!({ + AgentResult::success(combined_output, total_cost).with_data(json!({ "subtasks_total": total, "subtasks_succeeded": successes, "results": results.iter().map(|r| &r.output).collect::>(), })) } else { AgentResult::failure( - format!("{}/{} subtasks succeeded", successes, total), + format!( + "{}/{} subtasks succeeded\n\n{}", + successes, total, combined_output + ), total_cost, ) .with_data(json!({ @@ -304,6 +318,31 @@ Respond ONLY with the JSON object."#, } } + /// Concatenate subtask outputs into a single string. + /// Used for intermediate aggregation (RootAgent handles final synthesis). 
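+ /// e.g., two successful outputs are rendered as "### Part 1" and "### Part 2" sections joined by blank lines; failed or empty outputs are skipped.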
+ fn concatenate_outputs(results: &[AgentResult]) -> String { + let outputs: Vec = results + .iter() + .enumerate() + .filter(|(_, r)| r.success && !r.output.is_empty()) + .map(|(i, r)| { + if results.len() == 1 { + r.output.clone() + } else { + format!("### Part {}\n{}", i + 1, r.output) + } + }) + .collect(); + + if outputs.is_empty() { + "No output generated.".to_string() + } else if outputs.len() == 1 { + outputs.into_iter().next().unwrap() + } else { + outputs.join("\n\n") + } + } + /// Execute with tree updates for visualization. /// This method updates the parent's tree structure as this node executes. pub async fn execute_with_tree( @@ -315,7 +354,7 @@ Respond ONLY with the JSON object."#, emit_ctx: &AgentContext, ) -> AgentResult { use crate::api::control::AgentTreeNode; - + let mut total_cost = 0u64; tracing::info!( @@ -326,7 +365,11 @@ Respond ONLY with the JSON object."#, ); // Step 1: Estimate complexity - ctx.emit_phase("estimating_complexity", Some("Analyzing subtask..."), Some(&self.name)); + ctx.emit_phase( + "estimating_complexity", + Some("Analyzing subtask..."), + Some(&self.name), + ); let complexity = self.estimate_complexity(task, ctx).await; total_cost += 1; @@ -346,15 +389,21 @@ Respond ONLY with the JSON object."#, // Step 2: Decide execution strategy if complexity.should_split() && ctx.can_split() { - ctx.emit_phase("splitting_task", Some("Decomposing subtask..."), Some(&self.name)); + ctx.emit_phase( + "splitting_task", + Some("Decomposing subtask..."), + Some(&self.name), + ); tracing::info!("NodeAgent '{}' splitting task into sub-subtasks", self.name); match self.split_task(task, ctx).await { Ok(plan) => { total_cost += 2; - + // Add child nodes to this node in the tree - if let Some(parent_node) = root_tree.children.iter_mut().find(|n| n.id == node_id) { + if let Some(parent_node) = + root_tree.children.iter_mut().find(|n| n.id == node_id) + { for (i, subtask) in plan.subtasks().iter().enumerate() { let child_node = AgentTreeNode::new( &format!("{}-sub-{}", node_id, i + 1), @@ -367,7 +416,7 @@ Respond ONLY with the JSON object."#, } } emit_ctx.emit_tree(root_tree.clone()); - + let subtask_count = plan.subtasks().len(); tracing::info!( "NodeAgent '{}' created {} sub-subtasks", @@ -378,8 +427,18 @@ Respond ONLY with the JSON object."#, // Execute subtasks recursively with tree updates let child_ctx = ctx.child_context(); let requested_model = task.analysis().requested_model.as_deref(); - let result = self.execute_subtasks_with_tree(plan, task.budget(), &child_ctx, node_id, root_tree, emit_ctx, requested_model).await; - + let result = self + .execute_subtasks_with_tree( + plan, + task.budget(), + &child_ctx, + node_id, + root_tree, + emit_ctx, + requested_model, + ) + .await; + return AgentResult { success: result.success, output: result.output, @@ -407,7 +466,7 @@ Respond ONLY with the JSON object."#, "Task Executor", "Execute subtask", ) - .with_status("running") + .with_status("running"), ); parent_node.children.push( AgentTreeNode::new( @@ -416,13 +475,17 @@ Respond ONLY with the JSON object."#, "Verifier", "Verify result", ) - .with_status("pending") + .with_status("pending"), ); } emit_ctx.emit_tree(root_tree.clone()); // Select model - ctx.emit_phase("selecting_model", Some("Choosing model..."), Some(&self.name)); + ctx.emit_phase( + "selecting_model", + Some("Choosing model..."), + Some(&self.name), + ); let sel_result = self.model_selector.execute(task, ctx).await; total_cost += sel_result.cost_cents; @@ -433,8 +496,16 @@ Respond ONLY with the JSON 
object."#, // Update executor status if let Some(parent_node) = root_tree.children.iter_mut().find(|n| n.id == node_id) { - if let Some(exec_node) = parent_node.children.iter_mut().find(|n| n.id == format!("{}-executor", node_id)) { - exec_node.status = if result.success { "completed".to_string() } else { "failed".to_string() }; + if let Some(exec_node) = parent_node + .children + .iter_mut() + .find(|n| n.id == format!("{}-executor", node_id)) + { + exec_node.status = if result.success { + "completed".to_string() + } else { + "failed".to_string() + }; exec_node.budget_spent = result.cost_cents; } } @@ -444,18 +515,21 @@ Respond ONLY with the JSON object."#, task.set_last_output(result.output.clone()); if !result.success { - return AgentResult::failure(result.output, total_cost) - .with_data(json!({ - "node_name": self.name, - "complexity": complexity.score(), - "was_split": false, - "execution": result.data, - })); + return AgentResult::failure(result.output, total_cost).with_data(json!({ + "node_name": self.name, + "complexity": complexity.score(), + "was_split": false, + "execution": result.data, + })); } // Verify if let Some(parent_node) = root_tree.children.iter_mut().find(|n| n.id == node_id) { - if let Some(ver_node) = parent_node.children.iter_mut().find(|n| n.id == format!("{}-verifier", node_id)) { + if let Some(ver_node) = parent_node + .children + .iter_mut() + .find(|n| n.id == format!("{}-verifier", node_id)) + { ver_node.status = "running".to_string(); } } @@ -467,8 +541,16 @@ Respond ONLY with the JSON object."#, // Update verifier status if let Some(parent_node) = root_tree.children.iter_mut().find(|n| n.id == node_id) { - if let Some(ver_node) = parent_node.children.iter_mut().find(|n| n.id == format!("{}-verifier", node_id)) { - ver_node.status = if verification.success { "completed".to_string() } else { "failed".to_string() }; + if let Some(ver_node) = parent_node + .children + .iter_mut() + .find(|n| n.id == format!("{}-verifier", node_id)) + { + ver_node.status = if verification.success { + "completed".to_string() + } else { + "failed".to_string() + }; ver_node.budget_spent = verification.cost_cents; } } @@ -531,10 +613,16 @@ Respond ONLY with the JSON object."#, for (i, task) in tasks.iter_mut().enumerate() { let subtask_id = format!("{}-sub-{}", parent_node_id, i + 1); - + // Update subtask status to running - if let Some(parent_node) = root_tree.children.iter_mut().find(|n| n.id == parent_node_id) { - if let Some(child_node) = parent_node.children.iter_mut().find(|n| n.id == subtask_id) { + if let Some(parent_node) = root_tree + .children + .iter_mut() + .find(|n| n.id == parent_node_id) + { + if let Some(child_node) = + parent_node.children.iter_mut().find(|n| n.id == subtask_id) + { child_node.status = "running".to_string(); } } @@ -552,9 +640,19 @@ Respond ONLY with the JSON object."#, total_cost += result.cost_cents; // Update subtask status - if let Some(parent_node) = root_tree.children.iter_mut().find(|n| n.id == parent_node_id) { - if let Some(child_node) = parent_node.children.iter_mut().find(|n| n.id == subtask_id) { - child_node.status = if result.success { "completed".to_string() } else { "failed".to_string() }; + if let Some(parent_node) = root_tree + .children + .iter_mut() + .find(|n| n.id == parent_node_id) + { + if let Some(child_node) = + parent_node.children.iter_mut().find(|n| n.id == subtask_id) + { + child_node.status = if result.success { + "completed".to_string() + } else { + "failed".to_string() + }; child_node.budget_spent = 
result.cost_cents; } } @@ -566,18 +664,20 @@ Respond ONLY with the JSON object."#, let successes = results.iter().filter(|r| r.success).count(); let total = results.len(); + // Concatenate successful outputs for meaningful aggregation + let combined_output = Self::concatenate_outputs(&results); + if successes == total { - AgentResult::success( - format!("All {} sub-subtasks completed successfully", total), - total_cost, - ) - .with_data(json!({ + AgentResult::success(combined_output, total_cost).with_data(json!({ "subtasks_total": total, "subtasks_succeeded": successes, })) } else { AgentResult::failure( - format!("{}/{} sub-subtasks succeeded", successes, total), + format!( + "{}/{} sub-subtasks succeeded\n\n{}", + successes, total, combined_output + ), total_cost, ) .with_data(json!({ @@ -619,7 +719,11 @@ impl Agent for NodeAgent { ); // Step 1: Estimate complexity - ctx.emit_phase("estimating_complexity", Some("Analyzing subtask..."), Some(&self.name)); + ctx.emit_phase( + "estimating_complexity", + Some("Analyzing subtask..."), + Some(&self.name), + ); let complexity = self.estimate_complexity(task, ctx).await; total_cost += 1; @@ -634,13 +738,17 @@ impl Agent for NodeAgent { // Step 2: Decide execution strategy if complexity.should_split() && ctx.can_split() { // Complex subtask: split further recursively - ctx.emit_phase("splitting_task", Some("Decomposing subtask..."), Some(&self.name)); + ctx.emit_phase( + "splitting_task", + Some("Decomposing subtask..."), + Some(&self.name), + ); tracing::info!("NodeAgent '{}' splitting task into sub-subtasks", self.name); match self.split_task(task, ctx).await { Ok(plan) => { total_cost += 2; // Splitting cost - + let subtask_count = plan.subtasks().len(); tracing::info!( "NodeAgent '{}' created {} sub-subtasks", @@ -650,8 +758,10 @@ impl Agent for NodeAgent { // Execute subtasks recursively let requested_model = task.analysis().requested_model.as_deref(); - let result = self.execute_subtasks(plan, task.budget(), ctx, requested_model).await; - + let result = self + .execute_subtasks(plan, task.budget(), ctx, requested_model) + .await; + return AgentResult { success: result.success, output: result.output, @@ -672,7 +782,11 @@ impl Agent for NodeAgent { // Simple task or failed to split: execute directly // Select model - ctx.emit_phase("selecting_model", Some("Choosing model..."), Some(&self.name)); + ctx.emit_phase( + "selecting_model", + Some("Choosing model..."), + Some(&self.name), + ); let sel_result = self.model_selector.execute(task, ctx).await; total_cost += sel_result.cost_cents; @@ -685,13 +799,12 @@ impl Agent for NodeAgent { task.set_last_output(result.output.clone()); if !result.success { - return AgentResult::failure(result.output, total_cost) - .with_data(json!({ - "node_name": self.name, - "complexity": complexity.score(), - "was_split": false, - "execution": result.data, - })); + return AgentResult::failure(result.output, total_cost).with_data(json!({ + "node_name": self.name, + "complexity": complexity.score(), + "was_split": false, + "execution": result.data, + })); } // Verify @@ -747,7 +860,9 @@ impl OrchestratorAgent for NodeAgent { fn find_child(&self, agent_type: AgentType) -> Option { match agent_type { - AgentType::ComplexityEstimator => Some(Arc::clone(&self.complexity_estimator) as AgentRef), + AgentType::ComplexityEstimator => { + Some(Arc::clone(&self.complexity_estimator) as AgentRef) + } AgentType::ModelSelector => Some(Arc::clone(&self.model_selector) as AgentRef), AgentType::TaskExecutor => 
@@ -772,4 +887,3 @@ impl OrchestratorAgent for NodeAgent {
         results
     }
 }
-
diff --git a/src/budget/mod.rs b/src/budget/mod.rs
index 10f2182..021e351 100644
--- a/src/budget/mod.rs
+++ b/src/budget/mod.rs
@@ -6,16 +6,19 @@
 //! - Allocation: algorithms for distributing budget across subtasks
 //! - Retry: smart retry strategies for budget overflow
 //! - Benchmarks: model capability scores for task-aware selection
+//! - Resolver: auto-upgrade outdated model names to latest equivalents

 mod budget;
 mod pricing;
 mod allocation;
 mod retry;
 pub mod benchmarks;
+pub mod resolver;

 pub use budget::{Budget, BudgetError};
 pub use pricing::{ModelPricing, PricingInfo};
 pub use allocation::{AllocationStrategy, allocate_budget};
 pub use retry::{ExecutionSignals, FailureAnalysis, FailureMode, RetryRecommendation, RetryConfig};
 pub use benchmarks::{TaskType, BenchmarkRegistry, SharedBenchmarkRegistry, load_benchmarks};
+pub use resolver::{ModelResolver, ModelFamily, ResolvedModel, SharedModelResolver, load_resolver};
diff --git a/src/budget/pricing.rs b/src/budget/pricing.rs
index 56a6827..411caae 100644
--- a/src/budget/pricing.rs
+++ b/src/budget/pricing.rs
@@ -208,34 +208,73 @@ impl ModelPricing {
     /// - Models with $0 pricing
     /// - "Lite" or small model variants
     /// - Models not in the explicit allowlist
+    ///
+    /// # Model Allowlist Maintenance
+    /// This list should be kept in sync with the model families defined in
+    /// `models_with_benchmarks.json` (generated by `scripts/merge_benchmarks.py`).
+    /// The ModelResolver auto-upgrades outdated model names to latest versions.
     pub async fn models_by_cost_filtered(&self, require_tools: bool) -> Vec<String> {
         // Explicitly allowed model patterns (exact match or prefix with version suffix like -001)
-        // These are the ONLY models that will be considered for task execution
+        // These are the ONLY models that will be considered for task execution.
+        //
+        // IMPORTANT: Keep in sync with MODEL_FAMILY_PATTERNS in scripts/merge_benchmarks.py
+        // When new model versions are released, add them here and run the merge script.
         const CAPABLE_MODEL_BASES: &[&str] = &[
-            // Claude family (all sizes work great)
+            // === Anthropic Claude ===
+            // Flagship tier
+            "anthropic/claude-opus-4.5",
+            "anthropic/claude-opus-4",
+            // Mid tier (balanced cost/performance)
             "anthropic/claude-sonnet-4.5",
             "anthropic/claude-sonnet-4",
             "anthropic/claude-3.7-sonnet",
             "anthropic/claude-3.5-sonnet",
+            // Fast tier (cheap/fast)
             "anthropic/claude-haiku-4.5",
             "anthropic/claude-3.5-haiku",
             "anthropic/claude-3-haiku",
-            // OpenAI GPT-4 family
-            "openai/gpt-4o",
-            "openai/gpt-4o-mini",
-            "openai/gpt-4-turbo",
+
+            // === OpenAI ===
+            // Flagship tier
+            "openai/o1",
+            "openai/o1-preview",
+            // Mid tier
             "openai/gpt-4.1",
+            "openai/gpt-4o",
+            "openai/gpt-4-turbo",
+            "openai/o1-mini",
+            "openai/o3-mini",
+            // Fast tier
             "openai/gpt-4.1-mini",
-            // Google Gemini (large models ONLY - no lite/flash-lite)
-            "google/gemini-pro",
-            "google/gemini-1.5-pro",
+            "openai/gpt-4o-mini",
+
+            // === Google Gemini ===
+            // Mid tier (large models ONLY - no lite/flash-lite)
             "google/gemini-2.5-pro",
-            // Mistral large models
+            "google/gemini-1.5-pro",
+            "google/gemini-pro",
+            // Fast tier
+            "google/gemini-2.0-flash",
+            "google/gemini-1.5-flash",
+
+            // === Mistral ===
             "mistralai/mistral-large",
             "mistralai/mistral-medium",
-            // DeepSeek large
+            "mistralai/mistral-small",
+
+            // === DeepSeek ===
+            "deepseek/deepseek-r1",
             "deepseek/deepseek-chat",
             "deepseek/deepseek-coder",
+
+            // === Meta Llama ===
+            "meta-llama/llama-3.3-70b",
+            "meta-llama/llama-3.2-90b",
+            "meta-llama/llama-3.1-405b",
+
+            // === Qwen ===
+            "qwen/qwen-2.5-72b",
+            "qwen/qwq-32b",
         ];

         // Patterns to exclude even if they match an allowed base
diff --git a/src/budget/resolver.rs b/src/budget/resolver.rs
new file mode 100644
index 0000000..0da6518
--- /dev/null
+++ b/src/budget/resolver.rs
@@ -0,0 +1,345 @@
+//! Model resolver for auto-upgrading outdated model names.
+//!
+//! # Problem
+//! AI models often suggest outdated model versions (e.g., "claude-3.5-sonnet")
+//! because their training data is stale. Newer models are typically cheaper and
+//! smarter, so we want to automatically upgrade to the latest equivalent.
+//!
+//! # Solution
+//! The `ModelResolver` maintains a mapping of:
+//! - Model families (claude-sonnet, gpt-4, etc.) with their latest versions
+//! - Aliases from old model IDs to new ones
+//!
+//! When a model is requested, the resolver:
+//! 1. Checks if it's an outdated family member
+//! 2. Returns the latest equivalent with upgrade info
+//!
+//! # Data Source
+//! Families and aliases are loaded from `models_with_benchmarks.json`,
+//! which is auto-generated by `scripts/merge_benchmarks.py`.
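+//!
+//! # Example
+//! A minimal sketch of the intended flow (illustrative only; assumes the
+//! mappings above have been generated):
+//!
+//! ```ignore
+//! let resolver = ModelResolver::load_from_file("models_with_benchmarks.json")?;
+//! let r = resolver.resolve("claude-3.5-sonnet");
+//! assert!(r.upgraded);
+//! assert_eq!(r.resolved, "anthropic/claude-sonnet-4.5");
+//! ```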
+
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+use std::path::Path;
+use std::sync::Arc;
+use tokio::sync::RwLock;
+
+/// Information about a model family.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ModelFamily {
+    /// The latest (recommended) model in this family
+    pub latest: String,
+    /// All members of this family (sorted by version, latest first)
+    pub members: Vec<String>,
+    /// Performance tier: "flagship", "mid", or "fast"
+    pub tier: String,
+}
+
+/// Result of resolving a model ID.
+#[derive(Debug, Clone)]
+pub struct ResolvedModel {
+    /// Original model ID that was requested
+    pub original: String,
+    /// Resolved model ID (may be same or upgraded)
+    pub resolved: String,
+    /// Whether the model was upgraded
+    pub upgraded: bool,
+    /// Reason for upgrade (if any)
+    pub reason: Option<String>,
+    /// The family this model belongs to (if known)
+    pub family: Option<String>,
+}
+
+impl ResolvedModel {
+    /// Create a result for an unchanged model.
+    pub fn unchanged(model_id: &str) -> Self {
+        Self {
+            original: model_id.to_string(),
+            resolved: model_id.to_string(),
+            upgraded: false,
+            reason: None,
+            family: None,
+        }
+    }
+
+    /// Create a result for an upgraded model.
+    pub fn upgraded(original: &str, resolved: &str, reason: &str, family: Option<&str>) -> Self {
+        Self {
+            original: original.to_string(),
+            resolved: resolved.to_string(),
+            upgraded: true,
+            reason: Some(reason.to_string()),
+            family: family.map(|s| s.to_string()),
+        }
+    }
+}
+
+/// Model resolver with family-based auto-upgrade.
+#[derive(Debug, Default)]
+pub struct ModelResolver {
+    /// Model families: family_name -> ModelFamily
+    families: HashMap<String, ModelFamily>,
+    /// Direct aliases: old_model_id -> new_model_id
+    aliases: HashMap<String, String>,
+    /// Reverse lookup: model_id -> family_name
+    model_to_family: HashMap<String, String>,
+}
+
+impl ModelResolver {
+    /// Create an empty resolver.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Load resolver data from the benchmark JSON file.
+    pub fn load_from_file(path: impl AsRef<Path>) -> Result<Self, String> {
+        let content = std::fs::read_to_string(path.as_ref())
+            .map_err(|e| format!("Failed to read resolver data: {}", e))?;
+
+        Self::load_from_json(&content)
+    }
+
+    /// Load resolver data from JSON string.
+    pub fn load_from_json(json: &str) -> Result<Self, String> {
+        #[derive(Deserialize)]
+        struct BenchmarkFile {
+            #[serde(default)]
+            families: HashMap<String, ModelFamily>,
+            #[serde(default)]
+            aliases: HashMap<String, String>,
+        }
+
+        let data: BenchmarkFile = serde_json::from_str(json)
+            .map_err(|e| format!("Failed to parse resolver data: {}", e))?;
+
+        let mut resolver = Self {
+            families: data.families.clone(),
+            aliases: data.aliases,
+            model_to_family: HashMap::new(),
+        };
+
+        // Build reverse lookup
+        for (family_name, family) in &data.families {
+            for member in &family.members {
+                resolver
+                    .model_to_family
+                    .insert(member.clone(), family_name.clone());
+            }
+        }
+
+        tracing::info!(
+            "Loaded model resolver: {} families, {} aliases",
+            resolver.families.len(),
+            resolver.aliases.len()
+        );
+
+        Ok(resolver)
+    }
+
+    /// Resolve a potentially outdated model ID to the latest equivalent.
+    ///
+    /// # Examples
+    /// - "claude-3.5-sonnet" → "anthropic/claude-sonnet-4.5" (upgraded)
+    /// - "anthropic/claude-sonnet-4.5" → "anthropic/claude-sonnet-4.5" (unchanged)
+    /// - "gpt-4o" → "openai/gpt-4.1" (upgraded)
+    /// - "unknown-model" → "unknown-model" (unchanged, not in families)
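+    ///
+    /// A doctest-style sketch of the alias path (illustrative, not compiled;
+    /// assumes the mappings from the module docs are loaded):
+    ///
+    /// ```ignore
+    /// let r = resolver.resolve("gpt-4o");
+    /// assert!(r.upgraded);
+    /// assert_eq!(r.resolved, "openai/gpt-4.1");
+    /// assert_eq!(r.family.as_deref(), Some("gpt-4"));
+    /// ```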
+    pub fn resolve(&self, model_id: &str) -> ResolvedModel {
+        // 1. Check direct alias first (covers short names and old versions)
+        if let Some(target) = self.aliases.get(model_id) {
+            let family = self.model_to_family.get(target).map(|s| s.as_str());
+            return ResolvedModel::upgraded(
+                model_id,
+                target,
+                "Alias resolved to latest",
+                family,
+            );
+        }
+
+        // 2. Check if model is in a family but not the latest
+        if let Some(family_name) = self.model_to_family.get(model_id) {
+            if let Some(family) = self.families.get(family_name) {
+                if model_id != family.latest {
+                    return ResolvedModel::upgraded(
+                        model_id,
+                        &family.latest,
+                        &format!("Upgraded to latest {} model", family_name),
+                        Some(family_name),
+                    );
+                } else {
+                    // Already the latest
+                    return ResolvedModel {
+                        original: model_id.to_string(),
+                        resolved: model_id.to_string(),
+                        upgraded: false,
+                        reason: None,
+                        family: Some(family_name.clone()),
+                    };
+                }
+            }
+        }
+
+        // 3. Try fuzzy matching by normalizing the model name
+        let normalized = Self::normalize(model_id);
+        if let Some(target) = self.aliases.get(&normalized) {
+            let family = self.model_to_family.get(target).map(|s| s.as_str());
+            return ResolvedModel::upgraded(
+                model_id,
+                target,
+                "Fuzzy match to latest",
+                family,
+            );
+        }
+
+        // 4. Try to match family name directly. Normalize the family name too:
+        //    `normalized` has separators stripped, so it can never contain a
+        //    hyphenated name like "claude-sonnet" verbatim.
+        for (family_name, family) in &self.families {
+            let family_norm = Self::normalize(family_name);
+            if normalized.contains(&family_norm) || family_norm.contains(&normalized) {
+                return ResolvedModel::upgraded(
+                    model_id,
+                    &family.latest,
+                    &format!("Matched to {} family", family_name),
+                    Some(family_name),
+                );
+            }
+        }
+
+        // 5. No match - return as-is
+        ResolvedModel::unchanged(model_id)
+    }
+
+    /// Check if a model ID exists and is the latest in its family.
+    pub fn is_latest(&self, model_id: &str) -> bool {
+        if let Some(family_name) = self.model_to_family.get(model_id) {
+            if let Some(family) = self.families.get(family_name) {
+                return model_id == family.latest;
+            }
+        }
+        // Unknown models are considered "latest" (no upgrade available)
+        true
+    }
+
+    /// Get the family a model belongs to.
+    pub fn get_family(&self, model_id: &str) -> Option<&ModelFamily> {
+        self.model_to_family
+            .get(model_id)
+            .and_then(|name| self.families.get(name))
+    }
+
+    /// Get all model families.
+    pub fn families(&self) -> &HashMap<String, ModelFamily> {
+        &self.families
+    }
+
+    /// Get all known latest model IDs (one per family).
+    pub fn latest_models(&self) -> Vec<&str> {
+        self.families.values().map(|f| f.latest.as_str()).collect()
+    }
+
+    /// Get the latest model ID from each family in a tier ("flagship", "mid", "fast").
+    pub fn models_by_tier(&self, tier: &str) -> Vec<&str> {
+        self.families
+            .values()
+            .filter(|f| f.tier == tier)
+            .map(|f| f.latest.as_str())
+            .collect()
+    }
+
+    /// Normalize a model ID for fuzzy matching.
+    fn normalize(model_id: &str) -> String {
+        model_id
+            .to_lowercase()
+            .replace([':', '-', '_', '.', '/'], "")
+    }
+}
+
+/// Thread-safe model resolver wrapper.
+pub type SharedModelResolver = Arc<RwLock<ModelResolver>>;
+
+/// Create a shared model resolver, loading from default path.
+pub fn load_resolver(workspace_dir: &str) -> SharedModelResolver {
+    let path = format!("{}/models_with_benchmarks.json", workspace_dir);
+
+    match ModelResolver::load_from_file(&path) {
+        Ok(resolver) => {
+            tracing::info!("Loaded model resolver from {}", path);
+            Arc::new(RwLock::new(resolver))
+        }
+        Err(e) => {
+            tracing::warn!("Failed to load resolver: {}. Using empty resolver.", e);
+            Arc::new(RwLock::new(ModelResolver::new()))
+        }
+    }
+}
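+
+// Illustrative call-site sketch (assumes a tokio async context; the path and
+// variable names below are hypothetical, not part of this module's API):
+//
+//     let resolver: SharedModelResolver = load_resolver("/workspace");
+//     let resolved = resolver.read().await.resolve("claude-3.5-sonnet");
+//     tracing::info!("using model {}", resolved.resolved);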
Using empty resolver.", e); + Arc::new(RwLock::new(ModelResolver::new())) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn test_resolver() -> ModelResolver { + let json = r#"{ + "families": { + "claude-sonnet": { + "latest": "anthropic/claude-sonnet-4.5", + "members": ["anthropic/claude-sonnet-4.5", "anthropic/claude-3.7-sonnet", "anthropic/claude-3.5-sonnet"], + "tier": "mid" + }, + "gpt-4": { + "latest": "openai/gpt-4.1", + "members": ["openai/gpt-4.1", "openai/gpt-4o"], + "tier": "mid" + } + }, + "aliases": { + "claude-3.5-sonnet": "anthropic/claude-sonnet-4.5", + "sonnet": "anthropic/claude-sonnet-4.5", + "gpt-4o": "openai/gpt-4.1", + "gpt4": "openai/gpt-4.1" + } + }"#; + ModelResolver::load_from_json(json).unwrap() + } + + #[test] + fn test_resolve_alias() { + let resolver = test_resolver(); + + let result = resolver.resolve("claude-3.5-sonnet"); + assert!(result.upgraded); + assert_eq!(result.resolved, "anthropic/claude-sonnet-4.5"); + } + + #[test] + fn test_resolve_family_member() { + let resolver = test_resolver(); + + let result = resolver.resolve("anthropic/claude-3.7-sonnet"); + assert!(result.upgraded); + assert_eq!(result.resolved, "anthropic/claude-sonnet-4.5"); + } + + #[test] + fn test_resolve_latest_unchanged() { + let resolver = test_resolver(); + + let result = resolver.resolve("anthropic/claude-sonnet-4.5"); + assert!(!result.upgraded); + assert_eq!(result.resolved, "anthropic/claude-sonnet-4.5"); + } + + #[test] + fn test_resolve_unknown_unchanged() { + let resolver = test_resolver(); + + let result = resolver.resolve("some-unknown-model"); + assert!(!result.upgraded); + assert_eq!(result.resolved, "some-unknown-model"); + } + + #[test] + fn test_is_latest() { + let resolver = test_resolver(); + + assert!(resolver.is_latest("anthropic/claude-sonnet-4.5")); + assert!(!resolver.is_latest("anthropic/claude-3.5-sonnet")); + assert!(resolver.is_latest("unknown-model")); // Unknown = no upgrade + } +} diff --git a/src/llm/mod.rs b/src/llm/mod.rs index 3da56d7..2ca7aef 100644 --- a/src/llm/mod.rs +++ b/src/llm/mod.rs @@ -8,7 +8,7 @@ mod error; mod openrouter; -pub use error::{LlmError, LlmErrorKind, RetryConfig, classify_http_status}; +pub use error::{classify_http_status, LlmError, LlmErrorKind, RetryConfig}; pub use openrouter::OpenRouterClient; use async_trait::async_trait; @@ -122,10 +122,6 @@ pub struct ChatMessage { pub tool_calls: Option>, #[serde(skip_serializing_if = "Option::is_none")] pub tool_call_id: Option, - /// Reasoning details for models with extended thinking (Gemini 3, Claude 3.7+). - /// Must be preserved from responses and passed back in subsequent requests. - #[serde(skip_serializing_if = "Option::is_none")] - pub reasoning_details: Option, } impl ChatMessage { @@ -136,7 +132,6 @@ impl ChatMessage { content: Some(MessageContent::text(content)), tool_calls: None, tool_call_id: None, - reasoning_details: None, } } @@ -147,7 +142,6 @@ impl ChatMessage { content: Some(MessageContent::text_and_image(text, image_url)), tool_calls: None, tool_call_id: None, - reasoning_details: None, } } @@ -199,9 +193,6 @@ pub struct ChatResponse { pub finish_reason: Option, pub usage: Option, pub model: Option, - /// Reasoning details for models with extended thinking (Gemini 3, Claude 3.7+). - /// Must be preserved and passed back in subsequent requests for tool calling. - pub reasoning_details: Option, } /// Token usage information (if provided by the upstream provider). 
+}
diff --git a/src/llm/mod.rs b/src/llm/mod.rs
index 3da56d7..2ca7aef 100644
--- a/src/llm/mod.rs
+++ b/src/llm/mod.rs
@@ -8,7 +8,7 @@
 mod error;
 mod openrouter;

-pub use error::{LlmError, LlmErrorKind, RetryConfig, classify_http_status};
+pub use error::{classify_http_status, LlmError, LlmErrorKind, RetryConfig};
 pub use openrouter::OpenRouterClient;

 use async_trait::async_trait;
@@ -122,10 +122,6 @@ pub struct ChatMessage {
     pub tool_calls: Option<Vec<ToolCall>>,
     #[serde(skip_serializing_if = "Option::is_none")]
     pub tool_call_id: Option<String>,
-    /// Reasoning details for models with extended thinking (Gemini 3, Claude 3.7+).
-    /// Must be preserved from responses and passed back in subsequent requests.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub reasoning_details: Option<serde_json::Value>,
 }

 impl ChatMessage {
@@ -136,7 +132,6 @@ impl ChatMessage {
             content: Some(MessageContent::text(content)),
             tool_calls: None,
             tool_call_id: None,
-            reasoning_details: None,
         }
     }

@@ -147,7 +142,6 @@ impl ChatMessage {
             content: Some(MessageContent::text_and_image(text, image_url)),
             tool_calls: None,
             tool_call_id: None,
-            reasoning_details: None,
         }
     }

@@ -199,9 +193,6 @@ pub struct ChatResponse {
     pub finish_reason: Option<String>,
     pub usage: Option<TokenUsage>,
     pub model: Option<String>,
-    /// Reasoning details for models with extended thinking (Gemini 3, Claude 3.7+).
-    /// Must be preserved and passed back in subsequent requests for tool calling.
-    pub reasoning_details: Option<serde_json::Value>,
 }

 /// Token usage information (if provided by the upstream provider).
@@ -260,4 +251,3 @@ pub trait LlmClient: Send + Sync {
         self.chat_completion(model, messages, tools).await
     }
 }
-
diff --git a/src/llm/openrouter.rs b/src/llm/openrouter.rs
index 6324b1f..6c6edf9 100644
--- a/src/llm/openrouter.rs
+++ b/src/llm/openrouter.rs
@@ -6,7 +6,9 @@ use serde::{Deserialize, Serialize};
 use std::time::{Duration, Instant};

 use super::error::{classify_http_status, LlmError, LlmErrorKind, RetryConfig};
-use super::{ChatMessage, ChatOptions, ChatResponse, LlmClient, TokenUsage, ToolCall, ToolDefinition};
+use super::{
+    ChatMessage, ChatOptions, ChatResponse, LlmClient, TokenUsage, ToolCall, ToolDefinition,
+};

 const OPENROUTER_API_URL: &str = "https://openrouter.ai/api/v1/chat/completions";
@@ -65,10 +67,7 @@ impl OpenRouterClient {
     }

     /// Execute a single request without retry.
-    async fn execute_request(
-        &self,
-        request: &OpenRouterRequest,
-    ) -> Result<ChatResponse, LlmError> {
+    async fn execute_request(&self, request: &OpenRouterRequest) -> Result<ChatResponse, LlmError> {
         let response = match self
             .client
             .post(OPENROUTER_API_URL)
@@ -119,7 +118,6 @@ impl OpenRouterClient {
                 .usage
                 .map(|u| TokenUsage::new(u.prompt_tokens, u.completion_tokens)),
             model: parsed.model.or_else(|| Some(request.model.clone())),
-            reasoning_details: choice.message.reasoning_details,
         })
     }

@@ -153,8 +151,8 @@ impl OpenRouterClient {
                     return Ok(response);
                 }
                 Err(error) => {
-                    let should_retry =
-                        self.retry_config.should_retry(&error) && attempt < self.retry_config.max_retries;
+                    let should_retry = self.retry_config.should_retry(&error)
+                        && attempt < self.retry_config.max_retries;

                     if should_retry {
                         let delay = error.suggested_delay(attempt);
@@ -280,10 +278,6 @@ struct OpenRouterChoice {
 struct OpenRouterMessage {
     content: Option<String>,
     tool_calls: Option<Vec<ToolCall>>,
-    /// Reasoning details for models that support extended thinking (Gemini 3, Claude 3.7+, etc.)
-    /// Must be preserved and passed back in subsequent requests for tool calling to work.
-    #[serde(default)]
-    reasoning_details: Option<serde_json::Value>,
 }

 /// Usage data (OpenAI-compatible).
@@ -294,4 +288,3 @@ struct OpenRouterUsage {
     #[serde(rename = "total_tokens")]
     _total_tokens: u64,
 }
-