diff --git a/src/agents/leaf/executor.rs b/src/agents/leaf/executor.rs
index 8738b9d..4babb5e 100644
--- a/src/agents/leaf/executor.rs
+++ b/src/agents/leaf/executor.rs
@@ -287,6 +287,15 @@ java -jar /root/tools/cfr.jar --outputdir /root/work/java-analysis/ou
 8. **Create reusable tools** — Save useful scripts to /root/tools/ with README
 9. **Verify your work** — Test, run, check outputs when possible
 10. **Iterate** — If first approach fails, try alternatives before giving up
+11. **PREFER CLI OVER DESKTOP** — Always use command-line tools first:
+    - For HTTP: use `curl`, `wget`, `fetch_url` instead of browser automation
+    - For files: use `grep`, `find`, `unzip`, `7z` instead of GUI tools
+    - For downloads: construct URLs and use `curl -L` instead of clicking buttons
+    - Desktop automation (`desktop_*` tools) is a LAST RESORT for:
+      - Testing web applications visually
+      - Interacting with GUI-only applications
+      - When no CLI alternative exists
+    - Chrome extensions can be downloaded directly: `https://clients2.google.com/service/update2/crx?response=redirect&x=id%3D<EXTENSION_ID>%26uc`
 
 ## Response
 When task is complete, provide a clear summary of:
diff --git a/src/agents/leaf/model_select.rs b/src/agents/leaf/model_select.rs
index 38765b3..c3a85ba 100644
--- a/src/agents/leaf/model_select.rs
+++ b/src/agents/leaf/model_select.rs
@@ -229,9 +229,10 @@ impl ModelSelector {
     ///
     /// # Algorithm
     /// 1. Calculate expected cost for each model using benchmark capabilities when available
-    /// 2. Filter models exceeding budget
-    /// 3. Select model with minimum expected cost
-    /// 4. Include fallbacks in case of failure
+    /// 2. If user requested a specific model, use it as minimum capability floor
+    /// 3. Filter models exceeding budget
+    /// 4. Select model with minimum expected cost
+    /// 5. Include fallbacks in case of failure
     ///
     /// # Preconditions
     /// - `models` is non-empty
@@ -244,6 +245,7 @@ impl ModelSelector {
         budget_cents: u64,
         task_type: TaskType,
         historical_stats: Option<&HashMap>,
+        requested_model: Option<&str>,
        ctx: &AgentContext,
     ) -> Option {
         if models.is_empty() {
@@ -284,14 +286,65 @@ impl ModelSelector {
                 .cmp(&b.expected_cost_cents)
         });
 
+        // If user requested a specific model, use it as minimum capability floor
+        // Filter out models with lower capability than the requested one
+        let min_capability = if let Some(req_model) = requested_model {
+            // Find the requested model's capability
+            if let Some(req_cost) = costs.iter().find(|c| c.model_id == req_model) {
+                tracing::info!(
+                    "Using requested model {} as capability floor: {:.3}",
+                    req_model,
+                    req_cost.capability
+                );
+                req_cost.capability
+            } else {
+                // Requested model not found - fall back to looking up its price
+                if let Some(req_pricing) = models.iter().find(|m| m.model_id == req_model) {
+                    let cap = self.estimate_capability_from_price(req_pricing.average_cost_per_token());
+                    tracing::info!(
+                        "Requested model {} not in costs list, using price-based capability: {:.3}",
+                        req_model,
+                        cap
+                    );
+                    cap
+                } else {
+                    // Model not found at all, use a reasonable floor (0.7 = mid-tier)
+                    tracing::warn!(
+                        "Requested model {} not found, using default capability floor 0.7",
+                        req_model
+                    );
+                    0.7
+                }
+            }
+        } else {
+            0.0 // No minimum
+        };
+
+        // Filter to models meeting minimum capability
+        let filtered_costs: Vec<_> = if min_capability > 0.0 {
+            costs.iter()
+                .filter(|c| c.capability >= min_capability * 0.95) // Allow 5% tolerance
+                .cloned()
+                .collect()
+        } else {
+            costs.clone()
+        };
+
+        let costs_to_use = if filtered_costs.is_empty() {
+            tracing::warn!("No models meet minimum capability {:.2}, using all models", min_capability);
+            &costs
+        } else {
+            &filtered_costs
+        };
+
         // Find cheapest model within budget
-        let within_budget: Vec<_> = costs
+        let within_budget: Vec<_> = costs_to_use
             .iter()
             .filter(|c| c.expected_cost_cents <= budget_cents)
             .cloned()
             .collect();
 
-        let selected = within_budget.first().cloned().or_else(|| costs.first().cloned())?;
+        let selected = within_budget.first().cloned().or_else(|| costs_to_use.first().cloned())?;
 
         // Get fallback models (next best options)
         let fallbacks: Vec = costs
@@ -512,6 +565,9 @@ impl Agent for ModelSelector {
            }));
        }
 
+        // Get user-requested model as minimum capability floor
+        let requested_model = task.analysis().requested_model.as_deref();
+
         match self.select_optimal(
             &models,
             complexity,
@@ -519,6 +575,7 @@
             budget_cents,
             task_type,
             historical_stats.as_ref(),
+            requested_model,
             ctx,
         ).await {
             Some(rec) => {
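For readers skimming the patch, the capability-floor behaviour in `select_optimal` can be summarised in isolation. The sketch below is a minimal standalone version, not the project's code: `ModelCost` is reduced to the three fields the filter touches, and `apply_capability_floor` is a hypothetical helper that mirrors the 5% tolerance and the fall-back-to-all-models branch added above.

```rust
// Minimal stand-in for the selector's cost entries; only the fields the
// capability filter reads are modeled here.
#[derive(Clone, Debug)]
struct ModelCost {
    model_id: String,
    capability: f64,
    expected_cost_cents: u64,
}

// Hypothetical helper mirroring the filtering step added to `select_optimal`:
// keep models within 5% of the floor, and fall back to the full list if the
// floor would eliminate every candidate.
fn apply_capability_floor(costs: &[ModelCost], min_capability: f64) -> Vec<ModelCost> {
    if min_capability <= 0.0 {
        return costs.to_vec();
    }
    let filtered: Vec<ModelCost> = costs
        .iter()
        .filter(|c| c.capability >= min_capability * 0.95)
        .cloned()
        .collect();
    if filtered.is_empty() { costs.to_vec() } else { filtered }
}

fn main() {
    let costs = vec![
        ModelCost { model_id: "cheap".into(), capability: 0.55, expected_cost_cents: 2 },
        ModelCost { model_id: "mid".into(), capability: 0.72, expected_cost_cents: 9 },
        ModelCost { model_id: "strong".into(), capability: 0.90, expected_cost_cents: 40 },
    ];
    // Floor 0.75: the 5% tolerance lowers the cutoff to 0.7125, so "mid" survives.
    let eligible = apply_capability_floor(&costs, 0.75);
    assert_eq!(eligible.len(), 2);
    // The cheapest eligible model then wins the budget comparison later in the diff.
    println!("{eligible:?}");
}
```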
diff --git a/src/agents/orchestrator/node.rs b/src/agents/orchestrator/node.rs
index 84a0d5e..a7a694c 100644
--- a/src/agents/orchestrator/node.rs
+++ b/src/agents/orchestrator/node.rs
@@ -106,18 +106,22 @@ Respond with a JSON object:
     {{
       "description": "What to do",
       "verification": "How to verify it's done",
-      "weight": 1.0
+      "weight": 1.0,
+      "dependencies": []
     }}
   ],
   "reasoning": "Why this breakdown makes sense"
 }}
 
 Guidelines:
-- Each subtask should be independently executable
+- Each subtask should be independently executable once its dependencies are complete
+- The "dependencies" array contains indices (0-based) of subtasks that MUST complete before this one can start
+- For example, if subtask 2 needs subtask 0's output, set "dependencies": [0]
 - Include verification for each subtask
 - Weight indicates relative effort (higher = more work)
 - Keep subtasks focused and specific
 - Aim for 2-4 subtasks typically
+- IMPORTANT: If subtasks have a logical order (e.g., download before analyze), specify dependencies!
 
 Respond ONLY with the JSON object."#,
         task.description()
@@ -190,11 +194,21 @@ Respond ONLY with the JSON object."#,
                     let verification = s["verification"].as_str().unwrap_or("");
                     let weight = s["weight"].as_f64().unwrap_or(1.0);
 
+                    // Parse dependencies array
+                    let dependencies: Vec<usize> = s["dependencies"]
+                        .as_array()
+                        .map(|deps| {
+                            deps.iter()
+                                .filter_map(|d| d.as_u64().map(|n| n as usize))
+                                .collect()
+                        })
+                        .unwrap_or_default();
+
                     Subtask::new(
                         desc,
                         VerificationCriteria::llm_based(verification),
                         weight,
-                    )
+                    ).with_dependencies(dependencies)
                 })
                 .collect()
         })
diff --git a/src/agents/orchestrator/root.rs b/src/agents/orchestrator/root.rs
index 817d4af..d884ca0 100644
--- a/src/agents/orchestrator/root.rs
+++ b/src/agents/orchestrator/root.rs
@@ -101,17 +101,21 @@ Respond with a JSON object:
     {{
       "description": "What to do",
       "verification": "How to verify it's done",
-      "weight": 1.0
+      "weight": 1.0,
+      "dependencies": []
     }}
   ],
   "reasoning": "Why this breakdown makes sense"
 }}
 
 Guidelines:
-- Each subtask should be independently executable
+- Each subtask should be independently executable once its dependencies are complete
+- The "dependencies" array contains indices (0-based) of subtasks that MUST complete before this one can start
+- For example, if subtask 2 needs subtask 0's output, set "dependencies": [0]
 - Include verification for each subtask
 - Weight indicates relative effort (higher = more work)
 - Keep subtasks focused and specific
+- IMPORTANT: If subtasks have a logical order (e.g., download before analyze), specify dependencies!
 
 Respond ONLY with the JSON object."#,
         task.description()
@@ -184,11 +188,21 @@ Respond ONLY with the JSON object."#,
                     let verification = s["verification"].as_str().unwrap_or("");
                     let weight = s["weight"].as_f64().unwrap_or(1.0);
 
+                    // Parse dependencies array
+                    let dependencies: Vec<usize> = s["dependencies"]
+                        .as_array()
+                        .map(|deps| {
+                            deps.iter()
+                                .filter_map(|d| d.as_u64().map(|n| n as usize))
+                                .collect()
+                        })
+                        .unwrap_or_default();
+
                     Subtask::new(
                         desc,
                         VerificationCriteria::llm_based(verification),
                         weight,
-                    )
+                    ).with_dependencies(dependencies)
                 })
                 .collect()
         })
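The dependency extraction added to both orchestrators is the same few lines; here it is as a self-contained sketch. This is illustrative only: `parse_dependencies` is a hypothetical free function, the sample planner response is made up, and `serde_json` is assumed to be the JSON crate in use (the diff indexes values the same way). Malformed or missing `"dependencies"` fields collapse to an empty list via `unwrap_or_default`.

```rust
use serde_json::{json, Value};

// Hypothetical standalone version of the parsing added in node.rs and root.rs.
fn parse_dependencies(subtask: &Value) -> Vec<usize> {
    subtask["dependencies"]
        .as_array()
        .map(|deps| {
            deps.iter()
                .filter_map(|d| d.as_u64().map(|n| n as usize))
                .collect()
        })
        .unwrap_or_default()
}

fn main() {
    // Made-up planner response in the shape the updated prompt requests.
    let response = json!({
        "subtasks": [
            { "description": "Download the archive", "weight": 1.0, "dependencies": [] },
            { "description": "Analyze its contents", "weight": 2.0, "dependencies": [0] }
        ]
    });
    let deps: Vec<Vec<usize>> = response["subtasks"]
        .as_array()
        .unwrap()
        .iter()
        .map(parse_dependencies)
        .collect();
    // The second subtask depends on the first.
    let expected: Vec<Vec<usize>> = vec![vec![], vec![0]];
    assert_eq!(deps, expected);
}
```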
diff --git a/src/api/routes.rs b/src/api/routes.rs
index 132a4ff..48cf9d5 100644
--- a/src/api/routes.rs
+++ b/src/api/routes.rs
@@ -298,7 +298,7 @@ async fn run_agent_task(
     state: Arc,
     task_id: Uuid,
     task_description: String,
-    _model: String,
+    requested_model: String,
     working_dir: std::path::PathBuf,
 ) {
     // Update status to running
@@ -327,6 +327,11 @@
         }
     };
 
+    // Set the user-requested model as minimum capability floor
+    if !requested_model.is_empty() {
+        task.analysis_mut().requested_model = Some(requested_model);
+    }
+
     // Create context with the specified working directory and memory
     let llm = Arc::new(OpenRouterClient::new(state.config.api_key.clone()));
     let tools = ToolRegistry::new();
diff --git a/src/task/task.rs b/src/task/task.rs
index 70e2c16..6e20a9a 100644
--- a/src/task/task.rs
+++ b/src/task/task.rs
@@ -29,6 +29,8 @@ pub struct TaskAnalysis {
     /// Estimated total tokens for completing the task (input + output)
     pub estimated_total_tokens: Option,
 
+    /// User-requested model (if specified) - used as minimum capability floor
+    pub requested_model: Option<String>,
     /// Model chosen for execution (if selected)
     pub selected_model: Option,
     /// Estimated cost in cents (if computed)
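Finally, a condensed sketch of how the requested model travels from the API route to the selector. This is not the project's code: `TaskAnalysis` is trimmed to the single new field, the model id is a placeholder, and the real flow goes through `task.analysis_mut()` / `task.analysis()` as shown in the diff.

```rust
// Reduced sketch of the new plumbing: the API handler stores the user's choice
// on the task analysis, and the selector later reads it back as an Option<&str>.
#[derive(Default)]
struct TaskAnalysis {
    requested_model: Option<String>,
}

fn main() {
    let mut analysis = TaskAnalysis::default();

    // routes.rs side: an empty string from the API means "no preference".
    let requested_model = String::from("provider/example-model");
    if !requested_model.is_empty() {
        analysis.requested_model = Some(requested_model);
    }

    // model_select.rs side: borrowed form passed into select_optimal as the floor.
    let floor: Option<&str> = analysis.requested_model.as_deref();
    assert_eq!(floor, Some("provider/example-model"));
}
```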