Fix task splitting to use dependencies for sequential execution

Key improvements:
- Add dependencies field to task splitting prompt so LLM specifies execution order
- Parse dependencies from LLM response and use them for wave-based execution
- Respect user-requested model as minimum capability floor in model selector
- Add guidance to prefer CLI tools over desktop automation in executor prompt
- Include Chrome extension download URL pattern in system prompt

This fixes the issue where all subtasks ran in parallel even when they had
implicit dependencies (e.g., can't analyze code before downloading it).
Thomas Marchand
2025-12-18 17:24:16 +00:00
parent e405b74ca0
commit c2cbf70f10
6 changed files with 113 additions and 12 deletions
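
The "wave-based execution" the message refers to is not itself part of this diff; for orientation, here is a minimal sketch of one way to derive waves from the parsed 0-based dependency indices. The name `compute_waves` and the bare `Vec<usize>` dependency lists are illustrative, not the repo's types:

```rust
/// Groups subtasks into sequential "waves": every subtask in a wave depends
/// only on subtasks from earlier waves, so each wave can run in parallel.
/// `deps[i]` holds the 0-based indices that subtask `i` waits on.
fn compute_waves(deps: &[Vec<usize>]) -> Option<Vec<Vec<usize>>> {
    let n = deps.len();
    let mut done = vec![false; n];
    let mut waves = Vec::new();
    let mut remaining = n;
    while remaining > 0 {
        // A subtask is ready when all of its dependencies have completed.
        let wave: Vec<usize> = (0..n)
            .filter(|&i| !done[i] && deps[i].iter().all(|&d| d < n && done[d]))
            .collect();
        if wave.is_empty() {
            return None; // Cycle or out-of-range dependency: no valid order.
        }
        for &i in &wave {
            done[i] = true;
        }
        remaining -= wave.len();
        waves.push(wave);
    }
    Some(waves)
}
```

For `deps = [[], [0], [0], [1, 2]]` this yields `[[0], [1, 2], [3]]`: subtask 0 runs alone, 1 and 2 run in parallel, and 3 waits for both.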

View File

@@ -287,6 +287,15 @@ java -jar /root/tools/cfr.jar <jar_file> --outputdir /root/work/java-analysis/ou
 8. **Create reusable tools** — Save useful scripts to /root/tools/ with README
 9. **Verify your work** — Test, run, check outputs when possible
 10. **Iterate** — If first approach fails, try alternatives before giving up
+11. **PREFER CLI OVER DESKTOP** — Always use command-line tools first:
+    - For HTTP: use `curl`, `wget`, `fetch_url` instead of browser automation
+    - For files: use `grep`, `find`, `unzip`, `7z` instead of GUI tools
+    - For downloads: construct URLs and use `curl -L` instead of clicking buttons
+    - Desktop automation (`desktop_*` tools) is a LAST RESORT for:
+      - Testing web applications visually
+      - Interacting with GUI-only applications
+      - When no CLI alternative exists
+    - Chrome extensions can be downloaded directly: `https://clients2.google.com/service/update2/crx?response=redirect&x=id%3D<EXTENSION_ID>%26uc`

 ## Response
 When task is complete, provide a clear summary of:
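
The CRX URL pattern above is also easy to build programmatically; a hypothetical Rust helper (not part of this commit) that fills in the template:

```rust
/// Builds the direct CRX download URL for a Chrome extension, matching the
/// pattern in the system prompt. `%3D` and `%26` are the URL-encoded `=`
/// and `&` inside the `x` query parameter.
fn crx_download_url(extension_id: &str) -> String {
    format!(
        "https://clients2.google.com/service/update2/crx?response=redirect&x=id%3D{}%26uc",
        extension_id
    )
}
```

The endpoint answers with a redirect, which is why the prompt pairs it with `curl -L`.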

View File

@@ -229,9 +229,10 @@ impl ModelSelector {
 ///
 /// # Algorithm
 /// 1. Calculate expected cost for each model using benchmark capabilities when available
-/// 2. Filter models exceeding budget
-/// 3. Select model with minimum expected cost
-/// 4. Include fallbacks in case of failure
+/// 2. If user requested a specific model, use it as minimum capability floor
+/// 3. Filter models exceeding budget
+/// 4. Select model with minimum expected cost
+/// 5. Include fallbacks in case of failure
 ///
 /// # Preconditions
 /// - `models` is non-empty
@@ -244,6 +245,7 @@ impl ModelSelector {
         budget_cents: u64,
         task_type: TaskType,
         historical_stats: Option<&HashMap<String, ModelStats>>,
+        requested_model: Option<&str>,
         ctx: &AgentContext,
     ) -> Option<ModelRecommendation> {
         if models.is_empty() {
@@ -284,14 +286,65 @@ impl ModelSelector {
                 .cmp(&b.expected_cost_cents)
         });

+        // If user requested a specific model, use it as minimum capability floor
+        // Filter out models with lower capability than the requested one
+        let min_capability = if let Some(req_model) = requested_model {
+            // Find the requested model's capability
+            if let Some(req_cost) = costs.iter().find(|c| c.model_id == req_model) {
+                tracing::info!(
+                    "Using requested model {} as capability floor: {:.3}",
+                    req_model,
+                    req_cost.capability
+                );
+                req_cost.capability
+            } else {
+                // Requested model not found - fall back to looking up its price
+                if let Some(req_pricing) = models.iter().find(|m| m.model_id == req_model) {
+                    let cap = self.estimate_capability_from_price(req_pricing.average_cost_per_token());
+                    tracing::info!(
+                        "Requested model {} not in costs list, using price-based capability: {:.3}",
+                        req_model,
+                        cap
+                    );
+                    cap
+                } else {
+                    // Model not found at all, use a reasonable floor (0.7 = mid-tier)
+                    tracing::warn!(
+                        "Requested model {} not found, using default capability floor 0.7",
+                        req_model
+                    );
+                    0.7
+                }
+            }
+        } else {
+            0.0 // No minimum
+        };
+
+        // Filter to models meeting minimum capability
+        let filtered_costs: Vec<_> = if min_capability > 0.0 {
+            costs.iter()
+                .filter(|c| c.capability >= min_capability * 0.95) // Allow 5% tolerance
+                .cloned()
+                .collect()
+        } else {
+            costs.clone()
+        };
+
+        let costs_to_use = if filtered_costs.is_empty() {
+            tracing::warn!("No models meet minimum capability {:.2}, using all models", min_capability);
+            &costs
+        } else {
+            &filtered_costs
+        };
+
         // Find cheapest model within budget
-        let within_budget: Vec<_> = costs
+        let within_budget: Vec<_> = costs_to_use
             .iter()
             .filter(|c| c.expected_cost_cents <= budget_cents)
             .cloned()
             .collect();

-        let selected = within_budget.first().cloned().or_else(|| costs.first().cloned())?;
+        let selected = within_budget.first().cloned().or_else(|| costs_to_use.first().cloned())?;

         // Get fallback models (next best options)
         let fallbacks: Vec<String> = costs
@@ -512,6 +565,9 @@ impl Agent for ModelSelector {
             }));
         }

+        // Get user-requested model as minimum capability floor
+        let requested_model = task.analysis().requested_model.as_deref();
+
         match self.select_optimal(
             &models,
             complexity,
@@ -519,6 +575,7 @@ impl Agent for ModelSelector {
             budget_cents,
             task_type,
             historical_stats.as_ref(),
+            requested_model,
             ctx,
         ).await {
             Some(rec) => {
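
Isolated from the selector, the floor-plus-tolerance rule added above amounts to very little logic; a simplified sketch over bare capability scores (the real code filters full cost entries with model ids and expected costs):

```rust
/// Keeps candidates whose capability is at least 95% of the requested floor,
/// falling back to the full list when the floor would eliminate everything.
fn apply_capability_floor(capabilities: &[f64], min_capability: f64) -> Vec<f64> {
    if min_capability <= 0.0 {
        return capabilities.to_vec(); // No floor requested.
    }
    let filtered: Vec<f64> = capabilities
        .iter()
        .copied()
        .filter(|&c| c >= min_capability * 0.95) // Same 5% tolerance as the diff.
        .collect();
    if filtered.is_empty() {
        capabilities.to_vec() // Mirrors the warn-and-use-all fallback above.
    } else {
        filtered
    }
}
```

With a floor of 0.8, a 0.76-capability model still qualifies (0.8 × 0.95 = 0.76), so near-equivalent but cheaper models stay eligible.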

View File

@@ -106,18 +106,22 @@ Respond with a JSON object:
 {{
   "description": "What to do",
   "verification": "How to verify it's done",
-  "weight": 1.0
+  "weight": 1.0,
+  "dependencies": []
 }}
 ],
 "reasoning": "Why this breakdown makes sense"
 }}

 Guidelines:
-- Each subtask should be independently executable
+- Each subtask should be independently executable once its dependencies are complete
+- The "dependencies" array contains indices (0-based) of subtasks that MUST complete before this one can start
+- For example, if subtask 2 needs subtask 0's output, set "dependencies": [0]
 - Include verification for each subtask
 - Weight indicates relative effort (higher = more work)
 - Keep subtasks focused and specific
 - Aim for 2-4 subtasks typically
+- IMPORTANT: If subtasks have a logical order (e.g., download before analyze), specify dependencies!

 Respond ONLY with the JSON object."#,
 task.description()
@@ -190,11 +194,21 @@ Respond ONLY with the JSON object."#,
             let verification = s["verification"].as_str().unwrap_or("");
             let weight = s["weight"].as_f64().unwrap_or(1.0);

+            // Parse dependencies array
+            let dependencies: Vec<usize> = s["dependencies"]
+                .as_array()
+                .map(|deps| {
+                    deps.iter()
+                        .filter_map(|d| d.as_u64().map(|n| n as usize))
+                        .collect()
+                })
+                .unwrap_or_default();
+
             Subtask::new(
                 desc,
                 VerificationCriteria::llm_based(verification),
                 weight,
-            )
+            ).with_dependencies(dependencies)
         })
         .collect()
     })
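
Note that the parser accepts whatever indices the LLM emits; nothing in this hunk range-checks them, so a self-reference or an out-of-range index would only surface at execution time. A defensive pass could run before `with_dependencies`; the helper below is hypothetical, not in the diff:

```rust
/// Discards dependency indices that are self-references or point past the
/// number of subtasks, deduplicating whatever remains.
fn sanitize_dependencies(own_index: usize, deps: Vec<usize>, subtask_count: usize) -> Vec<usize> {
    let mut clean: Vec<usize> = deps
        .into_iter()
        .filter(|&d| d != own_index && d < subtask_count)
        .collect();
    clean.sort_unstable();
    clean.dedup();
    clean
}
```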

View File

@@ -101,17 +101,21 @@ Respond with a JSON object:
 {{
   "description": "What to do",
   "verification": "How to verify it's done",
-  "weight": 1.0
+  "weight": 1.0,
+  "dependencies": []
 }}
 ],
 "reasoning": "Why this breakdown makes sense"
 }}

 Guidelines:
-- Each subtask should be independently executable
+- Each subtask should be independently executable once its dependencies are complete
+- The "dependencies" array contains indices (0-based) of subtasks that MUST complete before this one can start
+- For example, if subtask 2 needs subtask 0's output, set "dependencies": [0]
 - Include verification for each subtask
 - Weight indicates relative effort (higher = more work)
 - Keep subtasks focused and specific
+- IMPORTANT: If subtasks have a logical order (e.g., download before analyze), specify dependencies!

 Respond ONLY with the JSON object."#,
 task.description()
@@ -184,11 +188,21 @@ Respond ONLY with the JSON object."#,
             let verification = s["verification"].as_str().unwrap_or("");
             let weight = s["weight"].as_f64().unwrap_or(1.0);

+            // Parse dependencies array
+            let dependencies: Vec<usize> = s["dependencies"]
+                .as_array()
+                .map(|deps| {
+                    deps.iter()
+                        .filter_map(|d| d.as_u64().map(|n| n as usize))
+                        .collect()
+                })
+                .unwrap_or_default();
+
             Subtask::new(
                 desc,
                 VerificationCriteria::llm_based(verification),
                 weight,
-            )
+            ).with_dependencies(dependencies)
         })
         .collect()
     })
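
Both splitter diffs assume a particular response shape; a concrete instance for the commit's own "download before analyze" example, round-tripped through the same `serde_json` accessors the parsers use, might look like the sketch below. The top-level `subtasks` key is an assumption, since both hunks begin below the array's opening; requires the `serde_json` crate:

```rust
use serde_json::json; // assumes serde_json as a dependency

fn main() {
    // Hypothetical LLM response: subtask 1 declares it must wait for subtask 0.
    let resp = json!({
        "subtasks": [
            { "description": "Download the target repository archive",
              "verification": "Archive exists in the working dir and is non-empty",
              "weight": 1.0, "dependencies": [] },
            { "description": "Analyze the downloaded code",
              "verification": "Report lists findings per file",
              "weight": 2.0, "dependencies": [0] }
        ],
        "reasoning": "Analysis needs the download's output first"
    });

    // Same field accesses as the parsers in the diff.
    for (i, s) in resp["subtasks"].as_array().unwrap().iter().enumerate() {
        let deps: Vec<usize> = s["dependencies"]
            .as_array()
            .map(|ds| ds.iter().filter_map(|d| d.as_u64().map(|n| n as usize)).collect())
            .unwrap_or_default();
        println!("subtask {i}: deps = {deps:?}"); // 0: [], 1: [0]
    }
}
```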

View File

@@ -298,7 +298,7 @@ async fn run_agent_task(
     state: Arc<AppState>,
     task_id: Uuid,
     task_description: String,
-    _model: String,
+    requested_model: String,
     working_dir: std::path::PathBuf,
 ) {
     // Update status to running
@@ -327,6 +327,11 @@ async fn run_agent_task(
         }
     };

+    // Set the user-requested model as minimum capability floor
+    if !requested_model.is_empty() {
+        task.analysis_mut().requested_model = Some(requested_model);
+    }
+
     // Create context with the specified working directory and memory
     let llm = Arc::new(OpenRouterClient::new(state.config.api_key.clone()));
     let tools = ToolRegistry::new();
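
One design note on the empty-string guard above: if the HTTP layer delivered the model as `Option<String>` end to end, the same normalization collapses to a one-liner. A sketch, reusing the `analysis_mut()` accessor from the diff:

```rust
// "" means "no explicit model requested"; map it to None so the selector's
// `requested_model: Option<&str>` sees either a real model id or nothing.
task.analysis_mut().requested_model =
    Some(requested_model).filter(|m| !m.is_empty());
```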

View File

@@ -29,6 +29,8 @@ pub struct TaskAnalysis {
     /// Estimated total tokens for completing the task (input + output)
     pub estimated_total_tokens: Option<u64>,
+    /// User-requested model (if specified) - used as minimum capability floor
+    pub requested_model: Option<String>,
     /// Model chosen for execution (if selected)
     pub selected_model: Option<String>,
     /// Estimated cost in cents (if computed)