Fix task splitting to use dependencies for sequential execution
Key improvements:
- Add a `dependencies` field to the task-splitting prompt so the LLM specifies execution order
- Parse dependencies from the LLM response and use them for wave-based execution
- Respect the user-requested model as a minimum capability floor in the model selector
- Add guidance to prefer CLI tools over desktop automation in the executor prompt
- Include the Chrome extension download URL pattern in the system prompt

This fixes the issue where all subtasks ran in parallel even when they had implicit dependencies (e.g., code cannot be analyzed before it has been downloaded).
This commit is contained in:
@@ -287,6 +287,15 @@ java -jar /root/tools/cfr.jar <jar_file> --outputdir /root/work/java-analysis/ou
|
||||
8. **Create reusable tools** — Save useful scripts to /root/tools/ with README
|
||||
9. **Verify your work** — Test, run, check outputs when possible
|
||||
10. **Iterate** — If first approach fails, try alternatives before giving up
|
||||
11. **PREFER CLI OVER DESKTOP** — Always use command-line tools first:
|
||||
- For HTTP: use `curl`, `wget`, `fetch_url` instead of browser automation
|
||||
- For files: use `grep`, `find`, `unzip`, `7z` instead of GUI tools
|
||||
- For downloads: construct URLs and use `curl -L` instead of clicking buttons
|
||||
- Desktop automation (`desktop_*` tools) is a LAST RESORT for:
|
||||
- Testing web applications visually
|
||||
- Interacting with GUI-only applications
|
||||
- When no CLI alternative exists
|
||||
- Chrome extensions can be downloaded directly: `https://clients2.google.com/service/update2/crx?response=redirect&x=id%3D<EXTENSION_ID>%26uc`
|
||||
|
||||
## Response
|
||||
When task is complete, provide a clear summary of:
|
||||
|
||||
@@ -229,9 +229,10 @@ impl ModelSelector {
|
||||
///
|
||||
/// # Algorithm
|
||||
/// 1. Calculate expected cost for each model using benchmark capabilities when available
|
||||
/// 2. Filter models exceeding budget
|
||||
/// 3. Select model with minimum expected cost
|
||||
/// 4. Include fallbacks in case of failure
|
||||
/// 2. If user requested a specific model, use it as minimum capability floor
|
||||
/// 3. Filter models exceeding budget
|
||||
/// 4. Select model with minimum expected cost
|
||||
/// 5. Include fallbacks in case of failure
|
||||
///
|
||||
/// # Preconditions
|
||||
/// - `models` is non-empty
|
||||
@@ -244,6 +245,7 @@ impl ModelSelector {
|
||||
budget_cents: u64,
|
||||
task_type: TaskType,
|
||||
historical_stats: Option<&HashMap<String, ModelStats>>,
|
||||
requested_model: Option<&str>,
|
||||
ctx: &AgentContext,
|
||||
) -> Option<ModelRecommendation> {
|
||||
if models.is_empty() {
|
||||
@@ -284,14 +286,65 @@ impl ModelSelector {
|
||||
.cmp(&b.expected_cost_cents)
|
||||
});
|
||||
|
||||
// If user requested a specific model, use it as minimum capability floor
|
||||
// Filter out models with lower capability than the requested one
|
||||
let min_capability = if let Some(req_model) = requested_model {
|
||||
// Find the requested model's capability
|
||||
if let Some(req_cost) = costs.iter().find(|c| c.model_id == req_model) {
|
||||
tracing::info!(
|
||||
"Using requested model {} as capability floor: {:.3}",
|
||||
req_model,
|
||||
req_cost.capability
|
||||
);
|
||||
req_cost.capability
|
||||
} else {
|
||||
// Requested model not found - fall back to looking up its price
|
||||
if let Some(req_pricing) = models.iter().find(|m| m.model_id == req_model) {
|
||||
let cap = self.estimate_capability_from_price(req_pricing.average_cost_per_token());
|
||||
tracing::info!(
|
||||
"Requested model {} not in costs list, using price-based capability: {:.3}",
|
||||
req_model,
|
||||
cap
|
||||
);
|
||||
cap
|
||||
} else {
|
||||
// Model not found at all, use a reasonable floor (0.7 = mid-tier)
|
||||
tracing::warn!(
|
||||
"Requested model {} not found, using default capability floor 0.7",
|
||||
req_model
|
||||
);
|
||||
0.7
|
||||
}
|
||||
}
|
||||
} else {
|
||||
0.0 // No minimum
|
||||
};
|
||||
|
||||
// Filter to models meeting minimum capability
|
||||
let filtered_costs: Vec<_> = if min_capability > 0.0 {
|
||||
costs.iter()
|
||||
.filter(|c| c.capability >= min_capability * 0.95) // Allow 5% tolerance
|
||||
.cloned()
|
||||
.collect()
|
||||
} else {
|
||||
costs.clone()
|
||||
};
|
||||
|
||||
let costs_to_use = if filtered_costs.is_empty() {
|
||||
tracing::warn!("No models meet minimum capability {:.2}, using all models", min_capability);
|
||||
&costs
|
||||
} else {
|
||||
&filtered_costs
|
||||
};
|
||||
|
||||
// Find cheapest model within budget
|
||||
let within_budget: Vec<_> = costs
|
||||
let within_budget: Vec<_> = costs_to_use
|
||||
.iter()
|
||||
.filter(|c| c.expected_cost_cents <= budget_cents)
|
||||
.cloned()
|
||||
.collect();
|
||||
|
||||
let selected = within_budget.first().cloned().or_else(|| costs.first().cloned())?;
|
||||
let selected = within_budget.first().cloned().or_else(|| costs_to_use.first().cloned())?;
|
||||
|
||||
// Get fallback models (next best options)
|
||||
let fallbacks: Vec<String> = costs
|
||||
@@ -512,6 +565,9 @@ impl Agent for ModelSelector {
|
||||
}));
|
||||
}
|
||||
|
||||
// Get user-requested model as minimum capability floor
|
||||
let requested_model = task.analysis().requested_model.as_deref();
|
||||
|
||||
match self.select_optimal(
|
||||
&models,
|
||||
complexity,
|
||||
@@ -519,6 +575,7 @@ impl Agent for ModelSelector {
|
||||
budget_cents,
|
||||
task_type,
|
||||
historical_stats.as_ref(),
|
||||
requested_model,
|
||||
ctx,
|
||||
).await {
|
||||
Some(rec) => {
|
||||
|
||||
@@ -106,18 +106,22 @@ Respond with a JSON object:
|
||||
{{
|
||||
"description": "What to do",
|
||||
"verification": "How to verify it's done",
|
||||
"weight": 1.0
|
||||
"weight": 1.0,
|
||||
"dependencies": []
|
||||
}}
|
||||
],
|
||||
"reasoning": "Why this breakdown makes sense"
|
||||
}}
|
||||
|
||||
Guidelines:
|
||||
- Each subtask should be independently executable
|
||||
- Each subtask should be independently executable once its dependencies are complete
|
||||
- The "dependencies" array contains indices (0-based) of subtasks that MUST complete before this one can start
|
||||
- For example, if subtask 2 needs subtask 0's output, set "dependencies": [0]
|
||||
- Include verification for each subtask
|
||||
- Weight indicates relative effort (higher = more work)
|
||||
- Keep subtasks focused and specific
|
||||
- Aim for 2-4 subtasks typically
|
||||
- IMPORTANT: If subtasks have a logical order (e.g., download before analyze), specify dependencies!
|
||||
|
||||
Respond ONLY with the JSON object."#,
|
||||
task.description()
|
||||
@@ -190,11 +194,21 @@ Respond ONLY with the JSON object."#,
|
||||
let verification = s["verification"].as_str().unwrap_or("");
|
||||
let weight = s["weight"].as_f64().unwrap_or(1.0);
|
||||
|
||||
// Parse dependencies array
|
||||
let dependencies: Vec<usize> = s["dependencies"]
|
||||
.as_array()
|
||||
.map(|deps| {
|
||||
deps.iter()
|
||||
.filter_map(|d| d.as_u64().map(|n| n as usize))
|
||||
.collect()
|
||||
})
|
||||
.unwrap_or_default();
|
||||
|
||||
Subtask::new(
|
||||
desc,
|
||||
VerificationCriteria::llm_based(verification),
|
||||
weight,
|
||||
)
|
||||
).with_dependencies(dependencies)
|
||||
})
|
||||
.collect()
|
||||
})
|
||||
|
||||
@@ -101,17 +101,21 @@ Respond with a JSON object:
|
||||
{{
|
||||
"description": "What to do",
|
||||
"verification": "How to verify it's done",
|
||||
"weight": 1.0
|
||||
"weight": 1.0,
|
||||
"dependencies": []
|
||||
}}
|
||||
],
|
||||
"reasoning": "Why this breakdown makes sense"
|
||||
}}
|
||||
|
||||
Guidelines:
|
||||
- Each subtask should be independently executable
|
||||
- Each subtask should be independently executable once its dependencies are complete
|
||||
- The "dependencies" array contains indices (0-based) of subtasks that MUST complete before this one can start
|
||||
- For example, if subtask 2 needs subtask 0's output, set "dependencies": [0]
|
||||
- Include verification for each subtask
|
||||
- Weight indicates relative effort (higher = more work)
|
||||
- Keep subtasks focused and specific
|
||||
- IMPORTANT: If subtasks have a logical order (e.g., download before analyze), specify dependencies!
|
||||
|
||||
Respond ONLY with the JSON object."#,
|
||||
task.description()
|
||||
@@ -184,11 +188,21 @@ Respond ONLY with the JSON object."#,
|
||||
let verification = s["verification"].as_str().unwrap_or("");
|
||||
let weight = s["weight"].as_f64().unwrap_or(1.0);
|
||||
|
||||
// Parse dependencies array
|
||||
let dependencies: Vec<usize> = s["dependencies"]
|
||||
.as_array()
|
||||
.map(|deps| {
|
||||
deps.iter()
|
||||
.filter_map(|d| d.as_u64().map(|n| n as usize))
|
||||
.collect()
|
||||
})
|
||||
.unwrap_or_default();
|
||||
|
||||
Subtask::new(
|
||||
desc,
|
||||
VerificationCriteria::llm_based(verification),
|
||||
weight,
|
||||
)
|
||||
).with_dependencies(dependencies)
|
||||
})
|
||||
.collect()
|
||||
})
|
||||
|
||||
@@ -298,7 +298,7 @@ async fn run_agent_task(
|
||||
state: Arc<AppState>,
|
||||
task_id: Uuid,
|
||||
task_description: String,
|
||||
_model: String,
|
||||
requested_model: String,
|
||||
working_dir: std::path::PathBuf,
|
||||
) {
|
||||
// Update status to running
|
||||
@@ -327,6 +327,11 @@ async fn run_agent_task(
|
||||
}
|
||||
};
|
||||
|
||||
// Set the user-requested model as minimum capability floor
|
||||
if !requested_model.is_empty() {
|
||||
task.analysis_mut().requested_model = Some(requested_model);
|
||||
}
|
||||
|
||||
// Create context with the specified working directory and memory
|
||||
let llm = Arc::new(OpenRouterClient::new(state.config.api_key.clone()));
|
||||
let tools = ToolRegistry::new();
|
||||
|
||||
@@ -29,6 +29,8 @@ pub struct TaskAnalysis {
|
||||
/// Estimated total tokens for completing the task (input + output)
|
||||
pub estimated_total_tokens: Option<u64>,
|
||||
|
||||
/// User-requested model (if specified) - used as minimum capability floor
|
||||
pub requested_model: Option<String>,
|
||||
/// Model chosen for execution (if selected)
|
||||
pub selected_model: Option<String>,
|
||||
/// Estimated cost in cents (if computed)
|
||||
|
||||
Reference in New Issue
Block a user