biz-bud/config.yaml

# ==============================================================================
# Business Buddy - Comprehensive Configuration
# ==============================================================================
#
# This file defines all configuration values for the biz-budz project.
# It is reconciled against the Pydantic models in `src/biz_bud/config/schemas/`.
#
# Configuration Precedence (highest to lowest):
# 1. Runtime arguments passed to a function/method.
# 2. Environment variables (e.g., OPENAI_API_KEY).
# 3. Values set in this `config.yaml` file.
# 4. Default values defined in the Pydantic models.
#
# Values commented out are typically set via environment variables for security.
# ---

# Default query and greeting messages
# Env Override: DEFAULT_QUERY, DEFAULT_GREETING_MESSAGE
DEFAULT_QUERY: "You are a helpful AI assistant. Please help me with my request."
DEFAULT_GREETING_MESSAGE: "Hello! I'm your AI assistant. How can I help you with your market research today?"

# Input state configuration (typically provided at runtime)
inputs:
  # query: "Example query" # A default query can be set here
  # organization:
  #   - name: "Company Name"
  #     zip_code: "12345"
  catalog:
    table: "host_menu_items"
    items:
      - "Oxtail"
      - "Curry Goat"
      - "Jerk Chicken"
      - "Rice & Peas"
    category:
      - "Food, Restaurants & Service Industry"
    subcategory:
      - "Caribbean Food"

# ------------------------------------------------------------------------------
# SERVICE CONFIGURATIONS
# ------------------------------------------------------------------------------

# Logging configuration
# Env Override: LOG_LEVEL
logging:
  log_level: INFO  # Options: DEBUG, INFO, WARNING, ERROR, CRITICAL

# LLM profiles configuration
# Env Override: e.g., TINY_LLM_NAME, LARGE_LLM_TEMPERATURE
llm_config:
  default_profile: "large"  # Options: tiny, small, large, reasoning
  tiny:
    name: "openai/gpt-4.1-mini"
    temperature: 0.7
    input_token_limit: 100000
    chunk_size: 4000
    chunk_overlap: 200
  small:
    name: "openai/gpt-4o"
    temperature: 0.7
    input_token_limit: 100000
    chunk_size: 4000
    chunk_overlap: 200
  large:
    name: "openai/gpt-4.1"
    temperature: 0.7
    input_token_limit: 100000
    chunk_size: 4000
    chunk_overlap: 200
  reasoning:
    name: "openai/o3-mini"
    input_token_limit: 65000
    chunk_size: 4000
    chunk_overlap: 200

# Agent behavior configuration
agent_config:
  max_loops: 5
  recursion_limit: 1000  # LangGraph recursion limit for agent execution
  default_llm_profile: "large"
  default_initial_user_query: "Hello"

  # System prompt for agent awareness and guidance
  system_prompt: |
    You are an intelligent Business Buddy agent operating within a sophisticated LangGraph-based system.
    You have access to comprehensive tools and capabilities through a registry-based architecture.

    ## YOUR CAPABILITIES AND TOOLS

    ### Core Tool Categories Available:
    - **Research Tools**: Web search (Tavily, Jina, ArXiv), content extraction, market analysis
    - **Analysis Tools**: Data processing, statistical analysis, trend identification, competitive intelligence
    - **Synthesis Tools**: Report generation, summary creation, insight compilation, recommendation formulation
    - **Integration Tools**: Database operations (PostgreSQL, Qdrant), document management (Paperless NGX), content crawling
    - **Validation Tools**: Registry validation, component discovery, end-to-end workflow testing

    ### Registry System:
    You operate within a registry-based architecture with three main registries:
    - **Node Registry**: Contains LangGraph workflow nodes for data processing and analysis
    - **Graph Registry**: Contains complete workflow graphs for complex multi-step operations
    - **Tool Registry**: Contains LangChain tools for external service integration

    Tools are dynamically discovered based on capabilities you request. The tool factory automatically creates tools from registered components matching your needs.

    ## PROJECT ARCHITECTURE AWARENESS

    ### System Structure:
    ```
    Business Buddy System
    ├── Agents (You are here)
    │   ├── Buddy Agent (Primary orchestrator)
    │   ├── Research Agents (Specialized research workflows)
    │   └── Tool Factory (Dynamic tool creation)
    ├── Registries (Component discovery)
    │   ├── Node Registry (Workflow components)
    │   ├── Graph Registry (Complete workflows)
    │   └── Tool Registry (External tools)
    ├── Services (External integrations)
    │   ├── LLM Providers (OpenAI, Anthropic, etc.)
    │   ├── Search Providers (Tavily, Jina, ArXiv)
    │   ├── Databases (PostgreSQL, Qdrant, Redis)
    │   └── Document Services (Firecrawl, Paperless)
    └── State Management (TypedDict-based workflows)
    ```

    ### Data Flow:
    1. **Input**: User queries and context
    2. **Planning**: Break down requests into capability requirements
    3. **Tool Discovery**: Registry system provides matching tools
    4. **Execution**: Orchestrate tools through LangGraph workflows
    5. **Synthesis**: Combine results into coherent responses
    6. **Output**: Structured reports and recommendations

    ## OPERATIONAL CONSTRAINTS AND GUIDELINES

    ### Performance Constraints:
    - **Token Limits**: Respect model-specific input limits (65K-100K tokens)
    - **Rate Limits**: Be mindful of API rate limits across providers
    - **Concurrency**: Maximum 10 concurrent searches, 5 concurrent scrapes
    - **Timeouts**: 30s scraper timeout, 10s provider timeout
    - **Recursion**: LangGraph recursion limit of 1000 steps

    ### Data Handling:
    - **Security**: Never expose API keys or sensitive credentials
    - **Privacy**: Handle personal/business data with appropriate care
    - **Validation**: Use registry validation system to ensure tool availability
    - **Error Handling**: Implement graceful degradation when tools are unavailable
    - **Caching**: Leverage tool caching (TTL: 1-7 days based on content type)

    ### Quality Standards:
    - **Accuracy**: Verify information from multiple sources when possible
    - **Completeness**: Address all aspects of user queries
    - **Relevance**: Focus on business intelligence and market research
    - **Actionability**: Provide concrete recommendations and next steps
    - **Transparency**: Clearly indicate sources and confidence levels

    ## WORKFLOW OPTIMIZATION

    ### Capability-Based Tool Selection:
    Instead of requesting specific tools, describe the capabilities you need:
    - "web_search" → Get search tools (Tavily, Jina, ArXiv)
    - "data_analysis" → Get analysis nodes and statistical tools
    - "content_extraction" → Get scraping and parsing tools
    - "report_generation" → Get synthesis and formatting tools

    ### State Management:
    - Use TypedDict-based state for type safety
    - Maintain context across workflow steps
    - Include metadata for tool discovery and validation
    - Preserve error information for debugging

    ### Error Recovery:
    - Implement retry logic with exponential backoff
    - Use fallback providers when primary services fail
    - Gracefully degrade functionality rather than complete failure
    - Log errors for system monitoring and improvement

    ## SPECIALIZED KNOWLEDGE AREAS

    ### Business Intelligence Focus:
    - Market research and competitive analysis
    - Industry trend identification and forecasting
    - Business opportunity assessment
    - Strategic recommendation development
    - Performance benchmarking and KPI analysis

    ### Technical Capabilities:
    - Multi-source data aggregation and synthesis
    - Statistical analysis and data visualization
    - Document processing and knowledge extraction
    - Workflow orchestration and automation
    - System monitoring and validation

    ## RESPONSE GUIDELINES

    ### Structure Your Responses:
    1. **Understanding**: Acknowledge the request and scope
    2. **Approach**: Explain your planned methodology
    3. **Execution**: Use appropriate tools and workflows
    4. **Analysis**: Process and interpret findings
    5. **Synthesis**: Compile insights and recommendations
    6. **Validation**: Verify results and check for completeness

    ### Communication Style:
    - **Professional**: Maintain business-appropriate tone
    - **Clear**: Use structured formatting and clear explanations
    - **Comprehensive**: Cover all relevant aspects thoroughly
    - **Actionable**: Provide specific recommendations and next steps
    - **Transparent**: Clearly indicate sources, methods, and limitations

    Remember: You are operating within a sophisticated, enterprise-grade system designed for comprehensive business intelligence. Leverage the full capabilities of the registry system while respecting constraints and maintaining high quality standards.

# Buddy Agent specific configuration
buddy_config:
  # Default capabilities that Buddy agent should have access to
  default_capabilities:
    - "web_search"
    - "data_analysis"
    - "content_extraction"
    - "report_generation"
    - "market_research"
    - "competitive_analysis"
    - "trend_analysis"
    - "synthesis"
    - "validation"

  # Buddy-specific system prompt additions
  buddy_system_prompt: |
    As the primary Buddy orchestrator agent, you have special responsibilities:

    ### PRIMARY ROLE:
    You are the main orchestrator for complex business intelligence workflows. Your role is to:
    - Analyze user requests and break them into capability requirements
    - Coordinate multiple specialized tools and workflows
    - Synthesize results from various sources into comprehensive reports
    - Provide strategic business insights and actionable recommendations

    ### ORCHESTRATION CAPABILITIES:
    - **Dynamic Tool Discovery**: Request tools by capability, not by name
    - **Workflow Management**: Coordinate multi-step analysis processes
    - **Quality Assurance**: Validate results and ensure completeness
    - **Context Management**: Maintain conversation context and user preferences
    - **Error Recovery**: Handle failures gracefully with fallback strategies

    ### DECISION MAKING:
    When choosing your approach:
    1. **Scope Assessment**: Determine complexity and required capabilities
    2. **Resource Planning**: Select appropriate tools and workflows
    3. **Execution Strategy**: Plan sequential vs parallel operations
    4. **Quality Control**: Define validation and verification steps
    5. **Output Optimization**: Structure responses for maximum value

    ### INTERACTION PATTERNS:
    - **Planning Phase**: Always explain your approach before execution
    - **Progress Updates**: Keep users informed during long operations
    - **Result Synthesis**: Combine findings into actionable insights
    - **Follow-up**: Suggest next steps and additional analysis opportunities

    Remember: You are the user's primary interface to the entire Business Buddy system. Make their experience smooth, informative, and valuable.

# API configuration
# Env Override: OPENAI_API_KEY, ANTHROPIC_API_KEY, R2R_BASE_URL, etc.
api_config:
  # openai_api_key: null
  # anthropic_api_key: null
  # fireworks_api_key: null
  # openai_api_base: null
  # brave_api_key: null
  # brave_search_endpoint: null
  # brave_web_endpoint: null
  # brave_summarizer_endpoint: null
  # brave_news_endpoint: null
  # searxng_url: null
  # jina_api_key: null
  # tavily_api_key: null
  # langsmith_api_key: null
  # langsmith_project: null
  # langsmith_endpoint: null
  # ragflow_api_key: null
  # ragflow_base_url: null
  # r2r_api_key: null
  # r2r_base_url: null
  # firecrawl_api_key: null
  # firecrawl_base_url: null

# Database configuration (Postgres for structured data, Qdrant for vectors)
# Env Override: QDRANT_HOST, QDRANT_PORT, POSTGRES_USER, etc.
database_config:
  # qdrant_host: null
  # qdrant_port: 6333
  # qdrant_api_key: null
  # qdrant_collection_name: "research"
  # postgres_user: null
  # postgres_password: null
  # postgres_db: null
  # postgres_host: null
  # postgres_port: 5432
  # postgres_min_pool_size: 2
  # postgres_max_pool_size: 15
  # postgres_command_timeout: 10
  default_page_size: 100
  max_page_size: 1000

# Proxy configuration
# Env Override: PROXY_URL, PROXY_USERNAME, PROXY_PASSWORD
proxy_config:
  # proxy_url: null
  # proxy_username: null
  # proxy_password: null

# Redis configuration
# Env Override: REDIS_URL
redis_config:
  # redis_url: "redis://localhost:6379/0"  # Set via environment variable
  # key_prefix: "biz_bud:"

# ------------------------------------------------------------------------------
# WORKFLOW AND FEATURE CONFIGURATIONS
# ------------------------------------------------------------------------------

# RAG (Retrieval-Augmented Generation) configuration
rag_config:
  crawl_depth: 2
  use_crawl_endpoint: false  # Use map+scrape for better discovery on documentation sites
  use_map_first: true  # Use map endpoint for URL discovery (recommended for docs sites)
  use_firecrawl_extract: true
  batch_size: 10
  enable_semantic_chunking: true
  chunk_size: 1000
  chunk_overlap: 200
  max_pages_to_map: 2000  # Max pages to discover during URL mapping
  max_pages_to_crawl: 2000  # Max pages to process after discovery (increased from default 20)
  # extraction_prompt: null # Optional custom prompt for Firecrawl's extract feature

# Vector store configuration
vector_store_enhanced:
  collection_name: "research"
  embedding_model: "text-embedding-3-small"
  namespace_prefix: "research"
  vector_size: 1536
  operation_timeout: 10

# Semantic extraction configuration
extraction:
  model_name: "openai/gpt-4o"
  chunk_size: 1000
  chunk_overlap: 200
  temperature: 0.2
  max_content_length: 3000
  extraction_profiles:
    minimal:
      extract_claims: false
      max_entities: 10
    standard:
      extract_claims: true
      max_entities: 25
    comprehensive:
      extract_claims: true
      max_entities: 50

# Search optimization configuration
search_optimization:
  query_optimization:
    enable_deduplication: true
    similarity_threshold: 0.85
    max_results_multiplier: 3
    max_results_limit: 10
    max_providers_per_query: 3
    max_query_merge_length: 150
    min_shared_words_for_merge: 2
    max_merged_query_words: 30
    min_results_per_query: 3
  concurrency:
    max_concurrent_searches: 10
    provider_timeout_seconds: 10
    provider_rate_limits:
      tavily: 5
      jina: 3
      arxiv: 2
  ranking:
    diversity_weight: 0.3
    min_quality_score: 0.5
    domain_frequency_weight: 0.8
    domain_frequency_min_count: 2
    freshness_decay_factor: 0.1
    max_sources_to_return: 20
    # domain_authority_scores: # Override specific domains if needed
    #   "example.com": 0.9
  caching:
    cache_ttl_seconds:
      temporal: 3600
      factual: 604800
      technical: 86400
      default: 86400
    lru_cache_size: 128
  enable_metrics: true
  metrics_window_size: 1000

# Error Handling configuration
error_handling:
  max_retry_attempts: 3
  retry_backoff_base: 1.5
  retry_max_delay: 60
  enable_llm_analysis: true
  recovery_timeout: 300
  enable_auto_recovery: true
  # Define rules for classifying error severity (corrected structure)
  criticality_rules:
    - pattern: "rate.limit|quota.exceeded"
      criticality: "medium"
      can_continue: true
    - pattern: "unauthorized|403|invalid.api.key"
      criticality: "critical"
      can_continue: false
    - pattern: "timeout|deadline.exceeded"
      criticality: "low"
      can_continue: true
    - pattern: "authentication|auth.*error"
      criticality: "critical"
      can_continue: false
    - pattern: "network|connection.*error"
      criticality: "medium"
      can_continue: true
  # Define recovery strategies for different error types (corrected structure)
  recovery_strategies:
    rate_limit:
      - action: "retry_with_backoff"
        parameters: { initial_delay: 5, max_delay: 60 }
      - action: "switch_provider"
        parameters: { providers: ["openai", "anthropic", "google"] }
    context_overflow:
      - action: "trim_context"
        parameters: { strategy: "sliding_window", window_size: 0.8 }
      - action: "chunk_input"
        parameters: { chunk_size: 1000, overlap: 100 }
    network:
      - action: "retry_with_backoff"
        parameters: { initial_delay: 2, max_delay: 30 }

# Tools configuration
tools:
  search:
    # name: null
    # max_results: null
    ranking:
      strategy: "basic"  # Options: basic, jina_ai, hybrid
      hybrid_weight: 0.7
      jina_rerank:
        model: "jina-reranker-v2-base-multilingual"
        timeout: 30.0
        max_retries: 3
        enable_fallback: true
  extract:
    # name: null
    chunk_size: 8000
    min_content_length: 100
  web_tools:
    scraper_timeout: 30
    max_concurrent_scrapes: 5
    max_concurrent_db_queries: 5
    max_concurrent_analysis: 3
    browser:
      headless: true
      timeout_seconds: 30.0
      connection_timeout: 10
      max_browsers: 3
      browser_load_threshold: 10
      max_scroll_percent: 500
      # user_agent: null
      viewport_width: 1920
      viewport_height: 1080
    network:
      timeout: 30.0
      max_retries: 3
      follow_redirects: true
      verify_ssl: true
  # Tool factory configuration for LangGraph integration
  factory:
    enable_caching: true
    cache_ttl_seconds: 3600.0
    max_cached_tools: 100
    auto_register_nodes: true
    auto_register_graphs: true
    default_tool_timeout: 300.0
  # State integration configuration
  state_integration:
    enable_state_validation: true
    preserve_message_history: true
    max_state_history_length: 50
    auto_enrich_state: true

# ------------------------------------------------------------------------------
# GENERAL APPLICATION SETTINGS
# ------------------------------------------------------------------------------

# Feature flags
feature_flags:
  enable_advanced_reasoning: false
  enable_streaming_response: true
  enable_tool_caching: true
  enable_parallel_tools: false  # Schema default is false, not true
  enable_memory_optimization: true
  experimental_features: {}  # Required by schema

# Rate limits configuration
rate_limits:
  # web_max_requests: null
  # web_time_window: null
  # llm_max_requests: null
  # llm_time_window: null
  # max_concurrent_connections: null
  # max_connections_per_host: null

# Telemetry configuration
telemetry_config:
  enabled: null  # Schema field (separate from enable_telemetry)
  enable_telemetry: false
  collect_performance_metrics: false
  collect_usage_statistics: false
  error_reporting_level: "minimal" # Options: none, minimal, full
  metrics_export_interval: 300
  metrics_retention_days: 30
  custom_metrics: {}  # Required by schema