biz-bud/examples/test_rag_agent_firecrawl.py

"""Test script to verify RAG agent uses enhanced Firecrawl tools."""

import asyncio
import os
from pprint import pprint

# from biz_bud.agents.rag_agent import process_url_with_dedup  # Module deleted
from biz_bud.core.config.loader import load_config_async


async def test_rag_agent_with_firecrawl():
    """Test the RAG agent with different URL types."""
    # Load configuration
    _ = await load_config_async()  # Configuration loaded but not used in this example
    # config_dict = config.model_dump()  # Would be used for agent configuration

    # Test URLs
    test_urls = [
        # Documentation site - should use deeper crawling
        "https://docs.python.org/3/tutorial/",
        # Regular website - should use standard settings
        "https://example.com",
        # GitHub repo - should be routed to repomix
        "https://github.com/langchain-ai/langchain",
    ]

    for url in test_urls:
        print(f"\n{'=' * 60}")
        print(f"Testing URL: {url}")
        print("=" * 60)

        try:
            # Note: process_url_with_dedup has been moved to nodes/graphs architecture
            # This example needs to be updated to use the new workflow
            print("This example needs updating to use new nodes/graphs architecture")
            result = {"rag_status": "skipped", "should_process": False, "processing_reason": "function deprecated"}

            # Show key results
            print(f"\nProcessing Status: {result.get('rag_status')}")
            print(f"Should Process: {result.get('should_process')}")
            print(f"Processing Reason: {result.get('processing_reason')}")

            # Show optimized parameters
            if result.get("scrape_params"):
                print("\nOptimized Scrape Parameters:")
                pprint(result["scrape_params"])

            if result.get("r2r_params"):
                print("\nOptimized R2R Parameters:")
                pprint(result["r2r_params"])

            # Show processing result
            if result.get("processing_result"):
                processing_result = result["processing_result"]
                if processing_result:
                    # Only call .get() if processing_result is a dictionary
                    if isinstance(processing_result, dict):
                        if processing_result.get("skipped"):
                            print(f"\nSkipped: {processing_result.get('reason')}")
                        else:
                            print("\nProcessed Successfully!")
                            if processing_result.get("scraped_content"):
                                print(f"Pages scraped: {len(processing_result['scraped_content'])}")
                            if processing_result.get("r2r_dataset_id"):
                                print(f"R2R dataset: {processing_result['r2r_dataset_id']}")
                    else:
                        # Handle non-dict processing results
                        print(f"\nProcessing result: {processing_result}")

        except Exception as e:
            print(f"\nError processing {url}: {e}")


async def test_firecrawl_endpoints_directly():
    """Test Firecrawl endpoints directly."""
    from biz_bud.tools.clients.firecrawl import (
        ExtractOptions,
        FirecrawlApp,
        MapOptions,
        SearchOptions,
    )

    api_key = os.getenv("FIRECRAWL_API_KEY")
    if not api_key:
        print("Warning: FIRECRAWL_API_KEY not set, skipping direct tests")
        return

    async with FirecrawlApp(api_key=api_key) as app:
        # Test map endpoint
        print("\n\nTesting /map endpoint...")
        map_options = MapOptions(limit=10)
        urls = await app.map_website("https://docs.firecrawl.dev", options=map_options)
        print(f"Discovered {len(urls)} URLs")

        # Test search endpoint
        print("\n\nTesting /search endpoint...")
        search_options = SearchOptions(limit=3)
        results = await app.search("web scraping best practices", options=search_options)
        print(f"Found {len(results)} search results")

        # Test extract endpoint
        print("\n\nTesting /extract endpoint...")
        extract_options = ExtractOptions(prompt="Extract the main features and pricing information")
        extract_result = await app.extract(["https://firecrawl.dev"], options=extract_options)
        if extract_result.get("success"):
            print("Extraction successful!")


def main():
    """Run all tests."""
    print("RAG Agent Firecrawl Integration Test")
    print("=====================================\n")

    # Check for required API keys
    if not os.getenv("OPENAI_API_KEY") and not os.getenv("ANTHROPIC_API_KEY"):
        print("Error: Set OPENAI_API_KEY or ANTHROPIC_API_KEY for LLM operations")
        return

    if not os.getenv("FIRECRAWL_API_KEY"):
        print("Error: Set FIRECRAWL_API_KEY for web scraping")
        return

    # Run async tests
    asyncio.run(test_rag_agent_with_firecrawl())

    # Optionally test endpoints directly
    print("\n\n" + "=" * 60)
    print("Direct Firecrawl Endpoint Tests")
    print("=" * 60)
    asyncio.run(test_firecrawl_endpoints_directly())


if __name__ == "__main__":
    main()