* feat: enhance coverage reporting and improve tool configuration
  - Added support for JSON coverage reports in pyproject.toml.
  - Updated .gitignore to include coverage.json and task files for better management.
  - Introduced a new Type Safety Audit Report to document findings and recommendations for type safety improvements.
  - Created a comprehensive coverage configuration guide to assist in understanding coverage reporting setup.
  - Refactored tools configuration to utilize environment variables for concurrent scraping settings.
  These changes improve the project's testing and reporting capabilities while enhancing overall code quality and maintainability.
* feat: enhance configuration handling and improve error logging
  - Introduced a new utility function `_get_env_int` for robust environment variable integer retrieval with validation (a hedged sketch follows after these notes).
  - Updated `WebToolsConfig` and `ToolsConfigModel` to utilize the new utility for environment variable defaults.
  - Enhanced logging in `CircuitBreaker` to provide detailed state transition information.
  - Improved URL handling in `url_analyzer.py` for better file extension extraction and normalization.
  - Added type validation and logging in `SecureInputMixin` to ensure input sanitization and validation consistency.
  These changes improve the reliability and maintainability of configuration management and error handling across the codebase.
* refactor: update imports and enhance .gitignore for improved organization
  - Updated import paths in various example scripts to reflect the new structure under `biz_bud`.
  - Enhanced .gitignore to include clearer formatting for task files.
  - Removed obsolete function calls and improved error handling in several scripts.
  - Added a public alias for backward compatibility in `upload_r2r.py`.
  These changes improve code organization, maintainability, and compatibility across the project.
* refactor: update graph paths in langgraph.json for improved organization
  - Changed paths for the research, catalog, paperless, and url_to_r2r graphs to reflect the new directory structure.
  - Added new entries for analysis and scraping graphs to enhance functionality.
  These changes improve the organization and maintainability of the graph configurations.
* fix: enhance validation and error handling in date range and scraping functions
  - Updated date validation in UserFiltersModel to ensure date values are strings.
  - Improved error messages in create_scraped_content_dict to clarify conditions for success and failure.
  - Enhanced test coverage for date validation and scraping content creation to ensure robustness.
  These changes improve input validation and error handling across the application, enhancing overall reliability.
* refactor: streamline graph creation and enhance type annotations in examples
  - Simplified graph creation in `catalog_ingredient_research_example.py` and `catalog_tech_components_example.py` by directly compiling the graph.
  - Updated type annotations in `catalog_intel_with_config.py` for improved clarity and consistency.
  - Enhanced error handling in catalog data processing to ensure robustness against unexpected data types.
  These changes improve code readability, maintainability, and error resilience across example scripts.
* Update src/biz_bud/nodes/extraction/extractors.py (Co-authored-by: sourcery-ai[bot] <58596630+sourcery-ai[bot]@users.noreply.github.com>)
* Update src/biz_bud/core/validation/pydantic_models.py (Co-authored-by: sourcery-ai[bot] <58596630+sourcery-ai[bot]@users.noreply.github.com>)
* refactor: migrate Jina and Tavily clients to use ServiceFactory dependency injection
* refactor: migrate URL processing to provider-based architecture with improved error handling
* feat: add FirecrawlApp compatibility classes and mock implementations
* fix: add thread-safe locking to LazyLoader factory management
* feat: implement service restart and refactor cache decorator helpers
* refactor: move r2r_direct_api_call to tools.clients.r2r_utils and improve HTTP service error handling
* chore: update Sonar task IDs in report configuration

---------

Co-authored-by: sourcery-ai[bot] <58596630+sourcery-ai[bot]@users.noreply.github.com>
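The `_get_env_int` helper mentioned in the notes above is not shown in the file below. As a minimal sketch, assuming a simple fall-back-to-default policy for missing or malformed values (the actual name, signature, and validation rules in `biz_bud` may differ):

```python
import os


def _get_env_int(name: str, default: int) -> int:
    """Read an integer environment variable, falling back to a default on missing or invalid values."""
    raw = os.getenv(name)
    if raw is None or not raw.strip():
        return default
    try:
        return int(raw.strip())
    except ValueError:
        # Invalid integer in the environment: keep the configured default.
        return default
```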
#!/usr/bin/env python3
"""Example to monitor and understand Firecrawl crawl/scrape behavior.

This example helps debug the following issues:
1. Why the crawl endpoint calls scrape internally
2. How to handle crawl returning 0 pages
3. Dataset naming and reuse patterns
"""

import asyncio
import os
from datetime import datetime

from dotenv import load_dotenv

# Load environment variables
load_dotenv()


async def analyze_crawl_vs_scrape():
    """Analyze the relationship between crawl and scrape endpoints."""
    # Note: These imports are from the original firecrawl library
    # They are not available in our current client implementation
    # from biz_bud.tools.clients.firecrawl import (
    #     CrawlJob,
    #     CrawlOptions,
    #     FirecrawlApp,
    #     FirecrawlOptions,
    # )

    # This example needs the actual firecrawl library
    print("This example requires the actual firecrawl-py library")
    return

    print("=== Understanding Crawl vs Scrape Behavior ===\n")
    print("Key insights:")
    print("1. The /crawl endpoint discovers URLs on a website")
    print("2. For each discovered URL, it internally calls /scrape")
    print("3. This is why you see both endpoints being called")
    print("4. If crawl finds 0 pages, our code falls back to direct scrape\n")

    async with FirecrawlApp() as app:
        url = "https://docs.anthropic.com/en/docs/intro-to-claude"

        # First, let's see what a direct scrape gives us
        print(f"1. Direct scrape of {url}")
        print("-" * 50)

        scrape_result = await app.scrape_url(
            url, FirecrawlOptions(formats=["markdown", "links"], only_main_content=True)
        )

        if scrape_result.success:
            print("✅ Direct scrape successful")
            if scrape_result.data:
                print(f" - Content length: {len(scrape_result.data.markdown or '')} chars")
                print(f" - Links found: {len(scrape_result.data.links or [])}")
                if scrape_result.data.links:
                    print(" - Sample links:")
                    for link in scrape_result.data.links[:3]:
                        print(f" • {link}")
            else:
                print(" - No data returned")

        # Now let's see what crawl does
        print(f"\n2. Crawl starting from {url}")
        print("-" * 50)

        crawl_job = await app.crawl_website(
            url=url,
            options=CrawlOptions(
                limit=5,
                max_depth=1,
                scrape_options=FirecrawlOptions(formats=["markdown"], only_main_content=True),
            ),
            wait_for_completion=True,
        )

        if isinstance(crawl_job, CrawlJob):
            print("✅ Crawl completed")
            print(f" - Status: {crawl_job.status}")
            print(f" - Pages discovered: {crawl_job.total_count}")
            print(f" - Pages scraped: {crawl_job.completed_count}")

            if crawl_job.data:
                print(" - Scraped URLs:")
                for i, page in enumerate(crawl_job.data[:5], 1):
                    page_url = "unknown"
                    if hasattr(page.metadata, "sourceURL"):
                        page_url = page.metadata.sourceURL
                    elif isinstance(page.metadata, dict):
                        page_url = page.metadata.get("url", "unknown")
                    print(f" {i}. {page_url}")

            # This is the key insight
            print("\n📊 ANALYSIS:")
            print(f" The crawl endpoint discovered {crawl_job.total_count} URLs")
            print(f" Then it called /scrape for each URL ({crawl_job.completed_count} times)")
            print(" This is why you see both endpoints in your logs!")

            if crawl_job.completed_count == 0:
                print("\n⚠️ CRAWL RETURNED 0 PAGES!")
                print(" In this case, our code falls back to direct scrape")
                print(" This ensures we always get at least the main page content")


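# The prints above describe a fallback: if a crawl job finishes with zero pages,
# scrape the seed URL directly so the caller always gets some content. The helper
# below is a hedged sketch of that pattern and is not part of the original example;
# it relies only on the `app.crawl_website` / `app.scrape_url` calls and the job
# attributes already used in this file, and its argument names are assumptions.
async def crawl_with_scrape_fallback(app, url, crawl_options, scrape_options):
    """Crawl a site, falling back to a single direct scrape when no pages come back."""
    job = await app.crawl_website(url=url, options=crawl_options, wait_for_completion=True)
    if (getattr(job, "completed_count", 0) or 0) > 0:
        return job.data
    # Crawl discovered nothing usable - scrape the seed URL instead.
    result = await app.scrape_url(url, scrape_options)
    return [result.data] if getattr(result, "success", False) and result.data else []

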
async def monitor_crawl_job():
    """Monitor a crawl job with real-time status updates."""
    # Note: These imports are from the original firecrawl library
    # They are not available in our current client implementation
    # from biz_bud.tools.clients.firecrawl import (
    #     CrawlJob,
    #     CrawlOptions,
    #     FirecrawlApp,
    #     FirecrawlOptions,
    # )

    # This example needs the actual firecrawl library
    print("This example requires the actual firecrawl-py library")
    return

    print("=== Firecrawl Crawl Job Monitoring Example ===\n")

    # Track status updates
    status_history = []

    def status_callback(job: CrawlJob) -> None:
        """Record and display each crawl status update."""
        timestamp = datetime.now().strftime("%H:%M:%S.%f")[:-3]
        status_history.append(
            {
                "time": timestamp,
                "status": job.status,
                "completed": job.completed_count,
                "total": job.total_count,
            }
        )

        # Progress bar visualization
        if job.total_count > 0:
            progress = job.completed_count / job.total_count
            bar_length = 30
            filled = int(bar_length * progress)
            bar = "█" * filled + "░" * (bar_length - filled)
            print(
                f"\r[{timestamp}] {bar} {job.completed_count}/{job.total_count} - {job.status}",
                end="",
                flush=True,
            )

    async with FirecrawlApp() as app:
        print(f"Connected to: {app.client.base_url}")
        print(f"API Version: {app.api_version}\n")

        # Configure crawl options
        options = CrawlOptions(
            limit=10,  # Crawl up to 10 pages
            max_depth=2,  # Follow links 2 levels deep
            scrape_options=FirecrawlOptions(
                formats=["markdown"],
                only_main_content=True,
                timeout=30000,  # 30 seconds per page
            ),
        )

        # Example URL - replace with your target
        url = "https://example.com"

        print(f"Starting crawl of {url}")
        print(f"Max pages: {options.limit}, Max depth: {options.max_depth}\n")

        start_time = datetime.now()

        try:
            # Method 1: Start crawl and get job ID immediately
            initial_job = await app.crawl_website(
                url=url,
                options=options,
                wait_for_completion=False,  # Don't wait
            )

            if isinstance(initial_job, CrawlJob) and initial_job.job_id:
                print(f"Job ID: {initial_job.job_id}")
                print(f"Status URL: {app.client.base_url}/v1/crawl/{initial_job.job_id}")
                print("\nMonitoring progress:\n")

                # Poll for updates with callback
                final_job = await app._poll_crawl_status(
                    job_id=initial_job.job_id,
                    poll_interval=2,  # Check every 2 seconds
                    max_polls=150,  # Max 5 minutes
                    status_callback=status_callback,
                )

                print("\n")  # New line after progress bar

                # Calculate statistics
                duration = (datetime.now() - start_time).total_seconds()

                print("\n=== Crawl Complete ===")
                print(f"Duration: {duration:.1f} seconds")
                print(f"Status: {final_job.status}")
                print(f"Pages crawled: {final_job.completed_count}")
                print(f"Total pages found: {final_job.total_count}")

                if final_job.data:
                    print("\nCrawled URLs:")
                    for i, page in enumerate(final_job.data[:5], 1):
                        page_url = "unknown"
                        if hasattr(page, "metadata") and isinstance(page.metadata, dict):
                            page_url = page.metadata.get("url", "unknown")
                        content_length = len(page.markdown or page.content or "")
                        print(f" {i}. {page_url} ({content_length} chars)")

                    if len(final_job.data) > 5:
                        print(f" ... and {len(final_job.data) - 5} more pages")

                # Show status timeline
                print(f"\nStatus Updates ({len(status_history)} total):")
                # Show first 2 and last 2 updates
                if len(status_history) > 4:
                    for update in status_history[:2]:
                        print(
                            f" [{update['time']}] {update['status']}: {update['completed']}/{update['total']}"
                        )
                    print(" ...")
                    for update in status_history[-2:]:
                        print(
                            f" [{update['time']}] {update['status']}: {update['completed']}/{update['total']}"
                        )
                else:
                    for update in status_history:
                        print(
                            f" [{update['time']}] {update['status']}: {update['completed']}/{update['total']}"
                        )

                # Performance metrics
                if final_job.completed_count > 0:
                    avg_time_per_page = duration / final_job.completed_count
                    print("\nPerformance:")
                    print(f" Average time per page: {avg_time_per_page:.2f} seconds")
                    print(f" Pages per minute: {(60 / avg_time_per_page):.1f}")

            else:
                print(f"Unexpected result: {type(initial_job)}")

        except Exception as e:
            print(f"\nError during crawl: {e}")
            import traceback

            traceback.print_exc()


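# `_poll_crawl_status` above is a private client method; the loop below is a hedged
# sketch of what such polling generally looks like: fetch the job, invoke the status
# callback, and stop on a terminal status or when the poll budget runs out.
# `fetch_job` and the terminal status names ("completed", "failed", "cancelled")
# are assumptions, not the real client API.
async def poll_job_until_done(fetch_job, job_id, poll_interval=2, max_polls=150, status_callback=None):
    """Poll a crawl job until it reaches a terminal status or the budget is exhausted."""
    job = None
    for _ in range(max_polls):
        job = await fetch_job(job_id)
        if status_callback is not None:
            status_callback(job)
        if getattr(job, "status", None) in ("completed", "failed", "cancelled"):
            break
        await asyncio.sleep(poll_interval)
    return job

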
async def monitor_batch_scrape():
    """Monitor batch scrape operations."""
    from biz_bud.tools.clients.firecrawl import FirecrawlApp, FirecrawlOptions

    print("\n\n=== Firecrawl Batch Scrape Monitoring Example ===\n")

    # URLs to scrape
    urls = [
        "https://example.com",
        "https://example.org",
        "https://example.net",
        "https://example.edu",
        "https://example.io",
    ]

    async with FirecrawlApp() as app:
        print(f"Scraping {len(urls)} URLs concurrently...")
        print("Max concurrent: 3\n")

        start_time = datetime.now()

        # Progress tracking
        print("Progress:")

        results = await app.batch_scrape(
            urls=urls,
            options=FirecrawlOptions(
                formats=["markdown"],
                only_main_content=True,
                timeout=30000,
            ),
            max_concurrent=3,
        )

        duration = (datetime.now() - start_time).total_seconds()

        # Analyze results - filter for valid result objects with proper attributes
        valid_results = [
            r for r in results if hasattr(r, "success") and not isinstance(r, (dict, list, str))
        ]
        successful = sum(1 for r in valid_results if getattr(r, "success", False))
        # failed = len(valid_results) - successful  # Would be used for monitoring stats
        total_content = sum(
            len(getattr(r.data, "markdown", "") or "")
            for r in valid_results
            if getattr(r, "success", False)
            and hasattr(r, "data")
            and r.data
            and hasattr(r.data, "markdown")
        )

        print("\n=== Batch Complete ===")
        print(f"Duration: {duration:.1f} seconds")
        print(f"Success rate: {successful}/{len(results)} ({successful / len(results) * 100:.0f}%)")
        print(f"Total content: {total_content:,} characters")
        print(f"Average time per URL: {duration / len(urls):.2f} seconds")

        # Show individual results
        print("\nIndividual Results:")
        for i, url in enumerate(urls):
            if i < len(results):
                result = results[i]
                if (
                    hasattr(result, "success")
                    and not isinstance(result, (dict, list, str))
                    and getattr(result, "success", False)
                    and hasattr(result, "data")
                    and result.data
                ):
                    content_len = len(getattr(result.data, "markdown", "") or "")
                    print(f" ✅ {url} - {content_len} chars")
                else:
                    error = getattr(result, "error", "Unknown error") or "Unknown error"
                    print(f" ❌ {url} - {error}")


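# `batch_scrape(..., max_concurrent=3)` above caps how many scrapes run at once.
# The helper below is a hedged sketch of that bounded-concurrency pattern using a
# plain asyncio.Semaphore; `scrape_one` is a hypothetical per-URL coroutine standing
# in for the client's real scrape call.
async def scrape_all_bounded(scrape_one, urls, max_concurrent=3):
    """Run scrape_one over urls with at most max_concurrent requests in flight."""
    semaphore = asyncio.Semaphore(max_concurrent)

    async def _bounded(url):
        async with semaphore:
            return await scrape_one(url)

    # gather() preserves input order; exceptions are returned instead of raised.
    return await asyncio.gather(*(_bounded(url) for url in urls), return_exceptions=True)

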
async def monitor_ragflow_dataset_creation():
    """Monitor how RAGFlow datasets are created and reused."""
    print("\n\n=== RAGFlow Dataset Management ===\n")

    try:
        from urllib.parse import urlparse

        from ragflow_sdk import RAGFlow

        api_key = os.getenv("RAGFLOW_API_KEY")
        base_url = os.getenv("RAGFLOW_BASE_URL", "http://rag.lab")

        if not api_key:
            print("❌ RAGFLOW_API_KEY not set, skipping RAGFlow tests")
            return

        client = RAGFlow(api_key=api_key, base_url=base_url)

        # Test URL
        test_url = "https://example.com"
        parsed = urlparse(test_url)
        site_name = parsed.netloc.replace("www.", "").replace(":", "_")

        print(f"Testing with URL: {test_url}")
        print(f"Site name for dataset: {site_name}\n")

        # List existing datasets
        print("1. Existing datasets:")
        print("-" * 50)

        datasets = client.list_datasets()
        existing_dataset = None

        for dataset in datasets:
            if dataset.name == site_name:
                existing_dataset = dataset
                print(f"✅ Found existing dataset: {dataset.name} (ID: {dataset.id})")
            else:
                print(f" - {dataset.name}")

        if existing_dataset:
            print(f"\n📊 INSIGHT: Dataset '{site_name}' already exists!")
            print(" Our code will reuse this instead of creating a new one")
            print(" This prevents duplicate datasets with timestamps")
        else:
            print(f"\n📊 INSIGHT: No dataset named '{site_name}' found")
            print(" A new dataset will be created with just the site name")
            print(" NOT with a timestamp - this allows future reuse")

        # Show what the old behavior would have done
        old_style_name = f"{site_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        print(f"\n⚠️ OLD BEHAVIOR would create: {old_style_name}")
        print(f"✅ NEW BEHAVIOR creates/reuses: {site_name}")

    except ImportError:
        print("❌ RAGFlow SDK not installed, skipping RAGFlow tests")
    except Exception as e:
        print(f"❌ RAGFlow test error: {e}")


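# The RAGFlow section above derives a stable dataset name from the URL's host and
# reuses an existing dataset instead of creating timestamped duplicates. The helper
# below is a hedged sketch of that lookup only; it uses the `list_datasets()` call
# already shown above, and creating a missing dataset is left out because the exact
# SDK call is not shown in this file.
def find_dataset_for_url(client, url):
    """Return (site_name, existing_dataset_or_None) for a target URL."""
    from urllib.parse import urlparse

    parsed = urlparse(url)
    site_name = parsed.netloc.replace("www.", "").replace(":", "_")
    for dataset in client.list_datasets():
        if dataset.name == site_name:
            return site_name, dataset
    return site_name, None

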
async def main():
    """Run monitoring examples."""
    print(f"Firecrawl Base URL: {os.getenv('FIRECRAWL_BASE_URL', 'https://api.firecrawl.dev')}")
    print(f"RAGFlow Base URL: {os.getenv('RAGFLOW_BASE_URL', 'http://rag.lab')}")
    print(f"API Version: {os.getenv('FIRECRAWL_API_VERSION', 'auto-detect')}\n")

    try:
        # Analyze crawl vs scrape behavior
        await analyze_crawl_vs_scrape()

        # Run crawl monitoring
        await monitor_crawl_job()

        # Run batch scrape monitoring
        await monitor_batch_scrape()

        # Monitor RAGFlow dataset creation
        await monitor_ragflow_dataset_creation()

        print("\n✅ All examples completed!")
        print("\n" + "=" * 60)
        print("SUMMARY OF FIXES:")
        print("1. Crawl calling scrape is EXPECTED behavior - not a bug")
        print("2. Crawl returning 0 pages is handled with fallback scraping")
        print("3. Datasets now use site names for reuse, not timestamps")

    except KeyboardInterrupt:
        print("\n\nMonitoring cancelled by user")
    except Exception as e:
        print(f"\n❌ Error: {e}")
        import traceback

        traceback.print_exc()


if __name__ == "__main__":
    asyncio.run(main())