biz-bud/examples/crawl_r2r_docs_fixed.py

#!/usr/bin/env python3
"""Fixed script to properly crawl R2R documentation and upload to R2R instance."""

import asyncio
import os
import sys
from typing import Any

from biz_bud.core.config.loader import load_config_async
from biz_bud.graphs.url_to_r2r import process_url_to_r2r_with_streaming


async def crawl_r2r_docs_fixed(max_depth: int = 3, max_pages: int = 50):
    """Crawl R2R documentation site and upload to R2R.

    This fixed version:
    - Uses the iterative graph for better control
    - Forces map+scrape approach for reliability
    - Provides real-time progress updates

    Args:
        max_depth: Maximum crawl depth (default: 3)
        max_pages: Maximum number of pages to crawl (default: 50)

    """
    url = "https://r2r-docs.sciphi.ai"

    print(f"🚀 Starting crawl of {url}")
    print(f"📊 Settings: max_depth={max_depth}, max_pages={max_pages}")
    print("-" * 60)

    # Load configuration
    config = await load_config_async()
    config_dict = config.model_dump()

    # Configure for reliable crawling
    config_dict["scrape_params"] = {"max_depth": max_depth, "max_pages": max_pages}

    # Force map+scrape approach for better reliability
    config_dict["rag_config"] = {
        "crawl_depth": max_depth,
        "max_pages_to_crawl": max_pages,
        "use_crawl_endpoint": False,  # Don't use crawl endpoint
        "use_map_first": True,  # Use map to discover URLs first
    }

    # Check for Firecrawl API key
    api_key = os.getenv("FIRECRAWL_API_KEY")
    if not api_key:
        api_config = config_dict.get("api", {})
        firecrawl_config = api_config.get("firecrawl", {})
        api_key = api_config.get("firecrawl_api_key") or firecrawl_config.get("api_key")

    if not api_key:
        print("❌ Error: FIRECRAWL_API_KEY not found in environment or config")
        print("Please set FIRECRAWL_API_KEY environment variable")
        sys.exit(1)

    # Check for R2R instance
    r2r_base_url = os.getenv("R2R_BASE_URL", "http://192.168.50.210:7272")
    if "api_config" not in config_dict:
        config_dict["api_config"] = {}
    config_dict["api_config"]["r2r_base_url"] = r2r_base_url

    print("✅ Using Firecrawl API (map+scrape mode)")
    print(f"✅ Using R2R instance at: {r2r_base_url}")
    print()

    # Track progress
    pages_processed = 0

    def on_update(update: dict[str, Any]) -> None:
        """Handle streaming updates."""
        nonlocal pages_processed

        if update.get("type") == "status":
            print(f"📌 {update.get('message', '')}")
        elif update.get("type") == "progress":
            progress = update.get("progress", {})
            current = progress.get("current", 0)
            total = progress.get("total", 0)
            if current > pages_processed:
                pages_processed = current
            print(f"📊 Progress: {current}/{total} pages")
        elif update.get("type") == "error":
            print(f"❌ Error: {update.get('message', '')}")

    try:
        # Process URL and upload to R2R with streaming updates
        print("🕷️ Starting crawl and R2R upload process...")
        result = await process_url_to_r2r_with_streaming(url, config_dict, on_update=on_update)

        # Display results
        print("\n" + "=" * 60)
        print("📊 CRAWL RESULTS")
        print("=" * 60)

        if result.get("error"):
            print(f"❌ Error: {result['error']}")
            return

        # Show scraped content summary
        scraped_content = result.get("scraped_content", [])
        if scraped_content:
            print(f"\n✅ Successfully crawled {len(scraped_content)} pages:")

            # Group by domain/section
            sections = {}
            for page in scraped_content:
                url_parts = page.get("url", "").split("/")
                section = url_parts[3] or "root" if len(url_parts) > 3 else "root"

                if section not in sections:
                    sections[section] = []
                sections[section].append(page)

            # Show organized results
            for section, pages in sorted(sections.items()):
                print(f"\n  📁 /{section} ({len(pages)} pages)")
                for page in pages[:3]:  # Show first 3 per section
                    title = page.get("title", "Untitled")
                    if len(title) > 60:
                        title = f"{title[:57]}..."
                    print(f"     - {title}")
                if len(pages) > 3:
                    print(f"     ... and {len(pages) - 3} more")

        # Show R2R upload results
        r2r_info = result.get("r2r_info")
        if r2r_info:
            print("\n✅ R2R Upload Successful:")

            # Check if multiple documents were uploaded
            if r2r_info.get("uploaded_documents"):
                docs = r2r_info["uploaded_documents"]
                print(f"  - Total documents uploaded: {len(docs)}")
                print(f"  - Collection: {r2r_info.get('collection_name', 'default')}")

                # Show sample document IDs
                print("  - Sample document IDs:")
                for doc_id in list(docs.keys())[:3]:
                    print(f"    • {doc_id}")

            else:
                # Single document upload
                print(f"  - Document ID: {r2r_info.get('document_id')}")
                print(f"  - Collection: {r2r_info.get('collection_name')}")
                print(f"  - Title: {r2r_info.get('title')}")

        print("\n✅ Crawl and upload completed successfully!")
        print(f"📊 Total pages processed: {len(scraped_content)}")

    except Exception as e:
        print(f"\n❌ Error during crawl: {e}")
        import traceback

        traceback.print_exc()


def main():
    """Run the main entry point."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Crawl R2R documentation and upload to R2R instance (fixed version)"
    )
    parser.add_argument("--max-depth", type=int, default=3, help="Maximum crawl depth (default: 3)")
    parser.add_argument(
        "--max-pages",
        type=int,
        default=50,
        help="Maximum number of pages to crawl (default: 50)",
    )
    parser.add_argument(
        "--use-crawl",
        action="store_true",
        help="Use crawl endpoint instead of map+scrape (not recommended)",
    )

    args = parser.parse_args()

    # Run the async crawl
    asyncio.run(crawl_r2r_docs_fixed(max_depth=args.max_depth, max_pages=args.max_pages))


if __name__ == "__main__":
    main()