"""Firecrawl ingestor for web and documentation sites."""

import asyncio
from collections.abc import AsyncGenerator
from datetime import UTC, datetime
from typing import Any
from uuid import uuid4

from firecrawl import AsyncFirecrawl
from typing_extensions import override

from ..config import get_settings
from ..core.models import (
    Document,
    DocumentMetadata,
    FirecrawlConfig,
    IngestionJob,
    IngestionSource,
)
from .base import BaseIngestor


class FirecrawlIngestor(BaseIngestor):
    """Ingestor for web and documentation sites using Firecrawl."""

    config: FirecrawlConfig
    client: Any  # AsyncFirecrawl client instance

    def __init__(self, config: FirecrawlConfig | None = None):
        """
        Initialize Firecrawl ingestor.

        Args:
            config: Firecrawl configuration (for operational params only)
        """
        self.config = config or FirecrawlConfig()
        settings = get_settings()

        # All connection details come from settings/.env.
        # For self-hosted instances, use a dummy API key if none is provided:
        # the SDK requires an API key even for self-hosted instances.
        api_key = settings.firecrawl_api_key or "no-key-required"

        # AsyncFirecrawl automatically uses v2 endpoints
        self.client = AsyncFirecrawl(api_key=api_key, api_url=str(settings.firecrawl_endpoint))

    @override
    async def ingest(self, job: IngestionJob) -> AsyncGenerator[Document, None]:
        """
        Ingest documents from a web source.

        Args:
            job: The ingestion job configuration

        Yields:
            Documents from the web source
        """
        url = str(job.source_url)

        # First, map the site to understand its structure
        site_map = await self._map_site(url)

        # If map returns empty, just use the main URL
        if not site_map:
            site_map = [url]

        # Process pages in batches
        batch_size = 10
        for i in range(0, len(site_map), batch_size):
            batch_urls = site_map[i : i + batch_size]
            documents = await self._scrape_batch(batch_urls)

            for doc_data in documents:
                yield self._create_document(doc_data, job)

    @override
    async def validate_source(self, source_url: str) -> bool:
        """
        Validate if the web source is accessible.

        Args:
            source_url: URL to validate

        Returns:
            True if source is accessible
        """
        try:
            # Use SDK v2 endpoints for both self-hosted and cloud
            result = await self.client.scrape(source_url, formats=["markdown"])
            return result is not None and hasattr(result, "markdown")
        except Exception:
            return False

    @override
    async def estimate_size(self, source_url: str) -> int:
        """
        Estimate the number of pages in the website.

        Args:
            source_url: URL of the website

        Returns:
            Estimated number of pages
        """
        try:
            site_map = await self._map_site(source_url)
            return len(site_map) if site_map else 0
        except Exception:
            return 0

    async def _map_site(self, url: str) -> list[str]:
        """
        Map a website to get all URLs.

        Args:
            url: Base URL to map

        Returns:
            List of URLs found
        """
        try:
            # Use SDK v2 map endpoint
            result = await self.client.map(url=url, limit=self.config.limit)

            if result and hasattr(result, "links"):
                # Extract URLs from the result
                return [
                    link if isinstance(link, str) else getattr(link, "url", str(link))
                    for link in result.links
                ]
            return []
        except Exception as e:
            # If map fails (might not be available in all versions), fall back to single URL
            import logging

            logging.warning(f"Map endpoint not available or failed: {e}. Using single URL.")
            return [url]

    async def _scrape_batch(self, urls: list[str]) -> list[dict[str, str]]:
        """
        Scrape a batch of URLs.

        Args:
            urls: List of URLs to scrape

        Returns:
            List of scraped documents
        """
        tasks = []
        for url in urls:
            task = self._scrape_single(url)
            tasks.append(task)

        results = await asyncio.gather(*tasks, return_exceptions=True)

        documents = []
        for result in results:
            if isinstance(result, Exception):
                continue
            if result and isinstance(result, dict) and "markdown" in result:
                documents.append(result)

        return documents

    async def _scrape_single(self, url: str) -> dict[str, str]:
        """
        Scrape a single URL.

        Args:
            url: URL to scrape

        Returns:
            Scraped document data
        """
        try:
            # Use SDK v2 scrape endpoint
            result = await self.client.scrape(url, formats=self.config.formats)

            # Extract data from the result
            if result:
                # The SDK returns a ScrapeResult object with markdown and metadata
                metadata = getattr(result, "metadata", {})
                return {
                    "markdown": getattr(result, "markdown", ""),
                    "sourceURL": url,
                    "title": metadata.get("title", "")
                    if isinstance(metadata, dict)
                    else getattr(metadata, "title", ""),
                    "description": metadata.get("description", "")
                    if isinstance(metadata, dict)
                    else getattr(metadata, "description", ""),
                }
            return {}
        except Exception as e:
            import logging

            logging.debug(f"Failed to scrape {url}: {e}")
            return {}

    def _create_document(self, doc_data: dict[str, str], job: IngestionJob) -> Document:
        """
        Create a Document from scraped data.

        Args:
            doc_data: Scraped document data
            job: The ingestion job

        Returns:
            Document instance
        """
        content = doc_data.get("markdown", "")

        metadata: DocumentMetadata = {
            "source_url": doc_data.get("sourceURL", str(job.source_url)),
            "title": doc_data.get("title"),
            "description": doc_data.get("description"),
            "timestamp": datetime.now(UTC),
            "content_type": "text/markdown",
            "word_count": len(content.split()),
            "char_count": len(content),
        }

        return Document(
            id=uuid4(),
            content=content,
            metadata=metadata,
            source=IngestionSource.WEB,
            collection=job.storage_backend.value,
        )
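

# Usage sketch, kept as a comment so nothing runs on import. It is a minimal
# illustration, not part of the module's API: it only relies on the
# `source_url` and `storage_backend` fields of IngestionJob that are already
# used above; constructing the job itself depends on the actual model
# definition and is not shown here.
#
#     async def run(job: IngestionJob) -> list[Document]:
#         ingestor = FirecrawlIngestor()
#         if not await ingestor.validate_source(str(job.source_url)):
#             raise ValueError(f"Source not reachable: {job.source_url}")
#         # ingest() is an async generator, so collect it with an async comprehension
#         return [doc async for doc in ingestor.ingest(job)]
#
#     # documents = asyncio.run(run(job))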