# biz-bud/tests/unit_tests/nodes/rag/test_check_duplicate.py

"""Unit tests for R2R document deduplication and skip logic."""

from datetime import datetime, timedelta, timezone
from unittest.mock import AsyncMock, Mock, patch

import pytest

from biz_bud.nodes.rag.agent_nodes import check_existing_content_node
from biz_bud.states.rag_agent import RAGAgentState


class TestR2RDeduplication:
    """Exercise the duplicate check done by ``check_existing_content_node``."""

    @pytest.fixture
    def mock_vector_store(self):
        """Yield a mocked vector store handed out by a patched ServiceFactory."""
        async_method_names = (
            "search_by_metadata",
            "semantic_search",
            "initialize",
            "cleanup",
            "initialize_collection",
        )
        with patch("biz_bud.services.factory.ServiceFactory") as factory_cls:
            store = Mock()
            # Every store method the node may await has to be an AsyncMock.
            for method_name in async_method_names:
                setattr(store, method_name, AsyncMock())
            factory = Mock()
            factory.get_vector_store = AsyncMock(return_value=store)
            factory_cls.return_value = factory
            yield store

    @pytest.fixture
    def base_state(self) -> RAGAgentState:
        """Build a minimal RAG agent state that targets the example page."""
        from tests.helpers.factories.state_factories import (
            create_minimal_rag_agent_state,
        )

        service_settings = {
            "vector_store_provider": "qdrant",
            "embedding_provider": "openai",
        }
        return create_minimal_rag_agent_state(
            input_url="https://example.com/page",
            config={"services": service_settings},
        )

    @pytest.mark.asyncio
    async def test_skip_if_url_already_ingested(self, mock_vector_store, base_state):
        """A stored match for the URL is surfaced with a content age of zero."""
        page_url = "https://example.com/page"
        stored_hit = {
            "metadata": {
                "source_url": page_url,
                "indexed_at": datetime.now(timezone.utc).isoformat(),
                "content": "Existing content",
            },
            "score": 1.0,
        }
        mock_vector_store.semantic_search.return_value = [stored_hit]

        outcome = await check_existing_content_node(base_state)

        # The node reports the hit, a URL hash and a freshly-indexed age.
        assert outcome["existing_content"] is not None
        assert outcome["url_hash"] is not None
        assert outcome["content_age_days"] == 0

        # Exactly one search, filtered on the URL taken from the state.
        search = mock_vector_store.semantic_search
        search.assert_called_once()
        assert search.call_args.kwargs["filters"]["source_url"] == page_url

    @pytest.mark.asyncio
    async def test_proceed_if_url_not_found(self, mock_vector_store, base_state):
        """An empty search result leaves content and age unset."""
        mock_vector_store.semantic_search.return_value = []

        outcome = await check_existing_content_node(base_state)

        assert outcome["existing_content"] is None
        assert outcome["content_age_days"] is None
        assert outcome["rag_status"] == "checking"

    @pytest.mark.asyncio
    async def test_content_age_calculation(self, mock_vector_store, base_state):
        """Content indexed five days ago is reported as five days old."""
        five_days_ago = datetime.now(timezone.utc) - timedelta(days=5)
        stored_hit = {
            "metadata": {
                "source_url": "https://example.com/page",
                "indexed_at": five_days_ago.isoformat(),
                "content": "Old content",
            },
            "score": 1.0,
        }
        mock_vector_store.semantic_search.return_value = [stored_hit]

        outcome = await check_existing_content_node(base_state)

        assert outcome["content_age_days"] == 5

    @pytest.mark.asyncio
    async def test_create_document_if_collection_exists_but_url_not_found(
        self, mock_vector_store, base_state
    ):
        """With no stored document, the result carries an explicit ``None``."""
        # Creation itself is left to the upload flow (upload_to_r2r_node);
        # this test only covers the check that precedes it.
        mock_vector_store.semantic_search.return_value = []

        outcome = await check_existing_content_node(base_state)

        # The key must be present and empty so the caller proceeds to create.
        assert outcome["existing_content"] is None
        assert "existing_content" in outcome


class TestR2RUrlVariations:
    """Test how URL variations are looked up during the duplicate check."""

    @pytest.fixture
    def mock_vector_store(self):
        """Yield a mock vector store served by a patched ``ServiceFactory``."""
        with patch("biz_bud.services.factory.ServiceFactory") as mock_factory:
            mock_store = Mock()
            # Make all async methods AsyncMock so the node can await them.
            mock_store.search_by_metadata = AsyncMock()
            mock_store.semantic_search = AsyncMock()
            mock_store.initialize = AsyncMock()
            mock_store.cleanup = AsyncMock()
            mock_store.initialize_collection = AsyncMock()
            mock_instance = Mock()
            mock_instance.get_vector_store = AsyncMock(return_value=mock_store)
            mock_factory.return_value = mock_instance
            yield mock_store

    @pytest.mark.asyncio
    async def test_normalize_urls_for_dedup(self, mock_vector_store):
        """Test that each URL variation is searched for exactly as given.

        The assertion pins the lookup filter: ``source_url`` must equal the
        input URL verbatim, so trailing-slash, ``www.`` and scheme variants
        are each searched under their own spelling.
        """
        from tests.helpers.factories.state_factories import (
            create_minimal_rag_agent_state,
        )

        url_variations = [
            "https://example.com/page",
            "https://example.com/page/",
            "https://www.example.com/page",
            "http://example.com/page",
        ]

        for url in url_variations:
            state: RAGAgentState = create_minimal_rag_agent_state(
                input_url=url,
                config={"services": {}},
            )

            # Fresh empty result per iteration: no stored document matches.
            mock_vector_store.semantic_search.return_value = []
            await check_existing_content_node(state)

            # call_args holds the most recent call, i.e. the one for this URL.
            call_args = mock_vector_store.semantic_search.call_args
            searched_url = call_args[1]["filters"]["source_url"]
            assert searched_url == url  # Should search for exact URL

    @pytest.mark.asyncio
    async def test_page_name_matching(self, mock_vector_store):
        """Test the check when a same-titled page exists under another URL.

        Only verifies that at least one search is issued; whether a second,
        title-based search takes place is not asserted here.
        """
        # Setup - first search finds nothing, a second one would return a
        # document with the same title stored under a different URL.
        mock_vector_store.semantic_search.side_effect = [
            [],  # No URL match
            [
                {  # Title match
                    "metadata": {
                        "source_url": "https://old.example.com/different-path",
                        "title": "Installation Guide",
                        "indexed_at": datetime.now(timezone.utc).isoformat(),
                    },
                    "score": 0.95,
                }
            ],
        ]

        from tests.helpers.factories.state_factories import (
            create_minimal_rag_agent_state,
        )

        state: RAGAgentState = create_minimal_rag_agent_state(
            input_url="https://new.example.com/install",
            query="Installation Guide",  # Page title
            config={"services": {}},
        )

        # Execute
        await check_existing_content_node(state)

        # At least the URL search must have run; the queued title match is
        # available to a follow-up search but is not required by this test.
        assert mock_vector_store.semantic_search.call_count >= 1


class TestCollectionNameValidation:
    """Checks for ``validate_collection_name``."""

    def test_validate_collection_name_valid_input(self):
        """Names that are already valid come back unchanged."""
        from biz_bud.nodes.rag.check_duplicate import validate_collection_name

        for name in ("myproject", "my-project", "my_project", "project123"):
            assert validate_collection_name(name) == name

    def test_validate_collection_name_sanitization(self):
        """Disallowed characters become underscores and case is lowered."""
        from biz_bud.nodes.rag.check_duplicate import validate_collection_name

        expectations = (
            ("My Project!", "my_project_"),
            ("project@#$%", "project____"),
            ("UPPERCASE", "uppercase"),
            ("with spaces", "with_spaces"),
        )
        for raw_name, sanitized in expectations:
            assert validate_collection_name(raw_name) == sanitized

    def test_validate_collection_name_empty_or_none(self):
        """``None``, empty and whitespace-only names all yield ``None``."""
        from biz_bud.nodes.rag.check_duplicate import validate_collection_name

        for blank in (None, "", "   "):
            assert validate_collection_name(blank) is None

    def test_validate_collection_name_edge_cases(self):
        """Cover all-symbol names, blank names and surrounding whitespace."""
        from biz_bud.nodes.rag.check_duplicate import validate_collection_name

        # A name made only of invalid characters turns into underscores.
        assert validate_collection_name("!@#$%") == "_____"
        # A whitespace-only name is treated as missing.
        assert validate_collection_name("     ") is None

        # Leading and trailing whitespace is stripped from the name.
        for padded in ("  project  ", "\tproject\n"):
            assert validate_collection_name(padded) == "project"