Files
biz-bud/tests/unit_tests/nodes/rag/test_check_duplicate.py
Travis Vasceannie fe1636b99a route-n-plan (#44)
* fixed blocking call

* fixed blocking call

* fixed r2r flows

* fastapi wrapper and containerization

* chore: add langgraph-checkpoint-postgres as a dependency in pyproject.toml

- Included "langgraph-checkpoint-postgres>=2.0.23" in the dependencies section to enhance project capabilities.

* feat: add .env.example for environment variable configuration

- Introduced a new .env.example file to provide a template for required and optional API keys.
- Updated .env.production to ensure consistent formatting.
- Enhanced deploy.sh with a project name variable and improved health check logic.
- Modified docker-compose.production.yml to enforce required POSTGRES_PASSWORD environment variable.
- Updated README.md and devcontainer scripts to reflect changes in .env file creation.
- Improved code formatting and consistency across various files.

* fix: update .gitignore and clean up imports in webapp.py and rag_agent.py

- Modified .gitignore to include task files for better organization.
- Cleaned up unused imports and improved function calls in webapp.py for better readability.
- Updated rag_agent.py to streamline import statements and enhance type safety in function definitions.
- Refactored validation logic in check_duplicate.py to simplify checks for sanitized names.

* Update src/biz_bud/webapp.py

Co-authored-by: qodo-merge-pro[bot] <151058649+qodo-merge-pro[bot]@users.noreply.github.com>

* Update src/biz_bud/agents/rag/retriever.py

Co-authored-by: qodo-merge-pro[bot] <151058649+qodo-merge-pro[bot]@users.noreply.github.com>

* Update Dockerfile.production

Co-authored-by: qodo-merge-pro[bot] <151058649+qodo-merge-pro[bot]@users.noreply.github.com>

* Update packages/business-buddy-tools/src/bb_tools/r2r/tools.py

Co-authored-by: qodo-merge-pro[bot] <151058649+qodo-merge-pro[bot]@users.noreply.github.com>

* Update src/biz_bud/agents/rag_agent.py

Co-authored-by: qodo-merge-pro[bot] <151058649+qodo-merge-pro[bot]@users.noreply.github.com>

* feat: add BaseCheckpointSaver interface documentation and enhance singleton pattern guidelines

- Introduced new documentation for the BaseCheckpointSaver interface, detailing core methods for checkpoint management.
- Updated check_singletons.md to include additional singleton patterns and best practices for resource management.
- Enhanced error handling in create_research_graph to log failures when creating the Postgres checkpointer.

---------

Co-authored-by: qodo-merge-pro[bot] <151058649+qodo-merge-pro[bot]@users.noreply.github.com>
2025-07-17 18:32:58 -04:00

267 lines
10 KiB
Python

"""Unit tests for R2R document deduplication and skip logic."""
from datetime import datetime, timedelta, timezone
from unittest.mock import AsyncMock, Mock, patch
import pytest
from biz_bud.nodes.rag.agent_nodes import check_existing_content_node
from biz_bud.states.rag_agent import RAGAgentState
class TestR2RDeduplication:
    """Exercise ``check_existing_content_node`` against a mocked vector store."""

    @pytest.fixture
    def mock_vector_store(self):
        """Yield a vector-store double served through a patched ``ServiceFactory``."""
        store = Mock()
        # Each coroutine method on the store gets its own AsyncMock so it
        # can be awaited.
        for coroutine_name in (
            "search_by_metadata",
            "semantic_search",
            "initialize",
            "cleanup",
            "initialize_collection",
        ):
            setattr(store, coroutine_name, AsyncMock())

        factory_instance = Mock()
        factory_instance.get_vector_store = AsyncMock(return_value=store)

        with patch("biz_bud.services.factory.ServiceFactory") as factory_cls:
            factory_cls.return_value = factory_instance
            yield store

    @pytest.fixture
    def base_state(self) -> RAGAgentState:
        """Build a minimal RAG agent state for ``https://example.com/page``."""
        from tests.helpers.factories.state_factories import (
            create_minimal_rag_agent_state,
        )

        service_settings = {
            "vector_store_provider": "qdrant",
            "embedding_provider": "openai",
        }
        return create_minimal_rag_agent_state(
            input_url="https://example.com/page",
            config={"services": service_settings},
        )

    @pytest.mark.asyncio
    async def test_skip_if_url_already_ingested(self, mock_vector_store, base_state):
        """A search hit indexed just now is reported as existing content, age 0."""
        # Arrange: the store already holds this URL.
        indexed_hit = {
            "metadata": {
                "source_url": "https://example.com/page",
                "indexed_at": datetime.now(timezone.utc).isoformat(),
                "content": "Existing content",
            },
            "score": 1.0,
        }
        mock_vector_store.semantic_search.return_value = [indexed_hit]

        # Act
        outcome = await check_existing_content_node(base_state)

        # Assert
        assert outcome["existing_content"] is not None
        assert outcome["url_hash"] is not None
        assert outcome["content_age_days"] == 0

        # The single search must have filtered on the state's URL.
        search = mock_vector_store.semantic_search
        search.assert_called_once()
        search_filters = search.call_args.kwargs["filters"]
        assert search_filters["source_url"] == "https://example.com/page"

    @pytest.mark.asyncio
    async def test_proceed_if_url_not_found(self, mock_vector_store, base_state):
        """An empty search result leaves no existing content and no age."""
        # Arrange: the store has nothing for this URL.
        mock_vector_store.semantic_search.return_value = []

        # Act
        outcome = await check_existing_content_node(base_state)

        # Assert
        assert outcome["existing_content"] is None
        assert outcome["content_age_days"] is None
        assert outcome["rag_status"] == "checking"

    @pytest.mark.asyncio
    async def test_content_age_calculation(self, mock_vector_store, base_state):
        """Content indexed five days ago is reported with an age of 5 days."""
        # Arrange: the hit was indexed five days before now.
        five_days_ago = datetime.now(timezone.utc) - timedelta(days=5)
        stale_hit = {
            "metadata": {
                "source_url": "https://example.com/page",
                "indexed_at": five_days_ago.isoformat(),
                "content": "Old content",
            },
            "score": 1.0,
        }
        mock_vector_store.semantic_search.return_value = [stale_hit]

        # Act
        outcome = await check_existing_content_node(base_state)

        # Assert
        assert outcome["content_age_days"] == 5

    @pytest.mark.asyncio
    async def test_create_document_if_collection_exists_but_url_not_found(
        self, mock_vector_store, base_state
    ):
        """With no stored document, the node reports ``existing_content`` as None."""
        # The main upload flow handles creation: when the URL check finds
        # nothing, the document is created in the existing collection.

        # Arrange: no existing document.
        mock_vector_store.semantic_search.return_value = []

        # Act
        outcome = await check_existing_content_node(base_state)

        # Assert: the key is present and explicitly None, i.e. proceed.
        assert outcome["existing_content"] is None
        assert "existing_content" in outcome
        # Actual document creation happens in upload_to_r2r_node,
        # which will use the existing collection.
class TestR2RUrlVariations:
    """Test handling of URL variations and duplicates."""

    @pytest.fixture
    def mock_vector_store(self):
        """Mock vector store served through a patched ``ServiceFactory``."""
        with patch("biz_bud.services.factory.ServiceFactory") as mock_factory:
            mock_store = Mock()
            # Make all async methods AsyncMock
            mock_store.search_by_metadata = AsyncMock()
            mock_store.semantic_search = AsyncMock()
            mock_store.initialize = AsyncMock()
            mock_store.cleanup = AsyncMock()
            mock_store.initialize_collection = AsyncMock()
            mock_instance = Mock()
            mock_instance.get_vector_store = AsyncMock(return_value=mock_store)
            mock_factory.return_value = mock_instance
            yield mock_store

    @pytest.mark.asyncio
    async def test_normalize_urls_for_dedup(self, mock_vector_store):
        """Test that every URL variant is searched by its exact, unmodified URL.

        Despite the test's name, no normalisation is asserted: for each
        variant (trailing slash, ``www.`` prefix, ``http`` scheme) the
        ``source_url`` filter passed to ``semantic_search`` must equal the
        input URL itself.
        """
        # Imported once up front instead of on every loop iteration.
        from tests.helpers.factories.state_factories import (
            create_minimal_rag_agent_state,
        )

        url_variations = [
            "https://example.com/page",
            "https://example.com/page/",
            "https://www.example.com/page",
            "http://example.com/page",
        ]
        # None of the variants is already indexed (loop-invariant setup).
        mock_vector_store.semantic_search.return_value = []

        for url in url_variations:
            state: RAGAgentState = create_minimal_rag_agent_state(
                input_url=url,
                config={"services": {}},
            )
            await check_existing_content_node(state)

            # Check the URL used in the most recent search.
            call_args = mock_vector_store.semantic_search.call_args
            # Fail with a readable message rather than a TypeError on None[1].
            assert call_args is not None, f"semantic_search was not called for {url}"
            searched_url = call_args[1]["filters"]["source_url"]
            assert searched_url == url  # Should search for exact URL

    @pytest.mark.asyncio
    async def test_page_name_matching(self, mock_vector_store):
        """Test the lookup when a same-titled page exists under another URL.

        The mock answers the first search with no results and a second
        search with a title match. The test only requires that at least one
        search was issued; it does not pin down whether the second (title)
        search happens or what the node returns.
        """
        # Setup - document with same title exists
        mock_vector_store.semantic_search.side_effect = [
            [],  # No URL match
            [
                {  # Title match
                    "metadata": {
                        "source_url": "https://old.example.com/different-path",
                        "title": "Installation Guide",
                        "indexed_at": datetime.now(timezone.utc).isoformat(),
                    },
                    "score": 0.95,
                }
            ],
        ]
        from tests.helpers.factories.state_factories import (
            create_minimal_rag_agent_state,
        )

        state: RAGAgentState = create_minimal_rag_agent_state(
            input_url="https://new.example.com/install",
            query="Installation Guide",  # Page title
            config={"services": {}},
        )
        # Execute
        await check_existing_content_node(state)
        # At least one search must have run; a follow-up title search is
        # allowed but not required by this assertion.
        assert mock_vector_store.semantic_search.call_count >= 1
class TestCollectionNameValidation:
    """Checks for ``validate_collection_name``."""

    def test_validate_collection_name_valid_input(self):
        """Names that are already valid come back unchanged."""
        from biz_bud.nodes.rag.check_duplicate import validate_collection_name

        already_valid = ("myproject", "my-project", "my_project", "project123")
        for name in already_valid:
            assert validate_collection_name(name) == name

    def test_validate_collection_name_sanitization(self):
        """Names are lowercased and disallowed characters become underscores."""
        from biz_bud.nodes.rag.check_duplicate import validate_collection_name

        # raw input -> expected sanitized form
        expectations = {
            "My Project!": "my_project_",
            "project@#$%": "project____",
            "UPPERCASE": "uppercase",
            "with spaces": "with_spaces",
        }
        for raw, cleaned in expectations.items():
            assert validate_collection_name(raw) == cleaned

    def test_validate_collection_name_empty_or_none(self):
        """``None``, empty and whitespace-only names all yield ``None``."""
        from biz_bud.nodes.rag.check_duplicate import validate_collection_name

        for blank in (None, "", " "):
            assert validate_collection_name(blank) is None

    def test_validate_collection_name_edge_cases(self):
        """Edge cases: all-invalid input, whitespace-only, surrounding whitespace."""
        from biz_bud.nodes.rag.check_duplicate import validate_collection_name

        # A name made only of invalid characters collapses to underscores.
        assert validate_collection_name("!@#$%") == "_____"
        # A whitespace-only name is treated as missing.
        assert validate_collection_name(" ") is None
        # Surrounding whitespace is trimmed before sanitization.
        for padded in (" project ", "\tproject\n"):
            assert validate_collection_name(padded) == "project"