"""Unit tests for R2R document deduplication and skip logic."""
|
|
|
|
from datetime import datetime, timedelta, timezone
|
|
from unittest.mock import AsyncMock, Mock, patch
|
|
|
|
import pytest
|
|
|
|
from biz_bud.nodes.rag.agent_nodes import check_existing_content_node
|
|
from biz_bud.states.rag_agent import RAGAgentState
|
|
|
|
|
|
class TestR2RDeduplication:
|
|
"""Test R2R deduplication logic."""
|
|
|
|
@pytest.fixture
|
|
def mock_vector_store(self):
|
|
"""Mock vector store for deduplication checks."""
        with patch("biz_bud.services.factory.ServiceFactory") as mock_factory:
            mock_store = Mock()
            # Make all async methods AsyncMock
            mock_store.search_by_metadata = AsyncMock()
            mock_store.semantic_search = AsyncMock()
            mock_store.initialize = AsyncMock()
            mock_store.cleanup = AsyncMock()
            mock_store.initialize_collection = AsyncMock()

            mock_instance = Mock()
            mock_instance.get_vector_store = AsyncMock(return_value=mock_store)
            mock_factory.return_value = mock_instance

            yield mock_store

    @pytest.fixture
    def base_state(self) -> RAGAgentState:
        """Base state for testing using factory."""
        from tests.helpers.factories.state_factories import (
            create_minimal_rag_agent_state,
        )

        return create_minimal_rag_agent_state(
            input_url="https://example.com/page",
            config={
                "services": {
                    "vector_store_provider": "qdrant",
                    "embedding_provider": "openai",
                }
            },
        )

    @pytest.mark.asyncio
    async def test_skip_if_url_already_ingested(self, mock_vector_store, base_state):
        """Test that crawl/scrape is skipped if URL already in R2R."""
        # Setup - URL already exists
        mock_vector_store.semantic_search.return_value = [
            {
                "metadata": {
                    "source_url": "https://example.com/page",
                    "indexed_at": datetime.now(timezone.utc).isoformat(),
                    "content": "Existing content",
                },
                "score": 1.0,
            }
        ]

        # Execute
        result = await check_existing_content_node(base_state)

        # Verify
        assert result["existing_content"] is not None
        assert result["url_hash"] is not None
        assert result["content_age_days"] == 0

        # Check semantic search was called with correct URL
        mock_vector_store.semantic_search.assert_called_once()
        call_args = mock_vector_store.semantic_search.call_args
        assert call_args[1]["filters"]["source_url"] == "https://example.com/page"
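        # call_args is (args, kwargs) for the last call, so call_args[1] is the
        # kwargs dict; only the "filters" keyword is asserted on, leaving the
        # query text unconstrained.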

    @pytest.mark.asyncio
    async def test_proceed_if_url_not_found(self, mock_vector_store, base_state):
        """Test that crawl proceeds if URL not in R2R."""
        # Setup - URL doesn't exist
        mock_vector_store.semantic_search.return_value = []

        # Execute
        result = await check_existing_content_node(base_state)

        # Verify
        assert result["existing_content"] is None
        assert result["content_age_days"] is None
        assert result["rag_status"] == "checking"

    @pytest.mark.asyncio
    async def test_content_age_calculation(self, mock_vector_store, base_state):
        """Test correct calculation of content age."""
        # Setup - content is 5 days old
        created_date = datetime.now(timezone.utc) - timedelta(days=5)
        mock_vector_store.semantic_search.return_value = [
            {
                "metadata": {
                    "source_url": "https://example.com/page",
                    "indexed_at": created_date.isoformat(),
                    "content": "Old content",
                },
                "score": 1.0,
            }
        ]

        # Execute
        result = await check_existing_content_node(base_state)

        # Verify
        assert result["content_age_days"] == 5

    @pytest.mark.asyncio
    async def test_create_document_if_collection_exists_but_url_not_found(
        self, mock_vector_store, base_state
    ):
        """Test document creation when collection exists but URL is new."""
        # This is handled by the main upload flow - if the URL check returns None,
        # the document will be created in the existing collection.

        # Setup - no existing document
        mock_vector_store.semantic_search.return_value = []

        # Execute check
        result = await check_existing_content_node(base_state)

        # Verify it indicates we should proceed with creation
        assert result["existing_content"] is None
        assert "existing_content" in result

        # The actual document creation happens in upload_to_r2r_node,
        # which will use the existing collection.


class TestR2RUrlVariations:
    """Test handling of URL variations and duplicates."""

    @pytest.fixture
    def mock_vector_store(self):
        """Mock vector store."""
        with patch("biz_bud.services.factory.ServiceFactory") as mock_factory:
            mock_store = Mock()
            # Make all async methods AsyncMock
            mock_store.search_by_metadata = AsyncMock()
            mock_store.semantic_search = AsyncMock()
            mock_store.initialize = AsyncMock()
            mock_store.cleanup = AsyncMock()
            mock_store.initialize_collection = AsyncMock()

            mock_instance = Mock()
            mock_instance.get_vector_store = AsyncMock(return_value=mock_store)
            mock_factory.return_value = mock_instance

            yield mock_store

    @pytest.mark.asyncio
    async def test_normalize_urls_for_dedup(self, mock_vector_store):
        """Test that similar URLs are properly deduplicated."""
        url_variations = [
            "https://example.com/page",
            "https://example.com/page/",
            "https://www.example.com/page",
            "http://example.com/page",
        ]

        # All should generate same hash for deduplication
        _hashes = set()
        for url in url_variations:
            from tests.helpers.factories.state_factories import (
                create_minimal_rag_agent_state,
            )

            state: RAGAgentState = create_minimal_rag_agent_state(
                input_url=url,
                config={"services": {}},
            )

            mock_vector_store.semantic_search.return_value = []
            _result = await check_existing_content_node(state)

            # URLs with/without trailing slash should match
            normalized_url = url.rstrip("/")
            if normalized_url.startswith("https://www."):
                normalized_url = normalized_url.replace("https://www.", "https://")

            # Check the URL used in search
            call_args = mock_vector_store.semantic_search.call_args
            searched_url = call_args[1]["filters"]["source_url"]
            assert searched_url == url  # Should search for exact URL
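            # Note: normalized_url above documents the intended normalization
            # (trailing slash and "www." stripped); if the node starts
            # normalizing before lookup, compare against normalized_url here
            # instead of the raw url.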

    @pytest.mark.asyncio
    async def test_page_name_matching(self, mock_vector_store):
        """Test matching by page name when URL differs."""
        # Setup - document with same title exists
        mock_vector_store.semantic_search.side_effect = [
            [],  # No URL match
            [
                {  # Title match
                    "metadata": {
                        "source_url": "https://old.example.com/different-path",
                        "title": "Installation Guide",
                        "indexed_at": datetime.now(timezone.utc).isoformat(),
                    },
                    "score": 0.95,
                }
            ],
        ]

        from tests.helpers.factories.state_factories import (
            create_minimal_rag_agent_state,
        )

        state: RAGAgentState = create_minimal_rag_agent_state(
            input_url="https://new.example.com/install",
            query="Installation Guide",  # Page title
            config={"services": {}},
        )

        # Execute
        _result = await check_existing_content_node(state)

        # Should have searched by both URL and title
        assert mock_vector_store.semantic_search.call_count >= 1


class TestCollectionNameValidation:
    """Test collection name validation functionality."""

    def test_validate_collection_name_valid_input(self):
        """Test validation with valid collection names."""
        from biz_bud.nodes.rag.check_duplicate import validate_collection_name

        # Valid names that should pass through with minimal changes
        assert validate_collection_name("myproject") == "myproject"
        assert validate_collection_name("my-project") == "my-project"
        assert validate_collection_name("my_project") == "my_project"
        assert validate_collection_name("project123") == "project123"

    def test_validate_collection_name_sanitization(self):
        """Test that invalid characters are properly sanitized."""
        from biz_bud.nodes.rag.check_duplicate import validate_collection_name

        # Invalid characters should be replaced with underscores
        assert validate_collection_name("My Project!") == "my_project_"
        assert validate_collection_name("project@#$%") == "project____"
        assert validate_collection_name("UPPERCASE") == "uppercase"
        assert validate_collection_name("with spaces") == "with_spaces"

    def test_validate_collection_name_empty_or_none(self):
        """Test handling of empty or None collection names."""
        from biz_bud.nodes.rag.check_duplicate import validate_collection_name

        # None and empty strings should return None
        assert validate_collection_name(None) is None
        assert validate_collection_name("") is None
        assert validate_collection_name(" ") is None

    def test_validate_collection_name_edge_cases(self):
        """Test edge cases for collection name validation."""
        from biz_bud.nodes.rag.check_duplicate import validate_collection_name

        # Names that become underscores after sanitization
        assert validate_collection_name("!@#$%") == "_____"
        # Names that are only whitespace should return None
        assert validate_collection_name(" ") is None

        # Names with whitespace that should be trimmed
        assert validate_collection_name(" project ") == "project"
        assert validate_collection_name("\tproject\n") == "project"
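

# Illustrative sketch only: the helper below is inferred from the expectations
# in TestCollectionNameValidation and is NOT the implementation exported by
# biz_bud.nodes.rag.check_duplicate. The name _reference_validate_collection_name
# is hypothetical.
def _reference_validate_collection_name(name):
    """Sketch of the implied rule: trim, lowercase, sanitize to [a-z0-9_-].

    Returns None for None, empty, or whitespace-only input.
    """
    import re

    if name is None or not name.strip():
        return None
    # Any character outside lowercase letters, digits, "_" and "-" becomes "_".
    return re.sub(r"[^a-z0-9_-]", "_", name.strip().lower())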