* feat: enhance coverage reporting and improve tool configuration
  - Added support for JSON coverage reports in pyproject.toml.
  - Updated .gitignore to include coverage.json and task files for better management.
  - Introduced a new Type Safety Audit Report to document findings and recommendations for type safety improvements.
  - Created a comprehensive coverage configuration guide to assist in understanding coverage reporting setup.
  - Refactored tools configuration to utilize environment variables for concurrent scraping settings.

  These changes improve the project's testing and reporting capabilities while enhancing overall code quality and maintainability.

* feat: enhance configuration handling and improve error logging
  - Introduced a new utility function `_get_env_int` for robust environment variable integer retrieval with validation (sketched below).
  - Updated `WebToolsConfig` and `ToolsConfigModel` to utilize the new utility for environment variable defaults.
  - Enhanced logging in `CircuitBreaker` to provide detailed state transition information.
  - Improved URL handling in `url_analyzer.py` for better file extension extraction and normalization.
  - Added type validation and logging in `SecureInputMixin` to ensure input sanitization and validation consistency.

  These changes improve the reliability and maintainability of configuration management and error handling across the codebase.

* refactor: update imports and enhance .gitignore for improved organization
  - Updated import paths in various example scripts to reflect the new structure under `biz_bud`.
  - Enhanced .gitignore to include clearer formatting for task files.
  - Removed obsolete function calls and improved error handling in several scripts.
  - Added public alias for backward compatibility in `upload_r2r.py`.

  These changes improve code organization, maintainability, and compatibility across the project.

* refactor: update graph paths in langgraph.json for improved organization
  - Changed paths for research, catalog, paperless, and url_to_r2r graphs to reflect the new directory structure.
  - Added new entries for analysis and scraping graphs to enhance functionality.

  These changes improve the organization and maintainability of the graph configurations.

* fix: enhance validation and error handling in date range and scraping functions
  - Updated date validation in UserFiltersModel to ensure date values are strings.
  - Improved error messages in create_scraped_content_dict to clarify conditions for success and failure.
  - Enhanced test coverage for date validation and scraping content creation to ensure robustness.

  These changes improve input validation and error handling across the application, enhancing overall reliability.

* refactor: streamline graph creation and enhance type annotations in examples
  - Simplified graph creation in `catalog_ingredient_research_example.py` and `catalog_tech_components_example.py` by directly compiling the graph.
  - Updated type annotations in `catalog_intel_with_config.py` for improved clarity and consistency.
  - Enhanced error handling in catalog data processing to ensure robustness against unexpected data types.

  These changes improve code readability, maintainability, and error resilience across example scripts.
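The `_get_env_int` helper mentioned above is not shown in this file. A minimal sketch of what a validated integer lookup like this might look like -- the function name comes from the log, but the signature, defaults, and the `MAX_CONCURRENT_SCRAPES` usage are assumptions, not the project's actual implementation:

```python
import logging
import os

logger = logging.getLogger(__name__)


def _get_env_int(name: str, default: int, minimum: int = 1) -> int:
    """Read an integer environment variable, falling back to a default on bad input.

    Hypothetical sketch; the real helper in this project may validate differently.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    try:
        value = int(raw)
    except ValueError:
        logger.warning("Invalid integer for %s=%r, using default %d", name, raw, default)
        return default
    if value < minimum:
        logger.warning("%s=%d is below minimum %d, using default %d", name, value, minimum, default)
        return default
    return value


# Illustrative only: a concurrent-scraping default sourced from the environment.
MAX_CONCURRENT_SCRAPES = _get_env_int("MAX_CONCURRENT_SCRAPES", default=5)
```

A helper like this keeps misconfigured environments from crashing config loading: bad or out-of-range values are logged and replaced with a safe default rather than raising at import time.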
* Update src/biz_bud/nodes/extraction/extractors.py
  Co-authored-by: sourcery-ai[bot] <58596630+sourcery-ai[bot]@users.noreply.github.com>
* Update src/biz_bud/core/validation/pydantic_models.py
  Co-authored-by: sourcery-ai[bot] <58596630+sourcery-ai[bot]@users.noreply.github.com>
* refactor: migrate Jina and Tavily clients to use ServiceFactory dependency injection
* refactor: migrate URL processing to provider-based architecture with improved error handling
* feat: add FirecrawlApp compatibility classes and mock implementations
* fix: add thread-safe locking to LazyLoader factory management (sketched below)
* feat: implement service restart and refactor cache decorator helpers
* refactor: move r2r_direct_api_call to tools.clients.r2r_utils and improve HTTP service error handling
* chore: update Sonar task IDs in report configuration

---------

Co-authored-by: sourcery-ai[bot] <58596630+sourcery-ai[bot]@users.noreply.github.com>
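For the thread-safe locking item above, a minimal sketch of guarding a lazy factory registry with a lock; the class shape, method names, and double-checked fast path are illustrative assumptions, not the project's actual `LazyLoader` API:

```python
import threading
from typing import Any, Callable, Dict


class LazyLoader:
    """Illustrative lazy factory registry; instance creation is guarded by a lock."""

    def __init__(self) -> None:
        self._lock = threading.Lock()
        self._factories: Dict[str, Callable[[], Any]] = {}
        self._instances: Dict[str, Any] = {}

    def register(self, name: str, factory: Callable[[], Any]) -> None:
        # Registration mutates shared state, so it also takes the lock.
        with self._lock:
            self._factories[name] = factory

    def get(self, name: str) -> Any:
        # Fast path: already-built instances are returned without locking.
        if name in self._instances:
            return self._instances[name]
        # Slow path: build under the lock so concurrent callers cannot
        # invoke the same factory twice.
        with self._lock:
            if name not in self._instances:
                self._instances[name] = self._factories[name]()
            return self._instances[name]
```

The point of the lock is that two threads requesting the same service at the same time resolve to a single instance instead of racing the factory call.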
"""Test script to verify RAG agent uses enhanced Firecrawl tools."""

import asyncio
import os
from pprint import pprint

# from biz_bud.agents.rag_agent import process_url_with_dedup  # Module deleted
from biz_bud.core.config.loader import load_config_async


async def test_rag_agent_with_firecrawl():
    """Test the RAG agent with different URL types."""
    # Load configuration
    _ = await load_config_async()  # Configuration loaded but not used in this example
    # config_dict = config.model_dump()  # Would be used for agent configuration

    # Test URLs
    test_urls = [
        # Documentation site - should use deeper crawling
        "https://docs.python.org/3/tutorial/",
        # Regular website - should use standard settings
        "https://example.com",
        # GitHub repo - should be routed to repomix
        "https://github.com/langchain-ai/langchain",
    ]

    for url in test_urls:
        print(f"\n{'=' * 60}")
        print(f"Testing URL: {url}")
        print("=" * 60)

        try:
            # Note: process_url_with_dedup has been moved to the nodes/graphs architecture
            # This example needs to be updated to use the new workflow
            print("This example needs updating to use the new nodes/graphs architecture")
            result = {"rag_status": "skipped", "should_process": False, "processing_reason": "function deprecated"}

            # Show key results
            print(f"\nProcessing Status: {result.get('rag_status')}")
            print(f"Should Process: {result.get('should_process')}")
            print(f"Processing Reason: {result.get('processing_reason')}")

            # Show optimized parameters
            if result.get("scrape_params"):
                print("\nOptimized Scrape Parameters:")
                pprint(result["scrape_params"])

            if result.get("r2r_params"):
                print("\nOptimized R2R Parameters:")
                pprint(result["r2r_params"])

            # Show processing result
            if result.get("processing_result"):
                processing_result = result["processing_result"]
                if processing_result:
                    # Only call .get() if processing_result is a dictionary
                    if isinstance(processing_result, dict):
                        if processing_result.get("skipped"):
                            print(f"\nSkipped: {processing_result.get('reason')}")
                        else:
                            print("\nProcessed Successfully!")
                            if processing_result.get("scraped_content"):
                                print(f"Pages scraped: {len(processing_result['scraped_content'])}")
                            if processing_result.get("r2r_dataset_id"):
                                print(f"R2R dataset: {processing_result['r2r_dataset_id']}")
                    else:
                        # Handle non-dict processing results
                        print(f"\nProcessing result: {processing_result}")

        except Exception as e:
            print(f"\nError processing {url}: {e}")


async def test_firecrawl_endpoints_directly():
    """Test Firecrawl endpoints directly."""
    from biz_bud.tools.clients.firecrawl import (
        ExtractOptions,
        FirecrawlApp,
        MapOptions,
        SearchOptions,
    )

    api_key = os.getenv("FIRECRAWL_API_KEY")
    if not api_key:
        print("Warning: FIRECRAWL_API_KEY not set, skipping direct tests")
        return

    async with FirecrawlApp(api_key=api_key) as app:
        # Test map endpoint
        print("\n\nTesting /map endpoint...")
        map_options = MapOptions(limit=10)
        urls = await app.map_website("https://docs.firecrawl.dev", options=map_options)
        print(f"Discovered {len(urls)} URLs")

        # Test search endpoint
        print("\n\nTesting /search endpoint...")
        search_options = SearchOptions(limit=3)
        results = await app.search("web scraping best practices", options=search_options)
        print(f"Found {len(results)} search results")

        # Test extract endpoint
        print("\n\nTesting /extract endpoint...")
        extract_options = ExtractOptions(prompt="Extract the main features and pricing information")
        extract_result = await app.extract(["https://firecrawl.dev"], options=extract_options)
        if extract_result.get("success"):
            print("Extraction successful!")


def main():
    """Run all tests."""
    print("RAG Agent Firecrawl Integration Test")
    print("=====================================\n")

    # Check for required API keys
    if not os.getenv("OPENAI_API_KEY") and not os.getenv("ANTHROPIC_API_KEY"):
        print("Error: Set OPENAI_API_KEY or ANTHROPIC_API_KEY for LLM operations")
        return

    if not os.getenv("FIRECRAWL_API_KEY"):
        print("Error: Set FIRECRAWL_API_KEY for web scraping")
        return

    # Run async tests
    asyncio.run(test_rag_agent_with_firecrawl())

    # Optionally test endpoints directly
    print("\n\n" + "=" * 60)
    print("Direct Firecrawl Endpoint Tests")
    print("=" * 60)
    asyncio.run(test_firecrawl_endpoints_directly())


if __name__ == "__main__":
    main()