Files
rag-manager/tests/unit/ingestors/test_repomix_ingestor.py
2025-09-21 01:38:47 +00:00

89 lines
2.5 KiB
Python

from __future__ import annotations
import pytest
from ingest_pipeline.core.models import IngestionJob, IngestionSource, StorageBackend
from ingest_pipeline.ingestors.repomix import RepomixIngestor
@pytest.mark.parametrize(
    ("content", "expected_keys"),
    [
        # Content carrying a "## File: <path>" header is expected to be keyed by that path.
        ('## File: src/app.py\nprint("hi")', ["src/app.py"]),
        # Content with no header is expected to land under the single "repository" key.
        ("plain content without markers", ["repository"]),
    ],
)
def test_split_by_files_detects_file_markers(
    content: str,
    expected_keys: list[str],
) -> None:
    """Verify the keys produced by ``RepomixIngestor._split_by_files``.

    Args:
        content: Raw text handed to the splitter.
        expected_keys: Keys the split result must yield, in iteration order.
    """
    sections = RepomixIngestor()._split_by_files(content)

    # Iterating the result yields its keys; order is part of the assertion.
    assert [*sections] == expected_keys
@pytest.mark.parametrize(
    ("content", "chunk_size", "expected"),
    [
        # Three lines with a 9-character limit: each line is expected as its own chunk.
        ("line-one\nline-two\nline-three", 9, ["line-one", "line-two", "line-three"]),
        # Content shorter than the limit is expected back as a single chunk.
        ("single-line", 50, ["single-line"]),
    ],
)
def test_chunk_content_respects_max_size(
    content: str,
    chunk_size: int,
    expected: list[str],
) -> None:
    """Verify the chunk list returned by ``RepomixIngestor._chunk_content``.

    Args:
        content: Text to be chunked.
        chunk_size: Value passed as the ``chunk_size`` keyword argument.
        expected: Exact list of chunks the call must return.
    """
    produced = RepomixIngestor()._chunk_content(content, chunk_size=chunk_size)

    assert produced == expected
@pytest.mark.parametrize(
    ("file_path", "content", "expected"),
    [
        # ".py" path with Python source.
        ("src/app.py", "def feature():\n return True", "python"),
        # Extensionless path whose content begins with a Python shebang line.
        ("scripts/run", '#!/usr/bin/env python\nprint("ok")', "python"),
        # ".md" path.
        ("documentation.md", "# Title", "markdown"),
        # Unrecognised extension and plain content: no language expected.
        ("unknown.ext", "text", None),
    ],
)
def test_detect_programming_language_infers_extension(
    file_path: str,
    content: str,
    expected: str | None,
) -> None:
    """Verify the result of ``RepomixIngestor._detect_programming_language``.

    Args:
        file_path: Path passed as the first argument.
        content: File content passed as the second argument.
        expected: Language name the call must return, or ``None``.
    """
    language = RepomixIngestor()._detect_programming_language(file_path, content)

    assert language == expected
def test_create_document_enriches_metadata() -> None:
    """Verify that ``RepomixIngestor._create_document`` fills in document metadata.

    Builds a document for chunk 1 of ``src/module.py`` and checks that the
    repository and git values supplied to the call appear in the metadata,
    that the title ends with the chunk suffix, and that the collection equals
    the job's storage backend value.
    """
    ingestor = RepomixIngestor()
    job = IngestionJob(
        source_url="https://example.com/repo.git",
        source_type=IngestionSource.REPOSITORY,
        storage_backend=StorageBackend.WEAVIATE,
    )
    git_metadata = {"branch_name": "main", "commit_hash": "deadbeef"}
    repo_info = {"repository_name": "demo"}

    document = ingestor._create_document(
        "src/module.py",
        "def alpha():\n return 42\n",
        job,
        chunk_index=1,
        git_metadata=git_metadata,
        repo_info=repo_info,
    )

    # Values passed in via repo_info / git_metadata must surface in the metadata.
    expected_fields = (
        ("repository_name", "demo"),
        ("branch_name", "main"),
        ("commit_hash", "deadbeef"),
    )
    for field, value in expected_fields:
        assert document.metadata[field] == value

    title = document.metadata["title"]
    # Guard against a missing title before inspecting its suffix.
    assert title is not None
    assert title.endswith("(chunk 1)")

    assert document.collection == job.storage_backend.value