noteflow/tests/evaluation/test_meeting_qa.py

"""Meeting-scoped Q&A evaluation tests."""
from __future__ import annotations
from typing import Final
import pytest
from tests.conftest import approx_float
from .conftest import (
SAMPLE_ALL_TEXT,
SAMPLE_MEETING_TRANSCRIPT,
MockSegment,
calculate_citation_accuracy,
calculate_retrieval_metrics,
extract_citations_from_answer,
extract_segment_ids_from_retrieved,
)
DOUBLE_DIGIT_CITATION_A: Final[int] = 10
DOUBLE_DIGIT_CITATION_END: Final[int] = 15


class TestRetrievalMetricsCalculation:
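    """Tests for calculate_retrieval_metrics: MRR, precision@k, recall@k, and hit rate.

    MRR is the reciprocal rank of the first relevant result (1/rank), or 0.0 when
    nothing relevant is retrieved.
    """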
    def test_perfect_retrieval_yields_perfect_mrr(self) -> None:
        retrieved = [1, 2, 3]
        relevant = {1, 2, 3}
        metrics = calculate_retrieval_metrics(retrieved, relevant, k=3)
        assert metrics.mean_reciprocal_rank == 1.0, "First result relevant = MRR 1.0"

    def test_second_position_relevant_yields_half_mrr(self) -> None:
        retrieved = [4, 1, 3]
        relevant = {1, 2, 3}
        metrics = calculate_retrieval_metrics(retrieved, relevant, k=3)
        assert metrics.mean_reciprocal_rank == 0.5, "Second position relevant = MRR 0.5"

    def test_third_position_relevant_yields_third_mrr(self) -> None:
        retrieved = [4, 5, 1]
        relevant = {1, 2, 3}
        metrics = calculate_retrieval_metrics(retrieved, relevant, k=3)
        assert metrics.mean_reciprocal_rank == approx_float(1 / 3), "Third position = MRR 1/3"

    def test_no_relevant_yields_zero_mrr(self) -> None:
        retrieved = [4, 5, 6]
        relevant = {1, 2, 3}
        metrics = calculate_retrieval_metrics(retrieved, relevant, k=3)
        assert metrics.mean_reciprocal_rank == 0.0, "No relevant results = MRR 0"

    def test_precision_at_k_full_match(self) -> None:
        retrieved = [1, 2, 3]
        relevant = {1, 2, 3}
        metrics = calculate_retrieval_metrics(retrieved, relevant, k=3)
        assert metrics.precision_at_k == 1.0, "All retrieved are relevant"

    def test_precision_at_k_partial_match(self) -> None:
        retrieved = [1, 4, 5]
        relevant = {1, 2, 3}
        metrics = calculate_retrieval_metrics(retrieved, relevant, k=3)
        assert metrics.precision_at_k == approx_float(1 / 3), "1 of 3 retrieved is relevant"

    def test_recall_at_k_full_match(self) -> None:
        retrieved = [1, 2, 3]
        relevant = {1, 2, 3}
        metrics = calculate_retrieval_metrics(retrieved, relevant, k=3)
        assert metrics.recall_at_k == 1.0, "All relevant found"

    def test_recall_at_k_partial_match(self) -> None:
        retrieved = [1, 4, 5]
        relevant = {1, 2, 3}
        metrics = calculate_retrieval_metrics(retrieved, relevant, k=3)
        assert metrics.recall_at_k == approx_float(1 / 3), "1 of 3 relevant found"

    def test_hit_rate_with_hit(self) -> None:
        retrieved = [4, 1, 5]
        relevant = {1, 2, 3}
        metrics = calculate_retrieval_metrics(retrieved, relevant, k=3)
        assert metrics.hit_rate == 1.0, "At least one relevant = hit rate 1.0"

    def test_hit_rate_without_hit(self) -> None:
        retrieved = [4, 5, 6]
        relevant = {1, 2, 3}
        metrics = calculate_retrieval_metrics(retrieved, relevant, k=3)
        assert metrics.hit_rate == 0.0, "No relevant = hit rate 0.0"

    def test_empty_relevant_set_returns_zeros(self) -> None:
        retrieved = [1, 2, 3]
        relevant: set[int] = set()
        metrics = calculate_retrieval_metrics(retrieved, relevant, k=3)
        assert metrics.mean_reciprocal_rank == 0.0, "Empty relevant set"
        assert metrics.precision_at_k == 0.0, "Empty relevant set"
        assert metrics.recall_at_k == 0.0, "Empty relevant set"


class TestNdcgCalculation:
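    """Tests for the NDCG component of calculate_retrieval_metrics.

    NDCG is DCG normalised by the ideal DCG; in the standard formulation
    DCG = sum(rel_i / log2(i + 1)) over ranks i = 1..k, so relevant results
    ranked earlier contribute more.
    """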
    def test_perfect_ranking_yields_perfect_ndcg(self) -> None:
        retrieved = [1, 2, 3]
        relevant = {1, 2, 3}
        metrics = calculate_retrieval_metrics(retrieved, relevant, k=3)
        assert metrics.ndcg == approx_float(1.0), "Perfect ranking = NDCG 1.0"

    def test_suboptimal_ranking_yields_lower_ndcg(self) -> None:
        retrieved = [4, 1, 2]
        relevant = {1, 2, 3}
        metrics = calculate_retrieval_metrics(retrieved, relevant, k=3)
        assert metrics.ndcg < 1.0, "Non-ideal ranking should have NDCG < 1.0"
        assert metrics.ndcg > 0.0, "Some relevant results should have NDCG > 0"

    def test_worst_ranking_yields_low_ndcg(self) -> None:
        retrieved = [4, 5, 1]
        relevant = {1, 2, 3}
        metrics = calculate_retrieval_metrics(retrieved, relevant, k=3)
        assert metrics.ndcg < 0.5, "Only last result relevant = low NDCG"


class TestCitationAccuracy:
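    """Tests for calculate_citation_accuracy: valid-citation ratio, invalid count, and total count."""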
    def test_all_valid_citations(self) -> None:
        cited = [1, 2, 3]
        valid = {1, 2, 3, 4, 5}
        metrics = calculate_citation_accuracy(cited, valid)
        assert metrics.citation_accuracy == 1.0, "All citations valid"
        assert metrics.invalid_citations == 0, "No invalid citations"
        assert metrics.citation_count == 3, "3 citations total"

    def test_some_invalid_citations(self) -> None:
        cited = [1, 2, 99]
        valid = {1, 2, 3, 4, 5}
        metrics = calculate_citation_accuracy(cited, valid)
        assert metrics.citation_accuracy == approx_float(2 / 3), "2 of 3 valid"
        assert metrics.invalid_citations == 1, "1 invalid citation"

    def test_all_invalid_citations(self) -> None:
        cited = [99, 100, 101]
        valid = {1, 2, 3}
        metrics = calculate_citation_accuracy(cited, valid)
        assert metrics.citation_accuracy == 0.0, "No valid citations"
        assert metrics.invalid_citations == 3, "All 3 invalid"

    def test_empty_citations_yields_perfect_accuracy(self) -> None:
        cited: list[int] = []
        valid = {1, 2, 3}
        metrics = calculate_citation_accuracy(cited, valid)
        assert metrics.citation_accuracy == 1.0, "No citations = vacuously true"
        assert metrics.citation_count == 0, "0 citations"


class TestCitationExtraction:
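    """Tests for extract_citations_from_answer, which pulls [n] markers out of answer text."""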
    def test_extract_single_citation(self) -> None:
        answer = "The deadline is March 15th [1]."
        citations = extract_citations_from_answer(answer)
        assert citations == [1], "Should extract [1]"

    def test_extract_multiple_citations(self) -> None:
        answer = "John [1] and Sarah [2] discussed the API [3]."
        citations = extract_citations_from_answer(answer)
        assert citations == [1, 2, 3], "Should extract [1], [2], [3]"

    def test_extract_no_citations(self) -> None:
        answer = "This is a summary without citations."
        citations = extract_citations_from_answer(answer)
        assert citations == [], "No citations in text"

    def test_extract_repeated_citations(self) -> None:
        answer = "The budget [1] is $50,000 [1]."
        citations = extract_citations_from_answer(answer)
        assert citations == [1, 1], "Should extract repeated citations"

    def test_extract_double_digit_citations(self) -> None:
        answer = f"See references [{DOUBLE_DIGIT_CITATION_A}] and [{DOUBLE_DIGIT_CITATION_END}]."
        citations = extract_citations_from_answer(answer)
        assert citations == [
            DOUBLE_DIGIT_CITATION_A,
            DOUBLE_DIGIT_CITATION_END,
        ], "Should handle double digits"


class TestSegmentIdExtraction:
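    """Tests for extract_segment_ids_from_retrieved over (segment, score) pairs."""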
    def test_extract_ids_from_scored_segments(self) -> None:
        seg1 = MockSegment(segment_id=1, meeting_id=None, text="a", start_time=0.0, end_time=1.0)
        seg2 = MockSegment(segment_id=2, meeting_id=None, text="b", start_time=1.0, end_time=2.0)
        retrieved = [(seg1, 0.9), (seg2, 0.8)]
        ids = extract_segment_ids_from_retrieved(retrieved)
        assert ids == [1, 2], "Should extract segment IDs in order"

    def test_extract_ids_preserves_order(self) -> None:
        seg1 = MockSegment(segment_id=5, meeting_id=None, text="a", start_time=0.0, end_time=1.0)
        seg2 = MockSegment(segment_id=3, meeting_id=None, text="b", start_time=1.0, end_time=2.0)
        seg3 = MockSegment(segment_id=1, meeting_id=None, text="c", start_time=2.0, end_time=3.0)
        retrieved = [(seg1, 0.9), (seg2, 0.8), (seg3, 0.7)]
        ids = extract_segment_ids_from_retrieved(retrieved)
        assert ids == [5, 3, 1], "Order should match retrieval ranking"

    def test_extract_ids_empty_list(self) -> None:
        retrieved: list[tuple[MockSegment, float]] = []
        ids = extract_segment_ids_from_retrieved(retrieved)
        assert ids == [], "Empty input yields empty output"


class TestSampleTranscriptForQA:
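    """Sanity checks on the shared SAMPLE_MEETING_TRANSCRIPT fixture used by the Q&A tests."""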
    def test_sample_has_sufficient_segments(self) -> None:
        assert len(SAMPLE_MEETING_TRANSCRIPT) >= 5, "Transcript should have at least 5 segments"

    def test_sample_covers_deadline_topic(self) -> None:
        assert "deadline" in SAMPLE_ALL_TEXT.lower(), "Should mention deadlines"

    def test_sample_covers_budget_topic(self) -> None:
        assert "budget" in SAMPLE_ALL_TEXT.lower(), "Should mention budget"

    def test_sample_has_action_items(self) -> None:
        action_segment = SAMPLE_MEETING_TRANSCRIPT[6]
        assert "action item" in action_segment.text.lower(), "Should have action item"


@pytest.mark.parametrize(
("retrieved", "relevant", "expected_mrr"),
[
([1, 2, 3], {1}, 1.0),
([2, 1, 3], {1}, 0.5),
([2, 3, 1], {1}, 1 / 3),
([2, 3, 4], {1}, 0.0),
],
ids=["first-position", "second-position", "third-position", "not-found"],
)
def test_mrr_parametrized(
    retrieved: list[int],
    relevant: set[int],
    expected_mrr: float,
) -> None:
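    """MRR should equal 1/rank of the first relevant result, or 0.0 when none is retrieved."""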
    metrics = calculate_retrieval_metrics(retrieved, relevant, k=3)
    assert metrics.mean_reciprocal_rank == approx_float(expected_mrr), "MRR mismatch"


@pytest.mark.parametrize(
("answer", "expected_count"),
[
("No citations here.", 0),
("Single [1] citation.", 1),
("Multiple [1] [2] [3] citations.", 3),
("Edge [1][2] together.", 2),
],
ids=["none", "single", "multiple", "adjacent"],
)
def test_citation_count_parametrized(answer: str, expected_count: int) -> None:
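    """Citation extraction should count every [n] marker, including adjacent ones like [1][2]."""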
    citations = extract_citations_from_answer(answer)
    assert len(citations) == expected_count, "Citation count mismatch"