From 6fa792990b4cc807c8cd2220c7a2c3f014c455f2 Mon Sep 17 00:00:00 2001
From: Travis Vasceannie
Date: Thu, 25 Dec 2025 13:46:00 -0500
Subject: [PATCH] Enhance summarization model attributes and database schema

- Updated the Summary entity to include provider and model names, along with
  tokens used and latency metrics for better tracking of summarization
  performance.
- Modified the ORM converters and repository methods to accommodate new
  attributes, ensuring backward compatibility.
- Introduced word timing position indexing to maintain order within summaries.
- Added a new SQLAlchemy model structure for improved organization of
  persistence layers, including core, identity, and integration models.
- Removed deprecated models and files to streamline the codebase.

---
 .claude/settings.json                         |    5 +
 .serena/project.yml                           |    4 +-
 docker/db/schema.sql                          |  599 ++
 docs/code-quality-correction-plan.md          |  633 --
 docs/milestones.md                            | 1098 --
 docs/qa-report-2024-12-24.md                  |  466 -
 docs/roadmap.md                               |   97 +-
 docs/sprints/QUALITY_STANDARDS.md             |  575 ++
 .../sprint-0-proto-schema/README.md           |  989 ++
 .../sprint-1-ai-templates/README.md           |  548 +
 .../sprint-2-diarization-service/README.md    | 1699 ++++
 .../sprint-3-pdf-export/README.md             |  778 ++
 .../sprint-4-ner-extraction/README.md         | 1971 ++++
 .../sprint-5-calendar-sync/README.md          | 2767 +++++
 .../sprint-6-webhooks/README.md               | 1427 +++
 docs/triage.md                                |   93 -
 docs/ui.md                                    | 9032 ----------------
 repomix.config.json                           |    4 +-
 .../application/services/meeting_service.py   |    9 +-
 src/noteflow/domain/entities/summary.py       |   24 +-
 src/noteflow/grpc/_mixins/summarization.py    |    3 +-
 .../converters/orm_converters.py              |   18 +-
 .../infrastructure/persistence/models.py      |  396 -
 .../persistence/models/__init__.py            |  100 +
 .../persistence/models/_base.py               |   18 +
 .../persistence/models/core/__init__.py       |   29 +
 .../persistence/models/core/annotation.py     |   64 +
 .../persistence/models/core/diarization.py    |   96 +
 .../persistence/models/core/meeting.py        |  244 +
 .../persistence/models/core/summary.py        |  143 +
 .../persistence/models/entities/__init__.py   |   11 +
 .../persistence/models/entities/speaker.py    |  115 +
 .../persistence/models/identity/__init__.py   |   19 +
 .../persistence/models/identity/identity.py   |  150 +
 .../persistence/models/identity/settings.py   |  101 +
 .../models/integrations/__init__.py           |   19 +
 .../models/integrations/integration.py        |  323 +
 .../models/organization/__init__.py           |   13 +
 .../models/organization/tagging.py            |   89 +
 .../persistence/models/organization/task.py   |  110 +
 .../persistence/repositories/segment_repo.py  |    6 +-
 .../persistence/repositories/summary_repo.py  |   22 +-
 .../summarization/citation_verifier.py        |    6 +-
 .../summarization/cloud_provider.py           |   18 +-
 .../summarization/mock_provider.py            |    3 +-
 .../summarization/ollama_provider.py          |   18 +-
 tests/application/test_meeting_service.py     |    6 +-
 tests/grpc/test_generate_summary.py           |    4 +-
 .../summarization/test_citation_verifier.py   |    5 +-
 tests/infrastructure/test_converters.py       |    6 +-
 tests/integration/test_e2e_summarization.py   |    7 +-
 tests/integration/test_repositories.py        |    5 +-
 52 files changed, 13196 insertions(+), 11789 deletions(-)
 create mode 100644 .claude/settings.json
 create mode 100644 docker/db/schema.sql
 delete mode 100644 docs/code-quality-correction-plan.md
 delete mode 100644 docs/milestones.md
 delete mode 100644 docs/qa-report-2024-12-24.md
 create mode 100644 docs/sprints/QUALITY_STANDARDS.md
 create mode 100644 docs/sprints/phase-0-foundation/sprint-0-proto-schema/README.md
 create mode 100644 docs/sprints/phase-1-core-pipeline/sprint-1-ai-templates/README.md
 create mode 100644 docs/sprints/phase-1-core-pipeline/sprint-2-diarization-service/README.md
 create mode 100644 docs/sprints/phase-1-core-pipeline/sprint-3-pdf-export/README.md
 create mode 100644 docs/sprints/phase-2-intelligence/sprint-4-ner-extraction/README.md
 create mode 100644 docs/sprints/phase-3-integrations/sprint-5-calendar-sync/README.md
 create mode 100644 docs/sprints/phase-3-integrations/sprint-6-webhooks/README.md
 delete mode 100644 docs/ui.md
 delete mode 100644 src/noteflow/infrastructure/persistence/models.py
 create mode 100644 src/noteflow/infrastructure/persistence/models/__init__.py
 create mode 100644 src/noteflow/infrastructure/persistence/models/_base.py
 create mode 100644 src/noteflow/infrastructure/persistence/models/core/__init__.py
 create mode 100644 src/noteflow/infrastructure/persistence/models/core/annotation.py
 create mode 100644 src/noteflow/infrastructure/persistence/models/core/diarization.py
 create mode 100644 src/noteflow/infrastructure/persistence/models/core/meeting.py
 create mode 100644 src/noteflow/infrastructure/persistence/models/core/summary.py
 create mode 100644 src/noteflow/infrastructure/persistence/models/entities/__init__.py
 create mode 100644 src/noteflow/infrastructure/persistence/models/entities/speaker.py
 create mode 100644 src/noteflow/infrastructure/persistence/models/identity/__init__.py
 create mode 100644 src/noteflow/infrastructure/persistence/models/identity/identity.py
 create mode 100644 src/noteflow/infrastructure/persistence/models/identity/settings.py
 create mode 100644 src/noteflow/infrastructure/persistence/models/integrations/__init__.py
 create mode 100644 src/noteflow/infrastructure/persistence/models/integrations/integration.py
 create mode 100644 src/noteflow/infrastructure/persistence/models/organization/__init__.py
 create mode 100644 src/noteflow/infrastructure/persistence/models/organization/tagging.py
 create mode 100644 src/noteflow/infrastructure/persistence/models/organization/task.py

diff --git a/.claude/settings.json b/.claude/settings.json
new file mode 100644
index 0000000..4427955
--- /dev/null
+++ b/.claude/settings.json
@@ -0,0 +1,5 @@
+{
+  "enabledPlugins": {
+    "pyright-lsp@claude-plugins-official": true
+  }
+}
diff --git a/.serena/project.yml b/.serena/project.yml
index 7986cb0..b840a2b 100644
--- a/.serena/project.yml
+++ b/.serena/project.yml
@@ -15,9 +15,7 @@
 # Note that when using the JetBrains backend, language servers are not used and this list is correspondingly ignored.
 languages:
 - python
-
-# the encoding used by text files in the project
-# For a list of possible encodings, see https://docs.python.org/3.11/library/codecs.html#standard-encodings
+- typescript
 encoding: "utf-8"
 
 # whether to use the project's gitignore file to ignore files
diff --git a/docker/db/schema.sql b/docker/db/schema.sql
new file mode 100644
index 0000000..965b460
--- /dev/null
+++ b/docker/db/schema.sql
@@ -0,0 +1,599 @@
+-- noteflow_init.sql
+-- Creates schema + tables + placeholder data for local dev.
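+--
+-- Idempotency conventions used below (a sketch of the rules this file relies
+-- on, not a guarantee for every statement): tables, indexes, and extensions
+-- use IF NOT EXISTS; CHECK constraints are wrapped in DO blocks because
+-- PostgreSQL has no ADD CONSTRAINT IF NOT EXISTS; triggers are dropped before
+-- re-creation; seed rows use ON CONFLICT ... DO NOTHING. The constraint
+-- pattern, shown on a hypothetical table:
+--
+--   DO $$ BEGIN
+--     ALTER TABLE noteflow.example ADD CONSTRAINT example_chk CHECK (x > 0);
+--   EXCEPTION WHEN duplicate_object THEN NULL;
+--   END $$;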
+
+-- Extensions (safe to run repeatedly)
+CREATE EXTENSION IF NOT EXISTS pgcrypto;
+CREATE EXTENSION IF NOT EXISTS citext;
+CREATE EXTENSION IF NOT EXISTS vector;
+
+-- Schema
+CREATE SCHEMA IF NOT EXISTS noteflow;
+SET search_path TO noteflow, public;
+
+-- updated_at trigger helper
+CREATE OR REPLACE FUNCTION noteflow.set_updated_at()
+RETURNS TRIGGER AS $$
+BEGIN
+  NEW.updated_at = now();
+  RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+
+--------------------------------------------------------------------------------
+-- Identity / tenancy (future-ready)
+--------------------------------------------------------------------------------
+CREATE TABLE IF NOT EXISTS noteflow.workspaces (
+    id uuid PRIMARY KEY,
+    slug text UNIQUE,
+    name text NOT NULL,
+    created_at timestamptz NOT NULL DEFAULT now(),
+    updated_at timestamptz NOT NULL DEFAULT now(),
+    metadata jsonb NOT NULL DEFAULT '{}'::jsonb
+);
+
+CREATE TABLE IF NOT EXISTS noteflow.users (
+    id uuid PRIMARY KEY,
+    email citext UNIQUE,
+    display_name text NOT NULL,
+    created_at timestamptz NOT NULL DEFAULT now(),
+    updated_at timestamptz NOT NULL DEFAULT now(),
+    metadata jsonb NOT NULL DEFAULT '{}'::jsonb
+);
+
+CREATE TABLE IF NOT EXISTS noteflow.workspace_memberships (
+    workspace_id uuid NOT NULL REFERENCES noteflow.workspaces(id) ON DELETE CASCADE,
+    user_id uuid NOT NULL REFERENCES noteflow.users(id) ON DELETE CASCADE,
+    role text NOT NULL DEFAULT 'owner',
+    created_at timestamptz NOT NULL DEFAULT now(),
+    PRIMARY KEY (workspace_id, user_id)
+);
+
+-- Drop triggers before re-creating them so the script stays re-runnable
+DROP TRIGGER IF EXISTS trg_workspaces_updated_at ON noteflow.workspaces;
+CREATE TRIGGER trg_workspaces_updated_at
+BEFORE UPDATE ON noteflow.workspaces
+FOR EACH ROW EXECUTE FUNCTION noteflow.set_updated_at();
+
+DROP TRIGGER IF EXISTS trg_users_updated_at ON noteflow.users;
+CREATE TRIGGER trg_users_updated_at
+BEFORE UPDATE ON noteflow.users
+FOR EACH ROW EXECUTE FUNCTION noteflow.set_updated_at();
+
+--------------------------------------------------------------------------------
+-- Core domain (matches current project shape)
+--------------------------------------------------------------------------------
+CREATE TABLE IF NOT EXISTS noteflow.meetings (
+    id uuid PRIMARY KEY,
+    -- Forward-looking fields: safe defaults for current code
+    workspace_id uuid NOT NULL DEFAULT '00000000-0000-0000-0000-000000000001'::uuid
+        REFERENCES noteflow.workspaces(id) ON DELETE RESTRICT,
+    created_by_id uuid NULL DEFAULT '00000000-0000-0000-0000-000000000001'::uuid
+        REFERENCES noteflow.users(id) ON DELETE SET NULL,
+
+    title varchar(255) NOT NULL,
+    state integer NOT NULL DEFAULT 1, -- 1..5 (Created..Error)
+    created_at timestamptz NOT NULL DEFAULT now(),
+    started_at timestamptz NULL,
+    ended_at timestamptz NULL,
+
+    metadata jsonb NOT NULL DEFAULT '{}'::jsonb,
+    wrapped_dek bytea NULL,
+    asset_path text NULL,
+
+    deleted_at timestamptz NULL
+);
+
+-- PostgreSQL has no ADD CONSTRAINT IF NOT EXISTS; guard with a DO block instead
+DO $$ BEGIN
+    ALTER TABLE noteflow.meetings
+        ADD CONSTRAINT meetings_state_chk
+        CHECK (state BETWEEN 1 AND 5);
+EXCEPTION WHEN duplicate_object THEN NULL;
+END $$;
+
+CREATE INDEX IF NOT EXISTS idx_meetings_workspace_created_at
+    ON noteflow.meetings(workspace_id, created_at DESC);
+
+CREATE INDEX IF NOT EXISTS idx_meetings_state
+    ON noteflow.meetings(state);
+
+CREATE TABLE IF NOT EXISTS noteflow.segments (
+    id bigserial PRIMARY KEY,
+    meeting_id uuid NOT NULL REFERENCES noteflow.meetings(id) ON DELETE CASCADE,
+    segment_id integer NOT NULL, -- stable ordering within meeting
+    text text NOT NULL,
+    start_time double precision NOT NULL,
+    end_time double precision NOT NULL,
+
+    language varchar(10) NOT NULL DEFAULT 'en',
+    language_confidence double precision NOT NULL DEFAULT 0,
+    avg_logprob double precision NOT NULL DEFAULT 0,
+    no_speech_prob double precision NOT NULL DEFAULT 0,
+
+    embedding vector(1536) NULL,
+
+    speaker_id varchar(50) NULL,
+    speaker_confidence double precision NOT NULL DEFAULT 0.0,
+
+    created_at timestamptz NOT NULL DEFAULT now()
+);
+
+-- A unique index is idempotent, unlike ADD CONSTRAINT IF NOT EXISTS
+CREATE UNIQUE INDEX IF NOT EXISTS segments_unique_per_meeting
+    ON noteflow.segments(meeting_id, segment_id);
+
+CREATE INDEX IF NOT EXISTS idx_segments_meeting_id
+    ON noteflow.segments(meeting_id);
+
+CREATE INDEX IF NOT EXISTS idx_segments_meeting_time
+    ON noteflow.segments(meeting_id, start_time);
+
+-- Vector index (ivfflat is broadly supported; you can switch to hnsw later)
+CREATE INDEX IF NOT EXISTS idx_segments_embedding_ivfflat
+    ON noteflow.segments USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
+
+CREATE TABLE IF NOT EXISTS noteflow.word_timings (
+    id bigserial PRIMARY KEY,
+    segment_pk bigint NOT NULL REFERENCES noteflow.segments(id) ON DELETE CASCADE,
+    word_index integer NOT NULL,
+    word varchar(255) NOT NULL,
+    start_time double precision NOT NULL,
+    end_time double precision NOT NULL,
+    probability double precision NOT NULL,
+    UNIQUE (segment_pk, word_index)
+);
+
+CREATE INDEX IF NOT EXISTS idx_word_timings_segment_pk
+    ON noteflow.word_timings(segment_pk);
+
+CREATE TABLE IF NOT EXISTS noteflow.summaries (
+    id bigserial PRIMARY KEY,
+    meeting_id uuid NOT NULL UNIQUE REFERENCES noteflow.meetings(id) ON DELETE CASCADE,
+    executive_summary text NOT NULL DEFAULT '',
+    generated_at timestamptz NOT NULL DEFAULT now(),
+
+    provider_name text NOT NULL DEFAULT '',
+    model_name text NOT NULL DEFAULT '',
+    tokens_used integer NULL,
+    latency_ms double precision NULL,
+
+    verification jsonb NOT NULL DEFAULT '{}'::jsonb
+);
+
+CREATE TABLE IF NOT EXISTS noteflow.key_points (
+    id bigserial PRIMARY KEY,
+    summary_id bigint NOT NULL REFERENCES noteflow.summaries(id) ON DELETE CASCADE,
+    position integer NOT NULL,
+    text text NOT NULL,
+    segment_ids integer[] NOT NULL DEFAULT '{}'::integer[],
+    start_time double precision NOT NULL DEFAULT 0,
+    end_time double precision NOT NULL DEFAULT 0,
+    UNIQUE (summary_id, position)
+);
+
+CREATE TABLE IF NOT EXISTS noteflow.action_items (
+    id bigserial PRIMARY KEY,
+    summary_id bigint NOT NULL REFERENCES noteflow.summaries(id) ON DELETE CASCADE,
+    position integer NOT NULL,
+    text text NOT NULL,
+    segment_ids integer[] NOT NULL DEFAULT '{}'::integer[],
+    start_time double precision NOT NULL DEFAULT 0,
+    end_time double precision NOT NULL DEFAULT 0,
+
+    assignee text NOT NULL DEFAULT '',
+    due_date timestamptz NULL,
+    priority integer NOT NULL DEFAULT 0,
+    UNIQUE (summary_id, position)
+);
+
+CREATE TABLE IF NOT EXISTS noteflow.annotations (
+    id bigserial PRIMARY KEY,
+    annotation_id uuid NOT NULL DEFAULT gen_random_uuid(),
+    meeting_id uuid NOT NULL REFERENCES noteflow.meetings(id) ON DELETE CASCADE,
+    annotation_type varchar(50) NOT NULL,
+    text text NOT NULL,
+    start_time double precision NOT NULL DEFAULT 0,
+    end_time double precision NOT NULL DEFAULT 0,
+    segment_ids integer[] NOT NULL DEFAULT '{}'::integer[],
+    created_at timestamptz NOT NULL DEFAULT now(),
+    UNIQUE (annotation_id)
+);
+
+CREATE INDEX IF NOT EXISTS idx_annotations_meeting_id
+    ON noteflow.annotations(meeting_id);
+
+CREATE TABLE IF NOT EXISTS noteflow.diarization_jobs (
+    id varchar(36) PRIMARY KEY,
+    meeting_id uuid NOT NULL REFERENCES noteflow.meetings(id) ON DELETE CASCADE,
+    status integer NOT NULL DEFAULT 0,
+    segments_updated integer NOT NULL DEFAULT 0,
+    speaker_ids text[] NOT NULL DEFAULT '{}'::text[],
+    error_message text NOT NULL DEFAULT '',
+    created_at timestamptz NOT NULL DEFAULT now(),
+    updated_at timestamptz NOT NULL DEFAULT now()
+);
+
+DROP TRIGGER IF EXISTS trg_diarization_jobs_updated_at ON noteflow.diarization_jobs;
+CREATE TRIGGER trg_diarization_jobs_updated_at
+BEFORE UPDATE ON noteflow.diarization_jobs
+FOR EACH ROW EXECUTE FUNCTION noteflow.set_updated_at();
+
+CREATE TABLE IF NOT EXISTS noteflow.streaming_diarization_turns (
+    id bigserial PRIMARY KEY,
+    meeting_id uuid NOT NULL REFERENCES noteflow.meetings(id) ON DELETE CASCADE,
+    speaker varchar(50) NOT NULL,
+    start_time double precision NOT NULL,
+    end_time double precision NOT NULL,
+    confidence double precision NOT NULL DEFAULT 0.0,
+    created_at timestamptz NOT NULL DEFAULT now()
+);
+
+CREATE INDEX IF NOT EXISTS idx_streaming_turns_meeting_time
+    ON noteflow.streaming_diarization_turns(meeting_id, start_time);
+
+-- Existing style KV preferences (compat with current repo pattern)
+CREATE TABLE IF NOT EXISTS noteflow.user_preferences (
+    key varchar(64) PRIMARY KEY,
+    value jsonb NOT NULL DEFAULT '{}'::jsonb,
+    updated_at timestamptz NOT NULL DEFAULT now()
+);
+
+--------------------------------------------------------------------------------
+-- Future-facing but safe additions: people, tags, tasks, integrations, settings
+--------------------------------------------------------------------------------
+CREATE TABLE IF NOT EXISTS noteflow.persons (
+    id uuid PRIMARY KEY DEFAULT gen_random_uuid(),
+    workspace_id uuid NOT NULL REFERENCES noteflow.workspaces(id) ON DELETE CASCADE,
+    display_name text NOT NULL,
+    email citext NULL,
+    created_at timestamptz NOT NULL DEFAULT now(),
+    updated_at timestamptz NOT NULL DEFAULT now(),
+    metadata jsonb NOT NULL DEFAULT '{}'::jsonb,
+    UNIQUE (workspace_id, email)
+);
+
+DROP TRIGGER IF EXISTS trg_persons_updated_at ON noteflow.persons;
+CREATE TRIGGER trg_persons_updated_at
+BEFORE UPDATE ON noteflow.persons
+FOR EACH ROW EXECUTE FUNCTION noteflow.set_updated_at();
+
+CREATE TABLE IF NOT EXISTS noteflow.meeting_speakers (
+    meeting_id uuid NOT NULL REFERENCES noteflow.meetings(id) ON DELETE CASCADE,
+    speaker_id varchar(50) NOT NULL,
+    display_name text NULL,
+    person_id uuid NULL REFERENCES noteflow.persons(id) ON DELETE SET NULL,
+    created_at timestamptz NOT NULL DEFAULT now(),
+    PRIMARY KEY (meeting_id, speaker_id)
+);
+
+CREATE TABLE IF NOT EXISTS noteflow.tags (
+    id uuid PRIMARY KEY DEFAULT gen_random_uuid(),
+    workspace_id uuid NOT NULL REFERENCES noteflow.workspaces(id) ON DELETE CASCADE,
+    name text NOT NULL,
+    color text NOT NULL DEFAULT '#888888',
+    created_at timestamptz NOT NULL DEFAULT now(),
+    UNIQUE (workspace_id, name)
+);
+
+CREATE TABLE IF NOT EXISTS noteflow.meeting_tags (
+    meeting_id uuid NOT NULL REFERENCES noteflow.meetings(id) ON DELETE CASCADE,
+    tag_id uuid NOT NULL REFERENCES noteflow.tags(id) ON DELETE CASCADE,
+    PRIMARY KEY (meeting_id, tag_id)
+);
+
+CREATE TABLE IF NOT EXISTS noteflow.tasks (
+    id uuid PRIMARY KEY DEFAULT gen_random_uuid(),
+    workspace_id uuid NOT NULL REFERENCES noteflow.workspaces(id) ON DELETE CASCADE,
+    meeting_id uuid NULL REFERENCES noteflow.meetings(id) ON DELETE SET NULL,
+    action_item_id bigint NULL REFERENCES noteflow.action_items(id) ON DELETE SET NULL,
+    text text NOT NULL,
+    status text NOT NULL DEFAULT 'open',
+    assignee_person_id uuid NULL REFERENCES noteflow.persons(id) ON DELETE SET NULL,
+    due_date timestamptz NULL,
+    priority integer NOT NULL DEFAULT 0,
+    created_at timestamptz NOT NULL DEFAULT now(),
+    updated_at timestamptz NOT NULL DEFAULT now(),
+    completed_at timestamptz NULL,
+    metadata jsonb NOT NULL DEFAULT '{}'::jsonb
+);
+
+DO $$ BEGIN
+    ALTER TABLE noteflow.tasks
+        ADD CONSTRAINT tasks_status_chk
+        CHECK (status IN ('open','done','dismissed'));
+EXCEPTION WHEN duplicate_object THEN NULL;
+END $$;
+
+CREATE INDEX IF NOT EXISTS idx_tasks_workspace_status
+    ON noteflow.tasks(workspace_id, status);
+
+DROP TRIGGER IF EXISTS trg_tasks_updated_at ON noteflow.tasks;
+CREATE TRIGGER trg_tasks_updated_at
+BEFORE UPDATE ON noteflow.tasks
+FOR EACH ROW EXECUTE FUNCTION noteflow.set_updated_at();
+
+CREATE TABLE IF NOT EXISTS noteflow.integrations (
+    id uuid PRIMARY KEY DEFAULT gen_random_uuid(),
+    workspace_id uuid NOT NULL REFERENCES noteflow.workspaces(id) ON DELETE CASCADE,
+    name text NOT NULL,
+    type text NOT NULL,
+    status text NOT NULL DEFAULT 'disconnected',
+    config jsonb NOT NULL DEFAULT '{}'::jsonb,
+    last_sync timestamptz NULL,
+    error_message text NULL,
+    created_at timestamptz NOT NULL DEFAULT now(),
+    updated_at timestamptz NOT NULL DEFAULT now()
+);
+
+DO $$ BEGIN
+    ALTER TABLE noteflow.integrations
+        ADD CONSTRAINT integrations_type_chk
+        CHECK (type IN ('auth','email','calendar','pkm','custom'));
+EXCEPTION WHEN duplicate_object THEN NULL;
+END $$;
+
+DO $$ BEGIN
+    ALTER TABLE noteflow.integrations
+        ADD CONSTRAINT integrations_status_chk
+        CHECK (status IN ('disconnected','connected','error'));
+EXCEPTION WHEN duplicate_object THEN NULL;
+END $$;
+
+DROP TRIGGER IF EXISTS trg_integrations_updated_at ON noteflow.integrations;
+CREATE TRIGGER trg_integrations_updated_at
+BEFORE UPDATE ON noteflow.integrations
+FOR EACH ROW EXECUTE FUNCTION noteflow.set_updated_at();
+
+CREATE TABLE IF NOT EXISTS noteflow.integration_secrets (
+    integration_id uuid NOT NULL REFERENCES noteflow.integrations(id) ON DELETE CASCADE,
+    secret_key text NOT NULL,
+    secret_value bytea NOT NULL,
+    created_at timestamptz NOT NULL DEFAULT now(),
+    updated_at timestamptz NOT NULL DEFAULT now(),
+    PRIMARY KEY (integration_id, secret_key)
+);
+
+DROP TRIGGER IF EXISTS trg_integration_secrets_updated_at ON noteflow.integration_secrets;
+CREATE TRIGGER trg_integration_secrets_updated_at
+BEFORE UPDATE ON noteflow.integration_secrets
+FOR EACH ROW EXECUTE FUNCTION noteflow.set_updated_at();
+
+CREATE TABLE IF NOT EXISTS noteflow.integration_sync_runs (
+    id uuid PRIMARY KEY DEFAULT gen_random_uuid(),
+    integration_id uuid NOT NULL REFERENCES noteflow.integrations(id) ON DELETE CASCADE,
+    status text NOT NULL,
+    started_at timestamptz NOT NULL DEFAULT now(),
+    ended_at timestamptz NULL,
+    duration_ms integer NULL,
+    error_message text NULL,
+    stats jsonb NOT NULL DEFAULT '{}'::jsonb
+);
+
+DO $$ BEGIN
+    ALTER TABLE noteflow.integration_sync_runs
+        ADD CONSTRAINT integration_sync_runs_status_chk
+        CHECK (status IN ('running','success','error'));
+EXCEPTION WHEN duplicate_object THEN NULL;
+END $$;
+
+CREATE INDEX IF NOT EXISTS idx_sync_runs_integration_started
+    ON noteflow.integration_sync_runs(integration_id, started_at DESC);
+
+CREATE TABLE IF NOT EXISTS noteflow.calendar_events (
+    id uuid PRIMARY KEY DEFAULT gen_random_uuid(),
+    integration_id uuid NOT NULL REFERENCES noteflow.integrations(id) ON DELETE CASCADE,
+    external_id text NOT NULL,
+    calendar_id text NOT NULL,
+    calendar_name text NOT NULL,
+    title text NOT NULL,
+    description text NULL,
+    start_time timestamptz NOT NULL,
+    end_time timestamptz NOT NULL,
+    location text NULL,
+    attendees text[] NULL,
+    is_all_day boolean NOT NULL DEFAULT false,
+    meeting_link text NULL,
+    raw jsonb NOT NULL DEFAULT '{}'::jsonb,
+    created_at timestamptz NOT NULL DEFAULT now(),
+    updated_at timestamptz NOT NULL DEFAULT now(),
+    UNIQUE (integration_id, external_id)
+);
+
+DROP TRIGGER IF EXISTS trg_calendar_events_updated_at ON noteflow.calendar_events;
+CREATE TRIGGER trg_calendar_events_updated_at
+BEFORE UPDATE ON noteflow.calendar_events
+FOR EACH ROW EXECUTE FUNCTION noteflow.set_updated_at();
+
+CREATE TABLE IF NOT EXISTS noteflow.meeting_calendar_links (
+    meeting_id uuid NOT NULL REFERENCES noteflow.meetings(id) ON DELETE CASCADE,
+    calendar_event_id uuid NOT NULL REFERENCES noteflow.calendar_events(id) ON DELETE CASCADE,
+    PRIMARY KEY (meeting_id, calendar_event_id)
+);
+
+CREATE TABLE IF NOT EXISTS noteflow.external_refs (
+    id uuid PRIMARY KEY DEFAULT gen_random_uuid(),
+    integration_id uuid NOT NULL REFERENCES noteflow.integrations(id) ON DELETE CASCADE,
+    entity_type text NOT NULL,
+    entity_id text NOT NULL,
+    external_id text NOT NULL,
+    external_url text NULL,
+    created_at timestamptz NOT NULL DEFAULT now(),
+    UNIQUE (integration_id, entity_type, entity_id)
+);
+
+CREATE TABLE IF NOT EXISTS noteflow.settings (
+    id uuid PRIMARY KEY DEFAULT gen_random_uuid(),
+    scope text NOT NULL, -- system | workspace | user
+    workspace_id uuid NULL REFERENCES noteflow.workspaces(id) ON DELETE CASCADE,
+    user_id uuid NULL REFERENCES noteflow.users(id) ON DELETE CASCADE,
+    key text NOT NULL,
+    value jsonb NOT NULL DEFAULT '{}'::jsonb,
+    created_at timestamptz NOT NULL DEFAULT now(),
+    updated_at timestamptz NOT NULL DEFAULT now(),
+    UNIQUE (scope, workspace_id, user_id, key)
+);
+
+DO $$ BEGIN
+    ALTER TABLE noteflow.settings
+        ADD CONSTRAINT settings_scope_chk
+        CHECK (scope IN ('system','workspace','user'));
+EXCEPTION WHEN duplicate_object THEN NULL;
+END $$;
+
+DROP TRIGGER IF EXISTS trg_settings_updated_at ON noteflow.settings;
+CREATE TRIGGER trg_settings_updated_at
+BEFORE UPDATE ON noteflow.settings
+FOR EACH ROW EXECUTE FUNCTION noteflow.set_updated_at();
+
+--------------------------------------------------------------------------------
+-- Seed data (safe to re-run)
+--------------------------------------------------------------------------------
+-- Deterministic IDs for local dev
+-- workspace/user share the same UUID to simplify defaults
+INSERT INTO noteflow.workspaces (id, slug, name, metadata)
+VALUES (
+    '00000000-0000-0000-0000-000000000001',
+    'default',
+    'Default Workspace',
+    '{"seed":true}'::jsonb
+)
+ON CONFLICT (id) DO NOTHING;
+
+INSERT INTO noteflow.users (id, email, display_name, metadata)
+VALUES (
+    '00000000-0000-0000-0000-000000000001',
+    'local@noteflow.local',
+    'Local User',
+    '{"seed":true}'::jsonb
+)
+ON CONFLICT (id) DO NOTHING;
+
+INSERT INTO noteflow.workspace_memberships (workspace_id, user_id, role)
+VALUES (
+    '00000000-0000-0000-0000-000000000001',
+    '00000000-0000-0000-0000-000000000001',
+    'owner'
+)
+ON CONFLICT DO NOTHING;
+
+-- Sample meeting
+INSERT INTO noteflow.meetings (
+    id, title, state, created_at, started_at, ended_at, metadata, asset_path
+) VALUES (
+    '11111111-1111-1111-1111-111111111111',
+    'Seed Meeting: Project Kickoff',
+    4,
+    now() - interval '2 days',
+    now() - interval '2 days' + interval '5 minutes',
+    now() - interval '2 days' + interval '47 minutes',
+    '{"source":"seed","topic":"kickoff"}'::jsonb,
+    '11111111-1111-1111-1111-111111111111'
+)
+ON CONFLICT (id) DO NOTHING;
+
+-- Sample segments
+INSERT INTO noteflow.segments (
+    id, meeting_id, segment_id, text, start_time, end_time, language, speaker_id, speaker_confidence
+) VALUES
+    (1, '11111111-1111-1111-1111-111111111111', 0, 'Welcome everyone. Today we will align on goals and deliverables.', 0.0, 6.2, 'en', 'SPEAKER_00', 0.92),
+    (2, '11111111-1111-1111-1111-111111111111', 1, 'We should prioritize the database schema first, then build the UI around it.', 6.2, 12.4, 'en', 'SPEAKER_01', 0.88),
+    (3, '11111111-1111-1111-1111-111111111111', 2, 'Action item: draft an initial schema and seed script for local development.', 12.4, 18.0, 'en', 'SPEAKER_00', 0.90)
+ON CONFLICT (id) DO NOTHING;
+
+-- Word timings (a few illustrative words)
+INSERT INTO noteflow.word_timings (segment_pk, word_index, word, start_time, end_time, probability)
+VALUES
+    (1, 0, 'Welcome', 0.00, 0.40, 0.98),
+    (1, 1, 'everyone.', 0.41, 0.80, 0.97),
+    (2, 0, 'We', 6.20, 6.30, 0.99),
+    (2, 1, 'should', 6.31, 6.55, 0.99),
+    (3, 0, 'Action', 12.40, 12.62, 0.97),
+    (3, 1, 'item:', 12.63, 12.82, 0.95)
+ON CONFLICT DO NOTHING;
+
+-- Summary + points + items
+INSERT INTO noteflow.summaries (
+    id, meeting_id, executive_summary, provider_name, model_name, tokens_used, latency_ms, verification
+) VALUES (
+    1,
+    '11111111-1111-1111-1111-111111111111',
+    'Aligned on building a scalable schema first; UI will follow. Identified a concrete next action to draft schema + seeds.',
+    'local',
+    'mock',
+    123,
+    42.0,
+    '{"seed":true}'::jsonb
+)
+ON CONFLICT (id) DO NOTHING;
+
+INSERT INTO noteflow.key_points (id, summary_id, position, text, segment_ids, start_time, end_time)
+VALUES
+    (1, 1, 0, 'Schema-first development to accelerate UI work.', ARRAY[1], 6.2, 12.4)
+ON CONFLICT (id) DO NOTHING;
+
+INSERT INTO noteflow.action_items (id, summary_id, position, text, segment_ids, start_time, end_time, assignee, priority)
+VALUES
+    (1, 1, 0, 'Draft initial database schema + seed script.', ARRAY[2], 12.4, 18.0, 'Local User', 2)
+ON CONFLICT (id) DO NOTHING;
+
+-- Task derived from action item (future task workflow)
+INSERT INTO noteflow.tasks (id, workspace_id, meeting_id, action_item_id, text, status, priority)
+VALUES (
+    '22222222-2222-2222-2222-222222222222',
+    '00000000-0000-0000-0000-000000000001',
+    '11111111-1111-1111-1111-111111111111',
+    1,
+    'Draft initial database schema + seed script.',
+    'open',
+    2
+)
+ON CONFLICT (id) DO NOTHING;
+
+-- Annotation
+INSERT INTO noteflow.annotations (id, meeting_id, annotation_type, text, start_time, end_time, segment_ids)
+VALUES
+    (1, '11111111-1111-1111-1111-111111111111', 'ANNOTATION_TYPE_NOTE', 'Remember to keep schema modular and future-proof.', 6.0, 10.0, ARRAY[1])
+ON CONFLICT (id) DO NOTHING;
+
+-- Speaker/person mapping
+INSERT INTO noteflow.persons (id, workspace_id, display_name, email)
+VALUES
+    ('33333333-3333-3333-3333-333333333333', '00000000-0000-0000-0000-000000000001', 'Alex Example', 'alex@example.com')
+ON CONFLICT (id) DO NOTHING;
+
+INSERT INTO noteflow.meeting_speakers (meeting_id, speaker_id, display_name, person_id)
+VALUES
+    ('11111111-1111-1111-1111-111111111111', 'SPEAKER_00', 'Alex', '33333333-3333-3333-3333-333333333333'),
+    ('11111111-1111-1111-1111-111111111111', 'SPEAKER_01', 'Jordan', NULL)
+ON CONFLICT DO NOTHING;
+
+-- Tags
+INSERT INTO noteflow.tags (id, workspace_id, name, color)
+VALUES
+    ('44444444-4444-4444-4444-444444444444', '00000000-0000-0000-0000-000000000001', 'seed', '#00AEEF')
+ON CONFLICT (id) DO NOTHING;
+
+INSERT INTO noteflow.meeting_tags (meeting_id, tag_id)
+VALUES
+    ('11111111-1111-1111-1111-111111111111', '44444444-4444-4444-4444-444444444444')
+ON CONFLICT DO NOTHING;
+
+-- Mock integration + a calendar event (shape matches your client-side config model)
+INSERT INTO noteflow.integrations (id, workspace_id, name, type, status, config, last_sync)
+VALUES (
+    '55555555-5555-5555-5555-555555555555',
+    '00000000-0000-0000-0000-000000000001',
+    'Mock Calendar',
+    'calendar',
+    'connected',
+    '{"sync_interval_minutes":60,"calendar_ids":["primary"],"webhook_url":"https://example.invalid/webhook"}'::jsonb,
+    now() - interval '1 day'
+)
+ON CONFLICT (id) DO NOTHING;
+
+INSERT INTO noteflow.calendar_events (
+    id, integration_id, external_id, calendar_id, calendar_name, title, start_time, end_time, attendees, meeting_link
+) VALUES (
+    '66666666-6666-6666-6666-666666666666',
+    '55555555-5555-5555-5555-555555555555',
+    'evt_seed_001',
+    'primary',
+    'Primary',
+    'Seed Meeting: Project Kickoff',
+    now() - interval '2 days' + interval '5 minutes',
+    now() - interval '2 days' + interval '47 minutes',
+    ARRAY['alex@example.com'],
+    'https://meet.example.invalid/seed'
+)
+ON CONFLICT (id) DO NOTHING;
+
+INSERT INTO noteflow.meeting_calendar_links (meeting_id, calendar_event_id)
+VALUES ('11111111-1111-1111-1111-111111111111', '66666666-6666-6666-6666-666666666666')
+ON CONFLICT DO NOTHING;
+
+-- Preferences KV used by server-side logic (stored as {"value": ...})
+INSERT INTO noteflow.user_preferences (key, value)
+VALUES
+    ('cloud_consent_granted', '{"value": false}'::jsonb),
+    ('schema_seeded', '{"value": true}'::jsonb)
+ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value, updated_at = now();
+
+-- Keep sequences sane if you re-run
+SELECT setval('noteflow.segments_id_seq', (SELECT COALESCE(MAX(id), 1) FROM noteflow.segments));
+SELECT setval('noteflow.summaries_id_seq', (SELECT COALESCE(MAX(id), 1) FROM noteflow.summaries));
+SELECT setval('noteflow.key_points_id_seq', (SELECT COALESCE(MAX(id), 1) FROM noteflow.key_points));
+SELECT setval('noteflow.action_items_id_seq', (SELECT COALESCE(MAX(id), 1) FROM noteflow.action_items));
+SELECT setval('noteflow.annotations_id_seq', (SELECT COALESCE(MAX(id), 1) FROM noteflow.annotations));
+SELECT setval('noteflow.word_timings_id_seq', (SELECT COALESCE(MAX(id), 1) FROM noteflow.word_timings));
+SELECT setval('noteflow.streaming_diarization_turns_id_seq', (SELECT COALESCE(MAX(id), 1) FROM noteflow.streaming_diarization_turns));
\ No newline at end of file
diff --git a/docs/code-quality-correction-plan.md b/docs/code-quality-correction-plan.md
deleted file mode 100644
index f512e4c..0000000
--- a/docs/code-quality-correction-plan.md
+++ /dev/null
@@ -1,633 +0,0 @@
-# Code Quality Correction Plan
-
-This plan addresses code quality issues identified by automated testing across the NoteFlow codebase.
-
-## Executive Summary
-
-| Area | Failing Tests | Issues Found | Status |
-|------|---------------|--------------|--------|
-| Python Backend Code | 10 | 17 violations | 🔴 Thresholds tightened |
-| Python Test Smells | 7 | 223 smells | 🔴 Thresholds tightened |
-| React/TypeScript Frontend | 6 | 23 violations | 🔴 Already strict |
-| Rust/Tauri | 0 | 4 large files | ⚪ No quality tests |
-
-**2024-12-24 Update:** Quality test thresholds have been aggressively tightened to expose real technical debt. Previously, all tests passed because thresholds were set just above actual violation counts.
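-
-To make the threshold mechanics concrete, here is a minimal sketch of one such gate in the style of `tests/quality/` (illustrative only: the 750-line hard limit matches the module-size threshold tabulated below; the test name and failure format are hypothetical):
-
-```python
-from pathlib import Path
-
-HARD_LIMIT = 750  # hard module-size threshold used in this plan
-
-
-def test_no_oversized_modules() -> None:
-    offenders = [
-        str(path)
-        for path in Path("src/noteflow").rglob("*.py")
-        if len(path.read_text(encoding="utf-8").splitlines()) > HARD_LIMIT
-    ]
-    assert not offenders, f"Modules over {HARD_LIMIT} lines: {offenders}"
-```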
-
----
-
-## Phase 1: Python Backend (High Priority)
-
-### 1.1 Split `NoteFlowClient` God Class
-
-**File:** `src/noteflow/grpc/client.py` (942 lines, 32 methods)
-
-**Problem:** Single class combines 6 distinct concerns: connection management, streaming, meeting CRUD, annotation CRUD, export, and diarization.
-
-**Solution:** Apply mixin pattern (already used successfully in `grpc/_mixins/`).
-
-```
-src/noteflow/grpc/
-├── client.py              # Thin facade (~100 lines)
-├── _client_mixins/
-│   ├── __init__.py
-│   ├── connection.py      # GrpcConnectionMixin (~100 lines)
-│   ├── streaming.py       # AudioStreamingMixin (~150 lines)
-│   ├── meeting.py         # MeetingClientMixin (~100 lines)
-│   ├── annotation.py      # AnnotationClientMixin (~150 lines)
-│   ├── export.py          # ExportClientMixin (~50 lines)
-│   ├── diarization.py     # DiarizationClientMixin (~100 lines)
-│   └── converters.py      # Proto conversion helpers (~100 lines)
-└── ...
-```
-
-**Steps:**
-1. Create `_client_mixins/` directory structure
-2. Extract `converters.py` with static proto conversion functions
-3. Extract each mixin with focused responsibilities
-4. Compose `NoteFlowClient` from mixins
-5. Update imports in dependent code
-
-**Estimated Impact:** -800 lines in a single file, +750 lines across 7 focused files
-
----
-
-### 1.2 Reduce `StreamTranscription` Complexity
-
-**File:** `src/noteflow/grpc/_mixins/streaming.py` (579 lines, complexity=16)
-
-**Problem:** 11 per-meeting state dictionaries, deeply nested async generators.
-
-**Solution:** Create `StreamingSession` class to encapsulate per-meeting state.
-
-```python
-# New file: src/noteflow/grpc/_mixins/_streaming_session.py
-
-@dataclass
-class StreamingSession:
-    """Encapsulates all per-meeting streaming state."""
-    meeting_id: str
-    vad: StreamingVad
-    segmenter: Segmenter
-    partial_state: PartialState
-    diarization_state: DiarizationState | None
-    audio_writer: BufferedAudioWriter | None
-    next_segment_id: int
-    stop_requested: bool = False
-
-    @classmethod
-    async def create(cls, meeting_id: str, host: ServicerHost, ...) -> "StreamingSession":
-        """Factory method for session initialization."""
-        ...
-```
-
-**Steps:**
-1. Define `StreamingSession` dataclass with all session state
-2. Extract `PartialState` and `DiarizationState` as nested dataclasses
-3. Replace dictionary lookups (`self._vad_instances[meeting_id]`) with session attributes
-4. Move helper methods into session class where appropriate
-5. Simplify `StreamTranscription` to manage session lifecycle
-
-**Estimated Impact:** Complexity 16 → 10, clearer state management
-
----
-
-### 1.3 Create Server Configuration Objects
-
-**File:** `src/noteflow/grpc/server.py` (430 lines)
-
-**Problem:** `run_server()` has 12 parameters, `main()` has 124 lines of argument parsing.
-
-**Solution:** Create configuration dataclasses.
-
-```python
-# New file: src/noteflow/grpc/_config.py
-
-@dataclass(frozen=True)
-class AsrConfig:
-    model: str
-    device: str
-    compute_type: str
-
-@dataclass(frozen=True)
-class DiarizationConfig:
-    enabled: bool = False
-    hf_token: str | None = None
-    device: str = "auto"
-    streaming_latency: float | None = None
-    min_speakers: int | None = None
-    max_speakers: int | None = None
-    refinement_enabled: bool = True
-
-@dataclass(frozen=True)
-class ServerConfig:
-    port: int
-    asr: AsrConfig
-    database_url: str | None = None
-    diarization: DiarizationConfig | None = None
-```
-
-**Steps:**
-1. Create `_config.py` with config dataclasses
-2. Refactor `run_server()` to accept `ServerConfig`
-3. Extract `_parse_arguments()` function from `main()`
-4. Create `_build_config()` to construct config from args
-5. Extract `ServerBootstrap` class for initialization phases
-
-**Estimated Impact:** 12 params → 3, functions 146 → ~60 lines each
-
----
-
-### 1.4 Simplify `parse_llm_response`
-
-**File:** `src/noteflow/infrastructure/summarization/_parsing.py` (complexity=21)
-
-**Problem:** Multiple parsing phases, repeated patterns for key_points/action_items.
-
-**Solution:** Extract helper functions for common patterns.
-
-```python
-# Refactored structure
-def _strip_markdown_fences(text: str) -> str:
-    """Remove markdown code block delimiters."""
-    ...
-
-def _parse_items[T](
-    raw_items: list[dict],
-    valid_segment_ids: set[int],
-    segments: Sequence[Segment],
-    item_factory: Callable[..., T],
-) -> list[T]:
-    """Generic parser for key_points and action_items."""
-    ...
-
-def parse_llm_response(
-    raw_response: str,
-    request: SummarizationRequest,
-) -> Summary:
-    """Parse LLM JSON response into Summary entity."""
-    text = _strip_markdown_fences(raw_response)
-    data = json.loads(text)
-    valid_ids = {seg.id for seg in request.segments}
-
-    key_points = _parse_items(data.get("key_points", []), valid_ids, ...)
-    action_items = _parse_items(data.get("action_items", []), valid_ids, ...)
-
-    return Summary(...)
-```
-
-**Steps:**
-1. Extract `_strip_markdown_fences()` helper
-2. Create generic `_parse_items()` function
-3. Simplify `parse_llm_response()` to use helpers
-4. Add unit tests for extracted functions
-
-**Estimated Impact:** Complexity 21 → 12
-
----
-
-### 1.5 Update Quality Test Thresholds
-
-The feature envy test has 39 false positives because converters and repositories legitimately work with external objects.
-
-**File:** `tests/quality/test_code_smells.py`
-
-**Changes:**
-```python
-def test_no_feature_envy() -> None:
-    """Detect methods that use other objects more than self."""
-    # Exclude known patterns that are NOT feature envy:
-    # - Converter classes (naturally transform external objects)
-    # - Repository methods (query + convert pattern)
-    # - Exporter classes (transform domain to output)
-    excluded_patterns = [
-        "converter",
-        "repo",
-        "exporter",
-        "_to_domain",
-        "_to_proto",
-        "_proto_to_",
-    ]
-    ...
-```
-
----
-
-## Phase 2: React/TypeScript Frontend (High Priority)
-
-### 2.1 Split `Settings.tsx` into Sub-Components
-
-**File:** `client/src/pages/Settings.tsx` (1,831 lines)
-
-**Problem:** Monolithic page with 7+ concerns mixed together.
-
-**Solution:** Extract into settings module.
-
-```
-client/src/pages/settings/
-├── Settings.tsx                  # Page orchestrator (~150 lines)
-├── components/
-│   ├── ServerConnectionPanel.tsx # Connection settings (~150 lines)
-│   ├── AudioDevicePanel.tsx      # Audio device selection (~200 lines)
-│   ├── ProviderConfigPanel.tsx   # AI provider configs (~400 lines)
-│   ├── AITemplatePanel.tsx       # Tone/format/verbosity (~150 lines)
-│   ├── SyncPanel.tsx             # Sync settings (~100 lines)
-│   ├── IntegrationsPanel.tsx     # Third-party integrations (~200 lines)
-│   └── QuickActionsPanel.tsx     # Quick actions bar (~80 lines)
-└── hooks/
-    ├── useProviderConfig.ts      # Provider state management (~150 lines)
-    └── useServerConnection.ts    # Connection state (~100 lines)
-```
-
-**Steps:**
-1. Create `settings/` directory structure
-2. Extract `useProviderConfig` hook for shared provider logic
-3. Extract each accordion section into focused component
-4. Create shared `ProviderConfigCard` component for reuse
-5. Update routing to use new `Settings.tsx`
-
-**Estimated Impact:** 1,831 lines → ~150 lines main + 1,500 distributed
-
----
-
-### 2.2 Centralize Configuration Constants
-
-**Problem:** Hardcoded endpoints scattered across 4 files.
-
-**Solution:** Create centralized configuration.
-
-```typescript
-// client/src/lib/config/index.ts
-export * from './provider-endpoints';
-export * from './defaults';
-export * from './server';
-
-// client/src/lib/config/provider-endpoints.ts
-export const PROVIDER_ENDPOINTS = {
-  openai: 'https://api.openai.com/v1',
-  anthropic: 'https://api.anthropic.com/v1',
-  google: 'https://generativelanguage.googleapis.com/v1',
-  azure: 'https://{resource}.openai.azure.com',
-  ollama: 'http://localhost:11434/api',
-  deepgram: 'https://api.deepgram.com/v1',
-  elevenlabs: 'https://api.elevenlabs.io/v1',
-} as const;
-
-// client/src/lib/config/server.ts
-export const SERVER_DEFAULTS = {
-  HOST: 'localhost',
-  PORT: 50051,
-} as const;
-
-// client/src/lib/config/defaults.ts
-export const DEFAULT_PREFERENCES = { ... };
-```
-
-**Files to Update:**
-- `lib/ai-providers.ts` - Import from config
-- `lib/preferences.ts` - Import defaults from config
-- `pages/Settings.tsx` - Import server defaults
-
-**Estimated Impact:** Eliminates 16 hardcoded endpoint violations
-
----
-
-### 2.3 Extract Shared Adapter Utilities
-
-**Files:** `api/mock-adapter.ts` (637 lines), `api/tauri-adapter.ts` (635 lines)
-
-**Problem:** ~150 lines of duplicated helper code.
-
-**Solution:** Extract shared utilities.
-
-```typescript
-// client/src/api/constants.ts
-export const TauriCommands = { ... };
-export const TauriEvents = { ... };
-
-// client/src/api/helpers.ts
-export function isRecord(value: unknown): value is Record<string, unknown> { ... }
-export function extractStringArrayFromRecords(records: unknown[], key: string): string[] { ... }
-export function getErrorMessage(value: unknown): string | undefined { ... }
-export function normalizeSuccessResponse(response: boolean | { success: boolean }): boolean { ... }
-export function stateToGrpcEnum(state: string): number { ... }
-```
-
-**Steps:**
-1. Create `api/constants.ts` with shared command/event names
-2. Create `api/helpers.ts` with type guards and converters
-3. Update both adapters to import from shared modules
-4. Remove duplicated code
-
-**Estimated Impact:** -150 lines of duplication
-
----
-
-### 2.4 Refactor `lib/preferences.ts`
-
-**File:** `client/src/lib/preferences.ts` (670 lines)
-
-**Problem:** 15 identical setter patterns.
-
-**Solution:** Create generic setter factory.
-
-```typescript
-// Before: 15 methods like this
-setTranscriptionProvider(provider: TranscriptionProviderType, baseUrl: string): void {
-  const prefs = loadPreferences();
-  prefs.ai_config.transcription.provider = provider;
-  prefs.ai_config.transcription.base_url = baseUrl;
-  prefs.ai_config.transcription.test_status = 'untested';
-  savePreferences(prefs);
-}
-
-// After: Single generic function
-updateAIConfig<K extends keyof AIConfig>(
-  configType: K,
-  updates: Partial<AIConfig[K]>
-): void {
-  const prefs = loadPreferences();
-  prefs.ai_config[configType] = {
-    ...prefs.ai_config[configType],
-    ...updates,
-    test_status: 'untested',
-  };
-  savePreferences(prefs);
-}
-```
-
-**Steps:**
-1. Create generic `updateAIConfig()` function
-2. Deprecate individual setter methods
-3. Update Settings.tsx to use generic setter
-4. Remove deprecated methods after migration
-
-**Estimated Impact:** -200 lines of repetitive code
-
----
-
-### 2.5 Split Type Definitions
-
-**File:** `client/src/api/types.ts` (659 lines)
-
-**Solution:** Organize into focused modules.
-
-```
-client/src/api/types/
-├── index.ts          # Re-exports all
-├── enums.ts          # All enum types (~100 lines)
-├── messages.ts       # Core DTOs (Meeting, Segment, etc.) (~200 lines)
-├── requests.ts       # Request/Response types (~150 lines)
-├── config.ts         # Provider config types (~100 lines)
-└── integrations.ts   # Integration types (~80 lines)
-```
-
-**Steps:**
-1. Create `types/` directory
-2. Split types by domain (safe refactor - no logic changes)
-3. Create `index.ts` with re-exports
-4. Update imports across codebase
-
-**Estimated Impact:** Better organization, easier navigation
-
----
-
-## Phase 3: Component Refactoring (Medium Priority)
-
-### 3.1 Split `Recording.tsx`
-
-**File:** `client/src/pages/Recording.tsx` (641 lines)
-
-**Solution:** Extract hooks and components.
-
-```
-client/src/pages/recording/
-├── Recording.tsx                 # Orchestrator (~100 lines)
-├── hooks/
-│   ├── useRecordingState.ts      # State machine (~150 lines)
-│   ├── useTranscriptionStream.ts # Stream handling (~120 lines)
-│   └── useRecordingControls.ts   # Control actions (~80 lines)
-└── components/
-    ├── RecordingHeader.tsx       # Title + timer (~50 lines)
-    ├── TranscriptPanel.tsx       # Transcript display (~80 lines)
-    ├── NotesPanel.tsx            # Notes editor (~70 lines)
-    └── RecordingControls.tsx     # Control buttons (~50 lines)
-```
-
----
-
-### 3.2 Split `sidebar.tsx`
-
-**File:** `client/src/components/ui/sidebar.tsx` (639 lines)
-
-**Solution:** Split into sidebar module with sub-components.
-
-```
-client/src/components/ui/sidebar/
-├── index.ts                   # Re-exports
-├── context.ts                 # SidebarContext + useSidebar (~50 lines)
-├── provider.tsx               # SidebarProvider (~200 lines)
-└── components/
-    ├── sidebar-trigger.tsx    # (~40 lines)
-    ├── sidebar-rail.tsx       # (~40 lines)
-    ├── sidebar-content.tsx    # (~40 lines)
-    ├── sidebar-menu.tsx       # (~60 lines)
-    └── sidebar-inset.tsx      # (~20 lines)
-```
-
----
-
-### 3.3 Refactor `ai-providers.ts`
-
-**File:** `client/src/lib/ai-providers.ts` (618 lines)
-
-**Problem:** 7 provider-specific fetch functions with duplicated error handling.
-
-**Solution:** Create provider metadata + generic fetcher.
-
-```typescript
-// client/src/lib/ai-providers/provider-metadata.ts
-interface ProviderMetadata {
-  value: string;
-  label: string;
-  defaultUrl: string;
-  authHeader: { name: string; prefix: string };
-  modelsEndpoint: string | null;
-  modelKey: string;
-  fallbackModels: string[];
-}
-
-export const PROVIDERS: Record<string, ProviderMetadata> = {
-  openai: {
-    value: 'openai',
-    label: 'OpenAI',
-    defaultUrl: PROVIDER_ENDPOINTS.openai,
-    authHeader: { name: 'Authorization', prefix: 'Bearer ' },
-    modelsEndpoint: '/models',
-    modelKey: 'id',
-    fallbackModels: ['gpt-4o', 'gpt-4o-mini', 'gpt-4-turbo'],
-  },
-  // ... other providers
-};
-
-// client/src/lib/ai-providers/model-fetcher.ts
-export async function fetchModels(
-  provider: string,
-  baseUrl: string,
-  apiKey: string
-): Promise<string[]> {
-  const meta = PROVIDERS[provider];
-  if (!meta?.modelsEndpoint) return meta?.fallbackModels ?? [];
-
-  const response = await fetch(`${baseUrl}${meta.modelsEndpoint}`, {
-    headers: { [meta.authHeader.name]: `${meta.authHeader.prefix}${apiKey}` },
-  });
-
-  const data = await response.json();
-  return extractModels(data, meta.modelKey);
-}
-```
-
----
-
-## Phase 4: Rust/Tauri (Low Priority)
-
-### 4.1 Add Clippy Lints
-
-**File:** `client/src-tauri/Cargo.toml`
-
-Add additional clippy lints:
-```toml
-[lints.clippy]
-unwrap_used = "warn"
-expect_used = "warn"
-todo = "warn"
-cognitive_complexity = "warn"
-```
-
-### 4.2 Review Clone Usage
-
-Run quality script and address files with excessive `.clone()` calls.
-
----
-
-## Implementation Order
-
-### Week 1: Configuration & Quick Wins
-1. ✅ Create `lib/config/` with centralized endpoints
-2. ✅ Extract `api/helpers.ts` shared utilities
-3. ✅ Update quality test thresholds for false positives
-4. ✅ Tighten Python quality test thresholds (2024-12-24)
-5. ✅ Add test smell detection suite (15 tests) (2024-12-24)
-
-### Week 2: Python Backend Core
-4. Create `ServerConfig` dataclasses
-5. Refactor `run_server()` to use config
-6. Extract `parse_llm_response` helpers
-
-### Week 3: Client God Class
-7. Create `_client_mixins/converters.py`
-8. Extract connection mixin
-9. Extract streaming mixin
-10. Extract remaining mixins
-11. Compose `NoteFlowClient` from mixins
-
-### Week 4: Frontend Pages
-12. Split `Settings.tsx` into sub-components
-13. Create `useProviderConfig` hook
-14. Refactor `preferences.ts` with generic setter
-
-### Week 5: Streaming & Types
-15. Create `StreamingSession` class
-16. Split `api/types.ts` into modules
-17. Refactor `ai-providers.ts` with metadata
-
-### Week 6: Component Cleanup
-18. Split `Recording.tsx`
-19. Split `sidebar.tsx`
-20. Final quality test run & verification
-
----
-
-## Current Quality Test Status (2024-12-24)
-
-### Python Backend Tests (17 failures)
-
-| Test | Found | Threshold | Key Offenders |
-|------|-------|-----------|---------------|
-| Long parameter lists | 4 | ≤2 | `run_server` (12), `add_segment` (11) |
-| God classes | 3 | ≤1 | `NoteFlowClient` (32 methods, 815 lines) |
-| Long methods | 7 | ≤4 | `run_server` (145 lines), `main` (123) |
-| Module size (hard >750) | 1 | ≤0 | `client.py` (942 lines) |
-| Module size (soft >500) | 3 | ≤1 | `streaming.py`, `diarization.py` |
-| Scattered helpers | 21 | ≤10 | Helpers across unrelated modules |
-| Duplicate helper signatures | 32 | ≤20 | `is_enabled` (7x), `get_by_meeting` (6x) |
-| Repeated code patterns | 92 | ≤50 | Docstring blocks, method signatures |
-| Magic numbers | 15 | ≤10 | `10` (20x), `1024` (14x), `5` (13x) |
-| Repeated strings | 53 | ≤30 | Log messages, schema names |
-| Thin wrappers | 46 | ≤25 | Passthrough functions |
-
-### Python Test Smell Tests (7 failures)
-
-| Test | Found | Threshold | Issue |
-|------|-------|-----------|-------|
-| Assertion roulette | 91 | ≤50 | Tests with naked asserts (no messages) |
-| Conditional test logic | 75 | ≤40 | Loops/ifs in test bodies |
-| Sleepy tests | 5 | ≤3 | Uses `time.sleep()` |
-| Broad exception handling | 5 | ≤3 | Catches generic `Exception` |
-| Sensitive equality | 12 | ≤10 | Comparing `str()` output |
-| Duplicate test names | 26 | ≤15 | Same test name in multiple files |
-| Long test methods | 5 | ≤3 | Tests exceeding 50 lines |
-
-### Frontend Tests (6 failures)
-
-| Test | Found | Threshold | Key Offenders |
-|------|-------|-----------|---------------|
-| Overly long files | 9 | ≤3 | `Settings.tsx` (1832!), 8 others >500 |
-| Hardcoded endpoints | 4 | 0 | API URLs outside config |
-| Nested ternaries | 1 | 0 | Complex conditional |
-| TODO/FIXME comments | >15 | ≤15 | Technical debt markers |
-| Commented-out code | >10 | ≤10 | Stale code blocks |
-
-### Rust/Tauri (no quality tests yet)
-
-Large files that could benefit from splitting:
-- `noteflow.rs`: 1205 lines (generated proto)
-- `recording.rs`: 897 lines
-- `app_state.rs`: 851 lines
-- `client.rs`: 681 lines
-
----
-
-## Success Metrics
-
-| Metric | Current | Target |
-|--------|---------|--------|
-| Python files > 750 lines | 1 | 0 |
-| TypeScript files > 500 lines | 9 | 3 |
-| Functions > 100 lines | 8 | 2 |
-| Cyclomatic complexity > 15 | 2 | 0 |
-| Functions with > 7 params | 4 | 0 |
-| Hardcoded endpoints | 4 | 0 |
-| Duplicated adapter code | ~150 lines | 0 |
-| Python quality tests passing | 23/40 (58%) | 38/40 (95%) |
-| Frontend quality tests passing | 15/21 (71%) | 20/21 (95%) |
-
----
-
-## Notes
-
-### False Positives to Ignore
-
-The following "feature envy" detections are **correct design patterns** and should NOT be refactored:
-
-1. **Converter classes** (`OrmConverter`, `AsrConverter`) - Inherently transform external objects
-2. **Repository methods** - Query→fetch→convert is the standard pattern
-3. **Exporter classes** - Transformation classes work with domain entities
-4. **Proto converters in gRPC** - Proto→DTO adaptation is appropriate
-
-### Patterns to Preserve
-
-- Mixin architecture in `grpc/_mixins/` - Apply to client
-- Repository base class helpers - Keep shared utilities
-- Export formatting helpers - Already well-centralized
-- Domain utilities in `domain/utils/` - Appropriate location
diff --git a/docs/milestones.md b/docs/milestones.md
deleted file mode 100644
index 5927b02..0000000
--- a/docs/milestones.md
+++ /dev/null
@@ -1,1098 +0,0 @@
-# NoteFlow V1 Implementation Plan
-
-**Architecture:** Client-Server with gRPC (evolved from original single-process design)
-**Core principles:** Local-first, mic capture baseline, partial→final transcripts, evidence-linked summaries with strict citation enforcement.
-
-**Last updated:** December 2025
-
----
-
-## 1) Milestones and Gates
-
-### Milestone 0 — Spikes to de-risk platform & pipeline ✅ COMPLETE
-
-**Goal:** validate the 4 biggest "desktop app cliffs" before committing to architecture.
-
-**Spikes (each ends with a tiny working prototype + written findings):**
-
-1. **UI + Tray + Hotkeys feasibility** ✅
-
-   * Verified: system tray/menubar icon, notification prompt, global hotkey start/stop
-   * Flet works for main UI; pystray/pynput validated for tray + hotkeys
-   * Location: `spikes/spike_01_ui_tray_hotkeys/`
-
-2. **Audio capture robustness** ✅
-
-   * Validated `sounddevice.InputStream` with PortAudio:
-     * default mic capture works
-     * device unplug / device switch handling
-     * stable VU meter feed
-   * Location: `spikes/spike_02_audio_capture/`
-
-3. **ASR latency feasibility** ✅
-
-   * faster-whisper benchmarked at 0.05x real-time (excellent)
-   * Model download/cache strategy validated
-   * Location: `spikes/spike_03_asr_latency/`
-
-4. **Key storage + encryption approach** ✅
-
-   * OS keystore integration works (Keychain/Credential Manager via `keyring`)
-   * Encrypted streaming audio file validated (chunked AES-GCM, 826 MB/s throughput)
-   * Location: `spikes/spike_04_encryption/`
-
-**Exit criteria (M0):** ✅ ALL MET
-
-* [x] Start recording → see VU meter → stop → playback file on both OSs
-* [x] Run ASR over captured audio and display text in UI
-* [x] Store/read an encrypted blob using a stored master key
-
----
-
-### Milestone 1 — Repo foundation + CI + core contracts ✅ COMPLETE
-
-**Goal:** establish maintainable structure, typing, test harness, logging.
-
-**Deliverables:** ✅ ALL COMPLETE
-
-* [x] Repository layout with hexagonal architecture (domain → application → infrastructure)
-* [x] `pyproject.toml` + uv lockfile
-* [x] Quality gates: `ruff`, `basedpyright`, `pytest`
-* [x] Structured logging (structlog) with content-safe defaults
-* [x] Settings system (Pydantic settings + `NOTEFLOW_` env vars)
-* [x] Minimal "app shell" (Flet UI opens, logs write)
-
-**Implementation locations:**
-* Domain: `src/noteflow/domain/` (entities, ports, value objects)
-* Application: `src/noteflow/application/services/`
-* Infrastructure: `src/noteflow/infrastructure/`
-* Config: `src/noteflow/config/`
-
-**Exit criteria:** ✅ ALL MET
-
-* [x] CI passes lint/type/tests
-* [x] Running app opens a window (tray integration deferred to M5)
-
----
-
-### Milestone 2 — Meeting lifecycle + mic capture + crash-safe persistence ✅ COMPLETE
-
-**Goal:** reliable recording as the foundation.
-
-**Deliverables:** ✅ ALL COMPLETE
-
-* [x] `MeetingService` state machine (CREATED → RECORDING → STOPPING → STOPPED → COMPLETED)
-  * Location: `src/noteflow/application/services/meeting_service.py`
-* [x] Audio capture via `SoundDeviceCapture`
-  * Location: `src/noteflow/infrastructure/audio/capture.py`
-* [x] Encrypted streaming asset writer (NFAE format, AES-256-GCM)
-  * Location: `src/noteflow/infrastructure/audio/writer.py`
-  * Crypto: `src/noteflow/infrastructure/security/crypto.py`
-* [x] Meeting folder layout + manifest.json
-  * Format: `~/.noteflow/meetings/<meeting_id>/audio.enc` + `manifest.json`
-* [x] Active Meeting UI: timer + VU meter + start/stop
-  * Components: `recording_timer.py`, `vu_meter.py` in `client/components/`
-* [x] Crash recovery via `RecoveryService`
-  * Location: `src/noteflow/application/services/recovery_service.py`
-  * Detects meetings left in RECORDING/STOPPING state, marks as ERROR
-
-**Exit criteria:** ✅ ALL MET
-
-* [x] Record 30 minutes without UI freezing
-* [x] App restart after forced kill recovers incomplete meetings
-
----
-
-### Milestone 3 — Partial→Final transcription + transcript persistence ✅ COMPLETE
-
-**Goal:** near real-time transcription with stability rules.
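-
-The stability rules reduce to a small amount of per-stream state; here is a minimal sketch (the 2-second cadence and 0.5-second minimum come from the implementation details below, while the class and method names are hypothetical):
-
-```python
-import time
-from dataclasses import dataclass
-
-PARTIAL_INTERVAL_S = 2.0  # emit partials on a 2-second cadence
-MIN_AUDIO_S = 0.5  # require 0.5 s of buffered audio before partial inference
-
-
-@dataclass
-class PartialEmitter:
-    last_emit: float = 0.0
-    last_text: str = ""
-
-    def maybe_emit(self, buffered_audio_s: float, text: str) -> str | None:
-        now = time.monotonic()
-        if buffered_audio_s < MIN_AUDIO_S or now - self.last_emit < PARTIAL_INTERVAL_S:
-            return None
-        if text == self.last_text:  # deduplicate: only emit when the text changed
-            return None
-        self.last_emit, self.last_text = now, text
-        return text
-```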
-
-**Deliverables:** ✅ ALL COMPLETE
-
-* [x] ASR wrapper service (faster-whisper with word timestamps)
-  * Location: `src/noteflow/infrastructure/asr/engine.py`
-  * Supports 13 model sizes, CPU/GPU, word-level timestamps
-* [x] VAD + segment finalization logic
-  * EnergyVad: `src/noteflow/infrastructure/asr/streaming_vad.py`
-  * Segmenter: `src/noteflow/infrastructure/asr/segmenter.py`
-* [x] Partial transcript feed to UI
-  * Server: `_maybe_emit_partial()` called during streaming (`service.py:601`)
-  * 2-second cadence with text deduplication
-  * Client: Handles `is_final=False` in `client.py:458-467`
-  * UI: `[LIVE]` row with blue styling (`transcript.py:182-219`)
-* [x] Final segments persisted to PostgreSQL + pgvector
-  * Repository: `src/noteflow/infrastructure/persistence/repositories/segment.py`
-* [x] Post-meeting transcript view
-  * Component: `src/noteflow/client/components/transcript.py`
-
-**Implementation details:**
-
-* Server emits `UPDATE_TYPE_PARTIAL` every 2 seconds during speech activity
-* Minimum 0.5 seconds of audio before partial inference
-* Partial text deduplicated (only emitted when changed)
-* Client renders partials with `is_final=False` flag
-* UI displays `[LIVE]` indicator with blue background, grey italic text
-* Partial row cleared when final segment arrives
-
-**Exit criteria:** ✅ ALL MET
-
-* [x] Live view shows partial text that settles into final segments
-* [x] After restart, final segments are still present and searchable within the meeting
-
----
-
-### Milestone 4 — Review UX: playback, annotations, export ✅ MOSTLY COMPLETE
-
-**Goal:** navigable recall loop.
-
-**Deliverables:**
-
-* [x] Audio playback synced to segment timestamps
-  * `PlaybackControls`: `src/noteflow/client/components/playback_controls.py`
-  * `PlaybackSyncController`: `src/noteflow/client/components/playback_sync.py`
-  * `SoundDevicePlayback`: `src/noteflow/infrastructure/audio/playback.py`
-* [x] Add annotations in live view + review view
-  * `AnnotationToolbar`: `src/noteflow/client/components/annotation_toolbar.py`
-  * `AnnotationDisplay`: `src/noteflow/client/components/annotation_display.py`
-  * All 4 types: `ACTION_ITEM`, `DECISION`, `NOTE`, `RISK`
-* [x] Export: Markdown + HTML
-  * `ExportService`: `src/noteflow/application/services/export_service.py`
-  * Markdown exporter: `src/noteflow/infrastructure/export/markdown.py`
-  * HTML exporter: `src/noteflow/infrastructure/export/html.py`
-* [x] Meeting library list + per-meeting search
-  * `MeetingLibrary`: `src/noteflow/client/components/meeting_library.py`
-  * `TranscriptComponent` with search: `src/noteflow/client/components/transcript.py`
-
-**Previous gaps — now closed:**
-
-* [x] Wire meeting library into the main UI and selection flow
-* [x] Add per-meeting transcript search (client-side filter)
-* [x] Add `risk` annotation type end-to-end (domain enum, UI, persistence)
-
-**Exit criteria:** ✅ ALL MET
-
-* [x] Clicking a segment seeks audio playback to that time
-* [x] Export produces correct Markdown/HTML for at least one meeting
-
----
-
-### Milestone 5 — Smart triggers (confidence model) + snooze/suppression ⚠️ PARTIALLY INTEGRATED
-
-**Goal:** prompts that are helpful, not annoying.
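-
-A minimal sketch of the scoring decision this milestone describes, using the documented 0.40 notify / 0.80 auto-start thresholds (the function shape is illustrative, not the actual `TriggerService` API):
-
-```python
-from enum import Enum
-
-NOTIFY_THRESHOLD = 0.40
-AUTO_START_THRESHOLD = 0.80
-
-
-class TriggerAction(Enum):
-    IGNORE = "ignore"
-    NOTIFY = "notify"
-    AUTO_START = "auto_start"
-
-
-def decide(confidence: float, snoozed: bool, auto_start_opt_in: bool) -> TriggerAction:
-    if snoozed or confidence < NOTIFY_THRESHOLD:
-        return TriggerAction.IGNORE
-    if confidence >= AUTO_START_THRESHOLD and auto_start_opt_in:
-        return TriggerAction.AUTO_START
-    return TriggerAction.NOTIFY
-```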
- -**Deliverables:** - -* [x] Trigger engine + scoring - * `TriggerService`: `src/noteflow/application/services/trigger_service.py` - * Domain entities: `src/noteflow/domain/triggers/entities.py` - * `TriggerSignal`, `TriggerDecision`, `TriggerAction` (IGNORE, NOTIFY, AUTO_START) -* [x] `SignalProvider` protocol defined - * Location: `src/noteflow/domain/triggers/ports.py` -* [x] Foreground app detector integration - * Infrastructure: `src/noteflow/infrastructure/triggers/foreground_app.py` - * Wired via `TriggerMixin`: `src/noteflow/client/_trigger_mixin.py` -* [x] Audio activity detector integration - * Infrastructure: `src/noteflow/infrastructure/triggers/audio_activity.py` - * Wired via `TriggerMixin`: `src/noteflow/client/_trigger_mixin.py` -* [ ] Optional calendar connector stub (disabled by default) -* [x] Trigger prompts + snooze (AlertDialog, not system notifications) - * `TriggerMixin._show_trigger_prompt()` displays AlertDialog - * Snooze button integrated - * Rate limiting active -* [ ] **System tray integration** ← GAP -* [ ] **Global hotkeys** ← GAP -* [x] Settings for sensitivity and auto-start opt-in (in `TriggerService`) - -**Current integration status:** - -* Client app inherits from `TriggerMixin` (`app.py:65`) -* Signal providers initialized in `_initialize_triggers()` method -* Background trigger check loop runs via `_trigger_check_loop()` -* Handles NOTIFY and AUTO_START actions -* Prompts shown via Flet AlertDialog (not system notifications) - -**What works:** -* Confidence scoring with configurable thresholds (0.40 notify, 0.80 auto-start) -* Rate limiting between triggers -* Snooze functionality with remaining time tracking -* Per-app suppression config -* Foreground app detection (PyWinCtl) -* Audio activity detection (RMS sliding window) - -**Remaining work:** - -1. **System Tray Integration** (New file: `src/noteflow/client/tray.py`) - * Integrate pystray for minimize-to-tray - * Show trigger prompts as system notifications - * Recording indicator icon - * Complexity: Medium (spike validated in `spikes/spike_01_ui_tray_hotkeys/`) - -2. **Global Hotkeys** (New file: `src/noteflow/client/hotkeys.py`) - * Integrate pynput for start/stop/annotation hotkeys - * Complexity: Medium (spike validated) - -**Exit criteria:** - -* [x] Trigger prompts happen when expected and can be snoozed -* [x] Prompt rate-limited to prevent spam -* [ ] System tray notifications (currently AlertDialog only) -* [ ] Global hotkeys for quick actions - ---- - -### Milestone 6 β€” Evidence-linked summaries (extract β†’ synthesize β†’ verify) βœ… COMPLETE - -**Goal:** no uncited claims. 
- -**Deliverables:** - -* [x] Summarizer provider interface - * Protocol: `src/noteflow/domain/summarization/ports.py` - * DTOs: `SummarizationRequest`, `SummarizationResult`, `CitationVerificationResult` -* [x] Provider implementations (3 complete): - * `MockSummarizer`: `src/noteflow/infrastructure/summarization/mock_provider.py` - * `OllamaSummarizer` (local): `src/noteflow/infrastructure/summarization/ollama_provider.py` - * `CloudSummarizer` (OpenAI/Anthropic): `src/noteflow/infrastructure/summarization/cloud_provider.py` -* [x] Citation verifier + "uncited drafts" handling - * `CitationVerifier`: `src/noteflow/infrastructure/summarization/citation_verifier.py` - * Validates segment_ids, filters invalid citations -* [x] Summary UI panel with clickable citations - * `SummaryPanel`: `src/noteflow/client/components/summary_panel.py` - * Shows key points + action items with evidence links - * "Uncited drafts hidden" toggle -* [x] Factory function for service creation - * `create_summarization_service()`: `src/noteflow/infrastructure/summarization/factory.py` - * Shared by client app and gRPC server - -**Application service complete:** - -* `SummarizationService`: `src/noteflow/application/services/summarization_service.py` -* Multi-provider with consent management -* Fallback chain: CLOUD β†’ LOCAL β†’ MOCK -* Citation verification and filtering - -**gRPC integration complete:** - -* `GenerateSummary` RPC calls `SummarizationService.summarize()` -* Auto-detects provider availability (tries LOCAL, falls back to MOCK) -* Placeholder fallback if service unavailable - -**Exit criteria:** βœ… ALL MET - -* [x] Every displayed bullet has citations (RPC wired to real service) -* [x] Clicking bullet jumps to cited transcript segment and audio timestamp - ---- - -### Milestone 7 β€” Retention, deletion, telemetry (opt-in), packaging ⚠️ RETENTION COMPLETE - -**Goal:** ship safely. - -**Deliverables:** - -* [x] Retention job - * `RetentionService`: `src/noteflow/application/services/retention_service.py` - * Configurable retention days, dry-run support - * Runs at startup and periodically -* [x] Delete meeting (cryptographic delete) - * `MeetingService.delete_meeting()` removes: - * Database rows (meeting, segments, summary, annotations) - * Encrypted audio file (`audio.enc`) - * Wrapped DEK from manifest (renders audio unrecoverable) - * Meeting directory -* [ ] **Optional telemetry (content-free)** ← GAP -* [ ] **PyInstaller build** ← GAP -* [ ] **"Check for updates" flow** ← GAP -* [ ] Release checklist & troubleshooting docs - -**What's implemented:** - -* Meeting deletion cascade is complete: - * DB cascade: meeting β†’ segments β†’ summary β†’ annotations - * Filesystem: `~/.noteflow/meetings//` removed - * Crypto: DEK deleted from manifest, audio unrecoverable - -* Retention service is complete: - * `RetentionService.run_cleanup()` with dry-run - * Finds meetings older than retention cutoff - * Generates `RetentionReport` with counts - * Integration tests validate cascade - -**Remaining work:** - -1. **PyInstaller Packaging** (New: build scripts) - * Create distributable for macOS + Windows - * Complexity: High (cross-platform, native deps) - -2. **Code Signing** - * macOS notarization, Windows signing - * Complexity: Medium - -3. **Update Check Flow** - * Version display + "Check for Updates" β†’ release page - * Complexity: Low - -4. 
**Telemetry (Opt-in)** - * Content-free metrics: crash stacktrace, latency, feature flags - * Complexity: Medium - -**Exit criteria:** - -* [ ] A signed installer that installs and runs on both OSs -* [x] Deleting a meeting removes DB rows + assets; audio cannot be decrypted after key deletion - ---- - -### Milestone 8 (Optional pre‑release) β€” Post-meeting anonymous diarization βœ… COMPLETE - -**Goal:** "Speaker A/B/C" best-effort labeling. - -**Deliverables:** - -* [x] Diarization engine with streaming + offline modes - * Location: `src/noteflow/infrastructure/diarization/engine.py` (315 lines) - * Streaming: `diart` library for real-time processing - * Offline: `pyannote.audio` for post-meeting refinement - * Device support: auto, cpu, cuda, mps -* [x] Speaker assignment logic - * Location: `src/noteflow/infrastructure/diarization/assigner.py` - * `assign_speaker()` maps time ranges via maximum overlap - * `assign_speakers_batch()` for bulk assignment - * Confidence scoring based on overlap duration -* [x] Data transfer objects - * Location: `src/noteflow/infrastructure/diarization/dto.py` - * `SpeakerTurn` with validation and overlap methods -* [x] Domain entity updates - * `Segment.speaker_id: str | None` and `speaker_confidence: float` -* [x] Proto/gRPC definitions - * `FinalSegment.speaker_id` and `speaker_confidence` fields - * `ServerInfo.diarization_enabled` and `diarization_ready` flags - * `RefineSpeakerDiarization` and `RenameSpeaker` RPCs -* [x] gRPC refinement RPC - * `refine_speaker_diarization()` in `service.py` for post-meeting processing - * `rename_speaker()` for user-friendly speaker labels -* [x] Configuration/settings - * `diarization_enabled`, `diarization_hf_token`, `diarization_device` - * `diarization_streaming_latency`, `diarization_min/max_speakers` -* [x] Dependencies added - * Optional extra `[diarization]`: pyannote.audio, diart, torch -* [x] UI display - * Speaker labels with color coding in `transcript.py` - * "Analyze Speakers" and "Rename Speakers" buttons in `meeting_library.py` -* [x] Server initialization - * `DiarizationEngine` wired in `server.py` with CLI args - * `--diarization`, `--diarization-hf-token`, `--diarization-device` flags -* [x] Client integration - * `refine_speaker_diarization()` and `rename_speaker()` methods in `client.py` - * `DiarizationResult` and `RenameSpeakerResult` DTOs -* [x] Tests - * 24 unit tests in `tests/infrastructure/test_diarization.py` - * Covers `SpeakerTurn`, `assign_speaker()`, `assign_speakers_batch()` - -**Deferred (optional future enhancement):** - -* [ ] **Streaming integration** - Real-time speaker labels during recording - * Feed audio chunks to diarization during `StreamTranscription` - * Emit speaker changes in real-time - * Complexity: High (requires significant latency tuning) - -**Exit criteria:** βœ… ALL MET - -* [x] If diarization fails, app degrades gracefully to "Unknown." -* [x] Post-meeting diarization refinement works end-to-end -* [ ] (Optional) Streaming diarization shows live speaker labels β€” deferred - ---- - -## 2) Proposed Repository Layout - -This layout is designed to: - -* separate server and client concerns, -* isolate platform-specific code, -* keep modules < 500 LoC, -* make DI clean, -* keep writing to disk centralized. 
- -```text -noteflow/ -β”œβ”€ pyproject.toml -β”œβ”€ src/noteflow/ -β”‚ β”œβ”€ core/ -β”‚ β”‚ β”œβ”€ config.py # Settings (Pydantic) + load/save -β”‚ β”‚ β”œβ”€ logging.py # structlog config, redaction helpers -β”‚ β”‚ β”œβ”€ types.py # common NewTypes / Protocols -β”‚ β”‚ └─ errors.py # domain error types -β”‚ β”‚ -β”‚ β”œβ”€ grpc/ # gRPC server components -β”‚ β”‚ β”œβ”€ proto/ -β”‚ β”‚ β”‚ β”œβ”€ noteflow.proto # Service definitions -β”‚ β”‚ β”‚ β”œβ”€ noteflow_pb2.py # Generated protobuf -β”‚ β”‚ β”‚ └─ noteflow_pb2_grpc.py -β”‚ β”‚ β”œβ”€ server.py # Server entry point -β”‚ β”‚ β”œβ”€ service.py # NoteFlowServicer implementation -β”‚ β”‚ β”œβ”€ meeting_store.py # In-memory meeting management -β”‚ β”‚ └─ client.py # gRPC client wrapper -β”‚ β”‚ -β”‚ β”œβ”€ client/ # GUI client application -β”‚ β”‚ β”œβ”€ app.py # Flet app entry point -β”‚ β”‚ β”œβ”€ state.py # App state store -β”‚ β”‚ └─ components/ -β”‚ β”‚ β”œβ”€ transcript.py -β”‚ β”‚ β”œβ”€ vu_meter.py -β”‚ β”‚ └─ summary_panel.py -β”‚ β”‚ -β”‚ β”œβ”€ audio/ # Audio capture (client-side) -β”‚ β”‚ β”œβ”€ capture.py # sounddevice InputStream wrapper -β”‚ β”‚ β”œβ”€ levels.py # RMS/VU meter computation -β”‚ β”‚ β”œβ”€ ring_buffer.py # timestamped audio buffer -β”‚ β”‚ └─ playback.py # audio playback synced to timestamp -β”‚ β”‚ -β”‚ β”œβ”€ asr/ # ASR engine (server-side) -β”‚ β”‚ β”œβ”€ engine.py # faster-whisper wrapper + model cache -β”‚ β”‚ β”œβ”€ segmenter.py # partial/final logic, silence boundaries -β”‚ β”‚ └─ dto.py # ASR outputs (words optional) -β”‚ β”‚ -β”‚ β”œβ”€ data/ # Persistence (server-side) -β”‚ β”‚ β”œβ”€ db.py # LanceDB connection + table handles -β”‚ β”‚ β”œβ”€ schema.py # table schemas + version -β”‚ β”‚ └─ repos/ -β”‚ β”‚ β”œβ”€ meetings.py -β”‚ β”‚ β”œβ”€ segments.py -β”‚ β”‚ └─ summaries.py -β”‚ β”‚ -β”‚ β”œβ”€ platform/ # Platform-specific (client-side) -β”‚ β”‚ β”œβ”€ tray/ # tray/menubar (pystray) -β”‚ β”‚ β”œβ”€ hotkeys/ # global hotkeys (pynput) -β”‚ β”‚ └─ notifications/ # toast notifications -β”‚ β”‚ -β”‚ └─ summarization/ # Summary generation (server-side) -β”‚ β”œβ”€ providers/ -β”‚ β”‚ β”œβ”€ base.py -β”‚ β”‚ └─ cloud.py -β”‚ β”œβ”€ prompts.py -β”‚ └─ verifier.py -β”‚ -β”œβ”€ spikes/ # De-risking spikes (M0) -β”‚ β”œβ”€ spike_01_ui_tray_hotkeys/ -β”‚ β”œβ”€ spike_02_audio_capture/ -β”‚ β”œβ”€ spike_03_asr_latency/ -β”‚ └─ spike_04_encryption/ -β”‚ -└─ tests/ - β”œβ”€ unit/ - β”œβ”€ integration/ - └─ e2e/ -``` - ---- - -## 3) Core Runtime Design - -### 3.1 State Machine (Meeting Lifecycle) - -Define explicitly so UI + services remain consistent. - -```text -IDLE - β”œβ”€ start(manual/trigger) β†’ RECORDING - └─ prompt(trigger) β†’ PROMPTED - -PROMPTED - β”œβ”€ accept β†’ RECORDING - └─ dismiss/snooze β†’ IDLE - -RECORDING - β”œβ”€ stop β†’ STOPPING - β”œβ”€ error(audio) β†’ ERROR (with recover attempt) - └─ crash β†’ RECOVERABLE_INCOMPLETE on restart - -STOPPING - β”œβ”€ flush assets/segments β†’ REVIEW_READY - └─ failure β†’ REVIEW_READY (marked incomplete) - -REVIEW_READY - β”œβ”€ summarize β†’ REVIEW_READY (summary updated) - └─ delete β†’ IDLE -``` - -**Invariant:** segments are only β€œfinal” when persisted. Partial text is never persisted. 
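-
-A minimal sketch of how these transitions could be encoded so UI and services share one source of truth (state names mirror the diagram above; the guard helper is illustrative, not the shipped implementation):
-
-```python
-from enum import Enum, auto
-
-
-class MeetingState(Enum):
-    IDLE = auto()
-    PROMPTED = auto()
-    RECORDING = auto()
-    STOPPING = auto()
-    REVIEW_READY = auto()
-    ERROR = auto()
-    RECOVERABLE_INCOMPLETE = auto()  # assigned at restart, not via transition()
-
-
-# Allowed moves, straight from the diagram above.
-_TRANSITIONS: dict[MeetingState, set[MeetingState]] = {
-    MeetingState.IDLE: {MeetingState.RECORDING, MeetingState.PROMPTED},
-    MeetingState.PROMPTED: {MeetingState.RECORDING, MeetingState.IDLE},
-    MeetingState.RECORDING: {MeetingState.STOPPING, MeetingState.ERROR},
-    MeetingState.STOPPING: {MeetingState.REVIEW_READY},
-    MeetingState.REVIEW_READY: {MeetingState.REVIEW_READY, MeetingState.IDLE},
-}
-
-
-def transition(current: MeetingState, target: MeetingState) -> MeetingState:
-    """Return the new state, or raise if the move is not in the diagram."""
-    if target not in _TRANSITIONS.get(current, set()):
-        raise ValueError(f"illegal transition: {current.name} -> {target.name}")
-    return target
-```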
- ---- - -### 3.2 Threading + Queue Model (Client-Server) - -**Server Threads:** - -* **gRPC thread pool:** handles incoming RPC requests -* **ASR worker thread:** processes audio buffers through faster-whisper -* **IO worker thread:** *only* place that writes DB + manifest updates -* **Background jobs:** summarization, diarization, retention - -**Client Threads:** - -* **Main/UI thread:** Flet rendering + user actions -* **Audio callback thread:** receives frames, does *minimal work*: - * compute lightweight RMS for VU meter - * enqueue frames to gRPC stream queue -* **gRPC stream thread:** sends audio chunks, receives transcript updates -* **Event dispatch:** updates UI from transcript callbacks - -**Rules:** -* Anything blocking > 5ms does not run in the audio callback -* Only the server's IO worker writes to the database - ---- - -## 4) Dependency Injection and Service Wiring - -Use a small container (manual DI) rather than a framework. - -```python -# core/types.py -from typing import Protocol - -class Clock(Protocol): - def monotonic(self) -> float: ... - def now(self): ... - -class Notifier(Protocol): - def prompt_recording(self, title: str, body: str) -> None: ... - def toast(self, title: str, body: str) -> None: ... - -class ForegroundAppProvider(Protocol): - def current_app(self) -> str | None: ... - -class KeyStore(Protocol): - def get_or_create_master_key(self) -> bytes: ... -``` - -```python -# app.py (wiring idea) -def build_container() -> AppContainer: - settings = load_settings() - logger = configure_logging(settings) - keystore = build_keystore() - crypt = CryptoBox(keystore) - db = LanceDatabase(settings.paths.db_dir) - repos = Repositories(db) - jobs = JobQueue(...) - audio = AudioCapture(...) - asr = AsrEngine(...) - meeting = MeetingService(...) - triggers = TriggerService(...) - ui = UiController(...) - return AppContainer(...) -``` - ---- - -## 5) Detailed Subsystem Plans - -## 5.1 Audio Capture + Assets - -### AudioCapture - -Responsibilities: - -* open/close stream -* handle device change / reconnect -* feed ring buffer -* expose current level for VU meter - -Key APIs: - -```python -class AudioCapture: - def start(self, on_frames: Callable[[np.ndarray, float], None]) -> None: ... - def stop(self) -> None: ... - def current_device(self) -> AudioDeviceInfo: ... -``` - -### RingBuffer (timestamped) - -* store `(timestamp, frames)` so segment times are stable even if UI thread lags -* provide β€œlast N seconds” view for ASR worker - -### VAD - -Define an interface so you can swap implementations (webrtcvad vs silero) without rewriting pipeline. - -```python -class Vad: - def is_speech(self, pcm16: bytes, sample_rate: int) -> bool: ... -``` - -### Encrypted Audio Container (streaming) - -**Implementation approach (V1-safe):** encrypted chunk format (AES-GCM) storing PCM16 frames. -Optional: later add β€œcompress after meeting” job (Opus) once stable. - -**Writer contract:** - -* write header once -* write chunks frequently (every ~200–500ms) -* flush frequently (crash-safe) - -**Deletion contract:** - -* delete per-meeting DEK record first (crypto delete) -* delete meeting folder - ---- - -## 5.2 ASR and Segment Finalization - -### ASR Engine Wrapper (faster-whisper) - -Responsibilities: - -* model download/cache -* run inference -* return tokens/segments with timestamps (word timestamps optional) - -```python -class AsrEngine: - def transcribe(self, audio_f32_16k: np.ndarray) -> AsrResult: ... 
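-
-
-# Sketch of the result DTO referenced above (field names are illustrative,
-# not the shipped API; word timings stay optional, matching the
-# "ASR outputs (words optional)" note in the layout):
-from dataclasses import dataclass
-
-
-@dataclass(frozen=True)
-class AsrWord:
-    text: str
-    start: float  # seconds, relative to the transcribed buffer
-    end: float
-
-
-@dataclass(frozen=True)
-class AsrResult:
-    text: str
-    start: float
-    end: float
-    words: list[AsrWord] | None = None  # None when word timestamps are disabled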
-``` - -### Segmenter (partial/final) - -Responsibilities: - -* build current β€œactive utterance” from VAD-speech frames -* run partial inference every N seconds -* finalize when silence boundary detected - -**Data contract:** - -* PartialUpdate: `{text, start_offset, end_offset, stable=False}` -* FinalSegment: `{segment_id, text, start_offset, end_offset, stable=True}` - -**Important:** final segments get their IDs at commit time (IO worker), not earlier. - ---- - -## 5.3 Persistence (LanceDB + repositories) - -### DB access policy - -* One DB connection managed centrally -* IO worker serializes all writes - -Repositories: - -* `MeetingsRepo`: create/update meeting status, store DEK metadata reference -* `SegmentsRepo`: append segments, query by meeting, basic search -* `AnnotationsRepo`: add/list annotations -* `SummariesRepo`: store summary + verification report - -Also store: - -* schema version -* app version -* migration logic (even if minimal) - ---- - -## 5.4 MeetingService (Orchestration) - -Responsibilities: - -* create meeting directory + metadata -* start/stop audio capture -* start/stop ASR segmenter -* handle UI events (annotation hotkeys, stop, etc.) -* coordinate with TriggerService -* ensure crash-safe flush and marking incomplete - -Key public API: - -```python -class MeetingService: - def start(self, source: TriggerSource) -> MeetingID: ... - def stop(self) -> None: ... - def add_annotation(self, type: AnnotationType, text: str | None = None) -> None: ... - def current_meeting_id(self) -> MeetingID | None: ... -``` - ---- - -## 5.5 TriggerService (Confidence Model + throttling) - -Inputs (each independently optional): - -* calendar (optional connector) -* foreground app provider -* audio activity provider - -Outputs: - -* prompt notification -* optional auto-start (if user enabled) -* snooze & suppression state - -Policies: - -* **rate limit prompts** (e.g., max 1 prompt / 10 min) -* **cooldown after dismiss** -* **per-app suppression** config - -Implementation detail: - -* TriggerService publishes events via signals: - - * `trigger_prompted` - * `trigger_snoozed` - * `trigger_accepted` - ---- - -## 5.6 Summarization Service (Extract β†’ Synthesize β†’ Verify) - -Provider interface: - -```python -class SummarizerProvider(Protocol): - def extract(self, transcript: str) -> ExtractionResult: ... - def synthesize(self, extraction: ExtractionResult) -> DraftSummary: ... -``` - -Verifier: - -* parse bullets -* ensure each displayed bullet contains `[...]` with at least one Segment ID -* uncited bullets go into `uncited_points` and are hidden by default - -UI behavior: - -* Summary panel shows β€œX uncited drafts hidden” toggle -* Clicking bullet scrolls transcript and seeks audio - -**Testing requirement:** - -* Summary verifier must be unit-tested with adversarial outputs (missing brackets, invalid IDs, empty citations). - ---- - -## 5.7 UI Implementation Approach (Flet) - -### State management - -Treat UI as a thin layer over a single state store: - -* `AppState` - - * current meeting status - * live transcript partial - * list of finalized segments - * playback state - * summary state - * settings state - * prompt/snooze state - -Changes flow: - -* Services emit signals (blinker) -* UI controller converts signal payload β†’ state update β†’ re-render - -This avoids UI code reaching into services and creating race conditions. 
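-
-To make the flow concrete, a minimal sketch of the dispatch path (blinker is the signal library already chosen; the `AppState` fields and `render` callback are illustrative, not the shipped API):
-
-```python
-from collections.abc import Callable
-from dataclasses import dataclass, field
-
-from blinker import signal
-
-transcript_received = signal("transcript_received")
-
-
-@dataclass
-class AppState:
-    segments: list[str] = field(default_factory=list)
-    partial_text: str = ""
-
-
-class UiController:
-    """Convert service signals into state updates, then re-render."""
-
-    def __init__(self, state: AppState, render: Callable[[AppState], None]) -> None:
-        self._state = state
-        self._render = render
-        transcript_received.connect(self._on_transcript)
-
-    def _on_transcript(self, sender: object, *, text: str, is_final: bool) -> None:
-        if is_final:
-            self._state.segments.append(text)
-            self._state.partial_text = ""
-        else:
-            self._state.partial_text = text
-        self._render(self._state)  # UI redraws from state, never from services
-
-
-# Services emit; only the controller touches UI state:
-# transcript_received.send(asr_worker, text="hello", is_final=False)
-```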
- ---- - -## 6) Testing Plan (Practical and CI-friendly) - -### Unit tests (fast) - -* Trigger scoring + thresholds -* Summarization verifier -* Segment model validation (`end >= start`) -* Retention policy logic -* Encryption chunk read/write roundtrip - -### Integration tests - -* DB CRUD roundtrip for each repo -* Meeting create β†’ segments append β†’ summary store -* Delete meeting removes all rows and assets - -### E2E tests (required) - -**Audio injection harness** - -* Feed prerecorded WAV into AudioCapture abstraction (mock capture) -* Run through VAD + ASR pipeline -* Assert: - - * segments are produced - * partial updates happen - * final segments persist - * seeking works (timestamp consistency) - -**Note:** CI should never require a live microphone. - ---- - -## 7) Release Checklist (V1) - -* [ ] Recording indicator always visible when capturing -* [ ] Permission errors show actionable instructions -* [ ] Crash recovery works for incomplete meetings -* [ ] Summary bullets displayed are always cited -* [ ] Delete meeting removes keys + assets + DB rows -* [ ] Telemetry default off; no content ever logged -* [ ] Build artifacts install/run on macOS + Windows - ---- - -## 8) "First Implementation Targets" (what to build first) - -Build server-side first, then client, to ensure reliable foundation: - -**Server (build first):** -1. **gRPC service skeleton** - proto definitions + basic server startup -2. **Meeting store** - in-memory meeting lifecycle management -3. **ASR integration** - faster-whisper wrapper with streaming output -4. **Bidirectional streaming** - audio in, transcripts out -5. **Persistence** - LanceDB storage for meetings/segments -6. **Summarization** - evidence-linked summary generation - -**Client (build second):** -7. **gRPC client wrapper** - connection management + streaming -8. **Audio capture** - sounddevice integration + VU meter -9. **Live UI** - Flet app with transcript display -10. **Tray + hotkeys** - pystray/pynput integration -11. **Review view** - playback synced to transcript -12. **Packaging** - PyInstaller for both server and client - -This ordering ensures the server is stable before building client features on top. 
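-
-One helper worth building alongside these targets is the audio-injection harness from the testing plan: a stand-in capture that replays a WAV file through the same callback interface as `AudioCapture`, so CI never needs a microphone. A minimal sketch (assumes mono PCM16 input; the class name is illustrative):
-
-```python
-import wave
-from collections.abc import Callable
-
-import numpy as np
-
-
-class FileAudioCapture:
-    """Drop-in stand-in for AudioCapture that replays a WAV file."""
-
-    def __init__(self, wav_path: str, chunk_seconds: float = 0.2) -> None:
-        self._wav_path = wav_path
-        self._chunk_seconds = chunk_seconds
-
-    def start(self, on_frames: Callable[[np.ndarray, float], None]) -> None:
-        with wave.open(self._wav_path, "rb") as wav:
-            rate = wav.getframerate()
-            frames_per_chunk = int(rate * self._chunk_seconds)
-            timestamp = 0.0
-            while True:
-                raw = wav.readframes(frames_per_chunk)
-                if not raw:
-                    break
-                pcm = np.frombuffer(raw, dtype=np.int16)
-                on_frames(pcm.astype(np.float32) / 32768.0, timestamp)
-                timestamp += len(pcm) / rate
-
-    def stop(self) -> None:
-        """Nothing to flush; file replay is synchronous."""
-```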
- ---- - -## 9) Minimal API Skeletons (so devs can start coding) - -### gRPC Service Definition (proto) - -```protobuf -service NoteFlowService { - // Bidirectional streaming: audio β†’ transcripts - rpc StreamTranscription(stream AudioChunk) returns (stream TranscriptUpdate); - - // Meeting lifecycle - rpc CreateMeeting(CreateMeetingRequest) returns (Meeting); - rpc StopMeeting(StopMeetingRequest) returns (Meeting); - rpc ListMeetings(ListMeetingsRequest) returns (ListMeetingsResponse); - rpc GetMeeting(GetMeetingRequest) returns (Meeting); - - // Summary generation - rpc GenerateSummary(GenerateSummaryRequest) returns (Summary); - - // Server health - rpc GetServerInfo(ServerInfoRequest) returns (ServerInfo); -} -``` - -### Client Callback Types - -```python -# Client receives these from server via gRPC stream -@dataclass -class TranscriptSegment: - segment_id: int - text: str - start_time: float - end_time: float - language: str - is_final: bool - -# Callback signatures -TranscriptCallback = Callable[[TranscriptSegment], None] -ConnectionCallback = Callable[[bool, str], None] # connected, message -``` - -### Client-Side Signals (UI updates) - -```python -# client/signals.py - for UI thread dispatch -from blinker import signal - -audio_level_updated = signal("audio_level_updated") # rms: float -transcript_received = signal("transcript_received") # TranscriptSegment -connection_changed = signal("connection_changed") # connected: bool, message: str -``` - -And a β€œjob queue” minimal contract: - -```python -class JobQueue: - def submit(self, job: "Job") -> None: ... - def cancel(self, job_id: str) -> None: ... - -class Job(Protocol): - id: str - def run(self) -> None: ... -``` - ---- - -## 10) Current Implementation Status - -### Summary by Milestone - -| Milestone | Status | Completeness | -|-----------|--------|--------------| -| M0 Spikes | βœ… Complete | 100% | -| M1 Repo Foundation | βœ… Complete | 100% | -| M2 Meeting Lifecycle | βœ… Complete | 100% | -| M3 Transcription | βœ… Complete | 100% | -| M4 Review UX | βœ… Complete | 100% | -| M5 Triggers | ⚠️ Partial | 70% (integrated via mixin, tray/hotkeys not) | -| M6 Summarization | βœ… Complete | 100% | -| M7 Packaging | ⚠️ Partial | 40% (retention done, packaging not) | -| M8 Diarization | ⚠️ Partial | 55% (infrastructure done, wiring not) | - -### Layer-by-Layer Status - -**Domain Layer** βœ… 100% -- [x] Meeting entity with state machine -- [x] Segment entity with word-level timing -- [x] Annotation entity (4 types) -- [x] Summary entity with evidence links (KeyPoint, ActionItem) -- [x] Repository ports (Protocol-based DI) -- [x] Unit of Work port -- [x] Trigger domain (TriggerSignal, TriggerDecision) -- [x] Summarization ports - -**Application Layer** βœ… 100% -- [x] `MeetingService` - full CRUD + lifecycle -- [x] `SummarizationService` - multi-provider, consent, verification -- [x] `TriggerService` - scoring, rate limiting, snooze -- [x] `RetentionService` - cleanup, dry-run -- [x] `ExportService` - Markdown, HTML -- [x] `RecoveryService` - crash recovery - -**Infrastructure Layer** βœ… 98% -- [x] Audio: capture, ring buffer, levels, playback, encrypted writer/reader -- [x] ASR: faster-whisper engine, VAD, segmenter -- [x] Persistence: SQLAlchemy + pgvector, Alembic migrations -- [x] Security: AES-256-GCM, keyring keystore -- [x] Summarization: Mock, Ollama, Cloud providers + citation verifier -- [x] Export: Markdown, HTML formatters -- [x] Triggers: signal providers wired via TriggerMixin -- [x] Diarization: engine, assigner, 
DTOs (not wired to server) - -**gRPC Layer** βœ… 100% -- [x] Proto definitions with bidirectional streaming -- [x] Server: StreamTranscription, CreateMeeting, StopMeeting, etc. -- [x] Client wrapper with connection management -- [x] Meeting store (in-memory + DB modes) -- [x] GenerateSummary RPC wired to SummarizationService -- [x] Partial transcript streaming (2-second cadence, deduplication) - -**Client Layer** βœ… 85% -- [x] Flet app with state management -- [x] VU meter, recording timer, transcript -- [x] Playback controls + sync controller -- [x] Annotation toolbar + display -- [x] Meeting library -- [x] Summary panel with clickable citations -- [x] Connection panel with auto-reconnect -- [x] Trigger detection via TriggerMixin (AlertDialog prompts) -- [ ] System tray integration (spike validated, not integrated) -- [ ] Global hotkeys (spike validated, not integrated) - ---- - -## 11) Remaining Work Summary - -### Medium Priority (Platform Features) - -| # | Task | Files | Complexity | Blocker For | -|---|------|-------|------------|-------------| -| 1 | **System Tray Integration** | New: `src/noteflow/client/tray.py` | Medium | M5 completion | -| | Integrate pystray for minimize-to-tray, system notifications, recording indicator | | | | -| 2 | **Global Hotkeys** | New: `src/noteflow/client/hotkeys.py` | Medium | M5 completion | -| | Integrate pynput for start/stop/annotation hotkeys | | | | - -### Medium Priority (Diarization Wiring) - -| # | Task | Files | Complexity | Blocker For | -|---|------|-------|------------|-------------| -| 3 | **Diarization Application Service** | New: `application/services/diarization_service.py` | Medium | M8 completion | -| | Orchestrate diarization workflow, model management | | | | -| 4 | **Diarization Server Wiring** | `src/noteflow/grpc/server.py` | Low | M8 completion | -| | Initialize DiarizationEngine on startup when enabled | | | | -| 5 | **Diarization Tests** | New: `tests/infrastructure/diarization/` | Medium | M8 stability | -| | Unit tests for engine, assigner, DTOs | | | | - -### Lower Priority (Shipping) - -| # | Task | Files | Complexity | Blocker For | -|---|------|-------|------------|-------------| -| 6 | **PyInstaller Packaging** | New: build scripts | High | M7 release | -| | Create distributable for macOS + Windows | | | | -| 7 | **Code Signing** | Build config | Medium | M7 release | -| | macOS notarization, Windows signing | | | | -| 8 | **Update Check Flow** | New: `src/noteflow/client/update.py` | Low | M7 release | -| | Version display + "Check for Updates" link | | | | -| 9 | **Telemetry (Opt-in)** | New: telemetry module | Medium | M7 release | -| | Content-free metrics collection | | | | - -### Recommended Implementation Order - -1. **System Tray + Hotkeys** (Can be done in parallel, completes M5) -2. **Diarization Wiring** (Server init + tests, completes M8 core) -3. **PyInstaller Packaging** (Enables distribution) -4. 
**Remaining M7 items** (Polish for release) - ---- - -## 12) Architecture Reference - -### Key File Locations - -| Component | Location | -|-----------|----------| -| **Domain Entities** | `src/noteflow/domain/entities/` | -| **Repository Ports** | `src/noteflow/domain/ports/repositories.py` | -| **Application Services** | `src/noteflow/application/services/` | -| **gRPC Server** | `src/noteflow/grpc/server.py`, `service.py` | -| **gRPC Client** | `src/noteflow/grpc/client.py` | -| **Audio Capture** | `src/noteflow/infrastructure/audio/` | -| **ASR Engine** | `src/noteflow/infrastructure/asr/` | -| **Persistence** | `src/noteflow/infrastructure/persistence/` | -| **Security** | `src/noteflow/infrastructure/security/` | -| **Summarization** | `src/noteflow/infrastructure/summarization/` | -| **Client App** | `src/noteflow/client/app.py` | -| **UI Components** | `src/noteflow/client/components/` | - -### Data Flow - -``` -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ CLIENT β”‚ -β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ -β”‚ Audio Capture β†’ VU Meter β†’ gRPC Stream β†’ UI Components β”‚ -β”‚ ↓ ↑ β”‚ -β”‚ sounddevice Transcript Updates β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ gRPC - β–Ό -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ SERVER β”‚ -β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ -β”‚ Audio Buffer β†’ VAD β†’ Segmenter β†’ ASR Engine β”‚ -β”‚ ↓ ↓ β”‚ -β”‚ Encrypted Writer Final Segments β”‚ -β”‚ ↓ ↓ β”‚ -β”‚ audio.enc PostgreSQL + pgvector β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ -``` - -### Meeting Lifecycle States - -``` -CREATED β†’ RECORDING β†’ STOPPING β†’ STOPPED β†’ COMPLETED - ↓ ↓ ↓ - ERROR β†β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ (crash recovery) -``` diff --git a/docs/qa-report-2024-12-24.md b/docs/qa-report-2024-12-24.md deleted file mode 100644 index 2797aba..0000000 --- a/docs/qa-report-2024-12-24.md +++ /dev/null @@ -1,466 +0,0 @@ -# Code Quality Analysis Report -**Date:** 2024-12-24 -**Sprint:** Comprehensive Backend QA Scan -**Scope:** `/home/trav/repos/noteflow/src/noteflow/` - ---- - -## Executive Summary - -**Status:** PASS βœ… - -The NoteFlow Python backend demonstrates excellent code quality with: -- **0 type checking errors** (basedpyright clean) -- **0 remaining lint violations** (all Ruff issues auto-fixed) -- **0 security issues** detected -- **3 complexity violations** requiring architectural improvements - -### Quality Metrics - -| Category | Status | Details | -|----------|--------|---------| -| Type Safety | βœ… PASS | 0 errors (basedpyright strict mode) | -| Code Linting | βœ… PASS | 1 fix applied, 0 remaining | -| Formatting | ⚠️ SKIP | Black 
not installed in venv | -| Security | βœ… PASS | 0 vulnerabilities (Bandit rules) | -| Complexity | ⚠️ WARN | 3 functions exceed threshold | -| Architecture | βœ… GOOD | Modular mixin pattern, clean separation | - ---- - -## 1. Type Safety Analysis (basedpyright) - -### Result: PASS βœ… - -**Command:** `basedpyright --pythonversion 3.12 src/noteflow/` -**Outcome:** `0 errors, 0 warnings, 0 notes` - -#### Configuration Strengths -- `typeCheckingMode = "standard"` -- Python 3.12 target with modern type syntax -- Appropriate exclusions for generated proto files -- SQLAlchemy-specific overrides for known false positives - -#### Notes -The mypy output showed numerous errors, but these are **false positives** due to: -1. Missing type stubs for third-party libraries (`grpc`, `pgvector`, `diart`, `sounddevice`) -2. Generated protobuf files (excluded from analysis scope) -3. SQLAlchemy's dynamic attribute system (correctly configured in basedpyright) - -**Recommendation:** Basedpyright is the authoritative type checker for this project. The mypy configuration should be removed or aligned with basedpyright's exclusions. - ---- - -## 2. Linting Analysis (Ruff) - -### Result: PASS βœ… (1 fix applied) - -**Command:** `ruff check --fix src/noteflow/` - -#### Fixed Issues - -| File | Code | Issue | Fix Applied | -|------|------|-------|-------------| -| `grpc/_config.py:95` | UP037 | Quoted type annotation | Removed unnecessary quotes from `GrpcServerConfig` | - -#### Configuration Issues - -**Deprecated settings detected:** -```toml -# Current (deprecated) -[tool.ruff] -select = [...] -ignore = [...] -per-file-ignores = {...} - -# Required migration -[tool.ruff.lint] -select = [...] -ignore = [...] -per-file-ignores = {...} -``` - -**Action Required:** Update `pyproject.toml` to use `[tool.ruff.lint]` section. - -#### Selected Rules (Good Coverage) -- E/W: pycodestyle errors/warnings -- F: Pyflakes -- I: isort (import sorting) -- B: flake8-bugbear (bug detection) -- C4: flake8-comprehensions -- UP: pyupgrade (modern syntax) -- SIM: flake8-simplify -- RUF: Ruff-specific rules - ---- - -## 3. 
Complexity Analysis - -### Result: WARN ⚠️ (3 violations) - -**Command:** `ruff check --select C901 src/noteflow/` - -| File | Function | Complexity | Threshold | Severity | -|------|----------|------------|-----------|----------| -| `grpc/_mixins/diarization.py:102` | `_process_streaming_diarization` | 11 | ≀10 | 🟑 LOW | -| `grpc/_mixins/streaming.py:55` | `StreamTranscription` | 14 | ≀10 | 🟠 MEDIUM | -| `grpc/server.py:159` | `run_server_with_config` | 16 | ≀10 | πŸ”΄ HIGH | - ---- - -### 3.1 HIGH Priority: `run_server_with_config` (CC=16) - -**Location:** `src/noteflow/grpc/server.py:159-254` - -**Issues:** -- 96 lines with multiple initialization phases -- Deeply nested conditionals for database/diarization/consent logic -- Mixes infrastructure setup with business logic - -**Suggested Refactoring:** - -```python -# Extract helper functions to reduce complexity - -async def _initialize_database( - config: GrpcServerConfig -) -> tuple[AsyncSessionFactory | None, RecoveryResult | None]: - """Initialize database connection and run recovery.""" - if not config.database_url: - return None, None - - session_factory = create_async_session_factory(config.database_url) - await ensure_schema_ready(session_factory, config.database_url) - - recovery_service = RecoveryService( - SqlAlchemyUnitOfWork(session_factory), - meetings_dir=get_settings().meetings_dir, - ) - recovery_result = await recovery_service.recover_all() - return session_factory, recovery_result - -async def _initialize_consent_persistence( - session_factory: AsyncSessionFactory, - summarization_service: SummarizationService, -) -> None: - """Load cloud consent from DB and set up persistence callback.""" - async with SqlAlchemyUnitOfWork(session_factory) as uow: - cloud_consent = await uow.preferences.get_bool("cloud_consent_granted", False) - summarization_service.settings.cloud_consent_granted = cloud_consent - - async def persist_consent(granted: bool) -> None: - async with SqlAlchemyUnitOfWork(session_factory) as uow: - await uow.preferences.set("cloud_consent_granted", granted) - await uow.commit() - - summarization_service.on_consent_change = persist_consent - -def _initialize_diarization( - config: GrpcServerConfig -) -> DiarizationEngine | None: - """Create diarization engine if enabled and configured.""" - diarization = config.diarization - if not diarization.enabled: - return None - - if not diarization.hf_token: - logger.warning("Diarization enabled but no HF token provided") - return None - - diarization_kwargs = { - "device": diarization.device, - "hf_token": diarization.hf_token, - } - if diarization.streaming_latency is not None: - diarization_kwargs["streaming_latency"] = diarization.streaming_latency - if diarization.min_speakers is not None: - diarization_kwargs["min_speakers"] = diarization.min_speakers - if diarization.max_speakers is not None: - diarization_kwargs["max_speakers"] = diarization.max_speakers - - return DiarizationEngine(**diarization_kwargs) - -async def run_server_with_config(config: GrpcServerConfig) -> None: - """Run the async gRPC server with structured configuration.""" - # Initialize database and recovery - session_factory, recovery_result = await _initialize_database(config) - if recovery_result: - _log_recovery_results(recovery_result) - - # Initialize summarization - summarization_service = create_summarization_service() - if session_factory: - await _initialize_consent_persistence(session_factory, summarization_service) - - # Initialize diarization - diarization_engine = 
_initialize_diarization(config) - - # Create and start server - server = NoteFlowServer( - port=config.port, - asr_model=config.asr.model, - asr_device=config.asr.device, - asr_compute_type=config.asr.compute_type, - session_factory=session_factory, - summarization_service=summarization_service, - diarization_engine=diarization_engine, - diarization_refinement_enabled=config.diarization.refinement_enabled, - ) - await server.start() - await server.wait_for_termination() -``` - -**Expected Impact:** CC 16 β†’ ~6 (main function becomes orchestration only) - ---- - -### 3.2 MEDIUM Priority: `StreamTranscription` (CC=14) - -**Location:** `src/noteflow/grpc/_mixins/streaming.py:55-115` - -**Issues:** -- Multiple conditional checks for stream initialization -- Nested error handling with context managers -- Mixed concerns: stream lifecycle + chunk processing - -**Suggested Refactoring:** - -The codebase already has `_streaming_session.py` created. Recommendation: - -```python -# Use StreamingSession to encapsulate per-meeting state -async def StreamTranscription( - self: ServicerHost, - request_iterator: AsyncIterator[noteflow_pb2.AudioChunk], - context: grpc.aio.ServicerContext, -) -> AsyncIterator[noteflow_pb2.TranscriptUpdate]: - """Handle bidirectional audio streaming with persistence.""" - if self._asr_engine is None or not self._asr_engine.is_loaded: - await abort_failed_precondition(context, "ASR engine not loaded") - - session: StreamingSession | None = None - - try: - async for chunk in request_iterator: - # Initialize session on first chunk - if session is None: - session = await StreamingSession.create(chunk.meeting_id, self, context) - if session is None: - return - - # Check for stop request - if session.should_stop(): - logger.info("Stop requested, exiting stream gracefully") - break - - # Process chunk - async for update in session.process_chunk(chunk): - yield update - - # Flush remaining audio - if session: - async for update in session.flush(): - yield update - finally: - if session: - await session.cleanup() -``` - -**Expected Impact:** CC 14 β†’ ~8 (move complexity into StreamingSession methods) - ---- - -### 3.3 LOW Priority: `_process_streaming_diarization` (CC=11) - -**Location:** `src/noteflow/grpc/_mixins/diarization.py:102-174` - -**Issues:** -- Multiple early returns (guard clauses) -- Lock-based session management -- Error handling for streaming pipeline - -**Analysis:** -This function is already well-structured with clear separation: -1. Early validation checks (lines 114-119) -2. Session creation under lock (lines 124-145) -3. Chunk processing in thread pool (lines 148-164) -4. Turn persistence (lines 167-174) - -**Recommendation:** Accept CC=11 as reasonable for this complex concurrent operation. The early returns are defensive programming, not complexity. - ---- - -## 4. Security Analysis (Bandit/Ruff S Rules) - -### Result: PASS βœ… - -**Command:** `ruff check --select S src/noteflow/` -**Outcome:** 0 security issues detected - -**Scanned Patterns:** -- S101: Use of assert -- S102: Use of exec -- S103: Insecure file permissions -- S104-S113: Cryptographic issues -- S301-S324: SQL injection, pickle usage, etc. - -**Notable Security Strengths:** -1. **Encryption:** `infrastructure/security/crypto.py` uses AES-GCM (authenticated encryption) -2. **Key Management:** `infrastructure/security/keystore.py` uses system keyring -3. **Database:** SQLAlchemy ORM prevents SQL injection -4. **No hardcoded secrets:** Uses environment variables and keyring - ---- - -## 5. 
Architecture Quality - -### Result: EXCELLENT βœ… - -**Strengths:** - -#### 5.1 Hexagonal Architecture -``` -domain/ (pure business logic) - ↓ depends on -application/ (use cases) - ↓ depends on -infrastructure/ (adapters) -``` -Clean dependency direction with no circular imports. - -#### 5.2 Modular gRPC Mixins -``` -grpc/_mixins/ -β”œβ”€β”€ streaming.py # ASR streaming -β”œβ”€β”€ diarization.py # Speaker diarization -β”œβ”€β”€ summarization.py # Summary generation -β”œβ”€β”€ meeting.py # Meeting CRUD -β”œβ”€β”€ annotation.py # Annotations -β”œβ”€β”€ export.py # Document export -└── protocols.py # ServicerHost protocol -``` -Each mixin focuses on single responsibility, composed via `ServicerHost` protocol. - -#### 5.3 Repository Pattern with Unit of Work -```python -async with SqlAlchemyUnitOfWork(session_factory) as uow: - meeting = await uow.meetings.get(meeting_id) - await uow.segments.add(segment) - await uow.commit() # Atomic transaction -``` -Proper transaction boundaries and separation of concerns. - -#### 5.4 Protocol-Based Dependency Injection -```python -# domain/ports/ -class MeetingRepository(Protocol): - async def get(self, meeting_id: MeetingId) -> Meeting | None: ... - -# infrastructure/persistence/repositories/ -class SqlAlchemyMeetingRepository: - """Concrete implementation.""" -``` -Testable, swappable implementations (DB vs memory). - ---- - -## 6. File Size Analysis - -### Result: GOOD βœ… - -| File | Lines | Status | Notes | -|------|-------|--------|-------| -| `grpc/server.py` | 489 | βœ… Good | Under 500-line soft limit | -| `grpc/_mixins/streaming.py` | 579 | ⚠️ Review | Near 750-line hard limit | -| `grpc/_mixins/diarization.py` | 578 | ⚠️ Review | Near 750-line hard limit | - -**Recommendation:** Both large mixins are candidates for splitting into sub-modules once complexity is addressed. - ---- - -## 7. Missing Quality Tools - -### 7.1 Black Formatter -**Status:** Not installed in venv -**Impact:** Cannot verify formatting compliance -**Action Required:** -```bash -source .venv/bin/activate -uv pip install black -black --check src/noteflow/ -``` - -### 7.2 Pyrefly -**Status:** Not available -**Impact:** Missing semantic bug detection -**Action:** Optional enhancement (not critical) - ---- - -## Next Actions - -### Critical (Do Before Next Commit) -1. βœ… **Fixed:** Remove quoted type annotation in `_config.py` (auto-fixed by Ruff) -2. ⚠️ **Required:** Update `pyproject.toml` to use `[tool.ruff.lint]` section -3. ⚠️ **Required:** Install Black and verify formatting: `uv pip install black && black src/noteflow/` - -### High Priority (This Sprint) -4. **Extract helpers from `run_server_with_config`** to reduce CC from 16 β†’ ~6 - - Create `_initialize_database()`, `_initialize_consent_persistence()`, `_initialize_diarization()` - - Target: <10 complexity per function - -5. **Complete `StreamingSession` refactoring** to reduce `StreamTranscription` CC from 14 β†’ ~8 - - File already created: `grpc/_streaming_session.py` - - Move per-meeting state into session class - - Simplify main async generator - -### Medium Priority (Next Sprint) -6. **Split large mixin files** if they exceed 750 lines after complexity fixes - - `streaming.py` (579 lines) β†’ `streaming/` package - - `diarization.py` (578 lines) β†’ `diarization/` package - -7. **Add mypy exclusions** to align with basedpyright configuration - - Exclude proto files, third-party libraries without stubs - -### Low Priority (Backlog) -8. Consider adding `pyrefly` for additional semantic checks -9. 
Review duplication patterns from code-quality-correction-plan.md - ---- - -## Summary - -### Mechanical Fixes Applied βœ… -- **Ruff:** Removed quoted type annotation in `grpc/_config.py:95` - -### Configuration Issues ⚠️ -- **pyproject.toml:** Migrate to `[tool.ruff.lint]` section (deprecated warning) -- **Black:** Not installed in venv (cannot verify formatting) - -### Architectural Recommendations πŸ“‹ - -#### Complexity Violations (3 total) -| Priority | Function | Current CC | Target | Effort | -|----------|----------|------------|--------|--------| -| πŸ”΄ HIGH | `run_server_with_config` | 16 | ≀10 | 2-3 hours | -| 🟠 MEDIUM | `StreamTranscription` | 14 | ≀10 | 3-4 hours | -| 🟑 LOW | `_process_streaming_diarization` | 11 | Accept | N/A | - -**Total Estimated Effort:** 5-7 hours to address HIGH and MEDIUM priorities - -### Pass Criteria Met βœ… -- [x] Type safety (basedpyright): 0 errors -- [x] Linting (Ruff): 0 violations remaining -- [x] Security (Bandit): 0 vulnerabilities -- [x] Architecture: Clean hexagonal design -- [x] No critical issues blocking development - -### Status: PASS βœ… - -The NoteFlow backend demonstrates **excellent code quality** with well-architected patterns, strong type safety, and zero critical issues. The complexity violations are isolated to 3 functions and have clear refactoring paths. All mechanical fixes have been applied successfully. - ---- - -**QA Agent:** Code-Quality Agent -**Report Generated:** 2024-12-24 -**Next Review:** After complexity refactoring (estimated 1 week) diff --git a/docs/roadmap.md b/docs/roadmap.md index 17682aa..a2cd6cc 100644 --- a/docs/roadmap.md +++ b/docs/roadmap.md @@ -1,6 +1,6 @@ # NoteFlow Feature Gap Analysis & Development Roadmap -> Generated: 2025-12-23 +> Generated: 2025-12-23 | Updated: 2025-12-25 > Focus: Core pipeline completion (transcription β†’ summary β†’ diarization β†’ export) --- @@ -19,10 +19,23 @@ This document identifies features not yet developed or fully connected between t | **Export** | Partial | Markdown/HTML working, PDF missing | | **Integrations** | Stub | UI exists, backend handlers missing | +### Sprint Overview + +| Sprint | Name | Phase | Status | Prerequisites | +|--------|------|-------|--------|---------------| +| **0** | Proto & Schema Foundation | Foundation | New | β€” | +| 1 | AI Templates Pass-Through | Core Pipeline | Planned | Sprint 0 | +| 2 | Diarization Application Service | Core Pipeline | Planned | Sprint 0 | +| 3 | PDF Export | Core Pipeline | Planned | Sprint 0 | +| 4 | Named Entity Extraction | Intelligence | Planned | Sprint 0 | +| 5 | Calendar Sync | Integrations | Planned | Sprint 0 | +| 6 | Webhook Execution | Integrations | Planned | Sprint 0 | + ### Feature Gap Summary | Priority | Feature | Owner | Complexity | Status | |----------|---------|-------|------------|--------| +| 0 | Proto & Schema Foundation | Backend | Medium | **NEW** - Consolidates proto/DB changes | | 1 | AI Templates Pass-Through | Both | Low | Not connected | | 2 | Diarization Application Service | Backend | Medium | Engine exists, service missing | | 3 | PDF Export | Backend | Low-Medium | Not implemented | @@ -32,6 +45,32 @@ This document identifies features not yet developed or fully connected between t --- +## Sprint 0: Proto & Schema Foundation (NEW) + +> **Priority**: 0 | **Owner**: Backend | **Complexity**: Medium +> **Documentation**: `docs/sprints/phase-0-foundation/sprint-0-proto-schema/README.md` + +### Objective + +Consolidate all proto and database schema changes required by Sprints 1-6 
into a single, atomic foundation sprint. This prevents proto version conflicts and ensures all sprints start from a consistent base. + +### Key Components + +1. **Consolidated Proto Definitions**: All RPCs and messages for Sprints 1-6 +2. **Alembic Migrations**: `named_entities`, `webhooks`, `webhook_deliveries` tables +3. **Feature Flags**: Toggle experimental features (`ner_extraction_enabled`, `calendar_sync_enabled`) +4. **Docker Integration**: spaCy model downloads, database initialization +5. **Proto Regeneration Script**: Consistent stub generation + +### Critical Fixes Included + +- Resolves proto version conflicts across sprints +- Ensures database schema exists before feature sprints +- Provides feature flags for gradual rollout +- Documents proto changelog for sync points + +--- + ## Ownership Guidelines ### Backend (Python gRPC) Responsibilities @@ -70,7 +109,6 @@ This document identifies features not yet developed or fully connected between t **Priority**: 1 **Owner**: Both (proto change + frontend wiring) **Complexity**: Low -**Estimated Effort**: 1-2 days #### Current State @@ -222,7 +260,6 @@ async generateSummary( **Priority**: 2 **Owner**: Backend **Complexity**: Medium -**Estimated Effort**: 2-3 days #### Current State @@ -435,7 +472,6 @@ class DiarizationPort(Protocol): **Priority**: 3 **Owner**: Backend **Complexity**: Low-Medium -**Estimated Effort**: 1-2 days #### Current State @@ -645,7 +681,6 @@ Note: weasyprint requires system dependencies (cairo, pango). Document in README **Priority**: 4 **Owner**: Backend **Complexity**: Medium -**Estimated Effort**: 3-4 days #### Current State @@ -1012,7 +1047,6 @@ Post-install: `python -m spacy download en_core_web_sm` **Priority**: 5 **Owner**: Backend **Complexity**: Medium-High -**Estimated Effort**: 4-5 days #### Current State @@ -1317,7 +1351,6 @@ google-auth-oauthlib = "^1.1" **Priority**: 6 **Owner**: Backend **Complexity**: Medium -**Estimated Effort**: 2-3 days #### Current State @@ -1580,25 +1613,49 @@ After `GenerateSummary` completes successfully, call `webhook_service.trigger_su ## Implementation Order & Dependencies ``` -Phase 1 (Parallel where possible): -β”œβ”€β”€ Feature 1: AI Templates ─────────────────┐ -β”œβ”€β”€ Feature 3: PDF Export ─────────────────────→ Proto regeneration -└── Feature 2: Diarization Service β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +Sprint 0: Foundation (MUST complete first) +└── Proto & Schema Foundation ───────────────→ All proto + DB migrations + feature flags -Phase 2: -└── Feature 4: NER ──────────────────────────→ Requires proto changes +Phase 1: Core Pipeline (Parallel, after Sprint 0): +β”œβ”€β”€ Sprint 1: AI Templates ─────────────────┐ +β”œβ”€β”€ Sprint 3: PDF Export ─────────────────────→ Use proto from Sprint 0 +└── Sprint 2: Diarization Service β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ (DB persistence, application layer) -Phase 3 (Sequential): -β”œβ”€β”€ Feature 5: Calendar Sync ────────────────→ OAuth infrastructure -└── Feature 6: Webhooks ─────────────────────→ Can start after Phase 1 +Phase 2: Intelligence (after Phase 1): +└── Sprint 4: NER ──────────────────────────→ Uses NerService application layer + +Phase 3: Integrations (Sequential, after Phase 2): +β”œβ”€β”€ Sprint 5: Calendar Sync ────────────────→ Complete OAuth flow with PKCE +└── Sprint 6: Webhooks ─────────────────────→ HMAC signing, retry logic ``` ### Critical Path -1. **Proto changes** (Features 1, 3, 4, 5) must be done and regenerated together -2. **Diarization Service** blocks nothing, can proceed independently -3. 
**Calendar Sync** requires OAuth token storage infrastructure -4. **Webhooks** can be implemented at any time +1. **Sprint 0** is the **mandatory prerequisite** for all other sprints +2. **All proto/DB changes consolidated** in Sprint 0 - no more scattered migrations +3. **Feature flags** control feature availability before full rollout +4. **Application service layer** required for Sprints 2, 4, 5 (hexagonal architecture) +5. **Sprint 5 OAuth** now includes complete PKCE flow, token persistence, and refresh + +### Architectural Decisions (Updated) + +| Sprint | Key Improvement | +|--------|-----------------| +| Sprint 0 | Consolidated proto + feature flags + Docker model downloads | +| Sprint 2 | Database persistence via repository (not in-memory `_jobs` dict) | +| Sprint 4 | `NerService` application layer (gRPC β†’ Service β†’ Engine) | +| Sprint 5 | Complete OAuth with PKCE, token storage, and auto-refresh | + +### Quality Gates + +Each sprint must pass before merge: +```bash +pytest tests/quality/ # 23+ quality checks +ruff check src/noteflow # Linting +basedpyright # Type checking +``` + +See `docs/sprints/QUALITY_STANDARDS.md` for thresholds and reduction targets. --- diff --git a/docs/sprints/QUALITY_STANDARDS.md b/docs/sprints/QUALITY_STANDARDS.md new file mode 100644 index 0000000..e5c74e7 --- /dev/null +++ b/docs/sprints/QUALITY_STANDARDS.md @@ -0,0 +1,575 @@ +# NoteFlow Quality Standards Reference + +> All sprint implementations MUST comply with these standards. Run quality gates before PR. + +--- + +## Quick Reference: Quality Commands + +```bash +# Python Backend +pytest tests/quality/ # All quality checks (23+ rules) +ruff check src/noteflow # Linting +mypy src/noteflow # Type checking (strict) +basedpyright # Additional type checks + +# TypeScript/React Frontend +cd client +npm run test:quality # Frontend quality checks +npm run lint # ESLint + +# Rust/Tauri +cd client +npm run quality:rs # Rust quality script +cargo clippy # Rust linting + +# Full Suite +npm run quality:all # TS + Rust quality +``` + +--- + +## Python Standards (`src/noteflow/`) + +### Type Safety (STRICT) + +| Rule | Description | Enforcement | +|------|-------------|-------------| +| No `# type: ignore` | Forbidden without justification | mypy strict | +| No `Any` type | Use specific types always | basedpyright | +| Union syntax | Use `str \| None` over `Optional[str]` | ruff UP | +| Return annotations | All public functions must have returns | mypy | + +### Code Limits + +| Metric | Soft Limit | Hard Limit | Location | +|--------|------------|------------|----------| +| Module lines | 500 | 750 | `test_code_smells.py` | +| Function lines | 50 (tests), 75 (src) | β€” | `test_code_smells.py` | +| Function complexity | 15 | β€” | `test_code_smells.py` | +| Parameters | 7 | β€” | `test_code_smells.py` | +| Class methods | 20 | β€” | `test_code_smells.py` | +| Nesting depth | 5 | β€” | `test_code_smells.py` | + +### Test Requirements + +**Current thresholds** (to be reduced each sprint): + +| Rule | Max Allowed | Target | File | +|------|-------------|--------|------| +| Assertion roulette (>3 assertions without msg) | 25 | 0 | `test_test_smells.py` | +| Conditional test logic | 15 | 0 | `test_test_smells.py` | +| Empty tests | 0 | 0 | `test_test_smells.py` | +| Sleepy tests (time.sleep) | 3 | 0 | `test_test_smells.py` | +| Tests without assertions | 3 | 0 | `test_test_smells.py` | +| Redundant assertions | 0 | 0 | `test_test_smells.py` | +| Print statements in tests | 3 | 0 | `test_test_smells.py` | +| 
Skipped tests without reason | 0 | 0 | `test_test_smells.py` | +| Exception handling (try/except) | 3 | 0 | `test_test_smells.py` | +| Magic numbers in assertions | 25 | 10 | `test_test_smells.py` | +| Duplicate test names | 5 | 0 | `test_test_smells.py` | +| Long test methods (>50 lines) | 3 | 0 | `test_test_smells.py` | +| unittest-style assertions | 0 | 0 | `test_test_smells.py` | +| Fixtures without type hints | 5 | 0 | `test_test_smells.py` | +| Unused fixture parameters | 3 | 0 | `test_test_smells.py` | +| pytest.raises without match= | 20 | 0 | `test_test_smells.py` | +| Cross-file fixture duplicates | 0 | 0 | `test_test_smells.py` | + +**Reduction schedule**: +- After each sprint, reduce non-zero thresholds by 20% (rounded down) +- Goal: All thresholds at target values by Sprint 6 + +### Docstring Requirements + +- Write imperatively with proper punctuation +- All public functions, classes, modules documented +- Document complex business rules and edge cases + +--- + +## TypeScript/React Standards (`client/src/`) + +### Type Safety + +| Rule | Max Allowed | File | +|------|-------------|------| +| `any` type usage | 10 | `code-quality.test.ts` | +| Unsafe type assertions (`as any/unknown/never`) | 5 | `code-quality.test.ts` | +| TypeScript suppressions (@ts-ignore) | 3 | `code-quality.test.ts` | + +### Code Quality + +| Rule | Max Allowed | Description | +|------|-------------|-------------| +| Repeated string literals | 5 | Same string in multiple files | +| Complex JSX patterns | 10 | Repeated component structures | +| Scattered helper functions | 2 | format/parse/convert scattered | +| TODO/FIXME comments | 15 | Unaddressed tech debt | +| Commented-out code | 10 | Stale code blocks | +| Trivial wrapper components | 3 | Components that just spread props | +| Magic numbers (>3 digits) | 5 | Use named constants | +| Hardcoded colors in JSX | 3 | Use theme/CSS variables | +| Hardcoded API endpoints | 0 | Use config | +| Long files (>500 lines) | 3 | Split into modules | +| Complex inline styles | 5 | Use CSS/Tailwind | +| Deeply nested ternaries | 0 | Use if/switch | +| Excessive prop spreading | 2 | Consider context | + +### Naming Conventions + +- Components: PascalCase (`RecordingPanel`, not `recordingPanel`) +- Hooks: `use` prefix (`useAudioLevel`) +- Utils: camelCase (`formatDuration`) +- Constants: SCREAMING_SNAKE_CASE (`MAX_RETRIES`) + +--- + +## Rust/Tauri Standards (`client/src-tauri/src/`) + +### Code Quality Checks + +| Check | Threshold | Description | +|-------|-----------|-------------| +| Magic numbers | Warning | Numbers >100 not in const | +| Repeated strings | >3 occurrences | Extract to constants | +| TODO/FIXME comments | >10 | Address or remove | +| Long functions | >100 lines | Split into helpers | +| Deep nesting | >5 levels (20 spaces) | Flatten control flow | +| unwrap() calls | >20 | Use ? 
or expect() |
+| clone() per file | >10 | Review ownership |
+| Parameters | >5 | Use struct/builder |
+| Duplicate error messages | >2 | Use error enum |
+| File size | >500 lines | Split module |
+
+### Clippy Enforcement
+
+```bash
+cargo clippy -- -W unused_imports -W dead_code
+```
+
+Must pass with zero warnings for:
+- Unused imports
+- Dead code
+- Missing docs on public items
+
+---
+
+## Pre-Commit Checklist
+
+Before any PR:
+
+```markdown
+## Python
+- [ ] `pytest tests/quality/` passes
+- [ ] `ruff check --fix .` run
+- [ ] `mypy src/noteflow` clean
+- [ ] No `# type: ignore` without comment
+- [ ] Docstrings on all new public functions
+
+## TypeScript/React
+- [ ] `npm run test:quality` passes
+- [ ] `npm run lint` clean
+- [ ] No `any` types added
+- [ ] Components use PascalCase
+
+## Rust
+- [ ] `npm run quality:rs` passes
+- [ ] `cargo clippy` clean
+- [ ] No unwrap() in error paths
+- [ ] Error types documented
+```
+
+---
+
+## Architecture Patterns
+
+### Hexagonal Architecture (Python)
+
+```
+domain/         → Entities, value objects, ports (interfaces)
+application/    → Use cases, services, orchestration
+infrastructure/ → Implementations, adapters, external services
+grpc/           → Transport layer, proto definitions
+```
+
+### File Organization
+
+| Layer | Pattern | Example |
+|-------|---------|---------|
+| Domain | `entities/meeting.py`, `ports/repository.py` | Pure business logic |
+| Application | `services/meeting_service.py` | Orchestrates domain |
+| Infrastructure | `persistence/repositories/meeting_repo.py` | Implements ports |
+| gRPC | `_mixins/meeting.py` | Transport handlers |
+
+### Naming Conventions
+
+| Type | Convention | Example |
+|------|------------|---------|
+| Domain entity | Singular noun | `Meeting`, `Segment` |
+| Service | NounService | `MeetingService`, `SummarizationService` |
+| Repository | NounRepository | `MeetingRepository` |
+| Port | NounPort (Protocol) | `SummarizationPort` |
+| Mixin | NounMixin | `StreamingMixin` |
+| Factory | create_noun() | `create_summarization_service()` |
+
+---
+
+## Testable Code Patterns
+
+### Protocol-Based Dependency Injection
+
+All services MUST use **constructor injection** with **Protocol-based abstractions** for testability.
+
+**References**:
+- [ArjanCodes: Python DI Best Practices](https://arjancodes.com/blog/python-dependency-injection-best-practices/)
+- [Real Python: SOLID Principles](https://realpython.com/solid-principles-python/)
+
+### Key Principles
+
+| Principle | Description | Example |
+|-----------|-------------|---------|
+| Constructor injection | All dependencies passed via `__init__` | `Service(repo: RepositoryPort)` |
+| Protocol abstractions | Use `typing.Protocol` for interfaces | `class RepositoryPort(Protocol)` |
+| Factory functions | Create configured instances | `create_service() -> Service` |
+| No global state | Avoid singletons and module-level state | Use DI instead of `get_instance()` |
+
+### Pattern: Service with Protocol Dependencies
+
+```python
+from collections.abc import Callable
+from typing import Protocol
+
+# 1. Define port (interface) in domain layer
+class NerPort(Protocol):
+    """Port for NER operations."""
+
+    def extract(self, text: str) -> list[NamedEntity]:
+        """Extract named entities from text."""
+        ...
+
+
+# 2. Application service depends on protocol (not concrete impl)
+class NerService:
+    """Application service for NER operations."""
+
+    def __init__(
+        self,
+        ner_engine: NerPort,  # Protocol, not SpacyNerEngine
+        uow_factory: Callable[[], UnitOfWork],
+    ) -> None:
+        self._ner_engine = ner_engine
+        self._uow_factory = uow_factory
+
+    async def extract_entities(self, meeting_id: MeetingId) -> list[NamedEntity]:
+        """Extract entities from meeting transcript."""
+        async with self._uow_factory() as uow:
+            meeting = await uow.meetings.get(meeting_id)
+            return self._ner_engine.extract(meeting.transcript)
+
+
+# 3. Infrastructure implements the protocol
+class SpacyNerEngine:
+    """spaCy implementation of NerPort."""
+
+    def __init__(self, model_name: str = "en_core_web_sm") -> None:
+        self._nlp = spacy.load(model_name)
+
+    def extract(self, text: str) -> list[NamedEntity]:
+        """Extract entities using spaCy."""
+        doc = self._nlp(text)
+        return [NamedEntity.from_spacy(ent) for ent in doc.ents]
+
+
+# 4. Factory function wires dependencies
+def create_ner_service(
+    model_name: str = "en_core_web_sm",
+    uow_factory: Callable[[], UnitOfWork] | None = None,
+) -> NerService:
+    """Create NER service with dependencies."""
+    engine = SpacyNerEngine(model_name)
+    factory = uow_factory or SQLAlchemyUnitOfWork
+    return NerService(engine, factory)
+```
+
+### Testing with Mock Protocols
+
+```python
+@pytest.fixture
+def mock_ner_engine() -> MagicMock:
+    """Create mock NER engine implementing NerPort."""
+    engine = MagicMock(spec=NerPort)
+    engine.extract.return_value = [
+        NamedEntity.create("Test Person", EntityCategory.PERSON, [1], 0.9),
+    ]
+    return engine
+
+
+@pytest.fixture
+def ner_service(mock_ner_engine: MagicMock, mock_uow_factory: Callable) -> NerService:
+    """Create NER service with mock dependencies."""
+    return NerService(mock_ner_engine, mock_uow_factory)
+
+
+async def test_extract_entities_calls_engine(
+    ner_service: NerService,
+    mock_ner_engine: MagicMock,
+) -> None:
+    """Extraction delegates to NER engine."""
+    await ner_service.extract_entities(MeetingId(uuid4()))
+
+    mock_ner_engine.extract.assert_called_once()
+```
+
+### Anti-Patterns (AVOID)
+
+```python
+# ❌ WRONG: Direct instantiation in service
+class BadService:
+    def __init__(self) -> None:
+        self._engine = SpacyNerEngine()  # Untestable!
+
+# ❌ WRONG: Module-level singleton
+_engine = SpacyNerEngine()  # Global state!
+
+def get_engine() -> SpacyNerEngine:
+    return _engine
+
+# ❌ WRONG: Concrete type dependency
+class BadService:
+    def __init__(self, engine: SpacyNerEngine) -> None:  # Concrete, not Protocol!
+        self._engine = engine
+```
+
+### Modern Library Recommendations
+
+| Category | Library | Rationale |
+|----------|---------|-----------|
+| OAuth 2.0 | **Authlib** | Built-in PKCE, async support, handles edge cases |
+| HTTP Client | **httpx** | Modern async, compatible with Authlib |
+| NER | **spaCy** or **GLiNER** | spaCy for production, GLiNER for zero-shot |
+| Validation | **Pydantic** | Already used in project |
+| Testing | **pytest** | With `pytest.mark.parametrize` |
+
+---
+
+## Test Patterns
+
+### CRITICAL: No Conditionals in Tests
+
+**FORBIDDEN in test code:**
+- `if`/`else` statements with assertions
+- `for` loops with assertions
+- `while` loops
+- Conditional logic that determines test behavior
+
+**USE INSTEAD: `pytest.mark.parametrize`**
+
+```python
+# ❌ WRONG: Conditional test logic
+def test_entity_extraction(engine: NerEngine) -> None:
+    for text, expected in test_cases:  # FORBIDDEN
+        entities = engine.extract(text)
+        if expected:  # FORBIDDEN
+            assert entities
+
+# ✅ CORRECT: Parametrized tests
+@pytest.mark.parametrize(
+    ("text", "expected_category"),
+    [
+        pytest.param("John Smith", EntityCategory.PERSON, id="person"),
+        pytest.param("Google", EntityCategory.COMPANY, id="company"),
+        pytest.param("New York", EntityCategory.LOCATION, id="location"),
+    ],
+)
+def test_entity_extraction(
+    engine: NerEngine,
+    text: str,
+    expected_category: EntityCategory,
+) -> None:
+    """Extract entity of expected category."""
+    entities = engine.extract(text)
+    matching = [e for e in entities if e.category == expected_category]
+    assert matching, f"Expected {expected_category.value} in: {text}"
+```
+
+### Parametrization Best Practices
+
+```python
+# Use pytest.param with descriptive IDs
+@pytest.mark.parametrize(
+    ("input_value", "expected_output", "description"),
+    [
+        pytest.param("", [], id="empty-input"),
+        pytest.param("hello", ["hello"], id="single-word"),
+        pytest.param("a b c", ["a", "b", "c"], id="multiple-words"),
+    ],
+)
+def test_tokenize(input_value: str, expected_output: list[str], description: str) -> None:
+    """Tokenize input produces expected tokens."""
+    assert tokenize(input_value) == expected_output
+
+# Class-based organization for related tests
+class TestAuthorizationUrl:
+    """Test authorization URL generation."""
+
+    @pytest.mark.parametrize(
+        ("provider", "expected_host"),
+        [
+            pytest.param(OAuthProvider.GOOGLE, "accounts.google.com", id="google"),
+            pytest.param(OAuthProvider.MICROSOFT, "login.microsoftonline.com", id="microsoft"),
+        ],
+    )
+    def test_generates_valid_url(self, provider: OAuthProvider, expected_host: str) -> None:
+        """Generate URL for each provider."""
+        url = generate_auth_url(provider)
+        assert expected_host in url
+```
+
+### Fixture Scoping for Performance
+
+```python
+# Module-scoped for expensive operations (model loading, DB setup)
+@pytest.fixture(scope="module")
+def ner_engine() -> SpacyNerEngine:
+    """Load spaCy model once per test module."""
+    return SpacyNerEngine("en_core_web_sm")
+
+# Function-scoped for mutable state
+@pytest.fixture
+def mock_uow() -> AsyncMock:
+    """Fresh mock for each test."""
+    return AsyncMock(spec=UnitOfWork)
+```
+
+### Required Test Elements
+
+1. **Type hints** on fixtures and test functions
+2. **Docstring** explaining what's being tested
+3. **AAA pattern** (Arrange/Act/Assert) with comments
+4. **Specific assertions** with messages for complex checks
+5. **pytest.raises with match=** for exception tests
+6. **`pytest.param` with IDs** for parametrized tests
+7. **No conditionals or loops** around assertions
+
+---
+
+## Code Reuse Checklist
+
+Before creating new code, check:
+
+| Location | Contains |
+|----------|----------|
+| `domain/entities/` | Existing entity types |
+| `domain/ports/` | Existing port interfaces |
+| `infrastructure/converters/` | Entity ↔ ORM converters |
+| `grpc/_mixins/converters.py` | Proto ↔ Domain converters |
+| `infrastructure/*/protocols.py` | Infrastructure interfaces |
+| `application/services/` | Existing service patterns |
+
+### Shared Utilities
+
+| File | Functions |
+|------|-----------|
+| `infrastructure/export/_formatting.py` | `format_timestamp()`, `format_datetime()` |
+| `infrastructure/security/keystore.py` | `_generate_key()`, `_decode_and_validate_key()` |
+| `infrastructure/summarization/_parsing.py` | `build_transcript_prompt()`, `parse_llm_response()` |
+| `infrastructure/diarization/assigner.py` | `assign_speaker()`, `assign_speakers_batch()` |
+
+---
+
+## Documentation Requirements
+
+Every new feature must include:
+
+1. **Unit tests** covering core logic
+2. **Integration tests** for end-to-end flow (where applicable)
+3. **Docstrings** on all public APIs
+4. **CLAUDE.md updates** if architectural patterns change
+5. **Proto changes** documented in commit message
+
+---
+
+## Shared Test Fixtures
+
+### Available Fixtures (`tests/conftest.py`)
+
+**DO NOT redefine these fixtures in test files**. Use them from conftest.py.
+
+| Fixture | Type | Scope | Description |
+|---------|------|-------|-------------|
+| `crypto` | `CryptoService` | function | Encryption service for test data |
+| `meetings_dir` | `Path` | function | Temporary directory for meeting assets |
+| `mock_uow` | `AsyncMock` | function | Mock Unit of Work with all repositories |
+| `mock_uow_factory` | `type` | function | Factory that returns mock_uow |
+| `temp_db` | `Engine` | session | Temporary SQLite database |
+| `async_session` | `AsyncSession` | function | Async SQLAlchemy session |
+| `grpc_server` | `NoteFlowServicer` | function | Test gRPC server instance |
+| `grpc_client` | `NoteFlowClient` | function | Test gRPC client |
+| `sample_meeting` | `Meeting` | function | Pre-populated meeting entity |
+| `sample_segments` | `list[Segment]` | function | Sample transcript segments |
+| `mock_ner_engine` | `MagicMock` | function | Mock NER engine |
+| `mock_oauth_manager` | `MagicMock` | function | Mock OAuth manager |
+| `mock_calendar_settings` | `CalendarSettings` | function | Calendar settings with test OAuth creds |
+
+### Usage Pattern
+
+```python
+# CORRECT: Use shared fixtures
+def test_meeting_creation(mock_uow: AsyncMock, sample_meeting: Meeting) -> None:
+    """Create meeting uses repository correctly."""
+    mock_uow.meetings.save.return_value = None
+    # ... test logic
+
+# INCORRECT: Do not redefine fixtures
+@pytest.fixture
+def mock_uow():  # DON'T DO THIS - use conftest.py fixture
+    return AsyncMock()
+```
+
+### Cross-File Fixture Detection
+
+The `test_test_smells.py` quality check detects when fixtures are redefined:
+
+```python
+# Fails quality check - fixture "mock_uow" already in conftest.py
+@pytest.fixture
+def mock_uow():
+    ...
+```
+
+Move new shared fixtures to `tests/conftest.py` to avoid duplication.
+
+### Adding New Shared Fixtures
+
+When adding a fixture that could be reused:
+
+1. Check if a similar fixture exists in `tests/conftest.py`
+2. 
If not, add it to `tests/conftest.py` with: + - Type annotation on the return + - Docstring explaining the fixture + - Appropriate scope (`function`, `class`, `module`, `session`) + +```python +@pytest.fixture +def new_shared_fixture() -> SomeType: + """Provide X for Y tests. + + Returns: + Configured SomeType instance. + """ + return SomeType(...) +``` + +--- + +## Sprint-Specific Quality Requirements + +Each sprint must: + +1. **Not increase** any quality threshold violations +2. **Reduce** at least one threshold toward target +3. **Add fixtures** to conftest.py (not test files) +4. **Run quality suite** before PR: + ```bash + pytest tests/quality/ -v + ``` +5. **Document** any threshold exceptions in PR description diff --git a/docs/sprints/phase-0-foundation/sprint-0-proto-schema/README.md b/docs/sprints/phase-0-foundation/sprint-0-proto-schema/README.md new file mode 100644 index 0000000..a7683e0 --- /dev/null +++ b/docs/sprints/phase-0-foundation/sprint-0-proto-schema/README.md @@ -0,0 +1,989 @@ +# Sprint 0: Proto & Schema Foundation + +> **Priority**: 0 | **Owner**: Backend | **Complexity**: Medium | **Prerequisite for all other sprints** + +--- + +## Objective + +Consolidate all protobuf schema changes and database migrations required by Sprints 1-6 into a single coordinated release. This prevents proto conflicts, ensures backward compatibility, and establishes the persistence foundation for all features. + +--- + +## Rationale + +Multiple sprints modify shared infrastructure: + +| Sprint | Proto Changes | DB Changes | +|--------|---------------|------------| +| 1 (AI Templates) | `SummarizationOptions` message | None | +| 3 (PDF Export) | `EXPORT_FORMAT_PDF` enum | None | +| 4 (NER) | `ExtractEntities` RPC + messages | `named_entities` table | +| 5 (Calendar) | `ListCalendarEvents` RPC + messages | Uses existing tables | +| 6 (Webhooks) | None | `webhook_configs`, `webhook_deliveries` tables | + +Without coordination: +- Proto regeneration conflicts between parallel sprints +- Migration ordering issues +- Client/server version mismatches + +--- + +## Phased Implementation + +Sprint 0 is split into four sub-increments to enable independent verification and reduce blast radius: + +| Increment | Scope | Verification Gate | +|-----------|-------|-------------------| +| **0a** | Proto schema + stub regeneration | `python -c "from noteflow.grpc.proto import noteflow_pb2"` | +| **0b** | Database schema (schema.sql) | `psql -f docker/db/schema.sql` on fresh DB | +| **0c** | Alembic migrations | `alembic upgrade head && alembic downgrade -1` | +| **0d** | Dependencies + Docker + Feature flags | `pip install -e ".[all]" && pytest tests/` | + +### Increment 0a: Proto Schema + +**Files**: `noteflow.proto`, `*_pb2.py`, `*_pb2_grpc.py`, `*_pb2.pyi` + +**Tasks**: Task 1, Task 7, Task 9 + +**Done when**: +- [ ] Proto compiles without errors +- [ ] Python stubs import cleanly +- [ ] Rust/TS stubs generate via `client/build.rs` +- [ ] PROTO_CHANGELOG.md committed + +### Increment 0b: Database Schema + +**Files**: `docker/db/schema.sql` + +**Tasks**: Task 2 + +**Done when**: +- [ ] Schema applies to fresh PostgreSQL +- [ ] All tables have proper indexes +- [ ] Foreign key constraints validated +- [ ] Triggers for `updated_at` in place + +### Increment 0c: Alembic Migrations + +**Files**: `migrations/versions/001_*.py`, `migrations/versions/002_*.py` + +**Tasks**: Task 3 + +**Done when**: +- [ ] Migrations apply to existing database +- [ ] Downgrade path works for each migration +- [ ] Schema matches 
schema.sql output + +### Increment 0d: Dependencies and Docker + +**Files**: `pyproject.toml`, `Dockerfile`, `docker-compose.yml`, `settings.py`, `cli/models.py` + +**Tasks**: Task 4, Task 5, Task 6, Task 8 + +**Done when**: +- [ ] All optional dependencies install +- [ ] Feature flags control availability +- [ ] Model download CLI works +- [ ] Docker build completes with NER support + +--- + +## Target/Affected Code + +### Files to Modify + +| File | Change Type | +|------|-------------| +| `src/noteflow/grpc/proto/noteflow.proto` | All proto additions | +| `src/noteflow/grpc/proto/noteflow_pb2.py` | Regenerated | +| `src/noteflow/grpc/proto/noteflow_pb2_grpc.py` | Regenerated | +| `src/noteflow/grpc/proto/noteflow_pb2.pyi` | Regenerated | +| `docker/db/schema.sql` | All table additions | +| `pyproject.toml` | All new dependencies | +| `client/src-tauri/build.rs` | Proto path verification | + +### Files to Create + +| File | Purpose | +|------|---------| +| `src/noteflow/infrastructure/persistence/migrations/versions/001_add_named_entities.py` | NER tables | +| `src/noteflow/infrastructure/persistence/migrations/versions/002_add_webhooks.py` | Webhook tables | +| `docs/sprints/phase-0-foundation/PROTO_CHANGELOG.md` | Proto version history | + +--- + +## Implementation Tasks + +### Task 1: Proto Schema Consolidation + +**File**: `src/noteflow/grpc/proto/noteflow.proto` + +Add all new messages and RPCs in a single commit: + +```protobuf +// ============================================================================= +// Sprint 0: Consolidated Proto Changes +// Version: 2.0.0 +// Date: 2025-XX-XX +// ============================================================================= + +// ----------------------------------------------------------------------------- +// Sprint 1: AI Templates +// ----------------------------------------------------------------------------- + +// Summarization style options passed from frontend settings +message SummarizationOptions { + // Tone: professional, casual, technical, friendly + string tone = 1; + + // Format: bullet_points, narrative, structured, concise + string format = 2; + + // Verbosity: minimal, balanced, detailed, comprehensive + string verbosity = 3; +} + +// Modify existing GenerateSummaryRequest (add field 3) +// message GenerateSummaryRequest { +// string meeting_id = 1; +// bool force_regenerate = 2; +// SummarizationOptions options = 3; // NEW +// } + +// ----------------------------------------------------------------------------- +// Sprint 3: PDF Export +// ----------------------------------------------------------------------------- + +// Add to existing ExportFormat enum +// enum ExportFormat { +// EXPORT_FORMAT_UNSPECIFIED = 0; +// EXPORT_FORMAT_MARKDOWN = 1; +// EXPORT_FORMAT_HTML = 2; +// EXPORT_FORMAT_PDF = 3; // NEW +// } + +// ----------------------------------------------------------------------------- +// Sprint 4: Named Entity Extraction +// ----------------------------------------------------------------------------- + +// Add to service definition +// rpc ExtractEntities(ExtractEntitiesRequest) returns (ExtractEntitiesResponse); + +message ExtractEntitiesRequest { + string meeting_id = 1; + bool force_refresh = 2; // Re-extract even if entities exist +} + +message ExtractedEntity { + string id = 1; + string text = 2; + // Category: person, company, product, technical, acronym, location, date, other + string category = 3; + repeated int32 segment_ids = 4; + float confidence = 5; + bool is_pinned = 6; // User-confirmed +} + 
+message ExtractEntitiesResponse { + repeated ExtractedEntity entities = 1; + int32 total_count = 2; + bool cached = 3; // True if returning cached results +} + +// ----------------------------------------------------------------------------- +// Sprint 5: Calendar Sync +// ----------------------------------------------------------------------------- + +// Add to service definition +// rpc ListCalendarEvents(ListCalendarEventsRequest) returns (ListCalendarEventsResponse); +// rpc GetCalendarProviders(GetCalendarProvidersRequest) returns (GetCalendarProvidersResponse); +// rpc InitiateCalendarAuth(InitiateCalendarAuthRequest) returns (InitiateCalendarAuthResponse); +// rpc CompleteCalendarAuth(CompleteCalendarAuthRequest) returns (CompleteCalendarAuthResponse); + +message CalendarEvent { + string id = 1; + string title = 2; + int64 start_time = 3; // Unix timestamp (seconds) + int64 end_time = 4; // Unix timestamp (seconds) + repeated string attendees = 5; + string location = 6; + string description = 7; + string meeting_url = 8; + bool is_recurring = 9; + string provider = 10; // google, outlook +} + +message ListCalendarEventsRequest { + int32 hours_ahead = 1; // How far ahead to look (default: 24) + int32 limit = 2; // Max events to return (default: 10) + string provider = 3; // Optional: specific provider name +} + +message ListCalendarEventsResponse { + repeated CalendarEvent events = 1; + int32 total_count = 2; +} + +message GetCalendarProvidersRequest {} + +message CalendarProvider { + string name = 1; + bool is_authenticated = 2; + string display_name = 3; // "Google Calendar", "Microsoft Outlook" +} + +message GetCalendarProvidersResponse { + repeated CalendarProvider providers = 1; +} + +// OAuth flow messages +message InitiateCalendarAuthRequest { + string provider = 1; // google, outlook + string redirect_uri = 2; // Where to redirect after auth +} + +message InitiateCalendarAuthResponse { + string auth_url = 1; // URL to redirect user to + string state = 2; // CSRF token to verify callback +} + +message CompleteCalendarAuthRequest { + string provider = 1; + string code = 2; // Authorization code from OAuth callback + string state = 3; // CSRF token for verification +} + +message CompleteCalendarAuthResponse { + bool success = 1; + string error_message = 2; + string provider_email = 3; // Email of authenticated account +} +``` + +--- + +### Task 2: Database Schema Additions + +**File**: `docker/db/schema.sql` + +Add after existing tables (preserve insertion order for foreign keys): + +```sql +-------------------------------------------------------------------------------- +-- Sprint 4: Named Entities +-------------------------------------------------------------------------------- +CREATE TABLE IF NOT EXISTS noteflow.named_entities ( + id uuid PRIMARY KEY DEFAULT gen_random_uuid(), + meeting_id uuid NOT NULL REFERENCES noteflow.meetings(id) ON DELETE CASCADE, + text text NOT NULL, + normalized_text text NOT NULL, -- Lowercase, trimmed for deduplication + category varchar(50) NOT NULL, -- person, company, product, location, etc. 
+ segment_ids integer[] NOT NULL DEFAULT '{}'::integer[], + confidence double precision NOT NULL DEFAULT 0.0, + is_pinned boolean NOT NULL DEFAULT false, + created_at timestamptz NOT NULL DEFAULT now(), + updated_at timestamptz NOT NULL DEFAULT now(), + + -- Unique constraint for deduplication within a meeting + CONSTRAINT uq_named_entities_meeting_text UNIQUE (meeting_id, normalized_text) +); + +CREATE TRIGGER trg_named_entities_updated_at +BEFORE UPDATE ON noteflow.named_entities +FOR EACH ROW EXECUTE FUNCTION noteflow.set_updated_at(); + +CREATE INDEX IF NOT EXISTS idx_named_entities_meeting_id + ON noteflow.named_entities(meeting_id); + +CREATE INDEX IF NOT EXISTS idx_named_entities_category + ON noteflow.named_entities(category); + +-------------------------------------------------------------------------------- +-- Sprint 6: Webhooks +-------------------------------------------------------------------------------- +CREATE TABLE IF NOT EXISTS noteflow.webhook_configs ( + id uuid PRIMARY KEY DEFAULT gen_random_uuid(), + workspace_id uuid NOT NULL REFERENCES noteflow.workspaces(id) ON DELETE CASCADE, + name varchar(255) NOT NULL DEFAULT 'Webhook', + url text NOT NULL, + events text[] NOT NULL DEFAULT '{}'::text[], + secret text NULL, -- HMAC signing secret + enabled boolean NOT NULL DEFAULT true, + timeout_ms integer NOT NULL DEFAULT 10000, + max_retries integer NOT NULL DEFAULT 3, + created_at timestamptz NOT NULL DEFAULT now(), + updated_at timestamptz NOT NULL DEFAULT now(), + + -- Validate URL format + CONSTRAINT chk_webhook_url_format CHECK (url ~ '^https?://') +); + +CREATE TRIGGER trg_webhook_configs_updated_at +BEFORE UPDATE ON noteflow.webhook_configs +FOR EACH ROW EXECUTE FUNCTION noteflow.set_updated_at(); + +CREATE INDEX IF NOT EXISTS idx_webhook_configs_workspace_id + ON noteflow.webhook_configs(workspace_id); + +CREATE TABLE IF NOT EXISTS noteflow.webhook_deliveries ( + id uuid PRIMARY KEY DEFAULT gen_random_uuid(), + webhook_id uuid NOT NULL REFERENCES noteflow.webhook_configs(id) ON DELETE CASCADE, + event_type text NOT NULL, + payload jsonb NOT NULL DEFAULT '{}'::jsonb, + status_code integer NULL, + response_body text NULL, -- First 1KB of response for debugging + error_message text NULL, + attempt_count integer NOT NULL DEFAULT 1, + duration_ms integer NULL, -- Request duration for monitoring + delivered_at timestamptz NOT NULL DEFAULT now() +); + +CREATE INDEX IF NOT EXISTS idx_webhook_deliveries_webhook_id + ON noteflow.webhook_deliveries(webhook_id, delivered_at DESC); + +CREATE INDEX IF NOT EXISTS idx_webhook_deliveries_event_type + ON noteflow.webhook_deliveries(event_type, delivered_at DESC); + +-- Partition by month for large deployments (optional) +-- CREATE INDEX IF NOT EXISTS idx_webhook_deliveries_delivered_at +-- ON noteflow.webhook_deliveries(delivered_at); +``` + +--- + +### Task 3: Alembic Migrations + +**File**: `src/noteflow/infrastructure/persistence/migrations/versions/001_add_named_entities.py` + +```python +"""Add named_entities table. 
+
+Revision ID: 001_named_entities
+Revises:
+Create Date: 2025-XX-XX
+"""
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+revision = "001_named_entities"
+down_revision = None  # First migration in the chain; Alembic expects None here, not ""
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Create named_entities table."""
+    op.create_table(
+        "named_entities",
+        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
+        sa.Column(
+            "meeting_id",
+            postgresql.UUID(as_uuid=True),
+            sa.ForeignKey("noteflow.meetings.id", ondelete="CASCADE"),
+            nullable=False,
+        ),
+        sa.Column("text", sa.Text(), nullable=False),
+        sa.Column("normalized_text", sa.Text(), nullable=False),
+        sa.Column("category", sa.String(50), nullable=False),
+        sa.Column(
+            "segment_ids",
+            postgresql.ARRAY(sa.Integer()),
+            nullable=False,
+            server_default="{}",
+        ),
+        sa.Column(
+            "confidence",
+            sa.Float(),
+            nullable=False,
+            server_default="0.0",
+        ),
+        sa.Column(
+            "is_pinned",
+            sa.Boolean(),
+            nullable=False,
+            server_default="false",
+        ),
+        sa.Column(
+            "created_at",
+            sa.DateTime(timezone=True),
+            nullable=False,
+            server_default=sa.func.now(),
+        ),
+        sa.Column(
+            "updated_at",
+            sa.DateTime(timezone=True),
+            nullable=False,
+            server_default=sa.func.now(),
+        ),
+        sa.UniqueConstraint(
+            "meeting_id",
+            "normalized_text",
+            name="uq_named_entities_meeting_text",
+        ),
+        schema="noteflow",
+    )
+
+    op.create_index(
+        "idx_named_entities_meeting_id",
+        "named_entities",
+        ["meeting_id"],
+        schema="noteflow",
+    )
+
+    op.create_index(
+        "idx_named_entities_category",
+        "named_entities",
+        ["category"],
+        schema="noteflow",
+    )
+
+
+def downgrade() -> None:
+    """Drop named_entities table."""
+    op.drop_index("idx_named_entities_category", schema="noteflow")
+    op.drop_index("idx_named_entities_meeting_id", schema="noteflow")
+    op.drop_table("named_entities", schema="noteflow")
+```
+
+**File**: `src/noteflow/infrastructure/persistence/migrations/versions/002_add_webhooks.py`
+
+```python
+"""Add webhook tables.
+ +Revision ID: 002_webhooks +Revises: 001_named_entities +Create Date: 2025-XX-XX +""" + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +revision = "002_webhooks" +down_revision = "001_named_entities" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + """Create webhook tables.""" + # webhook_configs + op.create_table( + "webhook_configs", + sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True), + sa.Column( + "workspace_id", + postgresql.UUID(as_uuid=True), + sa.ForeignKey("noteflow.workspaces.id", ondelete="CASCADE"), + nullable=False, + ), + sa.Column("name", sa.String(255), nullable=False, server_default="Webhook"), + sa.Column("url", sa.Text(), nullable=False), + sa.Column( + "events", + postgresql.ARRAY(sa.Text()), + nullable=False, + server_default="{}", + ), + sa.Column("secret", sa.Text(), nullable=True), + sa.Column("enabled", sa.Boolean(), nullable=False, server_default="true"), + sa.Column("timeout_ms", sa.Integer(), nullable=False, server_default="10000"), + sa.Column("max_retries", sa.Integer(), nullable=False, server_default="3"), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + nullable=False, + server_default=sa.func.now(), + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + nullable=False, + server_default=sa.func.now(), + ), + sa.CheckConstraint("url ~ '^https?://'", name="chk_webhook_url_format"), + schema="noteflow", + ) + + op.create_index( + "idx_webhook_configs_workspace_id", + "webhook_configs", + ["workspace_id"], + schema="noteflow", + ) + + # webhook_deliveries + op.create_table( + "webhook_deliveries", + sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True), + sa.Column( + "webhook_id", + postgresql.UUID(as_uuid=True), + sa.ForeignKey("noteflow.webhook_configs.id", ondelete="CASCADE"), + nullable=False, + ), + sa.Column("event_type", sa.Text(), nullable=False), + sa.Column( + "payload", + postgresql.JSONB(), + nullable=False, + server_default="{}", + ), + sa.Column("status_code", sa.Integer(), nullable=True), + sa.Column("response_body", sa.Text(), nullable=True), + sa.Column("error_message", sa.Text(), nullable=True), + sa.Column("attempt_count", sa.Integer(), nullable=False, server_default="1"), + sa.Column("duration_ms", sa.Integer(), nullable=True), + sa.Column( + "delivered_at", + sa.DateTime(timezone=True), + nullable=False, + server_default=sa.func.now(), + ), + schema="noteflow", + ) + + op.create_index( + "idx_webhook_deliveries_webhook_id", + "webhook_deliveries", + ["webhook_id", "delivered_at"], + schema="noteflow", + ) + + op.create_index( + "idx_webhook_deliveries_event_type", + "webhook_deliveries", + ["event_type", "delivered_at"], + schema="noteflow", + ) + + +def downgrade() -> None: + """Drop webhook tables.""" + op.drop_index("idx_webhook_deliveries_event_type", schema="noteflow") + op.drop_index("idx_webhook_deliveries_webhook_id", schema="noteflow") + op.drop_table("webhook_deliveries", schema="noteflow") + + op.drop_index("idx_webhook_configs_workspace_id", schema="noteflow") + op.drop_table("webhook_configs", schema="noteflow") +``` + +--- + +### Task 4: Dependency Consolidation + +**File**: `pyproject.toml` + +Add all new dependencies in a single update: + +```toml +[project] +dependencies = [ + # ... existing dependencies ... 
+
+    # Sprint 0: Consolidated new dependencies
+    "httpx>=0.27",  # HTTP client (webhooks, future integrations)
+]
+
+[project.optional-dependencies]
+# PDF Export (Sprint 3)
+pdf = [
+    "weasyprint>=62.0",
+]
+
+# Named Entity Recognition (Sprint 4)
+ner = [
+    "spacy>=3.7",
+]
+
+# Calendar Integration (Sprint 5)
+calendar = [
+    "google-api-python-client>=2.100",
+    "google-auth>=2.23",
+    "google-auth-oauthlib>=1.1",
+    # Outlook support (future)
+    # "msal>=1.24",
+]
+
+# All optional features
+all = [
+    "noteflow[pdf,ner,calendar]",
+]
+
+[project.scripts]
+# Model download helper
+noteflow-download-models = "noteflow.cli.models:download_all"
+```
+
+---
+
+### Task 5: Model Download CLI
+
+**File**: `src/noteflow/cli/models.py`
+
+```python
+"""CLI for downloading ML models."""
+
+from __future__ import annotations
+
+import subprocess
+import sys
+
+
+def download_spacy_model(model: str = "en_core_web_sm") -> None:
+    """Download spaCy model.
+
+    Args:
+        model: Model name to download.
+    """
+    print(f"Downloading spaCy model: {model}")
+    subprocess.run(
+        [sys.executable, "-m", "spacy", "download", model],
+        check=True,
+    )
+    print(f"Successfully downloaded: {model}")
+
+
+def download_all() -> None:
+    """Download all required ML models."""
+    print("Downloading all NoteFlow ML models...")
+
+    try:
+        download_spacy_model("en_core_web_sm")
+    except subprocess.CalledProcessError as e:
+        print(f"Failed to download spaCy model: {e}")
+        sys.exit(1)
+
+    print("\nAll models downloaded successfully!")
+    print("You can now use NER features.")
+
+
+if __name__ == "__main__":
+    download_all()
+```
+
+---
+
+### Task 6: Docker Integration
+
+**File**: `Dockerfile` (additions)
+
+```dockerfile
+# Stage: Download ML models (optional, for NER support)
+FROM python:3.12-slim AS models
+
+# Install spacy and download model (quote the specifier so the shell
+# does not treat ">=3.7" as an output redirection)
+RUN pip install "spacy>=3.7" && \
+    python -m spacy download en_core_web_sm
+
+# Stage: Runtime with models
+FROM noteflow-base AS runtime-with-ner
+
+# Copy spaCy model from models stage
+COPY --from=models /usr/local/lib/python3.12/site-packages/en_core_web_sm \
+    /usr/local/lib/python3.12/site-packages/en_core_web_sm
+
+# Verify model is available
+RUN python -c "import spacy; spacy.load('en_core_web_sm')"
+```
+
+**File**: `docker-compose.yml` (additions)
+
+```yaml
+services:
+  noteflow:
+    build:
+      context: .
+      target: runtime-with-ner  # Use runtime-with-ner for NER support
+    environment:
+      # Feature flags
+      NOTEFLOW_FEATURE_NER_ENABLED: "true"
+      NOTEFLOW_FEATURE_CALENDAR_ENABLED: "true"
+      NOTEFLOW_FEATURE_WEBHOOKS_ENABLED: "true"
+```
+
+---
+
+### Task 7: Proto Regeneration Script
+
+**File**: `scripts/regenerate_proto.sh`
+
+```bash
+#!/usr/bin/env bash
+set -euo pipefail
+
+PROTO_DIR="src/noteflow/grpc/proto"
+PROTO_FILE="$PROTO_DIR/noteflow.proto"
+
+echo "Regenerating protobuf stubs..."
+
+python -m grpc_tools.protoc \
+    -I "$PROTO_DIR" \
+    --python_out="$PROTO_DIR" \
+    --grpc_python_out="$PROTO_DIR" \
+    --pyi_out="$PROTO_DIR" \
+    "$PROTO_FILE"
+
+echo "Fixing imports for Python 3.12+ compatibility..."
+# Fix relative imports in generated files
+sed -i '' 's/^import noteflow_pb2/from . import noteflow_pb2/' "$PROTO_DIR/noteflow_pb2_grpc.py" 2>/dev/null || \
+    sed -i 's/^import noteflow_pb2/from . import noteflow_pb2/' "$PROTO_DIR/noteflow_pb2_grpc.py"
+
+echo "Proto stubs regenerated successfully!"
+echo ""
+echo "Files updated:"
+echo "  - $PROTO_DIR/noteflow_pb2.py"
+echo "  - $PROTO_DIR/noteflow_pb2_grpc.py"
+echo "  - $PROTO_DIR/noteflow_pb2.pyi"
+echo ""
+echo "Next steps:"
+echo "  1. Run 'cd client && npm run build:proto' to update Rust/TS stubs"
+echo "  2. Run tests: pytest tests/grpc/"
+echo "  3. Commit all generated files together"
+```
+
+---
+
+### Task 8: Feature Flags
+
+**File**: `src/noteflow/config/settings.py` (additions)
+
+```python
+from functools import lru_cache
+
+from pydantic import Field
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+class FeatureFlags(BaseSettings):
+    """Feature flag settings for gradual rollout."""
+
+    model_config = SettingsConfigDict(env_prefix="NOTEFLOW_FEATURE_")
+
+    # Sprint 1: AI Templates
+    templates_enabled: bool = Field(
+        default=True,
+        description="Enable summarization template options",
+    )
+
+    # Sprint 3: PDF Export
+    pdf_export_enabled: bool = Field(
+        default=True,
+        description="Enable PDF export format",
+    )
+
+    # Sprint 4: NER
+    ner_enabled: bool = Field(
+        default=False,  # Disabled by default (requires model download)
+        description="Enable named entity extraction",
+    )
+
+    # Sprint 5: Calendar
+    calendar_enabled: bool = Field(
+        default=False,  # Disabled by default (requires OAuth setup)
+        description="Enable calendar integration",
+    )
+
+    # Sprint 6: Webhooks
+    webhooks_enabled: bool = Field(
+        default=True,
+        description="Enable webhook notifications",
+    )
+
+
+class Settings(BaseSettings):
+    """Main application settings."""
+
+    # ... existing fields ...
+
+    features: FeatureFlags = Field(default_factory=FeatureFlags)
+
+
+@lru_cache
+def get_feature_flags() -> FeatureFlags:
+    """Get cached feature flags."""
+    return get_settings().features
+```
+
+---
+
+### Task 9: Proto Changelog
+
+**File**: `docs/sprints/phase-0-foundation/PROTO_CHANGELOG.md`
+
+```markdown
+# Proto Changelog
+
+All notable changes to `noteflow.proto` are documented here.
+
+## [2.0.0] - 2025-XX-XX
+
+### Added
+
+#### Messages
+- `SummarizationOptions` - AI template preferences (tone, format, verbosity)
+- `ExtractEntitiesRequest` / `ExtractEntitiesResponse` - NER extraction
+- `ExtractedEntity` - Named entity with category, segments, confidence
+- `CalendarEvent` - Calendar event representation
+- `ListCalendarEventsRequest` / `ListCalendarEventsResponse` - Calendar listing
+- `CalendarProvider` - Provider info with auth status
+- `GetCalendarProvidersRequest` / `GetCalendarProvidersResponse` - Provider listing
+- `InitiateCalendarAuthRequest` / `InitiateCalendarAuthResponse` - OAuth initiation
+- `CompleteCalendarAuthRequest` / `CompleteCalendarAuthResponse` - OAuth completion
+
+#### RPCs
+- `ExtractEntities` - Extract named entities from meeting
+- `ListCalendarEvents` - List upcoming calendar events
+- `GetCalendarProviders` - Get available calendar providers
+- `InitiateCalendarAuth` - Start OAuth flow
+- `CompleteCalendarAuth` - Complete OAuth flow
+
+#### Enums
+- `ExportFormat.EXPORT_FORMAT_PDF` - PDF export support
+
+### Modified
+
+#### Messages
+- `GenerateSummaryRequest` - Added optional `options` field (field 3)
+
+### Compatibility Notes
+
+- All new fields are optional or have defaults
+- Existing clients will continue to work without changes
+- New features require updated clients to access them
+
+## [1.x.x] - Previous Versions
+
+See git history for earlier changes.
+```
+
+---
+
+## Acceptance Criteria
+
+### Functional
+
+- [ ] All proto messages compile without errors
+- [ ] Proto stubs regenerate cleanly
+- [ ] Alembic migrations apply to fresh database
+- [ ] Alembic migrations apply to existing database (upgrade path)
+- [ ] Feature flags control feature availability
+- [ ] Model download CLI works correctly
+
+### Technical
+
+- [ ] Proto backward compatible (existing clients work)
+- [ ] No breaking changes to existing RPCs
+- [ ] All new tables have proper indexes
+- [ ] Foreign key constraints correct
+- [ ] Triggers for `updated_at` in place
+
+### Quality Gates
+
+- [ ] `pytest tests/quality/` passes
+- [ ] `ruff check src/noteflow` clean
+- [ ] `mypy src/noteflow` clean
+- [ ] `alembic upgrade head` succeeds on fresh DB
+- [ ] `alembic downgrade -1` succeeds for each migration
+- [ ] Proto regeneration produces identical output (idempotent)
+
+---
+
+## Test Plan
+
+### Migration Tests
+
+**File**: `tests/infrastructure/persistence/test_migrations.py`
+
+```python
+import os
+
+import pytest
+from alembic import command
+from alembic.config import Config
+
+
+@pytest.fixture
+def alembic_config() -> Config:
+    """Create Alembic config for testing.
+
+    The migrations use PostgreSQL-specific types (UUID, ARRAY, JSONB) and the
+    ``noteflow`` schema, so they must run against a PostgreSQL instance rather
+    than SQLite. The URL is read from an environment variable (variable name
+    illustrative; point it at a disposable test database).
+    """
+    config = Config()
+    config.set_main_option("script_location", "src/noteflow/infrastructure/persistence/migrations")
+    config.set_main_option("sqlalchemy.url", os.environ["NOTEFLOW_TEST_DATABASE_URL"])
+    return config
+
+
+def test_migrations_upgrade_downgrade(alembic_config: Config) -> None:
+    """All migrations can upgrade and downgrade."""
+    # Upgrade to head
+    command.upgrade(alembic_config, "head")
+
+    # Downgrade each migration
+    command.downgrade(alembic_config, "-1")
+    command.downgrade(alembic_config, "-1")
+
+    # Upgrade again
+    command.upgrade(alembic_config, "head")
+```
+
+### Proto Tests
+
+**File**: `tests/grpc/test_proto_compilation.py`
+
+```python
+def test_proto_imports() -> None:
+    """Proto stubs import without errors."""
+    from noteflow.grpc.proto import noteflow_pb2, noteflow_pb2_grpc
+
+    # Verify new messages exist
+    assert hasattr(noteflow_pb2, "SummarizationOptions")
+    assert hasattr(noteflow_pb2, "ExtractEntitiesRequest")
+    assert hasattr(noteflow_pb2, "CalendarEvent")
+
+    # Verify new enum values
+    assert noteflow_pb2.EXPORT_FORMAT_PDF == 3
+
+
+def test_proto_message_defaults() -> None:
+    """New messages have correct defaults."""
+    from noteflow.grpc.proto import noteflow_pb2
+
+    # SummarizationOptions defaults
+    opts = noteflow_pb2.SummarizationOptions()
+    assert opts.tone == ""
+    assert opts.format == ""
+    assert opts.verbosity == ""
+
+    # ExtractedEntity defaults
+    entity = noteflow_pb2.ExtractedEntity()
+    assert entity.confidence == 0.0
+    assert entity.is_pinned is False
+```
+
+---
+
+## Definition of Done
+
+- [ ] All proto changes committed in single commit
+- [ ] All migrations committed and tested
+- [ ] Proto regeneration script works
+- [ ] Feature flags documented
+- [ ] PROTO_CHANGELOG.md updated
+- [ ] Client proto sync verified (`cd client && npm run build:proto`)
+- [ ] Integration tests pass with new schema
+- [ ] CLAUDE.md updated with new proto messages
+- [ ] README updated with new optional dependencies
+
+---
+
+## Dependencies
+
+- None (this is the foundation sprint)
+
+## Blocks
+
+- All other sprints depend on Sprint 0
+
+## Post-Sprint
+
+- Monitor for proto compatibility issues
+- Consider proto versioning strategy for future breaking changes
+- Document migration rollback procedures
diff --git 
a/docs/sprints/phase-1-core-pipeline/sprint-1-ai-templates/README.md b/docs/sprints/phase-1-core-pipeline/sprint-1-ai-templates/README.md new file mode 100644 index 0000000..ce24fec --- /dev/null +++ b/docs/sprints/phase-1-core-pipeline/sprint-1-ai-templates/README.md @@ -0,0 +1,548 @@ +# Sprint 1: AI Templates Pass-Through + +> **Priority**: 1 | **Owner**: Both (Backend + Frontend) | **Complexity**: Low + +--- + +## Objective + +Enable user-configured summarization style preferences (tone, format, verbosity) to flow from frontend settings through gRPC to the LLM prompt builder. + +--- + +## Current State Analysis + +### What Exists + +| Component | Location | Status | +|-----------|----------|--------| +| Frontend UI | `client/src/pages/Settings.tsx` | AI template controls saved to local preferences | +| gRPC Proto | `src/noteflow/grpc/proto/noteflow.proto:291` | `GenerateSummaryRequest` lacks options field | +| Summarization Service | `src/noteflow/application/services/summarization_service.py:167` | `summarize()` has no template params | +| Prompt Builder | `src/noteflow/infrastructure/summarization/_parsing.py` | Only `build_transcript_prompt()`, no style builder | + +### Gap + +User preferences in Settings are never transmitted to the backend. `GenerateSummaryRequest` only contains: +```protobuf +message GenerateSummaryRequest { + string meeting_id = 1; + bool force_regenerate = 2; + // Missing: SummarizationOptions options = 3; +} +``` + +--- + +## Target/Affected Code + +### Files to Modify + +| File | Change Type | Lines Est. | +|------|-------------|------------| +| `src/noteflow/grpc/proto/noteflow.proto` | Add message + field | +15 | +| `src/noteflow/infrastructure/summarization/_parsing.py` | Add `build_template_prompt()` | +40 | +| `src/noteflow/application/services/summarization_service.py` | Accept options param | +10 | +| `src/noteflow/grpc/_mixins/summarization.py` | Extract and pass options | +15 | +| `client/src-tauri/src/commands/summary.rs` | Accept template params | +20 | +| `client/src/api/tauri-adapter.ts` | Read prefs, pass to command | +15 | + +### Files to Create + +None - all changes are modifications to existing files. 
+ +--- + +## Implementation Tasks + +### Task 1: Proto Update + +**File**: `src/noteflow/grpc/proto/noteflow.proto` + +```protobuf +// Add after line 288 (before GenerateSummaryRequest) +message SummarizationOptions { + // Tone: professional, casual, technical, friendly + string tone = 1; + + // Format: bullet_points, narrative, structured, concise + string format = 2; + + // Verbosity: minimal, balanced, detailed, comprehensive + string verbosity = 3; +} + +// Modify existing GenerateSummaryRequest (line 291) +message GenerateSummaryRequest { + string meeting_id = 1; + bool force_regenerate = 2; + SummarizationOptions options = 3; // NEW +} +``` + +**Post-change**: Regenerate proto stubs: +```bash +python -m grpc_tools.protoc -I src/noteflow/grpc/proto \ + --python_out=src/noteflow/grpc/proto \ + --grpc_python_out=src/noteflow/grpc/proto \ + --pyi_out=src/noteflow/grpc/proto \ + src/noteflow/grpc/proto/noteflow.proto +``` + +--- + +### Task 2: Template Prompt Builder + +**File**: `src/noteflow/infrastructure/summarization/_parsing.py` + +**Insert after** `SYSTEM_PROMPT` constant: + +```python +from noteflow.grpc.proto import noteflow_pb2 + +_TONE_INSTRUCTIONS: dict[str, str] = { + "professional": "Use formal, business-appropriate language.", + "casual": "Use conversational, approachable language.", + "technical": "Use precise technical terminology.", + "friendly": "Use warm, personable language.", +} + +_FORMAT_INSTRUCTIONS: dict[str, str] = { + "bullet_points": "Present information in bullet points.", + "narrative": "Write in flowing paragraphs.", + "structured": "Use headers and organized sections.", + "concise": "Be extremely brief and to the point.", +} + +_VERBOSITY_INSTRUCTIONS: dict[str, str] = { + "minimal": "Provide only essential information.", + "balanced": "Include moderate detail.", + "detailed": "Include comprehensive information.", + "comprehensive": "Include all relevant details and context.", +} + + +def build_template_prompt( + options: noteflow_pb2.SummarizationOptions | None, +) -> str: + """Build prompt prefix based on user template preferences. + + Args: + options: User's summarization style preferences. + + Returns: + Style instruction string to prepend to system prompt. 
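+
+    Example:
+        >>> opts = noteflow_pb2.SummarizationOptions(
+        ...     tone="professional", format="bullet_points"
+        ... )
+        >>> build_template_prompt(opts)
+        'Use formal, business-appropriate language. Present information in bullet points.'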
+    """
+    if not options:
+        return ""
+
+    parts: list[str] = []
+
+    if options.tone and options.tone in _TONE_INSTRUCTIONS:
+        parts.append(_TONE_INSTRUCTIONS[options.tone])
+    if options.format and options.format in _FORMAT_INSTRUCTIONS:
+        parts.append(_FORMAT_INSTRUCTIONS[options.format])
+    if options.verbosity and options.verbosity in _VERBOSITY_INSTRUCTIONS:
+        parts.append(_VERBOSITY_INSTRUCTIONS[options.verbosity])
+
+    return " ".join(parts)
+```
+
+---
+
+### Task 3: Service Update
+
+**File**: `src/noteflow/application/services/summarization_service.py`
+
+**Modify** `summarize()` signature (line 167):
+
+```python
+async def summarize(
+    self,
+    meeting_id: MeetingId,
+    segments: Sequence[Segment],
+    mode: SummarizationMode | None = None,
+    max_key_points: int | None = None,
+    max_action_items: int | None = None,
+    style_prompt: str | None = None,  # NEW PARAMETER
+) -> SummarizationServiceResult:
+```
+
+**Update** request building (around line 205):
+
+```python
+request = SummarizationRequest(
+    meeting_id=meeting_id,
+    segments=segments,
+    max_key_points=max_key_points or self.settings.max_key_points,
+    max_action_items=max_action_items or self.settings.max_action_items,
+    style_prompt=style_prompt,  # NEW FIELD
+)
+```
+
+**Note**: Also update `SummarizationRequest` dataclass in domain to include `style_prompt`.
+
+---
+
+### Task 4: gRPC Mixin Update
+
+**File**: `src/noteflow/grpc/_mixins/summarization.py`
+
+**Modify** `GenerateSummary` method:
+
+```python
+async def GenerateSummary(
+    self: ServicerHost,
+    request: noteflow_pb2.GenerateSummaryRequest,
+    context: grpc.aio.ServicerContext,
+) -> noteflow_pb2.Summary:
+    """Generate AI summary for meeting."""
+    from noteflow.infrastructure.summarization._parsing import build_template_prompt
+
+    meeting_id = self._parse_meeting_id(request.meeting_id)
+
+    # Build style prompt from options. Proto3 submessage fields are always
+    # truthy, so check presence with HasField rather than `if request.options`.
+    style_prompt = build_template_prompt(request.options) if request.HasField("options") else None
+
+    # ... existing meeting fetch logic ...
+
+    result = await self._summarization_service.summarize(
+        meeting_id=meeting_id,
+        segments=meeting.segments,
+        style_prompt=style_prompt,  # Pass style prompt
+    )
+```
+
+---
+
+### Task 5: Rust Command Update
+
+**File**: `client/src-tauri/src/commands/summary.rs`
+
+```rust
+#[derive(Debug, Serialize, Deserialize)]
+pub struct SummarizationOptions {
+    pub tone: Option<String>,
+    pub format: Option<String>,
+    pub verbosity: Option<String>,
+}
+
+#[tauri::command]
+pub async fn generate_summary(
+    meeting_id: String,
+    force_regenerate: Option<bool>,
+    options: Option<SummarizationOptions>, // NEW
+    state: State<'_, AppState>,
+) -> Result<proto::Summary, String> { // return type assumed; mirror existing summary commands
+    let client = state.grpc_client.lock().await;
+
+    let proto_options = options.map(|o| proto::SummarizationOptions {
+        tone: o.tone.unwrap_or_default(),
+        format: o.format.unwrap_or_default(),
+        verbosity: o.verbosity.unwrap_or_default(),
+    });
+
+    let request = proto::GenerateSummaryRequest {
+        meeting_id,
+        force_regenerate: force_regenerate.unwrap_or(false),
+        options: proto_options,
+    };
+
+    // ... rest of gRPC call
+}
+```
+
+---
+
+### Task 6: TypeScript Adapter Update
+
+**File**: `client/src/api/tauri-adapter.ts`
+
+```typescript
+interface SummarizationOptions {
+  tone?: 'professional' | 'casual' | 'technical' | 'friendly';
+  format?: 'bullet_points' | 'narrative' | 'structured' | 'concise';
+  verbosity?: 'minimal' | 'balanced' | 'detailed' | 'comprehensive';
+}
+
+// Return type assumed to match the adapter's existing Summary model.
+async generateSummary(
+  meetingId: string,
+  forceRegenerate?: boolean,
+): Promise<Summary> {
+  // Read from local preferences
+  const prefs = await this.getPreferences();
+  const template = prefs.ai_template;
+
+  const options: SummarizationOptions | undefined = template ? {
+    tone: template.tone,
+    format: template.format,
+    verbosity: template.verbosity,
+  } : undefined;
+
+  return invoke(Commands.GENERATE_SUMMARY, {
+    meetingId,
+    forceRegenerate,
+    options,
+  });
+}
+```
+
+---
+
+## Code Segments to Reuse
+
+### Existing Prompt Building
+
+**Location**: `src/noteflow/infrastructure/summarization/_parsing.py:20-80`
+
+```python
+SYSTEM_PROMPT = """You are an expert meeting analyst..."""
+
+def build_transcript_prompt(segments: Sequence[Segment], ...) -> str:
+    """Build transcript with segment markers."""
+```
+
+Use this pattern for `build_template_prompt()`.
+
+### Existing Service Pattern
+
+**Location**: `src/noteflow/application/services/summarization_service.py:167-249`
+
+The `summarize()` method shows how to:
+- Accept optional parameters with defaults
+- Pass through to providers
+- Handle verification and persistence
+
+### Rust Command Pattern
+
+**Location**: `client/src-tauri/src/commands/meeting.rs`
+
+Follow the pattern for:
+- Deriving `Serialize`, `Deserialize` on structs
+- Using `Option` for optional command params
+- Converting to proto types
+
+---
+
+## Acceptance Criteria
+
+### Functional
+
+- [ ] User can select tone (professional/casual/technical/friendly) in Settings
+- [ ] User can select format (bullet_points/narrative/structured/concise) in Settings
+- [ ] User can select verbosity (minimal/balanced/detailed/comprehensive) in Settings
+- [ ] When generating summary, selected options affect the output style
+- [ ] Default behavior (no options) produces same result as before
+
+### Technical
+
+- [ ] Proto regenerated and compiles cleanly
+- [ ] No breaking changes to existing clients (options field is optional)
+- [ ] Style prompt logged at DEBUG level for troubleshooting
+- [ ] Unit tests cover all tone/format/verbosity combinations
+
+### Quality Gates
+
+- [ ] `pytest tests/quality/` passes
+- [ ] `ruff check src/noteflow` clean
+- [ ] `mypy src/noteflow` clean
+- [ ] `npm run test:quality` passes (client)
+- [ ] `cargo clippy` clean (Rust)
+
+---
+
+## Test Plan
+
+### Unit Tests
+
+**File**: `tests/infrastructure/summarization/test_parsing.py`
+
+```python
+import pytest
+from noteflow.grpc.proto import noteflow_pb2
+from noteflow.infrastructure.summarization._parsing import build_template_prompt
+
+
+@pytest.mark.parametrize(
+    "tone,expected_fragment",
+    [
+        ("professional", "formal, business-appropriate"),
+        ("casual", "conversational, approachable"),
+        ("technical", "precise technical terminology"),
+        ("friendly", "warm, personable"),
+    ],
+)
+def test_build_template_prompt_tone(tone: str, expected_fragment: str) -> None:
+    """Template prompt includes correct tone instruction."""
+    options = noteflow_pb2.SummarizationOptions(tone=tone)
+
+    result = build_template_prompt(options)
+
+    assert expected_fragment in result
+
+
+def test_build_template_prompt_combines_all_options() -> None:
+ """Template prompt combines tone, format, and verbosity.""" + options = noteflow_pb2.SummarizationOptions( + tone="professional", + format="bullet_points", + verbosity="detailed", + ) + + result = build_template_prompt(options) + + assert "formal" in result + assert "bullet points" in result + assert "comprehensive" in result.lower() or "detailed" in result.lower() + + +def test_build_template_prompt_none_returns_empty() -> None: + """No options returns empty string.""" + result = build_template_prompt(None) + + assert result == "" + + +def test_build_template_prompt_unknown_values_ignored() -> None: + """Unknown option values are safely ignored.""" + options = noteflow_pb2.SummarizationOptions( + tone="unknown_tone", + format="unknown_format", + ) + + result = build_template_prompt(options) + + assert result == "" +``` + +### Integration Tests + +**File**: `tests/integration/test_summarization_templates.py` + +```python +@pytest.mark.integration +async def test_generate_summary_with_professional_tone( + grpc_client: NoteFlowClient, + meeting_with_segments: Meeting, +) -> None: + """Summary generation respects professional tone setting.""" + options = noteflow_pb2.SummarizationOptions(tone="professional") + + summary = await grpc_client.generate_summary( + meeting_id=str(meeting_with_segments.id), + options=options, + ) + + # Verify summary was generated (content verification is model-dependent) + assert summary.executive_summary + assert summary.key_points +``` + +### Frontend Tests + +**File**: `client/src/api/tauri-adapter.test.ts` + +```typescript +describe('generateSummary', () => { + it('should pass template options from preferences', async () => { + // Mock preferences with AI template + mockPreferences.ai_template = { + tone: 'professional', + format: 'bullet_points', + verbosity: 'detailed', + }; + + await adapter.generateSummary('meeting-123'); + + expect(invoke).toHaveBeenCalledWith( + Commands.GENERATE_SUMMARY, + expect.objectContaining({ + options: { + tone: 'professional', + format: 'bullet_points', + verbosity: 'detailed', + }, + }) + ); + }); +}); +``` + +--- + +## Rollback Plan + +If issues arise: + +1. **Proto rollback**: Remove `options` field (clients ignore unknown fields) +2. **Backend**: `build_template_prompt()` returns empty string if options invalid +3. **Frontend**: Gracefully handle missing options in existing summaries + +--- + +## Frontend/Backend Sync Protocol + +### Architecture Decision: Per-Request Transmission + +Preferences are stored **only on the frontend** (local storage) and transmitted **per-request** via gRPC. The backend is stateless regarding user preferences. 
+
+```
+┌─────────────────┐         ┌─────────────────┐         ┌─────────────────┐
+│ Local Storage   │────────▶│ Tauri Command   │────────▶│ gRPC Request    │
+│ (preferences)   │  read   │ (summary.rs)    │  proto  │ (options field) │
+└─────────────────┘         └─────────────────┘         └─────────────────┘
+```
+
+### Why Per-Request (Not Persisted on Backend)
+
+| Approach | Pros | Cons |
+|----------|------|------|
+| **Per-request (chosen)** | No sync conflicts; works offline; privacy-preserving | Slightly larger request payloads |
+| Backend-persisted | Single source of truth | Sync complexity; requires user accounts; offline failures |
+
+### Failure Handling
+
+| Scenario | Behavior |
+|----------|----------|
+| Backend unreachable | Summary generation fails (as expected); preferences remain in local storage |
+| Invalid preference value | Backend ignores unknown values; uses default behavior |
+| Missing preferences | `options` field omitted; backend uses default prompts |
+| Corrupted local storage | `getPreferences()` returns defaults; user re-configures in Settings |
+
+### Implementation Notes
+
+1. **No caching on backend**: Each `GenerateSummary` call reads `options` fresh from the request
+2. **No version conflicts**: Frontend preferences are authoritative; no bidirectional sync
+3. **Offline-first**: Preferences are always available locally; only summary generation requires connectivity
+4. **Migration path**: If backend persistence is needed later, add `UserPreferences` table and sync endpoint
+
+### TypeScript Preference Loading
+
+```typescript
+// client/src/api/tauri-adapter.ts
+// Preferences type name assumed; align with the adapter's DEFAULT_PREFERENCES model.
+private async getPreferences(): Promise<Preferences> {
+  try {
+    const stored = localStorage.getItem('noteflow_preferences');
+    return stored ? JSON.parse(stored) : DEFAULT_PREFERENCES;
+  } catch {
+    // Corrupted storage: reset to defaults
+    localStorage.removeItem('noteflow_preferences');
+    return DEFAULT_PREFERENCES;
+  }
+}
+```
+
+---
+
+## Dependencies
+
+- None (standalone feature)
+
+## Blocks
+
+- None
+
+## Post-Sprint
+
+- Update CLAUDE.md with new proto message
+- Consider adding template presets ("Meeting Notes", "Executive Brief")
diff --git a/docs/sprints/phase-1-core-pipeline/sprint-2-diarization-service/README.md b/docs/sprints/phase-1-core-pipeline/sprint-2-diarization-service/README.md
new file mode 100644
index 0000000..a87da9b
--- /dev/null
+++ b/docs/sprints/phase-1-core-pipeline/sprint-2-diarization-service/README.md
@@ -0,0 +1,1699 @@
+# Sprint 2: Diarization Application Service
+
+> **Priority**: 2 | **Owner**: Backend | **Complexity**: Medium
+
+---
+
+## Objective
+
+Create a proper application service layer for speaker diarization, following hexagonal architecture. Currently the gRPC mixin calls the diarization engine directly, violating separation of concerns. Additionally, job state is stored in memory; this sprint migrates it to database persistence for reliability across restarts.
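+
+As a preview of the target call flow, the sketch below shows the delegation this sprint introduces. It is illustrative only: the class, model, and repository names come from the tables in this plan, the unit-of-work usage mirrors the `NerService` pattern in `QUALITY_STANDARDS.md`, and the final signatures (including how the unit of work exposes the job repository) are settled during implementation.
+
+```python
+from uuid import UUID
+
+# Path per the persistence tables below; the model sets a Python-side uuid4 default for `id`.
+from noteflow.infrastructure.persistence.models.core.diarization import DiarizationJobModel
+
+
+class DiarizationService:
+    """Application-layer facade: gRPC mixin -> service -> engine + job repository."""
+
+    def __init__(self, engine, uow_factory) -> None:
+        self._engine = engine  # infrastructure DiarizationEngine behind a port
+        self._uow_factory = uow_factory
+
+    async def start_refinement(self, meeting_id: UUID, num_speakers_hint: int | None = None) -> UUID:
+        """Persist a queued job row (instead of an in-memory dict entry) and return its ID."""
+        async with self._uow_factory() as uow:
+            job = DiarizationJobModel(
+                meeting_id=meeting_id,
+                status="queued",
+                num_speakers_hint=num_speakers_hint,
+            )
+            await uow.diarization_jobs.create(job)  # repository method defined in Task 0
+        return job.id
+```
+
+Because the job row survives restarts, `GetDiarizationJobStatus` can be answered from the database even after the server process is recycled.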
+
+---
+
+## Current State Analysis
+
+### What Exists
+
+| Component | Location | Status |
+|-----------|----------|--------|
+| Diarization Engine | `src/noteflow/infrastructure/diarization/engine.py` | Full implementation (streaming + offline) |
+| Speaker Assigner | `src/noteflow/infrastructure/diarization/assigner.py` | `assign_speaker()`, `assign_speakers_batch()` |
+| gRPC Mixin | `src/noteflow/grpc/_mixins/diarization.py` | Direct engine calls, in-memory job dict |
+| Proto RPCs | `noteflow.proto` | `RefineSpeakerDiarization`, `GetDiarizationJobStatus`, `RenameSpeaker` |
+
+### Existing Persistence Infrastructure (MUST USE)
+
+The database already has tables for diarization:
+
+| Table | ORM Model | Location | Purpose |
+|-------|-----------|----------|---------|
+| `noteflow.diarization_jobs` | `DiarizationJobModel` | `models/core/diarization.py` | Job status, progress, error tracking |
+| `noteflow.streaming_diarization_turns` | `StreamingDiarizationTurnModel` | `models/core/diarization.py` | Real-time speaker turns |
+
+**Repository**: `src/noteflow/infrastructure/persistence/repositories/diarization_job_repo.py`
+
+**Schema reference**: `docker/db/schema.sql:198-224`
+
+### Gap
+
+The `DiarizationMixin` currently:
+- Uses in-memory `_diarization_jobs` dict (lost on restart!)
+- Calls engine directly (violates hexagonal architecture)
+- Manages background tasks inline
+- Has no observability (metrics/structured logging)
+
+### Architecture Violation
+
+```
+Current:  gRPC Mixin → DiarizationEngine (infrastructure)
+                     → In-memory job dict (volatile!)
+
+Expected: gRPC Mixin → DiarizationService (application) → DiarizationEngine (infrastructure)
+                                                        → DiarizationJobRepository (persistence)
+```
+
+---
+
+## Target/Affected Code
+
+### Files to Create
+
+| File | Purpose | Lines Est. |
+|------|---------|------------|
+| `src/noteflow/application/services/diarization_service.py` | Application service | ~200 |
+| `src/noteflow/domain/ports/diarization.py` | Port interfaces | ~60 |
+| `src/noteflow/infrastructure/converters/diarization_converters.py` | ORM ↔ domain | ~50 |
+| `tests/application/test_diarization_service.py` | Unit tests | ~250 |
+| `tests/integration/test_diarization_workflow.py` | Integration tests | ~150 |
+
+### Files to Modify
+
+| File | Change Type | Lines Est. |
+|------|-------------|------------|
+| `src/noteflow/grpc/server.py` | Initialize service | +20 |
+| `src/noteflow/grpc/service.py` | Accept service dependency | +5 |
+| `src/noteflow/grpc/_mixins/diarization.py` | Delegate to service | -50 (simplify) |
+| `src/noteflow/infrastructure/persistence/repositories/diarization_job_repo.py` | Add missing methods | +30 |
+| `src/noteflow/infrastructure/persistence/unit_of_work.py` | Ensure diarization repo exposed | +5 |
+| `src/noteflow/application/services/__init__.py` | Export new service | +2 |
+
+---
+
+## Implementation Tasks
+
+### Task 0: Verify Persistence Infrastructure
+
+Before implementing the application layer, verify the existing DB infrastructure is complete.
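+
+A quick way to automate this check (throwaway sketch; assumes the model imports cleanly in your environment, and complements the field-by-field review below):
+
+```python
+# Throwaway check: the ORM model must expose every column the
+# service layer will rely on.
+from noteflow.infrastructure.persistence.models.core.diarization import (
+    DiarizationJobModel,
+)
+
+required = {
+    "id", "meeting_id", "status", "num_speakers_hint", "segments_updated",
+    "speaker_ids", "error_message", "started_at", "completed_at", "created_at",
+}
+actual = {column.key for column in DiarizationJobModel.__table__.columns}
+assert required <= actual, f"missing columns: {required - actual}"
+```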
+ +**File**: `src/noteflow/infrastructure/persistence/models/core/diarization.py` + +Verify the `DiarizationJobModel` has all required fields: + +```python +class DiarizationJobModel(Base): + """Diarization job tracking.""" + + __tablename__ = "diarization_jobs" + __table_args__: ClassVar[dict[str, str]] = {"schema": "noteflow"} + + id: Mapped[PyUUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid4) + meeting_id: Mapped[PyUUID] = mapped_column( + UUID(as_uuid=True), + ForeignKey("noteflow.meetings.id", ondelete="CASCADE"), + nullable=False, + index=True, + ) + status: Mapped[str] = mapped_column( + String(50), + nullable=False, + default="queued", + ) # queued, running, completed, failed + num_speakers_hint: Mapped[int | None] = mapped_column(Integer, nullable=True) + segments_updated: Mapped[int] = mapped_column(Integer, nullable=False, default=0) + speaker_ids: Mapped[list[str]] = mapped_column( + ARRAY(Text), + nullable=False, + default=list, + ) + error_message: Mapped[str | None] = mapped_column(Text, nullable=True) + started_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True) + completed_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True) + created_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), + nullable=False, + default=utc_now, + ) + + # Relationships + meeting: Mapped["MeetingModel"] = relationship("MeetingModel", back_populates="diarization_jobs") +``` + +**File**: `src/noteflow/infrastructure/persistence/repositories/diarization_job_repo.py` + +Ensure repository has all CRUD methods: + +```python +"""Diarization job repository.""" + +from __future__ import annotations + +from datetime import datetime, timezone +from typing import TYPE_CHECKING +from uuid import UUID + +from sqlalchemy import select, update + +from noteflow.infrastructure.persistence.models.core.diarization import DiarizationJobModel + +from ._base import BaseRepository + +if TYPE_CHECKING: + from sqlalchemy.ext.asyncio import AsyncSession + + +class SqlAlchemyDiarizationJobRepository(BaseRepository): + """SQLAlchemy repository for diarization jobs.""" + + def __init__(self, session: AsyncSession) -> None: + """Initialize repository.""" + super().__init__(session) + + async def create(self, job: DiarizationJobModel) -> DiarizationJobModel: + """Create a new diarization job.""" + await self._add_and_flush(job) + return job + + async def get(self, job_id: UUID) -> DiarizationJobModel | None: + """Get job by ID.""" + stmt = select(DiarizationJobModel).where(DiarizationJobModel.id == job_id) + return await self._execute_scalar(stmt) + + async def get_by_meeting(self, meeting_id: UUID) -> list[DiarizationJobModel]: + """Get all jobs for a meeting.""" + stmt = ( + select(DiarizationJobModel) + .where(DiarizationJobModel.meeting_id == meeting_id) + .order_by(DiarizationJobModel.created_at.desc()) + ) + return list(await self._execute_scalars(stmt)) + + async def get_active_jobs(self) -> list[DiarizationJobModel]: + """Get all jobs that are queued or running.""" + stmt = select(DiarizationJobModel).where( + DiarizationJobModel.status.in_(["queued", "running"]) + ) + return list(await self._execute_scalars(stmt)) + + async def update_status( + self, + job_id: UUID, + status: str, + *, + segments_updated: int | None = None, + speaker_ids: list[str] | None = None, + error_message: str | None = None, + ) -> None: + """Update job status and results.""" + values: dict[str, object] = {"status": status} + + if status == 
"running": + values["started_at"] = datetime.now(timezone.utc) + elif status in {"completed", "failed"}: + values["completed_at"] = datetime.now(timezone.utc) + + if segments_updated is not None: + values["segments_updated"] = segments_updated + if speaker_ids is not None: + values["speaker_ids"] = speaker_ids + if error_message is not None: + values["error_message"] = error_message + + stmt = ( + update(DiarizationJobModel) + .where(DiarizationJobModel.id == job_id) + .values(**values) + ) + await self._session.execute(stmt) + await self._session.flush() + + async def cleanup_old_jobs(self, older_than: datetime) -> int: + """Delete completed/failed jobs older than threshold. + + Returns: + Number of jobs deleted. + """ + from sqlalchemy import delete + + stmt = delete(DiarizationJobModel).where( + DiarizationJobModel.status.in_(["completed", "failed"]), + DiarizationJobModel.created_at < older_than, + ) + result = await self._session.execute(stmt) + await self._session.flush() + return result.rowcount +``` + +--- + +### Task 1: Create Port Interface + +**File**: `src/noteflow/domain/ports/diarization.py` + +```python +"""Diarization port interfaces.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from datetime import datetime +from enum import Enum +from typing import TYPE_CHECKING, Protocol +from uuid import UUID + +if TYPE_CHECKING: + from noteflow.domain.entities.meeting import MeetingId + from noteflow.domain.entities.segment import Segment + + +class JobStatus(Enum): + """Diarization job status.""" + + QUEUED = "queued" + RUNNING = "running" + COMPLETED = "completed" + FAILED = "failed" + + +@dataclass(frozen=True, slots=True) +class DiarizationJob: + """Domain entity for a diarization job.""" + + id: UUID + meeting_id: UUID + status: JobStatus + num_speakers_hint: int | None = None + segments_updated: int = 0 + speaker_ids: tuple[str, ...] = field(default_factory=tuple) + error_message: str | None = None + started_at: datetime | None = None + completed_at: datetime | None = None + created_at: datetime | None = None + + +@dataclass(frozen=True, slots=True) +class DiarizationJobResult: + """Result of a diarization job query.""" + + job_id: str + status: JobStatus + segments_updated: int + speaker_ids: list[str] + error_message: str | None = None + progress_percent: int = 0 # 0-100 for UI progress bar + + +class DiarizationEnginePort(Protocol): + """Port for diarization engine operations.""" + + async def process_audio( + self, + audio_path: str, + num_speakers: int | None = None, + ) -> list[tuple[float, float, str]]: + """Process audio file for speaker diarization. + + Args: + audio_path: Path to audio file. + num_speakers: Optional hint for number of speakers. + + Returns: + List of (start_time, end_time, speaker_id) tuples. + """ + ... + + def is_ready(self) -> bool: + """Check if engine is initialized and ready.""" + ... + + +class DiarizationServicePort(Protocol): + """Port for diarization service operations.""" + + async def refine_meeting( + self, + meeting_id: MeetingId, + num_speakers: int | None = None, + ) -> str: + """Start diarization refinement job. + + Args: + meeting_id: Meeting to process. + num_speakers: Optional speaker count hint. + + Returns: + Job ID for status polling. + """ + ... + + async def get_job_status(self, job_id: str) -> DiarizationJobResult: + """Get status of diarization job. + + Args: + job_id: Job identifier. + + Returns: + Current job status and results. + """ + ... 
+ + async def rename_speaker( + self, + meeting_id: MeetingId, + old_speaker_id: str, + new_speaker_name: str, + ) -> int: + """Rename speaker across all segments. + + Args: + meeting_id: Meeting containing segments. + old_speaker_id: Current speaker identifier. + new_speaker_name: New display name. + + Returns: + Number of segments updated. + """ + ... + + async def cancel_job(self, job_id: str) -> bool: + """Cancel a running or queued job. + + Args: + job_id: Job to cancel. + + Returns: + True if job was cancelled. + """ + ... +``` + +--- + +### Task 2: Create Converters + +**File**: `src/noteflow/infrastructure/converters/diarization_converters.py` + +```python +"""Diarization ORM ↔ domain converters.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from noteflow.domain.ports.diarization import DiarizationJob, JobStatus + +if TYPE_CHECKING: + from noteflow.infrastructure.persistence.models.core.diarization import ( + DiarizationJobModel, + ) + + +class DiarizationConverter: + """Convert between DiarizationJob domain entity and ORM model.""" + + @staticmethod + def to_domain(model: DiarizationJobModel) -> DiarizationJob: + """Convert ORM model to domain entity. + + Args: + model: SQLAlchemy model instance. + + Returns: + Domain entity. + """ + return DiarizationJob( + id=model.id, + meeting_id=model.meeting_id, + status=JobStatus(model.status), + num_speakers_hint=model.num_speakers_hint, + segments_updated=model.segments_updated, + speaker_ids=tuple(model.speaker_ids) if model.speaker_ids else (), + error_message=model.error_message, + started_at=model.started_at, + completed_at=model.completed_at, + created_at=model.created_at, + ) + + @staticmethod + def to_model( + entity: DiarizationJob, + ) -> dict: + """Convert domain entity to dict for ORM model creation. + + Args: + entity: Domain entity. + + Returns: + Dict suitable for model instantiation. 
+ """ + return { + "id": entity.id, + "meeting_id": entity.meeting_id, + "status": entity.status.value, + "num_speakers_hint": entity.num_speakers_hint, + "segments_updated": entity.segments_updated, + "speaker_ids": list(entity.speaker_ids), + "error_message": entity.error_message, + "started_at": entity.started_at, + "completed_at": entity.completed_at, + } +``` + +--- + +### Task 3: Create Application Service + +**File**: `src/noteflow/application/services/diarization_service.py` + +```python +"""Diarization application service with database persistence.""" + +from __future__ import annotations + +import asyncio +import logging +from dataclasses import dataclass +from datetime import timedelta +from typing import TYPE_CHECKING +from uuid import UUID, uuid4 + +from noteflow.domain.ports.diarization import ( + DiarizationJobResult, + JobStatus, +) +from noteflow.domain.utils.time import utc_now +from noteflow.infrastructure.converters.diarization_converters import DiarizationConverter + +if TYPE_CHECKING: + from noteflow.domain.entities.meeting import MeetingId + from noteflow.domain.ports.diarization import DiarizationEnginePort + from noteflow.infrastructure.persistence.models.core.diarization import ( + DiarizationJobModel, + ) + from noteflow.infrastructure.persistence.unit_of_work import SQLAlchemyUnitOfWork + +_logger = logging.getLogger(__name__) + +# Constants +JOB_CLEANUP_AGE = timedelta(hours=24) +JOB_TIMEOUT = timedelta(minutes=10) + + +@dataclass(frozen=True, slots=True) +class DiarizationServiceSettings: + """Diarization service configuration.""" + + max_concurrent_jobs: int = 2 + job_timeout_seconds: float = 600.0 # 10 minutes + cleanup_age_hours: int = 24 + + +class DiarizationService: + """Orchestrates speaker diarization workflows with DB persistence. + + Manages background diarization jobs, speaker renaming, and + coordinates between the diarization engine and persistence layer. + All job state is persisted to database for reliability. + """ + + def __init__( + self, + engine: DiarizationEnginePort, + uow_factory: type[SQLAlchemyUnitOfWork], + settings: DiarizationServiceSettings | None = None, + ) -> None: + """Initialize diarization service. + + Args: + engine: Diarization engine implementation. + uow_factory: Unit of work factory for database access. + settings: Optional service configuration. + """ + self._engine = engine + self._uow_factory = uow_factory + self.settings = settings or DiarizationServiceSettings() + self._active_tasks: dict[str, asyncio.Task] = {} + self._semaphore = asyncio.Semaphore(self.settings.max_concurrent_jobs) + + async def refine_meeting( + self, + meeting_id: MeetingId, + num_speakers: int | None = None, + ) -> str: + """Start diarization refinement job. + + Args: + meeting_id: Meeting to process. + num_speakers: Optional hint for number of speakers. + + Returns: + Job ID for status polling. + + Raises: + ValueError: If meeting not found or has no audio. 
+ """ + job_id = uuid4() + + # Create job in database + async with self._uow_factory() as uow: + # Verify meeting exists + meeting = await uow.meetings.get(meeting_id) + if not meeting: + msg = f"Meeting {meeting_id} not found" + raise ValueError(msg) + + if not meeting.asset_path: + msg = f"Meeting {meeting_id} has no audio file" + raise ValueError(msg) + + # Create job record + from noteflow.infrastructure.persistence.models.core.diarization import ( + DiarizationJobModel, + ) + + job = DiarizationJobModel( + id=job_id, + meeting_id=meeting_id.value if hasattr(meeting_id, "value") else meeting_id, + status="queued", + num_speakers_hint=num_speakers, + ) + await uow.diarization_jobs.create(job) + await uow.commit() + + # Spawn background task + task = asyncio.create_task( + self._run_diarization_job(str(job_id), meeting_id, num_speakers) + ) + self._active_tasks[str(job_id)] = task + + _logger.info( + "Queued diarization job", + extra={ + "job_id": str(job_id), + "meeting_id": str(meeting_id), + "num_speakers_hint": num_speakers, + }, + ) + + return str(job_id) + + async def _run_diarization_job( + self, + job_id: str, + meeting_id: MeetingId, + num_speakers: int | None, + ) -> None: + """Execute diarization job in background with concurrency control.""" + job_uuid = UUID(job_id) + + async with self._semaphore: + try: + # Update status to running + async with self._uow_factory() as uow: + await uow.diarization_jobs.update_status(job_uuid, "running") + await uow.commit() + + # Get audio path + async with self._uow_factory() as uow: + meeting = await uow.meetings.get(meeting_id) + if not meeting or not meeting.asset_path: + raise ValueError("Meeting or audio not found") + audio_path = meeting.asset_path + + _logger.info( + "Starting diarization processing", + extra={"job_id": job_id, "audio_path": audio_path}, + ) + + # Run diarization engine (potentially slow) + turns = await asyncio.wait_for( + self._engine.process_audio( + audio_path=audio_path, + num_speakers=num_speakers, + ), + timeout=self.settings.job_timeout_seconds, + ) + + # Apply results to segments + segments_updated, speaker_ids = await self._apply_diarization( + meeting_id, turns + ) + + # Update job as completed + async with self._uow_factory() as uow: + await uow.diarization_jobs.update_status( + job_uuid, + "completed", + segments_updated=segments_updated, + speaker_ids=speaker_ids, + ) + await uow.commit() + + _logger.info( + "Completed diarization job", + extra={ + "job_id": job_id, + "segments_updated": segments_updated, + "speaker_count": len(speaker_ids), + }, + ) + + except asyncio.TimeoutError: + error_msg = f"Job timed out after {self.settings.job_timeout_seconds}s" + _logger.error("Diarization job timeout", extra={"job_id": job_id}) + await self._fail_job(job_uuid, error_msg) + + except Exception as e: + _logger.exception("Diarization job failed", extra={"job_id": job_id}) + await self._fail_job(job_uuid, str(e)) + + finally: + # Cleanup task reference + self._active_tasks.pop(job_id, None) + + async def _fail_job(self, job_id: UUID, error_message: str) -> None: + """Mark job as failed in database.""" + try: + async with self._uow_factory() as uow: + await uow.diarization_jobs.update_status( + job_id, + "failed", + error_message=error_message, + ) + await uow.commit() + except Exception: + _logger.exception("Failed to update job status", extra={"job_id": str(job_id)}) + + async def _apply_diarization( + self, + meeting_id: MeetingId, + turns: list[tuple[float, float, str]], + ) -> tuple[int, list[str]]: + """Apply 
diarization results to meeting segments. + + Returns: + Tuple of (segments_updated, unique_speaker_ids). + """ + from noteflow.infrastructure.diarization.assigner import assign_speakers_batch + + async with self._uow_factory() as uow: + meeting = await uow.meetings.get(meeting_id) + if not meeting: + return 0, [] + + # Assign speakers to segments + assignments = assign_speakers_batch( + segments=meeting.segments, + turns=turns, + ) + + # Update segments + speaker_ids: set[str] = set() + count = 0 + for segment, speaker_id in assignments: + if speaker_id and segment.speaker_id != speaker_id: + segment.speaker_id = speaker_id + speaker_ids.add(speaker_id) + count += 1 + + await uow.commit() + return count, sorted(speaker_ids) + + async def get_job_status(self, job_id: str) -> DiarizationJobResult: + """Get status of diarization job from database. + + Args: + job_id: Job identifier. + + Returns: + Current job status and results. + """ + try: + job_uuid = UUID(job_id) + except ValueError: + return DiarizationJobResult( + job_id=job_id, + status=JobStatus.FAILED, + segments_updated=0, + speaker_ids=[], + error_message="Invalid job ID format", + ) + + async with self._uow_factory() as uow: + job = await uow.diarization_jobs.get(job_uuid) + + if not job: + return DiarizationJobResult( + job_id=job_id, + status=JobStatus.FAILED, + segments_updated=0, + speaker_ids=[], + error_message="Job not found", + ) + + # Calculate progress estimate + progress = 0 + if job.status == "running": + progress = 50 # Could be enhanced with actual progress tracking + elif job.status == "completed": + progress = 100 + + return DiarizationJobResult( + job_id=job_id, + status=JobStatus(job.status), + segments_updated=job.segments_updated, + speaker_ids=list(job.speaker_ids) if job.speaker_ids else [], + error_message=job.error_message, + progress_percent=progress, + ) + + async def rename_speaker( + self, + meeting_id: MeetingId, + old_speaker_id: str, + new_speaker_name: str, + ) -> int: + """Rename speaker across all segments. + + Args: + meeting_id: Meeting containing segments. + old_speaker_id: Current speaker identifier. + new_speaker_name: New display name. + + Returns: + Number of segments updated. + + Raises: + ValueError: If meeting not found. + """ + async with self._uow_factory() as uow: + meeting = await uow.meetings.get(meeting_id) + if not meeting: + msg = f"Meeting {meeting_id} not found" + raise ValueError(msg) + + count = 0 + for segment in meeting.segments: + if segment.speaker_id == old_speaker_id: + segment.speaker_id = new_speaker_name + count += 1 + + await uow.commit() + + _logger.info( + "Renamed speaker", + extra={ + "meeting_id": str(meeting_id), + "old_speaker": old_speaker_id, + "new_speaker": new_speaker_name, + "segments_updated": count, + }, + ) + + return count + + async def cancel_job(self, job_id: str) -> bool: + """Cancel a running or queued job. + + Args: + job_id: Job to cancel. + + Returns: + True if job was cancelled. 
+ """ + # Cancel active task if running + task = self._active_tasks.get(job_id) + if task and not task.done(): + task.cancel() + + # Update database status + try: + job_uuid = UUID(job_id) + async with self._uow_factory() as uow: + job = await uow.diarization_jobs.get(job_uuid) + if job and job.status in {"queued", "running"}: + await uow.diarization_jobs.update_status( + job_uuid, + "failed", + error_message="Cancelled by user", + ) + await uow.commit() + _logger.info("Cancelled diarization job", extra={"job_id": job_id}) + return True + except Exception: + _logger.exception("Failed to cancel job", extra={"job_id": job_id}) + + return False + + async def recover_jobs(self) -> int: + """Recover jobs that were running when server stopped. + + Called at startup to re-queue interrupted jobs. + + Returns: + Number of jobs recovered. + """ + async with self._uow_factory() as uow: + active_jobs = await uow.diarization_jobs.get_active_jobs() + + recovered = 0 + for job in active_jobs: + if job.status == "running": + # Mark as failed (was interrupted) + await uow.diarization_jobs.update_status( + job.id, + "failed", + error_message="Interrupted by server restart", + ) + recovered += 1 + + await uow.commit() + + if recovered > 0: + _logger.warning( + "Recovered interrupted diarization jobs", + extra={"count": recovered}, + ) + + return recovered + + async def cleanup_old_jobs(self) -> int: + """Clean up completed/failed jobs older than threshold. + + Returns: + Number of jobs deleted. + """ + threshold = utc_now() - JOB_CLEANUP_AGE + + async with self._uow_factory() as uow: + count = await uow.diarization_jobs.cleanup_old_jobs(threshold) + await uow.commit() + + if count > 0: + _logger.info("Cleaned up old diarization jobs", extra={"count": count}) + + return count +``` + +--- + +### Task 4: Update Server Initialization + +**File**: `src/noteflow/grpc/server.py` + +Add to `serve()` function: + +```python +from noteflow.application.services.diarization_service import ( + DiarizationService, + DiarizationServiceSettings, +) +from noteflow.infrastructure.diarization.engine import DiarizationEngine + +# Initialize diarization engine +diarization_engine = DiarizationEngine( + model_path=settings.diarization_model_path, + device=settings.device, +) + +# Create diarization service with DB persistence +diarization_service = DiarizationService( + engine=diarization_engine, + uow_factory=SQLAlchemyUnitOfWork, + settings=DiarizationServiceSettings( + max_concurrent_jobs=settings.max_diarization_jobs, + job_timeout_seconds=settings.diarization_timeout, + ), +) + +# Recover any interrupted jobs from previous run +await diarization_service.recover_jobs() + +# Pass to servicer +servicer = NoteFlowServicer( + # ... existing params ... 
+ diarization_service=diarization_service, +) + +# Schedule periodic cleanup +async def cleanup_loop(): + while True: + await asyncio.sleep(3600) # Every hour + try: + await diarization_service.cleanup_old_jobs() + except Exception: + _logger.exception("Job cleanup failed") + +asyncio.create_task(cleanup_loop()) +``` + +--- + +### Task 5: Simplify gRPC Mixin + +**File**: `src/noteflow/grpc/_mixins/diarization.py` + +Refactor to delegate all logic to service: + +```python +"""Diarization gRPC mixin - thin layer delegating to DiarizationService.""" + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING + +from noteflow.domain.ports.diarization import JobStatus +from noteflow.grpc.proto import noteflow_pb2 + +if TYPE_CHECKING: + import grpc + + from noteflow.grpc._mixins.protocols import ServicerHost + +_logger = logging.getLogger(__name__) + +# Map domain status to proto status +_STATUS_MAP = { + JobStatus.QUEUED: noteflow_pb2.JOB_STATUS_QUEUED, + JobStatus.RUNNING: noteflow_pb2.JOB_STATUS_RUNNING, + JobStatus.COMPLETED: noteflow_pb2.JOB_STATUS_COMPLETED, + JobStatus.FAILED: noteflow_pb2.JOB_STATUS_FAILED, +} + + +class DiarizationMixin: + """Mixin for diarization RPC methods. + + All business logic delegated to DiarizationService. + This mixin only handles proto conversion and error mapping. + """ + + async def RefineSpeakerDiarization( + self: ServicerHost, + request: noteflow_pb2.RefineSpeakerDiarizationRequest, + context: grpc.aio.ServicerContext, + ) -> noteflow_pb2.RefineSpeakerDiarizationResponse: + """Start speaker diarization refinement job.""" + meeting_id = self._parse_meeting_id(request.meeting_id) + num_speakers = request.num_speakers if request.num_speakers > 0 else None + + try: + job_id = await self._diarization_service.refine_meeting( + meeting_id=meeting_id, + num_speakers=num_speakers, + ) + + return noteflow_pb2.RefineSpeakerDiarizationResponse( + job_id=job_id, + status=noteflow_pb2.JOB_STATUS_QUEUED, + ) + + except ValueError as e: + context.set_code(grpc.StatusCode.NOT_FOUND) + context.set_details(str(e)) + return noteflow_pb2.RefineSpeakerDiarizationResponse( + error_message=str(e), + status=noteflow_pb2.JOB_STATUS_FAILED, + ) + + async def GetDiarizationJobStatus( + self: ServicerHost, + request: noteflow_pb2.GetDiarizationJobStatusRequest, + context: grpc.aio.ServicerContext, + ) -> noteflow_pb2.DiarizationJobStatus: + """Get status of diarization job.""" + result = await self._diarization_service.get_job_status(request.job_id) + + return noteflow_pb2.DiarizationJobStatus( + job_id=result.job_id, + status=_STATUS_MAP.get(result.status, noteflow_pb2.JOB_STATUS_UNSPECIFIED), + segments_updated=result.segments_updated, + speaker_ids=result.speaker_ids, + error_message=result.error_message or "", + progress_percent=result.progress_percent, + ) + + async def RenameSpeaker( + self: ServicerHost, + request: noteflow_pb2.RenameSpeakerRequest, + context: grpc.aio.ServicerContext, + ) -> noteflow_pb2.RenameSpeakerResponse: + """Rename speaker across meeting segments.""" + meeting_id = self._parse_meeting_id(request.meeting_id) + + try: + count = await self._diarization_service.rename_speaker( + meeting_id=meeting_id, + old_speaker_id=request.old_speaker_id, + new_speaker_name=request.new_speaker_name, + ) + + return noteflow_pb2.RenameSpeakerResponse( + segments_updated=count, + success=True, + ) + + except ValueError as e: + context.set_code(grpc.StatusCode.NOT_FOUND) + context.set_details(str(e)) + return 
noteflow_pb2.RenameSpeakerResponse(
+                segments_updated=0,
+                success=False,
+            )
+```
+
+---
+
+### Task 6: Frontend Integration
+
+**File**: `client/src/hooks/use-diarization.ts`
+
+```typescript
+import { useState, useCallback, useRef, useEffect } from 'react';
+import { invoke } from '@tauri-apps/api/tauri';
+
+interface DiarizationJobStatus {
+  jobId: string;
+  status: 'queued' | 'running' | 'completed' | 'failed';
+  segmentsUpdated: number;
+  speakerIds: string[];
+  errorMessage?: string;
+  progressPercent: number;
+}
+
+interface UseDiarizationResult {
+  isProcessing: boolean;
+  progress: number;
+  error: string | null;
+  speakerIds: string[];
+  startDiarization: (meetingId: string, numSpeakers?: number) => Promise<void>;
+  renameSpeaker: (meetingId: string, oldId: string, newName: string) => Promise<number>;
+  cancelDiarization: () => Promise<void>;
+}
+
+const POLL_INTERVAL_MS = 1000;
+
+export function useDiarization(): UseDiarizationResult {
+  const [isProcessing, setIsProcessing] = useState(false);
+  const [progress, setProgress] = useState(0);
+  const [error, setError] = useState<string | null>(null);
+  const [speakerIds, setSpeakerIds] = useState<string[]>([]);
+
+  const currentJobId = useRef<string | null>(null);
+  const pollIntervalRef = useRef<ReturnType<typeof setInterval> | null>(null);
+
+  const stopPolling = useCallback(() => {
+    if (pollIntervalRef.current) {
+      clearInterval(pollIntervalRef.current);
+      pollIntervalRef.current = null;
+    }
+  }, []);
+
+  const pollJobStatus = useCallback(async () => {
+    if (!currentJobId.current) return;
+
+    try {
+      const status = await invoke<DiarizationJobStatus>(
+        'get_diarization_job_status',
+        { jobId: currentJobId.current }
+      );
+
+      setProgress(status.progressPercent);
+
+      if (status.status === 'completed') {
+        setIsProcessing(false);
+        setSpeakerIds(status.speakerIds);
+        stopPolling();
+      } else if (status.status === 'failed') {
+        setIsProcessing(false);
+        setError(status.errorMessage || 'Diarization failed');
+        stopPolling();
+      }
+    } catch (err) {
+      setError(err instanceof Error ? err.message : 'Failed to get job status');
+      setIsProcessing(false);
+      stopPolling();
+    }
+  }, [stopPolling]);
+
+  const startDiarization = useCallback(
+    async (meetingId: string, numSpeakers?: number) => {
+      setIsProcessing(true);
+      setProgress(0);
+      setError(null);
+      setSpeakerIds([]);
+
+      try {
+        const response = await invoke<{ jobId: string }>(
+          'refine_speaker_diarization',
+          { meetingId, numSpeakers }
+        );
+
+        currentJobId.current = response.jobId;
+
+        // Start polling for status
+        pollIntervalRef.current = setInterval(pollJobStatus, POLL_INTERVAL_MS);
+      } catch (err) {
+        setIsProcessing(false);
+        setError(err instanceof Error ? err.message : 'Failed to start diarization');
+      }
+    },
+    [pollJobStatus]
+  );
+
+  const renameSpeaker = useCallback(
+    async (meetingId: string, oldId: string, newName: string): Promise<number> => {
+      try {
+        const result = await invoke<{ segmentsUpdated: number }>(
+          'rename_speaker',
+          { meetingId, oldSpeakerId: oldId, newSpeakerName: newName }
+        );
+        return result.segmentsUpdated;
+      } catch (err) {
+        throw new Error(err instanceof Error ? err.message : 'Failed to rename speaker');
+      }
+    },
+    []
+  );
+
+  const cancelDiarization = useCallback(async () => {
+    if (currentJobId.current) {
+      await invoke('cancel_diarization_job', { jobId: currentJobId.current });
+      stopPolling();
+      setIsProcessing(false);
+      setProgress(0);
+      currentJobId.current = null;
+    }
+  }, [stopPolling]);
+
+  // Cleanup on unmount
+  useEffect(() => {
+    return () => {
+      stopPolling();
+    };
+  }, [stopPolling]);
+
+  return {
+    isProcessing,
+    progress,
+    error,
+    speakerIds,
+    startDiarization,
+    renameSpeaker,
+    cancelDiarization,
+  };
+}
+```
+
+**File**: `client/src/components/DiarizationPanel.tsx`
+
+```tsx
+import React, { useState } from 'react';
+import { Button } from '@/components/ui/button';
+import { Progress } from '@/components/ui/progress';
+import { Alert, AlertDescription } from '@/components/ui/alert';
+import { Input } from '@/components/ui/input';
+import { Label } from '@/components/ui/label';
+import { Loader2, Users, X, Check } from 'lucide-react';
+import { useDiarization } from '@/hooks/use-diarization';
+
+interface DiarizationPanelProps {
+  meetingId: string;
+  onComplete?: (speakerIds: string[]) => void;
+}
+
+export function DiarizationPanel({ meetingId, onComplete }: DiarizationPanelProps) {
+  const {
+    isProcessing,
+    progress,
+    error,
+    speakerIds,
+    startDiarization,
+    renameSpeaker,
+    cancelDiarization,
+  } = useDiarization();
+
+  const [numSpeakers, setNumSpeakers] = useState('');
+  const [editingSpeaker, setEditingSpeaker] = useState<string | null>(null);
+  const [newName, setNewName] = useState('');
+
+  const handleStart = async () => {
+    const speakers = numSpeakers ? parseInt(numSpeakers, 10) : undefined;
+    await startDiarization(meetingId, speakers);
+  };
+
+  const handleRename = async (oldId: string) => {
+    if (!newName.trim()) return;
+
+    try {
+      await renameSpeaker(meetingId, oldId, newName.trim());
+      setEditingSpeaker(null);
+      setNewName('');
+    } catch (err) {
+      // Error handled by hook
+    }
+  };
+
+  // Notify parent on completion
+  React.useEffect(() => {
+    if (speakerIds.length > 0 && onComplete) {
+      onComplete(speakerIds);
+    }
+  }, [speakerIds, onComplete]);
+
+  return (
+    <div className="rounded-lg border p-4 space-y-4">
+      <div className="flex items-center gap-2">
+        <Users className="h-5 w-5" />
+        <h3 className="font-semibold">Speaker Diarization</h3>
+      </div>
+
+      {error && (
+        <Alert variant="destructive">
+          <AlertDescription>{error}</AlertDescription>
+        </Alert>
+      )}
+
+      {!isProcessing && speakerIds.length === 0 && (
+        <div className="space-y-2">
+          <div className="flex items-center gap-2">
+            <Label htmlFor="num-speakers">Speakers</Label>
+            <Input
+              id="num-speakers"
+              type="number"
+              value={numSpeakers}
+              onChange={(e) => setNumSpeakers(e.target.value)}
+              className="w-20"
+              placeholder="Auto"
+            />
+          </div>
+          <Button onClick={handleStart}>Start Diarization</Button>
+        </div>
+      )}
+
+      {isProcessing && (
+        <div className="space-y-2">
+          <div className="flex items-center gap-2">
+            <Loader2 className="h-4 w-4 animate-spin" />
+            <span>Processing audio...</span>
+          </div>
+          <Progress value={progress} />
+          <div className="text-sm text-muted-foreground">
+            {progress}% complete
+          </div>
+          <Button variant="outline" onClick={cancelDiarization}>
+            <X className="h-4 w-4 mr-1" />
+            Cancel
+          </Button>
+        </div>
+      )}
+
+      {speakerIds.length > 0 && (
+        <div className="space-y-2">
+          <h4 className="font-medium">Detected Speakers:</h4>
+          <ul className="space-y-1">
+            {speakerIds.map((speakerId) => (
+              <li key={speakerId} className="flex items-center gap-2">
+                {editingSpeaker === speakerId ? (
+                  <>
+                    <Input
+                      value={newName}
+                      onChange={(e) => setNewName(e.target.value)}
+                      placeholder="Enter name"
+                      className="flex-1"
+                      autoFocus
+                    />
+                    <Button size="sm" onClick={() => handleRename(speakerId)}>
+                      <Check className="h-4 w-4" />
+                    </Button>
+                    <Button size="sm" variant="ghost" onClick={() => setEditingSpeaker(null)}>
+                      <X className="h-4 w-4" />
+                    </Button>
+                  </>
+                ) : (
+                  <>
+                    <span className="flex-1">{speakerId}</span>
+                    <Button size="sm" variant="ghost" onClick={() => setEditingSpeaker(speakerId)}>
+                      Rename
+                    </Button>
+                  </>
+                )}
+              </li>
+            ))}
+          </ul>
+          <Button variant="outline" onClick={handleStart}>
+            Re-run Diarization
+          </Button>
+        </div>
+      )}
+    </div>
+ ); +} +``` + +--- + +## Code Segments to Reuse + +### Existing Job Repository + +**Location**: `src/noteflow/infrastructure/persistence/repositories/diarization_job_repo.py` + +Use and extend the existing repository instead of creating in-memory state. + +### Existing Speaker Assignment + +**Location**: `src/noteflow/infrastructure/diarization/assigner.py` + +```python +def assign_speaker(start_time: float, end_time: float, turns: Sequence[...]) -> str | None +def assign_speakers_batch(segments: Sequence[Segment], turns: Sequence[...]) -> list[...] +``` + +### Existing Service Pattern + +**Location**: `src/noteflow/application/services/summarization_service.py` + +Follow the same patterns for settings, factories, logging. + +--- + +## Performance Targets + +| Metric | Target | Measurement | +|--------|--------|-------------| +| Job queue latency | < 100ms | Time from RPC to job created | +| 10-minute audio | < 2 minutes | End-to-end diarization time | +| Status poll latency | < 50ms | Database query time | +| Concurrent jobs | 2 default | Configurable via settings | + +--- + +## Acceptance Criteria + +### Functional + +- [ ] `RefineSpeakerDiarization` RPC creates DB job record +- [ ] `GetDiarizationJobStatus` RPC reads from database +- [ ] `RenameSpeaker` RPC updates all segments +- [ ] Jobs persist across server restarts +- [ ] Interrupted jobs marked as failed on recovery +- [ ] Old jobs cleaned up automatically + +### Technical + +- [ ] No in-memory job state (all in database) +- [ ] Application service follows hexagonal architecture +- [ ] Port interface defined in domain layer +- [ ] Proper concurrency control with semaphore +- [ ] Structured logging with job context + +### Quality Gates + +- [ ] `pytest tests/quality/` passes +- [ ] Module size < 300 lines +- [ ] All public methods documented +- [ ] No `# type: ignore` without justification +- [ ] Test coverage > 80% for new code + +--- + +## Test Plan + +### Unit Tests + +**File**: `tests/application/test_diarization_service.py` + +```python +import pytest +from unittest.mock import AsyncMock, MagicMock, patch +from uuid import uuid4 + +from noteflow.application.services.diarization_service import ( + DiarizationService, + DiarizationServiceSettings, +) +from noteflow.domain.entities.meeting import MeetingId +from noteflow.domain.ports.diarization import JobStatus + + +@pytest.fixture +def mock_engine() -> MagicMock: + """Create mock diarization engine.""" + engine = MagicMock() + engine.is_ready.return_value = True + engine.process_audio = AsyncMock(return_value=[ + (0.0, 5.0, "SPEAKER_00"), + (5.0, 10.0, "SPEAKER_01"), + ]) + return engine + + +@pytest.fixture +def mock_uow(mock_meeting): + """Create mock unit of work.""" + uow = MagicMock() + uow.__aenter__ = AsyncMock(return_value=uow) + uow.__aexit__ = AsyncMock(return_value=None) + uow.meetings.get = AsyncMock(return_value=mock_meeting) + uow.diarization_jobs.create = AsyncMock() + uow.diarization_jobs.get = AsyncMock(return_value=None) + uow.diarization_jobs.update_status = AsyncMock() + uow.commit = AsyncMock() + return uow + + +async def test_refine_meeting_creates_db_job( + mock_engine: MagicMock, + mock_uow: MagicMock, +) -> None: + """Refine meeting creates job in database.""" + mock_uow_factory = MagicMock(return_value=mock_uow) + + service = DiarizationService( + engine=mock_engine, + uow_factory=mock_uow_factory, + ) + + meeting_id = MeetingId(uuid4()) + job_id = await service.refine_meeting(meeting_id) + + assert job_id + 
    mock_uow.diarization_jobs.create.assert_called_once()
+    mock_uow.commit.assert_called()
+
+
+async def test_get_job_status_reads_from_db(mock_engine: MagicMock) -> None:
+    """Job status reads from database, not memory."""
+    job_id = uuid4()
+
+    mock_job = MagicMock()
+    mock_job.status = "completed"
+    mock_job.segments_updated = 10
+    mock_job.speaker_ids = ["SPEAKER_00", "SPEAKER_01"]
+    mock_job.error_message = None
+
+    mock_uow = MagicMock()
+    mock_uow.__aenter__ = AsyncMock(return_value=mock_uow)
+    mock_uow.__aexit__ = AsyncMock(return_value=None)
+    mock_uow.diarization_jobs.get = AsyncMock(return_value=mock_job)
+
+    service = DiarizationService(
+        engine=mock_engine,
+        uow_factory=MagicMock(return_value=mock_uow),
+    )
+
+    result = await service.get_job_status(str(job_id))
+
+    assert result.status == JobStatus.COMPLETED
+    assert result.segments_updated == 10
+    assert len(result.speaker_ids) == 2
+
+
+async def test_rename_speaker_updates_segments(mock_engine: MagicMock) -> None:
+    """Rename speaker updates matching segments."""
+    from noteflow.domain.entities.segment import Segment
+
+    segments = [
+        MagicMock(spec=Segment, speaker_id="SPEAKER_00"),
+        MagicMock(spec=Segment, speaker_id="SPEAKER_01"),
+        MagicMock(spec=Segment, speaker_id="SPEAKER_00"),
+    ]
+
+    mock_meeting = MagicMock()
+    mock_meeting.segments = segments
+
+    mock_uow = MagicMock()
+    mock_uow.__aenter__ = AsyncMock(return_value=mock_uow)
+    mock_uow.__aexit__ = AsyncMock(return_value=None)
+    mock_uow.meetings.get = AsyncMock(return_value=mock_meeting)
+    mock_uow.commit = AsyncMock()
+
+    service = DiarizationService(
+        engine=mock_engine,
+        uow_factory=MagicMock(return_value=mock_uow),
+    )
+
+    count = await service.rename_speaker(
+        meeting_id=MeetingId(uuid4()),
+        old_speaker_id="SPEAKER_00",
+        new_speaker_name="Alice",
+    )
+
+    assert count == 2
+    assert segments[0].speaker_id == "Alice"
+    assert segments[1].speaker_id == "SPEAKER_01"
+    assert segments[2].speaker_id == "Alice"
+```
+
+### Integration Tests
+
+**File**: `tests/integration/test_diarization_workflow.py`
+
+```python
+import asyncio
+
+import pytest
+
+from noteflow.domain.ports.diarization import JobStatus
+
+
+@pytest.mark.integration
+async def test_full_diarization_workflow(
+    grpc_client,
+    meeting_with_audio,
+) -> None:
+    """Full diarization workflow: create job → poll → verify."""
+    # Start refinement
+    response = await grpc_client.refine_speaker_diarization(
+        meeting_id=str(meeting_with_audio.id),
+    )
+
+    assert response.job_id
+
+    # Poll for completion (with timeout)
+    final_status = None
+    for _ in range(60):  # 60 seconds max
+        status = await grpc_client.get_diarization_job_status(response.job_id)
+        if status.status in {JobStatus.COMPLETED, JobStatus.FAILED}:
+            final_status = status
+            break
+        await asyncio.sleep(1)
+
+    assert final_status is not None, "Job did not complete in time"
+    assert final_status.status == JobStatus.COMPLETED
+    assert final_status.segments_updated > 0
+
+
+@pytest.mark.integration
+async def test_job_persists_across_service_restart(
+    grpc_client,
+    meeting_with_audio,
+    restart_service,  # Fixture that restarts the gRPC service
+) -> None:
+    """Jobs are recoverable after service restart."""
+    # Start a job
+    response = await grpc_client.refine_speaker_diarization(
+        meeting_id=str(meeting_with_audio.id),
+    )
+
+    # Restart service
+    await restart_service()
+
+    # Job should still be queryable
+    status = await grpc_client.get_diarization_job_status(response.job_id)
+    assert status.job_id == response.job_id
+```
+
+---
+
+## 
Definition of Done + +- [ ] All acceptance criteria met +- [ ] Quality gates pass (`pytest tests/quality/`) +- [ ] Integration tests pass with real database +- [ ] Performance targets met (measured) +- [ ] CLAUDE.md updated with new service patterns +- [ ] No in-memory state for job tracking +- [ ] Frontend components reviewed and tested +- [ ] Observability verified (logs, metrics) + +--- + +## Dependencies + +- Sprint 0 (Proto & Schema Foundation) for any proto changes +- Existing `DiarizationEngine` in infrastructure +- Existing `SQLAlchemyUnitOfWork` pattern + +## Blocks + +- None + +## Failure Modes + +| Failure | Detection | Recovery | +|---------|-----------|----------| +| Engine model not loaded | `is_ready() == False` | Lazy-load with timeout; return `FAILED_PRECONDITION` if unavailable | +| Audio file missing/corrupt | `FileNotFoundError` or decode error | Mark job failed with descriptive error message | +| Job timeout | `asyncio.TimeoutError` after `job_timeout_seconds` | Mark failed, log audio duration vs timeout ratio | +| Engine OOM | Process crash or memory error | Restart worker, limit concurrent jobs via semaphore | +| DB connection lost | `asyncpg.PostgresError` | Retry with exponential backoff; job remains in `running` state until recovery | +| Orphaned jobs (server restart) | Jobs stuck in `running` state on startup | `recover_jobs()` marks as failed with "interrupted" message | + +### Dead Letter Queue Pattern + +For jobs that fail repeatedly, implement a dead letter queue: + +```python +# In DiarizationService +MAX_JOB_RETRIES = 3 + +async def _fail_job(self, job_id: UUID, error_message: str) -> None: + """Mark job as failed with retry tracking.""" + async with self._uow_factory() as uow: + job = await uow.diarization_jobs.get(job_id) + if job and job.retry_count >= MAX_JOB_RETRIES: + # Move to dead letter state + await uow.diarization_jobs.update_status( + job_id, "dead_letter", error_message=error_message + ) + _logger.error("Job exceeded max retries", extra={"job_id": str(job_id)}) + else: + await uow.diarization_jobs.update_status( + job_id, "failed", error_message=error_message + ) + await uow.commit() +``` + +--- + +## Post-Sprint + +- Add real-time progress streaming (WebSocket/SSE) +- Consider job queuing with Redis for horizontal scaling +- Add diarization quality metrics (speaker confusion rate) +- Support for speaker embeddings and voice fingerprinting diff --git a/docs/sprints/phase-1-core-pipeline/sprint-3-pdf-export/README.md b/docs/sprints/phase-1-core-pipeline/sprint-3-pdf-export/README.md new file mode 100644 index 0000000..4728105 --- /dev/null +++ b/docs/sprints/phase-1-core-pipeline/sprint-3-pdf-export/README.md @@ -0,0 +1,778 @@ +# Sprint 3: PDF Export + +> **Priority**: 3 | **Owner**: Backend | **Complexity**: Low-Medium + +--- + +## Objective + +Add PDF export capability to complement existing Markdown and HTML exports. Users expect to export transcripts as PDF for sharing and archival. 
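+
+The approach is to reuse the existing HTML pipeline: render the transcript to an HTML string and let weasyprint turn it into PDF bytes. A minimal sketch of that path (illustrative content only):
+
+```python
+# Minimal sketch of the HTML-to-PDF path this sprint builds on.
+from weasyprint import HTML
+
+html = "<h1>Team Sync</h1><p><b>Alice</b> [00:00] Hello, welcome.</p>"
+pdf_bytes: bytes = HTML(string=html).write_pdf()  # whole conversion in one call
+
+with open("transcript.pdf", "wb") as fh:
+    fh.write(pdf_bytes)
+```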
+ +--- + +## Current State Analysis + +### What Exists + +| Component | Location | Status | +|-----------|----------|--------| +| Export Protocol | `src/noteflow/infrastructure/export/protocols.py` | `TranscriptExporter` interface | +| Markdown Exporter | `src/noteflow/infrastructure/export/markdown.py` | Working | +| HTML Exporter | `src/noteflow/infrastructure/export/html.py` | Working | +| Formatting Utils | `src/noteflow/infrastructure/export/_formatting.py` | `format_timestamp()`, `format_datetime()` | +| gRPC Mixin | `src/noteflow/grpc/_mixins/export.py` | `ExportTranscript` RPC | +| Proto Enum | `noteflow.proto:420` | `ExportFormat` (MARKDOWN, HTML only) | + +### Gap + +No PDF exporter exists. The `ExportFormat` proto enum lacks `EXPORT_FORMAT_PDF`. + +--- + +## Target/Affected Code + +### Files to Create + +| File | Purpose | Lines Est. | +|------|---------|------------| +| `src/noteflow/infrastructure/export/pdf.py` | PDF exporter class | ~100 | +| `tests/infrastructure/export/test_pdf.py` | Unit tests | ~80 | + +### Files to Modify + +| File | Change Type | Lines Est. | +|------|-------------|------------| +| `src/noteflow/grpc/proto/noteflow.proto` | Add PDF enum value | +1 | +| `src/noteflow/infrastructure/export/__init__.py` | Export `PdfExporter` | +2 | +| `src/noteflow/grpc/_mixins/export.py` | Handle PDF format | +15 | +| `pyproject.toml` | Add weasyprint dependency | +1 | +| `client/src-tauri/src/commands/export.rs` | Handle PDF format | +5 | +| `client/src/pages/MeetingDetail.tsx` | Add PDF button | +5 | + +--- + +## Implementation Tasks + +### Task 1: Add Dependency + +**File**: `pyproject.toml` + +Add to dependencies: +```toml +dependencies = [ + # ... existing ... + "weasyprint>=62.0", +] +``` + +**Note**: weasyprint requires system dependencies (cairo, pango, gdk-pixbuf). Document in README. + +System packages (Ubuntu/Debian): +```bash +apt-get install libpango-1.0-0 libpangocairo-1.0-0 libgdk-pixbuf2.0-0 +``` + +System packages (macOS): +```bash +brew install pango cairo gdk-pixbuf +``` + +--- + +### Task 2: Proto Update + +**File**: `src/noteflow/grpc/proto/noteflow.proto` + +Modify `ExportFormat` enum (around line 420): + +```protobuf +enum ExportFormat { + EXPORT_FORMAT_UNSPECIFIED = 0; + EXPORT_FORMAT_MARKDOWN = 1; + EXPORT_FORMAT_HTML = 2; + EXPORT_FORMAT_PDF = 3; // NEW +} +``` + +Regenerate stubs after change. 
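+
+A quick sanity check that regeneration picked up the new value (sketch; enum numbers per the proto above):
+
+```python
+# After regenerating stubs, the new enum member should round-trip.
+from noteflow.grpc.proto import noteflow_pb2
+
+assert noteflow_pb2.EXPORT_FORMAT_PDF == 3
+assert noteflow_pb2.ExportFormat.Name(3) == "EXPORT_FORMAT_PDF"
+```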
+
+---
+
+### Task 3: Create PDF Exporter
+
+**File**: `src/noteflow/infrastructure/export/pdf.py`
+
+```python
+"""PDF transcript exporter using weasyprint."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from weasyprint import HTML
+
+from noteflow.infrastructure.export._formatting import (
+    format_datetime,
+    format_timestamp,
+)
+from noteflow.infrastructure.export.protocols import TranscriptExporter
+
+if TYPE_CHECKING:
+    from noteflow.domain.entities.meeting import Meeting
+
+# PDF-optimized CSS
+_PDF_CSS = """
+@page {
+    size: A4;
+    margin: 2cm;
+}
+
+body {
+    font-family: 'Helvetica Neue', Arial, sans-serif;
+    font-size: 11pt;
+    line-height: 1.6;
+    color: #333;
+}
+
+h1 {
+    color: #1a1a1a;
+    border-bottom: 2px solid #333;
+    padding-bottom: 8px;
+    margin-bottom: 16px;
+}
+
+h2 {
+    color: #444;
+    margin-top: 24px;
+    margin-bottom: 12px;
+}
+
+.metadata {
+    color: #666;
+    font-size: 10pt;
+    margin-bottom: 20px;
+    padding-bottom: 10px;
+    border-bottom: 1px solid #ddd;
+}
+
+.summary {
+    background-color: #f8f9fa;
+    padding: 16px;
+    border-radius: 4px;
+    margin-bottom: 24px;
+    page-break-inside: avoid;
+}
+
+.summary h2 {
+    color: #2563eb;
+    margin-top: 0;
+}
+
+.key-points {
+    margin: 12px 0;
+}
+
+.key-points li {
+    margin-bottom: 8px;
+}
+
+.action-item {
+    background-color: #fef3c7;
+    padding: 8px 12px;
+    margin: 8px 0;
+    border-left: 3px solid #f59e0b;
+    page-break-inside: avoid;
+}
+
+.segment {
+    margin: 12px 0;
+    padding: 8px 0;
+    border-bottom: 1px solid #eee;
+    page-break-inside: avoid;
+}
+
+.speaker {
+    font-weight: bold;
+    color: #2563eb;
+}
+
+.timestamp {
+    color: #888;
+    font-size: 9pt;
+    margin-left: 8px;
+}
+
+.text {
+    margin-top: 4px;
+}
+"""
+
+
+class PdfExporter(TranscriptExporter):
+    """Export transcripts to PDF format."""
+
+    def export(self, meeting: Meeting) -> bytes:
+        """Export meeting transcript to PDF bytes.
+
+        Args:
+            meeting: Meeting entity with segments and optional summary.
+
+        Returns:
+            PDF document as bytes.
+        """
+        html_content = self._build_html(meeting)
+        pdf_bytes: bytes = HTML(string=html_content).write_pdf()
+        return pdf_bytes
+
+    def _build_html(self, meeting: Meeting) -> str:
+        """Build HTML content for PDF rendering."""
+        title = meeting.title or f"Meeting {meeting.id}"
+        date = format_datetime(meeting.created_at) if meeting.created_at else "Unknown"
+        duration = (
+            format_timestamp(meeting.duration_seconds)
+            if meeting.duration_seconds
+            else "Unknown"
+        )
+
+        # Build segments HTML
+        segments_html = self._build_segments_html(meeting)
+
+        # Build summary HTML
+        summary_html = self._build_summary_html(meeting) if meeting.summary else ""
+
+        return f"""<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="utf-8">
+    <title>{self._escape(title)}</title>
+    <style>{_PDF_CSS}</style>
+</head>
+<body>
+    <h1>{self._escape(title)}</h1>
+    <div class="metadata">
+        Date: {date} | Duration: {duration} | Segments: {len(meeting.segments)}
+    </div>
+    {summary_html}
+    <h2>Transcript</h2>
+    {segments_html}
+</body>
+</html>"""
+
+    def _build_segments_html(self, meeting: Meeting) -> str:
+        """Build HTML for transcript segments."""
+        parts: list[str] = []
+
+        for segment in meeting.segments:
+            speaker = self._escape(segment.speaker_id or "Unknown")
+            timestamp = format_timestamp(segment.start_time)
+            text = self._escape(segment.text)
+
+            parts.append(f"""
+            <div class="segment">
+                <span class="speaker">{speaker}</span>
+                <span class="timestamp">[{timestamp}]</span>
+                <div class="text">{text}</div>
+            </div>
+            """)
+
+        return "\n".join(parts)
+
+    def _build_summary_html(self, meeting: Meeting) -> str:
+        """Build HTML for meeting summary."""
+        summary = meeting.summary
+        if not summary:
+            return ""
+
+        # Executive summary
+        exec_summary = self._escape(summary.executive_summary)
+
+        # Key points
+        key_points_html = ""
+        if summary.key_points:
+            items = "\n".join(
+                f"<li>{self._escape(kp.text)}</li>"
+                for kp in summary.key_points
+            )
+            key_points_html = f"""
+            <div class="key-points">
+                <h3>Key Points</h3>
+                <ul>
+                    {items}
+                </ul>
+            </div>
+            """
+
+        # Action items
+        action_items_html = ""
+        if summary.action_items:
+            items = "\n".join(
+                f'<div class="action-item">{self._escape(ai.text)}</div>'
+                for ai in summary.action_items
+            )
+            action_items_html = f"""
+            <h3>Action Items</h3>
+            {items}
+            """
+
+        return f"""
+        <div class="summary">
+            <h2>Summary</h2>
+            <p>{exec_summary}</p>
+            {key_points_html}
+            {action_items_html}
+        </div>
+        """
+
+    @staticmethod
+    def _escape(text: str) -> str:
+        """Escape HTML special characters."""
+        return (
+            text.replace("&", "&amp;")
+            .replace("<", "&lt;")
+            .replace(">", "&gt;")
+            .replace('"', "&quot;")
+            .replace("'", "&#39;")
+        )
+```
+
+---
+
+### Task 4: Register Exporter
+
+**File**: `src/noteflow/infrastructure/export/__init__.py`
+
+```python
+"""Export infrastructure module."""
+
+from noteflow.infrastructure.export.html import HtmlExporter
+from noteflow.infrastructure.export.markdown import MarkdownExporter
+from noteflow.infrastructure.export.pdf import PdfExporter
+from noteflow.infrastructure.export.protocols import TranscriptExporter
+
+__all__ = [
+    "HtmlExporter",
+    "MarkdownExporter",
+    "PdfExporter",
+    "TranscriptExporter",
+]
+```
+
+---
+
+### Task 5: Update gRPC Mixin
+
+**File**: `src/noteflow/grpc/_mixins/export.py`
+
+Modify `ExportTranscript` to handle PDF:
+
+```python
+from noteflow.infrastructure.export import (
+    HtmlExporter,
+    MarkdownExporter,
+    PdfExporter,
+)
+from noteflow.grpc.proto import noteflow_pb2
+
+# Exporter registry
+_EXPORTERS = {
+    noteflow_pb2.EXPORT_FORMAT_MARKDOWN: (MarkdownExporter, "markdown", ".md"),
+    noteflow_pb2.EXPORT_FORMAT_HTML: (HtmlExporter, "html", ".html"),
+    noteflow_pb2.EXPORT_FORMAT_PDF: (PdfExporter, "pdf", ".pdf"),
+}
+
+
+class ExportMixin:
+    """Mixin for export RPC methods."""
+
+    async def ExportTranscript(
+        self: ServicerHost,
+        request: noteflow_pb2.ExportTranscriptRequest,
+        context: grpc.aio.ServicerContext,
+    ) -> noteflow_pb2.ExportTranscriptResponse:
+        """Export meeting transcript to specified format."""
+        meeting_id = self._parse_meeting_id(request.meeting_id)
+
+        # Get exporter
+        exporter_info = _EXPORTERS.get(request.format)
+        if not exporter_info:
+            context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
+            context.set_details(f"Unsupported format: {request.format}")
+            return noteflow_pb2.ExportTranscriptResponse()
+
+        exporter_class, format_name, extension = exporter_info
+
+        # Fetch meeting
+        async with self._create_repository_provider() as provider:
+            meeting = await provider.meetings.get(meeting_id)
+            if not meeting:
+                context.set_code(grpc.StatusCode.NOT_FOUND)
+                context.set_details(f"Meeting {meeting_id} not found")
+                return noteflow_pb2.ExportTranscriptResponse()
+
+        # Export
+        exporter = exporter_class()
+        result = exporter.export(meeting)
+
+        # Handle bytes vs string
+        if isinstance(result, bytes):
+            # PDF returns bytes - base64 encode for transport
+            import base64
+            content = base64.b64encode(result).decode("ascii")
+        else:
+            content = result
+
+        return noteflow_pb2.ExportTranscriptResponse(
+            content=content,
+            format_name=format_name,
+            file_extension=extension,
+        )
+```
+
+**Note**: For PDF, content is base64-encoded. Frontend must decode.
+
+---
+
+### Task 6: Frontend Updates
+
+**File**: `client/src-tauri/src/commands/export.rs`
+
+```rust
+#[tauri::command]
+pub async fn export_transcript(
+    meeting_id: String,
+    format: String,
+    state: State<'_, AppState>,
+) -> Result<ExportResult, String> {
+    let proto_format = match format.as_str() {
+        "markdown" => proto::ExportFormat::Markdown,
+        "html" => proto::ExportFormat::Html,
+        "pdf" => proto::ExportFormat::Pdf, // NEW
+        _ => return Err(format!("Invalid format: {}", format)),
+    };
+
+    // ... gRPC call ...
+
+    // PDF content stays base64-encoded in `content`; the frontend decodes it
+    // to bytes before writing the file to disk.
+    let content = response.content;
+
+    Ok(ExportResult {
+        content,
+        format_name: response.format_name,
+        file_extension: response.file_extension,
+    })
+}
+```
+
+**File**: `client/src/pages/MeetingDetail.tsx`
+
+Add PDF export button alongside existing exports:
+
+```tsx
+<DropdownMenu>
+  <DropdownMenuTrigger asChild>
+    <Button variant="outline">Export</Button>
+  </DropdownMenuTrigger>
+  <DropdownMenuContent>
+    <DropdownMenuItem onClick={() => handleExport('markdown')}>
+      Markdown (.md)
+    </DropdownMenuItem>
+    <DropdownMenuItem onClick={() => handleExport('html')}>
+      HTML (.html)
+    </DropdownMenuItem>
+    <DropdownMenuItem onClick={() => handleExport('pdf')}>
+      PDF (.pdf)
+    </DropdownMenuItem>
+  </DropdownMenuContent>
+</DropdownMenu>
+```
+
+---
+
+## Code Segments to Reuse
+
+### Existing Formatting Utilities
+
+**Location**: `src/noteflow/infrastructure/export/_formatting.py`
+
+```python
+def format_timestamp(seconds: float) -> str:
+    """Format seconds as MM:SS or HH:MM:SS."""
+
+def format_datetime(dt: datetime) -> str:
+    """Format datetime for display."""
+```
+
+### Existing HTML Exporter Pattern
+
+**Location**: `src/noteflow/infrastructure/export/html.py`
+
+Follow the same structure:
+- `export()` method returning string
+- `_build_*` helper methods
+- CSS embedded in output
+
+### Existing Exporter Protocol
+
+**Location**: `src/noteflow/infrastructure/export/protocols.py`
+
+```python
+class TranscriptExporter(Protocol):
+    """Protocol for transcript exporters."""
+
+    def export(self, meeting: Meeting) -> str:
+        """Export meeting to string format."""
+        ...
+```
+
+**Note**: PDF returns `bytes`, not `str`. Either:
+1. Update protocol to `str | bytes`
+2. Create separate `BinaryExporter` protocol
+
+---
+
+## Acceptance Criteria
+
+### Functional
+
+- [ ] Export dropdown includes PDF option
+- [ ] Clicking PDF export downloads valid PDF file
+- [ ] PDF contains title, date, duration, segment count
+- [ ] PDF contains all transcript segments with speakers/timestamps
+- [ ] PDF contains summary (if present) with key points and action items
+- [ ] PDF renders cleanly on A4 paper
+
+### Technical
+
+- [ ] PDF generation uses weasyprint (not reportlab)
+- [ ] Content properly HTML-escaped to prevent injection
+- [ ] Base64 encoding/decoding works correctly
+- [ ] Error handling for missing weasyprint
+
+### Quality Gates
+
+- [ ] `pytest tests/quality/` passes
+- [ ] Module size < 200 lines
+- [ ] All functions documented
+- [ ] No hardcoded strings (use constants)
+
+---
+
+## Test Plan
+
+### Unit Tests
+
+**File**: `tests/infrastructure/export/test_pdf.py`
+
+```python
+import pytest
+from unittest.mock import MagicMock
+from datetime import datetime, UTC
+from uuid import uuid4
+
+from noteflow.domain.entities.meeting import Meeting, MeetingId, MeetingState
+from noteflow.domain.entities.segment import Segment
+from noteflow.domain.entities.summary import Summary, KeyPoint, ActionItem
+from noteflow.infrastructure.export.pdf import PdfExporter
+
+
+@pytest.fixture
+def meeting_with_segments() -> Meeting:
+    """Create meeting with segments for testing."""
+    return Meeting(
+        id=MeetingId(uuid4()),
+        title="Test Meeting",
+        state=MeetingState.COMPLETED,
+        created_at=datetime.now(UTC),
+        duration_seconds=3600.0,
+        segments=[
+            Segment(
+                segment_id=1,
+                text="Hello, welcome to the meeting.",
+                start_time=0.0,
+                end_time=5.0,
+                speaker_id="Alice",
+            ),
+            Segment(
+                segment_id=2,
+                text="Thank you for joining.",
+                start_time=5.0,
+                end_time=10.0,
+                speaker_id="Bob",
+            ),
+        ],
+    )
+
+
+@pytest.fixture
+def meeting_with_summary(meeting_with_segments: Meeting) -> Meeting:
+    """Add summary to meeting."""
+    meeting_with_segments.summary = Summary(
+        meeting_id=meeting_with_segments.id,
+        executive_summary="This was a productive meeting.",
+        key_points=[
+            KeyPoint(text="Discussed project timeline", segment_ids=[1]),
+        ],
+        action_items=[
+            ActionItem(text="Follow up with client", assignee="Alice", segment_ids=[2]),
+        ],
+        generated_at=datetime.now(UTC),
+    )
+    return meeting_with_segments
+
+
+def test_export_returns_bytes(meeting_with_segments: Meeting) -> None:
+    """PDF export returns bytes."""
+    exporter = PdfExporter()
+
+    result = exporter.export(meeting_with_segments)
+
+    assert isinstance(result, bytes)
+    assert len(result) > 0
+
+
+def test_export_is_valid_pdf(meeting_with_segments: Meeting) -> None:
+    """PDF export produces valid PDF file."""
+    exporter = PdfExporter()
+
+    result = exporter.export(meeting_with_segments)
+
+    # PDF files start with %PDF-
+    assert result.startswith(b"%PDF-")
+
+
+def test_export_includes_title(meeting_with_segments: Meeting) -> None:
+    """PDF contains meeting title."""
+    exporter = PdfExporter()
+
+    # Check HTML content (before PDF conversion)
+    html = exporter._build_html(meeting_with_segments)
+
+    assert "Test Meeting" in html
+
+
+def test_export_includes_segments(meeting_with_segments: Meeting) -> None:
+    """PDF contains all segments."""
+    exporter = PdfExporter()
+
+    html = exporter._build_html(meeting_with_segments)
+
+    assert "Hello, welcome" in html
+    assert "Thank you for joining" in html
+    assert "Alice" in html
+    assert "Bob" in html
+
+
+def test_export_includes_summary(meeting_with_summary: Meeting) -> None:
+    """PDF contains summary when present."""
+    exporter = PdfExporter()
+
+    html = exporter._build_html(meeting_with_summary)
+
+    assert "productive meeting" in html
+    assert "project timeline" in html
+    assert "Follow up with client" in html
+
+
+def test_export_escapes_html_characters(meeting_with_segments: Meeting) -> None:
+    """PDF properly escapes HTML special characters."""
+    meeting_with_segments.segments[0].text = "<script>alert('xss')</script>"
+    exporter = PdfExporter()
+
+    html = exporter._build_html(meeting_with_segments)
+
+    assert "