Files
noteflow/spikes/spike_03_asr_latency/demo.py
Travis Vasceannie af1285b181 Add initial project structure and files
- Introduced .python-version for Python version management.
- Added AGENTS.md for documentation on agent usage and best practices.
- Created alembic.ini for database migration configurations.
- Implemented main.py as the entry point for the application.
- Established pyproject.toml for project dependencies and configurations.
- Initialized README.md for project overview.
- Generated uv.lock for dependency locking.
- Documented milestones and specifications in docs/milestones.md and docs/spec.md.
- Created logs/status_line.json for logging status information.
- Added initial spike implementations for UI tray hotkeys, audio capture, ASR latency, and encryption validation.
- Set up NoteFlow core structure in src/noteflow with necessary modules and services.
- Developed test suite in tests directory for application, domain, infrastructure, and integration testing.
- Included initial migration scripts in infrastructure/persistence/migrations for database setup.
- Established security protocols in infrastructure/security for key management and encryption.
- Implemented audio infrastructure for capturing and processing audio data.
- Created converters for ASR and ORM in infrastructure/converters.
- Added export functionality for different formats in infrastructure/export.
- Ensured all new files are included in the repository for future development.
2025-12-17 18:28:59 +00:00

288 lines
8.5 KiB
Python

"""Interactive ASR latency demo for Spike 3.
Run with: python -m spikes.spike_03_asr_latency.demo
Features:
- Downloads model on first run (shows progress)
- Generates synthetic audio for testing (or accepts WAV file)
- Displays transcription as it streams
- Shows latency metrics (time to first result, total time, real-time factor)
- Reports memory usage
"""
from __future__ import annotations
import argparse
import logging
import os
import time
import wave
from pathlib import Path
import numpy as np
from numpy.typing import NDArray
from .engine_impl import VALID_MODEL_SIZES, FasterWhisperEngine
# Module-wide logging setup: INFO and above, with timestamp, level and logger name.
_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)
def get_memory_usage_mb() -> float:
    """Return the resident set size of this process in MB.

    Returns:
        RSS reported by ``psutil`` divided by 1024 twice, or ``0.0`` when
        ``psutil`` cannot be imported.
    """
    try:
        import psutil

        rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    except ImportError:
        # psutil is optional; report zero rather than failing the demo.
        return 0.0
    return rss_bytes / 1024 / 1024
def generate_silence(duration_seconds: float, sample_rate: int = 16000) -> NDArray[np.float32]:
    """Build an all-zero audio buffer for testing.

    Args:
        duration_seconds: Length of the buffer in seconds.
        sample_rate: Sample rate in Hz.

    Returns:
        Float32 array holding ``int(duration_seconds * sample_rate)`` zeros.
    """
    n_frames = int(duration_seconds * sample_rate)
    silence: NDArray[np.float32] = np.zeros((n_frames,), dtype=np.float32)
    return silence
def generate_tone(
    duration_seconds: float,
    frequency_hz: float = 440.0,
    sample_rate: int = 16000,
    amplitude: float = 0.3,
) -> NDArray[np.float32]:
    """Generate a sine wave tone for testing.

    Sample ``k`` is taken at time ``k / sample_rate`` so the produced tone has
    exactly the requested frequency at the requested sample rate.

    Args:
        duration_seconds: Duration of tone.
        frequency_hz: Frequency in Hz.
        sample_rate: Sample rate in Hz.
        amplitude: Peak amplitude of the sine wave (0.0-1.0).

    Returns:
        Float32 array of ``int(duration_seconds * sample_rate)`` sine samples.

    Raises:
        ValueError: If the computed number of samples is negative.
    """
    samples = int(duration_seconds * sample_rate)
    if samples < 0:
        raise ValueError(f"Number of samples, {samples}, must be non-negative")
    # Build the time axis from integer sample indices in float64: an
    # endpoint-inclusive linspace would stretch the spacing to
    # duration / (samples - 1), and float32 timestamps lose phase precision.
    t = np.arange(samples, dtype=np.float64) / sample_rate
    return (amplitude * np.sin(2 * np.pi * frequency_hz * t)).astype(np.float32)
def load_wav_file(path: Path, target_sample_rate: int = 16000) -> NDArray[np.float32]:
    """Load a mono PCM WAV file and convert it to float32 samples.

    Integer samples are divided by their full-scale magnitude (32768 for
    16-bit, 2147483648 for 32-bit). The audio is NOT resampled: a sample rate
    that differs from ``target_sample_rate`` only produces a warning and the
    samples are returned at the file's own rate.

    Args:
        path: Path to WAV file.
        target_sample_rate: Expected sample rate in Hz.

    Returns:
        Float32 array of audio samples.

    Raises:
        ValueError: If the file is not mono, or its sample width is neither
            2 nor 4 bytes.
    """
    with wave.open(str(path), "rb") as wf:
        channels = wf.getnchannels()
        if channels != 1:
            raise ValueError(f"Expected mono audio, got {channels} channels")
        sample_rate = wf.getframerate()
        if sample_rate != target_sample_rate:
            logger.warning(
                "Sample rate mismatch: expected %d, got %d",
                target_sample_rate,
                sample_rate,
            )
        frames = wf.readframes(wf.getnframes())
        sample_width = wf.getsampwidth()

    # Sample width in bytes -> (explicit little-endian dtype, full-scale value).
    # WAV PCM data is little-endian regardless of the host byte order, so the
    # dtype must not depend on the native endianness.
    pcm_formats = {
        2: ("<i2", 32768.0),
        4: ("<i4", 2147483648.0),
    }
    if sample_width not in pcm_formats:
        raise ValueError(f"Unsupported sample width: {sample_width}")
    dtype, full_scale = pcm_formats[sample_width]
    audio = np.frombuffer(frames, dtype=dtype)
    return audio.astype(np.float32) / full_scale
class AsrDemo:
    """Interactive ASR demonstration.

    Owns a ``FasterWhisperEngine`` configured for int8 on CPU, times model
    loading and transcription, and prints the measurements to stdout.
    """

    # Sample rate (Hz) assumed for every buffer passed to ``transcribe_audio``.
    SAMPLE_RATE_HZ = 16000

    def __init__(self, model_size: str = "tiny") -> None:
        """Initialize the demo.

        Only constructs the engine; the model is loaded by ``load_model``.

        Args:
            model_size: Model size to use.
        """
        self.model_size = model_size
        self.engine = FasterWhisperEngine(
            compute_type="int8",
            device="cpu",
        )

    def load_model(self) -> float:
        """Load the ASR model and print load time and memory figures.

        Returns:
            Load time in seconds.
        """
        print(f"\n=== Loading Model: {self.model_size} ===")
        mem_before = get_memory_usage_mb()
        start = time.perf_counter()
        self.engine.load_model(self.model_size)
        elapsed = time.perf_counter() - start
        mem_after = get_memory_usage_mb()
        mem_used = mem_after - mem_before
        print(f" Load time: {elapsed:.2f}s")
        print(f" Memory before: {mem_before:.1f} MB")
        print(f" Memory after: {mem_after:.1f} MB")
        print(f" Memory used: {mem_used:.1f} MB")
        return elapsed

    def transcribe_audio(
        self,
        audio: NDArray[np.float32],
        audio_name: str = "audio",
    ) -> None:
        """Transcribe audio and display results.

        Prints every result yielded by the engine (time span, text and up to
        three words with timing and probability), followed by a summary with
        time to first result, total time and real-time factor.

        Args:
            audio: Audio samples (float32, 16kHz).
            audio_name: Name for display.
        """
        duration = len(audio) / self.SAMPLE_RATE_HZ
        print(f"\n=== Transcribing: {audio_name} ({duration:.2f}s) ===")
        start = time.perf_counter()
        first_result_time: float | None = None
        segment_count = 0
        for result in self.engine.transcribe(audio):
            if first_result_time is None:
                # Latency until the engine yields its first result.
                first_result_time = time.perf_counter() - start
            segment_count += 1
            print(f"\n[{result.start:.2f}s - {result.end:.2f}s] {result.text}")
            if result.words:
                print(f" Words: {len(result.words)}")
                # Show first few words with timing
                for word in result.words[:3]:
                    print(f" '{word.word}' @ {word.start:.2f}s (conf: {word.probability:.2f})")
                if len(result.words) > 3:
                    print(f" ... and {len(result.words) - 3} more words")
        total_time = time.perf_counter() - start

        # Real-time factor is undefined for an empty buffer.
        rtf = total_time / duration if duration > 0 else None

        print("\n=== Results ===")
        print(f" Audio duration: {duration:.2f}s")
        print(f" Segments found: {segment_count}")
        # Compare against None: a measured latency of exactly 0.0 is still a result.
        if first_result_time is not None:
            print(f" Time to first result: {first_result_time:.3f}s")
        else:
            print(" No results")
        print(f" Total transcription time: {total_time:.3f}s")
        print(f" Real-time factor: {rtf:.2f}x" if rtf is not None else " N/A")
        if rtf is not None and total_time > 0:
            if rtf < 1.0:
                print(" Status: FASTER than real-time")
            else:
                print(f" Status: {rtf:.1f}x slower than real-time")

    def demo_with_silence(self, duration: float = 5.0) -> None:
        """Demo with silent audio (should produce no results)."""
        audio = generate_silence(duration)
        self.transcribe_audio(audio, f"silence ({duration}s)")

    def demo_with_tone(self, duration: float = 5.0) -> None:
        """Demo with tone audio (should produce minimal results)."""
        audio = generate_tone(duration)
        self.transcribe_audio(audio, f"440Hz tone ({duration}s)")

    def demo_with_file(self, path: Path) -> None:
        """Demo with a WAV file loaded through ``load_wav_file``."""
        print(f"\nLoading WAV file: {path}")
        audio = load_wav_file(path)
        self.transcribe_audio(audio, path.name)

    def run(self, audio_path: Path | None = None) -> None:
        """Run the demo.

        Loads the model, then transcribes ``audio_path`` when it exists.
        Otherwise falls back to 3 seconds of silence and 3 seconds of tone,
        stating whether no file was given or the given file was not found.

        Args:
            audio_path: Optional path to WAV file.
        """
        print("=" * 60)
        print("NoteFlow ASR Demo - Spike 3")
        print("=" * 60)
        # Load model
        self.load_model()
        if audio_path is not None and audio_path.exists():
            # Use provided audio file
            self.demo_with_file(audio_path)
        else:
            # Demo with synthetic audio; say why the fallback was taken so a
            # mistyped path is not reported as "no file provided".
            if audio_path is None:
                print("\nNo audio file provided, using synthetic audio...")
            else:
                print(f"\nAudio file not found: {audio_path}, using synthetic audio...")
            self.demo_with_silence(3.0)
            self.demo_with_tone(3.0)
        print("\n=== Demo Complete ===")
        print(f"Final memory usage: {get_memory_usage_mb():.1f} MB")
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser used by ``main``."""
    cli = argparse.ArgumentParser(description="ASR Latency Demo - Spike 3")
    cli.add_argument(
        "-m",
        "--model",
        type=str,
        default="tiny",
        choices=list(VALID_MODEL_SIZES),
        help="Model size to use (default: tiny)",
    )
    cli.add_argument(
        "-i",
        "--input",
        type=Path,
        default=None,
        help="Input WAV file to transcribe",
    )
    cli.add_argument(
        "--list-models",
        action="store_true",
        help="List available model sizes and exit",
    )
    return cli


def main() -> None:
    """Parse the command line, then list model sizes or run the ASR demo."""
    options = _build_arg_parser().parse_args()

    if not options.list_models:
        AsrDemo(model_size=options.model).run(audio_path=options.input)
        return

    # --list-models: print the known sizes and exit without loading anything.
    print("Available model sizes:")
    for model_size in VALID_MODEL_SIZES:
        print(f" {model_size}")


if __name__ == "__main__":
    main()