- Introduced .python-version for Python version management. - Added AGENTS.md for documentation on agent usage and best practices. - Created alembic.ini for database migration configurations. - Implemented main.py as the entry point for the application. - Established pyproject.toml for project dependencies and configurations. - Initialized README.md for project overview. - Generated uv.lock for dependency locking. - Documented milestones and specifications in docs/milestones.md and docs/spec.md. - Created logs/status_line.json for logging status information. - Added initial spike implementations for UI tray hotkeys, audio capture, ASR latency, and encryption validation. - Set up NoteFlow core structure in src/noteflow with necessary modules and services. - Developed test suite in tests directory for application, domain, infrastructure, and integration testing. - Included initial migration scripts in infrastructure/persistence/migrations for database setup. - Established security protocols in infrastructure/security for key management and encryption. - Implemented audio infrastructure for capturing and processing audio data. - Created converters for ASR and ORM in infrastructure/converters. - Added export functionality for different formats in infrastructure/export. - Ensured all new files are included in the repository for future development.
288 lines
8.5 KiB
Python
288 lines
8.5 KiB
Python
"""Interactive ASR latency demo for Spike 3.

Run with: python -m spikes.spike_03_asr_latency.demo

Features:
- Downloads model on first run (shows progress)
- Generates synthetic audio for testing (or accepts WAV file)
- Displays transcription as it streams
- Shows latency metrics (time-to-first-word, total time)
- Reports memory usage
"""

from __future__ import annotations

import argparse
import logging
import os
import time
import wave
from pathlib import Path

import numpy as np
from numpy.typing import NDArray

from .engine_impl import VALID_MODEL_SIZES, FasterWhisperEngine

# Configure root logging once at import time so both this module's messages
# and any records emitted by the ASR engine share one timestamped format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
def get_memory_usage_mb() -> float:
    """Get current process memory usage in MB.

    Returns:
        Resident set size of this process divided by 1024 * 1024, or 0.0
        when the optional ``psutil`` package is not installed.
    """
    # psutil is optional: guard only the import so that unrelated errors
    # raised while querying the process are not accidentally swallowed.
    try:
        import psutil
    except ImportError:
        return 0.0

    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024 / 1024
def generate_silence(duration_seconds: float, sample_rate: int = 16000) -> NDArray[np.float32]:
    """Generate silent audio for testing.

    Args:
        duration_seconds: Duration of silence.
        sample_rate: Sample rate in Hz.

    Returns:
        Float32 array of zeros with ``int(duration_seconds * sample_rate)``
        elements.

    Raises:
        ValueError: If the requested length is negative.
    """
    samples = int(duration_seconds * sample_rate)
    if samples < 0:
        # np.zeros would also raise ValueError, but with an opaque
        # "negative dimensions" message; name the offending inputs instead.
        raise ValueError(
            f"Cannot generate {samples} samples "
            f"(duration_seconds={duration_seconds}, sample_rate={sample_rate})"
        )
    return np.zeros(samples, dtype=np.float32)
def generate_tone(
    duration_seconds: float,
    frequency_hz: float = 440.0,
    sample_rate: int = 16000,
    amplitude: float = 0.3,
) -> NDArray[np.float32]:
    """Generate a sine wave tone for testing.

    Args:
        duration_seconds: Duration of tone.
        frequency_hz: Frequency in Hz.
        sample_rate: Sample rate in Hz.
        amplitude: Amplitude (0.0-1.0).

    Returns:
        Float32 array of sine wave samples with
        ``int(duration_seconds * sample_rate)`` elements.

    Raises:
        ValueError: If the requested length is negative.
    """
    samples = int(duration_seconds * sample_rate)
    if samples < 0:
        raise ValueError(
            f"Cannot generate {samples} samples "
            f"(duration_seconds={duration_seconds}, sample_rate={sample_rate})"
        )

    # Sample k is taken at t = k / sample_rate. An endpoint-inclusive
    # linspace(0, duration, samples) would space samples duration/(samples-1)
    # apart instead, producing a tone slightly below the requested frequency.
    # The time axis is built in float64 so the phase stays accurate for long
    # tones; only the final waveform is narrowed to float32.
    t = np.arange(samples, dtype=np.float64) / sample_rate
    return (amplitude * np.sin(2.0 * np.pi * frequency_hz * t)).astype(np.float32)
def load_wav_file(path: Path, target_sample_rate: int = 16000) -> NDArray[np.float32]:
    """Load a mono PCM WAV file and convert it to float32.

    The audio is NOT resampled: when the file's sample rate differs from
    ``target_sample_rate`` a warning is logged and the samples are returned
    at the file's own rate.

    Args:
        path: Path to WAV file.
        target_sample_rate: Expected sample rate.

    Returns:
        Float32 array of audio samples scaled by the integer full-scale
        value of the source sample width.

    Raises:
        ValueError: If the file is not mono or uses an unsupported sample
            width.
    """
    with wave.open(str(path), "rb") as wf:
        channels = wf.getnchannels()
        if channels != 1:
            raise ValueError(f"Expected mono audio, got {channels} channels")

        sample_rate = wf.getframerate()
        if sample_rate != target_sample_rate:
            logger.warning(
                "Sample rate mismatch: expected %d, got %d",
                target_sample_rate,
                sample_rate,
            )

        sample_width = wf.getsampwidth()
        # Read all frames; decoding happens after the file is closed.
        frames = wf.readframes(wf.getnframes())

    if sample_width == 1:
        # 8-bit WAV PCM is unsigned with the zero level at 128.
        audio = np.frombuffer(frames, dtype=np.uint8)
        return (audio.astype(np.float32) - 128.0) / 128.0
    if sample_width == 2:
        audio = np.frombuffer(frames, dtype=np.int16)
        return audio.astype(np.float32) / 32768.0
    if sample_width == 4:
        audio = np.frombuffer(frames, dtype=np.int32)
        return audio.astype(np.float32) / 2147483648.0
    raise ValueError(f"Unsupported sample width: {sample_width}")
class AsrDemo:
|
|
"""Interactive ASR demonstration."""
|
|
|
|
def __init__(self, model_size: str = "tiny") -> None:
|
|
"""Initialize the demo.
|
|
|
|
Args:
|
|
model_size: Model size to use.
|
|
"""
|
|
self.model_size = model_size
|
|
self.engine = FasterWhisperEngine(
|
|
compute_type="int8",
|
|
device="cpu",
|
|
)
|
|
|
|
def load_model(self) -> float:
|
|
"""Load the ASR model.
|
|
|
|
Returns:
|
|
Load time in seconds.
|
|
"""
|
|
print(f"\n=== Loading Model: {self.model_size} ===")
|
|
mem_before = get_memory_usage_mb()
|
|
|
|
start = time.perf_counter()
|
|
self.engine.load_model(self.model_size)
|
|
elapsed = time.perf_counter() - start
|
|
|
|
mem_after = get_memory_usage_mb()
|
|
mem_used = mem_after - mem_before
|
|
|
|
print(f" Load time: {elapsed:.2f}s")
|
|
print(f" Memory before: {mem_before:.1f} MB")
|
|
print(f" Memory after: {mem_after:.1f} MB")
|
|
print(f" Memory used: {mem_used:.1f} MB")
|
|
|
|
return elapsed
|
|
|
|
def transcribe_audio(
|
|
self,
|
|
audio: NDArray[np.float32],
|
|
audio_name: str = "audio",
|
|
) -> None:
|
|
"""Transcribe audio and display results.
|
|
|
|
Args:
|
|
audio: Audio samples (float32, 16kHz).
|
|
audio_name: Name for display.
|
|
"""
|
|
duration = len(audio) / 16000
|
|
print(f"\n=== Transcribing: {audio_name} ({duration:.2f}s) ===")
|
|
|
|
start = time.perf_counter()
|
|
first_result_time: float | None = None
|
|
segment_count = 0
|
|
|
|
for result in self.engine.transcribe(audio):
|
|
if first_result_time is None:
|
|
first_result_time = time.perf_counter() - start
|
|
|
|
segment_count += 1
|
|
print(f"\n[{result.start:.2f}s - {result.end:.2f}s] {result.text}")
|
|
|
|
if result.words:
|
|
print(f" Words: {len(result.words)}")
|
|
# Show first few words with timing
|
|
for word in result.words[:3]:
|
|
print(f" '{word.word}' @ {word.start:.2f}s (conf: {word.probability:.2f})")
|
|
if len(result.words) > 3:
|
|
print(f" ... and {len(result.words) - 3} more words")
|
|
|
|
total_time = time.perf_counter() - start
|
|
|
|
print("\n=== Results ===")
|
|
print(f" Audio duration: {duration:.2f}s")
|
|
print(f" Segments found: {segment_count}")
|
|
print(f" Time to first result: {first_result_time:.3f}s" if first_result_time else " No results")
|
|
print(f" Total transcription time: {total_time:.3f}s")
|
|
print(f" Real-time factor: {total_time / duration:.2f}x" if duration > 0 else " N/A")
|
|
|
|
if total_time > 0 and duration > 0:
|
|
rtf = total_time / duration
|
|
if rtf < 1.0:
|
|
print(" Status: FASTER than real-time")
|
|
else:
|
|
print(f" Status: {rtf:.1f}x slower than real-time")
|
|
|
|
def demo_with_silence(self, duration: float = 5.0) -> None:
|
|
"""Demo with silent audio (should produce no results)."""
|
|
audio = generate_silence(duration)
|
|
self.transcribe_audio(audio, f"silence ({duration}s)")
|
|
|
|
def demo_with_tone(self, duration: float = 5.0) -> None:
|
|
"""Demo with tone audio (should produce minimal results)."""
|
|
audio = generate_tone(duration)
|
|
self.transcribe_audio(audio, f"440Hz tone ({duration}s)")
|
|
|
|
def demo_with_file(self, path: Path) -> None:
|
|
"""Demo with a WAV file."""
|
|
print(f"\nLoading WAV file: {path}")
|
|
audio = load_wav_file(path)
|
|
self.transcribe_audio(audio, path.name)
|
|
|
|
def run(self, audio_path: Path | None = None) -> None:
|
|
"""Run the demo.
|
|
|
|
Args:
|
|
audio_path: Optional path to WAV file.
|
|
"""
|
|
print("=" * 60)
|
|
print("NoteFlow ASR Demo - Spike 3")
|
|
print("=" * 60)
|
|
|
|
# Load model
|
|
self.load_model()
|
|
|
|
if audio_path and audio_path.exists():
|
|
# Use provided audio file
|
|
self.demo_with_file(audio_path)
|
|
else:
|
|
# Demo with synthetic audio
|
|
print("\nNo audio file provided, using synthetic audio...")
|
|
self.demo_with_silence(3.0)
|
|
self.demo_with_tone(3.0)
|
|
|
|
print("\n=== Demo Complete ===")
|
|
print(f"Final memory usage: {get_memory_usage_mb():.1f} MB")
|
|
|
|
|
|
def main(argv: list[str] | None = None) -> None:
    """Run the ASR demo from the command line.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the script entry point uses.
    """
    parser = argparse.ArgumentParser(description="ASR Latency Demo - Spike 3")
    parser.add_argument(
        "-m",
        "--model",
        type=str,
        default="tiny",
        choices=list(VALID_MODEL_SIZES),
        help="Model size to use (default: tiny)",
    )
    parser.add_argument(
        "-i",
        "--input",
        type=Path,
        default=None,
        help="Input WAV file to transcribe",
    )
    parser.add_argument(
        "--list-models",
        action="store_true",
        help="List available model sizes and exit",
    )
    args = parser.parse_args(argv)

    if args.list_models:
        print("Available model sizes:")
        for size in VALID_MODEL_SIZES:
            print(f" {size}")
        return

    # Fail fast on a mistyped path, before the model is loaded, instead of
    # silently running the synthetic-audio demo.
    if args.input is not None and not args.input.is_file():
        parser.error(f"Input file not found: {args.input}")

    demo = AsrDemo(model_size=args.model)
    demo.run(audio_path=args.input)
# Script entry point: run the demo only when executed directly
# (e.g. via ``python -m``), not when imported.
if __name__ == "__main__":
    main()