# noteflow/spikes/spike_03_asr_latency/demo.py

"""Interactive ASR latency demo for Spike 3.

Run with: python -m spikes.spike_03_asr_latency.demo

Features:
- Downloads model on first run (shows progress)
- Generates synthetic audio for testing (or accepts WAV file)
- Displays transcription as it streams
- Shows latency metrics (time-to-first-word, total time)
- Reports memory usage
"""

from __future__ import annotations

import argparse
import logging
import os
import time
import wave
from pathlib import Path

import numpy as np
from numpy.typing import NDArray

from .engine_impl import VALID_MODEL_SIZES, FasterWhisperEngine

# Root logging setup for the demo: INFO and above, each record stamped with
# time, level and the emitting logger's name.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)


def get_memory_usage_mb() -> float:
    """Return the resident set size of this process in MB.

    ``psutil`` is imported lazily; when it is not installed the function
    returns 0.0 instead of failing.
    """
    try:
        import psutil

        rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    except ImportError:
        return 0.0
    return rss_bytes / 1024 / 1024


def generate_silence(duration_seconds: float, sample_rate: int = 16000) -> NDArray[np.float32]:
    """Build an all-zero audio buffer for testing.

    Args:
        duration_seconds: Length of the buffer in seconds.
        sample_rate: Sample rate in Hz.

    Returns:
        Float32 array of ``int(duration_seconds * sample_rate)`` zeros.
    """
    return np.zeros(shape=int(duration_seconds * sample_rate), dtype=np.float32)


def generate_tone(
    duration_seconds: float,
    frequency_hz: float = 440.0,
    sample_rate: int = 16000,
    amplitude: float = 0.3,
) -> NDArray[np.float32]:
    """Generate a sine wave tone for testing.

    Sample ``k`` is evaluated at ``t = k / sample_rate``, so the generated
    tone has exactly the requested frequency at the requested sample rate.

    Args:
        duration_seconds: Duration of tone.
        frequency_hz: Frequency in Hz.
        sample_rate: Sample rate in Hz.
        amplitude: Amplitude (0.0-1.0).

    Returns:
        Float32 array of ``int(duration_seconds * sample_rate)`` sine samples.

    Raises:
        ValueError: If the computed number of samples is negative.
    """
    samples = int(duration_seconds * sample_rate)
    if samples < 0:
        raise ValueError(f"Number of samples, {samples}, must be non-negative.")

    # Build the time axis from sample indices. An endpoint-inclusive linspace
    # over [0, duration] would space samples duration/(samples-1) apart and
    # make the tone slightly sharp. Phase is computed in float64 and only the
    # final waveform is narrowed to float32.
    t = np.arange(samples, dtype=np.float64) / sample_rate
    return (amplitude * np.sin(2 * np.pi * frequency_hz * t)).astype(np.float32)


def load_wav_file(path: Path, target_sample_rate: int = 16000) -> NDArray[np.float32]:
    """Load a mono integer-PCM WAV file and convert it to float32.

    8-bit (unsigned), 16-bit, 24-bit and 32-bit samples are supported and are
    scaled by their full-scale value, so the result lies in roughly
    [-1.0, 1.0]. The audio is NOT resampled: a sample-rate mismatch is only
    logged as a warning and the samples are returned at the file's own rate.

    Args:
        path: Path to WAV file.
        target_sample_rate: Expected sample rate in Hz (used for the warning only).

    Returns:
        Float32 array of audio samples.

    Raises:
        ValueError: If the file is not mono or uses an unsupported sample width.
    """
    import sys  # Function-scope import: only needed to decode 24-bit samples.

    with wave.open(str(path), "rb") as wf:
        channels = wf.getnchannels()
        if channels != 1:
            raise ValueError(f"Expected mono audio, got {channels} channels")

        sample_rate = wf.getframerate()
        if sample_rate != target_sample_rate:
            logger.warning(
                "Sample rate mismatch: expected %d, got %d",
                target_sample_rate,
                sample_rate,
            )

        sample_width = wf.getsampwidth()
        frames = wf.readframes(wf.getnframes())

    if sample_width == 1:
        # 8-bit WAV samples are unsigned with silence at 128.
        unsigned = np.frombuffer(frames, dtype=np.uint8).astype(np.float32)
        return (unsigned - 128.0) / 128.0

    if sample_width == 2:
        audio = np.frombuffer(frames, dtype=np.int16)
        return audio.astype(np.float32) / 32768.0

    if sample_width == 3:
        # No native 24-bit dtype: assemble each sample from its three bytes.
        # The wave module hands frames back in native byte order, so flip the
        # byte columns on big-endian hosts to get least-significant-first.
        octets = np.frombuffer(frames, dtype=np.uint8).reshape(-1, 3).astype(np.int32)
        if sys.byteorder == "big":
            octets = octets[:, ::-1]
        value = octets[:, 0] | (octets[:, 1] << 8) | (octets[:, 2] << 16)
        # Sign-extend from 24 to 32 bits.
        value = (value ^ 0x800000) - 0x800000
        return value.astype(np.float32) / 8388608.0

    if sample_width == 4:
        audio = np.frombuffer(frames, dtype=np.int32)
        return audio.astype(np.float32) / 2147483648.0

    raise ValueError(f"Unsupported sample width: {sample_width}")


class AsrDemo:
    """Interactive ASR demonstration.

    Owns a ``FasterWhisperEngine`` configured for int8 on CPU and prints
    load time, latency and memory figures to stdout.
    """

    # Sample rate (Hz) assumed for every buffer handed to ``transcribe_audio``.
    SAMPLE_RATE = 16000

    def __init__(self, model_size: str = "tiny") -> None:
        """Initialize the demo.

        Args:
            model_size: Model size to use.
        """
        self.model_size = model_size
        self.engine = FasterWhisperEngine(
            compute_type="int8",
            device="cpu",
        )

    def load_model(self) -> float:
        """Load the ASR model and print load time and memory figures.

        Returns:
            Load time in seconds.
        """
        print(f"\n=== Loading Model: {self.model_size} ===")
        mem_before = get_memory_usage_mb()

        start = time.perf_counter()
        self.engine.load_model(self.model_size)
        elapsed = time.perf_counter() - start

        mem_after = get_memory_usage_mb()
        mem_used = mem_after - mem_before

        print(f"  Load time: {elapsed:.2f}s")
        print(f"  Memory before: {mem_before:.1f} MB")
        print(f"  Memory after: {mem_after:.1f} MB")
        print(f"  Memory used: {mem_used:.1f} MB")

        return elapsed

    def transcribe_audio(
        self,
        audio: NDArray[np.float32],
        audio_name: str = "audio",
    ) -> None:
        """Transcribe audio and display results.

        Prints every segment as the engine yields it, then a summary with the
        time to the first result, the total time and the real-time factor
        (processing time divided by audio duration).

        Args:
            audio: Audio samples (float32, 16kHz).
            audio_name: Name for display.
        """
        duration = len(audio) / self.SAMPLE_RATE
        print(f"\n=== Transcribing: {audio_name} ({duration:.2f}s) ===")

        start = time.perf_counter()
        first_result_time: float | None = None
        segment_count = 0

        for result in self.engine.transcribe(audio):
            if first_result_time is None:
                first_result_time = time.perf_counter() - start

            segment_count += 1
            print(f"\n[{result.start:.2f}s - {result.end:.2f}s] {result.text}")

            if result.words:
                print(f"  Words: {len(result.words)}")
                # Show first few words with timing
                for word in result.words[:3]:
                    print(f"    '{word.word}' @ {word.start:.2f}s (conf: {word.probability:.2f})")
                if len(result.words) > 3:
                    print(f"    ... and {len(result.words) - 3} more words")

        total_time = time.perf_counter() - start

        print("\n=== Results ===")
        print(f"  Audio duration: {duration:.2f}s")
        print(f"  Segments found: {segment_count}")
        # Compare against None explicitly: a measured latency of 0.0 is still
        # a result and must not be reported as "No results".
        if first_result_time is not None:
            print(f"  Time to first result: {first_result_time:.3f}s")
        else:
            print("  No results")
        print(f"  Total transcription time: {total_time:.3f}s")

        if duration > 0:
            rtf = total_time / duration
            print(f"  Real-time factor: {rtf:.2f}x")
            if total_time > 0:
                if rtf < 1.0:
                    print("  Status: FASTER than real-time")
                else:
                    print(f"  Status: {rtf:.1f}x slower than real-time")
        else:
            # Zero-length audio: the real-time factor is undefined.
            print("  N/A")

    def demo_with_silence(self, duration: float = 5.0) -> None:
        """Demo with silent audio (should produce no results)."""
        audio = generate_silence(duration)
        self.transcribe_audio(audio, f"silence ({duration}s)")

    def demo_with_tone(self, duration: float = 5.0) -> None:
        """Demo with tone audio (should produce minimal results)."""
        audio = generate_tone(duration)
        self.transcribe_audio(audio, f"440Hz tone ({duration}s)")

    def demo_with_file(self, path: Path) -> None:
        """Demo with a WAV file."""
        print(f"\nLoading WAV file: {path}")
        audio = load_wav_file(path)
        self.transcribe_audio(audio, path.name)

    def run(self, audio_path: Path | None = None) -> None:
        """Run the demo.

        Loads the model, then transcribes ``audio_path`` if it exists;
        otherwise falls back to synthetic silence and tone clips.

        Args:
            audio_path: Optional path to WAV file.
        """
        print("=" * 60)
        print("NoteFlow ASR Demo - Spike 3")
        print("=" * 60)

        # Load model
        self.load_model()

        if audio_path is not None and audio_path.exists():
            # Use provided audio file
            self.demo_with_file(audio_path)
        else:
            # Demo with synthetic audio; say why, so a mistyped path is not
            # silently reported as "no file provided".
            if audio_path is None:
                print("\nNo audio file provided, using synthetic audio...")
            else:
                print(f"\nAudio file not found: {audio_path}, using synthetic audio...")
            self.demo_with_silence(3.0)
            self.demo_with_tone(3.0)

        print("\n=== Demo Complete ===")
        print(f"Final memory usage: {get_memory_usage_mb():.1f} MB")


def main() -> None:
    """Parse command-line options and run the ASR demo.

    With ``--list-models`` the available model sizes are printed and nothing
    else happens; otherwise an ``AsrDemo`` is built for the chosen model and
    run against the optional input WAV file.
    """
    cli = argparse.ArgumentParser(description="ASR Latency Demo - Spike 3")
    cli.add_argument(
        "-m",
        "--model",
        type=str,
        default="tiny",
        choices=list(VALID_MODEL_SIZES),
        help="Model size to use (default: tiny)",
    )
    cli.add_argument(
        "-i",
        "--input",
        type=Path,
        default=None,
        help="Input WAV file to transcribe",
    )
    cli.add_argument(
        "--list-models",
        action="store_true",
        help="List available model sizes and exit",
    )
    options = cli.parse_args()

    if not options.list_models:
        AsrDemo(model_size=options.model).run(audio_path=options.input)
        return

    # Listing mode: one model size per line, indented under the heading.
    print("Available model sizes:")
    for model_name in VALID_MODEL_SIZES:
        print(f"  {model_name}")


# Script entry point: run the demo only when this module is executed directly
# (e.g. via ``python -m``), not when it is imported.
if __name__ == "__main__":
    main()