import asyncio
# NOTE(review): audioop is deprecated since 3.11 and removed in Python 3.13
# (PEP 594) — presumably this runs on <=3.12; confirm the supported runtime.
import audioop
import base64
import contextlib
import io
import json
import os
import re
import shlex
import shutil
import socket
import subprocess
import sys
import tempfile
import time
import wave
from dataclasses import dataclass
from fractions import Fraction
from pathlib import Path
from typing import TYPE_CHECKING, Any, Awaitable, Callable

from wisper import WisperEvent

if TYPE_CHECKING:
    from supertonic_gateway import SuperTonicGateway

# Optional dependencies: each import is guarded so the module stays importable
# when the package is missing; the *_AVAILABLE flags gate the features below.
try:
    import numpy as np

    NUMPY_AVAILABLE = True
except Exception:  # pragma: no cover - runtime fallback when numpy is unavailable
    np = None  # type: ignore[assignment]
    NUMPY_AVAILABLE = False

try:
    from supertonic import TTS as SupertonicTTS

    SUPERTONIC_TTS_AVAILABLE = True
except Exception:  # pragma: no cover - runtime fallback when supertonic is unavailable
    SupertonicTTS = None  # type: ignore[assignment]
    SUPERTONIC_TTS_AVAILABLE = False

try:
    from faster_whisper import WhisperModel

    FASTER_WHISPER_AVAILABLE = True
except Exception:  # pragma: no cover - runtime fallback when faster-whisper is unavailable
    WhisperModel = None  # type: ignore[assignment]
    FASTER_WHISPER_AVAILABLE = False

try:
    from aiortc import RTCPeerConnection, RTCSessionDescription
    from aiortc.mediastreams import MediaStreamTrack
    from aiortc.sdp import candidate_from_sdp
    from av import AudioFrame

    AIORTC_AVAILABLE = True
except Exception:  # pragma: no cover - runtime fallback when aiortc is unavailable
    RTCPeerConnection = None  # type: ignore[assignment]
    RTCSessionDescription = None  # type: ignore[assignment]
    # object stands in as a harmless base class for the QueueAudioTrack stub.
    MediaStreamTrack = object  # type: ignore[assignment,misc]
    candidate_from_sdp = None  # type: ignore[assignment]
    AudioFrame = None  # type: ignore[assignment]
    AIORTC_AVAILABLE = False

# Matches connection-status chatter that should not be spoken aloud.
SPEECH_FILTER_RE = re.compile(
    r"^(already connected to nanobot|connected to nanobot|disconnected from nanobot|nanobot closed the connection|websocket)",
    re.IGNORECASE,
)
# Matches "agent/nanobot/napbot [is] thinking" status lines.
THINKING_STATUS_RE = re.compile(
    r"\b(?:agent|nanobot|napbot)\b(?:\s+is)?\s+thinking\b",
    re.IGNORECASE,
)
# Speaker-prefix strippers for transcript lines.
USER_PREFIX_RE = re.compile(r"^(?:you|user)\s*:\s*", re.IGNORECASE)
AGENT_PREFIX_RE = re.compile(r"^(?:nanobot|napbot)\b\s*[:>\-]?\s*", re.IGNORECASE)
VOICE_TRANSCRIPT_RE = re.compile(
    r"^(?:wisper\s*:\s*)?voice\s+transcript\s*:\s*",
    re.IGNORECASE,
)
# Terminal artifacts that must never reach the TTS engine.
ANSI_ESCAPE_RE = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")
CONTROL_CHAR_RE = re.compile(r"[\x00-\x1f\x7f]")
BRAILLE_SPINNER_RE = re.compile(r"[\u2800-\u28ff]")
# Whitelist of ASCII characters allowed in TTS input; everything else becomes a space.
TTS_ALLOWED_ASCII = set(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?;:'\"()[]{}@#%&*+-_/<>|"
)
TTS_WORD_RE = re.compile(r"[A-Za-z0-9]")
# Split points (after sentence/clause punctuation) used when retrying TTS in segments.
TTS_RETRY_BREAK_RE = re.compile(r"(?<=[.!?,;:])\s+")
TTS_PARTIAL_COMMIT_RE = re.compile(r"[.!?]\s*$|[,;:]\s+$")
LOCAL_ICE_GATHER_TIMEOUT_S = 0.35


def _sanitize_tts_text(text: str) -> str:
    """Strip terminal escapes, spinners, and non-whitelisted characters for TTS.

    Returns "" when nothing alphanumeric survives, so callers can skip
    synthesis of punctuation-only residue.
    """
    cleaned = ANSI_ESCAPE_RE.sub(" ", text)
    cleaned = BRAILLE_SPINNER_RE.sub(" ", cleaned)
    cleaned = cleaned.replace("\u00a0", " ")
    cleaned = cleaned.replace("•", " ")
    cleaned = CONTROL_CHAR_RE.sub(" ", cleaned)
    # Non-ASCII whitespace survives via ch.isspace(); it is collapsed below.
    cleaned = "".join(ch if (ch in TTS_ALLOWED_ASCII or ch.isspace()) else " " for ch in cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    if not TTS_WORD_RE.search(cleaned):
        return ""
    return cleaned


def _split_tts_retry_segments(text: str, max_chars: int = 120) -> list[str]:
    """Break sanitized text into >=2 smaller segments for TTS retry.

    Prefers punctuation boundaries; falls back to word-wrapping at
    ``max_chars``. Returns [] when the text cannot be split further
    (empty, a single word, or only one resulting segment).
    """
    clean = _sanitize_tts_text(text)
    if not clean:
        return []
    parts = [part.strip() for part in TTS_RETRY_BREAK_RE.split(clean) if part.strip()]
    if len(parts) <= 1:
        # No punctuation breaks: greedily pack words up to max_chars per segment.
        words = clean.split()
        if len(words) <= 1:
            return []
        parts = []
        current = words[0]
        for word in words[1:]:
            candidate = f"{current} {word}"
            if len(candidate) <= max_chars:
                current = candidate
                continue
            parts.append(current)
            current = word
        parts.append(current)
    compact_parts = [_sanitize_tts_text(part) for part in parts]
    compact_parts = [part for part in compact_parts if part]
    if len(compact_parts) <= 1:
        return []
    return compact_parts


def _coerce_message_metadata(raw: Any) -> dict[str, Any]:
    """Coerce arbitrary metadata into a JSON-safe dict.

    Keeps only None/str/int/float/bool scalars, dicts (string keys), and
    lists (first 50 items), recursing at most 7 levels deep. Anything else
    is dropped. Non-dict input yields {}.
    """
    def _coerce_jsonish(value: Any, depth: int = 0) -> Any:
        if depth > 6:
            return None
        if value is None or isinstance(value, (str, int, float, bool)):
            return value
        if isinstance(value, dict):
            cleaned_dict: dict[str, Any] = {}
            for key, item in value.items():
                cleaned_item = _coerce_jsonish(item, depth + 1)
                if cleaned_item is not None:
                    cleaned_dict[str(key)] = cleaned_item
            return cleaned_dict
        if isinstance(value, list):
            # Cap list size to bound the output.
            cleaned_list: list[Any] = []
            for item in value[:50]:
                cleaned_item = _coerce_jsonish(item, depth + 1)
                if cleaned_item is not None:
                    cleaned_list.append(cleaned_item)
            return cleaned_list
        return None

    if not isinstance(raw, dict):
        return {}
    cleaned: dict[str, Any] = {}
    for key, value in raw.items():
        cleaned_value = _coerce_jsonish(value)
        if cleaned_value is not None:
            cleaned[str(key)] = cleaned_value
    return cleaned


def _optional_int_env(name: str) -> int | None:
    """Return the env var as int, or None when unset/blank.

    NOTE(review): int() raises ValueError on non-numeric values — confirm
    callers tolerate that (or that these vars are trusted config).
    """
    raw_value = os.getenv(name, "").strip()
    if not raw_value:
        return None
    return int(raw_value)


@dataclass(slots=True)
class PCMChunk:
    """A chunk of signed 16-bit PCM audio."""

    pcm: bytes
    sample_rate: int
    channels: int = 1


@dataclass(slots=True)
class STTSegment:
    """A buffered utterance awaiting transcription, with caller metadata."""

    pcm: bytes
    metadata: dict[str, Any]


if AIORTC_AVAILABLE:

    class QueueAudioTrack(MediaStreamTrack):
        """Outbound aiortc audio track fed from an asyncio queue of PCM frames.

        PCM enqueued via enqueue_pcm() is downmixed/resampled to the track's
        mono s16 format, sliced into fixed-size frames, and delivered by
        recv() paced to real time. Silence frames are emitted while the
        queue is empty.
        """

        kind = "audio"

        def __init__(self, sample_rate: int = 48_000, frame_ms: int = 20) -> None:
            super().__init__()
            self._sample_rate = sample_rate
            self._frame_ms = max(1, frame_ms)
            self._samples_per_frame = max(1, (sample_rate * frame_ms) // 1000)
            # s16 mono -> 2 bytes per sample.
            self._bytes_per_frame = self._samples_per_frame * 2
            self._queue: asyncio.Queue[bytes] = asyncio.Queue()
            self._timestamp = 0
            # audioop.ratecv carry-over state, valid only for a fixed source rate.
            self._resample_state = None
            self._resample_source_rate: int | None = None
            # Silence padding inserted before audio after an idle gap.
            self._lead_in_ms = max(0, int(os.getenv("HOST_RTC_OUTBOUND_LEAD_IN_MS", "120")))
            self._lead_in_frames = (self._lead_in_ms + self._frame_ms - 1) // self._frame_ms
            self._lead_in_idle_s = max(0.1, float(os.getenv("HOST_RTC_OUTBOUND_IDLE_S", "0.6")))
            self._last_enqueue_at = 0.0
            self._closed = False
            self._frame_duration_s = frame_ms / 1000.0
            self._last_recv_at = 0.0
            self._playing = False
            self._idle_frames = 0
            # Number of consecutive silent frames before signalling idle.
            # At 20ms per frame, 15 frames = 300ms grace period to avoid
            # flickering between TTS synthesis chunks.
            self._idle_grace_frames = max(
                1, int(os.getenv("HOST_RTC_IDLE_GRACE_MS", "300")) // max(1, frame_ms)
            )
            self._on_playing_changed: Callable[[bool], None] | None = None

        async def enqueue_pcm(self, pcm: bytes, sample_rate: int, channels: int = 1) -> None:
            """Queue PCM for playback, converting it to the track's mono rate.

            Prepends lead-in silence when the queue is empty and the track
            has been idle longer than the configured threshold.
            """
            if self._closed or not pcm:
                return
            now = asyncio.get_running_loop().time()
            should_add_lead_in = (
                self._lead_in_frames > 0
                and self._queue.empty()
                and (
                    self._last_enqueue_at <= 0.0
                    or (now - self._last_enqueue_at) >= self._lead_in_idle_s
                )
            )
            if should_add_lead_in:
                silence = b"\x00" * self._bytes_per_frame
                for _index in range(self._lead_in_frames):
                    await self._queue.put(silence)
            mono = pcm
            if channels > 1:
                mono = audioop.tomono(mono, 2, 0.5, 0.5)
            if sample_rate != self._sample_rate:
                # audioop rate conversion state is only valid when source/destination rates stay the same.
                if self._resample_source_rate != sample_rate:
                    self._resample_state = None
                    self._resample_source_rate = sample_rate
                mono, self._resample_state = audioop.ratecv(
                    mono,
                    2,
                    1,
                    sample_rate,
                    self._sample_rate,
                    self._resample_state,
                )
            else:
                self._resample_state = None
                self._resample_source_rate = None
            if not mono:
                return
            # Slice into fixed-size frames, zero-padding the final partial frame.
            for start in range(0, len(mono), self._bytes_per_frame):
                chunk = mono[start : start + self._bytes_per_frame]
                if len(chunk) < self._bytes_per_frame:
                    chunk += b"\x00" * (self._bytes_per_frame - len(chunk))
                await self._queue.put(chunk)
            self._last_enqueue_at = now

        async def recv(self) -> AudioFrame:
            """Return the next audio frame, paced to real time.

            Emits silence when no audio is queued and fires the
            _on_playing_changed callback on playing/idle transitions.
            """
            if self._closed:
                raise asyncio.CancelledError
            # Pace frame delivery to real-time to prevent RTP burst sends.
            # Without pacing, when TTS enqueues audio faster than real-time,
            # aiortc sends RTP packets in a burst and the browser's jitter
            # buffer skips ahead, causing the user to only hear the tail end.
            loop = asyncio.get_running_loop()
            now = loop.time()
            if self._last_recv_at > 0.0:
                elapsed = now - self._last_recv_at
                remaining = self._frame_duration_s - elapsed
                if remaining > 0.001:
                    await asyncio.sleep(remaining)
            try:
                payload = self._queue.get_nowait()
                has_audio = True
            except asyncio.QueueEmpty:
                payload = b"\x00" * self._bytes_per_frame
                has_audio = False
            # Notify when playback state changes.
            if has_audio:
                self._idle_frames = 0
                if not self._playing:
                    self._playing = True
                    if self._on_playing_changed:
                        self._on_playing_changed(True)
            elif self._playing:
                # Only declare idle after a grace period of silent frames.
                self._idle_frames += 1
                if self._idle_frames >= self._idle_grace_frames:
                    self._playing = False
                    if self._on_playing_changed:
                        self._on_playing_changed(False)
            self._last_recv_at = loop.time()
            frame = AudioFrame(format="s16", layout="mono", samples=self._samples_per_frame)
            frame.planes[0].update(payload)
            frame.sample_rate = self._sample_rate
            frame.time_base = Fraction(1, self._sample_rate)
            frame.pts = self._timestamp
            self._timestamp += self._samples_per_frame
            return frame

        def stop(self) -> None:
            """Mark the track closed and stop the underlying MediaStreamTrack."""
            self._closed = True
            super().stop()

        def clear(self) -> None:
            """Drop all queued audio and reset playback state to idle."""
            while True:
                try:
                    self._queue.get_nowait()
                except asyncio.QueueEmpty:
                    break
            self._last_enqueue_at = 0.0
            self._idle_frames = 0
            if self._playing:
                self._playing = False
                if self._on_playing_changed:
                    self._on_playing_changed(False)

else:

    class QueueAudioTrack:  # pragma: no cover - used only when aiortc is unavailable
        """No-op stand-in so callers need not check AIORTC_AVAILABLE."""

        _on_playing_changed: Callable[[bool], None] | None = None

        async def enqueue_pcm(self, pcm: bytes, sample_rate: int, channels: int = 1) -> None:
            return

        def stop(self) -> None:
            return

        def clear(self) -> None:
            return


def _write_temp_wav(pcm: bytes, sample_rate: int, channels: int) -> str:
    """Write s16 PCM into a temporary WAV file and return its path.

    The caller is responsible for deleting the file (delete=False).
    """
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
        tmp_path = tmp_file.name
    with wave.open(tmp_path, "wb") as wav_file:
        wav_file.setnchannels(max(1, channels))
        wav_file.setsampwidth(2)  # 16-bit samples
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(pcm)
    return tmp_path
class CommandSpeechToText:
    """STT provider that shells out to a user-configured command.

    The command template comes from HOST_STT_COMMAND; a temp WAV path is
    substituted for "{input_wav}" (or appended when the placeholder is
    absent) and the transcript is read from stdout.
    """

    def __init__(self) -> None:
        self._command_template = os.getenv("HOST_STT_COMMAND", "").strip()

    @property
    def enabled(self) -> bool:
        return bool(self._command_template)

    async def transcribe_pcm(
        self, pcm: bytes, sample_rate: int = 16_000, channels: int = 1
    ) -> str | None:
        """Transcribe s16 PCM via the configured command; None when disabled/empty."""
        if not self.enabled or not pcm:
            return None
        return await asyncio.to_thread(self._transcribe_blocking, pcm, sample_rate, channels)

    def unavailable_reason(self) -> str:
        """Human-readable explanation for why no transcript was produced."""
        if not self._command_template:
            return "HOST_STT_COMMAND is not configured."
        return "HOST_STT_COMMAND failed to produce transcript."

    def _transcribe_blocking(self, pcm: bytes, sample_rate: int, channels: int) -> str | None:
        """Blocking worker: write WAV, run the command, return stdout transcript.

        Raises RuntimeError on a non-zero exit. The temp WAV is always removed.
        NOTE(review): shell=True executes operator-supplied HOST_STT_COMMAND;
        the WAV path is shlex-quoted, but the template itself must be trusted.
        """
        tmp_path: str | None = None
        try:
            tmp_path = _write_temp_wav(pcm=pcm, sample_rate=sample_rate, channels=channels)
            command = self._command_template
            if "{input_wav}" in command:
                command = command.replace("{input_wav}", shlex.quote(tmp_path))
            else:
                command = f"{command} {shlex.quote(tmp_path)}"
            result = subprocess.run(
                command,
                shell=True,
                capture_output=True,
                text=True,
                check=False,
            )
            if result.returncode != 0:
                stderr = result.stderr.strip() or "unknown error"
                raise RuntimeError(f"STT command failed: {stderr}")
            transcript = result.stdout.strip()
            return transcript or None
        finally:
            if tmp_path and os.path.exists(tmp_path):
                with contextlib.suppress(OSError):
                    os.unlink(tmp_path)


class FasterWhisperSpeechToText:
    """STT provider backed by faster-whisper, configured via HOST_STT_* env vars.

    The model is loaded lazily on first use (or via warmup()); a single
    asyncio lock serializes model init and transcription.
    """

    def __init__(self) -> None:
        self._model_name = os.getenv("HOST_STT_MODEL", "tiny.en").strip() or "tiny.en"
        self._device = os.getenv("HOST_STT_DEVICE", "auto").strip() or "auto"
        self._compute_type = os.getenv("HOST_STT_COMPUTE_TYPE", "int8").strip() or "int8"
        self._language = os.getenv("HOST_STT_LANGUAGE", "en").strip()
        self._beam_size = max(1, int(os.getenv("HOST_STT_BEAM_SIZE", "1")))
        self._best_of = max(1, int(os.getenv("HOST_STT_BEST_OF", "1")))
        # VAD filtering is off by default; any value outside this set enables it.
        self._vad_filter = os.getenv("HOST_STT_VAD_FILTER", "0").strip() not in {
            "0",
            "false",
            "False",
            "no",
            "off",
        }
        self._temperature = float(os.getenv("HOST_STT_TEMPERATURE", "0.0"))
        self._log_prob_threshold = float(os.getenv("HOST_STT_LOG_PROB_THRESHOLD", "-1.0"))
        self._no_speech_threshold = float(os.getenv("HOST_STT_NO_SPEECH_THRESHOLD", "0.6"))
        self._compression_ratio_threshold = float(
            os.getenv("HOST_STT_COMPRESSION_RATIO_THRESHOLD", "2.4")
        )
        self._initial_prompt = (
            os.getenv(
                "HOST_STT_INITIAL_PROMPT",
                "Transcribe brief spoken English precisely. Prefer common words over sound effects.",
            ).strip()
            or None
        )
        self._repetition_penalty = float(os.getenv("HOST_STT_REPETITION_PENALTY", "1.0"))
        raw_hallucination_threshold = os.getenv(
            "HOST_STT_HALLUCINATION_SILENCE_THRESHOLD", ""
        ).strip()
        self._hallucination_silence_threshold: float | None = (
            float(raw_hallucination_threshold) if raw_hallucination_threshold else None
        )
        self._model: Any = None
        self._init_error: str | None = None
        self._lock = asyncio.Lock()

    @property
    def enabled(self) -> bool:
        return FASTER_WHISPER_AVAILABLE and WhisperModel is not None

    @property
    def init_error(self) -> str | None:
        # Last model-initialization failure, if any.
        return self._init_error

    async def transcribe_pcm(
        self, pcm: bytes, sample_rate: int = 16_000, channels: int = 1
    ) -> str | None:
        """Transcribe s16 PCM; None when disabled, empty, or no speech found."""
        if not self.enabled or not pcm:
            return None
        async with self._lock:
            return await asyncio.to_thread(self._transcribe_blocking, pcm, sample_rate, channels)

    async def warmup(self) -> None:
        """Eagerly load the model so the first utterance is not delayed."""
        if not self.enabled:
            return
        async with self._lock:
            await asyncio.to_thread(self._initialize_blocking)

    def _initialize_blocking(self) -> None:
        """Load the WhisperModel once; failures are recorded in _init_error."""
        if self._model is not None:
            return
        if not self.enabled or WhisperModel is None:
            return
        try:
            self._model = WhisperModel(
                self._model_name,
                device=self._device,
                compute_type=self._compute_type,
            )
            self._init_error = None
        except Exception as exc:
            self._init_error = str(exc)
            self._model = None

    def _transcribe_blocking(self, pcm: bytes, sample_rate: int, channels: int) -> str | None:
        """Blocking worker: run faster-whisper over the PCM and join segment texts.

        Prefers the in-memory numpy path (downmix + resample to 16 kHz float32);
        falls back to writing a temp WAV when numpy is unavailable.
        Raises RuntimeError when model initialization previously failed.
        """
        self._initialize_blocking()
        if self._model is None:
            if self._init_error:
                raise RuntimeError(f"faster-whisper initialization failed: {self._init_error}")
            return None
        if NUMPY_AVAILABLE and np is not None:
            mono = pcm
            if channels > 1:
                mono = audioop.tomono(mono, 2, 0.5, 0.5)
            if sample_rate != 16_000:
                # One-shot resample; no carry-over state is needed here.
                mono, _ = audioop.ratecv(
                    mono,
                    2,
                    1,
                    sample_rate,
                    16_000,
                    None,
                )
            # Normalize s16 to float32 in [-1, 1) as faster-whisper expects.
            audio = np.frombuffer(mono, dtype=np.int16).astype(np.float32) / 32768.0
            if audio.size == 0:
                return None
            segments, _info = self._model.transcribe(
                audio,
                language=self._language or None,
                beam_size=self._beam_size,
                best_of=self._best_of,
                vad_filter=self._vad_filter,
                condition_on_previous_text=False,
                without_timestamps=True,
                initial_prompt=self._initial_prompt,
                temperature=self._temperature,
                log_prob_threshold=self._log_prob_threshold,
                no_speech_threshold=self._no_speech_threshold,
                compression_ratio_threshold=self._compression_ratio_threshold,
                repetition_penalty=self._repetition_penalty,
                hallucination_silence_threshold=self._hallucination_silence_threshold,
            )
            transcript_parts: list[str] = []
            for segment in segments:
                text = str(getattr(segment, "text", "")).strip()
                if text:
                    transcript_parts.append(text)
            transcript = " ".join(transcript_parts).strip()
            return transcript or None
        # Fallback path: no numpy, so hand the model a temp WAV file instead.
        tmp_path: str | None = None
        try:
            tmp_path = _write_temp_wav(pcm=pcm, sample_rate=sample_rate, channels=channels)
            segments, _info = self._model.transcribe(
                tmp_path,
                language=self._language or None,
                beam_size=self._beam_size,
                best_of=self._best_of,
                vad_filter=self._vad_filter,
                condition_on_previous_text=False,
                without_timestamps=True,
                initial_prompt=self._initial_prompt,
                temperature=self._temperature,
                log_prob_threshold=self._log_prob_threshold,
                no_speech_threshold=self._no_speech_threshold,
                compression_ratio_threshold=self._compression_ratio_threshold,
                repetition_penalty=self._repetition_penalty,
                hallucination_silence_threshold=self._hallucination_silence_threshold,
            )
            transcript_parts: list[str] = []
            for segment in segments:
                text = str(getattr(segment, "text", "")).strip()
                if text:
                    transcript_parts.append(text)
            transcript = " ".join(transcript_parts).strip()
            return transcript or None
        finally:
            if tmp_path and os.path.exists(tmp_path):
                with contextlib.suppress(OSError):
                    os.unlink(tmp_path)


class HostSpeechToText:
    """Facade that routes STT to faster-whisper, a shell command, or both.

    HOST_STT_PROVIDER selects "faster-whisper", "command", or "auto"
    (auto tries faster-whisper first, then the command fallback).
    Unknown values are treated as "auto".
    """

    def __init__(self) -> None:
        provider = (
            os.getenv("HOST_STT_PROVIDER", "faster-whisper").strip() or "faster-whisper"
        ).lower()
        if provider not in {"faster-whisper", "command", "auto"}:
            provider = "auto"
        self._provider = provider
        self._faster_whisper = FasterWhisperSpeechToText()
        self._command = CommandSpeechToText()

    @property
    def enabled(self) -> bool:
        if self._provider == "faster-whisper":
            return self._faster_whisper.enabled
        if self._provider == "command":
            return self._command.enabled
        return self._faster_whisper.enabled or self._command.enabled

    async def transcribe_pcm(
        self, pcm: bytes, sample_rate: int = 16_000, channels: int = 1
    ) -> str | None:
        """Transcribe PCM using the selected provider chain; None on failure."""
        if self._provider in {"faster-whisper", "auto"}:
            transcript = await self._faster_whisper.transcribe_pcm(
                pcm=pcm,
                sample_rate=sample_rate,
                channels=channels,
            )
            if transcript:
                return transcript
            if self._provider == "faster-whisper":
                return None
        if self._provider in {"command", "auto"}:
            return await self._command.transcribe_pcm(
                pcm=pcm,
                sample_rate=sample_rate,
                channels=channels,
            )
        return None

    async def warmup(self) -> None:
        """Pre-load faster-whisper when it participates in the provider chain."""
        if self._provider in {"faster-whisper", "auto"}:
            await self._faster_whisper.warmup()

    def unavailable_reason(self) -> str:
        """Best-effort diagnostic for why transcription is not working."""
        if self._provider == "faster-whisper":
            if not self._faster_whisper.enabled:
                return "faster-whisper package is not available."
            if self._faster_whisper.init_error:
                return f"faster-whisper initialization failed: {self._faster_whisper.init_error}"
            return "faster-whisper did not return transcript."
        if self._provider == "command":
            return self._command.unavailable_reason()
        # "auto": report the most specific failure we know about.
        if self._faster_whisper.init_error:
            return f"faster-whisper initialization failed: {self._faster_whisper.init_error}"
        if self._command.enabled:
            return "HOST_STT_COMMAND failed to produce transcript."
        if not self._faster_whisper.enabled:
            return "faster-whisper package is not available."
        return "No STT provider is configured."


class SupertonicTextToSpeech:
    """TTS provider backed by the supertonic package (requires numpy).

    The engine and voice style are loaded lazily; SUPERTONIC_* env vars
    configure model, voice, language, step count, speed, and ORT threads.
    """

    def __init__(self) -> None:
        self._model = os.getenv("SUPERTONIC_MODEL", "supertonic-2").strip() or "supertonic-2"
        self._voice_style_name = os.getenv("SUPERTONIC_VOICE_STYLE", "F1").strip() or "F1"
        self._lang = os.getenv("SUPERTONIC_LANG", "en").strip() or "en"
        self._total_steps = int(os.getenv("SUPERTONIC_TOTAL_STEPS", "4"))
        self._speed = float(os.getenv("SUPERTONIC_SPEED", "1.5"))
        self._intra_op_num_threads = _optional_int_env("SUPERTONIC_INTRA_OP_THREADS")
        self._inter_op_num_threads = _optional_int_env("SUPERTONIC_INTER_OP_THREADS")
        # Auto-download is on by default; any value outside this set keeps it on.
        self._auto_download = os.getenv("SUPERTONIC_AUTO_DOWNLOAD", "1").strip() not in {
            "0",
            "false",
            "False",
            "no",
            "off",
        }
        self._engine: Any = None
        self._voice_style: Any = None
        self._init_error: str | None = None
        self._lock = asyncio.Lock()

    @property
    def enabled(self) -> bool:
        return SUPERTONIC_TTS_AVAILABLE and SupertonicTTS is not None and NUMPY_AVAILABLE

    @property
    def init_error(self) -> str | None:
        # Last engine-initialization failure, if any.
        return self._init_error

    async def synthesize(self, text: str) -> PCMChunk | None:
        """Synthesize speech for text; None when disabled or text is blank."""
        if not self.enabled:
            return None
        clean_text = " ".join(text.split())
        if not clean_text:
            return None
        async with self._lock:
            return await asyncio.to_thread(self._synthesize_blocking, clean_text)

    def _synthesize_blocking(self, text: str) -> PCMChunk | None:
        """Blocking worker: run the engine and convert output to an s16 PCMChunk.

        Handles float or int sample arrays and normalizes 2-D output to
        interleaved frames-by-channels. Engine sample rate is read from the
        engine, defaulting to 24 kHz.
        """
        self._initialize_blocking()
        if self._engine is None or self._voice_style is None or np is None:
            return None
        text = _sanitize_tts_text(text)
        if not text:
            return None
        try:
            wav, _duration = self._engine.synthesize(
                text,
                voice_style=self._voice_style,
                lang=self._lang,
                total_steps=self._total_steps,
                speed=self._speed,
            )
        except ValueError as exc:
            message = str(exc)
            if "unsupported character" not in message.lower():
                raise
            # NOTE(review): text was already passed through _sanitize_tts_text
            # above, so re-sanitizing here yields the same string and this
            # fallback re-raises in practice — confirm whether the retry was
            # meant to sanitize the ORIGINAL (pre-sanitized) input instead.
            fallback_text = _sanitize_tts_text(text)
            if not fallback_text or fallback_text == text:
                raise
            wav, _duration = self._engine.synthesize(
                fallback_text,
                voice_style=self._voice_style,
                lang=self._lang,
                total_steps=self._total_steps,
                speed=self._speed,
            )
        samples = np.asarray(wav)
        if samples.size == 0:
            return None
        channels = 1
        if samples.ndim == 0:
            samples = samples.reshape(1)
        elif samples.ndim == 1:
            channels = 1
        elif samples.ndim == 2:
            # Normalize to frames x channels so PCM bytes are correctly interleaved.
            dim0, dim1 = int(samples.shape[0]), int(samples.shape[1])
            if dim0 <= 2 and dim1 > dim0:
                channels = dim0
                samples = samples.T
            elif dim1 <= 2 and dim0 > dim1:
                channels = dim1
            else:
                channels = 1
                samples = samples.reshape(-1)
        else:
            channels = 1
            samples = samples.reshape(-1)
        if np.issubdtype(samples.dtype, np.floating):
            # Scale floats in [-1, 1] to s16 range.
            samples = np.clip(samples, -1.0, 1.0)
            samples = (samples * 32767.0).astype(np.int16)
        else:
            if samples.dtype != np.int16:
                samples = samples.astype(np.int16)
        pcm = samples.tobytes()
        return PCMChunk(
            pcm=pcm,
            sample_rate=int(getattr(self._engine, "sample_rate", 24_000)),
            channels=max(1, channels),
        )

    def _initialize_blocking(self) -> None:
        """Create the engine and resolve the voice style once; record failures."""
        if self._engine is not None and self._voice_style is not None:
            return
        if not self.enabled or SupertonicTTS is None:
            return
        try:
            engine = SupertonicTTS(
                model=self._model,
                auto_download=self._auto_download,
                intra_op_num_threads=self._intra_op_num_threads,
                inter_op_num_threads=self._inter_op_num_threads,
            )
            voice_style = engine.get_voice_style(self._voice_style_name)
        except Exception as exc:
            self._init_error = str(exc)
            return
        self._engine = engine
        self._voice_style = voice_style
        self._init_error = None


class MeloTTSTextToSpeech:
    """TTS provider that talks to a MeloTTS server over a Unix socket.

    The server process (scripts/melotts_server.py) is spawned on demand and
    reached via newline-delimited JSON RPC. Paths and startup timeout come
    from NANOBOT_WORKSPACE / MELO_TTS_SOCKET / MELO_TTS_SERVER_STARTUP_TIMEOUT_S.
    """

    def __init__(self) -> None:
        self._root_dir = Path(__file__).resolve().parent
        self._workspace_dir = Path(
            os.getenv("NANOBOT_WORKSPACE", str(Path.home() / ".nanobot"))
        ).expanduser()
        self._socket_path = Path(
            os.getenv("MELO_TTS_SOCKET", str(self._workspace_dir / "melotts.sock"))
        ).expanduser()
        self._server_script = self._root_dir / "scripts" / "melotts_server.py"
        self._server_log_path = self._workspace_dir / "logs" / "melotts-server.log"
        self._startup_timeout_s = max(
            5.0, float(os.getenv("MELO_TTS_SERVER_STARTUP_TIMEOUT_S", "120"))
        )
        self._init_error: str | None = None
        self._lock = asyncio.Lock()

    @property
    def enabled(self) -> bool:
        # Available only when the bundled server script exists on disk.
        return self._server_script.exists()

    @property
    def init_error(self) -> str | None:
        # Last server-startup failure, if any.
        return self._init_error

    async def synthesize(self, text: str) -> PCMChunk | None:
        """Synthesize speech via the MeloTTS server; None when disabled/blank."""
        if not self.enabled:
            return None
        clean_text = " ".join(text.split())
        if not clean_text:
            return None
        async with self._lock:
            return await asyncio.to_thread(self._synthesize_blocking, clean_text)

    def _synthesize_blocking(self, text: str) -> PCMChunk | None:
        """Blocking worker: ensure the server is up, then RPC a synthesis request.

        The response carries base64 PCM plus sample_rate/channels.
        Raises RuntimeError on a server-reported error.
        """
        self._ensure_server_blocking()
        response = self._rpc(
            {
                "action": "synthesize_pcm",
                "text": text,
            },
            timeout_s=max(30.0, self._startup_timeout_s),
        )
        if not response.get("ok"):
            raise RuntimeError(str(response.get("error", "MeloTTS synthesis failed")))
        encoded_pcm = str(response.get("pcm", "")).strip()
        if not encoded_pcm:
            return None
        pcm = base64.b64decode(encoded_pcm)
        sample_rate = max(1, int(response.get("sample_rate", 44100)))
        channels = max(1, int(response.get("channels", 1)))
        return PCMChunk(pcm=pcm, sample_rate=sample_rate, channels=channels)

    def _ensure_server_blocking(self) -> None:
        """Start the MeloTTS server if it is not already answering pings.

        Spawns the script detached (own session, output to the log file) and
        polls until ready or the startup timeout elapses; raises RuntimeError
        on startup failure or timeout.
        """
        if self._ping():
            self._init_error = None
            return
        # Remove a stale socket file before rebinding.
        with contextlib.suppress(FileNotFoundError):
            self._socket_path.unlink()
        self._server_log_path.parent.mkdir(parents=True, exist_ok=True)
        with self._server_log_path.open("a", encoding="utf-8") as log_handle:
            proc = subprocess.Popen(
                [sys.executable, str(self._server_script), "--socket-path", str(self._socket_path)],
                cwd=str(self._root_dir),
                stdin=subprocess.DEVNULL,
                stdout=log_handle,
                stderr=subprocess.STDOUT,
                start_new_session=True,
            )
        deadline = time.time() + self._startup_timeout_s
        while time.time() < deadline:
            if self._ping():
                self._init_error = None
                return
            exit_code = proc.poll()
            if exit_code is not None:
                self._init_error = (
                    f"MeloTTS server exited during startup with code {exit_code}. "
                    f"See {self._server_log_path}"
                )
                raise RuntimeError(self._init_error)
            time.sleep(0.25)
        self._init_error = (
            f"MeloTTS server did not become ready within {self._startup_timeout_s:.0f}s."
        )
        raise RuntimeError(self._init_error)

    def _ping(self) -> bool:
        """Return True when the server answers a ping RPC within 2 seconds."""
        try:
            response = self._rpc({"action": "ping"}, timeout_s=2.0)
        except Exception:
            return False
        return bool(response.get("ok"))

    def _rpc(self, payload: dict[str, Any], timeout_s: float) -> dict[str, Any]:
        """Send one newline-delimited JSON request and read one JSON reply.

        Reads until a newline appears in a recv chunk or the peer closes.
        Raises RuntimeError on an empty response; json/socket errors propagate.
        """
        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        sock.settimeout(timeout_s)
        try:
            sock.connect(str(self._socket_path))
            sock.sendall((json.dumps(payload) + "\n").encode("utf-8"))
            chunks: list[bytes] = []
            while True:
                data = sock.recv(8192)
                if not data:
                    break
                chunks.append(data)
                if b"\n" in data:
                    break
        finally:
            sock.close()
        response = b"".join(chunks).decode("utf-8", errors="replace").strip()
        if not response:
            raise RuntimeError("empty response from MeloTTS server")
        return json.loads(response)


class HostTextToSpeech:
    """Facade over the TTS providers: supertonic, MeloTTS, command, espeak.

    HOST_TTS_PROVIDER picks one provider or "auto", which tries them in the
    order supertonic -> melotts -> HOST_TTS_COMMAND -> espeak. Unknown
    values are treated as "auto".
    """

    def __init__(self) -> None:
        provider = (os.getenv("HOST_TTS_PROVIDER", "supertonic").strip() or "supertonic").lower()
        if provider not in {"supertonic", "melotts", "command", "espeak", "auto"}:
            provider = "auto"
        self._provider = provider
        self._supertonic = SupertonicTextToSpeech()
        self._melotts = MeloTTSTextToSpeech()
        self._command_template = os.getenv("HOST_TTS_COMMAND", "").strip()
        self._espeak = shutil.which("espeak")

    @property
    def enabled(self) -> bool:
        if self._provider == "supertonic":
            return self._supertonic.enabled
        if self._provider == "melotts":
            return self._melotts.enabled
        if self._provider == "command":
            return bool(self._command_template)
        if self._provider == "espeak":
            return bool(self._espeak)
        return (
            self._supertonic.enabled
            or self._melotts.enabled
            or bool(self._command_template or self._espeak)
        )

    async def synthesize(self, text: str) -> PCMChunk | None:
        """Synthesize text with the selected provider chain; None on failure."""
        clean_text = " ".join(text.split())
        if not clean_text:
            return None
        if self._provider in {"supertonic", "auto"}:
            audio = await self._supertonic.synthesize(clean_text)
            if audio:
                return audio
            if self._provider == "supertonic":
                return None
        if self._provider in {"melotts", "auto"}:
            audio = await self._melotts.synthesize(clean_text)
            if audio:
                return audio
            if self._provider == "melotts":
                return None
        if self._provider in {"command", "auto"} and self._command_template:
            return await asyncio.to_thread(self._synthesize_with_command, clean_text)
        if self._provider == "command":
            return None
        if self._provider in {"espeak", "auto"} and self._espeak:
            return await asyncio.to_thread(self._synthesize_with_espeak, clean_text)
        return None

    def unavailable_reason(self) -> str:
        """Best-effort diagnostic for why synthesis is not working."""
        if self._provider == "supertonic":
            if not self._supertonic.enabled:
                return "supertonic package is not available."
            if self._supertonic.init_error:
                return f"supertonic initialization failed: {self._supertonic.init_error}"
            return "supertonic did not return audio."
        if self._provider == "melotts":
            if not self._melotts.enabled:
                return "MeloTTS server script is not available."
            if self._melotts.init_error:
                return f"MeloTTS initialization failed: {self._melotts.init_error}"
            return "MeloTTS did not return audio."
        if self._provider == "command":
            return "HOST_TTS_COMMAND is not configured."
        if self._provider == "espeak":
            return "espeak binary is not available."
        # "auto": report the most specific failure we know about.
        if self._supertonic.init_error:
            return f"supertonic initialization failed: {self._supertonic.init_error}"
        if self._melotts.init_error:
            return f"MeloTTS initialization failed: {self._melotts.init_error}"
        if self._command_template:
            return "HOST_TTS_COMMAND failed to produce audio."
        if self._espeak:
            return "espeak failed to produce audio."
        return "No TTS provider is configured."
    def _synthesize_with_command(self, text: str) -> PCMChunk | None:
        """Run the user-configured HOST_TTS_COMMAND and decode its WAV output.

        Supports two placeholder styles: {text} is substituted (shell-quoted)
        into the command; {output_wav} makes the command write a WAV file to a
        temp path, otherwise WAV bytes are read from the command's stdout.
        Raises RuntimeError when the command exits non-zero.
        """
        command = self._command_template
        if "{text}" in command:
            command = command.replace("{text}", shlex.quote(text))
        else:
            # No placeholder: append the quoted text as the last argument.
            command = f"{command} {shlex.quote(text)}"
        if "{output_wav}" in command:
            tmp_path: str | None = None
            try:
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
                    tmp_path = tmp_file.name
                command_with_output = command.replace("{output_wav}", shlex.quote(tmp_path))
                result = subprocess.run(
                    command_with_output,
                    shell=True,
                    capture_output=True,
                    text=True,
                    check=False,
                )
                if result.returncode != 0:
                    stderr = result.stderr.strip() or "unknown error"
                    raise RuntimeError(f"TTS command failed: {stderr}")
                return self._read_wav_file(tmp_path)
            finally:
                # Always remove the temp WAV, even when the command failed.
                if tmp_path and os.path.exists(tmp_path):
                    with contextlib.suppress(OSError):
                        os.unlink(tmp_path)
        # stdout mode: the command is expected to emit WAV bytes directly.
        result = subprocess.run(
            command,
            shell=True,
            capture_output=True,
            check=False,
        )
        if result.returncode != 0:
            stderr = result.stderr.decode(errors="ignore").strip() or "unknown error"
            raise RuntimeError(f"TTS command failed: {stderr}")
        return self._decode_wav_bytes(result.stdout)

    def _synthesize_with_espeak(self, text: str) -> PCMChunk | None:
        """Synthesize via the espeak binary (`--stdout` WAV); RuntimeError on failure."""
        if not self._espeak:
            return None
        result = subprocess.run(
            [self._espeak, "--stdout", text],
            capture_output=True,
            check=False,
        )
        if result.returncode != 0:
            stderr = result.stderr.decode(errors="ignore").strip() or "unknown error"
            raise RuntimeError(f"espeak failed: {stderr}")
        return self._decode_wav_bytes(result.stdout)

    def _read_wav_file(self, path: str) -> PCMChunk | None:
        """Read a WAV file from disk and decode it; None if the file is unreadable."""
        try:
            with open(path, "rb") as wav_file:
                return self._decode_wav_bytes(wav_file.read())
        except OSError:
            return None

    def _decode_wav_bytes(self, payload: bytes) -> PCMChunk | None:
        """Decode in-memory WAV bytes to a 16-bit PCMChunk (sample rate/channels preserved)."""
        if not payload:
            return None
        with wave.open(io.BytesIO(payload), "rb") as wav_file:
            channels = wav_file.getnchannels()
            sample_width = wav_file.getsampwidth()
            sample_rate = wav_file.getframerate()
            pcm = wav_file.readframes(wav_file.getnframes())
        # Normalize any sample width to 16-bit signed PCM.
        if sample_width != 2:
            pcm = audioop.lin2lin(pcm, sample_width, 2)
        return PCMChunk(pcm=pcm, sample_rate=sample_rate, channels=max(1, channels))


# Signature of the async callback used to push JSON payloads to the client.
SendJsonCallable = Callable[[dict[str, Any]], Awaitable[None]]


class WebRTCVoiceSession:
    """One WebRTC voice session: inbound mic audio -> STT, outbound text -> TTS audio.

    Owns the aiortc peer connection, the outbound audio track, the data
    channel, and background tasks for STT transcription and warmup.
    """

    def __init__(self, gateway: "SuperTonicGateway") -> None:
        self._gateway = gateway
        self._pc: RTCPeerConnection | None = None
        self._dc: Any | None = None  # RTCDataChannel (aiortc)
        self._outbound_track: QueueAudioTrack | None = None
        self._incoming_audio_task: asyncio.Task[None] | None = None
        self._stt_worker_task: asyncio.Task[None] | None = None
        self._stt_warmup_task: asyncio.Task[None] | None = None
        self._stt = HostSpeechToText()
        self._tts = HostTextToSpeech()
        # Bounded queue of captured PTT segments awaiting transcription;
        # oldest entries are dropped on overflow (see _enqueue_stt_segment).
        self._stt_segment_queue_size = max(1, int(os.getenv("HOST_STT_SEGMENT_QUEUE_SIZE", "2")))
        self._stt_segments: asyncio.Queue[STTSegment] = asyncio.Queue(
            maxsize=self._stt_segment_queue_size
        )
        # Committed text chunks waiting to be synthesized, plus a rolling
        # buffer of streamed partial text not yet committed.
        self._tts_chunks: list[str] = []
        self._tts_partial_buffer = ""
        self._tts_flush_handle: asyncio.TimerHandle | None = None
        self._tts_flush_lock = asyncio.Lock()
        self._tts_buffer_lock = asyncio.Lock()
        # How long to wait after the last incoming chunk before flushing the
        # entire accumulated response to TTS in one go.
        self._tts_response_end_delay_s = max(
            0.1, float(os.getenv("HOST_TTS_RESPONSE_END_DELAY_S", "0.5"))
        )
        # Minimum partial-buffer length that forces an early commit to TTS.
        self._tts_partial_commit_chars = max(
            24, int(os.getenv("HOST_TTS_PARTIAL_COMMIT_CHARS", "72"))
        )
        self._closed = False
        # Verbose diagnostics toggle; any value other than the listed
        # falsy spellings enables debug notices.
        self._audio_debug = os.getenv("HOST_AUDIO_DEBUG", "0").strip() not in {
            "0",
            "false",
            "False",
            "no",
            "off",
        }
        # One-shot notice flags so each diagnostic is published at most once.
        self._stt_unavailable_notice_sent = False
        self._tts_unavailable_notice_sent = False
        self._audio_seen_notice_sent = False
        self._audio_format_notice_sent = False
        self._stt_first_segment_notice_sent = False
        self._ptt_timing_correction_notice_sent = False
        # Shortest push-to-talk segment (ms) worth transcribing; the legacy
        # HOST_STT_MIN_SEGMENT_MS variable is honored as a fallback.
        self._stt_min_ptt_ms = max(
            120,
            int(os.getenv("HOST_STT_MIN_PTT_MS", os.getenv("HOST_STT_MIN_SEGMENT_MS", "220"))),
        )
        # When enabled, microphone input is ignored while TTS audio is playing
        # (plus a cooldown) to avoid the agent transcribing its own speech.
        self._stt_suppress_during_tts = os.getenv(
            "HOST_STT_SUPPRESS_DURING_TTS", "1"
        ).strip() not in {
            "0",
            "false",
            "False",
            "no",
            "off",
        }
        self._stt_suppress_ms_after_tts = max(
            0,
            int(os.getenv("HOST_STT_SUPPRESS_MS_AFTER_TTS", "300")),
        )
        # Loop-time deadline until which STT input is suppressed.
        self._stt_suppress_until = 0.0
        # Rate-limit for "backlog" notices when STT segments are dropped.
        self._stt_backlog_notice_interval_s = max(
            2.0,
            float(os.getenv("HOST_STT_BACKLOG_NOTICE_INTERVAL_S", "6.0")),
        )
        self._last_stt_backlog_notice_at = 0.0
        self._ptt_pressed = False
        # Metadata/chat id from the most recent voice-ptt datachannel message.
        self._active_message_metadata: dict[str, Any] = {}
        self._active_chat_id = "web"

    def set_push_to_talk_pressed(self, pressed: bool) -> None:
        """Record the client's push-to-talk button state."""
        self._ptt_pressed = bool(pressed)

    def send_to_datachannel(self, payload: dict[str, Any]) -> None:
        """Send a JSON message over the DataChannel if it is open."""
        dc = self._dc
        if dc is None:
            return
        try:
            if dc.readyState == "open":
                dc.send(json.dumps(payload))
        except Exception:
            # Best-effort delivery; a dying channel must not crash the session.
            pass

    def _should_commit_partial_buffer(self) -> bool:
        """True when the streamed partial text is long enough, or ends at a clause break."""
        stripped = self._tts_partial_buffer.strip()
        if not stripped:
            return False
        if len(stripped) >= self._tts_partial_commit_chars:
            return True
        return bool(TTS_PARTIAL_COMMIT_RE.search(self._tts_partial_buffer))

    def _commit_partial_buffer_locked(self) -> None:
        """Move the partial buffer into the TTS chunk queue. Caller holds _tts_buffer_lock."""
        partial = self._tts_partial_buffer.strip()
        self._tts_partial_buffer = ""
        if partial:
            self._tts_chunks.append(partial)

    async def queue_output_text(self, chunk: str, *, partial: bool = False) -> None:
        """Buffer agent output text for speech.

        partial=True accumulates streamed fragments and commits them early at
        clause boundaries; partial=False enqueues a complete chunk directly.
        No-op when there is no active peer connection/outbound track.
        """
        if not chunk:
            return
        async with self._tts_buffer_lock:
            if not self._pc or not self._outbound_track:
                return
            if partial:
                self._tts_partial_buffer += chunk
                if self._should_commit_partial_buffer():
                    self._commit_partial_buffer_locked()
                    self._schedule_tts_flush_after(0.05, reset=True)
                else:
                    self._schedule_tts_flush_after(self._tts_response_end_delay_s, reset=True)
                return
            normalized_chunk = chunk.strip()
            if not normalized_chunk:
                return
            if self._tts_partial_buffer.strip():
                self._commit_partial_buffer_locked()
            # Keep line boundaries between streamed chunks so line-based filters
            # stay accurate while avoiding repeated full-string copies.
            self._tts_chunks.append(normalized_chunk)
            # Flush in short rolling windows instead of waiting for the whole
            # response so streamed Nanobot output can start speaking sooner.
            self._schedule_tts_flush_after(self._tts_response_end_delay_s, reset=False)

    async def flush_partial_output_text(self) -> None:
        """Force-commit any pending partial text and schedule a near-immediate flush."""
        async with self._tts_buffer_lock:
            if not self._pc or not self._outbound_track:
                return
            if not self._tts_partial_buffer.strip():
                return
            self._commit_partial_buffer_locked()
            self._schedule_tts_flush_after(0.05, reset=True)

    def interrupt_output(self) -> None:
        """Drop all pending and playing TTS output and clear STT suppression."""
        if self._tts_flush_handle:
            self._tts_flush_handle.cancel()
            self._tts_flush_handle = None
        self._tts_chunks.clear()
        self._tts_partial_buffer = ""
        self._stt_suppress_until = 0.0
        if self._outbound_track:
            self._outbound_track.clear()

    async def handle_offer(self, payload: dict[str, Any]) -> dict[str, Any] | None:
        """Accept a WebRTC offer and return the SDP answer payload.

        Returns None when aiortc is unavailable or the offer has no SDP.
        Tears down any previous peer connection first.
        """
        if not AIORTC_AVAILABLE or not RTCPeerConnection or not RTCSessionDescription:
            return None
        sdp = str(payload.get("sdp", "")).strip()
        rtc_type = str(payload.get("rtcType", "offer")).strip() or "offer"
        if not sdp:
            return None
        await self._close_peer_connection()
        # Reset per-connection state for the new session.
        self._ptt_pressed = False
        self._active_message_metadata = {}
        self._active_chat_id = "web"
        peer_connection = RTCPeerConnection()
        self._pc = peer_connection
        # Only attach an outbound audio track when the offer negotiates audio.
        offer_has_audio = bool(re.search(r"(?im)^m=audio\s", sdp))
        if offer_has_audio:
            self._outbound_track = QueueAudioTrack()
            self._outbound_track._on_playing_changed = self._on_track_playing_changed
            peer_connection.addTrack(self._outbound_track)
        else:
            self._outbound_track = None

        @peer_connection.on("datachannel")
        def on_datachannel(channel: Any) -> None:
            # Only the "app" channel carries control messages.
            if channel.label != "app":
                return
            self._dc = channel

            @channel.on("message")
            def on_message(raw: str) -> None:
                try:
                    msg = json.loads(raw)
                except Exception:
                    # Ignore malformed JSON from the client.
                    return
                msg_type = str(msg.get("type", "")).strip()
                if msg_type == "voice-ptt":
                    # Push-to-talk state plus the chat/metadata context that
                    # will be attached to the next transcript.
                    self._active_chat_id = str(msg.get("chat_id", "web")).strip() or "web"
                    self._active_message_metadata = _coerce_message_metadata(msg.get("metadata", {}))
                    self.set_push_to_talk_pressed(bool(msg.get("pressed", False)))
                elif msg_type == "command":
                    command = str(msg.get("command", "")).strip()
                    chat_id = str(msg.get("chat_id", self._active_chat_id)).strip() or "web"
                    if command == "reset":
                        # Reset also interrupts any speech in flight.
                        self.interrupt_output()
                    asyncio.create_task(self._gateway.send_command(command, chat_id=chat_id))
                elif msg_type == "card-response":
                    asyncio.create_task(
                        self._gateway.send_card_response(
                            str(msg.get("card_id", "")),
                            str(msg.get("value", "")),
                        )
                    )
                elif msg_type == "ping":
                    self.send_to_datachannel({"type": "pong"})

        @peer_connection.on("track")
        def on_track(track: MediaStreamTrack) -> None:
            if track.kind != "audio":
                return
            # Replace any previous consumer with one for the new track.
            if self._incoming_audio_task:
                self._incoming_audio_task.cancel()
            self._incoming_audio_task = asyncio.create_task(
                self._consume_audio_track(track),
                name="voice-inbound-track",
            )

        await peer_connection.setRemoteDescription(RTCSessionDescription(sdp=sdp, type=rtc_type))
        answer = await peer_connection.createAnswer()
        await peer_connection.setLocalDescription(answer)
        await self._wait_for_ice_gathering(peer_connection)
        local_description = peer_connection.localDescription
        sdp_answer = str(local_description.sdp or "")
        if sdp_answer:
            # Normalize all line endings to the CRLF form SDP requires.
            sdp_answer = (
                sdp_answer.replace("\r\n", "\n").replace("\r", "\n").strip().replace("\n", "\r\n")
                + "\r\n"
            )
        # Lazily start the STT worker/warmup tasks, or report STT unavailability once.
        if self._stt.enabled and not self._stt_worker_task:
            self._stt_worker_task = asyncio.create_task(self._stt_worker(), name="voice-stt-worker")
        if self._stt.enabled and (self._stt_warmup_task is None or self._stt_warmup_task.done()):
            self._stt_warmup_task = asyncio.create_task(self._warmup_stt(), name="voice-stt-warmup")
        elif not self._stt.enabled and not self._stt_unavailable_notice_sent:
            self._stt_unavailable_notice_sent = True
            await self._publish_system(
                f"Voice input backend unavailable. {self._stt.unavailable_reason()}"
            )
        return {
            "sdp": sdp_answer,
            "rtcType": local_description.type,
        }

    async def close(self) -> None:
        """Shut down the session: cancel background tasks, drop buffers, close the PC."""
        self._closed = True
        self._ptt_pressed = False
        if self._tts_flush_handle:
            self._tts_flush_handle.cancel()
            self._tts_flush_handle = None
        self._tts_chunks.clear()
        self._tts_partial_buffer = ""
        if self._incoming_audio_task:
            self._incoming_audio_task.cancel()
            with contextlib.suppress(asyncio.CancelledError):
                await self._incoming_audio_task
            self._incoming_audio_task = None
        if self._stt_worker_task:
            self._stt_worker_task.cancel()
            with contextlib.suppress(asyncio.CancelledError, RuntimeError):
                await self._stt_worker_task
            self._stt_worker_task = None
        if self._stt_warmup_task:
            self._stt_warmup_task.cancel()
            with contextlib.suppress(asyncio.CancelledError):
                await self._stt_warmup_task
            self._stt_warmup_task = None
        await self._close_peer_connection()

    def _schedule_tts_flush(self) -> None:
        """Timer callback: kick off an async TTS flush unless the session is closed."""
        if self._closed:
            return
        asyncio.create_task(self._flush_tts(), name="voice-tts-flush")

    def _schedule_tts_flush_after(self, delay_s: float, *, reset: bool = True) -> None:
        """Arm (or re-arm when reset=True) the delayed-flush timer; min delay 50ms."""
        if self._tts_flush_handle:
            if not reset:
                # A timer is already pending and the caller does not want to extend it.
                return
            self._tts_flush_handle.cancel()
        loop = asyncio.get_running_loop()
        self._tts_flush_handle = loop.call_later(max(0.05, delay_s), self._schedule_tts_flush)

    async def _flush_tts(self) -> None:
        """Drain queued text chunks through TTS and enqueue the audio for playback.

        Serialized by _tts_flush_lock; buffer mutation happens under
        _tts_buffer_lock.
        """
        async with self._tts_flush_lock:
            while True:
                async with self._tts_buffer_lock:
                    self._tts_flush_handle = None
                    if not
self._tts_chunks and self._tts_partial_buffer.strip():
                        # Nothing committed yet: promote any pending partial text.
                        self._commit_partial_buffer_locked()
                    if not self._tts_chunks:
                        return
                    raw_text = self._tts_chunks.pop(0)
                # Synthesis happens outside the buffer lock so new chunks can queue up.
                clean_text = self._clean_tts_text(raw_text)
                if not clean_text:
                    continue
                if not self._outbound_track:
                    return
                try:
                    audio = await self._tts.synthesize(clean_text)
                except asyncio.CancelledError:
                    raise
                except Exception as exc:
                    import traceback  # noqa: local import in exception handler

                    traceback.print_exc()
                    # Retry the failed chunk as smaller clause-sized segments
                    # before giving up entirely.
                    retry_segments = _split_tts_retry_segments(clean_text)
                    if retry_segments:
                        async with self._tts_buffer_lock:
                            self._tts_chunks[0:0] = retry_segments
                        continue
                    await self._publish_system(f"TTS synthesis error: {exc}")
                    return
                if not audio:
                    if not self._tts_unavailable_notice_sent:
                        self._tts_unavailable_notice_sent = True
                        await self._publish_system(
                            f"Host TTS backend is unavailable. {self._tts.unavailable_reason()}"
                        )
                    return
                if not self._outbound_track:
                    return
                # Suppress STT while this audio plays so we don't hear ourselves.
                self._extend_stt_suppression(audio)
                await self._outbound_track.enqueue_pcm(
                    pcm=audio.pcm,
                    sample_rate=audio.sample_rate,
                    channels=audio.channels,
                )

    def _extend_stt_suppression(self, audio: PCMChunk) -> None:
        """Push the STT suppression deadline past this audio's playback plus cooldown."""
        if not self._stt_suppress_during_tts:
            return
        channels = max(1, int(audio.channels))
        sample_rate = max(1, int(audio.sample_rate))
        # 2 bytes per 16-bit sample per channel.
        sample_count = len(audio.pcm) // (2 * channels)
        if sample_count <= 0:
            return
        duration_s = sample_count / float(sample_rate)
        cooldown_s = float(self._stt_suppress_ms_after_tts) / 1000.0
        now = asyncio.get_running_loop().time()
        # Queued chunks stack: extend from the later of now and the current deadline.
        base = max(now, self._stt_suppress_until)
        self._stt_suppress_until = base + duration_s + cooldown_s

    async def _consume_audio_track(self, track: MediaStreamTrack) -> None:
        """Consume inbound mic frames; record while PTT is held, then hand off to STT.

        When STT is disabled the track is drained and discarded so the peer
        connection stays healthy.
        """
        if not self._stt.enabled:
            try:
                while True:
                    await track.recv()
            except asyncio.CancelledError:
                raise
            except Exception:
                return
        resample_state = None
        recording = False
        recording_started_at = 0.0
        segment_ms = 0.0
        segment_buffer = bytearray()
        try:
            while True:
                frame = await track.recv()
                pcm16, frame_ms, resample_state = self._frame_to_pcm16k_mono(frame, resample_state)
                if not pcm16:
                    continue
                # One-shot debug notices about inbound audio.
                if self._audio_debug and not self._audio_seen_notice_sent:
                    self._audio_seen_notice_sent = True
                    await self._publish_debug("Receiving microphone audio on host.")
                if self._audio_debug and not self._audio_format_notice_sent:
                    self._audio_format_notice_sent = True
                    await self._publish_debug(
                        "Inbound audio frame stats: "
                        f"sample_rate={int(getattr(frame, 'sample_rate', 0) or 0)}, "
                        f"samples={int(getattr(frame, 'samples', 0) or 0)}, "
                        f"time_base={getattr(frame, 'time_base', None)}."
                    )
                loop = asyncio.get_running_loop()
                # While TTS is playing (plus cooldown), discard input entirely.
                if self._stt_suppress_during_tts and loop.time() < self._stt_suppress_until:
                    recording = False
                    recording_started_at = 0.0
                    segment_ms = 0.0
                    segment_buffer = bytearray()
                    continue
                if self._ptt_pressed:
                    if not recording:
                        # PTT just went down: start a fresh segment.
                        recording = True
                        recording_started_at = loop.time()
                        segment_ms = 0.0
                        segment_buffer = bytearray()
                    segment_buffer.extend(pcm16)
                    segment_ms += frame_ms
                    continue
                if recording:
                    # PTT released: finalize the captured segment. Wall-clock
                    # duration is used to sanity-check the PCM-derived duration.
                    observed_duration_ms = max(
                        1.0,
                        (loop.time() - recording_started_at) * 1000.0,
                    )
                    await self._finalize_ptt_segment(
                        bytes(segment_buffer),
                        segment_ms,
                        observed_duration_ms=observed_duration_ms,
                    )
                    recording = False
                    recording_started_at = 0.0
                    segment_ms = 0.0
                    segment_buffer = bytearray()
        except asyncio.CancelledError:
            raise
        except Exception as exc:
            details = str(exc).strip()
            if details:
                await self._publish_system(
                    f"Voice input stream ended ({exc.__class__.__name__}): {details}"
                )
            else:
                await self._publish_system(f"Voice input stream ended ({exc.__class__.__name__}).")
        finally:
            # Flush a segment that was still recording when the stream ended.
            if recording and segment_ms >= self._stt_min_ptt_ms:
                observed_duration_ms = max(
                    1.0,
                    (asyncio.get_running_loop().time() - recording_started_at) * 1000.0,
                )
                await self._finalize_ptt_segment(
                    bytes(segment_buffer),
                    segment_ms,
                    observed_duration_ms=observed_duration_ms,
                )

    async def _finalize_ptt_segment(
        self,
        pcm16: bytes,
        duration_ms: float,
        observed_duration_ms: float | None = None,
    ) -> None:
        """Normalize a finished PTT segment's sample rate if timing looks wrong, then enqueue it.

        duration_ms is derived from the PCM byte count at the assumed 16kHz;
        observed_duration_ms is wall-clock. A large mismatch implies the source
        rate was misreported, so the audio is resampled to compensate.
        """
        if not pcm16 or duration_ms <= 0.0:
            return
        normalized_pcm = pcm16
        normalized_duration_ms = duration_ms
        if observed_duration_ms is not None and observed_duration_ms > 0.0:
            duration_ratio = duration_ms / observed_duration_ms
            # Only correct when PCM-derived and wall-clock durations disagree
            # by more than ~30-40%.
            if duration_ratio < 0.70 or duration_ratio > 1.40:
                estimated_source_rate = int(round(16_000 * duration_ratio))
                estimated_source_rate = max(8_000, min(96_000, estimated_source_rate))
                candidate_rates = [
                    8_000,
                    12_000,
                    16_000,
                    24_000,
                    32_000,
                    44_100,
                    48_000,
                ]
                # Snap the estimate to the closest common audio sample rate.
                nearest_source_rate = min(
                    candidate_rates,
                    key=lambda candidate: abs(candidate - estimated_source_rate),
                )
                if nearest_source_rate != 16_000:
                    normalized_pcm, _state = audioop.ratecv(
                        pcm16,
                        2,
                        1,
                        nearest_source_rate,
                        16_000,
                        None,
                    )
                    # Recompute duration from the resampled byte count at 16kHz.
                    normalized_duration_ms = (len(normalized_pcm) / 2 / 16_000) * 1000.0
                    if self._audio_debug and not self._ptt_timing_correction_notice_sent:
                        self._ptt_timing_correction_notice_sent = True
                        await self._publish_debug(
                            "Corrected PTT timing mismatch "
                            f"(estimated source={nearest_source_rate}Hz)."
                        )
        await self._enqueue_stt_segment(
            pcm16=normalized_pcm,
            duration_ms=normalized_duration_ms,
            metadata=dict(self._active_message_metadata),
        )

    async def _enqueue_stt_segment(
        self,
        pcm16: bytes,
        duration_ms: float,
        metadata: dict[str, Any],
    ) -> None:
        """Queue a segment for transcription, dropping the oldest entry on overflow."""
        if duration_ms < self._stt_min_ptt_ms:
            # Too short to be meaningful speech.
            return
        if self._stt_segments.full():
            # Make room by discarding the stalest segment, with a rate-limited notice.
            with contextlib.suppress(asyncio.QueueEmpty):
                self._stt_segments.get_nowait()
            now = asyncio.get_running_loop().time()
            if (now - self._last_stt_backlog_notice_at) >= self._stt_backlog_notice_interval_s:
                self._last_stt_backlog_notice_at = now
                await self._publish_system("Voice input backlog detected; dropping stale segment.")
        with contextlib.suppress(asyncio.QueueFull):
            self._stt_segments.put_nowait(STTSegment(pcm=pcm16, metadata=dict(metadata)))

    async def _stt_worker(self) -> None:
        """Long-running loop: transcribe queued segments and forward transcripts to the gateway."""
        while True:
            segment = await self._stt_segments.get()
            if self._audio_debug and not self._stt_first_segment_notice_sent:
                self._stt_first_segment_notice_sent = True
                await self._publish_debug("Push-to-talk audio captured. Running host STT...")
            try:
                transcript = await self._stt.transcribe_pcm(
                    pcm=segment.pcm,
                    sample_rate=16_000,
                    channels=1,
                )
            except asyncio.CancelledError:
                raise
            except Exception as exc:
                # Report and keep serving subsequent segments.
                await self._publish_system(f"Host STT failed: {exc}")
                continue
            if not transcript:
                continue
            transcript = transcript.strip()
            if not transcript:
                continue
            # Echo the transcript on the event bus for UI display.
            await self._gateway.bus.publish(
                WisperEvent(role="wisper", text=f"voice transcript: {transcript}")
            )
            try:
                await self._gateway.send_user_message(
                    transcript,
                    metadata=dict(segment.metadata),
                    chat_id=self._active_chat_id,
                )
            except RuntimeError as exc:
                if self._closed:
                    return
                await self._publish_system(f"Could not deliver voice transcript: {exc}")

    async def _close_peer_connection(self) -> None:
        """Tear down the data channel, outbound track, and peer connection."""
        self._dc = None
        if self._outbound_track:
            self._outbound_track.stop()
            self._outbound_track = None
        if self._pc:
            await self._pc.close()
            self._pc = None

    async def _wait_for_ice_gathering(self, peer_connection: RTCPeerConnection) -> None:
        """Wait (bounded by LOCAL_ICE_GATHER_TIMEOUT_S) for ICE gathering to finish."""
        if peer_connection.iceGatheringState == "complete":
            return
        completed = asyncio.Event()

        @peer_connection.on("icegatheringstatechange")
        def on_icegatheringstatechange() -> None:
            if peer_connection.iceGatheringState == "complete":
                completed.set()

        # Local-only deployments gather quickly; time out rather than stall the answer.
        with contextlib.suppress(asyncio.TimeoutError):
            await asyncio.wait_for(completed.wait(), timeout=LOCAL_ICE_GATHER_TIMEOUT_S)

    async def _warmup_stt(self) -> None:
        """Best-effort STT model warmup; failures are silently ignored."""
        try:
            await self._stt.warmup()
        except asyncio.CancelledError:
            raise
        except Exception:
            return

    async def _publish_system(self, text: str) -> None:
        """Publish a system-role event on the gateway bus."""
        await self._gateway.bus.publish(WisperEvent(role="system", text=text))

    async def _publish_debug(self, text: str) -> None:
        """Publish *text* as a system event only when audio debugging is enabled."""
        if not self._audio_debug:
            return
        await self._publish_system(text)

    async def _publish_agent_state(self, state: str) -> None:
        """Publish an agent-state event (e.g. "speaking"/"idle") on the gateway bus."""
        await self._gateway.bus.publish(WisperEvent(role="agent-state", text=state))

    def _on_track_playing_changed(self, playing: bool) -> None:
        """Called from QueueAudioTrack.recv() when audio playback starts or stops."""
        try:
            loop =
asyncio.get_running_loop()
        except RuntimeError:
            # No running loop (e.g. called during teardown): drop the notification.
            return
        loop.create_task(self._publish_agent_state("speaking" if playing else "idle"))

    def _clean_tts_text(self, raw_text: str) -> str:
        """Strip status/echo lines and prefixes from agent output before synthesis.

        Removes connection-status lines, "thinking" status lines, echoed user
        lines, and voice-transcript echoes; strips the agent-name prefix from
        what remains, then sanitizes to TTS-safe text.
        """
        lines = [line.strip() for line in raw_text.splitlines() if line.strip()]
        useful_lines = [
            AGENT_PREFIX_RE.sub("", line)
            for line in lines
            if not SPEECH_FILTER_RE.match(line)
            and not THINKING_STATUS_RE.search(line)
            and not USER_PREFIX_RE.match(line)
            and not VOICE_TRANSCRIPT_RE.match(line)
        ]
        return _sanitize_tts_text(" ".join(useful_lines))

    def _frame_to_pcm16k_mono(
        self, frame: AudioFrame, resample_state: tuple[Any, ...] | None
    ) -> tuple[bytes, float, tuple[Any, ...] | None]:
        """Convert an inbound av.AudioFrame to 16kHz mono signed 16-bit PCM.

        Returns (pcm_bytes, duration_ms, resample_state); resample_state is
        the audioop.ratecv carry-over and must be threaded between calls.
        Returns (b"", 0.0, state) for frames that cannot be converted.
        """
        try:
            pcm = frame.to_ndarray(format="s16")
        except TypeError:
            # Older av versions do not accept the format keyword.
            pcm = frame.to_ndarray()
        # Coerce float/other dtypes to int16 when numpy is available.
        if NUMPY_AVAILABLE and np is not None and getattr(pcm, "dtype", None) is not None:
            if pcm.dtype != np.int16:
                if np.issubdtype(pcm.dtype, np.floating):
                    pcm = np.clip(pcm, -1.0, 1.0)
                    pcm = (pcm * 32767.0).astype(np.int16)
                else:
                    pcm = pcm.astype(np.int16)
        if pcm.ndim == 1:
            mono = pcm.tobytes()
        elif pcm.ndim == 2:
            # Frame layout may be [channels, frames] or [frames, channels];
            # use the declared channel layout when present to disambiguate.
            expected_channels = 0
            if getattr(frame, "layout", None) is not None:
                with contextlib.suppress(Exception):
                    expected_channels = len(frame.layout.channels)
            rows = int(pcm.shape[0])
            cols = int(pcm.shape[1])
            # Normalize to [frames, channels] to avoid accidental channel mis-detection.
            if expected_channels > 0:
                if rows == expected_channels:
                    frames_channels = pcm.T
                elif cols == expected_channels:
                    frames_channels = pcm
                else:
                    frames_channels = pcm.reshape(-1, 1)
            else:
                # Heuristics when no layout is declared: the channel axis is
                # the small one (<= 8).
                if rows == 1:
                    frames_channels = pcm.T
                elif cols == 1:
                    frames_channels = pcm
                elif rows <= 8 and cols > rows:
                    frames_channels = pcm.T
                elif cols <= 8 and rows > cols:
                    frames_channels = pcm
                else:
                    frames_channels = pcm.reshape(-1, 1)
            channel_count = int(frames_channels.shape[1]) if frames_channels.ndim == 2 else 1
            if channel_count <= 1:
                mono = frames_channels.reshape(-1).tobytes()
            elif NUMPY_AVAILABLE and np is not None:
                # Downmix by averaging channels in int32 to avoid overflow.
                mixed = frames_channels.astype(np.int32).mean(axis=1)
                mono = np.clip(mixed, -32768, 32767).astype(np.int16).tobytes()
            elif channel_count == 2:
                interleaved = frames_channels.reshape(-1).tobytes()
                mono = audioop.tomono(interleaved, 2, 0.5, 0.5)
            else:
                # No numpy and >2 channels: keep only the first channel.
                mono = frames_channels[:, 0].reshape(-1).tobytes()
        else:
            # Unexpected rank: skip the frame.
            return b"", 0.0, resample_state
        # Determine the source sample rate from frame metadata, with fallbacks.
        source_rate = int(getattr(frame, "sample_rate", 0) or getattr(frame, "rate", 0) or 0)
        time_base = getattr(frame, "time_base", None)
        tb_rate = 0
        if time_base is not None:
            with contextlib.suppress(Exception):
                numerator = int(getattr(time_base, "numerator", 0))
                denominator = int(getattr(time_base, "denominator", 0))
                # A 1/N time base usually encodes the sample rate N.
                if numerator == 1 and denominator > 0:
                    tb_rate = denominator
        samples_per_channel = int(getattr(frame, "samples", 0) or 0)
        if samples_per_channel > 0:
            # Infer the rate by assuming the typical 20ms frame duration.
            candidate_rates = [8_000, 16_000, 24_000, 32_000, 44_100, 48_000]
            inferred_rate = min(
                candidate_rates,
                key=lambda rate: abs((samples_per_channel / float(rate)) - 0.020),
            )
            inferred_frame_ms = (samples_per_channel / float(inferred_rate)) * 1000.0
            # If metadata suggests implausibly long frames, trust the inferred rate instead.
            if (
                source_rate <= 0
                or (samples_per_channel / float(max(1, source_rate))) * 1000.0 > 40.0
            ):
                source_rate = inferred_rate
            elif abs(inferred_frame_ms - 20.0) <= 2.5 and source_rate not in {
                inferred_rate,
                tb_rate,
            }:
                source_rate = inferred_rate
        if tb_rate > 0 and (source_rate <= 0 or abs(tb_rate - source_rate) > 2_000):
            source_rate = tb_rate
        if source_rate <= 0:
            # Last-resort default for WebRTC audio.
            source_rate = 48_000
        if source_rate != 16_000:
            mono, resample_state = audioop.ratecv(
                mono,
                2,
                1,
                source_rate,
                16_000,
                resample_state,
            )
        if not mono:
            return b"", 0.0, resample_state
        sample_count = len(mono) // 2
        duration_ms = (sample_count / 16_000) * 1000
        return mono, duration_ms, resample_state