api channel and tools

This commit is contained in:
kacper 2026-03-05 15:10:14 -05:00
parent 9222c59f03
commit 3816a9627e
4 changed files with 684 additions and 582 deletions

21
app.py
View file

@ -5,7 +5,7 @@ from pathlib import Path
from typing import Any, Awaitable, Callable
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.responses import FileResponse, JSONResponse
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from supertonic_gateway import SuperTonicGateway
@ -28,8 +28,9 @@ async def health() -> JSONResponse:
@app.get("/")
async def index() -> FileResponse:
return FileResponse(INDEX_PATH)
async def index() -> HTMLResponse:
html = INDEX_PATH.read_text(encoding="utf-8")
return HTMLResponse(content=html)
@app.websocket("/ws/chat")
@ -65,18 +66,24 @@ async def websocket_chat(websocket: WebSocket) -> None:
elif msg_type == "rtc-ice-candidate":
await voice_session.handle_ice_candidate(message)
elif msg_type == "voice-ptt":
voice_session.set_push_to_talk_pressed(
bool(message.get("pressed", False))
)
voice_session.set_push_to_talk_pressed(bool(message.get("pressed", False)))
elif msg_type == "user-message":
await gateway.send_user_message(str(message.get("text", "")))
elif msg_type == "ui-response":
await gateway.send_ui_response(
str(message.get("request_id", "")),
str(message.get("value", "")),
)
elif msg_type == "command":
await gateway.send_command(str(message.get("command", "")))
else:
await safe_send_json(
{
"role": "system",
"text": (
"Unknown message type. Use spawn, stop, rtc-offer, "
"rtc-ice-candidate, voice-ptt, or user-message."
"rtc-ice-candidate, voice-ptt, user-message, "
"ui-response, or command."
),
"timestamp": "",
}

View file

@ -16,7 +16,7 @@
width: 100%;
height: 100%;
overflow: hidden;
background: #1a1510;
background: #ffffff;
touch-action: none;
}
#log {
@ -31,7 +31,7 @@
font-family: "SF Mono", ui-monospace, Menlo, Consolas, monospace;
font-size: 12px;
line-height: 1.6;
color: rgba(255, 245, 235, 0.35);
color: rgba(30, 20, 10, 0.35);
white-space: pre-wrap;
word-break: break-word;
display: flex;
@ -45,8 +45,8 @@
mask-image: linear-gradient(to top, black 55%, transparent 100%);
}
#log:hover {
color: rgba(255, 245, 235, 0.92);
background: rgba(0, 0, 0, 0.18);
color: rgba(30, 20, 10, 0.85);
background: rgba(0, 0, 0, 0.06);
-webkit-mask-image: none;
mask-image: none;
}
@ -62,17 +62,17 @@
margin-bottom: 4px;
}
.line.user {
color: rgba(255, 255, 255, 0.9);
color: rgba(20, 10, 0, 0.85);
}
.line.system {
color: rgba(255, 220, 180, 0.5);
color: rgba(120, 80, 40, 0.5);
}
.line.wisper {
color: rgba(255, 200, 160, 0.4);
color: rgba(120, 80, 40, 0.4);
}
#log:hover .line.user { color: rgba(255, 255, 255, 1.0); }
#log:hover .line.system { color: rgba(255, 220, 180, 0.85); }
#log:hover .line.wisper { color: rgba(255, 200, 160, 0.75); }
#log:hover .line.user { color: rgba(20, 10, 0, 1.0); }
#log:hover .line.system { color: rgba(120, 80, 40, 0.85); }
#log:hover .line.wisper { color: rgba(120, 80, 40, 0.75); }
#voiceStatus {
position: fixed;
bottom: 12px;
@ -119,11 +119,14 @@
border-radius: 24px;
box-shadow: 0 8px 32px rgba(0, 0, 0, 0.25), 4px 4px 0px rgba(0,0,0,0.15);
overflow: hidden;
pointer-events: auto;
cursor: pointer;
}
#agentViz canvas {
width: 100% !important;
height: 100% !important;
display: block;
pointer-events: auto;
}
#agentIndicator .label {
display: none;
@ -140,10 +143,6 @@
#agentIndicator.speaking {
color: #8b4513;
}
/* Deepen the background while PTT is active */
body.ptt-active {
background: radial-gradient(ellipse at 50% 44%, #f2caa8 0%, #e8b898 100%);
}
#controls {
position: fixed;
top: 12px;
@ -167,20 +166,236 @@
transform: translateY(1px);
box-shadow: 0 1px 4px rgba(0, 0, 0, 0.15);
}
/* Toast notifications */
#toast-container {
position: fixed;
top: 16px;
left: 50%;
transform: translateX(-50%);
width: min(92vw, 480px);
max-height: calc(100vh - 32px);
overflow-y: auto;
overflow-x: hidden;
display: flex;
flex-direction: column;
gap: 10px;
z-index: 100;
pointer-events: auto;
/* Hide scrollbar until hovered */
scrollbar-width: thin;
scrollbar-color: rgba(255,200,140,0.25) transparent;
padding-bottom: 4px;
}
#toast-container::-webkit-scrollbar {
width: 4px;
}
#toast-container::-webkit-scrollbar-track {
background: transparent;
}
#toast-container::-webkit-scrollbar-thumb {
background: rgba(255,200,140,0.25);
border-radius: 2px;
}
.toast {
pointer-events: auto;
background: rgba(28, 22, 16, 0.92);
border: 1px solid rgba(255, 200, 140, 0.18);
border-radius: 12px;
padding: 14px 16px 14px 16px;
display: flex;
flex-direction: column;
gap: 8px;
box-shadow: 0 4px 24px rgba(0, 0, 0, 0.45);
animation: toast-in 0.22s cubic-bezier(0.34, 1.4, 0.64, 1) both;
position: relative;
overflow: hidden;
max-width: 100%;
}
.toast.dismissing {
animation: toast-out 0.18s ease-in both;
}
@keyframes toast-in {
from { opacity: 0; transform: translateY(-14px) scale(0.96); }
to { opacity: 1; transform: translateY(0) scale(1); }
}
@keyframes toast-out {
from { opacity: 1; transform: translateY(0) scale(1); }
to { opacity: 0; transform: translateY(-10px) scale(0.96); }
}
.toast-progress {
position: absolute;
bottom: 0;
left: 0;
height: 2px;
background: rgba(255, 190, 120, 0.55);
width: 100%;
transform-origin: left;
animation: toast-progress-shrink linear both;
}
@keyframes toast-progress-shrink {
from { transform: scaleX(1); }
to { transform: scaleX(0); }
}
.toast-header {
display: flex;
justify-content: space-between;
align-items: flex-start;
gap: 10px;
}
.toast-title {
font-family: "SF Mono", ui-monospace, Menlo, Consolas, monospace;
font-size: 11px;
font-weight: 600;
letter-spacing: 0.07em;
color: rgba(255, 200, 140, 0.85);
text-transform: uppercase;
flex: 1;
min-width: 0;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
}
.toast-close {
background: none;
border: none;
color: rgba(255, 245, 235, 0.35);
font-size: 16px;
line-height: 1;
cursor: pointer;
padding: 0 2px;
flex-shrink: 0;
transition: color 0.15s;
}
.toast-close:hover {
color: rgba(255, 245, 235, 0.85);
}
.toast-body {
font-family: "SF Mono", ui-monospace, Menlo, Consolas, monospace;
font-size: 12px;
line-height: 1.65;
color: rgba(255, 245, 235, 0.82);
white-space: normal;
word-break: break-word;
user-select: text;
-webkit-user-select: text;
}
.toast-body p { margin: 0 0 6px; }
.toast-body p:last-child { margin-bottom: 0; }
.toast-body h1, .toast-body h2, .toast-body h3,
.toast-body h4, .toast-body h5, .toast-body h6 {
font-size: 13px;
font-weight: 700;
color: rgba(255, 200, 140, 0.95);
margin: 8px 0 4px;
}
.toast-body ul, .toast-body ol {
margin: 4px 0 6px;
padding-left: 18px;
}
.toast-body li { margin-bottom: 2px; }
.toast-body code {
background: rgba(255,255,255,0.07);
border-radius: 4px;
padding: 1px 5px;
font-size: 11px;
}
.toast-body pre {
background: rgba(0,0,0,0.35);
border-radius: 6px;
padding: 8px 10px;
overflow-x: auto;
margin: 6px 0;
}
.toast-body pre code {
background: none;
padding: 0;
font-size: 11px;
}
.toast-body table {
border-collapse: collapse;
width: 100%;
font-size: 11px;
margin: 6px 0;
}
.toast-body th, .toast-body td {
border: 1px solid rgba(255,200,140,0.2);
padding: 4px 8px;
text-align: left;
}
.toast-body th {
background: rgba(255,200,140,0.08);
color: rgba(255,200,140,0.9);
font-weight: 600;
}
.toast-body a {
color: rgba(255,200,140,0.85);
text-decoration: underline;
}
.toast-body blockquote {
border-left: 3px solid rgba(255,200,140,0.3);
margin: 6px 0;
padding-left: 10px;
color: rgba(255,245,235,0.55);
}
.toast-body hr {
border: none;
border-top: 1px solid rgba(255,200,140,0.15);
margin: 8px 0;
}
.toast-choices {
display: flex;
flex-wrap: wrap;
gap: 8px;
margin-top: 4px;
}
.toast-choice-btn {
background: rgba(255, 200, 140, 0.12);
border: 1px solid rgba(255, 200, 140, 0.35);
border-radius: 8px;
color: rgba(255, 245, 235, 0.90);
font-family: "SF Mono", ui-monospace, Menlo, Consolas, monospace;
font-size: 12px;
padding: 6px 14px;
cursor: pointer;
transition: background 0.15s, border-color 0.15s;
flex: 1 1 auto;
text-align: center;
}
.toast-choice-btn:hover {
background: rgba(255, 200, 140, 0.25);
border-color: rgba(255, 200, 140, 0.65);
}
.toast-choice-btn:active {
background: rgba(255, 200, 140, 0.38);
}
.toast-choice-btn:disabled {
opacity: 0.4;
cursor: default;
}
.toast-image {
width: 100%;
max-height: 320px;
object-fit: contain;
border-radius: 8px;
display: block;
}
</style>
</head>
<body>
<div id="controls" data-no-ptt="1">
<button id="resetSessionBtn" class="control-btn" type="button" data-no-ptt="1">Reset</button>
<div id="controls">
<button id="resetSessionBtn" class="control-btn" type="button">Reset</button>
</div>
<div id="log"><div id="log-inner"></div></div>
<div id="agentIndicator">
<div id="agentViz"></div>
<div id="agentIndicator" data-ptt="1">
<div id="agentViz" data-ptt="1"></div>
<span class="label"></span>
</div>
<div id="voiceStatus"></div>
<div id="toast-container"></div>
<audio id="remoteAudio" autoplay playsinline hidden></audio>
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
<script src="/static/three.min.js"></script>
<script>
const logEl = document.getElementById("log-inner");
@ -190,15 +405,134 @@
const agentVizEl = document.getElementById("agentViz");
const agentLabel = agentIndicator.querySelector(".label");
const resetSessionBtn = document.getElementById("resetSessionBtn");
const toastContainer = document.getElementById("toast-container");
// --- Toast notifications ---
// Render a transient notification card. `kind` is "text" | "image"; text
// content is rendered as markdown via marked (or injected directly when it
// already looks like HTML). `durationMs` > 0 auto-dismisses the toast and
// drives the .toast-progress shrink bar; pass 0 (or a non-number) to pin it.
const showToast = (kind, content, title, durationMs) => {
  const toast = document.createElement("div");
  toast.className = "toast";
  // Header row (title + close button)
  const header = document.createElement("div");
  header.className = "toast-header";
  if (title) {
    const titleEl = document.createElement("span");
    titleEl.className = "toast-title";
    titleEl.textContent = title;
    header.appendChild(titleEl);
  }
  const closeBtn = document.createElement("button");
  closeBtn.className = "toast-close";
  closeBtn.setAttribute("type", "button");
  closeBtn.setAttribute("aria-label", "Dismiss");
  closeBtn.textContent = "×";
  header.appendChild(closeBtn);
  toast.appendChild(header);
  // Body
  if (kind === "image") {
    const img = document.createElement("img");
    img.className = "toast-image";
    img.src = content;
    img.alt = title || "image";
    toast.appendChild(img);
  } else {
    const body = document.createElement("div");
    body.className = "toast-body";
    // If content looks like HTML, inject directly; otherwise render as markdown.
    const looksLikeHtml = /^\s*<[a-zA-Z]/.test(content);
    if (looksLikeHtml) {
      body.innerHTML = content;
    } else if (typeof marked !== "undefined") {
      body.innerHTML = marked.parse(content);
    } else {
      body.textContent = content;
    }
    toast.appendChild(body);
  }
  // dismiss must be declared before close button references it
  let autoTimer = null;
  const dismiss = () => {
    // Cancel the auto-dismiss timer so a manual close can't double-fire.
    if (autoTimer !== null) clearTimeout(autoTimer);
    toast.classList.add("dismissing");
    const fallback = setTimeout(() => toast.remove(), 400);
    toast.addEventListener("animationend", () => { clearTimeout(fallback); toast.remove(); }, { once: true });
  };
  closeBtn.addEventListener("click", (e) => { e.stopPropagation(); dismiss(); });
  // Fix: durationMs was previously accepted but ignored, so toasts piled up
  // forever and the .toast-progress CSS animation never ran. Attach the
  // progress bar (its CSS animation has no duration of its own) and schedule
  // the auto-dismiss.
  if (typeof durationMs === "number" && durationMs > 0) {
    const progress = document.createElement("div");
    progress.className = "toast-progress";
    progress.style.animationDuration = `${durationMs}ms`;
    toast.appendChild(progress);
    autoTimer = setTimeout(dismiss, durationMs);
  }
  toastContainer.prepend(toast);
  toastContainer.scrollTop = 0;
};
// --- Choice toasts (ask_user tool) ---
// Render a question card with one button per choice. Picking a choice sends
// a ui-response frame (keyed by requestId) over the websocket exactly once,
// then removes the card. The close button dismisses without answering.
const showChoice = (requestId, question, choices, title) => {
  const card = document.createElement("div");
  card.className = "toast";

  // Play the dismiss animation, with a timer fallback in case the
  // animationend event never fires.
  const dismiss = () => {
    card.classList.add("dismissing");
    const fallback = setTimeout(() => card.remove(), 400);
    card.addEventListener(
      "animationend",
      () => { clearTimeout(fallback); card.remove(); },
      { once: true },
    );
  };

  // Header: optional title plus a close ("×") button.
  const header = document.createElement("div");
  header.className = "toast-header";
  if (title) {
    const heading = document.createElement("span");
    heading.className = "toast-title";
    heading.textContent = title;
    header.appendChild(heading);
  }
  const closeBtn = document.createElement("button");
  closeBtn.className = "toast-close";
  closeBtn.setAttribute("type", "button");
  closeBtn.setAttribute("aria-label", "Dismiss");
  closeBtn.textContent = "×";
  closeBtn.addEventListener("click", (e) => { e.stopPropagation(); dismiss(); });
  header.appendChild(closeBtn);
  card.appendChild(header);

  // Question body — plain text, never interpreted as markup.
  const questionEl = document.createElement("div");
  questionEl.className = "toast-body";
  questionEl.textContent = question;
  card.appendChild(questionEl);

  // One button per choice.
  const buttonRow = document.createElement("div");
  buttonRow.className = "toast-choices";
  for (const label of choices) {
    const btn = document.createElement("button");
    btn.className = "toast-choice-btn";
    btn.setAttribute("type", "button");
    btn.textContent = label;
    btn.addEventListener("click", (e) => {
      e.stopPropagation();
      // Disable all buttons to prevent double-send
      buttonRow.querySelectorAll(".toast-choice-btn").forEach((b) => { b.disabled = true; });
      sendJson({ type: "ui-response", request_id: requestId, value: label });
      dismiss();
    });
    buttonRow.appendChild(btn);
  }
  card.appendChild(buttonRow);

  toastContainer.prepend(card);
  toastContainer.scrollTop = 0;
};
// --- Agent state indicator ---
const STATES = { idle: "idle", listening: "listening", thinking: "thinking", speaking: "speaking" };
const STATE_COLORS = {
[STATES.idle]: 0xfff5eb,
[STATES.listening]: 0xfff5eb,
[STATES.thinking]: 0xfff5eb,
[STATES.speaking]: 0xfff5eb,
};
const STATE_COLORS = {
[STATES.idle]: 0xfff5eb,
[STATES.listening]: 0xfff5eb,
[STATES.thinking]: 0xfff5eb,
[STATES.speaking]: 0xfff5eb,
};
let agentState = STATES.idle;
let agentVisualizer = null;
let lastRemoteAudioActivityS = 0;
@ -248,7 +582,8 @@
powerPreference: "high-performance",
});
renderer.setPixelRatio(1);
renderer.setClearColor(0xa09b96, 1);
renderer.setClearColor(0xe8e4e0, 1);
renderer.domElement.dataset.ptt = "1";
agentVizEl.innerHTML = "";
agentVizEl.appendChild(renderer.domElement);
@ -358,12 +693,12 @@
let deformScale = 1.0;
let ringScale = 1.0; // uniform xz scale — used for thickness throb when thinking
let spinSpeed = 0.0;
// Card background colour lerp: 0 = idle coral, 1 = dark listening
// Card background colour lerp: 0 = idle coral, 1 = dark coral (PTT/listening)
let cardColorT = 0.0;
let connectedT = 0.0; // 0 = gray (disconnected), 1 = coral (connected)
const CARD_GRAY_RGB = [160, 155, 150]; // disconnected gray
const CARD_IDLE_RGB = [212, 85, 63]; // #d4553f
const CARD_LISTEN_RGB = [120, 40, 28]; // dark desaturated coral
const CARD_GRAY_RGB = [232, 228, 224]; // #e8e4e0 — disconnected light warm gray
const CARD_IDLE_RGB = [212, 85, 63]; // #d4553f — connected idle coral
const CARD_LISTEN_RGB = [120, 40, 28]; // #782c1c — PTT active dark coral
const setStateColor = (_state) => { /* no-op: MeshBasicMaterial, colour is fixed */ };
@ -696,7 +1031,6 @@
const setPushToTalkState = (pressed, notifyServer = true) => {
pttPressed = pressed;
document.body.classList.toggle("ptt-active", pressed);
setMicCaptureEnabled(pressed);
if (notifyServer && ws.readyState === WebSocket.OPEN) {
ws.send(JSON.stringify({ type: "voice-ptt", pressed }));
@ -947,26 +1281,27 @@
if (!appStarted) {
await bootstrap();
}
if (sendUserMessage("/reset")) {
showStatus("Reset command sent.", 1500);
if (ws.readyState === WebSocket.OPEN) {
sendJson({ type: "command", command: "reset" });
showStatus("Session reset.", 1500);
}
});
}
// --- Whole-screen PTT pointer handling ---
// --- Center-card PTT pointer handling ---
// Only touches that land on #agentIndicator / #agentViz (data-ptt="1") trigger PTT.
// We track active pointer IDs so multi-touch doesn't double-fire.
const activePointers = new Set();
document.addEventListener("pointerdown", async (event) => {
if (event.target instanceof Element && event.target.closest("[data-no-ptt='1']")) {
if (!(event.target instanceof Element) || !event.target.closest("[data-ptt='1']")) {
return;
}
activePointers.add(event.pointerId);
if (!appStarted) {
await bootstrap();
return;
}
ensureVisualizerAudioMeter();
activePointers.add(event.pointerId);
if (activePointers.size === 1) beginPushToTalk();
}, { passive: false });
@ -1020,6 +1355,30 @@
if (agentState !== STATES.listening && STATES[newState]) {
setAgentState(newState);
}
} else if (msg.role === "toast") {
try {
const t = JSON.parse(msg.text || "{}");
showToast(
t.kind || "text",
t.content || "",
t.title || "",
typeof t.duration_ms === "number" ? t.duration_ms : 6000,
);
} catch (_) {
showToast("text", msg.text || "", "", 6000);
}
} else if (msg.role === "choice") {
try {
const c = JSON.parse(msg.text || "{}");
showChoice(
c.request_id || "",
c.question || "",
Array.isArray(c.choices) ? c.choices : [],
c.title || "",
);
} catch (_) {
// Malformed choice payload — ignore.
}
} else if (msg.role === "wisper") {
// suppress wisper debug output
} else {

View file

@ -1,442 +1,270 @@
"""SuperTonic Gateway — nanobot integration for the web UI.
Connects to the already-running nanobot process via a Unix domain socket.
nanobot must be started separately (e.g. ``nanobot gateway``) with the API
channel enabled in its config.
Wire protocol (newline-delimited JSON)
---------------------------------------
Client → nanobot::
{"type": "message", "content": "hello", "chat_id": "web"}
{"type": "ping"}
{"type": "ui-response", "request_id": "<uuid>", "value": "Option A", "chat_id": "web"}
{"type": "command", "command": "reset", "chat_id": "web"}
nanobot → client::
{"type": "message", "content": "Hi!", "chat_id": "web", "is_progress": false}
{"type": "agent_state", "state": "thinking", "chat_id": "web"}
{"type": "toast", "kind": "text"|"image", "content": "...", "title": "...", "duration_ms": 5000}
{"type": "choice", "request_id": "<uuid>", "question": "...", "choices": ["A", "B"],
"title": "...", "chat_id": "web"}
{"type": "pong"}
{"type": "error", "error": "..."}
The public ``SuperTonicGateway`` interface (``spawn_tui``, ``send_user_message``,
``stop_tui``, ``shutdown``) is unchanged so ``app.py`` and ``voice_rtc.py``
require no modification.
"""
from __future__ import annotations
import asyncio
import contextlib
import json
import os
import pty
import re
import shlex
import signal
import subprocess
import time
from collections import deque
from pathlib import Path
from wisper import WisperBus, WisperEvent
ANSI_ESCAPE_RE = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]")
CONTROL_CHAR_RE = re.compile(r"[\x00-\x08\x0b-\x1f\x7f]")
BRAILLE_SPINNER_RE = re.compile(r"[\u2800-\u28ff]")
SPINNER_ONLY_RE = re.compile(r"^[\s|/\\\-]+$")
BOX_DRAWING_ONLY_RE = re.compile(r"^[\s\u2500-\u257f]+$")
THINKING_LINE_RE = re.compile(
r"\b(?:agent|nanobot|napbot)\b(?:\s+is)?\s+thinking\b",
re.IGNORECASE,
)
USER_ECHO_LINE_RE = re.compile(r"^(?:you|user)\s*:", re.IGNORECASE)
TOOL_STREAM_LINE_RE = re.compile(
r"^(?:tool(?:\s+call|\s+output)?|calling\s+tool|running\s+tool|executing\s+tool)\b",
re.IGNORECASE,
)
LEADING_NON_WORD_RE = re.compile(r"^[^\w]+")
WHITESPACE_RE = re.compile(r"\s+")
AGENT_OUTPUT_PREFIX_RE = re.compile(
r"^(?:nanobot|napbot)\b\s*[:>\-]?\s*", re.IGNORECASE
)
EMOJI_RE = re.compile(
"[" # Common emoji and pictograph blocks.
"\U0001f1e6-\U0001f1ff"
"\U0001f300-\U0001f5ff"
"\U0001f600-\U0001f64f"
"\U0001f680-\U0001f6ff"
"\U0001f700-\U0001f77f"
"\U0001f780-\U0001f7ff"
"\U0001f800-\U0001f8ff"
"\U0001f900-\U0001f9ff"
"\U0001fa00-\U0001faff"
"\u2600-\u26ff"
"\u2700-\u27bf"
"\ufe0f"
"\u200d"
"]"
)
# Default path — must match nanobot's channels.api.socket_path config value.
DEFAULT_SOCKET_PATH = Path.home() / ".nanobot" / "api.sock"
def _clean_output(text: str) -> str:
    """Strip terminal noise from raw TUI output.

    Removes ANSI escape sequences, replaces braille spinner glyphs with
    spaces, drops remaining control characters, and normalizes carriage
    returns to newlines so downstream line-splitting works.
    """
    no_ansi = ANSI_ESCAPE_RE.sub("", text)
    no_spinner = BRAILLE_SPINNER_RE.sub(" ", no_ansi)
    printable = CONTROL_CHAR_RE.sub("", no_spinner)
    return printable.replace("\r", "\n")
# ---------------------------------------------------------------------------
# NanobotApiProcess — connects to the running nanobot via its Unix socket
# ---------------------------------------------------------------------------
def _resolve_nanobot_command_and_workdir() -> tuple[str, Path]:
command_override = os.getenv("NANOBOT_COMMAND")
workdir_override = os.getenv("NANOBOT_WORKDIR")
class NanobotApiProcess:
"""Connects to the running nanobot process via its Unix domain socket.
if workdir_override:
default_workdir = Path(workdir_override).expanduser()
else:
default_workdir = Path.home()
Lifecycle
---------
``start()`` opens a connection to nanobot's API socket.
``send()`` writes a user message over the socket.
``stop()`` closes the connection.
"""
if command_override:
return command_override, default_workdir
nanobot_dir = Path.home() / "nanobot"
nanobot_python_candidates = [
nanobot_dir / ".venv" / "bin" / "python",
nanobot_dir / "venv" / "bin" / "python",
]
for nanobot_venv_python in nanobot_python_candidates:
if nanobot_venv_python.exists():
if not workdir_override:
default_workdir = nanobot_dir
return (
f"{nanobot_venv_python} -m nanobot agent --no-markdown",
default_workdir,
)
return "nanobot agent --no-markdown", default_workdir
def _infer_venv_root(command_parts: list[str], workdir: Path) -> Path | None:
if not command_parts:
return None
binary = Path(command_parts[0]).expanduser()
if (
binary.is_absolute()
and binary.name.startswith("python")
and binary.parent.name == "bin"
):
return binary.parent.parent
for candidate in (workdir / ".venv", workdir / "venv"):
if (candidate / "bin" / "python").exists():
return candidate
return None
def _build_process_env(
    command_parts: list[str], workdir: Path
) -> tuple[dict[str, str], Path | None]:
    """Build the environment for the child process.

    Starts from a copy of the current environment with ``PYTHONHOME``
    removed. When a virtualenv root can be inferred from the command or
    workdir, its ``bin`` directory is promoted to the front of ``PATH``
    (deduplicated) and ``VIRTUAL_ENV`` is set.

    Returns the environment mapping and the inferred venv root (``None``
    when no virtualenv was found).
    """
    env = os.environ.copy()
    env.pop("PYTHONHOME", None)
    venv_root = _infer_venv_root(command_parts, workdir)
    if not venv_root:
        return env, None
    venv_bin = str(venv_root / "bin")
    # Drop empty entries and any pre-existing copy of venv_bin, then put
    # venv_bin first so the venv's interpreter and scripts win.
    entries = [
        entry
        for entry in env.get("PATH", "").split(os.pathsep)
        if entry and entry != venv_bin
    ]
    env["PATH"] = os.pathsep.join([venv_bin, *entries])
    env["VIRTUAL_ENV"] = str(venv_root)
    return env, venv_root
class NanobotTUIProcess:
def __init__(self, bus: WisperBus, command: str, workdir: Path) -> None:
def __init__(self, bus: WisperBus, socket_path: Path) -> None:
self._bus = bus
self._command = command
self._workdir = workdir
self._process: subprocess.Popen[bytes] | None = None
self._master_fd: int | None = None
self._read_task: asyncio.Task[None] | None = None
self._pending_output = ""
self._suppress_noisy_ui = os.getenv(
"NANOBOT_SUPPRESS_NOISY_UI", "1"
).strip() not in {
"0",
"false",
"False",
"no",
"off",
}
self._dedup_window_s = max(
0.2, float(os.getenv("NANOBOT_OUTPUT_DEDUP_WINDOW_S", "1.5"))
)
self._recent_lines: deque[tuple[str, float]] = deque()
self._last_tts_line = ""
self._socket_path = socket_path
self._reader: asyncio.StreamReader | None = None
self._writer: asyncio.StreamWriter | None = None
self._read_task: asyncio.Task | None = None
@property
def running(self) -> bool:
return self._process is not None and self._process.poll() is None
return (
self._writer is not None
and not self._writer.is_closing()
and self._read_task is not None
and not self._read_task.done()
)
async def start(self) -> None:
if self.running:
await self._bus.publish(
WisperEvent(role="system", text="Nanobot TUI is already running.")
WisperEvent(role="system", text="Already connected to nanobot.")
)
return
command_parts = [
os.path.expandvars(os.path.expanduser(part))
for part in shlex.split(self._command)
]
if not command_parts:
await self._bus.publish(
WisperEvent(role="system", text="NANOBOT_COMMAND is empty.")
)
return
if not self._workdir.exists():
await self._bus.publish(
WisperEvent(
role="system",
text=f"NANOBOT_WORKDIR does not exist: {self._workdir}",
)
)
return
master_fd, slave_fd = pty.openpty()
child_env, child_venv_root = _build_process_env(
command_parts=command_parts, workdir=self._workdir
)
try:
self._process = subprocess.Popen(
command_parts,
stdin=slave_fd,
stdout=slave_fd,
stderr=slave_fd,
cwd=str(self._workdir),
start_new_session=True,
env=child_env,
)
except FileNotFoundError as exc:
os.close(master_fd)
os.close(slave_fd)
if not self._socket_path.exists():
await self._bus.publish(
WisperEvent(
role="system",
text=(
"Could not start Nanobot process "
f"(command='{command_parts[0]}', workdir='{self._workdir}'): {exc}. "
"Check NANOBOT_COMMAND and NANOBOT_WORKDIR."
f"Nanobot API socket not found at {self._socket_path}. "
"Make sure nanobot is running with the API channel enabled "
"(set channels.api.enabled = true in ~/.nanobot/config.json, "
"then run: nanobot gateway)."
),
)
)
return
except Exception as exc:
os.close(master_fd)
os.close(slave_fd)
await self._bus.publish(
WisperEvent(role="system", text=f"Failed to spawn TUI process: {exc}")
)
return
os.close(slave_fd)
os.set_blocking(master_fd, False)
self._master_fd = master_fd
self._read_task = asyncio.create_task(
self._read_output(), name="nanobot-tui-reader"
)
await self._bus.publish(
WisperEvent(
role="system",
text=f"Spawned Nanobot TUI with command: {' '.join(command_parts)}",
try:
self._reader, self._writer = await asyncio.open_unix_connection(
path=str(self._socket_path)
)
)
if child_venv_root:
except OSError as exc:
await self._bus.publish(
WisperEvent(
role="system",
text=f"Nanobot runtime venv: {child_venv_root}",
text=f"Could not connect to nanobot API socket: {exc}",
)
)
return
self._read_task = asyncio.create_task(self._read_loop(), name="nanobot-api-reader")
await self._bus.publish(WisperEvent(role="system", text="Connected to nanobot."))
async def send(self, text: str) -> None:
if not self.running or self._master_fd is None:
if not self.running or self._writer is None:
await self._bus.publish(
WisperEvent(
role="system", text="Nanobot TUI is not running. Click spawn first."
role="system",
text="Not connected to nanobot. Click spawn first.",
)
)
return
message = text.rstrip("\n") + "\n"
payload = json.dumps({"type": "message", "content": text, "chat_id": "web"}) + "\n"
try:
os.write(self._master_fd, message.encode())
self._writer.write(payload.encode())
await self._writer.drain()
except OSError as exc:
await self._bus.publish(
WisperEvent(role="system", text=f"Failed to write to TUI: {exc}")
await self._bus.publish(WisperEvent(role="system", text=f"Send failed: {exc}"))
await self._cleanup()
async def send_ui_response(self, request_id: str, value: str) -> None:
    """Forward a ui-response (choice selection) back to nanobot.

    Choice replies are best-effort: when there is no live connection the
    call silently returns. A failed socket write is reported on the bus
    and tears the connection down via ``_cleanup``.
    """
    writer = self._writer
    if writer is None or not self.running:
        return
    frame = {
        "type": "ui-response",
        "request_id": request_id,
        "value": value,
        "chat_id": "web",
    }
    try:
        # Newline-delimited JSON wire format.
        writer.write((json.dumps(frame) + "\n").encode())
        await writer.drain()
    except OSError as exc:
        await self._bus.publish(WisperEvent(role="system", text=f"Send failed: {exc}"))
        await self._cleanup()
async def send_command(self, command: str) -> None:
    """Send a command (e.g. 'reset') to nanobot.

    Unlike ``send_ui_response``, a missing connection is surfaced to the
    user as a system message. A failed socket write is reported on the
    bus and tears the connection down via ``_cleanup``.
    """
    if self._writer is None or not self.running:
        await self._bus.publish(
            WisperEvent(
                role="system",
                text="Not connected to nanobot. Click spawn first.",
            )
        )
        return
    # Newline-delimited JSON wire format.
    wire = json.dumps({"type": "command", "command": command, "chat_id": "web"}) + "\n"
    try:
        self._writer.write(wire.encode())
        await self._writer.drain()
    except OSError as exc:
        await self._bus.publish(WisperEvent(role="system", text=f"Send failed: {exc}"))
        await self._cleanup()
async def stop(self) -> None:
if self._read_task:
await self._cleanup()
await self._bus.publish(WisperEvent(role="system", text="Disconnected from nanobot."))
# ------------------------------------------------------------------
# Internal
# ------------------------------------------------------------------
async def _cleanup(self) -> None:
if self._read_task and not self._read_task.done():
self._read_task.cancel()
with contextlib.suppress(asyncio.CancelledError):
try:
await self._read_task
self._read_task = None
if self.running and self._process:
try:
os.killpg(self._process.pid, signal.SIGTERM)
except ProcessLookupError:
except asyncio.CancelledError:
pass
except Exception:
self._process.terminate()
try:
self._process.wait(timeout=3)
except Exception:
self._process.kill()
self._process.wait(timeout=1)
self._read_task = None
if self._master_fd is not None:
if self._writer:
try:
os.close(self._master_fd)
self._writer.close()
await self._writer.wait_closed()
except OSError:
pass
self._master_fd = None
self._process = None
self._pending_output = ""
self._recent_lines.clear()
self._last_tts_line = ""
await self._bus.publish(WisperEvent(role="system", text="Stopped Nanobot TUI."))
self._writer = None
self._reader = None
async def _read_output(self) -> None:
if self._master_fd is None:
return
while self.running:
if not await self._wait_for_fd_readable():
break
try:
chunk = os.read(self._master_fd, 4096)
except BlockingIOError:
continue
except OSError:
break
if not chunk:
if not self.running:
async def _read_loop(self) -> None:
"""Read newline-delimited JSON from nanobot and publish WisperEvents."""
assert self._reader is not None
try:
while True:
try:
line = await self._reader.readline()
except OSError:
break
await asyncio.sleep(0.01)
continue
text = _clean_output(chunk.decode(errors="ignore"))
if not text.strip():
continue
displayable, tts_publishable, saw_thinking = self._consume_output_chunk(
text
)
if saw_thinking:
await self._bus.publish(
WisperEvent(role="agent-state", text="thinking")
)
if displayable:
await self._bus.publish(WisperEvent(role="nanobot", text=displayable))
if tts_publishable:
await self._bus.publish(
WisperEvent(role="nanobot-tts", text=tts_publishable)
)
trailing_display, trailing_tts, _ = self._consume_output_chunk("\n")
if trailing_display:
await self._bus.publish(WisperEvent(role="nanobot", text=trailing_display))
if trailing_tts:
await self._bus.publish(WisperEvent(role="nanobot-tts", text=trailing_tts))
if self._process is not None:
exit_code = self._process.poll()
await self._bus.publish(
WisperEvent(
role="system", text=f"Nanobot TUI exited (code={exit_code})."
)
)
def _consume_output_chunk(self, text: str) -> tuple[str, str, bool]:
"""Return (displayable, tts_publishable, saw_thinking)."""
self._pending_output += text
lines = self._pending_output.split("\n")
self._pending_output = lines.pop()
if len(self._pending_output) > 1024:
lines.append(self._pending_output)
self._pending_output = ""
kept_lines: list[str] = []
tts_lines: list[str] = []
saw_thinking = False
for line in lines:
normalized = self._normalize_line(line)
if not normalized:
continue
if self._suppress_noisy_ui and self._is_noisy_ui_line(normalized):
# Detect thinking lines even though they are filtered from display.
candidate = LEADING_NON_WORD_RE.sub("", normalized)
if THINKING_LINE_RE.search(candidate):
saw_thinking = True
continue
if normalized != self._last_tts_line:
tts_lines.append(normalized)
self._last_tts_line = normalized
if self._is_recent_duplicate(normalized):
continue
kept_lines.append(normalized)
return "\n".join(kept_lines).strip(), "\n".join(tts_lines).strip(), saw_thinking
def _normalize_line(self, line: str) -> str:
without_emoji = EMOJI_RE.sub(" ", line)
normalized = WHITESPACE_RE.sub(" ", without_emoji).strip()
# Strip leading "nanobot:" prefix that the TUI echoes in its own output,
# since the frontend already labels lines with the role name and TTS
# should not read the agent's own name aloud.
normalized = AGENT_OUTPUT_PREFIX_RE.sub("", normalized)
return normalized
def _is_noisy_ui_line(self, line: str) -> bool:
if SPINNER_ONLY_RE.fullmatch(line):
return True
if BOX_DRAWING_ONLY_RE.fullmatch(line):
return True
candidate = LEADING_NON_WORD_RE.sub("", line)
if THINKING_LINE_RE.search(candidate):
return True
if TOOL_STREAM_LINE_RE.match(candidate):
return True
if USER_ECHO_LINE_RE.match(candidate):
return True
return False
async def _wait_for_fd_readable(self) -> bool:
if self._master_fd is None:
return False
loop = asyncio.get_running_loop()
ready: asyncio.Future[None] = loop.create_future()
def _mark_ready() -> None:
if not ready.done():
ready.set_result(None)
try:
loop.add_reader(self._master_fd, _mark_ready)
except (AttributeError, NotImplementedError, OSError, ValueError):
await asyncio.sleep(0.01)
return True
try:
await ready
return True
if not line:
break # EOF — nanobot closed the connection
await self._handle_line(line)
finally:
with contextlib.suppress(Exception):
loop.remove_reader(self._master_fd)
await self._bus.publish(
WisperEvent(role="system", text="Nanobot closed the connection.")
)
# Clear writer so running → False
self._writer = None
self._reader = None
def _is_recent_duplicate(self, line: str) -> bool:
now = time.monotonic()
normalized = line.lower()
async def _handle_line(self, line: bytes) -> None:
raw = line.decode(errors="replace").strip()
if not raw:
return
try:
obj = json.loads(raw)
except json.JSONDecodeError:
await self._bus.publish(
WisperEvent(role="system", text=f"Malformed response from nanobot: {raw[:200]}")
)
return
while (
self._recent_lines
and (now - self._recent_lines[0][1]) > self._dedup_window_s
):
self._recent_lines.popleft()
msg_type = str(obj.get("type", ""))
for previous, _timestamp in self._recent_lines:
if previous == normalized:
return True
if msg_type == "message":
content = str(obj.get("content", ""))
is_progress = bool(obj.get("is_progress", False))
if is_progress:
# Intermediate tool-call hint — show in UI, skip TTS
await self._bus.publish(WisperEvent(role="nanobot-progress", text=content))
else:
# Final answer — display + TTS
await self._bus.publish(WisperEvent(role="nanobot", text=content))
await self._bus.publish(WisperEvent(role="nanobot-tts", text=content))
self._recent_lines.append((normalized, now))
return False
elif msg_type == "agent_state":
state = str(obj.get("state", ""))
await self._bus.publish(WisperEvent(role="agent-state", text=state))
elif msg_type == "toast":
# Forward the full toast payload as JSON so the frontend can render it.
await self._bus.publish(WisperEvent(role="toast", text=json.dumps(obj)))
elif msg_type == "choice":
# Forward the full choice payload as JSON so the frontend can render it.
await self._bus.publish(WisperEvent(role="choice", text=json.dumps(obj)))
elif msg_type == "pong":
pass # keepalive, ignore
elif msg_type == "error":
await self._bus.publish(
WisperEvent(role="system", text=f"Nanobot error: {obj.get('error', '')}")
)
# ---------------------------------------------------------------------------
# SuperTonicGateway — public interface (unchanged from original)
# ---------------------------------------------------------------------------
class SuperTonicGateway:
def __init__(self) -> None:
    """Set up the event bus and connection state for the gateway."""
    self.bus = WisperBus()
    self._lock = asyncio.Lock()
    self._tui: NanobotTUIProcess | None = None
    self._process: NanobotApiProcess | None = None
    # Socket path is overridable via the environment; "~" is expanded.
    raw_path = os.getenv("NANOBOT_API_SOCKET", str(DEFAULT_SOCKET_PATH))
    self._socket_path = Path(raw_path).expanduser()
async def subscribe(self) -> asyncio.Queue[WisperEvent]:
    """Register a listener on the event bus and return its queue."""
    return await self.bus.subscribe()
@ -445,18 +273,15 @@ class SuperTonicGateway:
await self.bus.unsubscribe(queue)
async def spawn_tui(self) -> None:
"""Connect to nanobot (name kept for API compatibility with app.py)."""
async with self._lock:
if self._tui and self._tui.running:
if self._process and self._process.running:
await self.bus.publish(
WisperEvent(role="system", text="Nanobot TUI is already running.")
WisperEvent(role="system", text="Already connected to nanobot.")
)
return
command, workdir = _resolve_nanobot_command_and_workdir()
self._tui = NanobotTUIProcess(
bus=self.bus, command=command, workdir=workdir
)
await self._tui.start()
self._process = NanobotApiProcess(bus=self.bus, socket_path=self._socket_path)
await self._process.start()
async def send_user_message(self, text: str) -> None:
message = text.strip()
@ -464,20 +289,34 @@ class SuperTonicGateway:
return
await self.bus.publish(WisperEvent(role="user", text=message))
async with self._lock:
if not self._tui:
if not self._process:
await self.bus.publish(
WisperEvent(
role="system",
text="Nanobot TUI is not running. Click spawn first.",
text="Not connected to nanobot. Click spawn first.",
)
)
return
await self._tui.send(message)
await self._process.send(message)
async def send_ui_response(self, request_id: str, value: str) -> None:
    """Relay a UI choice selection back to nanobot, if connected.

    Silently does nothing when no nanobot connection exists.
    """
    async with self._lock:
        process = self._process
        if process:
            await process.send_ui_response(request_id, value)
async def send_command(self, command: str) -> None:
    """Forward a control command (e.g. 'reset') to nanobot, if connected.

    Silently does nothing when no nanobot connection exists.
    """
    async with self._lock:
        process = self._process
        if process:
            await process.send_command(command)
async def stop_tui(self) -> None:
"""Disconnect from nanobot (name kept for API compatibility with app.py)."""
async with self._lock:
if self._tui:
await self._tui.stop()
if self._process:
await self._process.stop()
self._process = None
async def shutdown(self) -> None:
    """Tear down the gateway by disconnecting from nanobot."""
    await self.stop_tui()

View file

@ -41,9 +41,7 @@ try:
from faster_whisper import WhisperModel
FASTER_WHISPER_AVAILABLE = True
except (
Exception
): # pragma: no cover - runtime fallback when faster-whisper is unavailable
except Exception: # pragma: no cover - runtime fallback when faster-whisper is unavailable
WhisperModel = None # type: ignore[assignment]
FASTER_WHISPER_AVAILABLE = False
@ -82,10 +80,7 @@ ANSI_ESCAPE_RE = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")
CONTROL_CHAR_RE = re.compile(r"[\x00-\x1f\x7f]")
BRAILLE_SPINNER_RE = re.compile(r"[\u2800-\u28ff]")
TTS_ALLOWED_ASCII = set(
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
"0123456789"
" .,!?;:'\"()[]{}@#%&*+-_/<>|"
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?;:'\"()[]{}@#%&*+-_/<>|"
)
@ -95,9 +90,7 @@ def _sanitize_tts_text(text: str) -> str:
cleaned = cleaned.replace("\u00a0", " ")
cleaned = cleaned.replace("", " ")
cleaned = CONTROL_CHAR_RE.sub(" ", cleaned)
cleaned = "".join(
ch if (ch in TTS_ALLOWED_ASCII or ch.isspace()) else " " for ch in cleaned
)
cleaned = "".join(ch if (ch in TTS_ALLOWED_ASCII or ch.isspace()) else " " for ch in cleaned)
cleaned = re.sub(r"\s+", " ", cleaned).strip()
return cleaned
@ -131,15 +124,9 @@ if AIORTC_AVAILABLE:
self._timestamp = 0
self._resample_state = None
self._resample_source_rate: int | None = None
self._lead_in_ms = max(
0, int(os.getenv("HOST_RTC_OUTBOUND_LEAD_IN_MS", "120"))
)
self._lead_in_frames = (
self._lead_in_ms + self._frame_ms - 1
) // self._frame_ms
self._lead_in_idle_s = max(
0.1, float(os.getenv("HOST_RTC_OUTBOUND_IDLE_S", "0.6"))
)
self._lead_in_ms = max(0, int(os.getenv("HOST_RTC_OUTBOUND_LEAD_IN_MS", "120")))
self._lead_in_frames = (self._lead_in_ms + self._frame_ms - 1) // self._frame_ms
self._lead_in_idle_s = max(0.1, float(os.getenv("HOST_RTC_OUTBOUND_IDLE_S", "0.6")))
self._last_enqueue_at = 0.0
self._closed = False
self._frame_duration_s = frame_ms / 1000.0
@ -154,9 +141,7 @@ if AIORTC_AVAILABLE:
)
self._on_playing_changed: Callable[[bool], None] | None = None
async def enqueue_pcm(
self, pcm: bytes, sample_rate: int, channels: int = 1
) -> None:
async def enqueue_pcm(self, pcm: bytes, sample_rate: int, channels: int = 1) -> None:
if self._closed or not pcm:
return
@ -244,9 +229,7 @@ if AIORTC_AVAILABLE:
self._last_recv_at = loop.time()
frame = AudioFrame(
format="s16", layout="mono", samples=self._samples_per_frame
)
frame = AudioFrame(format="s16", layout="mono", samples=self._samples_per_frame)
frame.planes[0].update(payload)
frame.sample_rate = self._sample_rate
frame.time_base = Fraction(1, self._sample_rate)
@ -263,9 +246,7 @@ else:
class QueueAudioTrack: # pragma: no cover - used only when aiortc is unavailable
_on_playing_changed: Callable[[bool], None] | None = None
async def enqueue_pcm(
self, pcm: bytes, sample_rate: int, channels: int = 1
) -> None:
async def enqueue_pcm(self, pcm: bytes, sample_rate: int, channels: int = 1) -> None:
return
def stop(self) -> None:
@ -296,23 +277,17 @@ class CommandSpeechToText:
) -> str | None:
if not self.enabled or not pcm:
return None
return await asyncio.to_thread(
self._transcribe_blocking, pcm, sample_rate, channels
)
return await asyncio.to_thread(self._transcribe_blocking, pcm, sample_rate, channels)
def unavailable_reason(self) -> str:
if not self._command_template:
return "HOST_STT_COMMAND is not configured."
return "HOST_STT_COMMAND failed to produce transcript."
def _transcribe_blocking(
self, pcm: bytes, sample_rate: int, channels: int
) -> str | None:
def _transcribe_blocking(self, pcm: bytes, sample_rate: int, channels: int) -> str | None:
tmp_path: str | None = None
try:
tmp_path = _write_temp_wav(
pcm=pcm, sample_rate=sample_rate, channels=channels
)
tmp_path = _write_temp_wav(pcm=pcm, sample_rate=sample_rate, channels=channels)
command = self._command_template
if "{input_wav}" in command:
@ -343,9 +318,7 @@ class FasterWhisperSpeechToText:
def __init__(self) -> None:
self._model_name = os.getenv("HOST_STT_MODEL", "tiny.en").strip() or "tiny.en"
self._device = os.getenv("HOST_STT_DEVICE", "auto").strip() or "auto"
self._compute_type = (
os.getenv("HOST_STT_COMPUTE_TYPE", "int8").strip() or "int8"
)
self._compute_type = os.getenv("HOST_STT_COMPUTE_TYPE", "int8").strip() or "int8"
self._language = os.getenv("HOST_STT_LANGUAGE", "en").strip()
self._beam_size = max(1, int(os.getenv("HOST_STT_BEAM_SIZE", "1")))
self._best_of = max(1, int(os.getenv("HOST_STT_BEST_OF", "1")))
@ -357,12 +330,8 @@ class FasterWhisperSpeechToText:
"off",
}
self._temperature = float(os.getenv("HOST_STT_TEMPERATURE", "0.0"))
self._log_prob_threshold = float(
os.getenv("HOST_STT_LOG_PROB_THRESHOLD", "-1.0")
)
self._no_speech_threshold = float(
os.getenv("HOST_STT_NO_SPEECH_THRESHOLD", "0.6")
)
self._log_prob_threshold = float(os.getenv("HOST_STT_LOG_PROB_THRESHOLD", "-1.0"))
self._no_speech_threshold = float(os.getenv("HOST_STT_NO_SPEECH_THRESHOLD", "0.6"))
self._compression_ratio_threshold = float(
os.getenv("HOST_STT_COMPRESSION_RATIO_THRESHOLD", "2.4")
)
@ -373,9 +342,7 @@ class FasterWhisperSpeechToText:
).strip()
or None
)
self._repetition_penalty = float(
os.getenv("HOST_STT_REPETITION_PENALTY", "1.0")
)
self._repetition_penalty = float(os.getenv("HOST_STT_REPETITION_PENALTY", "1.0"))
raw_hallucination_threshold = os.getenv(
"HOST_STT_HALLUCINATION_SILENCE_THRESHOLD", ""
).strip()
@ -401,9 +368,7 @@ class FasterWhisperSpeechToText:
if not self.enabled or not pcm:
return None
async with self._lock:
return await asyncio.to_thread(
self._transcribe_blocking, pcm, sample_rate, channels
)
return await asyncio.to_thread(self._transcribe_blocking, pcm, sample_rate, channels)
async def warmup(self) -> None:
if not self.enabled:
@ -428,15 +393,11 @@ class FasterWhisperSpeechToText:
self._init_error = str(exc)
self._model = None
def _transcribe_blocking(
self, pcm: bytes, sample_rate: int, channels: int
) -> str | None:
def _transcribe_blocking(self, pcm: bytes, sample_rate: int, channels: int) -> str | None:
self._initialize_blocking()
if self._model is None:
if self._init_error:
raise RuntimeError(
f"faster-whisper initialization failed: {self._init_error}"
)
raise RuntimeError(f"faster-whisper initialization failed: {self._init_error}")
return None
if NUMPY_AVAILABLE and np is not None:
@ -481,9 +442,7 @@ class FasterWhisperSpeechToText:
tmp_path: str | None = None
try:
tmp_path = _write_temp_wav(
pcm=pcm, sample_rate=sample_rate, channels=channels
)
tmp_path = _write_temp_wav(pcm=pcm, sample_rate=sample_rate, channels=channels)
segments, _info = self._model.transcribe(
tmp_path,
language=self._language or None,
@ -580,20 +539,14 @@ class HostSpeechToText:
class SupertonicTextToSpeech:
def __init__(self) -> None:
self._model = (
os.getenv("SUPERTONIC_MODEL", "supertonic-2").strip() or "supertonic-2"
)
self._voice_style_name = (
os.getenv("SUPERTONIC_VOICE_STYLE", "F1").strip() or "F1"
)
self._model = os.getenv("SUPERTONIC_MODEL", "supertonic-2").strip() or "supertonic-2"
self._voice_style_name = os.getenv("SUPERTONIC_VOICE_STYLE", "F1").strip() or "F1"
self._lang = os.getenv("SUPERTONIC_LANG", "en").strip() or "en"
self._total_steps = int(os.getenv("SUPERTONIC_TOTAL_STEPS", "4"))
self._speed = float(os.getenv("SUPERTONIC_SPEED", "1.5"))
self._intra_op_num_threads = _optional_int_env("SUPERTONIC_INTRA_OP_THREADS")
self._inter_op_num_threads = _optional_int_env("SUPERTONIC_INTER_OP_THREADS")
self._auto_download = os.getenv(
"SUPERTONIC_AUTO_DOWNLOAD", "1"
).strip() not in {
self._auto_download = os.getenv("SUPERTONIC_AUTO_DOWNLOAD", "1").strip() not in {
"0",
"false",
"False",
@ -608,9 +561,7 @@ class SupertonicTextToSpeech:
@property
def enabled(self) -> bool:
return (
SUPERTONIC_TTS_AVAILABLE and SupertonicTTS is not None and NUMPY_AVAILABLE
)
return SUPERTONIC_TTS_AVAILABLE and SupertonicTTS is not None and NUMPY_AVAILABLE
@property
def init_error(self) -> str | None:
@ -723,9 +674,7 @@ class SupertonicTextToSpeech:
class HostTextToSpeech:
def __init__(self) -> None:
provider = (
os.getenv("HOST_TTS_PROVIDER", "supertonic").strip() or "supertonic"
).lower()
provider = (os.getenv("HOST_TTS_PROVIDER", "supertonic").strip() or "supertonic").lower()
if provider not in {"supertonic", "command", "espeak", "auto"}:
provider = "auto"
self._provider = provider
@ -770,9 +719,7 @@ class HostTextToSpeech:
if not self._supertonic.enabled:
return "supertonic package is not available."
if self._supertonic.init_error:
return (
f"supertonic initialization failed: {self._supertonic.init_error}"
)
return f"supertonic initialization failed: {self._supertonic.init_error}"
return "supertonic did not return audio."
if self._provider == "command":
return "HOST_TTS_COMMAND is not configured."
@ -797,13 +744,9 @@ class HostTextToSpeech:
if "{output_wav}" in command:
tmp_path: str | None = None
try:
with tempfile.NamedTemporaryFile(
suffix=".wav", delete=False
) as tmp_file:
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
tmp_path = tmp_file.name
command_with_output = command.replace(
"{output_wav}", shlex.quote(tmp_path)
)
command_with_output = command.replace("{output_wav}", shlex.quote(tmp_path))
result = subprocess.run(
command_with_output,
shell=True,
@ -872,9 +815,7 @@ SendJsonCallable = Callable[[dict[str, Any]], Awaitable[None]]
class WebRTCVoiceSession:
def __init__(
self, gateway: "SuperTonicGateway", send_json: SendJsonCallable
) -> None:
def __init__(self, gateway: "SuperTonicGateway", send_json: SendJsonCallable) -> None:
self._gateway = gateway
self._send_json = send_json
@ -886,9 +827,7 @@ class WebRTCVoiceSession:
self._stt = HostSpeechToText()
self._tts = HostTextToSpeech()
self._stt_segment_queue_size = max(
1, int(os.getenv("HOST_STT_SEGMENT_QUEUE_SIZE", "2"))
)
self._stt_segment_queue_size = max(1, int(os.getenv("HOST_STT_SEGMENT_QUEUE_SIZE", "2")))
self._stt_segments: asyncio.Queue[bytes] = asyncio.Queue(
maxsize=self._stt_segment_queue_size
)
@ -913,11 +852,7 @@ class WebRTCVoiceSession:
self._stt_min_ptt_ms = max(
120,
int(
os.getenv(
"HOST_STT_MIN_PTT_MS", os.getenv("HOST_STT_MIN_SEGMENT_MS", "220")
)
),
int(os.getenv("HOST_STT_MIN_PTT_MS", os.getenv("HOST_STT_MIN_SEGMENT_MS", "220"))),
)
self._stt_suppress_during_tts = os.getenv(
@ -973,9 +908,7 @@ class WebRTCVoiceSession:
sdp = str(payload.get("sdp", "")).strip()
rtc_type = str(payload.get("rtcType", "offer")).strip() or "offer"
if not sdp:
await self._send_json(
{"type": "rtc-error", "message": "Missing SDP offer payload."}
)
await self._send_json({"type": "rtc-error", "message": "Missing SDP offer payload."})
return
await self._close_peer_connection()
@ -1009,9 +942,7 @@ class WebRTCVoiceSession:
name="voice-inbound-track",
)
await peer_connection.setRemoteDescription(
RTCSessionDescription(sdp=sdp, type=rtc_type)
)
await peer_connection.setRemoteDescription(RTCSessionDescription(sdp=sdp, type=rtc_type))
await self._drain_pending_ice_candidates(peer_connection)
answer = await peer_connection.createAnswer()
await peer_connection.setLocalDescription(answer)
@ -1021,10 +952,7 @@ class WebRTCVoiceSession:
sdp_answer = str(local_description.sdp or "")
if sdp_answer:
sdp_answer = (
sdp_answer.replace("\r\n", "\n")
.replace("\r", "\n")
.strip()
.replace("\n", "\r\n")
sdp_answer.replace("\r\n", "\n").replace("\r", "\n").strip().replace("\n", "\r\n")
+ "\r\n"
)
await self._send_json(
@ -1036,15 +964,9 @@ class WebRTCVoiceSession:
)
if self._stt.enabled and not self._stt_worker_task:
self._stt_worker_task = asyncio.create_task(
self._stt_worker(), name="voice-stt-worker"
)
if self._stt.enabled and (
self._stt_warmup_task is None or self._stt_warmup_task.done()
):
self._stt_warmup_task = asyncio.create_task(
self._warmup_stt(), name="voice-stt-warmup"
)
self._stt_worker_task = asyncio.create_task(self._stt_worker(), name="voice-stt-worker")
if self._stt.enabled and (self._stt_warmup_task is None or self._stt_warmup_task.done()):
self._stt_warmup_task = asyncio.create_task(self._warmup_stt(), name="voice-stt-warmup")
elif not self._stt.enabled and not self._stt_unavailable_notice_sent:
self._stt_unavailable_notice_sent = True
await self._publish_system(
@ -1103,9 +1025,7 @@ class WebRTCVoiceSession:
candidate = candidate_from_sdp(candidate_sdp)
candidate.sdpMid = raw_candidate.get("sdpMid")
line_index = raw_candidate.get("sdpMLineIndex")
candidate.sdpMLineIndex = (
int(line_index) if line_index is not None else None
)
candidate.sdpMLineIndex = int(line_index) if line_index is not None else None
await peer_connection.addIceCandidate(candidate)
except Exception as exc:
await self._publish_system(f"Failed to add ICE candidate: {exc}")
@ -1147,9 +1067,7 @@ class WebRTCVoiceSession:
if self._tts_flush_handle:
self._tts_flush_handle.cancel()
loop = asyncio.get_running_loop()
self._tts_flush_handle = loop.call_later(
max(0.05, delay_s), self._schedule_tts_flush
)
self._tts_flush_handle = loop.call_later(max(0.05, delay_s), self._schedule_tts_flush)
async def _flush_tts(self) -> None:
async with self._tts_flush_lock:
@ -1230,9 +1148,7 @@ class WebRTCVoiceSession:
try:
while True:
frame = await track.recv()
pcm16, frame_ms, resample_state = self._frame_to_pcm16k_mono(
frame, resample_state
)
pcm16, frame_ms, resample_state = self._frame_to_pcm16k_mono(frame, resample_state)
if not pcm16:
continue
@ -1249,10 +1165,9 @@ class WebRTCVoiceSession:
f"time_base={getattr(frame, 'time_base', None)}."
)
if (
self._stt_suppress_during_tts
and asyncio.get_running_loop().time() < self._stt_suppress_until
):
loop = asyncio.get_running_loop()
if self._stt_suppress_during_tts and loop.time() < self._stt_suppress_until:
recording = False
recording_started_at = 0.0
segment_ms = 0.0
@ -1262,7 +1177,7 @@ class WebRTCVoiceSession:
if self._ptt_pressed:
if not recording:
recording = True
recording_started_at = asyncio.get_running_loop().time()
recording_started_at = loop.time()
segment_ms = 0.0
segment_buffer = bytearray()
@ -1273,8 +1188,7 @@ class WebRTCVoiceSession:
if recording:
observed_duration_ms = max(
1.0,
(asyncio.get_running_loop().time() - recording_started_at)
* 1000.0,
(loop.time() - recording_started_at) * 1000.0,
)
await self._finalize_ptt_segment(
bytes(segment_buffer),
@ -1285,6 +1199,7 @@ class WebRTCVoiceSession:
recording_started_at = 0.0
segment_ms = 0.0
segment_buffer = bytearray()
except asyncio.CancelledError:
raise
except Exception as exc:
@ -1294,9 +1209,7 @@ class WebRTCVoiceSession:
f"Voice input stream ended ({exc.__class__.__name__}): {details}"
)
else:
await self._publish_system(
f"Voice input stream ended ({exc.__class__.__name__})."
)
await self._publish_system(f"Voice input stream ended ({exc.__class__.__name__}).")
finally:
if recording and segment_ms >= self._stt_min_ptt_ms:
observed_duration_ms = max(
@ -1355,9 +1268,7 @@ class WebRTCVoiceSession:
f"(estimated source={nearest_source_rate}Hz)."
)
await self._enqueue_stt_segment(
pcm16=normalized_pcm, duration_ms=normalized_duration_ms
)
await self._enqueue_stt_segment(pcm16=normalized_pcm, duration_ms=normalized_duration_ms)
async def _enqueue_stt_segment(self, pcm16: bytes, duration_ms: float) -> None:
if duration_ms < self._stt_min_ptt_ms:
@ -1368,13 +1279,9 @@ class WebRTCVoiceSession:
self._stt_segments.get_nowait()
now = asyncio.get_running_loop().time()
if (
now - self._last_stt_backlog_notice_at
) >= self._stt_backlog_notice_interval_s:
if (now - self._last_stt_backlog_notice_at) >= self._stt_backlog_notice_interval_s:
self._last_stt_backlog_notice_at = now
await self._publish_system(
"Voice input backlog detected; dropping stale segment."
)
await self._publish_system("Voice input backlog detected; dropping stale segment.")
with contextlib.suppress(asyncio.QueueFull):
self._stt_segments.put_nowait(pcm16)
@ -1384,9 +1291,7 @@ class WebRTCVoiceSession:
pcm16 = await self._stt_segments.get()
if not self._stt_first_segment_notice_sent:
self._stt_first_segment_notice_sent = True
await self._publish_system(
"Push-to-talk audio captured. Running host STT..."
)
await self._publish_system("Push-to-talk audio captured. Running host STT...")
try:
transcript = await self._stt.transcribe_pcm(
pcm=pcm16,
@ -1478,11 +1383,7 @@ class WebRTCVoiceSession:
except TypeError:
pcm = frame.to_ndarray()
if (
NUMPY_AVAILABLE
and np is not None
and getattr(pcm, "dtype", None) is not None
):
if NUMPY_AVAILABLE and np is not None and getattr(pcm, "dtype", None) is not None:
if pcm.dtype != np.int16:
if np.issubdtype(pcm.dtype, np.floating):
pcm = np.clip(pcm, -1.0, 1.0)
@ -1521,9 +1422,7 @@ class WebRTCVoiceSession:
else:
frames_channels = pcm.reshape(-1, 1)
channel_count = (
int(frames_channels.shape[1]) if frames_channels.ndim == 2 else 1
)
channel_count = int(frames_channels.shape[1]) if frames_channels.ndim == 2 else 1
if channel_count <= 1:
mono = frames_channels.reshape(-1).tobytes()
elif NUMPY_AVAILABLE and np is not None:
@ -1537,9 +1436,7 @@ class WebRTCVoiceSession:
else:
return b"", 0.0, resample_state
source_rate = int(
getattr(frame, "sample_rate", 0) or getattr(frame, "rate", 0) or 0
)
source_rate = int(getattr(frame, "sample_rate", 0) or getattr(frame, "rate", 0) or 0)
time_base = getattr(frame, "time_base", None)
tb_rate = 0