api channel and tools

This commit is contained in:
kacper 2026-03-05 15:10:14 -05:00
parent 9222c59f03
commit 3816a9627e
4 changed files with 684 additions and 582 deletions

21
app.py
View file

@ -5,7 +5,7 @@ from pathlib import Path
from typing import Any, Awaitable, Callable from typing import Any, Awaitable, Callable
from fastapi import FastAPI, WebSocket, WebSocketDisconnect from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.responses import FileResponse, JSONResponse from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
from fastapi.staticfiles import StaticFiles from fastapi.staticfiles import StaticFiles
from supertonic_gateway import SuperTonicGateway from supertonic_gateway import SuperTonicGateway
@ -28,8 +28,9 @@ async def health() -> JSONResponse:
@app.get("/") @app.get("/")
async def index() -> FileResponse: async def index() -> HTMLResponse:
return FileResponse(INDEX_PATH) html = INDEX_PATH.read_text(encoding="utf-8")
return HTMLResponse(content=html)
@app.websocket("/ws/chat") @app.websocket("/ws/chat")
@ -65,18 +66,24 @@ async def websocket_chat(websocket: WebSocket) -> None:
elif msg_type == "rtc-ice-candidate": elif msg_type == "rtc-ice-candidate":
await voice_session.handle_ice_candidate(message) await voice_session.handle_ice_candidate(message)
elif msg_type == "voice-ptt": elif msg_type == "voice-ptt":
voice_session.set_push_to_talk_pressed( voice_session.set_push_to_talk_pressed(bool(message.get("pressed", False)))
bool(message.get("pressed", False))
)
elif msg_type == "user-message": elif msg_type == "user-message":
await gateway.send_user_message(str(message.get("text", ""))) await gateway.send_user_message(str(message.get("text", "")))
elif msg_type == "ui-response":
await gateway.send_ui_response(
str(message.get("request_id", "")),
str(message.get("value", "")),
)
elif msg_type == "command":
await gateway.send_command(str(message.get("command", "")))
else: else:
await safe_send_json( await safe_send_json(
{ {
"role": "system", "role": "system",
"text": ( "text": (
"Unknown message type. Use spawn, stop, rtc-offer, " "Unknown message type. Use spawn, stop, rtc-offer, "
"rtc-ice-candidate, voice-ptt, or user-message." "rtc-ice-candidate, voice-ptt, user-message, "
"ui-response, or command."
), ),
"timestamp": "", "timestamp": "",
} }

View file

@ -16,7 +16,7 @@
width: 100%; width: 100%;
height: 100%; height: 100%;
overflow: hidden; overflow: hidden;
background: #1a1510; background: #ffffff;
touch-action: none; touch-action: none;
} }
#log { #log {
@ -31,7 +31,7 @@
font-family: "SF Mono", ui-monospace, Menlo, Consolas, monospace; font-family: "SF Mono", ui-monospace, Menlo, Consolas, monospace;
font-size: 12px; font-size: 12px;
line-height: 1.6; line-height: 1.6;
color: rgba(255, 245, 235, 0.35); color: rgba(30, 20, 10, 0.35);
white-space: pre-wrap; white-space: pre-wrap;
word-break: break-word; word-break: break-word;
display: flex; display: flex;
@ -45,8 +45,8 @@
mask-image: linear-gradient(to top, black 55%, transparent 100%); mask-image: linear-gradient(to top, black 55%, transparent 100%);
} }
#log:hover { #log:hover {
color: rgba(255, 245, 235, 0.92); color: rgba(30, 20, 10, 0.85);
background: rgba(0, 0, 0, 0.18); background: rgba(0, 0, 0, 0.06);
-webkit-mask-image: none; -webkit-mask-image: none;
mask-image: none; mask-image: none;
} }
@ -62,17 +62,17 @@
margin-bottom: 4px; margin-bottom: 4px;
} }
.line.user { .line.user {
color: rgba(255, 255, 255, 0.9); color: rgba(20, 10, 0, 0.85);
} }
.line.system { .line.system {
color: rgba(255, 220, 180, 0.5); color: rgba(120, 80, 40, 0.5);
} }
.line.wisper { .line.wisper {
color: rgba(255, 200, 160, 0.4); color: rgba(120, 80, 40, 0.4);
} }
#log:hover .line.user { color: rgba(255, 255, 255, 1.0); } #log:hover .line.user { color: rgba(20, 10, 0, 1.0); }
#log:hover .line.system { color: rgba(255, 220, 180, 0.85); } #log:hover .line.system { color: rgba(120, 80, 40, 0.85); }
#log:hover .line.wisper { color: rgba(255, 200, 160, 0.75); } #log:hover .line.wisper { color: rgba(120, 80, 40, 0.75); }
#voiceStatus { #voiceStatus {
position: fixed; position: fixed;
bottom: 12px; bottom: 12px;
@ -119,11 +119,14 @@
border-radius: 24px; border-radius: 24px;
box-shadow: 0 8px 32px rgba(0, 0, 0, 0.25), 4px 4px 0px rgba(0,0,0,0.15); box-shadow: 0 8px 32px rgba(0, 0, 0, 0.25), 4px 4px 0px rgba(0,0,0,0.15);
overflow: hidden; overflow: hidden;
pointer-events: auto;
cursor: pointer;
} }
#agentViz canvas { #agentViz canvas {
width: 100% !important; width: 100% !important;
height: 100% !important; height: 100% !important;
display: block; display: block;
pointer-events: auto;
} }
#agentIndicator .label { #agentIndicator .label {
display: none; display: none;
@ -140,10 +143,6 @@
#agentIndicator.speaking { #agentIndicator.speaking {
color: #8b4513; color: #8b4513;
} }
/* Deepen the background while PTT is active */
body.ptt-active {
background: radial-gradient(ellipse at 50% 44%, #f2caa8 0%, #e8b898 100%);
}
#controls { #controls {
position: fixed; position: fixed;
top: 12px; top: 12px;
@ -167,20 +166,236 @@
transform: translateY(1px); transform: translateY(1px);
box-shadow: 0 1px 4px rgba(0, 0, 0, 0.15); box-shadow: 0 1px 4px rgba(0, 0, 0, 0.15);
} }
/* Toast notifications */
#toast-container {
position: fixed;
top: 16px;
left: 50%;
transform: translateX(-50%);
width: min(92vw, 480px);
max-height: calc(100vh - 32px);
overflow-y: auto;
overflow-x: hidden;
display: flex;
flex-direction: column;
gap: 10px;
z-index: 100;
pointer-events: auto;
/* Hide scrollbar until hovered */
scrollbar-width: thin;
scrollbar-color: rgba(255,200,140,0.25) transparent;
padding-bottom: 4px;
}
#toast-container::-webkit-scrollbar {
width: 4px;
}
#toast-container::-webkit-scrollbar-track {
background: transparent;
}
#toast-container::-webkit-scrollbar-thumb {
background: rgba(255,200,140,0.25);
border-radius: 2px;
}
.toast {
pointer-events: auto;
background: rgba(28, 22, 16, 0.92);
border: 1px solid rgba(255, 200, 140, 0.18);
border-radius: 12px;
padding: 14px 16px 14px 16px;
display: flex;
flex-direction: column;
gap: 8px;
box-shadow: 0 4px 24px rgba(0, 0, 0, 0.45);
animation: toast-in 0.22s cubic-bezier(0.34, 1.4, 0.64, 1) both;
position: relative;
overflow: hidden;
max-width: 100%;
}
.toast.dismissing {
animation: toast-out 0.18s ease-in both;
}
@keyframes toast-in {
from { opacity: 0; transform: translateY(-14px) scale(0.96); }
to { opacity: 1; transform: translateY(0) scale(1); }
}
@keyframes toast-out {
from { opacity: 1; transform: translateY(0) scale(1); }
to { opacity: 0; transform: translateY(-10px) scale(0.96); }
}
.toast-progress {
position: absolute;
bottom: 0;
left: 0;
height: 2px;
background: rgba(255, 190, 120, 0.55);
width: 100%;
transform-origin: left;
animation: toast-progress-shrink linear both;
}
@keyframes toast-progress-shrink {
from { transform: scaleX(1); }
to { transform: scaleX(0); }
}
.toast-header {
display: flex;
justify-content: space-between;
align-items: flex-start;
gap: 10px;
}
.toast-title {
font-family: "SF Mono", ui-monospace, Menlo, Consolas, monospace;
font-size: 11px;
font-weight: 600;
letter-spacing: 0.07em;
color: rgba(255, 200, 140, 0.85);
text-transform: uppercase;
flex: 1;
min-width: 0;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
}
.toast-close {
background: none;
border: none;
color: rgba(255, 245, 235, 0.35);
font-size: 16px;
line-height: 1;
cursor: pointer;
padding: 0 2px;
flex-shrink: 0;
transition: color 0.15s;
}
.toast-close:hover {
color: rgba(255, 245, 235, 0.85);
}
.toast-body {
font-family: "SF Mono", ui-monospace, Menlo, Consolas, monospace;
font-size: 12px;
line-height: 1.65;
color: rgba(255, 245, 235, 0.82);
white-space: normal;
word-break: break-word;
user-select: text;
-webkit-user-select: text;
}
.toast-body p { margin: 0 0 6px; }
.toast-body p:last-child { margin-bottom: 0; }
.toast-body h1, .toast-body h2, .toast-body h3,
.toast-body h4, .toast-body h5, .toast-body h6 {
font-size: 13px;
font-weight: 700;
color: rgba(255, 200, 140, 0.95);
margin: 8px 0 4px;
}
.toast-body ul, .toast-body ol {
margin: 4px 0 6px;
padding-left: 18px;
}
.toast-body li { margin-bottom: 2px; }
.toast-body code {
background: rgba(255,255,255,0.07);
border-radius: 4px;
padding: 1px 5px;
font-size: 11px;
}
.toast-body pre {
background: rgba(0,0,0,0.35);
border-radius: 6px;
padding: 8px 10px;
overflow-x: auto;
margin: 6px 0;
}
.toast-body pre code {
background: none;
padding: 0;
font-size: 11px;
}
.toast-body table {
border-collapse: collapse;
width: 100%;
font-size: 11px;
margin: 6px 0;
}
.toast-body th, .toast-body td {
border: 1px solid rgba(255,200,140,0.2);
padding: 4px 8px;
text-align: left;
}
.toast-body th {
background: rgba(255,200,140,0.08);
color: rgba(255,200,140,0.9);
font-weight: 600;
}
.toast-body a {
color: rgba(255,200,140,0.85);
text-decoration: underline;
}
.toast-body blockquote {
border-left: 3px solid rgba(255,200,140,0.3);
margin: 6px 0;
padding-left: 10px;
color: rgba(255,245,235,0.55);
}
.toast-body hr {
border: none;
border-top: 1px solid rgba(255,200,140,0.15);
margin: 8px 0;
}
.toast-choices {
display: flex;
flex-wrap: wrap;
gap: 8px;
margin-top: 4px;
}
.toast-choice-btn {
background: rgba(255, 200, 140, 0.12);
border: 1px solid rgba(255, 200, 140, 0.35);
border-radius: 8px;
color: rgba(255, 245, 235, 0.90);
font-family: "SF Mono", ui-monospace, Menlo, Consolas, monospace;
font-size: 12px;
padding: 6px 14px;
cursor: pointer;
transition: background 0.15s, border-color 0.15s;
flex: 1 1 auto;
text-align: center;
}
.toast-choice-btn:hover {
background: rgba(255, 200, 140, 0.25);
border-color: rgba(255, 200, 140, 0.65);
}
.toast-choice-btn:active {
background: rgba(255, 200, 140, 0.38);
}
.toast-choice-btn:disabled {
opacity: 0.4;
cursor: default;
}
.toast-image {
width: 100%;
max-height: 320px;
object-fit: contain;
border-radius: 8px;
display: block;
}
</style> </style>
</head> </head>
<body> <body>
<div id="controls" data-no-ptt="1"> <div id="controls">
<button id="resetSessionBtn" class="control-btn" type="button" data-no-ptt="1">Reset</button> <button id="resetSessionBtn" class="control-btn" type="button">Reset</button>
</div> </div>
<div id="log"><div id="log-inner"></div></div> <div id="log"><div id="log-inner"></div></div>
<div id="agentIndicator"> <div id="agentIndicator" data-ptt="1">
<div id="agentViz"></div> <div id="agentViz" data-ptt="1"></div>
<span class="label"></span> <span class="label"></span>
</div> </div>
<div id="voiceStatus"></div> <div id="voiceStatus"></div>
<div id="toast-container"></div>
<audio id="remoteAudio" autoplay playsinline hidden></audio> <audio id="remoteAudio" autoplay playsinline hidden></audio>
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
<script src="/static/three.min.js"></script> <script src="/static/three.min.js"></script>
<script> <script>
const logEl = document.getElementById("log-inner"); const logEl = document.getElementById("log-inner");
@ -190,15 +405,134 @@
const agentVizEl = document.getElementById("agentViz"); const agentVizEl = document.getElementById("agentViz");
const agentLabel = agentIndicator.querySelector(".label"); const agentLabel = agentIndicator.querySelector(".label");
const resetSessionBtn = document.getElementById("resetSessionBtn"); const resetSessionBtn = document.getElementById("resetSessionBtn");
const toastContainer = document.getElementById("toast-container");
// --- Toast notifications ---
const showToast = (kind, content, title, durationMs) => {
const toast = document.createElement("div");
toast.className = "toast";
// Header row (title + close button)
const header = document.createElement("div");
header.className = "toast-header";
if (title) {
const titleEl = document.createElement("span");
titleEl.className = "toast-title";
titleEl.textContent = title;
header.appendChild(titleEl);
}
const closeBtn = document.createElement("button");
closeBtn.className = "toast-close";
closeBtn.setAttribute("type", "button");
closeBtn.setAttribute("aria-label", "Dismiss");
closeBtn.textContent = "×";
header.appendChild(closeBtn);
toast.appendChild(header);
// Body
if (kind === "image") {
const img = document.createElement("img");
img.className = "toast-image";
img.src = content;
img.alt = title || "image";
toast.appendChild(img);
} else {
const body = document.createElement("div");
body.className = "toast-body";
// If content looks like HTML, inject directly; otherwise render as markdown.
const looksLikeHtml = /^\s*<[a-zA-Z]/.test(content);
if (looksLikeHtml) {
body.innerHTML = content;
} else if (typeof marked !== "undefined") {
body.innerHTML = marked.parse(content);
} else {
body.textContent = content;
}
toast.appendChild(body);
}
// dismiss must be declared before close button references it
const dismiss = () => {
toast.classList.add("dismissing");
const fallback = setTimeout(() => toast.remove(), 400);
toast.addEventListener("animationend", () => { clearTimeout(fallback); toast.remove(); }, { once: true });
};
closeBtn.addEventListener("click", (e) => { e.stopPropagation(); dismiss(); });
toastContainer.prepend(toast);
toastContainer.scrollTop = 0;
};
// --- Choice toasts (ask_user tool) ---
const showChoice = (requestId, question, choices, title) => {
const toast = document.createElement("div");
toast.className = "toast";
// Header
const header = document.createElement("div");
header.className = "toast-header";
if (title) {
const titleEl = document.createElement("span");
titleEl.className = "toast-title";
titleEl.textContent = title;
header.appendChild(titleEl);
}
const closeBtn = document.createElement("button");
closeBtn.className = "toast-close";
closeBtn.setAttribute("type", "button");
closeBtn.setAttribute("aria-label", "Dismiss");
closeBtn.textContent = "×";
header.appendChild(closeBtn);
toast.appendChild(header);
// Question body
const body = document.createElement("div");
body.className = "toast-body";
body.textContent = question;
toast.appendChild(body);
// Choice buttons
const choicesEl = document.createElement("div");
choicesEl.className = "toast-choices";
const dismiss = () => {
toast.classList.add("dismissing");
const fallback = setTimeout(() => toast.remove(), 400);
toast.addEventListener("animationend", () => { clearTimeout(fallback); toast.remove(); }, { once: true });
};
choices.forEach((label) => {
const btn = document.createElement("button");
btn.className = "toast-choice-btn";
btn.setAttribute("type", "button");
btn.textContent = label;
btn.addEventListener("click", (e) => {
e.stopPropagation();
// Disable all buttons to prevent double-send
choicesEl.querySelectorAll(".toast-choice-btn").forEach((b) => { b.disabled = true; });
sendJson({ type: "ui-response", request_id: requestId, value: label });
dismiss();
});
choicesEl.appendChild(btn);
});
toast.appendChild(choicesEl);
closeBtn.addEventListener("click", (e) => { e.stopPropagation(); dismiss(); });
toastContainer.prepend(toast);
toastContainer.scrollTop = 0;
};
// --- Agent state indicator --- // --- Agent state indicator ---
const STATES = { idle: "idle", listening: "listening", thinking: "thinking", speaking: "speaking" }; const STATES = { idle: "idle", listening: "listening", thinking: "thinking", speaking: "speaking" };
const STATE_COLORS = { const STATE_COLORS = {
[STATES.idle]: 0xfff5eb, [STATES.idle]: 0xfff5eb,
[STATES.listening]: 0xfff5eb, [STATES.listening]: 0xfff5eb,
[STATES.thinking]: 0xfff5eb, [STATES.thinking]: 0xfff5eb,
[STATES.speaking]: 0xfff5eb, [STATES.speaking]: 0xfff5eb,
}; };
let agentState = STATES.idle; let agentState = STATES.idle;
let agentVisualizer = null; let agentVisualizer = null;
let lastRemoteAudioActivityS = 0; let lastRemoteAudioActivityS = 0;
@ -248,7 +582,8 @@
powerPreference: "high-performance", powerPreference: "high-performance",
}); });
renderer.setPixelRatio(1); renderer.setPixelRatio(1);
renderer.setClearColor(0xa09b96, 1); renderer.setClearColor(0xe8e4e0, 1);
renderer.domElement.dataset.ptt = "1";
agentVizEl.innerHTML = ""; agentVizEl.innerHTML = "";
agentVizEl.appendChild(renderer.domElement); agentVizEl.appendChild(renderer.domElement);
@ -358,12 +693,12 @@
let deformScale = 1.0; let deformScale = 1.0;
let ringScale = 1.0; // uniform xz scale — used for thickness throb when thinking let ringScale = 1.0; // uniform xz scale — used for thickness throb when thinking
let spinSpeed = 0.0; let spinSpeed = 0.0;
// Card background colour lerp: 0 = idle coral, 1 = dark listening // Card background colour lerp: 0 = idle coral, 1 = dark coral (PTT/listening)
let cardColorT = 0.0; let cardColorT = 0.0;
let connectedT = 0.0; // 0 = gray (disconnected), 1 = coral (connected) let connectedT = 0.0; // 0 = gray (disconnected), 1 = coral (connected)
const CARD_GRAY_RGB = [160, 155, 150]; // disconnected gray const CARD_GRAY_RGB = [232, 228, 224]; // #e8e4e0 — disconnected light warm gray
const CARD_IDLE_RGB = [212, 85, 63]; // #d4553f const CARD_IDLE_RGB = [212, 85, 63]; // #d4553f — connected idle coral
const CARD_LISTEN_RGB = [120, 40, 28]; // dark desaturated coral const CARD_LISTEN_RGB = [120, 40, 28]; // #782c1c — PTT active dark coral
const setStateColor = (_state) => { /* no-op: MeshBasicMaterial, colour is fixed */ }; const setStateColor = (_state) => { /* no-op: MeshBasicMaterial, colour is fixed */ };
@ -696,7 +1031,6 @@
const setPushToTalkState = (pressed, notifyServer = true) => { const setPushToTalkState = (pressed, notifyServer = true) => {
pttPressed = pressed; pttPressed = pressed;
document.body.classList.toggle("ptt-active", pressed);
setMicCaptureEnabled(pressed); setMicCaptureEnabled(pressed);
if (notifyServer && ws.readyState === WebSocket.OPEN) { if (notifyServer && ws.readyState === WebSocket.OPEN) {
ws.send(JSON.stringify({ type: "voice-ptt", pressed })); ws.send(JSON.stringify({ type: "voice-ptt", pressed }));
@ -947,26 +1281,27 @@
if (!appStarted) { if (!appStarted) {
await bootstrap(); await bootstrap();
} }
if (sendUserMessage("/reset")) { if (ws.readyState === WebSocket.OPEN) {
showStatus("Reset command sent.", 1500); sendJson({ type: "command", command: "reset" });
showStatus("Session reset.", 1500);
} }
}); });
} }
// --- Whole-screen PTT pointer handling --- // --- Center-card PTT pointer handling ---
// Only touches that land on #agentIndicator / #agentViz (data-ptt="1") trigger PTT.
// We track active pointer IDs so multi-touch doesn't double-fire. // We track active pointer IDs so multi-touch doesn't double-fire.
const activePointers = new Set(); const activePointers = new Set();
document.addEventListener("pointerdown", async (event) => { document.addEventListener("pointerdown", async (event) => {
if (event.target instanceof Element && event.target.closest("[data-no-ptt='1']")) { if (!(event.target instanceof Element) || !event.target.closest("[data-ptt='1']")) {
return; return;
} }
activePointers.add(event.pointerId);
if (!appStarted) { if (!appStarted) {
await bootstrap(); await bootstrap();
return;
} }
ensureVisualizerAudioMeter(); ensureVisualizerAudioMeter();
activePointers.add(event.pointerId);
if (activePointers.size === 1) beginPushToTalk(); if (activePointers.size === 1) beginPushToTalk();
}, { passive: false }); }, { passive: false });
@ -1020,6 +1355,30 @@
if (agentState !== STATES.listening && STATES[newState]) { if (agentState !== STATES.listening && STATES[newState]) {
setAgentState(newState); setAgentState(newState);
} }
} else if (msg.role === "toast") {
try {
const t = JSON.parse(msg.text || "{}");
showToast(
t.kind || "text",
t.content || "",
t.title || "",
typeof t.duration_ms === "number" ? t.duration_ms : 6000,
);
} catch (_) {
showToast("text", msg.text || "", "", 6000);
}
} else if (msg.role === "choice") {
try {
const c = JSON.parse(msg.text || "{}");
showChoice(
c.request_id || "",
c.question || "",
Array.isArray(c.choices) ? c.choices : [],
c.title || "",
);
} catch (_) {
// Malformed choice payload — ignore.
}
} else if (msg.role === "wisper") { } else if (msg.role === "wisper") {
// suppress wisper debug output // suppress wisper debug output
} else { } else {

View file

@ -1,442 +1,270 @@
"""SuperTonic Gateway — nanobot integration for the web UI.
Connects to the already-running nanobot process via a Unix domain socket.
nanobot must be started separately (e.g. ``nanobot gateway``) with the API
channel enabled in its config.
Wire protocol (newline-delimited JSON)
---------------------------------------
Client nanobot::
{"type": "message", "content": "hello", "chat_id": "web"}
{"type": "ping"}
{"type": "ui-response", "request_id": "<uuid>", "value": "Option A", "chat_id": "web"}
{"type": "command", "command": "reset", "chat_id": "web"}
nanobot client::
{"type": "message", "content": "Hi!", "chat_id": "web", "is_progress": false}
{"type": "agent_state", "state": "thinking", "chat_id": "web"}
{"type": "toast", "kind": "text"|"image", "content": "...", "title": "...", "duration_ms": 5000}
{"type": "choice", "request_id": "<uuid>", "question": "...", "choices": ["A", "B"],
"title": "...", "chat_id": "web"}
{"type": "pong"}
{"type": "error", "error": "..."}
The public ``SuperTonicGateway`` interface (``spawn_tui``, ``send_user_message``,
``stop_tui``, ``shutdown``) is unchanged so ``app.py`` and ``voice_rtc.py``
require no modification.
"""
from __future__ import annotations
import asyncio import asyncio
import contextlib import json
import os import os
import pty
import re
import shlex
import signal
import subprocess
import time
from collections import deque
from pathlib import Path from pathlib import Path
from wisper import WisperBus, WisperEvent from wisper import WisperBus, WisperEvent
# Default path — must match nanobot's channels.api.socket_path config value.
ANSI_ESCAPE_RE = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]") DEFAULT_SOCKET_PATH = Path.home() / ".nanobot" / "api.sock"
CONTROL_CHAR_RE = re.compile(r"[\x00-\x08\x0b-\x1f\x7f]")
BRAILLE_SPINNER_RE = re.compile(r"[\u2800-\u28ff]")
SPINNER_ONLY_RE = re.compile(r"^[\s|/\\\-]+$")
BOX_DRAWING_ONLY_RE = re.compile(r"^[\s\u2500-\u257f]+$")
THINKING_LINE_RE = re.compile(
r"\b(?:agent|nanobot|napbot)\b(?:\s+is)?\s+thinking\b",
re.IGNORECASE,
)
USER_ECHO_LINE_RE = re.compile(r"^(?:you|user)\s*:", re.IGNORECASE)
TOOL_STREAM_LINE_RE = re.compile(
r"^(?:tool(?:\s+call|\s+output)?|calling\s+tool|running\s+tool|executing\s+tool)\b",
re.IGNORECASE,
)
LEADING_NON_WORD_RE = re.compile(r"^[^\w]+")
WHITESPACE_RE = re.compile(r"\s+")
AGENT_OUTPUT_PREFIX_RE = re.compile(
r"^(?:nanobot|napbot)\b\s*[:>\-]?\s*", re.IGNORECASE
)
EMOJI_RE = re.compile(
"[" # Common emoji and pictograph blocks.
"\U0001f1e6-\U0001f1ff"
"\U0001f300-\U0001f5ff"
"\U0001f600-\U0001f64f"
"\U0001f680-\U0001f6ff"
"\U0001f700-\U0001f77f"
"\U0001f780-\U0001f7ff"
"\U0001f800-\U0001f8ff"
"\U0001f900-\U0001f9ff"
"\U0001fa00-\U0001faff"
"\u2600-\u26ff"
"\u2700-\u27bf"
"\ufe0f"
"\u200d"
"]"
)
def _clean_output(text: str) -> str: # ---------------------------------------------------------------------------
cleaned = ANSI_ESCAPE_RE.sub("", text) # NanobotApiProcess — connects to the running nanobot via its Unix socket
cleaned = BRAILLE_SPINNER_RE.sub(" ", cleaned) # ---------------------------------------------------------------------------
cleaned = CONTROL_CHAR_RE.sub("", cleaned)
return cleaned.replace("\r", "\n")
def _resolve_nanobot_command_and_workdir() -> tuple[str, Path]: class NanobotApiProcess:
command_override = os.getenv("NANOBOT_COMMAND") """Connects to the running nanobot process via its Unix domain socket.
workdir_override = os.getenv("NANOBOT_WORKDIR")
if workdir_override: Lifecycle
default_workdir = Path(workdir_override).expanduser() ---------
else: ``start()`` opens a connection to nanobot's API socket.
default_workdir = Path.home() ``send()`` writes a user message over the socket.
``stop()`` closes the connection.
"""
if command_override: def __init__(self, bus: WisperBus, socket_path: Path) -> None:
return command_override, default_workdir
nanobot_dir = Path.home() / "nanobot"
nanobot_python_candidates = [
nanobot_dir / ".venv" / "bin" / "python",
nanobot_dir / "venv" / "bin" / "python",
]
for nanobot_venv_python in nanobot_python_candidates:
if nanobot_venv_python.exists():
if not workdir_override:
default_workdir = nanobot_dir
return (
f"{nanobot_venv_python} -m nanobot agent --no-markdown",
default_workdir,
)
return "nanobot agent --no-markdown", default_workdir
def _infer_venv_root(command_parts: list[str], workdir: Path) -> Path | None:
if not command_parts:
return None
binary = Path(command_parts[0]).expanduser()
if (
binary.is_absolute()
and binary.name.startswith("python")
and binary.parent.name == "bin"
):
return binary.parent.parent
for candidate in (workdir / ".venv", workdir / "venv"):
if (candidate / "bin" / "python").exists():
return candidate
return None
def _build_process_env(
command_parts: list[str], workdir: Path
) -> tuple[dict[str, str], Path | None]:
env = os.environ.copy()
env.pop("PYTHONHOME", None)
venv_root = _infer_venv_root(command_parts, workdir)
if not venv_root:
return env, None
venv_bin = str(venv_root / "bin")
path_entries = [entry for entry in env.get("PATH", "").split(os.pathsep) if entry]
path_entries = [entry for entry in path_entries if entry != venv_bin]
path_entries.insert(0, venv_bin)
env["PATH"] = os.pathsep.join(path_entries)
env["VIRTUAL_ENV"] = str(venv_root)
return env, venv_root
class NanobotTUIProcess:
def __init__(self, bus: WisperBus, command: str, workdir: Path) -> None:
self._bus = bus self._bus = bus
self._command = command self._socket_path = socket_path
self._workdir = workdir self._reader: asyncio.StreamReader | None = None
self._process: subprocess.Popen[bytes] | None = None self._writer: asyncio.StreamWriter | None = None
self._master_fd: int | None = None self._read_task: asyncio.Task | None = None
self._read_task: asyncio.Task[None] | None = None
self._pending_output = ""
self._suppress_noisy_ui = os.getenv(
"NANOBOT_SUPPRESS_NOISY_UI", "1"
).strip() not in {
"0",
"false",
"False",
"no",
"off",
}
self._dedup_window_s = max(
0.2, float(os.getenv("NANOBOT_OUTPUT_DEDUP_WINDOW_S", "1.5"))
)
self._recent_lines: deque[tuple[str, float]] = deque()
self._last_tts_line = ""
@property @property
def running(self) -> bool: def running(self) -> bool:
return self._process is not None and self._process.poll() is None return (
self._writer is not None
and not self._writer.is_closing()
and self._read_task is not None
and not self._read_task.done()
)
async def start(self) -> None: async def start(self) -> None:
if self.running: if self.running:
await self._bus.publish( await self._bus.publish(
WisperEvent(role="system", text="Nanobot TUI is already running.") WisperEvent(role="system", text="Already connected to nanobot.")
) )
return return
command_parts = [ if not self._socket_path.exists():
os.path.expandvars(os.path.expanduser(part))
for part in shlex.split(self._command)
]
if not command_parts:
await self._bus.publish(
WisperEvent(role="system", text="NANOBOT_COMMAND is empty.")
)
return
if not self._workdir.exists():
await self._bus.publish(
WisperEvent(
role="system",
text=f"NANOBOT_WORKDIR does not exist: {self._workdir}",
)
)
return
master_fd, slave_fd = pty.openpty()
child_env, child_venv_root = _build_process_env(
command_parts=command_parts, workdir=self._workdir
)
try:
self._process = subprocess.Popen(
command_parts,
stdin=slave_fd,
stdout=slave_fd,
stderr=slave_fd,
cwd=str(self._workdir),
start_new_session=True,
env=child_env,
)
except FileNotFoundError as exc:
os.close(master_fd)
os.close(slave_fd)
await self._bus.publish( await self._bus.publish(
WisperEvent( WisperEvent(
role="system", role="system",
text=( text=(
"Could not start Nanobot process " f"Nanobot API socket not found at {self._socket_path}. "
f"(command='{command_parts[0]}', workdir='{self._workdir}'): {exc}. " "Make sure nanobot is running with the API channel enabled "
"Check NANOBOT_COMMAND and NANOBOT_WORKDIR." "(set channels.api.enabled = true in ~/.nanobot/config.json, "
"then run: nanobot gateway)."
), ),
) )
) )
return return
except Exception as exc:
os.close(master_fd)
os.close(slave_fd)
await self._bus.publish(
WisperEvent(role="system", text=f"Failed to spawn TUI process: {exc}")
)
return
os.close(slave_fd) try:
os.set_blocking(master_fd, False) self._reader, self._writer = await asyncio.open_unix_connection(
self._master_fd = master_fd path=str(self._socket_path)
self._read_task = asyncio.create_task(
self._read_output(), name="nanobot-tui-reader"
)
await self._bus.publish(
WisperEvent(
role="system",
text=f"Spawned Nanobot TUI with command: {' '.join(command_parts)}",
) )
) except OSError as exc:
if child_venv_root:
await self._bus.publish( await self._bus.publish(
WisperEvent( WisperEvent(
role="system", role="system",
text=f"Nanobot runtime venv: {child_venv_root}", text=f"Could not connect to nanobot API socket: {exc}",
) )
) )
return
self._read_task = asyncio.create_task(self._read_loop(), name="nanobot-api-reader")
await self._bus.publish(WisperEvent(role="system", text="Connected to nanobot."))
async def send(self, text: str) -> None: async def send(self, text: str) -> None:
if not self.running or self._master_fd is None: if not self.running or self._writer is None:
await self._bus.publish( await self._bus.publish(
WisperEvent( WisperEvent(
role="system", text="Nanobot TUI is not running. Click spawn first." role="system",
text="Not connected to nanobot. Click spawn first.",
) )
) )
return return
message = text.rstrip("\n") + "\n" payload = json.dumps({"type": "message", "content": text, "chat_id": "web"}) + "\n"
try: try:
os.write(self._master_fd, message.encode()) self._writer.write(payload.encode())
await self._writer.drain()
except OSError as exc: except OSError as exc:
await self._bus.publish( await self._bus.publish(WisperEvent(role="system", text=f"Send failed: {exc}"))
WisperEvent(role="system", text=f"Failed to write to TUI: {exc}") await self._cleanup()
async def send_ui_response(self, request_id: str, value: str) -> None:
"""Forward a ui-response (choice selection) back to nanobot."""
if not self.running or self._writer is None:
return
payload = (
json.dumps(
{"type": "ui-response", "request_id": request_id, "value": value, "chat_id": "web"}
) )
+ "\n"
)
try:
self._writer.write(payload.encode())
await self._writer.drain()
except OSError as exc:
await self._bus.publish(WisperEvent(role="system", text=f"Send failed: {exc}"))
await self._cleanup()
async def send_command(self, command: str) -> None:
"""Send a command (e.g. 'reset') to nanobot."""
if not self.running or self._writer is None:
await self._bus.publish(
WisperEvent(
role="system",
text="Not connected to nanobot. Click spawn first.",
)
)
return
payload = json.dumps({"type": "command", "command": command, "chat_id": "web"}) + "\n"
try:
self._writer.write(payload.encode())
await self._writer.drain()
except OSError as exc:
await self._bus.publish(WisperEvent(role="system", text=f"Send failed: {exc}"))
await self._cleanup()
async def stop(self) -> None: async def stop(self) -> None:
if self._read_task: await self._cleanup()
await self._bus.publish(WisperEvent(role="system", text="Disconnected from nanobot."))
# ------------------------------------------------------------------
# Internal
# ------------------------------------------------------------------
async def _cleanup(self) -> None:
if self._read_task and not self._read_task.done():
self._read_task.cancel() self._read_task.cancel()
with contextlib.suppress(asyncio.CancelledError): try:
await self._read_task await self._read_task
self._read_task = None except asyncio.CancelledError:
if self.running and self._process:
try:
os.killpg(self._process.pid, signal.SIGTERM)
except ProcessLookupError:
pass pass
except Exception: self._read_task = None
self._process.terminate()
try:
self._process.wait(timeout=3)
except Exception:
self._process.kill()
self._process.wait(timeout=1)
if self._master_fd is not None: if self._writer:
try: try:
os.close(self._master_fd) self._writer.close()
await self._writer.wait_closed()
except OSError: except OSError:
pass pass
self._master_fd = None self._writer = None
self._process = None self._reader = None
self._pending_output = ""
self._recent_lines.clear()
self._last_tts_line = ""
await self._bus.publish(WisperEvent(role="system", text="Stopped Nanobot TUI."))
async def _read_output(self) -> None: async def _read_loop(self) -> None:
if self._master_fd is None: """Read newline-delimited JSON from nanobot and publish WisperEvents."""
return assert self._reader is not None
while self.running: try:
if not await self._wait_for_fd_readable(): while True:
break try:
try: line = await self._reader.readline()
chunk = os.read(self._master_fd, 4096) except OSError:
except BlockingIOError:
continue
except OSError:
break
if not chunk:
if not self.running:
break break
await asyncio.sleep(0.01) if not line:
continue break # EOF — nanobot closed the connection
await self._handle_line(line)
text = _clean_output(chunk.decode(errors="ignore"))
if not text.strip():
continue
displayable, tts_publishable, saw_thinking = self._consume_output_chunk(
text
)
if saw_thinking:
await self._bus.publish(
WisperEvent(role="agent-state", text="thinking")
)
if displayable:
await self._bus.publish(WisperEvent(role="nanobot", text=displayable))
if tts_publishable:
await self._bus.publish(
WisperEvent(role="nanobot-tts", text=tts_publishable)
)
trailing_display, trailing_tts, _ = self._consume_output_chunk("\n")
if trailing_display:
await self._bus.publish(WisperEvent(role="nanobot", text=trailing_display))
if trailing_tts:
await self._bus.publish(WisperEvent(role="nanobot-tts", text=trailing_tts))
if self._process is not None:
exit_code = self._process.poll()
await self._bus.publish(
WisperEvent(
role="system", text=f"Nanobot TUI exited (code={exit_code})."
)
)
def _consume_output_chunk(self, text: str) -> tuple[str, str, bool]:
    """Return (displayable, tts_publishable, saw_thinking)."""
    self._pending_output += text
    # Everything before the final newline is complete; the remainder stays
    # buffered until the next chunk arrives.
    *complete, self._pending_output = self._pending_output.split("\n")
    if len(self._pending_output) > 1024:
        # Flush an over-long partial line rather than buffering it forever.
        complete.append(self._pending_output)
        self._pending_output = ""
    displayable: list[str] = []
    speakable: list[str] = []
    saw_thinking = False
    for raw in complete:
        line = self._normalize_line(raw)
        if not line:
            continue
        if self._suppress_noisy_ui and self._is_noisy_ui_line(line):
            # Thinking indicators are filtered from display but still signaled.
            if THINKING_LINE_RE.search(LEADING_NON_WORD_RE.sub("", line)):
                saw_thinking = True
            continue
        if line != self._last_tts_line:
            speakable.append(line)
            self._last_tts_line = line
        if not self._is_recent_duplicate(line):
            displayable.append(line)
    return "\n".join(displayable).strip(), "\n".join(speakable).strip(), saw_thinking
def _normalize_line(self, line: str) -> str:
    """Collapse whitespace, drop emoji, and strip the echoed agent prefix."""
    collapsed = WHITESPACE_RE.sub(" ", EMOJI_RE.sub(" ", line)).strip()
    # Remove the leading "nanobot:" prefix the TUI echoes in its own output:
    # the frontend labels each line with the role already, and TTS should not
    # read the agent's own name aloud.
    return AGENT_OUTPUT_PREFIX_RE.sub("", collapsed)
def _is_noisy_ui_line(self, line: str) -> bool:
    """Report whether *line* is TUI chrome that should be filtered out."""
    # Pure spinner glyphs or box-drawing decoration are never content.
    if SPINNER_ONLY_RE.fullmatch(line) or BOX_DRAWING_ONLY_RE.fullmatch(line):
        return True
    stripped = LEADING_NON_WORD_RE.sub("", line)
    return bool(
        THINKING_LINE_RE.search(stripped)
        or TOOL_STREAM_LINE_RE.match(stripped)
        or USER_ECHO_LINE_RE.match(stripped)
    )
async def _wait_for_fd_readable(self) -> bool:
if self._master_fd is None:
return False
loop = asyncio.get_running_loop()
ready: asyncio.Future[None] = loop.create_future()
def _mark_ready() -> None:
if not ready.done():
ready.set_result(None)
try:
loop.add_reader(self._master_fd, _mark_ready)
except (AttributeError, NotImplementedError, OSError, ValueError):
await asyncio.sleep(0.01)
return True
try:
await ready
return True
finally: finally:
with contextlib.suppress(Exception): await self._bus.publish(
loop.remove_reader(self._master_fd) WisperEvent(role="system", text="Nanobot closed the connection.")
)
# Clear writer so running → False
self._writer = None
self._reader = None
def _is_recent_duplicate(self, line: str) -> bool: async def _handle_line(self, line: bytes) -> None:
now = time.monotonic() raw = line.decode(errors="replace").strip()
normalized = line.lower() if not raw:
return
try:
obj = json.loads(raw)
except json.JSONDecodeError:
await self._bus.publish(
WisperEvent(role="system", text=f"Malformed response from nanobot: {raw[:200]}")
)
return
while ( msg_type = str(obj.get("type", ""))
self._recent_lines
and (now - self._recent_lines[0][1]) > self._dedup_window_s
):
self._recent_lines.popleft()
for previous, _timestamp in self._recent_lines: if msg_type == "message":
if previous == normalized: content = str(obj.get("content", ""))
return True is_progress = bool(obj.get("is_progress", False))
if is_progress:
# Intermediate tool-call hint — show in UI, skip TTS
await self._bus.publish(WisperEvent(role="nanobot-progress", text=content))
else:
# Final answer — display + TTS
await self._bus.publish(WisperEvent(role="nanobot", text=content))
await self._bus.publish(WisperEvent(role="nanobot-tts", text=content))
self._recent_lines.append((normalized, now)) elif msg_type == "agent_state":
return False state = str(obj.get("state", ""))
await self._bus.publish(WisperEvent(role="agent-state", text=state))
elif msg_type == "toast":
# Forward the full toast payload as JSON so the frontend can render it.
await self._bus.publish(WisperEvent(role="toast", text=json.dumps(obj)))
elif msg_type == "choice":
# Forward the full choice payload as JSON so the frontend can render it.
await self._bus.publish(WisperEvent(role="choice", text=json.dumps(obj)))
elif msg_type == "pong":
pass # keepalive, ignore
elif msg_type == "error":
await self._bus.publish(
WisperEvent(role="system", text=f"Nanobot error: {obj.get('error', '')}")
)
# ---------------------------------------------------------------------------
# SuperTonicGateway — public interface (unchanged from original)
# ---------------------------------------------------------------------------
class SuperTonicGateway: class SuperTonicGateway:
def __init__(self) -> None: def __init__(self) -> None:
self.bus = WisperBus() self.bus = WisperBus()
self._lock = asyncio.Lock() self._lock = asyncio.Lock()
self._tui: NanobotTUIProcess | None = None self._process: NanobotApiProcess | None = None
socket_path = Path(os.getenv("NANOBOT_API_SOCKET", str(DEFAULT_SOCKET_PATH))).expanduser()
self._socket_path = socket_path
async def subscribe(self) -> asyncio.Queue[WisperEvent]: async def subscribe(self) -> asyncio.Queue[WisperEvent]:
return await self.bus.subscribe() return await self.bus.subscribe()
@ -445,18 +273,15 @@ class SuperTonicGateway:
await self.bus.unsubscribe(queue) await self.bus.unsubscribe(queue)
async def spawn_tui(self) -> None: async def spawn_tui(self) -> None:
"""Connect to nanobot (name kept for API compatibility with app.py)."""
async with self._lock: async with self._lock:
if self._tui and self._tui.running: if self._process and self._process.running:
await self.bus.publish( await self.bus.publish(
WisperEvent(role="system", text="Nanobot TUI is already running.") WisperEvent(role="system", text="Already connected to nanobot.")
) )
return return
self._process = NanobotApiProcess(bus=self.bus, socket_path=self._socket_path)
command, workdir = _resolve_nanobot_command_and_workdir() await self._process.start()
self._tui = NanobotTUIProcess(
bus=self.bus, command=command, workdir=workdir
)
await self._tui.start()
async def send_user_message(self, text: str) -> None: async def send_user_message(self, text: str) -> None:
message = text.strip() message = text.strip()
@ -464,20 +289,34 @@ class SuperTonicGateway:
return return
await self.bus.publish(WisperEvent(role="user", text=message)) await self.bus.publish(WisperEvent(role="user", text=message))
async with self._lock: async with self._lock:
if not self._tui: if not self._process:
await self.bus.publish( await self.bus.publish(
WisperEvent( WisperEvent(
role="system", role="system",
text="Nanobot TUI is not running. Click spawn first.", text="Not connected to nanobot. Click spawn first.",
) )
) )
return return
await self._tui.send(message) await self._process.send(message)
async def send_ui_response(self, request_id: str, value: str) -> None:
"""Forward a choice selection back to nanobot."""
async with self._lock:
if self._process:
await self._process.send_ui_response(request_id, value)
async def send_command(self, command: str) -> None:
"""Send a command (e.g. 'reset') to nanobot."""
async with self._lock:
if self._process:
await self._process.send_command(command)
async def stop_tui(self) -> None: async def stop_tui(self) -> None:
"""Disconnect from nanobot (name kept for API compatibility with app.py)."""
async with self._lock: async with self._lock:
if self._tui: if self._process:
await self._tui.stop() await self._process.stop()
self._process = None
async def shutdown(self) -> None: async def shutdown(self) -> None:
await self.stop_tui() await self.stop_tui()

View file

@ -41,9 +41,7 @@ try:
from faster_whisper import WhisperModel from faster_whisper import WhisperModel
FASTER_WHISPER_AVAILABLE = True FASTER_WHISPER_AVAILABLE = True
except ( except Exception: # pragma: no cover - runtime fallback when faster-whisper is unavailable
Exception
): # pragma: no cover - runtime fallback when faster-whisper is unavailable
WhisperModel = None # type: ignore[assignment] WhisperModel = None # type: ignore[assignment]
FASTER_WHISPER_AVAILABLE = False FASTER_WHISPER_AVAILABLE = False
@ -82,10 +80,7 @@ ANSI_ESCAPE_RE = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")
CONTROL_CHAR_RE = re.compile(r"[\x00-\x1f\x7f]") CONTROL_CHAR_RE = re.compile(r"[\x00-\x1f\x7f]")
BRAILLE_SPINNER_RE = re.compile(r"[\u2800-\u28ff]") BRAILLE_SPINNER_RE = re.compile(r"[\u2800-\u28ff]")
TTS_ALLOWED_ASCII = set( TTS_ALLOWED_ASCII = set(
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?;:'\"()[]{}@#%&*+-_/<>|"
"abcdefghijklmnopqrstuvwxyz"
"0123456789"
" .,!?;:'\"()[]{}@#%&*+-_/<>|"
) )
@ -95,9 +90,7 @@ def _sanitize_tts_text(text: str) -> str:
cleaned = cleaned.replace("\u00a0", " ") cleaned = cleaned.replace("\u00a0", " ")
cleaned = cleaned.replace("", " ") cleaned = cleaned.replace("", " ")
cleaned = CONTROL_CHAR_RE.sub(" ", cleaned) cleaned = CONTROL_CHAR_RE.sub(" ", cleaned)
cleaned = "".join( cleaned = "".join(ch if (ch in TTS_ALLOWED_ASCII or ch.isspace()) else " " for ch in cleaned)
ch if (ch in TTS_ALLOWED_ASCII or ch.isspace()) else " " for ch in cleaned
)
cleaned = re.sub(r"\s+", " ", cleaned).strip() cleaned = re.sub(r"\s+", " ", cleaned).strip()
return cleaned return cleaned
@ -131,15 +124,9 @@ if AIORTC_AVAILABLE:
self._timestamp = 0 self._timestamp = 0
self._resample_state = None self._resample_state = None
self._resample_source_rate: int | None = None self._resample_source_rate: int | None = None
self._lead_in_ms = max( self._lead_in_ms = max(0, int(os.getenv("HOST_RTC_OUTBOUND_LEAD_IN_MS", "120")))
0, int(os.getenv("HOST_RTC_OUTBOUND_LEAD_IN_MS", "120")) self._lead_in_frames = (self._lead_in_ms + self._frame_ms - 1) // self._frame_ms
) self._lead_in_idle_s = max(0.1, float(os.getenv("HOST_RTC_OUTBOUND_IDLE_S", "0.6")))
self._lead_in_frames = (
self._lead_in_ms + self._frame_ms - 1
) // self._frame_ms
self._lead_in_idle_s = max(
0.1, float(os.getenv("HOST_RTC_OUTBOUND_IDLE_S", "0.6"))
)
self._last_enqueue_at = 0.0 self._last_enqueue_at = 0.0
self._closed = False self._closed = False
self._frame_duration_s = frame_ms / 1000.0 self._frame_duration_s = frame_ms / 1000.0
@ -154,9 +141,7 @@ if AIORTC_AVAILABLE:
) )
self._on_playing_changed: Callable[[bool], None] | None = None self._on_playing_changed: Callable[[bool], None] | None = None
async def enqueue_pcm( async def enqueue_pcm(self, pcm: bytes, sample_rate: int, channels: int = 1) -> None:
self, pcm: bytes, sample_rate: int, channels: int = 1
) -> None:
if self._closed or not pcm: if self._closed or not pcm:
return return
@ -244,9 +229,7 @@ if AIORTC_AVAILABLE:
self._last_recv_at = loop.time() self._last_recv_at = loop.time()
frame = AudioFrame( frame = AudioFrame(format="s16", layout="mono", samples=self._samples_per_frame)
format="s16", layout="mono", samples=self._samples_per_frame
)
frame.planes[0].update(payload) frame.planes[0].update(payload)
frame.sample_rate = self._sample_rate frame.sample_rate = self._sample_rate
frame.time_base = Fraction(1, self._sample_rate) frame.time_base = Fraction(1, self._sample_rate)
@ -263,9 +246,7 @@ else:
class QueueAudioTrack: # pragma: no cover - used only when aiortc is unavailable class QueueAudioTrack: # pragma: no cover - used only when aiortc is unavailable
_on_playing_changed: Callable[[bool], None] | None = None _on_playing_changed: Callable[[bool], None] | None = None
async def enqueue_pcm( async def enqueue_pcm(self, pcm: bytes, sample_rate: int, channels: int = 1) -> None:
self, pcm: bytes, sample_rate: int, channels: int = 1
) -> None:
return return
def stop(self) -> None: def stop(self) -> None:
@ -296,23 +277,17 @@ class CommandSpeechToText:
) -> str | None: ) -> str | None:
if not self.enabled or not pcm: if not self.enabled or not pcm:
return None return None
return await asyncio.to_thread( return await asyncio.to_thread(self._transcribe_blocking, pcm, sample_rate, channels)
self._transcribe_blocking, pcm, sample_rate, channels
)
def unavailable_reason(self) -> str: def unavailable_reason(self) -> str:
if not self._command_template: if not self._command_template:
return "HOST_STT_COMMAND is not configured." return "HOST_STT_COMMAND is not configured."
return "HOST_STT_COMMAND failed to produce transcript." return "HOST_STT_COMMAND failed to produce transcript."
def _transcribe_blocking( def _transcribe_blocking(self, pcm: bytes, sample_rate: int, channels: int) -> str | None:
self, pcm: bytes, sample_rate: int, channels: int
) -> str | None:
tmp_path: str | None = None tmp_path: str | None = None
try: try:
tmp_path = _write_temp_wav( tmp_path = _write_temp_wav(pcm=pcm, sample_rate=sample_rate, channels=channels)
pcm=pcm, sample_rate=sample_rate, channels=channels
)
command = self._command_template command = self._command_template
if "{input_wav}" in command: if "{input_wav}" in command:
@ -343,9 +318,7 @@ class FasterWhisperSpeechToText:
def __init__(self) -> None: def __init__(self) -> None:
self._model_name = os.getenv("HOST_STT_MODEL", "tiny.en").strip() or "tiny.en" self._model_name = os.getenv("HOST_STT_MODEL", "tiny.en").strip() or "tiny.en"
self._device = os.getenv("HOST_STT_DEVICE", "auto").strip() or "auto" self._device = os.getenv("HOST_STT_DEVICE", "auto").strip() or "auto"
self._compute_type = ( self._compute_type = os.getenv("HOST_STT_COMPUTE_TYPE", "int8").strip() or "int8"
os.getenv("HOST_STT_COMPUTE_TYPE", "int8").strip() or "int8"
)
self._language = os.getenv("HOST_STT_LANGUAGE", "en").strip() self._language = os.getenv("HOST_STT_LANGUAGE", "en").strip()
self._beam_size = max(1, int(os.getenv("HOST_STT_BEAM_SIZE", "1"))) self._beam_size = max(1, int(os.getenv("HOST_STT_BEAM_SIZE", "1")))
self._best_of = max(1, int(os.getenv("HOST_STT_BEST_OF", "1"))) self._best_of = max(1, int(os.getenv("HOST_STT_BEST_OF", "1")))
@ -357,12 +330,8 @@ class FasterWhisperSpeechToText:
"off", "off",
} }
self._temperature = float(os.getenv("HOST_STT_TEMPERATURE", "0.0")) self._temperature = float(os.getenv("HOST_STT_TEMPERATURE", "0.0"))
self._log_prob_threshold = float( self._log_prob_threshold = float(os.getenv("HOST_STT_LOG_PROB_THRESHOLD", "-1.0"))
os.getenv("HOST_STT_LOG_PROB_THRESHOLD", "-1.0") self._no_speech_threshold = float(os.getenv("HOST_STT_NO_SPEECH_THRESHOLD", "0.6"))
)
self._no_speech_threshold = float(
os.getenv("HOST_STT_NO_SPEECH_THRESHOLD", "0.6")
)
self._compression_ratio_threshold = float( self._compression_ratio_threshold = float(
os.getenv("HOST_STT_COMPRESSION_RATIO_THRESHOLD", "2.4") os.getenv("HOST_STT_COMPRESSION_RATIO_THRESHOLD", "2.4")
) )
@ -373,9 +342,7 @@ class FasterWhisperSpeechToText:
).strip() ).strip()
or None or None
) )
self._repetition_penalty = float( self._repetition_penalty = float(os.getenv("HOST_STT_REPETITION_PENALTY", "1.0"))
os.getenv("HOST_STT_REPETITION_PENALTY", "1.0")
)
raw_hallucination_threshold = os.getenv( raw_hallucination_threshold = os.getenv(
"HOST_STT_HALLUCINATION_SILENCE_THRESHOLD", "" "HOST_STT_HALLUCINATION_SILENCE_THRESHOLD", ""
).strip() ).strip()
@ -401,9 +368,7 @@ class FasterWhisperSpeechToText:
if not self.enabled or not pcm: if not self.enabled or not pcm:
return None return None
async with self._lock: async with self._lock:
return await asyncio.to_thread( return await asyncio.to_thread(self._transcribe_blocking, pcm, sample_rate, channels)
self._transcribe_blocking, pcm, sample_rate, channels
)
async def warmup(self) -> None: async def warmup(self) -> None:
if not self.enabled: if not self.enabled:
@ -428,15 +393,11 @@ class FasterWhisperSpeechToText:
self._init_error = str(exc) self._init_error = str(exc)
self._model = None self._model = None
def _transcribe_blocking( def _transcribe_blocking(self, pcm: bytes, sample_rate: int, channels: int) -> str | None:
self, pcm: bytes, sample_rate: int, channels: int
) -> str | None:
self._initialize_blocking() self._initialize_blocking()
if self._model is None: if self._model is None:
if self._init_error: if self._init_error:
raise RuntimeError( raise RuntimeError(f"faster-whisper initialization failed: {self._init_error}")
f"faster-whisper initialization failed: {self._init_error}"
)
return None return None
if NUMPY_AVAILABLE and np is not None: if NUMPY_AVAILABLE and np is not None:
@ -481,9 +442,7 @@ class FasterWhisperSpeechToText:
tmp_path: str | None = None tmp_path: str | None = None
try: try:
tmp_path = _write_temp_wav( tmp_path = _write_temp_wav(pcm=pcm, sample_rate=sample_rate, channels=channels)
pcm=pcm, sample_rate=sample_rate, channels=channels
)
segments, _info = self._model.transcribe( segments, _info = self._model.transcribe(
tmp_path, tmp_path,
language=self._language or None, language=self._language or None,
@ -580,20 +539,14 @@ class HostSpeechToText:
class SupertonicTextToSpeech: class SupertonicTextToSpeech:
def __init__(self) -> None: def __init__(self) -> None:
self._model = ( self._model = os.getenv("SUPERTONIC_MODEL", "supertonic-2").strip() or "supertonic-2"
os.getenv("SUPERTONIC_MODEL", "supertonic-2").strip() or "supertonic-2" self._voice_style_name = os.getenv("SUPERTONIC_VOICE_STYLE", "F1").strip() or "F1"
)
self._voice_style_name = (
os.getenv("SUPERTONIC_VOICE_STYLE", "F1").strip() or "F1"
)
self._lang = os.getenv("SUPERTONIC_LANG", "en").strip() or "en" self._lang = os.getenv("SUPERTONIC_LANG", "en").strip() or "en"
self._total_steps = int(os.getenv("SUPERTONIC_TOTAL_STEPS", "4")) self._total_steps = int(os.getenv("SUPERTONIC_TOTAL_STEPS", "4"))
self._speed = float(os.getenv("SUPERTONIC_SPEED", "1.5")) self._speed = float(os.getenv("SUPERTONIC_SPEED", "1.5"))
self._intra_op_num_threads = _optional_int_env("SUPERTONIC_INTRA_OP_THREADS") self._intra_op_num_threads = _optional_int_env("SUPERTONIC_INTRA_OP_THREADS")
self._inter_op_num_threads = _optional_int_env("SUPERTONIC_INTER_OP_THREADS") self._inter_op_num_threads = _optional_int_env("SUPERTONIC_INTER_OP_THREADS")
self._auto_download = os.getenv( self._auto_download = os.getenv("SUPERTONIC_AUTO_DOWNLOAD", "1").strip() not in {
"SUPERTONIC_AUTO_DOWNLOAD", "1"
).strip() not in {
"0", "0",
"false", "false",
"False", "False",
@ -608,9 +561,7 @@ class SupertonicTextToSpeech:
@property @property
def enabled(self) -> bool: def enabled(self) -> bool:
return ( return SUPERTONIC_TTS_AVAILABLE and SupertonicTTS is not None and NUMPY_AVAILABLE
SUPERTONIC_TTS_AVAILABLE and SupertonicTTS is not None and NUMPY_AVAILABLE
)
@property @property
def init_error(self) -> str | None: def init_error(self) -> str | None:
@ -723,9 +674,7 @@ class SupertonicTextToSpeech:
class HostTextToSpeech: class HostTextToSpeech:
def __init__(self) -> None: def __init__(self) -> None:
provider = ( provider = (os.getenv("HOST_TTS_PROVIDER", "supertonic").strip() or "supertonic").lower()
os.getenv("HOST_TTS_PROVIDER", "supertonic").strip() or "supertonic"
).lower()
if provider not in {"supertonic", "command", "espeak", "auto"}: if provider not in {"supertonic", "command", "espeak", "auto"}:
provider = "auto" provider = "auto"
self._provider = provider self._provider = provider
@ -770,9 +719,7 @@ class HostTextToSpeech:
if not self._supertonic.enabled: if not self._supertonic.enabled:
return "supertonic package is not available." return "supertonic package is not available."
if self._supertonic.init_error: if self._supertonic.init_error:
return ( return f"supertonic initialization failed: {self._supertonic.init_error}"
f"supertonic initialization failed: {self._supertonic.init_error}"
)
return "supertonic did not return audio." return "supertonic did not return audio."
if self._provider == "command": if self._provider == "command":
return "HOST_TTS_COMMAND is not configured." return "HOST_TTS_COMMAND is not configured."
@ -797,13 +744,9 @@ class HostTextToSpeech:
if "{output_wav}" in command: if "{output_wav}" in command:
tmp_path: str | None = None tmp_path: str | None = None
try: try:
with tempfile.NamedTemporaryFile( with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
suffix=".wav", delete=False
) as tmp_file:
tmp_path = tmp_file.name tmp_path = tmp_file.name
command_with_output = command.replace( command_with_output = command.replace("{output_wav}", shlex.quote(tmp_path))
"{output_wav}", shlex.quote(tmp_path)
)
result = subprocess.run( result = subprocess.run(
command_with_output, command_with_output,
shell=True, shell=True,
@ -872,9 +815,7 @@ SendJsonCallable = Callable[[dict[str, Any]], Awaitable[None]]
class WebRTCVoiceSession: class WebRTCVoiceSession:
def __init__( def __init__(self, gateway: "SuperTonicGateway", send_json: SendJsonCallable) -> None:
self, gateway: "SuperTonicGateway", send_json: SendJsonCallable
) -> None:
self._gateway = gateway self._gateway = gateway
self._send_json = send_json self._send_json = send_json
@ -886,9 +827,7 @@ class WebRTCVoiceSession:
self._stt = HostSpeechToText() self._stt = HostSpeechToText()
self._tts = HostTextToSpeech() self._tts = HostTextToSpeech()
self._stt_segment_queue_size = max( self._stt_segment_queue_size = max(1, int(os.getenv("HOST_STT_SEGMENT_QUEUE_SIZE", "2")))
1, int(os.getenv("HOST_STT_SEGMENT_QUEUE_SIZE", "2"))
)
self._stt_segments: asyncio.Queue[bytes] = asyncio.Queue( self._stt_segments: asyncio.Queue[bytes] = asyncio.Queue(
maxsize=self._stt_segment_queue_size maxsize=self._stt_segment_queue_size
) )
@ -913,11 +852,7 @@ class WebRTCVoiceSession:
self._stt_min_ptt_ms = max( self._stt_min_ptt_ms = max(
120, 120,
int( int(os.getenv("HOST_STT_MIN_PTT_MS", os.getenv("HOST_STT_MIN_SEGMENT_MS", "220"))),
os.getenv(
"HOST_STT_MIN_PTT_MS", os.getenv("HOST_STT_MIN_SEGMENT_MS", "220")
)
),
) )
self._stt_suppress_during_tts = os.getenv( self._stt_suppress_during_tts = os.getenv(
@ -973,9 +908,7 @@ class WebRTCVoiceSession:
sdp = str(payload.get("sdp", "")).strip() sdp = str(payload.get("sdp", "")).strip()
rtc_type = str(payload.get("rtcType", "offer")).strip() or "offer" rtc_type = str(payload.get("rtcType", "offer")).strip() or "offer"
if not sdp: if not sdp:
await self._send_json( await self._send_json({"type": "rtc-error", "message": "Missing SDP offer payload."})
{"type": "rtc-error", "message": "Missing SDP offer payload."}
)
return return
await self._close_peer_connection() await self._close_peer_connection()
@ -1009,9 +942,7 @@ class WebRTCVoiceSession:
name="voice-inbound-track", name="voice-inbound-track",
) )
await peer_connection.setRemoteDescription( await peer_connection.setRemoteDescription(RTCSessionDescription(sdp=sdp, type=rtc_type))
RTCSessionDescription(sdp=sdp, type=rtc_type)
)
await self._drain_pending_ice_candidates(peer_connection) await self._drain_pending_ice_candidates(peer_connection)
answer = await peer_connection.createAnswer() answer = await peer_connection.createAnswer()
await peer_connection.setLocalDescription(answer) await peer_connection.setLocalDescription(answer)
@ -1021,10 +952,7 @@ class WebRTCVoiceSession:
sdp_answer = str(local_description.sdp or "") sdp_answer = str(local_description.sdp or "")
if sdp_answer: if sdp_answer:
sdp_answer = ( sdp_answer = (
sdp_answer.replace("\r\n", "\n") sdp_answer.replace("\r\n", "\n").replace("\r", "\n").strip().replace("\n", "\r\n")
.replace("\r", "\n")
.strip()
.replace("\n", "\r\n")
+ "\r\n" + "\r\n"
) )
await self._send_json( await self._send_json(
@ -1036,15 +964,9 @@ class WebRTCVoiceSession:
) )
if self._stt.enabled and not self._stt_worker_task: if self._stt.enabled and not self._stt_worker_task:
self._stt_worker_task = asyncio.create_task( self._stt_worker_task = asyncio.create_task(self._stt_worker(), name="voice-stt-worker")
self._stt_worker(), name="voice-stt-worker" if self._stt.enabled and (self._stt_warmup_task is None or self._stt_warmup_task.done()):
) self._stt_warmup_task = asyncio.create_task(self._warmup_stt(), name="voice-stt-warmup")
if self._stt.enabled and (
self._stt_warmup_task is None or self._stt_warmup_task.done()
):
self._stt_warmup_task = asyncio.create_task(
self._warmup_stt(), name="voice-stt-warmup"
)
elif not self._stt.enabled and not self._stt_unavailable_notice_sent: elif not self._stt.enabled and not self._stt_unavailable_notice_sent:
self._stt_unavailable_notice_sent = True self._stt_unavailable_notice_sent = True
await self._publish_system( await self._publish_system(
@ -1103,9 +1025,7 @@ class WebRTCVoiceSession:
candidate = candidate_from_sdp(candidate_sdp) candidate = candidate_from_sdp(candidate_sdp)
candidate.sdpMid = raw_candidate.get("sdpMid") candidate.sdpMid = raw_candidate.get("sdpMid")
line_index = raw_candidate.get("sdpMLineIndex") line_index = raw_candidate.get("sdpMLineIndex")
candidate.sdpMLineIndex = ( candidate.sdpMLineIndex = int(line_index) if line_index is not None else None
int(line_index) if line_index is not None else None
)
await peer_connection.addIceCandidate(candidate) await peer_connection.addIceCandidate(candidate)
except Exception as exc: except Exception as exc:
await self._publish_system(f"Failed to add ICE candidate: {exc}") await self._publish_system(f"Failed to add ICE candidate: {exc}")
@ -1147,9 +1067,7 @@ class WebRTCVoiceSession:
if self._tts_flush_handle: if self._tts_flush_handle:
self._tts_flush_handle.cancel() self._tts_flush_handle.cancel()
loop = asyncio.get_running_loop() loop = asyncio.get_running_loop()
self._tts_flush_handle = loop.call_later( self._tts_flush_handle = loop.call_later(max(0.05, delay_s), self._schedule_tts_flush)
max(0.05, delay_s), self._schedule_tts_flush
)
async def _flush_tts(self) -> None: async def _flush_tts(self) -> None:
async with self._tts_flush_lock: async with self._tts_flush_lock:
@ -1230,9 +1148,7 @@ class WebRTCVoiceSession:
try: try:
while True: while True:
frame = await track.recv() frame = await track.recv()
pcm16, frame_ms, resample_state = self._frame_to_pcm16k_mono( pcm16, frame_ms, resample_state = self._frame_to_pcm16k_mono(frame, resample_state)
frame, resample_state
)
if not pcm16: if not pcm16:
continue continue
@ -1249,10 +1165,9 @@ class WebRTCVoiceSession:
f"time_base={getattr(frame, 'time_base', None)}." f"time_base={getattr(frame, 'time_base', None)}."
) )
if ( loop = asyncio.get_running_loop()
self._stt_suppress_during_tts
and asyncio.get_running_loop().time() < self._stt_suppress_until if self._stt_suppress_during_tts and loop.time() < self._stt_suppress_until:
):
recording = False recording = False
recording_started_at = 0.0 recording_started_at = 0.0
segment_ms = 0.0 segment_ms = 0.0
@ -1262,7 +1177,7 @@ class WebRTCVoiceSession:
if self._ptt_pressed: if self._ptt_pressed:
if not recording: if not recording:
recording = True recording = True
recording_started_at = asyncio.get_running_loop().time() recording_started_at = loop.time()
segment_ms = 0.0 segment_ms = 0.0
segment_buffer = bytearray() segment_buffer = bytearray()
@ -1273,8 +1188,7 @@ class WebRTCVoiceSession:
if recording: if recording:
observed_duration_ms = max( observed_duration_ms = max(
1.0, 1.0,
(asyncio.get_running_loop().time() - recording_started_at) (loop.time() - recording_started_at) * 1000.0,
* 1000.0,
) )
await self._finalize_ptt_segment( await self._finalize_ptt_segment(
bytes(segment_buffer), bytes(segment_buffer),
@ -1285,6 +1199,7 @@ class WebRTCVoiceSession:
recording_started_at = 0.0 recording_started_at = 0.0
segment_ms = 0.0 segment_ms = 0.0
segment_buffer = bytearray() segment_buffer = bytearray()
except asyncio.CancelledError: except asyncio.CancelledError:
raise raise
except Exception as exc: except Exception as exc:
@ -1294,9 +1209,7 @@ class WebRTCVoiceSession:
f"Voice input stream ended ({exc.__class__.__name__}): {details}" f"Voice input stream ended ({exc.__class__.__name__}): {details}"
) )
else: else:
await self._publish_system( await self._publish_system(f"Voice input stream ended ({exc.__class__.__name__}).")
f"Voice input stream ended ({exc.__class__.__name__})."
)
finally: finally:
if recording and segment_ms >= self._stt_min_ptt_ms: if recording and segment_ms >= self._stt_min_ptt_ms:
observed_duration_ms = max( observed_duration_ms = max(
@ -1355,9 +1268,7 @@ class WebRTCVoiceSession:
f"(estimated source={nearest_source_rate}Hz)." f"(estimated source={nearest_source_rate}Hz)."
) )
await self._enqueue_stt_segment( await self._enqueue_stt_segment(pcm16=normalized_pcm, duration_ms=normalized_duration_ms)
pcm16=normalized_pcm, duration_ms=normalized_duration_ms
)
async def _enqueue_stt_segment(self, pcm16: bytes, duration_ms: float) -> None: async def _enqueue_stt_segment(self, pcm16: bytes, duration_ms: float) -> None:
if duration_ms < self._stt_min_ptt_ms: if duration_ms < self._stt_min_ptt_ms:
@ -1368,13 +1279,9 @@ class WebRTCVoiceSession:
self._stt_segments.get_nowait() self._stt_segments.get_nowait()
now = asyncio.get_running_loop().time() now = asyncio.get_running_loop().time()
if ( if (now - self._last_stt_backlog_notice_at) >= self._stt_backlog_notice_interval_s:
now - self._last_stt_backlog_notice_at
) >= self._stt_backlog_notice_interval_s:
self._last_stt_backlog_notice_at = now self._last_stt_backlog_notice_at = now
await self._publish_system( await self._publish_system("Voice input backlog detected; dropping stale segment.")
"Voice input backlog detected; dropping stale segment."
)
with contextlib.suppress(asyncio.QueueFull): with contextlib.suppress(asyncio.QueueFull):
self._stt_segments.put_nowait(pcm16) self._stt_segments.put_nowait(pcm16)
@ -1384,9 +1291,7 @@ class WebRTCVoiceSession:
pcm16 = await self._stt_segments.get() pcm16 = await self._stt_segments.get()
if not self._stt_first_segment_notice_sent: if not self._stt_first_segment_notice_sent:
self._stt_first_segment_notice_sent = True self._stt_first_segment_notice_sent = True
await self._publish_system( await self._publish_system("Push-to-talk audio captured. Running host STT...")
"Push-to-talk audio captured. Running host STT..."
)
try: try:
transcript = await self._stt.transcribe_pcm( transcript = await self._stt.transcribe_pcm(
pcm=pcm16, pcm=pcm16,
@ -1478,11 +1383,7 @@ class WebRTCVoiceSession:
except TypeError: except TypeError:
pcm = frame.to_ndarray() pcm = frame.to_ndarray()
if ( if NUMPY_AVAILABLE and np is not None and getattr(pcm, "dtype", None) is not None:
NUMPY_AVAILABLE
and np is not None
and getattr(pcm, "dtype", None) is not None
):
if pcm.dtype != np.int16: if pcm.dtype != np.int16:
if np.issubdtype(pcm.dtype, np.floating): if np.issubdtype(pcm.dtype, np.floating):
pcm = np.clip(pcm, -1.0, 1.0) pcm = np.clip(pcm, -1.0, 1.0)
@ -1521,9 +1422,7 @@ class WebRTCVoiceSession:
else: else:
frames_channels = pcm.reshape(-1, 1) frames_channels = pcm.reshape(-1, 1)
channel_count = ( channel_count = int(frames_channels.shape[1]) if frames_channels.ndim == 2 else 1
int(frames_channels.shape[1]) if frames_channels.ndim == 2 else 1
)
if channel_count <= 1: if channel_count <= 1:
mono = frames_channels.reshape(-1).tobytes() mono = frames_channels.reshape(-1).tobytes()
elif NUMPY_AVAILABLE and np is not None: elif NUMPY_AVAILABLE and np is not None:
@ -1537,9 +1436,7 @@ class WebRTCVoiceSession:
else: else:
return b"", 0.0, resample_state return b"", 0.0, resample_state
source_rate = int( source_rate = int(getattr(frame, "sample_rate", 0) or getattr(frame, "rate", 0) or 0)
getattr(frame, "sample_rate", 0) or getattr(frame, "rate", 0) or 0
)
time_base = getattr(frame, "time_base", None) time_base = getattr(frame, "time_base", None)
tb_rate = 0 tb_rate = 0