nanobot-voice-interface/static/index.html
2026-03-04 08:20:42 -05:00

928 lines
34 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!doctype html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<!-- user-scalable=no: the whole page acts as a push-to-talk surface -->
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=no" />
<title>Nanobot</title>
<style>
/* Disable selection everywhere; #log re-enables it for transcript text. */
* {
  box-sizing: border-box;
  user-select: none;
  -webkit-user-select: none;
}
html, body {
  margin: 0;
  padding: 0;
  width: 100%;
  height: 100%;
  overflow: hidden;
  background: #1a1510;
  touch-action: none; /* suppress scroll/zoom gestures (PTT surface) */
}
/* Transcript overlay: faded and masked until hovered; newest at bottom. */
#log {
  position: fixed;
  bottom: calc(5vh + 20px);
  left: 50%;
  transform: translateX(-50%);
  width: calc(90vw - 40px);
  max-height: 22vh;
  overflow-y: auto;
  padding: 12px 14px;
  font-family: "SF Mono", ui-monospace, Menlo, Consolas, monospace;
  font-size: 12px;
  line-height: 1.6;
  color: rgba(255, 245, 235, 0.35);
  white-space: pre-wrap;
  word-break: break-word;
  display: flex;
  flex-direction: column-reverse; /* keeps scroll pinned to newest lines */
  border-radius: 10px;
  background: transparent;
  transition: color 0.3s, background 0.3s;
  z-index: 10;
  pointer-events: auto;
  /* Fade the top edge of the log into the page background. */
  -webkit-mask-image: linear-gradient(to top, black 55%, transparent 100%);
  mask-image: linear-gradient(to top, black 55%, transparent 100%);
}
/* On hover: full opacity, subtle backdrop, no fade mask. */
#log:hover {
  color: rgba(255, 245, 235, 0.92);
  background: rgba(0, 0, 0, 0.18);
  -webkit-mask-image: none;
  mask-image: none;
}
/* Allow copying transcript text despite the global user-select: none. */
#log * {
  user-select: text;
  -webkit-user-select: text;
}
#log-inner {
  display: flex;
  flex-direction: column;
}
/* Per-role transcript line tints (role name becomes the class). */
.line {
  margin-bottom: 4px;
}
.line.user {
  color: rgba(255, 255, 255, 0.9);
}
.line.system {
  color: rgba(255, 220, 180, 0.5);
}
.line.wisper {
  color: rgba(255, 200, 160, 0.4);
}
#log:hover .line.user { color: rgba(255, 255, 255, 1.0); }
#log:hover .line.system { color: rgba(255, 220, 180, 0.85); }
#log:hover .line.wisper { color: rgba(255, 200, 160, 0.75); }
/* Small status pill at the bottom, toggled via the .visible class from JS. */
#voiceStatus {
  position: fixed;
  bottom: 12px;
  left: 50%;
  transform: translateX(-50%);
  background: rgba(0, 0, 0, 0.08);
  color: #111111;
  font-family: "SF Mono", ui-monospace, Menlo, Consolas, monospace;
  font-size: 12px;
  padding: 4px 12px;
  border-radius: 99px;
  pointer-events: none;
  white-space: nowrap;
  opacity: 0;
  transition: opacity 0.2s;
}
#voiceStatus.visible {
  opacity: 1;
}
/* Agent state indicator */
#agentIndicator {
  position: fixed;
  top: 0;
  left: 0;
  right: 0;
  height: 100vh;
  display: flex;
  flex-direction: column;
  align-items: center;
  justify-content: center;
  gap: 18px;
  pointer-events: none;
  opacity: 0;
  transition: opacity 0.4s;
}
#agentIndicator.visible {
  opacity: 1;
}
/* Card that hosts the WebGL canvas rendered by the visualizer. */
#agentViz {
  width: 90vw;
  height: 90vh;
  aspect-ratio: unset;
  border-radius: 24px;
  box-shadow: 0 8px 32px rgba(0, 0, 0, 0.25), 4px 4px 0px rgba(0,0,0,0.15);
  overflow: hidden;
}
#agentViz canvas {
  width: 100% !important;
  height: 100% !important;
  display: block;
}
/* Text label is currently hidden; state is conveyed by the visualizer. */
#agentIndicator .label {
  display: none;
}
/* Per-state text colours (only relevant if the label is re-enabled). */
#agentIndicator.idle {
  color: #6b3a28;
}
#agentIndicator.listening {
  color: #d4553f;
}
#agentIndicator.thinking {
  color: #a0522d;
}
#agentIndicator.speaking {
  color: #8b4513;
}
/* Deepen the background while PTT is active */
body.ptt-active {
  background: radial-gradient(ellipse at 50% 44%, #f2caa8 0%, #e8b898 100%);
}
#controls {
  position: fixed;
  top: 12px;
  right: 12px;
  z-index: 20;
  pointer-events: auto;
}
.control-btn {
  border: none;
  background: #ffffff;
  color: #111111;
  border-radius: 10px;
  padding: 7px 12px;
  font-family: "SF Mono", ui-monospace, Menlo, Consolas, monospace;
  font-size: 12px;
  letter-spacing: 0.04em;
  cursor: pointer;
  box-shadow: 0 2px 8px rgba(0, 0, 0, 0.15);
}
.control-btn:active {
  transform: translateY(1px);
  box-shadow: 0 1px 4px rgba(0, 0, 0, 0.15);
}
</style>
</head>
<body>
<!-- data-no-ptt="1" excludes an element (and its subtree) from the
     whole-screen push-to-talk pointer handling in the script below. -->
<div id="controls" data-no-ptt="1">
<button id="resetSessionBtn" class="control-btn" type="button" data-no-ptt="1">Reset</button>
</div>
<div id="log"><div id="log-inner"></div></div>
<div id="agentIndicator">
<div id="agentViz"></div>
<span class="label"></span>
</div>
<div id="voiceStatus"></div>
<!-- Sink for the remote WebRTC audio track; hidden, autoplay after gesture. -->
<audio id="remoteAudio" autoplay playsinline hidden></audio>
<script src="/static/three.min.js"></script>
<script>
// DOM handles used throughout the script.
const logEl = document.getElementById("log-inner");
const voiceStatus = document.getElementById("voiceStatus");
const remoteAudio = document.getElementById("remoteAudio");
const agentIndicator = document.getElementById("agentIndicator");
const agentVizEl = document.getElementById("agentViz");
const agentLabel = agentIndicator.querySelector(".label");
const resetSessionBtn = document.getElementById("resetSessionBtn");
// --- Agent state indicator ---
// Keys equal values, so STATES[name] doubles as a membership lookup.
const STATES = { idle: "idle", listening: "listening", thinking: "thinking", speaking: "speaking" };
// Per-state ring colours (currently all the same warm off-white; kept for
// future differentiation — setStateColor below is a no-op).
const STATE_COLORS = {
[STATES.idle]: 0xfff5eb,
[STATES.listening]: 0xfff5eb,
[STATES.thinking]: 0xfff5eb,
[STATES.speaking]: 0xfff5eb,
};
let agentState = STATES.idle;
let agentVisualizer = null; // assigned by createAgentVisualizer() below
let lastRemoteAudioActivityS = 0; // seconds timestamp of last remote-audio event
agentIndicator.classList.add("visible", "idle");
// Switch the indicator to `state`: swap the CSS state class, update the
// (hidden) label, and forward the state to the WebGL visualizer.
const setAgentState = (state) => {
agentState = state;
agentIndicator.classList.remove("listening", "thinking", "speaking", "idle");
agentIndicator.classList.add("visible", state);
agentLabel.textContent = state === STATES.idle ? "" : state;
if (agentVisualizer) agentVisualizer.setState(state);
};
// Build a closed tube geometry: a circle in the xz plane whose y follows a
// gentle cosine ripple, giving `waves` smooth rounded lumps around the ring.
const createParaboloidRing = (radius = 1.1, segments = 320, curvature = 0.06, tubeRadius = 0.022, waves = 5) => {
  // Smooth round lumps: a plain cosine is inherently smooth with symmetric
  // rounded peaks and valleys — no sharpening needed.
  const controlPoints = Array.from({ length: segments + 1 }, (_, step) => {
    const angle = (step / segments) * Math.PI * 2;
    return new THREE.Vector3(
      radius * Math.cos(angle),
      curvature * radius * Math.cos(waves * angle),
      radius * Math.sin(angle),
    );
  });
  const closedCurve = new THREE.CatmullRomCurve3(controlPoints, true);
  return new THREE.TubeGeometry(closedCurve, segments, tubeRadius, 12, true);
};
// Build the WebGL ring visualizer inside #agentViz. Returns a small control
// API ({ setAudioLevel, setState, setConnected, setConnecting }) or null
// when three.js failed to load. Owns its own requestAnimationFrame loop.
const createAgentVisualizer = () => {
  if (!window.THREE || !agentVizEl) return null;
  const renderer = new THREE.WebGLRenderer({
    antialias: true,
    alpha: false,
    powerPreference: "high-performance",
  });
  renderer.setPixelRatio(1); // cap at 1x for performance
  renderer.setClearColor(0xa09b96, 1); // disconnected gray (matches CARD_GRAY_RGB)
  agentVizEl.innerHTML = "";
  agentVizEl.appendChild(renderer.domElement);
  const scene = new THREE.Scene();
  const orthoSize = 2.0;
  const camera = new THREE.OrthographicCamera(-orthoSize, orthoSize, orthoSize, -orthoSize, 0.1, 40);
  const lookAt = new THREE.Vector3(0, 0, 0);
  // Two camera poses: side-on while speaking, top-down otherwise.
  const speakingSideView = new THREE.Vector3(3.45, 0, 0);
  const topView = new THREE.Vector3(0, 3.25, 0.001); // tiny z offset avoids a degenerate lookAt straight down
  camera.position.copy(topView);
  camera.lookAt(lookAt);
  const ambient = new THREE.AmbientLight(0xffffff, 1.0);
  scene.add(ambient);
  const geometry = createParaboloidRing();
  const ringMaterial = new THREE.MeshBasicMaterial({
    color: 0xfff5eb,
    transparent: false,
    side: THREE.DoubleSide,
  });
  const ring = new THREE.Mesh(geometry, ringMaterial);
  const group = new THREE.Group();
  group.add(ring);
  group.rotation.y = Math.PI * 0.18;
  scene.add(group);
  const resize = () => {
    const width = Math.max(2, agentVizEl.clientWidth);
    const height = Math.max(2, agentVizEl.clientHeight);
    renderer.setSize(width, height, false);
    const aspect = width / height;
    // Keep the ring fully visible in both landscape and portrait.
    // Landscape (aspect >= 1): expand horizontally, keep vertical fixed.
    // Portrait (aspect < 1): keep horizontal fixed at orthoSize,
    // expand vertically so the ring isn't clipped.
    if (aspect >= 1) {
      camera.left = -orthoSize * aspect;
      camera.right = orthoSize * aspect;
      camera.top = orthoSize;
      camera.bottom = -orthoSize;
    } else {
      camera.left = -orthoSize;
      camera.right = orthoSize;
      camera.top = orthoSize / aspect;
      camera.bottom = -orthoSize / aspect;
    }
    camera.updateProjectionMatrix();
  };
  resize();
  window.addEventListener("resize", resize);
  // --- Animation state (all smoothed per-frame via exponential lerps) ---
  let currentState = STATES.idle;
  let currentAudioLevel = 0;  // raw 0..1 level pushed in from the audio meter
  let smoothAudioLevel = 0;   // low-pass-filtered level that drives the animation
  let deformScale = 1.0;      // vertical (y) squash/stretch of the ring
  let ringScale = 1.0; // uniform xz scale — used for thickness throb when thinking
  let spinSpeed = 0.0;
  // Card background colour lerp: 0 = idle coral, 1 = dark listening
  let cardColorT = 0.0;
  let connectedT = 0.0; // 0 = gray (disconnected), 1 = coral (connected)
  const CARD_GRAY_RGB = [160, 155, 150]; // disconnected gray
  const CARD_IDLE_RGB = [212, 85, 63]; // #d4553f
  const CARD_LISTEN_RGB = [120, 40, 28]; // dark desaturated coral
  const setStateColor = (_state) => { /* no-op: MeshBasicMaterial, colour is fixed */ };
  let prevCardRGB = "";
  let targetConnected = 0.0;
  let isConnecting = false;
  const renderFrame = (now = 0) => {
    // Frame delta in seconds, clamped so background-tab pauses don't jump the animation.
    const dt = Math.min((now - (renderFrame._lastNow || now)) / 1000, 0.1);
    renderFrame._lastNow = now;
    // Precompute lerp alphas once per frame (dt * 60 normalises to 60Hz baseline).
    const t = dt * 60;
    const lerpAudio = 1 - Math.pow(0.85, t);
    const lerpDeform = 1 - Math.pow(0.88, t);
    const lerpSpin = 1 - Math.pow(0.86, t);
    const lerpRing = 1 - Math.pow(0.90, t);
    smoothAudioLevel += (currentAudioLevel - smoothAudioLevel) * lerpAudio;
    const speakingActive = currentState === STATES.speaking;
    // Target y-deformation: exaggerated while speaking, compressed while thinking.
    let targetDeformScale = 1.0 + (smoothAudioLevel * 1.1);
    if (speakingActive) {
      targetDeformScale = 2.05 + (smoothAudioLevel * 2.9);
    } else if (currentState === STATES.thinking) {
      targetDeformScale = 0.55 + (smoothAudioLevel * 0.35);
    }
    deformScale += (targetDeformScale - deformScale) * lerpDeform;
    group.scale.y = deformScale;
    // Thickness throb when thinking: pulse xz scale at 1 s rate.
    const targetRingScale = currentState === STATES.thinking
      ? 1.0 + 0.18 * (0.5 + 0.5 * Math.sin(now * (Math.PI * 2 / 1000)))
      : 1.0;
    ringScale += (targetRingScale - ringScale) * lerpRing;
    group.scale.x = ringScale;
    group.scale.z = ringScale;
    const targetSpinSpeed = speakingActive
      ? (0.012 + smoothAudioLevel * 0.105)
      : (currentState === STATES.thinking ? 0.006 : 0.0022);
    spinSpeed += (targetSpinSpeed - spinSpeed) * lerpSpin;
    group.rotation.y += spinSpeed * t;
    // Only move camera (and call lookAt) when in speaking state.
    if (speakingActive || camera.position.distanceToSquared(topView) > 0.0001) {
      const lerpCamera = 1 - Math.pow(0.96, t);
      const targetCameraPosition = speakingActive ? speakingSideView : topView;
      camera.position.lerp(targetCameraPosition, lerpCamera);
      camera.lookAt(lookAt);
    }
    // Card background: gray → coral as connection is established, then darken when listening.
    // While connecting, throb the gray base with a slow sine pulse.
    connectedT += (targetConnected - connectedT) * (1 - Math.pow(0.88, t));
    const throb = isConnecting && targetConnected === 0
      ? 0.22 * (0.5 - 0.5 * Math.sin(now * (Math.PI * 2 / 1000))) // 0..0.22 darkness pulse, 1 s period
      : 0.0;
    const baseR = Math.round(CARD_GRAY_RGB[0] + (CARD_IDLE_RGB[0] - CARD_GRAY_RGB[0]) * connectedT - throb * CARD_GRAY_RGB[0]);
    const baseG = Math.round(CARD_GRAY_RGB[1] + (CARD_IDLE_RGB[1] - CARD_GRAY_RGB[1]) * connectedT - throb * CARD_GRAY_RGB[1]);
    const baseB = Math.round(CARD_GRAY_RGB[2] + (CARD_IDLE_RGB[2] - CARD_GRAY_RGB[2]) * connectedT - throb * CARD_GRAY_RGB[2]);
    const targetCardT = currentState === STATES.listening ? 1.0 : 0.0;
    // Asymmetric lerp rate: darken fast when listening starts, recover slowly.
    const cardBase = targetCardT > cardColorT ? 0.05 : 0.7;
    cardColorT += (targetCardT - cardColorT) * (1 - Math.pow(cardBase, t));
    const r = Math.min(255, Math.round(baseR + (CARD_LISTEN_RGB[0] - baseR) * cardColorT));
    const g = Math.min(255, Math.round(baseG + (CARD_LISTEN_RGB[1] - baseG) * cardColorT));
    const b = Math.min(255, Math.round(baseB + (CARD_LISTEN_RGB[2] - baseB) * cardColorT));
    // Only touch the GL clear colour when the rounded RGB actually changed.
    const cardRGB = `${r},${g},${b}`;
    if (cardRGB !== prevCardRGB) {
      renderer.setClearColor((r << 16) | (g << 8) | b, 1);
      prevCardRGB = cardRGB;
    }
    renderer.render(scene, camera);
    requestAnimationFrame(renderFrame);
  };
  setStateColor(currentState);
  requestAnimationFrame(renderFrame);
  // Public control surface consumed by the rest of the script.
  return {
    setAudioLevel: (level) => {
      // Clamp to [0, 1]; NaN/non-numbers collapse to 0.
      currentAudioLevel = Math.max(0, Math.min(1, Number(level) || 0));
    },
    setState: (state) => {
      if (!STATES[state]) return;
      currentState = state;
      setStateColor(state);
    },
    setConnected: (connected) => {
      targetConnected = connected ? 1.0 : 0.0;
      if (connected) isConnecting = false;
    },
    setConnecting: (connecting) => {
      isConnecting = !!connecting;
    },
  };
};
// Build the visualizer once and sync it with the initial agent state.
agentVisualizer = createAgentVisualizer();
if (agentVisualizer) agentVisualizer.setState(agentState);
// Record (in seconds) any sign of life from the remote <audio> element.
const markRemoteAudioActivity = () => {
  lastRemoteAudioActivityS = performance.now() / 1000;
};
remoteAudio.addEventListener("playing", markRemoteAudioActivity);
remoteAudio.addEventListener("timeupdate", markRemoteAudioActivity);
remoteAudio.addEventListener("canplay", markRemoteAudioActivity);
remoteAudio.addEventListener("seeked", markRemoteAudioActivity);
// Chat/signalling WebSocket on the same host; wss when the page is https.
const wsProto = location.protocol === "https:" ? "wss" : "ws";
const ws = new WebSocket(`${wsProto}://${location.host}/ws/chat`);
// --- Voice-channel state ---
let peerConnection = null;        // active RTCPeerConnection, if any
let micStream = null;             // local microphone MediaStream
let remoteStream = null;          // aggregated remote audio MediaStream
let voiceConnected = false;       // true once the RTC channel is up
let disconnectedTimer = null;     // grace timer for the transient "disconnected" RTC state
let reconnectTimer = null;        // pending reconnect setTimeout handle
let reconnectAttempts = 0;
let voiceDesired = false;         // user intent: keep voice up / allow auto-reconnect
let connectingVoice = false;      // connectVoiceChannel() currently in progress
let pttPressed = false;           // push-to-talk currently held
let rtcAnswerApplied = false;     // remote SDP answer has been applied
let pendingRemoteCandidates = []; // ICE candidates queued until the answer applies
let appStarted = false;           // first user gesture has bootstrapped the app
const MAX_RECONNECT_ATTEMPTS = 2;
// --- Visualizer audio metering ---
const AudioContextCtor = window.AudioContext || window.webkitAudioContext; // webkit prefix for older Safari
let visualizerAudioContext = null;
let visualizerSourceNode = null;
let visualizerSourceStream = null; // stream currently feeding the analyser
let visualizerAnalyser = null;
let visualizerWaveform = null;     // reusable time-domain sample buffer
let visualizerMeterRunning = false;
// --- Status overlay ---
let statusTimer = null;
// Show `text` in the status pill. With persistMs > 0 the pill auto-hides
// after that delay; otherwise it stays until the next call replaces it.
const showStatus = (text, persistMs = 0) => {
  voiceStatus.textContent = text;
  voiceStatus.classList.add("visible");
  if (statusTimer !== null) {
    clearTimeout(statusTimer);
    statusTimer = null;
  }
  if (!(persistMs > 0)) return;
  statusTimer = setTimeout(() => {
    voiceStatus.classList.remove("visible");
    statusTimer = null;
  }, persistMs);
};
// Start the per-frame RMS level sampler feeding the visualizer.
// Idempotent: only one requestAnimationFrame loop ever runs.
const startVisualizerMeter = () => {
  if (visualizerMeterRunning) return;
  visualizerMeterRunning = true;
  const sampleLevel = () => {
    let level = 0;
    if (visualizerAnalyser && visualizerWaveform) {
      visualizerAnalyser.getByteTimeDomainData(visualizerWaveform);
      // RMS of the centred time-domain samples (bytes centred on 128).
      const total = visualizerWaveform.reduce((acc, sample) => {
        const centered = (sample - 128) / 128;
        return acc + centered * centered;
      }, 0);
      const rms = Math.sqrt(total / visualizerWaveform.length);
      level = Math.min(1, rms * 4.8); // boost quiet speech, clamp to 1
    }
    if (agentVisualizer) agentVisualizer.setAudioLevel(level);
    requestAnimationFrame(sampleLevel);
  };
  requestAnimationFrame(sampleLevel);
};
// Lazily build the AudioContext + analyser chain, (re)attach it to the
// current remote stream when that stream changes, and make sure the
// sampling loop is running. Safe to call repeatedly.
const ensureVisualizerAudioMeter = async () => {
  if (!agentVisualizer || !AudioContextCtor) return;
  visualizerAudioContext = visualizerAudioContext || new AudioContextCtor();
  if (visualizerAudioContext.state === "suspended") {
    // Resume requires a user gesture; ignore failures and try again later.
    try { await visualizerAudioContext.resume(); } catch (_) {}
  }
  if (visualizerAnalyser === null) {
    visualizerAnalyser = visualizerAudioContext.createAnalyser();
    visualizerAnalyser.fftSize = 512;
    visualizerAnalyser.smoothingTimeConstant = 0.84;
    visualizerWaveform = new Uint8Array(visualizerAnalyser.fftSize);
  }
  const hasRemoteAudio = Boolean(
    remoteStream
    && remoteStream.getAudioTracks
    && remoteStream.getAudioTracks().length > 0
  );
  if (hasRemoteAudio && visualizerSourceStream !== remoteStream) {
    // Swap the analyser over to the new remote stream.
    if (visualizerSourceNode) {
      try { visualizerSourceNode.disconnect(); } catch (_) {}
      visualizerSourceNode = null;
    }
    try {
      visualizerSourceNode = visualizerAudioContext.createMediaStreamSource(remoteStream);
      visualizerSourceNode.connect(visualizerAnalyser);
      visualizerSourceStream = remoteStream;
    } catch (_err) {
      visualizerSourceNode = null;
      visualizerSourceStream = null;
    }
  }
  startVisualizerMeter();
};
// --- Log ---
const MAX_LOG_LINES = 250;
const MAX_PENDING_LOG_LINES = 500;
const pendingLogItems = [];
let logFlushScheduled = false;
// Drain the pending queue into the DOM with one batched fragment append,
// then trim the oldest rendered lines beyond the retention cap.
const flushPendingLogItems = () => {
  logFlushScheduled = false;
  if (!pendingLogItems.length) return;
  const batch = pendingLogItems.splice(0);
  const fragment = document.createDocumentFragment();
  batch.forEach((item) => {
    const role = item.role || "system";
    const normalizedRole = role.toString().trim().toLowerCase();
    const rawText = (item.text || "").toString();
    const time = item.timestamp ? new Date(item.timestamp).toLocaleTimeString() : "";
    const line = document.createElement("div");
    line.className = `line ${role}`;
    if (normalizedRole === "nanobot") {
      // Strip a redundant "nanobot:" (or ASR-mangled "napbot") prefix.
      const cleaned = rawText.replace(/^(?:nanobot|napbot)\b\s*[:>\-]?\s*/i, "");
      line.textContent = `[${time}] ${cleaned}`;
    } else {
      line.textContent = `[${time}] ${role}: ${rawText}`;
    }
    fragment.appendChild(line);
  });
  logEl.appendChild(fragment);
  while (logEl.childElementCount > MAX_LOG_LINES && logEl.firstElementChild) {
    logEl.removeChild(logEl.firstElementChild);
  }
};
// Coalesce multiple appends into a single DOM write per animation frame.
const scheduleLogFlush = () => {
  if (logFlushScheduled) return;
  logFlushScheduled = true;
  requestAnimationFrame(flushPendingLogItems);
};
// Queue a transcript line for rendering; backlog is bounded so a flood of
// messages keeps only the newest MAX_PENDING_LOG_LINES entries.
const appendLine = (role, text, timestamp) => {
  pendingLogItems.push({ role, text, timestamp });
  const overflow = pendingLogItems.length - MAX_PENDING_LOG_LINES;
  if (overflow > 0) pendingLogItems.splice(0, overflow);
  scheduleLogFlush();
};
// Send a JSON payload over the signalling socket; silently dropped when
// the socket is not open.
const sendJson = (payload) => {
  if (ws.readyState === WebSocket.OPEN) ws.send(JSON.stringify(payload));
};
// Send a trimmed user text message. Returns true only when the message
// was actually handed to an open socket.
const sendUserMessage = (text) => {
  const message = (text || "").toString().trim();
  if (!message) return false;
  if (ws.readyState !== WebSocket.OPEN) {
    showStatus("WebSocket disconnected.", 2000);
    return false;
  }
  sendJson({ type: "user-message", text: message });
  return true;
};
// --- Voice state ---
// Record voice-channel connectivity and mirror it into the visualizer.
const setVoiceConnected = (connected) => {
  voiceConnected = connected;
  if (agentVisualizer) agentVisualizer.setConnected(connected);
};
// Toggle (rather than stop) mic tracks so releasing PTT keeps the capture warm.
const setMicCaptureEnabled = (enabled) => {
  if (!micStream) return;
  for (const track of micStream.getAudioTracks()) {
    track.enabled = enabled;
  }
};
// Apply the full push-to-talk state: body class, mic gating, optional
// server notification, agent-state indicator and status pill.
const setPushToTalkState = (pressed, notifyServer = true) => {
  pttPressed = pressed;
  document.body.classList.toggle("ptt-active", pressed);
  setMicCaptureEnabled(pressed);
  if (notifyServer && ws.readyState === WebSocket.OPEN) {
    ws.send(JSON.stringify({ type: "voice-ptt", pressed }));
  }
  if (!pressed) {
    // Only drop back to idle if we were the ones showing "listening".
    if (agentState === STATES.listening) setAgentState(STATES.idle);
    if (voiceConnected) showStatus("Hold anywhere to talk", 1800);
    return;
  }
  setAgentState(STATES.listening);
  showStatus("Listening...");
};
// Start PTT — requires a live voice channel and must not double-fire.
const beginPushToTalk = () => {
  if (!voiceConnected || !peerConnection || !micStream || pttPressed) return;
  setPushToTalkState(true);
};
const endPushToTalk = () => {
  if (pttPressed) setPushToTalkState(false);
};
// --- Reconnect ---
const clearReconnectTimer = () => {
  if (!reconnectTimer) return;
  clearTimeout(reconnectTimer);
  reconnectTimer = null;
};
// Schedule a voice reconnect attempt. Skipped unless voice is still wanted,
// not already up or connecting, no retry is pending, and the attempt budget
// is not exhausted.
const scheduleReconnect = (reason, delayMs = 1200) => {
  if (!voiceDesired || voiceConnected || connectingVoice || reconnectTimer) return;
  if (reconnectAttempts >= MAX_RECONNECT_ATTEMPTS) {
    showStatus("Voice reconnect failed.");
    return;
  }
  reconnectAttempts += 1;
  showStatus(`${reason} Retrying (${reconnectAttempts}/${MAX_RECONNECT_ATTEMPTS})...`);
  reconnectTimer = setTimeout(async () => {
    reconnectTimer = null;
    await connectVoiceChannel();
  }, delayMs);
};
// Tear down the WebRTC voice channel and all associated media resources.
// With clearDesired, also cancel auto-reconnect intent. Shows statusText
// (if any) for a few seconds afterwards.
const stopVoiceChannel = async (statusText = "", clearDesired = false) => {
  if (clearDesired) {
    voiceDesired = false;
    reconnectAttempts = 0;
    clearReconnectTimer();
  }
  if (disconnectedTimer) {
    clearTimeout(disconnectedTimer);
    disconnectedTimer = null;
  }
  pendingRemoteCandidates = [];
  rtcAnswerApplied = false;
  setPushToTalkState(false, false);
  const pc = peerConnection;
  if (pc) {
    // Detach handlers before closing so teardown doesn't re-enter us.
    pc.ontrack = null;
    pc.onicecandidate = null;
    pc.onconnectionstatechange = null;
    pc.close();
    peerConnection = null;
  }
  if (micStream) {
    for (const track of micStream.getTracks()) track.stop();
    micStream = null;
  }
  if (remoteStream) {
    for (const track of remoteStream.getTracks()) track.stop();
    remoteStream = null;
  }
  remoteAudio.srcObject = null;
  setVoiceConnected(false);
  lastRemoteAudioActivityS = 0;
  // Detach the visualizer's audio tap and zero the meter.
  visualizerSourceStream = null;
  if (visualizerSourceNode) {
    try { visualizerSourceNode.disconnect(); } catch (_) {}
    visualizerSourceNode = null;
  }
  if (agentVisualizer) {
    agentVisualizer.setAudioLevel(0);
    agentVisualizer.setConnecting(false);
  }
  if (statusText) showStatus(statusText, 3000);
};
// --- WebRTC ---
// Apply the server's SDP answer, then flush any ICE candidates that were
// queued while the answer was still in flight.
const applyRtcAnswer = async (message) => {
  if (!peerConnection) return;
  const rawSdp = (message.sdp || "").toString();
  if (!rawSdp.trim()) return;
  // Normalise line endings to strict CRLF with a single trailing CRLF,
  // as the SDP grammar requires.
  const normalizedLines = rawSdp
    .replace(/\r\n/g, "\n")
    .replace(/\r/g, "\n")
    .split("\n")
    .map((line) => line.trimEnd());
  const sdp = `${normalizedLines.join("\r\n").trim()}\r\n`;
  try {
    await peerConnection.setRemoteDescription({ type: message.rtcType || "answer", sdp });
    rtcAnswerApplied = true;
    const queued = pendingRemoteCandidates;
    pendingRemoteCandidates = [];
    for (const candidate of queued) {
      try { await peerConnection.addIceCandidate(candidate); } catch (_) {}
    }
    reconnectAttempts = 0;
  } catch (err) {
    await stopVoiceChannel("Voice setup failed.");
    scheduleReconnect("Failed to apply answer.");
    appendLine("system", `RTC answer error: ${err}`, new Date().toISOString());
  }
};
// Apply (or queue) a remote ICE candidate. A null candidate marks
// end-of-candidates and is queued/forwarded just like a real one.
const applyRtcIceCandidate = async (message) => {
  if (!peerConnection) return;
  const answerReady = rtcAnswerApplied && Boolean(peerConnection.remoteDescription);
  const candidate = message.candidate;
  if (candidate == null) {
    if (!answerReady) {
      pendingRemoteCandidates.push(null);
      return;
    }
    try { await peerConnection.addIceCandidate(null); } catch (_) {}
    return;
  }
  if (!answerReady) {
    pendingRemoteCandidates.push(candidate);
    return;
  }
  try {
    await peerConnection.addIceCandidate(candidate);
  } catch (err) {
    appendLine("system", `RTC ICE error: ${err}`, new Date().toISOString());
  }
};
// Establish the WebRTC voice channel: capture the microphone (kept muted
// until PTT), create the peer connection, route remote audio into the
// <audio> element and the visualizer, then send an SDP offer over the
// WebSocket. The answer/candidates arrive asynchronously via ws.onmessage.
const connectVoiceChannel = async () => {
  if (voiceConnected || peerConnection || connectingVoice) return;
  if (!window.RTCPeerConnection || !navigator.mediaDevices?.getUserMedia) {
    showStatus("Voice unavailable in this browser.", 4000);
    return;
  }
  if (ws.readyState !== WebSocket.OPEN) {
    showStatus("Connecting...");
    return;
  }
  connectingVoice = true;
  if (agentVisualizer) agentVisualizer.setConnecting(true);
  showStatus("Connecting voice...");
  try {
    clearReconnectTimer();
    rtcAnswerApplied = false;
    pendingRemoteCandidates = [];
    // Prefer tuned capture constraints; fall back to plain audio if rejected.
    try {
      micStream = await navigator.mediaDevices.getUserMedia({
        audio: { channelCount: 1, sampleRate: 48000, sampleSize: 16, latency: 0,
          echoCancellation: true, noiseSuppression: true, autoGainControl: false },
        video: false,
      });
    } catch (_) {
      micStream = await navigator.mediaDevices.getUserMedia({ audio: true, video: false });
    }
    // Mic starts muted; PTT enables the tracks while held.
    setMicCaptureEnabled(false);
    peerConnection = new RTCPeerConnection({ iceServers: [{ urls: "stun:stun.l.google.com:19302" }] });
    remoteStream = new MediaStream();
    remoteAudio.srcObject = remoteStream;
    peerConnection.ontrack = (event) => {
      if (event.track.kind !== "audio") return;
      remoteStream.addTrack(event.track);
      // play() may reject before a user gesture; ignore and retry on bootstrap.
      remoteAudio.play().then(() => {
        markRemoteAudioActivity();
        ensureVisualizerAudioMeter();
      }).catch(() => {});
    };
    peerConnection.onicecandidate = (event) => {
      // A null candidate signals end-of-candidates; forward it as-is.
      if (!event.candidate) { sendJson({ type: "rtc-ice-candidate", candidate: null }); return; }
      sendJson({ type: "rtc-ice-candidate", candidate: event.candidate.toJSON() });
    };
    peerConnection.onconnectionstatechange = () => {
      const state = peerConnection?.connectionState || "new";
      if (state === "connected") {
        if (disconnectedTimer) { clearTimeout(disconnectedTimer); disconnectedTimer = null; }
        clearReconnectTimer();
        reconnectAttempts = 0;
        setVoiceConnected(true);
        showStatus("Hold anywhere to talk", 2500);
        return;
      }
      if (state === "failed" || state === "closed") {
        stopVoiceChannel(`Voice channel ${state}.`);
        scheduleReconnect(`Voice channel ${state}.`);
        return;
      }
      if (state === "disconnected") {
        // "disconnected" can be transient; give it 8 s to recover before teardown.
        if (disconnectedTimer) clearTimeout(disconnectedTimer);
        showStatus("Voice disconnected. Waiting...");
        disconnectedTimer = setTimeout(() => {
          if (peerConnection?.connectionState === "disconnected") {
            stopVoiceChannel("Voice channel disconnected.");
            scheduleReconnect("Voice channel disconnected.");
          }
        }, 8000);
        return;
      }
    };
    micStream.getAudioTracks().forEach((track) => { peerConnection.addTrack(track, micStream); });
    const offer = await peerConnection.createOffer();
    await peerConnection.setLocalDescription(offer);
    sendJson({ type: "rtc-offer", sdp: offer.sdp, rtcType: offer.type });
  } catch (err) {
    await stopVoiceChannel("Voice setup failed.");
    scheduleReconnect("Voice setup failed.");
    appendLine("system", `Voice setup error: ${err}`, new Date().toISOString());
  } finally {
    connectingVoice = false;
    // Stop throb if connection failed (success path clears it in setConnected)
    if (!voiceConnected && agentVisualizer) agentVisualizer.setConnecting(false);
  }
};
// --- First-tap bootstrap ---
// One-shot app start: unlock audio playback, start the level meter, ask the
// server to spawn the agent, then bring up the voice channel.
const bootstrap = async () => {
  if (appStarted) return;
  appStarted = true;
  // Unblock audio context (required by browsers before user gesture resolves)
  remoteAudio.play().catch(() => {});
  await ensureVisualizerAudioMeter();
  sendJson({ type: "spawn" });
  voiceDesired = true;
  reconnectAttempts = 0;
  await connectVoiceChannel();
};
if (resetSessionBtn) {
  // Reset button: bootstraps on first use, then issues the /reset command.
  resetSessionBtn.addEventListener("click", async (event) => {
    event.preventDefault();
    event.stopPropagation();
    if (!appStarted) await bootstrap();
    const sent = sendUserMessage("/reset");
    if (sent) showStatus("Reset command sent.", 1500);
  });
}
// --- Whole-screen PTT pointer handling ---
// Track active pointer IDs so multi-touch doesn't double-fire PTT.
const activePointers = new Set();
const releasePointer = (event) => {
  activePointers.delete(event.pointerId);
  if (activePointers.size === 0) endPushToTalk();
};
document.addEventListener("pointerdown", async (event) => {
  // Elements tagged data-no-ptt (e.g. the Reset button) opt out of PTT.
  const target = event.target;
  if (target instanceof Element && target.closest("[data-no-ptt='1']")) {
    return;
  }
  // The very first tap only bootstraps the app; it does not start PTT.
  if (!appStarted) {
    await bootstrap();
    return;
  }
  ensureVisualizerAudioMeter();
  activePointers.add(event.pointerId);
  if (activePointers.size === 1) beginPushToTalk();
}, { passive: false });
document.addEventListener("pointerup", releasePointer, { passive: false });
document.addEventListener("pointercancel", releasePointer, { passive: false });
// --- WebSocket ---
ws.onopen = () => {
  appendLine("system", "WebSocket connected.", new Date().toISOString());
  showStatus("Tap anywhere to start", 0);
};
ws.onclose = async () => {
  appendLine("system", "WebSocket disconnected.", new Date().toISOString());
  await stopVoiceChannel("Disconnected.", true);
};
ws.onerror = () => appendLine("system", "WebSocket error.", new Date().toISOString());
// Dispatch incoming signalling/chat messages. Unparseable payloads are
// logged raw as system lines.
ws.onmessage = async (event) => {
  try {
    const msg = JSON.parse(event.data);
    if (msg.type === "rtc-answer") { await applyRtcAnswer(msg); return; }
    if (msg.type === "rtc-ice-candidate") { await applyRtcIceCandidate(msg); return; }
    if (msg.type === "rtc-state") {
      const state = (msg.state || "").toString();
      if (state === "connected") {
        setVoiceConnected(true);
        showStatus("Hold anywhere to talk", 2500);
      }
      return;
    }
    if (msg.type === "rtc-error") {
      const text = (msg.message || "Unknown WebRTC error.").toString();
      showStatus(`Voice error: ${text}`, 4000);
      appendLine("system", `Voice error: ${text}`, new Date().toISOString());
      await stopVoiceChannel("Voice channel error.");
      scheduleReconnect("Voice channel error.");
      return;
    }
    // Drive agent state indicator from server-sent agent-state events
    if (msg.role === "agent-state") {
      // .toString() hardens against non-string payloads (matches the other handlers).
      const newState = (msg.text || "").toString().trim();
      // Don't override listening state (user is holding PTT).
      // Use an own-property check: a plain-object truthiness test would let
      // inherited keys ("toString", "constructor", ...) pass validation and
      // be installed as an agent state / CSS class.
      if (agentState !== STATES.listening && Object.prototype.hasOwnProperty.call(STATES, newState)) {
        setAgentState(newState);
      }
    } else if (msg.role === "wisper") {
      // suppress wisper debug output
    } else {
      appendLine(msg.role || "system", msg.text || "", msg.timestamp || "");
    }
  } catch (_err) {
    appendLine("system", event.data, new Date().toISOString());
  }
};
</script>
</body>
</html>