feat: polish life os cards and voice stack
This commit is contained in:
parent
66362c7176
commit
0edf8c3fef
21 changed files with 3681 additions and 502 deletions
52
scripts/install_melotts_cpu.sh
Normal file
52
scripts/install_melotts_cpu.sh
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
#!/usr/bin/env bash
# Install a CPU-only MeloTTS stack into the repository's existing virtualenv.
#
# Steps, in order:
#   1. CPU builds of torch/torchaudio from the PyTorch CPU wheel index.
#   2. setuptools pinned below 81.
#   3. The pinned runtime packages listed in RUNTIME_PACKAGES.
#   4. MeloTTS itself from GitHub, installed with --no-deps.
#   5. The NLTK data packages, downloaded into ~/nltk_data.
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
VENV_PYTHON="${ROOT_DIR}/.venv/bin/python"

# Run "pip install" through the virtualenv interpreter with the given arguments.
pip_install() {
  "${VENV_PYTHON}" -m pip install "$@"
}

# The virtualenv must already exist; this script never creates it.
[[ -x "${VENV_PYTHON}" ]] || {
  echo "error: ${VENV_PYTHON} does not exist. Create the web UI virtualenv first." >&2
  exit 1
}

TORCH_PACKAGES=(
  "torch==2.7.1+cpu"
  "torchaudio==2.7.1+cpu"
)

RUNTIME_PACKAGES=(
  txtsplit
  cached_path
  "transformers==4.46.3"
  "num2words==0.5.12"
  "unidic_lite==1.0.8"
  "mecab-python3==1.0.9"
  fugashi
  "pykakasi==2.2.1"
  "g2p_en==2.1.0"
  "anyascii==0.3.2"
  "jamo==0.4.1"
  "gruut[de,es,fr]==2.2.3"
  "librosa==0.9.1"
  "pydub==0.25.1"
  "eng_to_ipa==0.0.2"
  "inflect==7.0.0"
  "unidecode==1.3.7"
  "pypinyin==0.50.0"
  "cn2an==0.5.22"
  "jieba==0.42.1"
  soundfile
  tqdm
)

pip_install --index-url https://download.pytorch.org/whl/cpu "${TORCH_PACKAGES[@]}"

pip_install "setuptools<81"

pip_install "${RUNTIME_PACKAGES[@]}"

# NOTE(review): --no-deps presumably keeps pip from replacing the pins installed
# above with MeloTTS's own requirements - confirm against its setup metadata.
pip_install --no-deps "git+https://github.com/myshell-ai/MeloTTS.git"

# Fetch the NLTK data packages into the current user's ~/nltk_data directory.
"${VENV_PYTHON}" - <<'PY'
import os

import nltk

target_dir = os.path.expanduser("~/nltk_data")
for resource in ("averaged_perceptron_tagger", "averaged_perceptron_tagger_eng", "cmudict"):
    nltk.download(resource, download_dir=target_dir)
PY
|
||||
252
scripts/melotts_server.py
Normal file
252
scripts/melotts_server.py
Normal file
|
|
@ -0,0 +1,252 @@
|
|||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import base64
|
||||
import contextlib
|
||||
import json
|
||||
import os
|
||||
import signal
|
||||
import socket
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# numpy is imported defensively so that a missing dependency is reported by
# MeloTTSServer.__init__ as a readable error instead of failing at import time.
NUMPY_IMPORT_ERROR = None
try:
    import numpy as np
except Exception as numpy_exc:  # pragma: no cover - runtime fallback when dependency is missing
    np = None  # type: ignore[assignment]
    NUMPY_IMPORT_ERROR = numpy_exc


# Two levels up from this file (the parent of the directory containing it).
ROOT_DIR = Path(__file__).resolve().parents[1]
# Workspace directory; overridable through NANOBOT_WORKSPACE, defaults to ~/.nanobot.
WORKSPACE_DIR = Path(os.environ.get("NANOBOT_WORKSPACE", str(Path.home() / ".nanobot"))).expanduser()
# Default Unix socket path; overridable through MELO_TTS_SOCKET.
SOCKET_PATH = Path(os.environ.get("MELO_TTS_SOCKET", str(WORKSPACE_DIR.joinpath("melotts.sock")))).expanduser()


# Same defensive pattern for MeloTTS: record the failure and let the server
# constructor surface it.
IMPORT_ERROR = None
try:
    from melo.api import TTS
except Exception as melo_exc:  # pragma: no cover - runtime fallback when dependency is missing
    TTS = None  # type: ignore[assignment]
    IMPORT_ERROR = melo_exc

# True only when the MeloTTS import above succeeded.
MELO_TTS_AVAILABLE = IMPORT_ERROR is None
|
||||
|
||||
|
||||
class MeloTTSServer:
    """In-process MeloTTS model wrapper configured from ``MELO_TTS_*`` environment variables.

    Environment variables read by the constructor:

    * ``MELO_TTS_LANGUAGE`` (default ``EN``)
    * ``MELO_TTS_DEVICE`` (default ``cpu``)
    * ``MELO_TTS_SPEED`` (default ``1.0``)
    * ``MELO_TTS_SPEAKER`` (default ``EN-US``)
    * ``MELO_TTS_WARMUP_TEXT`` (default ``Nanobot is ready.``; empty disables warmup)

    Every public method returns a JSON-serializable dict with ``"ok": True``;
    failures are raised as ``RuntimeError``.
    """

    def __init__(self) -> None:
        """Load the model, resolve the configured speaker and optionally warm up.

        Raises:
            RuntimeError: if MeloTTS or numpy could not be imported, if
                ``MELO_TTS_SPEED`` is invalid, or if the configured speaker is
                not offered by the loaded model.
        """
        if not MELO_TTS_AVAILABLE or TTS is None:
            raise RuntimeError(f"MeloTTS import failed: {IMPORT_ERROR}")
        if np is None:
            raise RuntimeError(f"numpy import failed: {NUMPY_IMPORT_ERROR}")

        self._language = os.getenv("MELO_TTS_LANGUAGE", "EN").strip() or "EN"
        self._device = os.getenv("MELO_TTS_DEVICE", "cpu").strip() or "cpu"
        self._speed = self._read_speed()
        self._speaker_name = os.getenv("MELO_TTS_SPEAKER", "EN-US").strip() or "EN-US"
        self._warmup_text = os.getenv("MELO_TTS_WARMUP_TEXT", "Nanobot is ready.").strip()

        self._model = TTS(language=self._language, device=self._device)
        self._speaker_ids = dict(getattr(self._model.hps.data, "spk2id", {}))
        if self._speaker_name not in self._speaker_ids:
            available = ", ".join(sorted(self._speaker_ids))
            raise RuntimeError(
                f"speaker '{self._speaker_name}' is not available for language {self._language}. "
                f"Available speakers: {available}"
            )
        self._speaker_id = self._speaker_ids[self._speaker_name]
        if self._warmup_text:
            self._warmup()

    @staticmethod
    def _read_speed() -> float:
        """Parse ``MELO_TTS_SPEED`` into a positive, finite float.

        An unset or blank value falls back to ``1.0``, matching how the other
        settings treat blank values.

        Raises:
            RuntimeError: if the value is not a number or is not positive and finite.
        """
        raw = os.getenv("MELO_TTS_SPEED", "1.0").strip() or "1.0"
        try:
            speed = float(raw)
        except ValueError as exc:
            raise RuntimeError(f"MELO_TTS_SPEED must be a number, got {raw!r}") from exc
        # The chained comparison is False for NaN, infinity, zero and negatives.
        if not 0.0 < speed < float("inf"):
            raise RuntimeError(f"MELO_TTS_SPEED must be a positive finite number, got {raw!r}")
        return speed

    @staticmethod
    def _normalize_text(text: str) -> str:
        """Collapse all whitespace runs to single spaces; reject empty text.

        Raises:
            RuntimeError: if nothing but whitespace was supplied.
        """
        clean_text = " ".join(text.split())
        if not clean_text:
            raise RuntimeError("text is empty")
        return clean_text

    def ping(self) -> dict[str, Any]:
        """Return a health payload describing the loaded configuration."""
        return {
            "ok": True,
            "language": self._language,
            "device": self._device,
            "speaker": self._speaker_name,
            "speakers": sorted(self._speaker_ids),
        }

    def synthesize_pcm(self, text: str) -> dict[str, Any]:
        """Synthesize ``text`` and return base64-encoded 16-bit little-endian PCM.

        Raises:
            RuntimeError: if ``text`` is empty after whitespace normalization
                or the model produced no audio.
        """
        clean_text = self._normalize_text(text)

        pcm, sample_rate, channels = self._synthesize_pcm(clean_text)
        return {
            "ok": True,
            "encoding": "pcm_s16le_base64",
            "pcm": base64.b64encode(pcm).decode("ascii"),
            "sample_rate": sample_rate,
            "channels": channels,
            "language": self._language,
            "speaker": self._speaker_name,
        }

    def synthesize_to_file(self, text: str, output_wav: str) -> dict[str, Any]:
        """Synthesize ``text`` into ``output_wav``, creating parent directories.

        Raises:
            RuntimeError: if ``text`` is empty after whitespace normalization.
        """
        clean_text = self._normalize_text(text)

        output_path = Path(output_wav)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        self._model.tts_to_file(
            clean_text,
            self._speaker_id,
            str(output_path),
            speed=self._speed,
            quiet=True,
        )
        return {
            "ok": True,
            "output_wav": str(output_path),
            "language": self._language,
            "speaker": self._speaker_name,
        }

    def _warmup(self) -> None:
        """Run one synthesis of the warmup text and discard the audio."""
        self._synthesize_pcm(self._warmup_text)

    def _synthesize_pcm(self, text: str) -> tuple[bytes, int, int]:
        """Synthesize ``text`` in memory.

        Returns:
            ``(pcm_bytes, sample_rate, channels)`` where ``pcm_bytes`` holds
            16-bit little-endian samples, interleaved when there is more than
            one channel.

        Raises:
            RuntimeError: if numpy is unavailable or the model returned no samples.
        """
        # Check before the expensive model call rather than after it.
        if np is None:
            raise RuntimeError("numpy is unavailable")

        # A ``None`` output path makes the call hand back the waveform.
        wav = self._model.tts_to_file(
            text,
            self._speaker_id,
            None,
            speed=self._speed,
            quiet=True,
        )
        samples = np.asarray(wav)
        if samples.size == 0:
            raise RuntimeError("MeloTTS produced empty audio")

        channels = 1
        if samples.ndim == 2:
            dim0, dim1 = int(samples.shape[0]), int(samples.shape[1])
            if dim0 <= 2 and dim1 > dim0:
                # (channels, frames): transpose so frames are interleaved.
                channels = dim0
                samples = samples.T
            elif dim1 <= 2 and dim0 > dim1:
                # Already (frames, channels).
                channels = dim1
            else:
                # Ambiguous layout: treat as a flat mono stream.
                samples = samples.reshape(-1)
        else:
            # Scalars, 1-D audio and anything with 3+ dimensions become flat mono.
            samples = samples.reshape(-1)

        # Explicit little-endian dtype so the bytes match the advertised
        # "pcm_s16le" encoding regardless of host byte order.
        pcm_dtype = np.dtype("<i2")
        if np.issubdtype(samples.dtype, np.floating):
            # NaN would survive clip() and convert to an arbitrary integer;
            # map it to silence first.
            samples = np.nan_to_num(samples, nan=0.0)
            samples = np.clip(samples, -1.0, 1.0)
            samples = (samples * 32767.0).astype(pcm_dtype)
        else:
            samples = samples.astype(pcm_dtype, copy=False)

        sample_rate = int(getattr(self._model.hps.data, "sampling_rate", 44100))
        return samples.tobytes(), sample_rate, max(1, channels)
|
||||
|
||||
|
||||
def _build_parser() -> argparse.ArgumentParser:
    """Create the sidecar's CLI parser; its only option is ``--socket-path``."""
    cli = argparse.ArgumentParser(description="Persistent MeloTTS sidecar for Nanobot voice.")
    # Defaults to the module-level SOCKET_PATH.
    cli.add_argument("--socket-path", default=str(SOCKET_PATH))
    return cli
|
||||
|
||||
|
||||
def _receive_json(conn: socket.socket) -> dict[str, Any]:
    """Read one newline-terminated JSON request from ``conn``.

    Data is accumulated until a chunk containing a newline arrives or the peer
    closes the connection. Invalid UTF-8 is decoded with replacement characters.

    Returns:
        The decoded JSON object, or ``{}`` when the peer sent nothing but
        whitespace.

    Raises:
        json.JSONDecodeError: if the payload is not valid JSON.
        ValueError: if the payload is valid JSON but not an object; callers
            rely on ``dict`` methods such as ``.get``.
    """
    chunks: list[bytes] = []
    while True:
        data = conn.recv(8192)
        if not data:
            break
        chunks.append(data)
        if b"\n" in data:
            break

    payload = b"".join(chunks).decode("utf-8", errors="replace").strip()
    if not payload:
        return {}

    request = json.loads(payload)
    if not isinstance(request, dict):
        raise ValueError(f"request must be a JSON object, got {type(request).__name__}")
    return request
|
||||
|
||||
|
||||
def _send_json(conn: socket.socket, payload: dict[str, Any]) -> None:
    """Serialize ``payload`` as a single JSON line and write all of it to ``conn``."""
    line = json.dumps(payload) + "\n"
    conn.sendall(line.encode("utf-8"))
|
||||
|
||||
|
||||
# Upper bound on any single read from or write to an accepted client socket.
# Without it a client that connects and then stays silent would block this
# single-threaded server indefinitely.
_CLIENT_IO_TIMEOUT_S = 30.0


def _dispatch_request(server: "MeloTTSServer", request: dict[str, Any]) -> dict[str, Any]:
    """Route one decoded request to the matching server method.

    Raises:
        RuntimeError: for an unknown or missing action, or a ``synthesize``
            request without ``output_wav``.
    """
    action = str(request.get("action", "")).strip().lower()
    if action == "ping":
        return server.ping()
    if action == "synthesize_pcm":
        return server.synthesize_pcm(str(request.get("text", "")))
    if action == "synthesize":
        text = str(request.get("text", ""))
        output_wav = str(request.get("output_wav", ""))
        if not output_wav:
            raise RuntimeError("output_wav is required")
        return server.synthesize_to_file(text, output_wav)
    raise RuntimeError(f"unsupported action: {action or 'missing'}")


def _serve_connection(server: "MeloTTSServer", conn: socket.socket) -> None:
    """Handle exactly one request/response exchange on ``conn``.

    Any failure is reported to the client as ``{"ok": false, "error": ...}``.
    If the client has already gone away, the delivery failure is logged to
    stderr instead of propagating, so one impatient client cannot take the
    whole server down.
    """
    # Sockets returned by accept() are blocking even when the listener has a
    # timeout, so the bound has to be set explicitly.
    conn.settimeout(_CLIENT_IO_TIMEOUT_S)
    try:
        _send_json(conn, _dispatch_request(server, _receive_json(conn)))
    except Exception as exc:
        try:
            _send_json(conn, {"ok": False, "error": str(exc)})
        except OSError as send_exc:
            print(
                f"melotts server could not deliver error response: {send_exc}",
                file=sys.stderr,
                flush=True,
            )


def main() -> int:
    """Run the MeloTTS sidecar until SIGTERM or SIGINT is received.

    Removes any stale socket file, loads the model, then serves one request
    per connection on a Unix stream socket.

    Returns:
        ``1`` if the model could not be initialized, otherwise ``0`` after a
        requested stop.
    """
    args = _build_parser().parse_args()
    socket_path = Path(args.socket_path).expanduser()
    socket_path.parent.mkdir(parents=True, exist_ok=True)

    # A leftover socket file would make bind() fail.
    with contextlib.suppress(FileNotFoundError):
        socket_path.unlink()

    stop_requested = False

    def request_stop(_signum: int, _frame: object) -> None:
        nonlocal stop_requested
        stop_requested = True

    signal.signal(signal.SIGTERM, request_stop)
    signal.signal(signal.SIGINT, request_stop)

    try:
        server = MeloTTSServer()
    except Exception as exc:
        print(f"melotts server initialization failed: {exc}", file=sys.stderr, flush=True)
        return 1

    listener = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    listener.bind(str(socket_path))
    listener.listen(8)
    # Wake up once a second so the stop flag is noticed promptly.
    listener.settimeout(1.0)

    try:
        while not stop_requested:
            try:
                conn, _addr = listener.accept()
            except socket.timeout:
                # socket.timeout is an alias of TimeoutError on Python 3.10+
                # and a distinct OSError subclass before that; naming it here
                # is correct on both.
                continue
            except OSError:
                if stop_requested:
                    break
                raise
            with conn:
                _serve_connection(server, conn)
    finally:
        listener.close()
        with contextlib.suppress(FileNotFoundError):
            socket_path.unlink()
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
106
scripts/melotts_tts.py
Normal file
106
scripts/melotts_tts.py
Normal file
|
|
@ -0,0 +1,106 @@
|
|||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import contextlib
|
||||
import json
|
||||
import os
|
||||
import socket
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
# Two levels up from this file (the parent of the directory containing it).
ROOT_DIR = Path(__file__).resolve().parents[1]
# Workspace directory; overridable through NANOBOT_WORKSPACE, defaults to ~/.nanobot.
WORKSPACE_DIR = Path(os.environ.get("NANOBOT_WORKSPACE", str(Path.home() / ".nanobot"))).expanduser()
LOG_DIR = WORKSPACE_DIR.joinpath("logs")
# Unix socket used to reach the server; overridable through MELO_TTS_SOCKET.
SOCKET_PATH = Path(os.environ.get("MELO_TTS_SOCKET", str(WORKSPACE_DIR.joinpath("melotts.sock")))).expanduser()
SERVER_SCRIPT = ROOT_DIR.joinpath("scripts", "melotts_server.py")
SERVER_LOG_PATH = LOG_DIR.joinpath("melotts-server.log")
# Seconds to wait for a freshly spawned server to answer a ping.
DEFAULT_STARTUP_TIMEOUT_S = float(os.environ.get("MELO_TTS_SERVER_STARTUP_TIMEOUT_S", "120"))
|
||||
|
||||
|
||||
def _build_parser() -> argparse.ArgumentParser:
    """Create the adapter's CLI parser.

    ``--text`` and ``--output-wav`` are mandatory; ``--socket-path`` defaults
    to the module-level ``SOCKET_PATH``.
    """
    cli = argparse.ArgumentParser(description="Nanobot MeloTTS command adapter.")
    for required_flag in ("--text", "--output-wav"):
        cli.add_argument(required_flag, required=True)
    cli.add_argument("--socket-path", default=str(SOCKET_PATH))
    return cli
|
||||
|
||||
|
||||
def _rpc(socket_path: Path, payload: dict[str, Any], timeout_s: float = 10.0) -> dict[str, Any]:
    """Send one JSON request over the Unix socket and return the decoded reply.

    The reply is read until a chunk containing a newline arrives or the server
    closes the connection. A stream socket may deliver a message in several
    pieces, so a single ``recv`` is not enough for replies that are large or
    arrive split.

    Args:
        socket_path: Filesystem path of the server's Unix socket.
        payload: JSON-serializable request body.
        timeout_s: Timeout applied to each socket operation (connect, send and
            every individual receive).

    Raises:
        OSError: if connecting, sending or receiving fails or times out.
        RuntimeError: if the server closed the connection without sending data.
        json.JSONDecodeError: if the reply is not valid JSON.
    """
    request = (json.dumps(payload) + "\n").encode("utf-8")
    chunks: list[bytes] = []
    # The context manager closes the socket on every exit path.
    with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as sock:
        sock.settimeout(timeout_s)
        sock.connect(str(socket_path))
        sock.sendall(request)
        while True:
            data = sock.recv(65536)
            if not data:
                break
            chunks.append(data)
            if b"\n" in data:
                break

    response = b"".join(chunks).decode("utf-8", errors="replace").strip()
    if not response:
        raise RuntimeError("empty response from MeloTTS server")
    return json.loads(response)
|
||||
|
||||
|
||||
def _ping(socket_path: Path) -> bool:
    """Report whether a server on ``socket_path`` answers a ping with a truthy ``ok``.

    Any failure of the request itself (within the 2-second timeout) counts as
    "not running".
    """
    try:
        reply = _rpc(socket_path, {"action": "ping"}, timeout_s=2.0)
    except Exception:
        return False
    else:
        return bool(reply.get("ok"))
|
||||
|
||||
|
||||
def _ensure_server(socket_path: Path) -> None:
    """Make sure a MeloTTS server is answering on ``socket_path``, spawning one if needed.

    If a ping succeeds nothing is done. Otherwise any stale socket file is
    removed, the server script is started as a detached process with its
    output appended to ``SERVER_LOG_PATH``, and the socket is polled until the
    server answers, exits, or ``DEFAULT_STARTUP_TIMEOUT_S`` elapses.

    Raises:
        RuntimeError: if the spawned server exits during startup or does not
            become ready before the deadline.
    """
    if _ping(socket_path):
        return

    with contextlib.suppress(FileNotFoundError):
        socket_path.unlink()

    LOG_DIR.mkdir(parents=True, exist_ok=True)
    # The child receives its own copy of the log file descriptor, so the handle
    # can be closed here as soon as the process has been started.
    with SERVER_LOG_PATH.open("a", encoding="utf-8") as log_handle:
        proc = subprocess.Popen(
            [sys.executable, str(SERVER_SCRIPT), "--socket-path", str(socket_path)],
            cwd=str(ROOT_DIR),
            stdin=subprocess.DEVNULL,
            stdout=log_handle,
            stderr=subprocess.STDOUT,
            # New session: the server is meant to outlive this short-lived adapter.
            start_new_session=True,
        )

    # A monotonic clock keeps the deadline immune to wall-clock adjustments
    # (NTP steps, manual changes) that could shorten or stretch the wait.
    deadline = time.monotonic() + DEFAULT_STARTUP_TIMEOUT_S
    while time.monotonic() < deadline:
        if _ping(socket_path):
            return
        exit_code = proc.poll()
        if exit_code is not None:
            raise RuntimeError(
                f"MeloTTS server exited during startup with code {exit_code}. "
                f"See {SERVER_LOG_PATH}"
            )
        time.sleep(0.5)
    raise RuntimeError(f"MeloTTS server did not become ready within {DEFAULT_STARTUP_TIMEOUT_S:.0f}s")
|
||||
|
||||
|
||||
def main() -> int:
    """Synthesize ``--text`` into ``--output-wav`` through the MeloTTS server.

    Starts the server first if it is not already answering.

    Returns:
        ``0`` on success.

    Raises:
        RuntimeError: if the server reports a failure; the server's error text
            is used when it supplied one.
    """
    cli_args = _build_parser().parse_args()
    sock_path = Path(cli_args.socket_path).expanduser()
    _ensure_server(sock_path)

    request = {
        "action": "synthesize",
        "text": cli_args.text,
        "output_wav": cli_args.output_wav,
    }
    # Allow at least 30 seconds per socket operation for the synthesis call.
    reply = _rpc(sock_path, request, timeout_s=max(30.0, DEFAULT_STARTUP_TIMEOUT_S))
    if reply.get("ok"):
        return 0
    raise RuntimeError(str(reply.get("error", "MeloTTS synthesis failed")))


if __name__ == "__main__":
    sys.exit(main())
|
||||
Loading…
Add table
Add a link
Reference in a new issue