feat(tts): add command-type provider registry under tts.providers.<name> (#17843)
Reshape of PR #17211 (@versun). Lets users wire any local or external TTS CLI into Hermes without adding engine-specific Python code. Users declare any number of named providers in config.yaml and switch between them with tts.provider: <name>, alongside the built-ins (edge, openai, elevenlabs, …). Config shape: tts: provider: piper-en providers: piper-en: type: command command: 'piper -m ~/model.onnx -f {output_path} < {input_path}' output_format: wav Placeholders: {input_path}, {text_path}, {output_path}, {format}, {voice}, {model}, {speed}. Use {{ / }} for literal braces. Key behavior: - Built-in provider names always win — a tts.providers.openai entry cannot shadow the native OpenAI provider. - type: command is the default when command: is set. - Placeholder values are shell-quote-aware (bare / single / double context), so paths with spaces and shell metacharacters are safe. - Default delivery is a regular audio attachment. voice_compatible: true opts in to Telegram voice-bubble delivery via ffmpeg Opus conversion. - Command failures (non-zero exit, timeout, empty output) surface to the agent with stderr/stdout included so you can debug from chat. - Process-tree kill on timeout (Unix killpg, Windows taskkill /T). - max_text_length defaults to 5000 for command providers; override under tts.providers.<name>.max_text_length. Tests: tests/tools/test_tts_command_providers.py — 42 new tests cover provider resolution, shell-quote context, placeholder rendering with injection payloads, timeout, non-zero exit, empty output, voice_compatible opt-in, and end-to-end dispatch through text_to_speech_tool. All 88 pre-existing TTS tests still pass. Docs: new "Custom command providers" section in website/docs/user-guide/features/tts.md with three worked examples (Piper, VoxCPM, MLX-Kokoro), placeholder reference, optional keys, behavior notes, and security caveat. 
E2E-verified live: isolated HERMES_HOME, command provider declared in config.yaml, text_to_speech_tool dispatches through the registered shell command and the output file is produced as expected. Co-authored-by: Versun <me+github7604@versun.org>
This commit is contained in:
parent
5b85a7d351
commit
2facea7f71
490
tests/tools/test_tts_command_providers.py
Normal file
490
tests/tools/test_tts_command_providers.py
Normal file
@ -0,0 +1,490 @@
|
||||
"""
|
||||
Tests for custom command-type TTS providers.
|
||||
|
||||
These tests cover the ``tts.providers.<name>`` registry: built-in
|
||||
precedence, command resolution, placeholder rendering, shell-quote
|
||||
context handling, timeout / failure cleanup, voice_compatible opt-in,
|
||||
and max_text_length lookup.
|
||||
|
||||
Nothing here talks to a real TTS engine. The shell command itself is
|
||||
portable: we write bytes to ``{output_path}`` using ``python -c`` so
|
||||
the tests run identically on Linux, macOS, and (with minor quoting
|
||||
differences) Windows.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from tools.tts_tool import (
|
||||
BUILTIN_TTS_PROVIDERS,
|
||||
COMMAND_TTS_OUTPUT_FORMATS,
|
||||
DEFAULT_COMMAND_TTS_MAX_TEXT_LENGTH,
|
||||
DEFAULT_COMMAND_TTS_OUTPUT_FORMAT,
|
||||
DEFAULT_COMMAND_TTS_TIMEOUT_SECONDS,
|
||||
_generate_command_tts,
|
||||
_get_command_tts_output_format,
|
||||
_get_command_tts_timeout,
|
||||
_get_named_provider_config,
|
||||
_has_any_command_tts_provider,
|
||||
_is_command_provider_config,
|
||||
_is_command_tts_voice_compatible,
|
||||
_iter_command_providers,
|
||||
_render_command_tts_template,
|
||||
_resolve_command_provider_config,
|
||||
_resolve_max_text_length,
|
||||
_shell_quote_context,
|
||||
check_tts_requirements,
|
||||
text_to_speech_tool,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _python_copy_command(output_placeholder: str = "{output_path}") -> str:
|
||||
"""Return a cross-platform shell command that copies {input_path} -> output."""
|
||||
interpreter = sys.executable
|
||||
return (
|
||||
f'"{interpreter}" -c "import shutil, sys; '
|
||||
f'shutil.copyfile(sys.argv[1], sys.argv[2])" '
|
||||
f'{{input_path}} {output_placeholder}'
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _resolve_command_provider_config / built-in precedence
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestResolveCommandProviderConfig:
    """``_resolve_command_provider_config`` returns only user-declared command providers."""

    def test_builtin_names_are_never_command_providers(self):
        # Entries that try to shadow a built-in must be ignored for every
        # built-in name, whether or not the config mentions it.
        shadow = {"type": "command", "command": "echo hi"}
        tts_cfg = {"providers": {"openai": dict(shadow), "edge": dict(shadow)}}
        leaked = [
            builtin
            for builtin in BUILTIN_TTS_PROVIDERS
            if _resolve_command_provider_config(builtin, tts_cfg) is not None
        ]
        assert leaked == []

    def test_missing_provider_returns_none(self):
        assert _resolve_command_provider_config("nope", {"providers": {}}) is None

    def test_user_declared_command_provider_resolves(self):
        entry = {"type": "command", "command": "piper foo"}
        found = _resolve_command_provider_config("piper", {"providers": {"piper": entry}})
        assert found is not None
        assert found["command"] == "piper foo"

    def test_type_command_is_implied_when_command_is_set(self):
        # No explicit ``type`` key: a non-empty ``command`` is enough.
        tts_cfg = {"providers": {"piper": {"command": "piper foo"}}}
        assert _resolve_command_provider_config("piper", tts_cfg) is not None

    def test_other_type_values_reject(self):
        entry = {"type": "python", "command": "piper foo"}
        tts_cfg = {"providers": {"piper": entry}}
        assert _resolve_command_provider_config("piper", tts_cfg) is None

    def test_empty_command_rejects(self):
        # Whitespace-only commands count as "not configured".
        entry = {"type": "command", "command": "   "}
        tts_cfg = {"providers": {"piper": entry}}
        assert _resolve_command_provider_config("piper", tts_cfg) is None

    def test_case_insensitive_lookup(self):
        entry = {"type": "command", "command": "x"}
        tts_cfg = {"providers": {"piper": entry}}
        assert _resolve_command_provider_config("PIPER", tts_cfg) is not None
|
||||
|
||||
|
||||
class TestGetNamedProviderConfig:
    """Lookup order of ``_get_named_provider_config``: ``tts.providers`` then ``tts.<name>``."""

    def test_providers_block_wins(self):
        canonical = {"command": "new"}
        tts_cfg = {
            "providers": {"voxcpm": canonical},
            "voxcpm": {"command": "legacy"},
        }
        assert _get_named_provider_config(tts_cfg, "voxcpm") == {"command": "new"}

    def test_legacy_tts_name_block_still_resolves(self):
        tts_cfg = {"voxcpm": {"type": "command", "command": "legacy"}}
        expected = {"type": "command", "command": "legacy"}
        assert _get_named_provider_config(tts_cfg, "voxcpm") == expected

    def test_builtin_names_do_not_leak_through_legacy_path(self):
        """``tts.openai`` must never be mistaken for a command provider."""
        tts_cfg = {"openai": {"command": "oops", "type": "command"}}
        assert _get_named_provider_config(tts_cfg, "openai") == {}
|
||||
|
||||
|
||||
class TestIsCommandProviderConfig:
    """Negative cases for ``_is_command_provider_config``."""

    def test_empty_dict_is_false(self):
        result = _is_command_provider_config({})
        assert result is False

    def test_non_dict_is_false(self):
        # Strings and None are not provider blocks at all.
        for not_a_mapping in ("foo", None):
            assert _is_command_provider_config(not_a_mapping) is False

    def test_type_mismatch_is_false(self):
        native_entry = {"type": "native", "command": "x"}
        assert _is_command_provider_config(native_entry) is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _iter_command_providers / _has_any_command_tts_provider
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestIterCommandProviders:
    """``_iter_command_providers`` / ``_has_any_command_tts_provider`` enumeration."""

    def test_iterates_only_user_command_providers(self):
        declared = {
            # Built-in name: must be skipped even though it looks like a command.
            "openai": {"type": "command", "command": "shouldnt show up"},
            "piper": {"type": "command", "command": "piper"},
            "voxcpm": {"type": "command", "command": "voxcpm"},
            # Empty command: not a usable command provider.
            "broken": {"type": "command", "command": ""},
        }
        yielded = [name for name, _ in _iter_command_providers({"providers": declared})]
        assert sorted(yielded) == ["piper", "voxcpm"]

    def test_has_any_command_provider_detects_declared(self):
        tts_cfg = {"providers": {"piper": {"type": "command", "command": "piper"}}}
        assert _has_any_command_tts_provider(tts_cfg) is True

    def test_has_any_command_provider_when_none(self):
        for empty_cfg in ({"providers": {}}, {}):
            assert _has_any_command_tts_provider(empty_cfg) is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# config getters
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestConfigGetters:
    """Per-provider getters: timeout, output format, and voice_compatible."""

    def test_timeout_defaults(self):
        fallback = float(DEFAULT_COMMAND_TTS_TIMEOUT_SECONDS)
        assert _get_command_tts_timeout({}) == fallback

    def test_timeout_coerces_string(self):
        assert _get_command_tts_timeout({"timeout": "45"}) == 45.0

    def test_timeout_rejects_non_positive(self):
        fallback = float(DEFAULT_COMMAND_TTS_TIMEOUT_SECONDS)
        for bad_value in (0, -1):
            assert _get_command_tts_timeout({"timeout": bad_value}) == fallback

    def test_timeout_rejects_garbage(self):
        fallback = float(DEFAULT_COMMAND_TTS_TIMEOUT_SECONDS)
        assert _get_command_tts_timeout({"timeout": "fast"}) == fallback

    def test_timeout_seconds_alias(self):
        assert _get_command_tts_timeout({"timeout_seconds": 90}) == 90.0

    def test_output_format_defaults(self):
        chosen = _get_command_tts_output_format({})
        assert chosen == DEFAULT_COMMAND_TTS_OUTPUT_FORMAT

    def test_output_format_path_override(self):
        # A recognised extension on the output path wins over config.
        assert _get_command_tts_output_format({}, "/tmp/clip.wav") == "wav"

    def test_output_format_unknown_path_falls_back_to_config(self):
        chosen = _get_command_tts_output_format({"format": "ogg"}, "/tmp/clip.xyz")
        assert chosen == "ogg"

    def test_output_format_rejects_unknown(self):
        chosen = _get_command_tts_output_format({"format": "m4a"})
        assert chosen == DEFAULT_COMMAND_TTS_OUTPUT_FORMAT

    def test_output_format_supported_set(self):
        expected = frozenset({"mp3", "wav", "ogg", "flac"})
        assert COMMAND_TTS_OUTPUT_FORMATS == expected

    def test_voice_compatible_boolean(self):
        assert _is_command_tts_voice_compatible({"voice_compatible": True}) is True
        assert _is_command_tts_voice_compatible({"voice_compatible": False}) is False

    def test_voice_compatible_string(self):
        assert _is_command_tts_voice_compatible({"voice_compatible": "yes"}) is True
        assert _is_command_tts_voice_compatible({"voice_compatible": "0"}) is False

    def test_voice_compatible_default_off(self):
        assert _is_command_tts_voice_compatible({}) is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _resolve_max_text_length for command providers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestMaxTextLengthForCommandProviders:
    """``_resolve_max_text_length`` behaviour for user-declared command providers."""

    def test_default_for_command_provider(self):
        tts_cfg = {"providers": {"piper": {"type": "command", "command": "x"}}}
        limit = _resolve_max_text_length("piper", tts_cfg)
        assert limit == DEFAULT_COMMAND_TTS_MAX_TEXT_LENGTH

    def test_override_under_providers(self):
        entry = {"type": "command", "command": "x", "max_text_length": 2500}
        tts_cfg = {"providers": {"piper": entry}}
        assert _resolve_max_text_length("piper", tts_cfg) == 2500

    def test_override_under_legacy_tts_name_block(self):
        # Same override, declared directly under ``tts.piper``.
        entry = {"type": "command", "command": "x", "max_text_length": 7777}
        assert _resolve_max_text_length("piper", {"piper": entry}) == 7777

    def test_non_command_unknown_provider_still_falls_back(self):
        limit = _resolve_max_text_length("unknown", {})
        assert limit > 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _shell_quote_context / template rendering
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestShellQuoteContext:
    """Quote-context detection at the ``{output_path}`` placeholder."""

    @staticmethod
    def _context_at_placeholder(template):
        """Return the quote context at the start of ``{output_path}`` in *template*."""
        return _shell_quote_context(template, template.index("{output_path}"))

    def test_bare_context(self):
        assert self._context_at_placeholder('tts {output_path}') is None

    def test_inside_single_quotes(self):
        assert self._context_at_placeholder("tts '{output_path}'") == "'"

    def test_inside_double_quotes(self):
        assert self._context_at_placeholder('tts "{output_path}"') == '"'

    def test_escaped_double_quote_inside_double(self):
        # The backslash-escaped quote must not close the double-quoted region.
        template = r'tts "foo \" {output_path}"'
        assert self._context_at_placeholder(template) == '"'
|
||||
|
||||
|
||||
class TestRenderCommandTtsTemplate:
    """Placeholder substitution and quoting in ``_render_command_tts_template``."""

    @staticmethod
    def _placeholders(**overrides):
        """Return a full placeholder mapping with *overrides* applied."""
        values = {
            "input_path": "/tmp/in.txt",
            "text_path": "/tmp/in.txt",
            "output_path": "/tmp/out.mp3",
            "format": "mp3",
            "voice": "",
            "model": "",
            "speed": "1.0",
        }
        values.update(overrides)
        return values

    def test_substitutes_all_placeholders(self):
        rendered = _render_command_tts_template(
            "tts --voice {voice} --in {input_path} --out {output_path}",
            self._placeholders(voice="af_sky", model="tiny"),
        )
        assert "af_sky" in rendered
        assert "/tmp/out.mp3" in rendered

    def test_quotes_paths_with_spaces(self):
        spaced = "/tmp/Jane Doe/in.txt"
        rendered = _render_command_tts_template(
            "tts --in {input_path} --out {output_path}",
            self._placeholders(input_path=spaced, text_path=spaced),
        )
        # shlex.quote wraps space-containing paths in single quotes on POSIX.
        if os.name != "nt":
            assert "'/tmp/Jane Doe/in.txt'" in rendered

    def test_literal_braces_survive(self):
        rendered = _render_command_tts_template(
            "echo '{{not a placeholder}}' && tts --in {input_path}",
            self._placeholders(),
        )
        assert "{not a placeholder}" in rendered

    def test_injection_is_neutralized(self):
        """Embedded shell metacharacters in a placeholder value must be quoted."""
        rendered = _render_command_tts_template(
            "tts --voice {voice} --out {output_path}",
            self._placeholders(output_path="/tmp/out; rm -rf /", voice="$(whoami)"),
        )
        # The injection payload must not appear unquoted in the rendered
        # command. On POSIX shlex.quote wraps the value in single quotes.
        if os.name != "nt":
            assert "'$(whoami)'" in rendered or "'\\''" in rendered
            assert "; rm -rf /" not in rendered.replace(
                "'/tmp/out; rm -rf /'", "",
            )

    def test_preserves_shell_quoting_style(self):
        # When the template wraps the placeholder in double quotes we must
        # escape for that context, not collapse to single-quoted form.
        rendered = _render_command_tts_template(
            'tts --voice "{voice}"',
            self._placeholders(voice="bob's voice"),
        )
        assert '"bob\'s voice"' in rendered
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# End-to-end: _generate_command_tts
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestGenerateCommandTts:
    """Run ``_generate_command_tts`` against tiny ``python -c`` commands."""

    @staticmethod
    def _synthesize(tmp_path, provider_name, provider_config):
        """Call the generator with text ``"hello"`` targeting ``tmp_path / "x.mp3"``."""
        target = str(tmp_path / "x.mp3")
        return _generate_command_tts("hello", target, provider_name, provider_config, {})

    def test_writes_output_file(self, tmp_path):
        destination = tmp_path / "clip.mp3"
        provider_config = {"command": _python_copy_command()}
        returned = _generate_command_tts(
            "hello world", str(destination), "py-copy", provider_config, {},
        )
        assert returned == str(destination)
        assert destination.exists()
        # The command copied the input text file over to output, so it
        # contains the original UTF-8 text.
        assert destination.read_text(encoding="utf-8") == "hello world"

    def test_empty_command_raises(self, tmp_path):
        with pytest.raises(ValueError, match="is not configured"):
            self._synthesize(tmp_path, "empty", {"command": "   "})

    def test_nonzero_exit_raises_runtime(self, tmp_path):
        failing = {"command": f'"{sys.executable}" -c "import sys; sys.exit(3)"'}
        with pytest.raises(RuntimeError, match="exited with code 3"):
            self._synthesize(tmp_path, "failing", failing)

    def test_empty_output_raises_runtime(self, tmp_path):
        # This command completes successfully but writes nothing.
        silent = {"command": f'"{sys.executable}" -c "pass"'}
        with pytest.raises(RuntimeError, match="produced no output"):
            self._synthesize(tmp_path, "silent", silent)

    @pytest.mark.skipif(os.name == "nt", reason="POSIX-only timeout semantics")
    def test_timeout_raises_runtime(self, tmp_path):
        # The child sleeps far longer than the one-second timeout.
        slow = {
            "command": f'"{sys.executable}" -c "import time; time.sleep(10)"',
            "timeout": 1,
        }
        with pytest.raises(RuntimeError, match="timed out"):
            self._synthesize(tmp_path, "slow", slow)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# text_to_speech_tool integration
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestTextToSpeechToolWithCommandProvider:
    """End-to-end dispatch of ``text_to_speech_tool`` through a command provider."""

    def test_command_provider_dispatches_end_to_end(self, tmp_path):
        tts_section = {
            "provider": "py-copy",
            "providers": {
                "py-copy": {
                    "type": "command",
                    "command": _python_copy_command(),
                    "output_format": "mp3",
                },
            },
        }
        full_config = {"tts": tts_section}
        destination = tmp_path / "clip.mp3"

        # Patch the config loader used by the tool so we don't touch disk.
        def fake_load():
            return full_config["tts"]

        with patch("tools.tts_tool._load_tts_config", fake_load):
            raw = text_to_speech_tool(text="hi", output_path=str(destination))

        payload = json.loads(raw)
        assert payload["success"] is True, payload
        assert payload["provider"] == "py-copy"
        assert payload["voice_compatible"] is False
        assert Path(payload["file_path"]).exists()

    def test_voice_compatible_opt_in_toggles_flag(self, tmp_path):
        """voice_compatible=true is reflected in the response when the
        file is already .ogg (no ffmpeg needed)."""
        tts_section = {
            "provider": "py-copy-ogg",
            "providers": {
                "py-copy-ogg": {
                    "type": "command",
                    "command": _python_copy_command(),
                    "output_format": "ogg",
                    "voice_compatible": True,
                },
            },
        }
        destination = tmp_path / "clip.ogg"

        with patch("tools.tts_tool._load_tts_config", return_value=tts_section):
            raw = text_to_speech_tool(text="hi", output_path=str(destination))

        payload = json.loads(raw)
        assert payload["success"] is True
        assert payload["voice_compatible"] is True
        assert payload["media_tag"].startswith("[[audio_as_voice]]")

    def test_missing_command_falls_through_to_builtin(self, tmp_path):
        """A provider entry with an empty command is not a command
        provider; the tool should not raise a "command not configured"
        error but fall through to the built-in resolution path."""
        tts_section = {
            "provider": "broken",
            "providers": {
                "broken": {"type": "command", "command": "   "},
            },
        }
        destination = str(tmp_path / "x.mp3")

        with patch("tools.tts_tool._load_tts_config", return_value=tts_section):
            raw = text_to_speech_tool(text="hi", output_path=destination)

        payload = json.loads(raw)
        # The response should not carry the command-provider error text.
        error_text = (payload.get("error") or "").lower()
        assert "tts.providers.broken.command is not configured" not in error_text
|
||||
|
||||
|
||||
class TestCheckTtsRequirements:
    """``check_tts_requirements`` with only a command provider configured."""

    def test_configured_command_provider_satisfies_requirement(self):
        tts_section = {"providers": {"x": {"type": "command", "command": "echo x"}}}
        with patch("tools.tts_tool._load_tts_config", return_value=tts_section):
            satisfied = check_tts_requirements()
        assert satisfied is True
|
||||
@ -2,14 +2,23 @@
|
||||
"""
|
||||
Text-to-Speech Tool Module
|
||||
|
||||
Supports seven TTS providers:
|
||||
Built-in TTS providers:
|
||||
- Edge TTS (default, free, no API key): Microsoft Edge neural voices
|
||||
- ElevenLabs (premium): High-quality voices, needs ELEVENLABS_API_KEY
|
||||
- OpenAI TTS: Good quality, needs OPENAI_API_KEY
|
||||
- MiniMax TTS: High-quality with voice cloning, needs MINIMAX_API_KEY
|
||||
- Mistral (Voxtral TTS): Multilingual, native Opus, needs MISTRAL_API_KEY
|
||||
- Google Gemini TTS: Controllable, 30 prebuilt voices, needs GEMINI_API_KEY
|
||||
- NeuTTS (local, free, no API key): On-device TTS via neutts_cli, needs neutts installed
|
||||
- xAI TTS: Grok voices, needs XAI_API_KEY
|
||||
- NeuTTS (local, free, no API key): On-device TTS via neutts
|
||||
- KittenTTS (local, free, no API key): On-device 25MB model
|
||||
|
||||
Custom command providers:
|
||||
- Users can declare any number of named providers with ``type: command``
|
||||
under ``tts.providers.<name>`` in ``~/.hermes/config.yaml``. Hermes
|
||||
writes the input text to a temp file and runs the configured shell
|
||||
command, which must produce the audio file at the expected path.
|
||||
See the "Custom command providers" section of ``website/docs/user-guide/features/tts.md``.
|
||||
|
||||
Output formats:
|
||||
- Opus (.ogg) for Telegram voice bubbles (requires ffmpeg for Edge TTS)
|
||||
@ -32,7 +41,9 @@ import logging
|
||||
import os
|
||||
import queue
|
||||
import re
|
||||
import shlex
|
||||
import shutil
|
||||
import signal
|
||||
import subprocess
|
||||
import tempfile
|
||||
import threading
|
||||
@ -181,9 +192,13 @@ def _resolve_max_text_length(
|
||||
|
||||
Resolution order:
|
||||
1. ``tts.<provider>.max_text_length`` (user override in config.yaml)
|
||||
2. ElevenLabs model-aware table (keyed on configured ``model_id``)
|
||||
3. ``PROVIDER_MAX_TEXT_LENGTH`` default
|
||||
4. ``FALLBACK_MAX_TEXT_LENGTH`` (4000)
|
||||
2. ``tts.providers.<provider>.max_text_length`` for user-declared
|
||||
command providers
|
||||
3. ElevenLabs model-aware table (keyed on configured ``model_id``)
|
||||
4. ``PROVIDER_MAX_TEXT_LENGTH`` default
|
||||
5. ``DEFAULT_COMMAND_TTS_MAX_TEXT_LENGTH`` when the provider is a
|
||||
command-type user provider without an explicit cap
|
||||
6. ``FALLBACK_MAX_TEXT_LENGTH`` (4000)
|
||||
|
||||
Non-positive or non-integer overrides fall through to the default so a
|
||||
broken config can't accidentally disable truncation entirely.
|
||||
@ -192,11 +207,12 @@ def _resolve_max_text_length(
|
||||
return FALLBACK_MAX_TEXT_LENGTH
|
||||
key = provider.lower().strip()
|
||||
cfg = tts_config or {}
|
||||
prov_cfg = cfg.get(key) if isinstance(cfg.get(key), dict) else {}
|
||||
|
||||
# Built-in-style override at tts.<provider>.max_text_length wins first,
|
||||
# matching historical behavior.
|
||||
prov_cfg = cfg.get(key) if isinstance(cfg.get(key), dict) else {}
|
||||
override = prov_cfg.get("max_text_length") if prov_cfg else None
|
||||
if isinstance(override, bool):
|
||||
# bool is an int subclass; treat explicit booleans as "not set"
|
||||
override = None
|
||||
if isinstance(override, int) and override > 0:
|
||||
return override
|
||||
@ -207,7 +223,21 @@ def _resolve_max_text_length(
|
||||
if mapped:
|
||||
return mapped
|
||||
|
||||
return PROVIDER_MAX_TEXT_LENGTH.get(key, FALLBACK_MAX_TEXT_LENGTH)
|
||||
if key in PROVIDER_MAX_TEXT_LENGTH:
|
||||
return PROVIDER_MAX_TEXT_LENGTH[key]
|
||||
|
||||
# User-declared command provider (under tts.providers.<name>)
|
||||
if key not in BUILTIN_TTS_PROVIDERS:
|
||||
named = _get_named_provider_config(cfg, key)
|
||||
if _is_command_provider_config(named):
|
||||
named_override = named.get("max_text_length")
|
||||
if isinstance(named_override, bool):
|
||||
named_override = None
|
||||
if isinstance(named_override, int) and named_override > 0:
|
||||
return named_override
|
||||
return DEFAULT_COMMAND_TTS_MAX_TEXT_LENGTH
|
||||
|
||||
return FALLBACK_MAX_TEXT_LENGTH
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
@ -237,6 +267,408 @@ def _get_provider(tts_config: Dict[str, Any]) -> str:
|
||||
return (tts_config.get("provider") or DEFAULT_PROVIDER).lower().strip()
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Custom command providers (type: command under tts.providers.<name>)
|
||||
# ===========================================================================
|
||||
#
|
||||
# Users can declare any number of command-type providers alongside the
|
||||
# built-ins so they can plug any local CLI (Piper, VoxCPM, Kokoro CLIs,
|
||||
# custom voice-cloning scripts, etc.) into Hermes without any Python code
|
||||
# changes. The config shape is::
|
||||
#
|
||||
# tts:
|
||||
# provider: piper-en
|
||||
# providers:
|
||||
# piper-en:
|
||||
# type: command
|
||||
# command: "piper -m ~/model.onnx -f {output_path} < {input_path}"
|
||||
# output_format: wav
|
||||
#
|
||||
# Hermes writes the input text to a temp UTF-8 file, runs the command with
|
||||
# placeholder substitution, and reads the audio file the command wrote to
|
||||
# ``{output_path}``. Supported placeholders: ``{input_path}``,
|
||||
# ``{text_path}`` (alias for input_path), ``{output_path}``, ``{format}``,
|
||||
# ``{voice}``, ``{model}``, ``{speed}``. Use ``{{`` / ``}}`` for literal braces.
|
||||
#
|
||||
# Built-in provider names always win over an entry with the same name under
|
||||
# ``tts.providers``, so user config can't silently shadow ``edge`` etc.
|
||||
#
|
||||
# Placeholder values are shell-quoted for their surrounding context
|
||||
# (bare / single / double quote), so paths with spaces work transparently.
|
||||
|
||||
# Built-in provider names. Any ``tts.provider`` value NOT in this set is
|
||||
# interpreted as a reference to ``tts.providers.<name>``.
|
||||
BUILTIN_TTS_PROVIDERS = frozenset((
    "edge",
    "elevenlabs",
    "openai",
    "minimax",
    "xai",
    "mistral",
    "gemini",
    "neutts",
    "kittentts",
))

# Defaults for command-type providers; each can be overridden per provider.
DEFAULT_COMMAND_TTS_TIMEOUT_SECONDS = 120
DEFAULT_COMMAND_TTS_OUTPUT_FORMAT = "mp3"
# Output formats a command provider may declare or infer from the output path.
COMMAND_TTS_OUTPUT_FORMATS = frozenset(("mp3", "wav", "ogg", "flac"))
DEFAULT_COMMAND_TTS_MAX_TEXT_LENGTH = 5000
|
||||
|
||||
|
||||
def _get_provider_section(tts_config: Dict[str, Any], name: str) -> Dict[str, Any]:
|
||||
"""Return a provider config block if it's a dict, else an empty dict."""
|
||||
if not isinstance(tts_config, dict):
|
||||
return {}
|
||||
section = tts_config.get(name)
|
||||
return section if isinstance(section, dict) else {}
|
||||
|
||||
|
||||
def _get_named_provider_config(
    tts_config: Dict[str, Any],
    name: str,
) -> Dict[str, Any]:
    """Return the config dict for a user-declared provider.

    Lookup order:

    1. ``tts.providers.<name>`` by exact key (the canonical location).
    2. ``tts.providers.<key>`` where the key matches *name* ignoring case
       and surrounding whitespace. Callers in this module lowercase the
       provider name before looking it up, so without this step a key
       written as ``Piper-EN`` could be declared but never resolved.
    3. ``tts.<name>`` (legacy layout), only when *name* is not a built-in,
       so a user's ``tts.openai`` block still means the OpenAI provider.

    Returns an empty dict when the provider is not declared.
    """
    providers = _get_provider_section(tts_config, "providers")
    section = providers.get(name)
    if isinstance(section, dict):
        return section

    # Case-insensitive fallback within the canonical block.
    wanted = name.strip().lower()
    for declared_name, candidate in providers.items():
        if (
            isinstance(declared_name, str)
            and isinstance(candidate, dict)
            and declared_name.strip().lower() == wanted
        ):
            return candidate

    # Back-compat: allow ``tts.<name>`` for user-declared providers too,
    # but only when the name is not a built-in.
    if name.lower() not in BUILTIN_TTS_PROVIDERS:
        legacy = _get_provider_section(tts_config, name)
        if legacy:
            return legacy
    return {}
|
||||
|
||||
|
||||
def _is_command_provider_config(config: Dict[str, Any]) -> bool:
|
||||
"""Return True when *config* declares a command-type provider."""
|
||||
if not isinstance(config, dict):
|
||||
return False
|
||||
ptype = str(config.get("type") or "").strip().lower()
|
||||
if ptype and ptype != "command":
|
||||
return False
|
||||
command = config.get("command")
|
||||
return isinstance(command, str) and bool(command.strip())
|
||||
|
||||
|
||||
def _resolve_command_provider_config(
    provider: str,
    tts_config: Dict[str, Any],
) -> Optional[Dict[str, Any]]:
    """Resolve *provider* to its command-type config, if it has one.

    The name is lowercased and stripped before lookup. ``None`` is returned
    for a falsy name, for any built-in provider name (those have native
    handlers), and for names whose config is missing or not command-type.
    """
    if not provider:
        return None

    normalized = provider.lower().strip()
    if normalized not in BUILTIN_TTS_PROVIDERS:
        candidate = _get_named_provider_config(tts_config, normalized)
        if _is_command_provider_config(candidate):
            return candidate
    return None
|
||||
|
||||
|
||||
def _iter_command_providers(tts_config: Dict[str, Any]):
    """Generate ``(name, config)`` for each command provider under ``tts.providers``.

    Entries whose key is not a string, whose name collides with a built-in
    provider, or whose config is not command-type are skipped. Nothing is
    yielded when *tts_config* is not a dict.
    """
    if not isinstance(tts_config, dict):
        return

    declared = _get_provider_section(tts_config, "providers") or {}
    for provider_name, provider_cfg in declared.items():
        if not isinstance(provider_name, str):
            continue
        if provider_name.lower() in BUILTIN_TTS_PROVIDERS:
            continue
        if _is_command_provider_config(provider_cfg):
            yield provider_name, provider_cfg
|
||||
|
||||
|
||||
def _get_command_tts_timeout(config: Dict[str, Any]) -> float:
|
||||
"""Return timeout in seconds, falling back when invalid."""
|
||||
raw = config.get("timeout", config.get("timeout_seconds", DEFAULT_COMMAND_TTS_TIMEOUT_SECONDS))
|
||||
try:
|
||||
value = float(raw)
|
||||
except (TypeError, ValueError):
|
||||
return float(DEFAULT_COMMAND_TTS_TIMEOUT_SECONDS)
|
||||
if value <= 0:
|
||||
return float(DEFAULT_COMMAND_TTS_TIMEOUT_SECONDS)
|
||||
return value
|
||||
|
||||
|
||||
def _get_command_tts_output_format(
    config: Dict[str, Any],
    output_path: Optional[str] = None,
) -> str:
    """Pick the output format for a command provider.

    A supported extension on *output_path* takes precedence. Otherwise the
    config's ``format`` key, then ``output_format``, is used. Anything outside
    ``COMMAND_TTS_OUTPUT_FORMATS`` yields ``DEFAULT_COMMAND_TTS_OUTPUT_FORMAT``.
    """
    if output_path:
        extension = Path(output_path).suffix.lower().strip().lstrip(".")
        if extension in COMMAND_TTS_OUTPUT_FORMATS:
            return extension

    configured = config.get("format")
    if not configured:
        configured = config.get("output_format")
    if not configured:
        configured = DEFAULT_COMMAND_TTS_OUTPUT_FORMAT

    normalized = str(configured).lower().strip().lstrip(".")
    if normalized in COMMAND_TTS_OUTPUT_FORMATS:
        return normalized
    return DEFAULT_COMMAND_TTS_OUTPUT_FORMAT
|
||||
|
||||
|
||||
def _is_command_tts_voice_compatible(config: Dict[str, Any]) -> bool:
|
||||
"""Return True only when the user explicitly opted in to voice delivery."""
|
||||
value = config.get("voice_compatible", False)
|
||||
if isinstance(value, str):
|
||||
return value.strip().lower() in {"1", "true", "yes", "on"}
|
||||
return bool(value)
|
||||
|
||||
|
||||
def _shell_quote_context(command_template: str, position: int) -> Optional[str]:
|
||||
"""Return the shell quote character active right before *position*.
|
||||
|
||||
Returns ``"'"`` / ``'"'`` when inside a single- / double-quoted region
|
||||
of the template, ``None`` for bare context.
|
||||
"""
|
||||
quote: Optional[str] = None
|
||||
escaped = False
|
||||
i = 0
|
||||
while i < position:
|
||||
char = command_template[i]
|
||||
if quote == "'":
|
||||
if char == "'":
|
||||
quote = None
|
||||
elif quote == '"':
|
||||
if escaped:
|
||||
escaped = False
|
||||
elif char == "\\":
|
||||
escaped = True
|
||||
elif char == '"':
|
||||
quote = None
|
||||
else:
|
||||
if char == "'":
|
||||
quote = "'"
|
||||
elif char == '"':
|
||||
quote = '"'
|
||||
elif char == "\\":
|
||||
i += 1
|
||||
i += 1
|
||||
return quote
|
||||
|
||||
|
||||
def _quote_command_tts_placeholder(value: str, quote_context: Optional[str]) -> str:
|
||||
"""Quote a placeholder value for its position in a shell command template."""
|
||||
if quote_context == "'":
|
||||
return value.replace("'", r"'\''")
|
||||
if quote_context == '"':
|
||||
return (
|
||||
value
|
||||
.replace("\\", "\\\\")
|
||||
.replace('"', r'\"')
|
||||
.replace("$", r"\$")
|
||||
.replace("`", r"\`")
|
||||
)
|
||||
if os.name == "nt":
|
||||
return subprocess.list2cmdline([value])
|
||||
return shlex.quote(value)
|
||||
|
||||
|
||||
def _render_command_tts_template(
    command_template: str,
    placeholders: Dict[str, str],
) -> str:
    """Replace supported placeholders while preserving ``{{`` / ``}}``.

    Each ``{name}`` (and ``{{name}}``) whose *name* is a key of
    *placeholders* is replaced by that value, quoted for the shell quote
    context found at the placeholder's position in *command_template*.
    A placeholder directly preceded by ``$`` is left untouched. Any
    remaining ``{{`` / ``}}`` are then collapsed to single braces.

    Substitution goes through opaque tokens so that brace unescaping
    never touches the inserted values. The tokens are resolved in a
    single pass, so a value that itself looks like a token is never
    expanded a second time.
    """
    if not placeholders:
        # With no names the pattern below would match "{}" and look up a
        # missing key; there is nothing to substitute, only to unescape.
        return command_template.replace("{{", "{").replace("}}", "}")

    names = "|".join(re.escape(name) for name in placeholders)
    pattern = re.compile(
        rf"(?<!\$)(?:\{{\{{(?P<double>{names})\}}\}}|\{{(?P<single>{names})\}})"
    )
    quoted_by_token: Dict[str, str] = {}

    def replace_match(match: re.Match[str]) -> str:
        name = match.group("double") or match.group("single")
        token = f"__HERMES_TTS_PLACEHOLDER_{len(quoted_by_token)}__"
        quoted_by_token[token] = _quote_command_tts_placeholder(
            placeholders[name],
            _shell_quote_context(command_template, match.start()),
        )
        return token

    rendered = pattern.sub(replace_match, command_template)
    rendered = rendered.replace("{{", "{").replace("}}", "}")
    if not quoted_by_token:
        return rendered

    # One regex pass: inserted values are not rescanned, and token-like
    # text that was not generated above is left exactly as written.
    token_pattern = re.compile(r"__HERMES_TTS_PLACEHOLDER_\d+__")
    return token_pattern.sub(
        lambda found: quoted_by_token.get(found.group(0), found.group(0)),
        rendered,
    )
|
||||
|
||||
|
||||
def _terminate_command_tts_process_tree(proc: subprocess.Popen) -> None:
|
||||
"""Best-effort termination of a shell process and all of its children."""
|
||||
if proc.poll() is not None:
|
||||
return
|
||||
|
||||
if os.name == "nt":
|
||||
try:
|
||||
subprocess.run(
|
||||
["taskkill", "/F", "/T", "/PID", str(proc.pid)],
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL,
|
||||
timeout=5,
|
||||
)
|
||||
except Exception:
|
||||
proc.kill()
|
||||
return
|
||||
|
||||
try:
|
||||
os.killpg(proc.pid, signal.SIGTERM)
|
||||
except ProcessLookupError:
|
||||
return
|
||||
except Exception:
|
||||
proc.terminate()
|
||||
|
||||
try:
|
||||
proc.wait(timeout=2)
|
||||
return
|
||||
except subprocess.TimeoutExpired:
|
||||
pass
|
||||
|
||||
try:
|
||||
os.killpg(proc.pid, signal.SIGKILL)
|
||||
except ProcessLookupError:
|
||||
return
|
||||
except Exception:
|
||||
proc.kill()
|
||||
|
||||
|
||||
def _run_command_tts(command: str, timeout: float) -> subprocess.CompletedProcess:
|
||||
"""Run a command-provider shell command with process-tree timeout cleanup."""
|
||||
popen_kwargs: Dict[str, Any] = {
|
||||
"shell": True,
|
||||
"stdout": subprocess.PIPE,
|
||||
"stderr": subprocess.PIPE,
|
||||
"text": True,
|
||||
}
|
||||
if os.name == "nt":
|
||||
popen_kwargs["creationflags"] = getattr(subprocess, "CREATE_NEW_PROCESS_GROUP", 0)
|
||||
else:
|
||||
popen_kwargs["start_new_session"] = True
|
||||
|
||||
proc = subprocess.Popen(command, **popen_kwargs)
|
||||
try:
|
||||
stdout, stderr = proc.communicate(timeout=timeout)
|
||||
except subprocess.TimeoutExpired as exc:
|
||||
_terminate_command_tts_process_tree(proc)
|
||||
try:
|
||||
stdout, stderr = proc.communicate(timeout=1)
|
||||
except Exception:
|
||||
stdout = getattr(exc, "output", None)
|
||||
stderr = getattr(exc, "stderr", None)
|
||||
raise subprocess.TimeoutExpired(
|
||||
command,
|
||||
timeout,
|
||||
output=stdout,
|
||||
stderr=stderr,
|
||||
) from exc
|
||||
|
||||
if proc.returncode:
|
||||
raise subprocess.CalledProcessError(
|
||||
proc.returncode,
|
||||
command,
|
||||
output=stdout,
|
||||
stderr=stderr,
|
||||
)
|
||||
return subprocess.CompletedProcess(command, proc.returncode, stdout, stderr)
|
||||
|
||||
|
||||
def _configured_command_tts_output_path(path: Path, config: Dict[str, Any]) -> Path:
    """Swap the suffix of *path* for the provider's configured output format.

    The format is resolved from *config* alone; the current extension of
    *path* is not taken into account.
    """
    extension = "." + _get_command_tts_output_format(config)
    return path.with_suffix(extension)
|
||||
|
||||
|
||||
def _command_tts_placeholder_text(value: Any) -> str:
    """Stringify a config value for a placeholder; ``None`` becomes empty."""
    return "" if value is None else str(value)


def _command_tts_output_detail(stdout: Any, stderr: Any) -> str:
    """Summarise captured command output for an error message ("" when none)."""
    parts = []
    for label, stream in (("stderr", stderr), ("stdout", stdout)):
        if isinstance(stream, bytes):
            # Output attached to a timeout may still be raw bytes.
            stream = stream.decode("utf-8", errors="replace")
        if stream:
            parts.append(f"{label}: {stream.strip()}")
    return "; ".join(parts)


def _generate_command_tts(
    text: str,
    output_path: str,
    provider_name: str,
    config: Dict[str, Any],
    tts_config: Dict[str, Any],
) -> str:
    """Generate speech by running a user-configured shell command.

    The text is written to a temporary UTF-8 file, the provider's
    ``command`` template is rendered with the placeholder values and run
    through the shell, and the command is expected to write the audio to
    *output_path*. Any file already at that path is removed first.

    Args:
        text: Text to synthesise.
        output_path: Where the command must write the audio (``~`` is
            expanded).
        provider_name: Name under ``tts.providers``; used in messages.
        config: That provider's config mapping.
        tts_config: The enclosing ``tts`` config; supplies the fallback
            ``speed``.

    Returns:
        The user-expanded output path of the audio file the command wrote.

    Raises:
        ValueError: The provider has no ``command`` configured.
        RuntimeError: The command timed out, exited non-zero, or left no
            (or an empty) output file. Captured stderr / stdout is
            included in the message when there is any.
    """
    command_template = str(config.get("command") or "").strip()
    if not command_template:
        raise ValueError(
            f"tts.providers.{provider_name}.command is not configured"
        )

    output = Path(output_path).expanduser()
    output.parent.mkdir(parents=True, exist_ok=True)
    if output.exists():
        # A stale file must not be mistaken for this run's result.
        output.unlink()

    timeout = _get_command_tts_timeout(config)
    output_format = _get_command_tts_output_format(config, str(output))

    # An explicit null provider speed falls back to the global one.
    speed = config.get("speed")
    if speed is None:
        speed = tts_config.get("speed")

    with tempfile.TemporaryDirectory() as tmpdir:
        text_path = Path(tmpdir) / "input.txt"
        text_path.write_text(text, encoding="utf-8")

        # A key present with a null value renders as empty, not as "None".
        placeholders = {
            "input_path": str(text_path),
            "text_path": str(text_path),
            "output_path": str(output),
            "format": output_format,
            "voice": _command_tts_placeholder_text(config.get("voice")),
            "model": _command_tts_placeholder_text(config.get("model")),
            "speed": _command_tts_placeholder_text(speed),
        }
        command = _render_command_tts_template(command_template, placeholders)

        try:
            result = _run_command_tts(command, timeout)
        except subprocess.TimeoutExpired as exc:
            message = f"TTS provider '{provider_name}' timed out after {timeout:g}s"
            detail = _command_tts_output_detail(exc.output, exc.stderr)
            if detail:
                message = f"{message} ({detail})"
            raise RuntimeError(message) from exc
        except subprocess.CalledProcessError as exc:
            detail = (
                _command_tts_output_detail(exc.stdout, exc.stderr)
                or "no command output"
            )
            raise RuntimeError(
                f"TTS provider '{provider_name}' exited with code "
                f"{exc.returncode}: {detail}"
            ) from exc

    if not output.exists() or output.stat().st_size <= 0:
        message = f"TTS provider '{provider_name}' produced no output at {output}"
        detail = _command_tts_output_detail(result.stdout, result.stderr)
        if detail:
            message = f"{message} ({detail})"
        raise RuntimeError(message)
    return str(output)
|
||||
|
||||
|
||||
def _has_any_command_tts_provider(tts_config: Optional[Dict[str, Any]] = None) -> bool:
    """Report whether at least one command-type TTS provider is declared.

    When *tts_config* is ``None`` the config is loaded via
    ``_load_tts_config()``. Iteration stops at the first provider found.
    """
    effective_config = _load_tts_config() if tts_config is None else tts_config
    return any(True for _entry in _iter_command_providers(effective_config))
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# ffmpeg Opus conversion (Edge TTS MP3 -> OGG Opus for Telegram)
|
||||
# ===========================================================================
|
||||
@ -954,6 +1386,12 @@ def text_to_speech_tool(
|
||||
tts_config = _load_tts_config()
|
||||
provider = _get_provider(tts_config)
|
||||
|
||||
# User-declared command provider (type: command under tts.providers.<name>)
|
||||
# resolves BEFORE the built-in dispatch. Built-in names short-circuit here
|
||||
# so a user's ``tts.providers.openai.command`` can't override the real
|
||||
# OpenAI handler.
|
||||
command_provider_config = _resolve_command_provider_config(provider, tts_config)
|
||||
|
||||
# Truncate very long text with a warning. The cap is per-provider
|
||||
# (OpenAI 4096, xAI 15k, MiniMax 10k, ElevenLabs model-aware, etc.).
|
||||
max_len = _resolve_max_text_length(provider, tts_config)
|
||||
@ -975,13 +1413,23 @@ def text_to_speech_tool(
|
||||
# Determine output path
|
||||
if output_path:
|
||||
file_path = Path(output_path).expanduser()
|
||||
if command_provider_config is not None:
|
||||
# Respect caller-supplied path but align the extension with the
|
||||
# provider's configured output_format so the command writes to a
|
||||
# path the caller actually expects.
|
||||
file_path = _configured_command_tts_output_path(
|
||||
file_path, command_provider_config
|
||||
)
|
||||
else:
|
||||
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
out_dir = Path(DEFAULT_OUTPUT_DIR)
|
||||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
if command_provider_config is not None:
|
||||
fmt = _get_command_tts_output_format(command_provider_config)
|
||||
file_path = out_dir / f"tts_{timestamp}.{fmt}"
|
||||
# Use .ogg for Telegram with providers that support native Opus output,
|
||||
# otherwise fall back to .mp3 (Edge TTS will attempt ffmpeg conversion later).
|
||||
if want_opus and provider in ("openai", "elevenlabs", "mistral", "gemini"):
|
||||
elif want_opus and provider in ("openai", "elevenlabs", "mistral", "gemini"):
|
||||
file_path = out_dir / f"tts_{timestamp}.ogg"
|
||||
else:
|
||||
file_path = out_dir / f"tts_{timestamp}.mp3"
|
||||
@ -992,7 +1440,15 @@ def text_to_speech_tool(
|
||||
|
||||
try:
|
||||
# Generate audio with the configured provider
|
||||
if provider == "elevenlabs":
|
||||
if command_provider_config is not None:
|
||||
logger.info(
|
||||
"Generating speech with command TTS provider '%s'...", provider,
|
||||
)
|
||||
file_str = _generate_command_tts(
|
||||
text, file_str, provider, command_provider_config, tts_config,
|
||||
)
|
||||
|
||||
elif provider == "elevenlabs":
|
||||
try:
|
||||
_import_elevenlabs()
|
||||
except ImportError:
|
||||
@ -1100,7 +1556,17 @@ def text_to_speech_tool(
|
||||
# Try Opus conversion for Telegram compatibility
|
||||
# Edge TTS outputs MP3, NeuTTS/KittenTTS output WAV — all need ffmpeg conversion
|
||||
voice_compatible = False
|
||||
if provider in ("edge", "neutts", "minimax", "xai", "kittentts") and not file_str.endswith(".ogg"):
|
||||
if command_provider_config is not None:
|
||||
# Command providers are documents by default. Voice-bubble
|
||||
# delivery only kicks in when the user explicitly opts in
|
||||
# via ``voice_compatible: true`` in their provider config.
|
||||
if _is_command_tts_voice_compatible(command_provider_config):
|
||||
if not file_str.endswith(".ogg"):
|
||||
opus_path = _convert_to_opus(file_str)
|
||||
if opus_path:
|
||||
file_str = opus_path
|
||||
voice_compatible = file_str.endswith(".ogg")
|
||||
elif provider in ("edge", "neutts", "minimax", "xai", "kittentts") and not file_str.endswith(".ogg"):
|
||||
opus_path = _convert_to_opus(file_str)
|
||||
if opus_path:
|
||||
file_str = opus_path
|
||||
@ -1149,11 +1615,15 @@ def check_tts_requirements() -> bool:
|
||||
Check if at least one TTS provider is available.
|
||||
|
||||
Edge TTS needs no API key and is the default, so if the package
|
||||
is installed, TTS is available.
|
||||
is installed, TTS is available. A user-declared command provider
|
||||
also satisfies the requirement.
|
||||
|
||||
Returns:
|
||||
bool: True if at least one provider can work.
|
||||
"""
|
||||
# Any configured command provider counts as available.
|
||||
if _has_any_command_tts_provider():
|
||||
return True
|
||||
try:
|
||||
_import_edge_tts()
|
||||
return True
|
||||
@ -1499,7 +1969,7 @@ from tools.registry import registry, tool_error
|
||||
|
||||
TTS_SCHEMA = {
|
||||
"name": "text_to_speech",
|
||||
"description": "Convert text to speech audio. Returns a MEDIA: path that the platform delivers as a voice message. On Telegram it plays as a voice bubble, on Discord/WhatsApp as an audio attachment. In CLI mode, saves to ~/voice-memos/. Voice and provider are user-configured, not model-selected.",
|
||||
"description": "Convert text to speech audio. Returns a MEDIA: path that the platform delivers as native audio. Compatible providers render as a voice bubble on Telegram; otherwise audio is sent as a regular attachment. In CLI mode, saves to ~/voice-memos/. Voice and provider are user-configured (built-in providers like edge/openai or custom command providers under tts.providers.<name>), not model-selected.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
||||
@ -116,6 +116,73 @@ Without ffmpeg, Edge TTS, MiniMax TTS, NeuTTS, and KittenTTS audio are sent as r
|
||||
If you want voice bubbles without installing ffmpeg, switch to the OpenAI, ElevenLabs, or Mistral provider.
|
||||
:::
|
||||
|
||||
### Custom command providers
|
||||
|
||||
If a TTS engine you want isn't natively supported (Piper, VoxCPM, MLX-Kokoro, XTTS CLI, a voice-cloning script, anything else that exposes a CLI), you can wire it in as a **command-type provider** without writing any Python. Hermes writes the input text to a temp UTF-8 file, runs your shell command, and reads the audio file the command produced.
|
||||
|
||||
Declare one or more providers under `tts.providers.<name>` and switch between them with `tts.provider: <name>` — the same way you switch between built-ins like `edge` and `openai`.
|
||||
|
||||
```yaml
|
||||
tts:
|
||||
provider: piper-en # pick any name under tts.providers
|
||||
providers:
|
||||
piper-en:
|
||||
type: command
|
||||
command: "piper -m ~/models/en_US-amy.onnx -f {output_path} < {input_path}"
|
||||
output_format: wav
|
||||
|
||||
voxcpm:
|
||||
type: command
|
||||
command: "voxcpm --ref ~/voice.wav --text-file {input_path} --out {output_path}"
|
||||
output_format: mp3
|
||||
timeout: 180
|
||||
voice_compatible: true # try to deliver as a Telegram voice bubble
|
||||
|
||||
mlx-kokoro:
|
||||
type: command
|
||||
command: "python -m mlx_kokoro --in {input_path} --out {output_path} --voice {voice}"
|
||||
voice: af_sky
|
||||
output_format: wav
|
||||
```
|
||||
|
||||
#### Placeholders
|
||||
|
||||
Your command template can reference these placeholders. Hermes substitutes them at render time and shell-quotes each value for the surrounding context (bare / single-quoted / double-quoted), so paths with spaces and other shell-sensitive characters are safe.
|
||||
|
||||
| Placeholder | Meaning |
|
||||
|------------------|------------------------------------------------------|
|
||||
| `{input_path}` | Path to the temp UTF-8 text file Hermes wrote |
|
||||
| `{text_path}` | Alias for `{input_path}` |
|
||||
| `{output_path}` | Path the command must write audio to |
|
||||
| `{format}` | `mp3` / `wav` / `ogg` / `flac` |
|
||||
| `{voice}` | `tts.providers.<name>.voice`, empty when unset |
|
||||
| `{model}` | `tts.providers.<name>.model`, empty when unset |
|
||||
| `{speed}` | Resolved speed multiplier (provider or global) |
|
||||
|
||||
Use `{{` and `}}` for literal braces.
|
||||
|
||||
#### Optional keys
|
||||
|
||||
| Key | Default | Meaning |
|
||||
|--------------------|---------|------------------------------------------------------------------------------------------------------------|
|
||||
| `timeout` | `120` | Seconds; the process tree is killed on expiry (Unix `killpg`, Windows `taskkill /T`). |
|
||||
| `output_format` | `mp3` | One of `mp3` / `wav` / `ogg` / `flac`. Auto-inferred from the output extension if Hermes picks a path. |
|
||||
| `voice_compatible` | `false` | When `true`, Hermes converts any non-OGG output (MP3/WAV/FLAC) to Opus/OGG via ffmpeg so Telegram renders a voice bubble. |
|
||||
| `max_text_length` | `5000` | Input is truncated to this length before rendering the command. |
|
||||
| `voice` / `model` | empty | Passed to the command as placeholder values only. |
|
||||
|
||||
#### Behavior notes
|
||||
|
||||
- **Built-in names always win.** A `tts.providers.openai` entry never shadows the native OpenAI provider, so no user config can silently replace a built-in.
|
||||
- **Default delivery is a document.** Command providers deliver as regular audio attachments on every platform. Opt in to voice-bubble delivery per-provider with `voice_compatible: true`.
|
||||
- **Command failures surface to the agent.** Non-zero exit, empty output, or timeout all return an error with the command's stderr/stdout included so you can debug the provider from the conversation.
|
||||
- **`type: command` is the default when `command:` is set.** Writing `type: command` explicitly is good practice but not required; an entry with a non-empty `command` string is treated as a command provider.
|
||||
- **`{input_path}` / `{text_path}` are interchangeable.** Use whichever reads better in your command.
|
||||
|
||||
#### Security
|
||||
|
||||
Command-type providers run whatever shell command you configure, with your user's permissions. Hermes quotes placeholder values and enforces the configured timeout, but the command template itself is trusted local input — treat it the same way you would a shell script on your PATH.
|
||||
|
||||
## Voice Message Transcription (STT)
|
||||
|
||||
Voice messages sent on Telegram, Discord, WhatsApp, Slack, or Signal are automatically transcribed and injected as text into the conversation. The agent sees the transcript as normal text.
|
||||
|
||||
Loading…
Reference in New Issue
Block a user