Files
goclaw/internal/tools/tts.go
T
Duc Nguyen 2cc9d68cdc fix(tts): config save, Edge provider, media dispatch + dark mode chat (#265)
* fix(tts): config save + Edge provider registration + dark mode chat bubbles

- Wrap TTS config payload in `raw` field for config.patch RPC (#229)
- Always register Edge TTS provider (free, no API key) instead of gating on `enabled` flag
- Fix low-contrast user message bubbles in dark mode chat

* fix(tts): skip duplicate media dispatch when temp file already delivered

When both the agent loop and the message tool dispatch the same TTS
temp file, the first dispatch succeeds and cleanup deletes it. Filter
out missing temp media files before sending to prevent "file not found"
errors and spurious error notifications on Telegram/Slack/Discord.

* feat(tts): include edge-tts in Docker image when Python enabled

Edge TTS is free (no API key) and serves as a universal TTS fallback.
Install it alongside Python in both ENABLE_PYTHON and ENABLE_FULL_SKILLS builds.

* chore(docker): expose build args from .env for compose builds

Pass ENABLE_OTEL, ENABLE_PYTHON, ENABLE_FULL_SKILLS as env-driven
build args so .env can control Docker build features without editing
docker-compose.yml directly.

* fix(tts): hot-reload TTS config on settings change via pub/sub

TTS providers were only registered at startup, so changing provider/API
key via the Web UI had no effect until container restart. Add a
tts-config-reload bus subscriber that rebuilds the TTS manager on
config changes, matching the pattern used by quota, cron, and web_fetch.
Always create a TtsTool at startup (even without providers) so the
reload subscriber can populate it when settings are first configured.

* fix(tts): protect TtsTool.UpdateManager with RWMutex to prevent data race

UpdateManager() can be called from the config reload goroutine while
Execute() reads t.manager concurrently from agent goroutines. Add
sync.RWMutex following the same pattern as WebFetchTool.UpdatePolicy().

Also update setupTTS doc comment which incorrectly stated it could
return nil — Edge TTS is now always registered.

---------

Co-authored-by: viettranx <viettranx@gmail.com>
2026-03-19 08:21:06 +07:00

122 lines
3.3 KiB
Go

package tools
import (
"context"
"fmt"
"os"
"path/filepath"
"sync"
"time"
"github.com/nextlevelbuilder/goclaw/internal/tts"
)
// TtsTool is an agent tool that converts text to speech audio.
// Matching TS src/agents/tools/tts-tool.ts.
// Implements Tool + ContextualTool interfaces.
// Per-call channel is read from ctx for thread-safety: Execute obtains it via
// ToolChannelFromCtx instead of keeping it on the struct, and SetContext is a no-op.
type TtsTool struct {
// mu guards manager. UpdateManager takes the write lock to swap the pointer;
// Execute takes the read lock only long enough to copy it.
mu sync.RWMutex
// manager is the TTS backend Execute uses to look up a named provider or to
// synthesize with fallback. It can be replaced at runtime via UpdateManager.
manager *tts.Manager
}
// NewTtsTool returns a TtsTool whose synthesis requests are served by mgr.
// The manager can be replaced later with UpdateManager.
func NewTtsTool(mgr *tts.Manager) *TtsTool {
	tool := new(TtsTool)
	tool.manager = mgr
	return tool
}
// UpdateManager replaces the TTS manager used by subsequent Execute calls
// (used on config reload). The swap happens under the write lock, so it does
// not race with Execute reading the pointer under the read lock.
func (t *TtsTool) UpdateManager(mgr *tts.Manager) {
	t.mu.Lock()
	t.manager = mgr
	t.mu.Unlock()
}
// Name returns the tool's name, "tts".
func (t *TtsTool) Name() string {
	const toolName = "tts"
	return toolName
}
// Description returns the one-line, human-readable summary of what the tool
// does and what it returns.
func (t *TtsTool) Description() string {
	const summary = "Convert text to speech audio. Returns a MEDIA: path to the generated audio file."
	return summary
}
// Parameters returns the JSON-Schema-style description of the arguments
// Execute accepts: a required "text" string plus optional "voice" and
// "provider" strings. A fresh map is built on every call.
func (t *TtsTool) Parameters() map[string]any {
	// All three arguments are plain strings that differ only in description.
	stringProp := func(description string) map[string]any {
		return map[string]any{
			"type":        "string",
			"description": description,
		}
	}

	properties := map[string]any{
		"text":     stringProp("The text to convert to speech"),
		"voice":    stringProp("Voice ID (provider-specific). Optional — uses default if omitted."),
		"provider": stringProp("TTS provider: openai, elevenlabs, edge, minimax. Optional — uses primary if omitted."),
	}

	schema := make(map[string]any, 3)
	schema["type"] = "object"
	schema["properties"] = properties
	schema["required"] = []string{"text"}
	return schema
}
// SetContext does nothing. Execute reads the channel from its ctx argument on
// every call, so no per-call state is stored on the tool.
func (t *TtsTool) SetContext(channel, _ string) {
	// Intentionally empty.
}
// Execute synthesizes speech for args["text"] and writes the audio to a new
// file in the OS temp directory.
//
// Recognized args:
//   - "text" (string, required): the text to speak.
//   - "voice" (string, optional): passed through as tts.Options.Voice.
//   - "provider" (string, optional): when set, only that provider is used;
//     otherwise the manager's SynthesizeWithFallback is called.
//
// When the channel read from ctx is "telegram", the "opus" format is requested
// and, if the resulting extension is "ogg", an "[[audio_as_voice]]" tag line is
// prepended to the output.
//
// On success the Result's ForLLM is "MEDIA:<path>" (optionally preceded by the
// voice tag) and Deliverable is a short summary containing the file name and
// the original text. Every failure — missing text, no manager configured,
// unknown provider, synthesis error, empty synthesis result, or a file write
// error — is returned as a Result with IsError set.
func (t *TtsTool) Execute(ctx context.Context, args map[string]any) *Result {
	text, _ := args["text"].(string)
	if text == "" {
		return &Result{ForLLM: "error: text is required", IsError: true}
	}
	voice, _ := args["voice"].(string)
	providerName, _ := args["provider"].(string)

	// Snapshot manager pointer under read lock so config reloads don't race.
	t.mu.RLock()
	mgr := t.manager
	t.mu.RUnlock()
	// The manager can be nil if the tool was created or updated with a nil
	// pointer; report that instead of dereferencing it.
	if mgr == nil {
		return &Result{ForLLM: "error: tts is not configured", IsError: true}
	}

	// Determine format based on channel (read from ctx — thread-safe)
	channel := ToolChannelFromCtx(ctx)
	opts := tts.Options{Voice: voice}
	if channel == "telegram" {
		opts.Format = "opus"
	}

	var result *tts.SynthResult
	var err error
	if providerName != "" {
		// Use specific provider
		p, ok := mgr.GetProvider(providerName)
		if !ok {
			return &Result{ForLLM: fmt.Sprintf("error: tts provider not found: %s", providerName), IsError: true}
		}
		result, err = p.Synthesize(ctx, text, opts)
	} else {
		result, err = mgr.SynthesizeWithFallback(ctx, text, opts)
	}
	if err != nil {
		return &Result{ForLLM: fmt.Sprintf("error: tts failed: %s", err.Error()), IsError: true}
	}
	// Guard against a provider that reports success without a result.
	if result == nil {
		return &Result{ForLLM: "error: tts failed: empty result", IsError: true}
	}

	// Write audio to a temp file. os.CreateTemp replaces the "*" with a random
	// string and creates the file exclusively, so two calls in the same
	// nanosecond, or a pre-existing file with the same name, can never be
	// overwritten. The timestamp is kept in the name for readability.
	// Note: CreateTemp creates the file with mode 0600.
	pattern := fmt.Sprintf("tts-%d-*.%s", time.Now().UnixNano(), result.Extension)
	f, err := os.CreateTemp("", pattern)
	if err != nil {
		return &Result{ForLLM: fmt.Sprintf("error: write tts audio: %s", err.Error()), IsError: true}
	}
	audioPath := f.Name()
	_, err = f.Write(result.Audio)
	// Always close; surface the close error only if the write itself succeeded.
	if cerr := f.Close(); err == nil {
		err = cerr
	}
	if err != nil {
		// Best effort: don't leave a truncated audio file behind.
		_ = os.Remove(audioPath)
		return &Result{ForLLM: fmt.Sprintf("error: write tts audio: %s", err.Error()), IsError: true}
	}

	// Return MEDIA: path (matching TS pattern)
	voiceTag := ""
	if channel == "telegram" && result.Extension == "ogg" {
		voiceTag = "[[audio_as_voice]]\n"
	}
	forLLM := fmt.Sprintf("%sMEDIA:%s", voiceTag, audioPath)
	r := &Result{ForLLM: forLLM}
	r.Deliverable = fmt.Sprintf("[Generated audio: %s]\nText: %s", filepath.Base(audioPath), text)
	return r
}