mirror of
https://github.com/tiennm99/goclaw.git
synced 2026-06-11 14:11:29 +00:00
2cc9d68cdc
* fix(tts): config save + Edge provider registration + dark mode chat bubbles - Wrap TTS config payload in `raw` field for config.patch RPC (#229) - Always register Edge TTS provider (free, no API key) instead of gating on `enabled` flag - Fix low-contrast user message bubbles in dark mode chat * fix(tts): skip duplicate media dispatch when temp file already delivered When both the agent loop and the message tool dispatch the same TTS temp file, the first dispatch succeeds and cleanup deletes it. Filter out missing temp media files before sending to prevent "file not found" errors and spurious error notifications on Telegram/Slack/Discord. * feat(tts): include edge-tts in Docker image when Python enabled Edge TTS is free (no API key) and serves as a universal TTS fallback. Install it alongside Python in both ENABLE_PYTHON and ENABLE_FULL_SKILLS builds. * chore(docker): expose build args from .env for compose builds Pass ENABLE_OTEL, ENABLE_PYTHON, ENABLE_FULL_SKILLS as env-driven build args so .env can control Docker build features without editing docker-compose.yml directly. * fix(tts): hot-reload TTS config on settings change via pub/sub TTS providers were only registered at startup, so changing provider/API key via the Web UI had no effect until container restart. Add a tts-config-reload bus subscriber that rebuilds the TTS manager on config changes, matching the pattern used by quota, cron, and web_fetch. Always create a TtsTool at startup (even without providers) so the reload subscriber can populate it when settings are first configured. * fix(tts): protect TtsTool.UpdateManager with RWMutex to prevent data race UpdateManager() can be called from the config reload goroutine while Execute() reads t.manager concurrently from agent goroutines. Add sync.RWMutex following the same pattern as WebFetchTool.UpdatePolicy(). Also update setupTTS doc comment which incorrectly stated it could return nil — Edge TTS is now always registered. 
--------- Co-authored-by: viettranx <viettranx@gmail.com>
122 lines
3.3 KiB
Go
122 lines
3.3 KiB
Go
package tools
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/nextlevelbuilder/goclaw/internal/tts"
|
|
)
|
|
|
|
// TtsTool is an agent tool that converts text to speech audio.
|
|
// Matching TS src/agents/tools/tts-tool.ts.
|
|
// Implements Tool + ContextualTool interfaces.
|
|
// Per-call channel is read from ctx for thread-safety.
|
|
type TtsTool struct {
|
|
mu sync.RWMutex
|
|
manager *tts.Manager
|
|
}
|
|
|
|
// NewTtsTool creates a TTS tool backed by the given manager.
|
|
func NewTtsTool(mgr *tts.Manager) *TtsTool {
|
|
return &TtsTool{manager: mgr}
|
|
}
|
|
|
|
// UpdateManager swaps the underlying TTS manager (used on config reload).
|
|
func (t *TtsTool) UpdateManager(mgr *tts.Manager) {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
t.manager = mgr
|
|
}
|
|
|
|
// Name returns the tool's name, "tts".
func (t *TtsTool) Name() string {
	const toolName = "tts"
	return toolName
}
|
|
|
|
func (t *TtsTool) Description() string {
|
|
return "Convert text to speech audio. Returns a MEDIA: path to the generated audio file."
|
|
}
|
|
|
|
func (t *TtsTool) Parameters() map[string]any {
|
|
return map[string]any{
|
|
"type": "object",
|
|
"properties": map[string]any{
|
|
"text": map[string]any{
|
|
"type": "string",
|
|
"description": "The text to convert to speech",
|
|
},
|
|
"voice": map[string]any{
|
|
"type": "string",
|
|
"description": "Voice ID (provider-specific). Optional — uses default if omitted.",
|
|
},
|
|
"provider": map[string]any{
|
|
"type": "string",
|
|
"description": "TTS provider: openai, elevenlabs, edge, minimax. Optional — uses primary if omitted.",
|
|
},
|
|
},
|
|
"required": []string{"text"},
|
|
}
|
|
}
|
|
|
|
// SetContext is a no-op; channel is now read from ctx (thread-safe).
|
|
func (t *TtsTool) SetContext(channel, _ string) {}
|
|
|
|
func (t *TtsTool) Execute(ctx context.Context, args map[string]any) *Result {
|
|
text, _ := args["text"].(string)
|
|
if text == "" {
|
|
return &Result{ForLLM: "error: text is required", IsError: true}
|
|
}
|
|
|
|
voice, _ := args["voice"].(string)
|
|
providerName, _ := args["provider"].(string)
|
|
|
|
// Snapshot manager pointer under read lock so config reloads don't race.
|
|
t.mu.RLock()
|
|
mgr := t.manager
|
|
t.mu.RUnlock()
|
|
|
|
// Determine format based on channel (read from ctx — thread-safe)
|
|
channel := ToolChannelFromCtx(ctx)
|
|
opts := tts.Options{Voice: voice}
|
|
if channel == "telegram" {
|
|
opts.Format = "opus"
|
|
}
|
|
|
|
var result *tts.SynthResult
|
|
var err error
|
|
|
|
if providerName != "" {
|
|
// Use specific provider
|
|
p, ok := mgr.GetProvider(providerName)
|
|
if !ok {
|
|
return &Result{ForLLM: fmt.Sprintf("error: tts provider not found: %s", providerName), IsError: true}
|
|
}
|
|
result, err = p.Synthesize(ctx, text, opts)
|
|
} else {
|
|
result, err = mgr.SynthesizeWithFallback(ctx, text, opts)
|
|
}
|
|
|
|
if err != nil {
|
|
return &Result{ForLLM: fmt.Sprintf("error: tts failed: %s", err.Error()), IsError: true}
|
|
}
|
|
|
|
// Write audio to temp file
|
|
tmpDir := os.TempDir()
|
|
audioPath := filepath.Join(tmpDir, fmt.Sprintf("tts-%d.%s", time.Now().UnixNano(), result.Extension))
|
|
if err := os.WriteFile(audioPath, result.Audio, 0644); err != nil {
|
|
return &Result{ForLLM: fmt.Sprintf("error: write tts audio: %s", err.Error()), IsError: true}
|
|
}
|
|
|
|
// Return MEDIA: path (matching TS pattern)
|
|
voiceTag := ""
|
|
if channel == "telegram" && result.Extension == "ogg" {
|
|
voiceTag = "[[audio_as_voice]]\n"
|
|
}
|
|
|
|
forLLM := fmt.Sprintf("%sMEDIA:%s", voiceTag, audioPath)
|
|
r := &Result{ForLLM: forLLM}
|
|
r.Deliverable = fmt.Sprintf("[Generated audio: %s]\nText: %s", filepath.Base(audioPath), text)
|
|
return r
|
|
}
|