mirror of
https://github.com/tiennm99/goclaw.git
synced 2026-06-10 00:13:42 +00:00
aa5158d4a2
Add BytePlus ModelArk as a new OpenAI-compatible provider for Seed 2.0 models (chat, vision). Two provider types: standard API and Coding Plan (separate base URLs, same auth). Integrate Seedream image generation (sync API) and Seedance video generation (async polling) into the builtin media tool chain, following the established DashScope/Gemini patterns. - Add WithAuthPrefix option to OpenAIProvider for future non-standard auth - Add ProviderBytePlus/ProviderBytePlusCoding store constants and config - Register provider from config.json and llm_providers DB table - Add BytePlus to media chain routing, priority lists, and dispatch - Create create_image_byteplus.go (Seedream, sync response) - Create create_video_byteplus.go (Seedance, async poll with 5min timeout) - Add BytePlus to web and desktop UI provider type dropdowns - Update provider docs with BytePlus entries Closes #686
447 lines
15 KiB
Go
447 lines
15 KiB
Go
package tools
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/base64"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"log/slog"
|
|
"net/http"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/nextlevelbuilder/goclaw/internal/bus"
|
|
"github.com/nextlevelbuilder/goclaw/internal/providers"
|
|
)
|
|
|
|
// credentialProvider is the small slice of a provider that the image
// generation backends in this file need: where to send the request and which
// key to authenticate with.
type credentialProvider interface {
	// APIBase returns the base URL that endpoint paths are appended to.
	APIBase() string
	// APIKey returns the secret used to authenticate requests.
	APIKey() string
}
|
|
|
|
// imageGenProviderPriority lists image generation providers in the default
// order they are handed to ResolveMediaProviderChain (first entry first).
var imageGenProviderPriority = []string{
	"openrouter",
	"gemini",
	"openai",
	"minimax",
	"dashscope",
	"byteplus",
}
|
|
|
|
// imageGenModelDefaults gives, per provider name, the image model used when
// no model has been chosen explicitly.
var imageGenModelDefaults = map[string]string{
	// Chat-completions / native multimodal backends.
	"openrouter": "google/gemini-2.5-flash-image",
	"gemini":     "gemini-2.5-flash-image",

	// Dedicated image generation backends.
	"openai":    "gpt-image-1.5",
	"minimax":   "image-01",
	"dashscope": "wan2.6-image",
	"byteplus":  "seedream-5-0-260128",
}
|
|
|
|
// CreateImageTool generates images using an image generation API.
|
|
type CreateImageTool struct {
|
|
registry *providers.Registry
|
|
}
|
|
|
|
func NewCreateImageTool(registry *providers.Registry) *CreateImageTool {
|
|
return &CreateImageTool{registry: registry}
|
|
}
|
|
|
|
// Name reports the tool's identifier, "create_image".
func (t *CreateImageTool) Name() string {
	const toolName = "create_image"
	return toolName
}
|
|
|
|
func (t *CreateImageTool) Description() string {
|
|
return "Generate an image from a text description using an image generation model. Returns a MEDIA: path to the generated image file."
|
|
}
|
|
|
|
func (t *CreateImageTool) Parameters() map[string]any {
|
|
return map[string]any{
|
|
"type": "object",
|
|
"properties": map[string]any{
|
|
"prompt": map[string]any{
|
|
"type": "string",
|
|
"description": "Text description of the image to generate.",
|
|
},
|
|
"aspect_ratio": map[string]any{
|
|
"type": "string",
|
|
"description": "Aspect ratio: '1:1' (default), '3:4', '4:3', '9:16', '16:9'.",
|
|
},
|
|
"filename_hint": map[string]any{
|
|
"type": "string",
|
|
"description": "Short descriptive filename (no extension). Example: 'sunset-beach', 'company-logo'.",
|
|
},
|
|
},
|
|
"required": []string{"prompt"},
|
|
}
|
|
}
|
|
|
|
func (t *CreateImageTool) Execute(ctx context.Context, args map[string]any) *Result {
|
|
prompt, _ := args["prompt"].(string)
|
|
if prompt == "" {
|
|
return ErrorResult("prompt is required")
|
|
}
|
|
aspectRatio, _ := args["aspect_ratio"].(string)
|
|
if aspectRatio == "" {
|
|
aspectRatio = "1:1"
|
|
}
|
|
filenameHint, _ := args["filename_hint"].(string)
|
|
|
|
chain := ResolveMediaProviderChain(ctx, "create_image", "", "",
|
|
imageGenProviderPriority, imageGenModelDefaults, t.registry)
|
|
|
|
// Inject prompt and aspect_ratio into each chain entry's params
|
|
for i := range chain {
|
|
if chain[i].Params == nil {
|
|
chain[i].Params = make(map[string]any)
|
|
}
|
|
chain[i].Params["prompt"] = prompt
|
|
chain[i].Params["aspect_ratio"] = aspectRatio
|
|
}
|
|
|
|
chainResult, err := ExecuteWithChain(ctx, chain, t.registry, t.callProvider)
|
|
if err != nil {
|
|
return ErrorResult(fmt.Sprintf("image generation failed: %v", err))
|
|
}
|
|
|
|
// Save to workspace under date-based folder (e.g. generated/2026-03-02/)
|
|
workspace := ToolWorkspaceFromCtx(ctx)
|
|
if workspace == "" {
|
|
workspace = os.TempDir()
|
|
}
|
|
dateDir := filepath.Join(workspace, "generated", time.Now().Format("2006-01-02"))
|
|
if err := os.MkdirAll(dateDir, 0755); err != nil {
|
|
return ErrorResult(fmt.Sprintf("failed to create output directory: %v", err))
|
|
}
|
|
imagePath := filepath.Join(dateDir, mediaFileName(ctx, "image", filenameHint, "png"))
|
|
if err := os.WriteFile(imagePath, chainResult.Data, 0644); err != nil {
|
|
return ErrorResult(fmt.Sprintf("failed to save generated image: %v", err))
|
|
}
|
|
|
|
// Verify file was persisted (diagnostic for disappearing files).
|
|
if fi, err := os.Stat(imagePath); err != nil {
|
|
slog.Warn("create_image: file missing immediately after write", "path", imagePath, "error", err)
|
|
return ErrorResult(fmt.Sprintf("generated image file missing after write: %v", err))
|
|
} else {
|
|
slog.Info("create_image: file saved", "path", imagePath, "size", fi.Size(), "data_len", len(chainResult.Data))
|
|
}
|
|
|
|
result := &Result{ForLLM: fmt.Sprintf("MEDIA:%s\nUse the EXACT filename when referencing: %s", imagePath, filepath.Base(imagePath))}
|
|
result.Media = []bus.MediaFile{{Path: imagePath, MimeType: "image/png"}}
|
|
result.Deliverable = fmt.Sprintf("[Generated image: %s]\nPrompt: %s", filepath.Base(imagePath), prompt)
|
|
result.Provider = chainResult.Provider
|
|
result.Model = chainResult.Model
|
|
if chainResult.Usage != nil {
|
|
result.Usage = chainResult.Usage
|
|
}
|
|
return result
|
|
}
|
|
|
|
// callProvider dispatches to the correct image generation implementation based on provider type.
|
|
func (t *CreateImageTool) callProvider(ctx context.Context, cp credentialProvider, providerName, model string, params map[string]any) ([]byte, *providers.Usage, error) {
|
|
if cp == nil {
|
|
return nil, nil, fmt.Errorf("provider %q does not expose API credentials required for image generation", providerName)
|
|
}
|
|
prompt := GetParamString(params, "prompt", "")
|
|
aspectRatio := GetParamString(params, "aspect_ratio", "1:1")
|
|
|
|
slog.Info("create_image: calling image generation API",
|
|
"provider", providerName, "model", model, "aspect_ratio", aspectRatio)
|
|
|
|
switch GetParamString(params, "_provider_type", providerTypeFromName(providerName)) {
|
|
case "gemini":
|
|
return t.callGeminiNativeImageGen(ctx, cp.APIKey(), cp.APIBase(), model, prompt, params)
|
|
case "openrouter":
|
|
return t.callImageGenAPI(ctx, cp.APIKey(), cp.APIBase(), model, prompt, aspectRatio, params)
|
|
case "minimax":
|
|
return callMinimaxImageGen(ctx, cp.APIKey(), cp.APIBase(), model, prompt, params)
|
|
case "dashscope":
|
|
return callDashScopeImageGen(ctx, cp.APIKey(), cp.APIBase(), model, prompt, params)
|
|
case "byteplus":
|
|
return callBytePlusImageGen(ctx, cp.APIKey(), cp.APIBase(), model, prompt, params)
|
|
default:
|
|
return t.callStandardImageGenAPI(ctx, cp.APIKey(), cp.APIBase(), model, prompt, params)
|
|
}
|
|
}
|
|
|
|
// callImageGenAPI calls the OpenAI-compatible chat completions endpoint with image modalities.
|
|
// Works with OpenRouter (modalities: ["image","text"]).
|
|
func (t *CreateImageTool) callImageGenAPI(ctx context.Context, apiKey, apiBase, model, prompt, aspectRatio string, params map[string]any) ([]byte, *providers.Usage, error) {
|
|
body := map[string]any{
|
|
"model": model,
|
|
"messages": []map[string]any{
|
|
{"role": "user", "content": prompt},
|
|
},
|
|
"modalities": []string{"image", "text"},
|
|
}
|
|
if aspectRatio != "" && aspectRatio != "1:1" {
|
|
body["image_config"] = map[string]any{
|
|
"aspect_ratio": aspectRatio,
|
|
}
|
|
}
|
|
|
|
jsonBody, err := json.Marshal(body)
|
|
if err != nil {
|
|
return nil, nil, fmt.Errorf("marshal request: %w", err)
|
|
}
|
|
|
|
url := strings.TrimRight(apiBase, "/") + "/chat/completions"
|
|
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(jsonBody))
|
|
if err != nil {
|
|
return nil, nil, fmt.Errorf("create request: %w", err)
|
|
}
|
|
req.Header.Set("Content-Type", "application/json")
|
|
req.Header.Set("Authorization", "Bearer "+apiKey)
|
|
|
|
client := &http.Client{} // timeout governed by chain context
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return nil, nil, fmt.Errorf("http request: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
respBody, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return nil, nil, fmt.Errorf("read response: %w", err)
|
|
}
|
|
if resp.StatusCode != http.StatusOK {
|
|
return nil, nil, fmt.Errorf("API error %d: %s", resp.StatusCode, truncateBytes(respBody, 500))
|
|
}
|
|
|
|
return t.parseImageResponse(respBody)
|
|
}
|
|
|
|
// callStandardImageGenAPI uses the /images/generations endpoint (OpenAI and compatible providers).
|
|
func (t *CreateImageTool) callStandardImageGenAPI(ctx context.Context, apiKey, apiBase, model, prompt string, params map[string]any) ([]byte, *providers.Usage, error) {
|
|
body := map[string]any{
|
|
"model": model,
|
|
"prompt": prompt,
|
|
"n": 1,
|
|
"response_format": "b64_json",
|
|
}
|
|
|
|
jsonBody, err := json.Marshal(body)
|
|
if err != nil {
|
|
return nil, nil, fmt.Errorf("marshal request: %w", err)
|
|
}
|
|
|
|
url := strings.TrimRight(apiBase, "/") + "/images/generations"
|
|
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(jsonBody))
|
|
if err != nil {
|
|
return nil, nil, fmt.Errorf("create request: %w", err)
|
|
}
|
|
req.Header.Set("Content-Type", "application/json")
|
|
req.Header.Set("Authorization", "Bearer "+apiKey)
|
|
|
|
client := &http.Client{} // timeout governed by chain context
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return nil, nil, fmt.Errorf("http request: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
respBody, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return nil, nil, fmt.Errorf("read response: %w", err)
|
|
}
|
|
if resp.StatusCode != http.StatusOK {
|
|
return nil, nil, fmt.Errorf("API error %d: %s", resp.StatusCode, truncateBytes(respBody, 500))
|
|
}
|
|
|
|
var imgResp struct {
|
|
Data []struct {
|
|
B64JSON string `json:"b64_json"`
|
|
} `json:"data"`
|
|
}
|
|
if err := json.Unmarshal(respBody, &imgResp); err != nil {
|
|
return nil, nil, fmt.Errorf("parse response: %w", err)
|
|
}
|
|
if len(imgResp.Data) == 0 || imgResp.Data[0].B64JSON == "" {
|
|
return nil, nil, fmt.Errorf("no image data in response")
|
|
}
|
|
|
|
imageBytes, err := base64.StdEncoding.DecodeString(imgResp.Data[0].B64JSON)
|
|
if err != nil {
|
|
return nil, nil, fmt.Errorf("decode base64: %w", err)
|
|
}
|
|
|
|
return imageBytes, nil, nil
|
|
}
|
|
|
|
// callGeminiNativeImageGen uses the native Gemini generateContent API with responseModalities.
|
|
// Gemini image models require this endpoint — they don't support OpenAI-compat endpoints.
|
|
func (t *CreateImageTool) callGeminiNativeImageGen(ctx context.Context, apiKey, apiBase, model, prompt string, params map[string]any) ([]byte, *providers.Usage, error) {
|
|
// Derive native Gemini base from OpenAI-compat base (strip /openai suffix)
|
|
nativeBase := strings.TrimRight(apiBase, "/")
|
|
nativeBase = strings.TrimSuffix(nativeBase, "/openai")
|
|
|
|
url := fmt.Sprintf("%s/models/%s:generateContent?key=%s", nativeBase, model, apiKey)
|
|
|
|
body := map[string]any{
|
|
"contents": []map[string]any{
|
|
{"parts": []map[string]any{{"text": prompt}}},
|
|
},
|
|
"generationConfig": map[string]any{
|
|
"responseModalities": []string{"TEXT", "IMAGE"},
|
|
},
|
|
}
|
|
|
|
jsonBody, err := json.Marshal(body)
|
|
if err != nil {
|
|
return nil, nil, fmt.Errorf("marshal request: %w", err)
|
|
}
|
|
|
|
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(jsonBody))
|
|
if err != nil {
|
|
return nil, nil, fmt.Errorf("create request: %w", err)
|
|
}
|
|
req.Header.Set("Content-Type", "application/json")
|
|
|
|
client := &http.Client{} // timeout governed by chain context
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return nil, nil, fmt.Errorf("http request: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
respBody, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return nil, nil, fmt.Errorf("read response: %w", err)
|
|
}
|
|
if resp.StatusCode != http.StatusOK {
|
|
return nil, nil, fmt.Errorf("API error %d: %s", resp.StatusCode, truncateBytes(respBody, 500))
|
|
}
|
|
|
|
// Parse native Gemini response: {candidates: [{content: {parts: [{inlineData: {mimeType, data}}]}}]}
|
|
var gemResp struct {
|
|
Candidates []struct {
|
|
Content struct {
|
|
Parts []struct {
|
|
InlineData *struct {
|
|
MimeType string `json:"mimeType"`
|
|
Data string `json:"data"`
|
|
} `json:"inlineData"`
|
|
Text string `json:"text"`
|
|
} `json:"parts"`
|
|
} `json:"content"`
|
|
} `json:"candidates"`
|
|
UsageMetadata *struct {
|
|
PromptTokenCount int `json:"promptTokenCount"`
|
|
CandidatesTokenCount int `json:"candidatesTokenCount"`
|
|
TotalTokenCount int `json:"totalTokenCount"`
|
|
} `json:"usageMetadata"`
|
|
}
|
|
if err := json.Unmarshal(respBody, &gemResp); err != nil {
|
|
return nil, nil, fmt.Errorf("parse response: %w", err)
|
|
}
|
|
|
|
// Extract first image from parts
|
|
for _, cand := range gemResp.Candidates {
|
|
for _, part := range cand.Content.Parts {
|
|
if part.InlineData != nil && strings.HasPrefix(part.InlineData.MimeType, "image/") {
|
|
imageBytes, err := base64.StdEncoding.DecodeString(part.InlineData.Data)
|
|
if err != nil {
|
|
return nil, nil, fmt.Errorf("decode base64: %w", err)
|
|
}
|
|
var usage *providers.Usage
|
|
if gemResp.UsageMetadata != nil {
|
|
usage = &providers.Usage{
|
|
PromptTokens: gemResp.UsageMetadata.PromptTokenCount,
|
|
CompletionTokens: gemResp.UsageMetadata.CandidatesTokenCount,
|
|
TotalTokens: gemResp.UsageMetadata.TotalTokenCount,
|
|
}
|
|
}
|
|
return imageBytes, usage, nil
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil, nil, fmt.Errorf("no image data in Gemini response")
|
|
}
|
|
|
|
// parseImageResponse extracts base64 image data from the OpenAI-compat chat response.
|
|
// Looks for images in choices[0].message.content (multipart) or choices[0].message.images.
|
|
func (t *CreateImageTool) parseImageResponse(respBody []byte) ([]byte, *providers.Usage, error) {
|
|
var resp struct {
|
|
Choices []struct {
|
|
Message struct {
|
|
Content any `json:"content"`
|
|
Images []struct {
|
|
ImageURL struct {
|
|
URL string `json:"url"`
|
|
} `json:"image_url"`
|
|
} `json:"images"`
|
|
} `json:"message"`
|
|
} `json:"choices"`
|
|
Usage *struct {
|
|
PromptTokens int `json:"prompt_tokens"`
|
|
CompletionTokens int `json:"completion_tokens"`
|
|
TotalTokens int `json:"total_tokens"`
|
|
} `json:"usage"`
|
|
}
|
|
|
|
if err := json.Unmarshal(respBody, &resp); err != nil {
|
|
return nil, nil, fmt.Errorf("parse response: %w", err)
|
|
}
|
|
if len(resp.Choices) == 0 {
|
|
return nil, nil, fmt.Errorf("no choices in response")
|
|
}
|
|
|
|
msg := resp.Choices[0].Message
|
|
|
|
// Try images array first (OpenRouter format)
|
|
for _, img := range msg.Images {
|
|
if imageBytes, err := decodeDataURL(img.ImageURL.URL); err == nil {
|
|
return imageBytes, convertUsage(resp.Usage), nil
|
|
}
|
|
}
|
|
|
|
// Try multipart content array (some providers return content as array of parts)
|
|
if parts, ok := msg.Content.([]any); ok {
|
|
for _, part := range parts {
|
|
if m, ok := part.(map[string]any); ok {
|
|
if m["type"] == "image_url" {
|
|
if imgURL, ok := m["image_url"].(map[string]any); ok {
|
|
if url, ok := imgURL["url"].(string); ok {
|
|
if imageBytes, err := decodeDataURL(url); err == nil {
|
|
return imageBytes, convertUsage(resp.Usage), nil
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil, nil, fmt.Errorf("no image data found in response")
|
|
}
|
|
|
|
// decodeDataURL returns the raw bytes encoded in a data:image/...;base64,...
// URL.
//
// Everything after the first ";base64," marker is decoded as standard
// (padded) base64; the part before the marker is not inspected. An input
// without the marker is rejected with an error.
func decodeDataURL(dataURL string) ([]byte, error) {
	const marker = ";base64,"

	at := strings.Index(dataURL, marker)
	if at < 0 {
		return nil, fmt.Errorf("not a base64 data URL")
	}
	payload := dataURL[at+len(marker):]
	return base64.StdEncoding.DecodeString(payload)
}
|
|
|
|
func convertUsage(u *struct {
|
|
PromptTokens int `json:"prompt_tokens"`
|
|
CompletionTokens int `json:"completion_tokens"`
|
|
TotalTokens int `json:"total_tokens"`
|
|
}) *providers.Usage {
|
|
if u == nil {
|
|
return nil
|
|
}
|
|
return &providers.Usage{
|
|
PromptTokens: u.PromptTokens,
|
|
CompletionTokens: u.CompletionTokens,
|
|
TotalTokens: u.TotalTokens,
|
|
}
|
|
}
|
|
|
|
// truncateBytes converts b to a string, keeping at most max bytes of content
// and appending "..." when anything was cut off. Input of max bytes or fewer
// is returned unchanged.
//
// When the cut would fall inside a multi-byte UTF-8 sequence it is moved back
// to the start of that sequence, so the result never ends in a partial
// character (the kept content may then be up to 3 bytes shorter than max).
// A negative max is treated as 0 instead of panicking on the slice expression.
func truncateBytes(b []byte, max int) string {
	if max < 0 {
		max = 0
	}
	if len(b) <= max {
		return string(b)
	}

	// len(b) > max here, so b[cut] is always in range below.
	cut := max

	// UTF-8 continuation bytes have the form 10xxxxxx. If the first dropped
	// byte is one, the cut is mid-character: step back to the lead byte. A
	// character has at most 3 continuation bytes, which bounds the walk.
	for cut > 0 && max-cut < 3 && b[cut]&0xC0 == 0x80 {
		cut--
	}
	if b[cut]&0xC0 == 0x80 {
		// Still on a continuation byte: the data is not valid UTF-8 around the
		// cut, so fall back to the plain byte cut.
		cut = max
	}

	return string(b[:cut]) + "..."
}
|