Files
viettranx aa5158d4a2 feat(providers): add BytePlus ModelArk provider with Seedream/Seedance media gen
Add BytePlus ModelArk as a new OpenAI-compatible provider for Seed 2.0
models (chat, vision). Two provider types: standard API and Coding Plan
(separate base URLs, same auth).

Integrate Seedream image generation (sync API) and Seedance video
generation (async polling) into the builtin media tool chain, following
the established DashScope/Gemini patterns.

- Add WithAuthPrefix option to OpenAIProvider for future non-standard auth
- Add ProviderBytePlus/ProviderBytePlusCoding store constants and config
- Register provider from config.json and llm_providers DB table
- Add BytePlus to media chain routing, priority lists, and dispatch
- Create create_image_byteplus.go (Seedream, sync response)
- Create create_video_byteplus.go (Seedance, async poll with 5min timeout)
- Add BytePlus to web and desktop UI provider type dropdowns
- Update provider docs with BytePlus entries

Closes #686
2026-04-04 23:07:42 +07:00

447 lines
15 KiB
Go

package tools
import (
"bytes"
"context"
"encoding/base64"
"encoding/json"
"fmt"
"io"
"log/slog"
"net/http"
"os"
"path/filepath"
"strings"
"time"
"github.com/nextlevelbuilder/goclaw/internal/bus"
"github.com/nextlevelbuilder/goclaw/internal/providers"
)
// credentialProvider is a narrow interface for providers that expose API credentials.
type credentialProvider interface {
APIKey() string
APIBase() string
}
// imageGenProviderPriority is the default order for image generation providers.
var imageGenProviderPriority = []string{"openrouter", "gemini", "openai", "minimax", "dashscope", "byteplus"}
// imageGenModelDefaults maps provider names to default image generation models.
var imageGenModelDefaults = map[string]string{
"openrouter": "google/gemini-2.5-flash-image",
"openai": "gpt-image-1.5",
"gemini": "gemini-2.5-flash-image",
"minimax": "image-01",
"dashscope": "wan2.6-image",
"byteplus": "seedream-5-0-260128",
}
// CreateImageTool generates images using an image generation API.
type CreateImageTool struct {
registry *providers.Registry
}
func NewCreateImageTool(registry *providers.Registry) *CreateImageTool {
return &CreateImageTool{registry: registry}
}
func (t *CreateImageTool) Name() string { return "create_image" }
func (t *CreateImageTool) Description() string {
return "Generate an image from a text description using an image generation model. Returns a MEDIA: path to the generated image file."
}
func (t *CreateImageTool) Parameters() map[string]any {
return map[string]any{
"type": "object",
"properties": map[string]any{
"prompt": map[string]any{
"type": "string",
"description": "Text description of the image to generate.",
},
"aspect_ratio": map[string]any{
"type": "string",
"description": "Aspect ratio: '1:1' (default), '3:4', '4:3', '9:16', '16:9'.",
},
"filename_hint": map[string]any{
"type": "string",
"description": "Short descriptive filename (no extension). Example: 'sunset-beach', 'company-logo'.",
},
},
"required": []string{"prompt"},
}
}
func (t *CreateImageTool) Execute(ctx context.Context, args map[string]any) *Result {
prompt, _ := args["prompt"].(string)
if prompt == "" {
return ErrorResult("prompt is required")
}
aspectRatio, _ := args["aspect_ratio"].(string)
if aspectRatio == "" {
aspectRatio = "1:1"
}
filenameHint, _ := args["filename_hint"].(string)
chain := ResolveMediaProviderChain(ctx, "create_image", "", "",
imageGenProviderPriority, imageGenModelDefaults, t.registry)
// Inject prompt and aspect_ratio into each chain entry's params
for i := range chain {
if chain[i].Params == nil {
chain[i].Params = make(map[string]any)
}
chain[i].Params["prompt"] = prompt
chain[i].Params["aspect_ratio"] = aspectRatio
}
chainResult, err := ExecuteWithChain(ctx, chain, t.registry, t.callProvider)
if err != nil {
return ErrorResult(fmt.Sprintf("image generation failed: %v", err))
}
// Save to workspace under date-based folder (e.g. generated/2026-03-02/)
workspace := ToolWorkspaceFromCtx(ctx)
if workspace == "" {
workspace = os.TempDir()
}
dateDir := filepath.Join(workspace, "generated", time.Now().Format("2006-01-02"))
if err := os.MkdirAll(dateDir, 0755); err != nil {
return ErrorResult(fmt.Sprintf("failed to create output directory: %v", err))
}
imagePath := filepath.Join(dateDir, mediaFileName(ctx, "image", filenameHint, "png"))
if err := os.WriteFile(imagePath, chainResult.Data, 0644); err != nil {
return ErrorResult(fmt.Sprintf("failed to save generated image: %v", err))
}
// Verify file was persisted (diagnostic for disappearing files).
if fi, err := os.Stat(imagePath); err != nil {
slog.Warn("create_image: file missing immediately after write", "path", imagePath, "error", err)
return ErrorResult(fmt.Sprintf("generated image file missing after write: %v", err))
} else {
slog.Info("create_image: file saved", "path", imagePath, "size", fi.Size(), "data_len", len(chainResult.Data))
}
result := &Result{ForLLM: fmt.Sprintf("MEDIA:%s\nUse the EXACT filename when referencing: %s", imagePath, filepath.Base(imagePath))}
result.Media = []bus.MediaFile{{Path: imagePath, MimeType: "image/png"}}
result.Deliverable = fmt.Sprintf("[Generated image: %s]\nPrompt: %s", filepath.Base(imagePath), prompt)
result.Provider = chainResult.Provider
result.Model = chainResult.Model
if chainResult.Usage != nil {
result.Usage = chainResult.Usage
}
return result
}
// callProvider dispatches to the correct image generation implementation based on provider type.
func (t *CreateImageTool) callProvider(ctx context.Context, cp credentialProvider, providerName, model string, params map[string]any) ([]byte, *providers.Usage, error) {
if cp == nil {
return nil, nil, fmt.Errorf("provider %q does not expose API credentials required for image generation", providerName)
}
prompt := GetParamString(params, "prompt", "")
aspectRatio := GetParamString(params, "aspect_ratio", "1:1")
slog.Info("create_image: calling image generation API",
"provider", providerName, "model", model, "aspect_ratio", aspectRatio)
switch GetParamString(params, "_provider_type", providerTypeFromName(providerName)) {
case "gemini":
return t.callGeminiNativeImageGen(ctx, cp.APIKey(), cp.APIBase(), model, prompt, params)
case "openrouter":
return t.callImageGenAPI(ctx, cp.APIKey(), cp.APIBase(), model, prompt, aspectRatio, params)
case "minimax":
return callMinimaxImageGen(ctx, cp.APIKey(), cp.APIBase(), model, prompt, params)
case "dashscope":
return callDashScopeImageGen(ctx, cp.APIKey(), cp.APIBase(), model, prompt, params)
case "byteplus":
return callBytePlusImageGen(ctx, cp.APIKey(), cp.APIBase(), model, prompt, params)
default:
return t.callStandardImageGenAPI(ctx, cp.APIKey(), cp.APIBase(), model, prompt, params)
}
}
// callImageGenAPI calls the OpenAI-compatible chat completions endpoint with image modalities.
// Works with OpenRouter (modalities: ["image","text"]).
func (t *CreateImageTool) callImageGenAPI(ctx context.Context, apiKey, apiBase, model, prompt, aspectRatio string, params map[string]any) ([]byte, *providers.Usage, error) {
body := map[string]any{
"model": model,
"messages": []map[string]any{
{"role": "user", "content": prompt},
},
"modalities": []string{"image", "text"},
}
if aspectRatio != "" && aspectRatio != "1:1" {
body["image_config"] = map[string]any{
"aspect_ratio": aspectRatio,
}
}
jsonBody, err := json.Marshal(body)
if err != nil {
return nil, nil, fmt.Errorf("marshal request: %w", err)
}
url := strings.TrimRight(apiBase, "/") + "/chat/completions"
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(jsonBody))
if err != nil {
return nil, nil, fmt.Errorf("create request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Authorization", "Bearer "+apiKey)
client := &http.Client{} // timeout governed by chain context
resp, err := client.Do(req)
if err != nil {
return nil, nil, fmt.Errorf("http request: %w", err)
}
defer resp.Body.Close()
respBody, err := io.ReadAll(resp.Body)
if err != nil {
return nil, nil, fmt.Errorf("read response: %w", err)
}
if resp.StatusCode != http.StatusOK {
return nil, nil, fmt.Errorf("API error %d: %s", resp.StatusCode, truncateBytes(respBody, 500))
}
return t.parseImageResponse(respBody)
}
// callStandardImageGenAPI uses the /images/generations endpoint (OpenAI and compatible providers).
func (t *CreateImageTool) callStandardImageGenAPI(ctx context.Context, apiKey, apiBase, model, prompt string, params map[string]any) ([]byte, *providers.Usage, error) {
body := map[string]any{
"model": model,
"prompt": prompt,
"n": 1,
"response_format": "b64_json",
}
jsonBody, err := json.Marshal(body)
if err != nil {
return nil, nil, fmt.Errorf("marshal request: %w", err)
}
url := strings.TrimRight(apiBase, "/") + "/images/generations"
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(jsonBody))
if err != nil {
return nil, nil, fmt.Errorf("create request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Authorization", "Bearer "+apiKey)
client := &http.Client{} // timeout governed by chain context
resp, err := client.Do(req)
if err != nil {
return nil, nil, fmt.Errorf("http request: %w", err)
}
defer resp.Body.Close()
respBody, err := io.ReadAll(resp.Body)
if err != nil {
return nil, nil, fmt.Errorf("read response: %w", err)
}
if resp.StatusCode != http.StatusOK {
return nil, nil, fmt.Errorf("API error %d: %s", resp.StatusCode, truncateBytes(respBody, 500))
}
var imgResp struct {
Data []struct {
B64JSON string `json:"b64_json"`
} `json:"data"`
}
if err := json.Unmarshal(respBody, &imgResp); err != nil {
return nil, nil, fmt.Errorf("parse response: %w", err)
}
if len(imgResp.Data) == 0 || imgResp.Data[0].B64JSON == "" {
return nil, nil, fmt.Errorf("no image data in response")
}
imageBytes, err := base64.StdEncoding.DecodeString(imgResp.Data[0].B64JSON)
if err != nil {
return nil, nil, fmt.Errorf("decode base64: %w", err)
}
return imageBytes, nil, nil
}
// callGeminiNativeImageGen uses the native Gemini generateContent API with responseModalities.
// Gemini image models require this endpoint — they don't support OpenAI-compat endpoints.
func (t *CreateImageTool) callGeminiNativeImageGen(ctx context.Context, apiKey, apiBase, model, prompt string, params map[string]any) ([]byte, *providers.Usage, error) {
// Derive native Gemini base from OpenAI-compat base (strip /openai suffix)
nativeBase := strings.TrimRight(apiBase, "/")
nativeBase = strings.TrimSuffix(nativeBase, "/openai")
url := fmt.Sprintf("%s/models/%s:generateContent?key=%s", nativeBase, model, apiKey)
body := map[string]any{
"contents": []map[string]any{
{"parts": []map[string]any{{"text": prompt}}},
},
"generationConfig": map[string]any{
"responseModalities": []string{"TEXT", "IMAGE"},
},
}
jsonBody, err := json.Marshal(body)
if err != nil {
return nil, nil, fmt.Errorf("marshal request: %w", err)
}
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(jsonBody))
if err != nil {
return nil, nil, fmt.Errorf("create request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
client := &http.Client{} // timeout governed by chain context
resp, err := client.Do(req)
if err != nil {
return nil, nil, fmt.Errorf("http request: %w", err)
}
defer resp.Body.Close()
respBody, err := io.ReadAll(resp.Body)
if err != nil {
return nil, nil, fmt.Errorf("read response: %w", err)
}
if resp.StatusCode != http.StatusOK {
return nil, nil, fmt.Errorf("API error %d: %s", resp.StatusCode, truncateBytes(respBody, 500))
}
// Parse native Gemini response: {candidates: [{content: {parts: [{inlineData: {mimeType, data}}]}}]}
var gemResp struct {
Candidates []struct {
Content struct {
Parts []struct {
InlineData *struct {
MimeType string `json:"mimeType"`
Data string `json:"data"`
} `json:"inlineData"`
Text string `json:"text"`
} `json:"parts"`
} `json:"content"`
} `json:"candidates"`
UsageMetadata *struct {
PromptTokenCount int `json:"promptTokenCount"`
CandidatesTokenCount int `json:"candidatesTokenCount"`
TotalTokenCount int `json:"totalTokenCount"`
} `json:"usageMetadata"`
}
if err := json.Unmarshal(respBody, &gemResp); err != nil {
return nil, nil, fmt.Errorf("parse response: %w", err)
}
// Extract first image from parts
for _, cand := range gemResp.Candidates {
for _, part := range cand.Content.Parts {
if part.InlineData != nil && strings.HasPrefix(part.InlineData.MimeType, "image/") {
imageBytes, err := base64.StdEncoding.DecodeString(part.InlineData.Data)
if err != nil {
return nil, nil, fmt.Errorf("decode base64: %w", err)
}
var usage *providers.Usage
if gemResp.UsageMetadata != nil {
usage = &providers.Usage{
PromptTokens: gemResp.UsageMetadata.PromptTokenCount,
CompletionTokens: gemResp.UsageMetadata.CandidatesTokenCount,
TotalTokens: gemResp.UsageMetadata.TotalTokenCount,
}
}
return imageBytes, usage, nil
}
}
}
return nil, nil, fmt.Errorf("no image data in Gemini response")
}
// parseImageResponse extracts base64 image data from the OpenAI-compat chat response.
// Looks for images in choices[0].message.content (multipart) or choices[0].message.images.
func (t *CreateImageTool) parseImageResponse(respBody []byte) ([]byte, *providers.Usage, error) {
var resp struct {
Choices []struct {
Message struct {
Content any `json:"content"`
Images []struct {
ImageURL struct {
URL string `json:"url"`
} `json:"image_url"`
} `json:"images"`
} `json:"message"`
} `json:"choices"`
Usage *struct {
PromptTokens int `json:"prompt_tokens"`
CompletionTokens int `json:"completion_tokens"`
TotalTokens int `json:"total_tokens"`
} `json:"usage"`
}
if err := json.Unmarshal(respBody, &resp); err != nil {
return nil, nil, fmt.Errorf("parse response: %w", err)
}
if len(resp.Choices) == 0 {
return nil, nil, fmt.Errorf("no choices in response")
}
msg := resp.Choices[0].Message
// Try images array first (OpenRouter format)
for _, img := range msg.Images {
if imageBytes, err := decodeDataURL(img.ImageURL.URL); err == nil {
return imageBytes, convertUsage(resp.Usage), nil
}
}
// Try multipart content array (some providers return content as array of parts)
if parts, ok := msg.Content.([]any); ok {
for _, part := range parts {
if m, ok := part.(map[string]any); ok {
if m["type"] == "image_url" {
if imgURL, ok := m["image_url"].(map[string]any); ok {
if url, ok := imgURL["url"].(string); ok {
if imageBytes, err := decodeDataURL(url); err == nil {
return imageBytes, convertUsage(resp.Usage), nil
}
}
}
}
}
}
}
return nil, nil, fmt.Errorf("no image data found in response")
}
// decodeDataURL decodes a data:image/...;base64,... URL into raw bytes.
func decodeDataURL(dataURL string) ([]byte, error) {
_, after, ok := strings.Cut(dataURL, ";base64,")
if !ok {
return nil, fmt.Errorf("not a base64 data URL")
}
b64 := after
return base64.StdEncoding.DecodeString(b64)
}
func convertUsage(u *struct {
PromptTokens int `json:"prompt_tokens"`
CompletionTokens int `json:"completion_tokens"`
TotalTokens int `json:"total_tokens"`
}) *providers.Usage {
if u == nil {
return nil
}
return &providers.Usage{
PromptTokens: u.PromptTokens,
CompletionTokens: u.CompletionTokens,
TotalTokens: u.TotalTokens,
}
}
func truncateBytes(b []byte, max int) string {
if len(b) <= max {
return string(b)
}
return string(b[:max]) + "..."
}