mirror of
https://github.com/tiennm99/goclaw.git
synced 2026-06-10 10:10:49 +00:00
bdb60de7ae
- Update go.mod and Dockerfile to Go 1.26
- Apply `go fix ./...` stdlib modernizations across 170+ files
- Add `go fix` to the post-implementation checklist in CLAUDE.md
- Fix a rewrite misapplied by `go fix` in loop_history.go
202 lines
4.7 KiB
Go
202 lines
4.7 KiB
Go
package memory
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"crypto/sha256"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"math"
|
|
"net/http"
|
|
"strings"
|
|
)
|
|
|
|
// ContentHash fingerprints text: it hashes the content with SHA-256 and
// returns the leading 16 bytes of the digest rendered as lowercase hex
// (32 characters).
func ContentHash(text string) string {
	digest := sha256.Sum256([]byte(text))
	prefix := digest[:16]
	return fmt.Sprintf("%x", prefix)
}
|
|
|
|
// TextChunk is a chunk of text with line number metadata.
type TextChunk struct {
	// Text is the chunk content with surrounding whitespace trimmed.
	Text string
	// StartLine is the 1-based line number of the first non-blank line in the chunk.
	StartLine int
	// EndLine is the 1-based line number of the last source line consumed
	// into the chunk (this may be a trailing blank line).
	EndLine int
}

// ChunkText splits text into chunks at paragraph boundaries.
//
// The text is split on "\n". A blank (whitespace-only) line closes the
// pending chunk once that chunk holds at least maxChunkLen/2 bytes; smaller
// paragraphs are merged with the following one. A chunk is also closed as
// soon as it reaches maxChunkLen bytes. Individual lines are never split, so
// a single line longer than maxChunkLen yields an oversized chunk.
//
// A maxChunkLen <= 0 selects the default of 1000. Chunks whose content is
// empty after trimming are dropped. Each chunk records the line number of
// its first non-blank line in StartLine.
func ChunkText(text string, maxChunkLen int) []TextChunk {
	if maxChunkLen <= 0 {
		maxChunkLen = 1000
	}

	lines := strings.Split(text, "\n")
	var chunks []TextChunk
	var current strings.Builder

	// startLine is the line number of the first non-blank line written to the
	// pending chunk; 0 means no such line has been seen yet. Tracking it here
	// (instead of deriving it from the previous chunk's end) keeps StartLine
	// aligned with the trimmed Text even when blank lines precede the chunk.
	startLine := 0

	flush := func(endLine int) {
		content := strings.TrimSpace(current.String())
		if content != "" {
			chunks = append(chunks, TextChunk{
				Text:      content,
				StartLine: startLine,
				EndLine:   endLine,
			})
		}
		current.Reset()
		startLine = 0
	}

	for i, line := range lines {
		lineNum := i + 1
		blank := strings.TrimSpace(line) == ""

		// Paragraph boundary: close the pending chunk if it is big enough.
		// The blank separator itself is not part of either chunk.
		if blank && current.Len() > 0 && current.Len() >= maxChunkLen/2 {
			flush(lineNum - 1)
			continue
		}

		if current.Len() > 0 {
			current.WriteString("\n")
		}
		current.WriteString(line)
		if !blank && startLine == 0 {
			startLine = lineNum
		}

		// Force flush if too large.
		if current.Len() >= maxChunkLen {
			flush(lineNum)
		}
	}

	if current.Len() > 0 {
		flush(len(lines))
	}

	return chunks
}
|
|
|
|
// EmbeddingProvider is the contract for backends that turn text into
// vector embeddings.
type EmbeddingProvider interface {
	// Name reports the provider identifier (e.g., "openai", "voyage").
	Name() string

	// Model reports which model the provider uses for embeddings.
	Model() string

	// Embed produces embeddings for a batch of texts.
	Embed(ctx context.Context, texts []string) (vectors [][]float32, err error)
}
|
|
|
|
// OpenAIEmbeddingProvider uses the OpenAI-compatible embedding API.
// Works with OpenAI, OpenRouter, and any compatible endpoint.
type OpenAIEmbeddingProvider struct {
	name       string
	model      string
	apiKey     string
	apiURL     string // base URL; "/embeddings" is appended per request
	dimensions int    // optional: truncate output to this many dimensions (0 = use model default)
}

// NewOpenAIEmbeddingProvider creates a provider for OpenAI-compatible embedding APIs.
// An empty apiURL defaults to "https://api.openai.com/v1" and an empty model
// defaults to "text-embedding-3-small".
func NewOpenAIEmbeddingProvider(name, apiKey, apiURL, model string) *OpenAIEmbeddingProvider {
	if apiURL == "" {
		apiURL = "https://api.openai.com/v1"
	}
	if model == "" {
		model = "text-embedding-3-small"
	}

	return &OpenAIEmbeddingProvider{
		name:   name,
		model:  model,
		apiKey: apiKey,
		apiURL: apiURL,
	}
}

// WithDimensions sets the output dimensions for models that support dimension truncation.
// It modifies the receiver and returns it so calls can be chained; a value
// <= 0 leaves the "dimensions" field out of requests.
func (p *OpenAIEmbeddingProvider) WithDimensions(d int) *OpenAIEmbeddingProvider {
	p.dimensions = d
	return p
}

// Name returns the provider identifier given at construction.
func (p *OpenAIEmbeddingProvider) Name() string { return p.name }

// Model returns the embedding model name sent with each request.
func (p *OpenAIEmbeddingProvider) Model() string { return p.model }

// Embed POSTs texts to <apiURL>/embeddings and returns the embeddings in
// response order.
//
// It returns an error when the request cannot be built or sent, when the
// endpoint answers with a non-200 status (the error includes up to 64 KiB of
// the response body), when the response cannot be decoded, or when the
// number of returned embeddings differs from len(texts).
func (p *OpenAIEmbeddingProvider) Embed(ctx context.Context, texts []string) ([][]float32, error) {
	// Cap on how much of an error response is read into the error message.
	const maxErrorBody = 64 << 10

	reqBody := map[string]any{
		"input": texts,
		"model": p.model,
	}
	if p.dimensions > 0 {
		reqBody["dimensions"] = p.dimensions
	}

	bodyJSON, err := json.Marshal(reqBody)
	if err != nil {
		return nil, fmt.Errorf("marshal request: %w", err)
	}

	// Tolerate a base URL configured with a trailing slash so the request
	// does not go to ".../v1//embeddings".
	endpoint := strings.TrimRight(p.apiURL, "/") + "/embeddings"

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(bodyJSON))
	if err != nil {
		return nil, fmt.Errorf("create request: %w", err)
	}

	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer "+p.apiKey)

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("embedding request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body only enriches the error message, so a read
		// failure is deliberately ignored and the read is size-limited.
		body, _ := io.ReadAll(io.LimitReader(resp.Body, maxErrorBody))
		return nil, fmt.Errorf("embedding API error %d: %s", resp.StatusCode, string(body))
	}

	var result struct {
		Data []struct {
			Embedding []float32 `json:"embedding"`
		} `json:"data"`
	}

	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return nil, fmt.Errorf("decode response: %w", err)
	}

	// A short or long response would silently misalign vectors with inputs.
	if len(result.Data) != len(texts) {
		return nil, fmt.Errorf("embedding response: got %d embeddings for %d inputs", len(result.Data), len(texts))
	}

	embeddings := make([][]float32, len(result.Data))
	for i, d := range result.Data {
		embeddings[i] = d.Embedding
	}

	return embeddings, nil
}
|
|
|
|
// CosineSimilarity returns the cosine of the angle between vectors a and b:
// their dot product divided by the product of their magnitudes, nominally in
// the range -1 to 1 (1 = same direction).
// It returns 0 when the vectors are empty, differ in length, or either one
// has zero magnitude.
func CosineSimilarity(a, b []float32) float64 {
	n := len(a)
	if n == 0 || n != len(b) {
		return 0
	}

	var (
		dot  float64
		sumA float64
		sumB float64
	)
	for i, av := range a {
		x, y := float64(av), float64(b[i])
		dot += x * y
		sumA += x * x
		sumB += y * y
	}

	if denom := math.Sqrt(sumA) * math.Sqrt(sumB); denom != 0 {
		return dot / denom
	}
	return 0
}
|