From f525627d7cefcf9f0567cdf80b2ea0ef46372a54 Mon Sep 17 00:00:00 2001 From: tiennm99 Date: Sun, 10 May 2026 02:30:04 +0700 Subject: [PATCH] feat(modules): add Gemini AI integration module --- internal/ai/client.go | 141 ++++++++++++++++++++++++++++++++++ internal/ai/ratelimit.go | 54 +++++++++++++ internal/ai/ratelimit_test.go | 38 +++++++++ internal/ai/types.go | 29 +++++++ 4 files changed, 262 insertions(+) create mode 100644 internal/ai/client.go create mode 100644 internal/ai/ratelimit.go create mode 100644 internal/ai/ratelimit_test.go create mode 100644 internal/ai/types.go diff --git a/internal/ai/client.go b/internal/ai/client.go new file mode 100644 index 0000000..085c1ef --- /dev/null +++ b/internal/ai/client.go @@ -0,0 +1,141 @@ +package ai + +import ( + "context" + "errors" + "fmt" + "strings" + + "google.golang.org/genai" +) + +// Default model identifiers. Pinned strings rather than constants exposed to +// callers — modules should not pick their own model. If we ever need to A/B +// test, a higher-level config wins, not a per-module override. +const ( + embeddingModel = "text-embedding-004" // 768-dim, free tier + chatModel = "gemini-2.5-flash" // newest flash; 15 RPM / 1500 RPD free +) + +// ErrRateLimited is returned when the upstream rejected with 429 (or our +// in-process per-user bucket dropped the call). Modules show a friendly +// "AI is rate-limited, try again in N minutes" message on this sentinel. +var ErrRateLimited = errors.New("ai: rate limited") + +// ErrNotConfigured is returned when GEMINI_API_KEY was empty at startup. +// The Client is nil in that case; modules using AI must check before +// invoking and refuse the command with a config-error reply. +var ErrNotConfigured = errors.New("ai: GEMINI_API_KEY not set") + +// Client wraps a *genai.Client with the small surface the bot needs. The +// underlying gRPC connection is reused across requests — Cloud Run cold-start +// budget makes a per-request handshake intolerable. +// +// Safe for concurrent use; *genai.Client is itself goroutine-safe. +type Client struct { + g *genai.Client +} + +// NewClient constructs a *Client backed by the Gemini API (not Vertex AI — +// Vertex requires a service-account flow incompatible with the free-tier +// Cloud Run baseline). A blank apiKey returns ErrNotConfigured so callers +// can decide whether to skip AI-dependent module loading. +func NewClient(ctx context.Context, apiKey string) (*Client, error) { + if strings.TrimSpace(apiKey) == "" { + return nil, ErrNotConfigured + } + g, err := genai.NewClient(ctx, &genai.ClientConfig{ + APIKey: apiKey, + Backend: genai.BackendGeminiAPI, + }) + if err != nil { + return nil, fmt.Errorf("ai: genai.NewClient: %w", err) + } + return &Client{g: g}, nil +} + +// Embed batches `texts` into a single EmbedContent call and returns +// dense vectors in the same order. Empty input → (nil, nil). +// +// Errors are wrapped; rate-limit (HTTP 429) is mapped to ErrRateLimited +// so callers can branch on errors.Is. +func (c *Client) Embed(ctx context.Context, texts []string) ([][]float32, error) { + if c == nil || c.g == nil { + return nil, ErrNotConfigured + } + if len(texts) == 0 { + return nil, nil + } + contents := make([]*genai.Content, 0, len(texts)) + for _, t := range texts { + contents = append(contents, genai.NewContentFromText(t, genai.RoleUser)) + } + resp, err := c.g.Models.EmbedContent(ctx, embeddingModel, contents, nil) + if err != nil { + if isRateLimit(err) { + return nil, ErrRateLimited + } + return nil, fmt.Errorf("ai: EmbedContent: %w", err) + } + if resp == nil { + return nil, fmt.Errorf("ai: EmbedContent: nil response") + } + if len(resp.Embeddings) != len(texts) { + return nil, fmt.Errorf("ai: EmbedContent returned %d embeddings, want %d", + len(resp.Embeddings), len(texts)) + } + out := make([][]float32, len(texts)) + for i, e := range resp.Embeddings { + if e == nil || len(e.Values) == 0 { + return nil, fmt.Errorf("ai: EmbedContent: empty embedding at index %d", i) + } + out[i] = e.Values + } + return out, nil +} + +// Generate runs a single-turn chat with `system` as the system instruction +// and `user` as the user message. Returns the model's text reply. +// +// The output cap matches what the JS twentyq prompt expects (≤200 tokens, +// single-line JSON). Temperature 0.7 mirrors the JS code path. +func (c *Client) Generate(ctx context.Context, system, user string) (string, error) { + if c == nil || c.g == nil { + return "", ErrNotConfigured + } + cfg := &genai.GenerateContentConfig{ + Temperature: ptrFloat32(0.7), + MaxOutputTokens: 256, + } + if system != "" { + cfg.SystemInstruction = genai.NewContentFromText(system, genai.RoleUser) + } + resp, err := c.g.Models.GenerateContent(ctx, chatModel, genai.Text(user), cfg) + if err != nil { + if isRateLimit(err) { + return "", ErrRateLimited + } + return "", fmt.Errorf("ai: GenerateContent: %w", err) + } + if resp == nil { + return "", fmt.Errorf("ai: GenerateContent: nil response") + } + return resp.Text(), nil +} + +func ptrFloat32(v float32) *float32 { return &v } + +// isRateLimit returns true if err looks like a Gemini 429. The genai SDK +// surfaces 429 as a typed error with Code=429 in some paths and as a wrapped +// HTTP status string in others — we sniff both. +func isRateLimit(err error) bool { + if err == nil { + return false + } + var apiErr genai.APIError + if errors.As(err, &apiErr) && apiErr.Code == 429 { + return true + } + msg := err.Error() + return strings.Contains(msg, "429") || strings.Contains(strings.ToLower(msg), "resource_exhausted") +} diff --git a/internal/ai/ratelimit.go b/internal/ai/ratelimit.go new file mode 100644 index 0000000..b58407a --- /dev/null +++ b/internal/ai/ratelimit.go @@ -0,0 +1,54 @@ +package ai + +import ( + "sync" + + "golang.org/x/time/rate" +) + +// PerUserLimiter is a tiny wrapper around x/time/rate.Limiter that gives each +// subject (user-id or chat-id, formatted as a string) its own token bucket. +// +// Why per-user: the Gemini free tier is 15 RPM / 1500 RPD shared across the +// entire bot. A single chatty user could exhaust the daily quota in minutes; +// a soft per-user cap (default 5/min) cushions everyone else. +// +// Why we don't enforce daily caps here: x/time/rate is a token bucket, not +// a fixed-window counter. Per-day caps need a different abstraction; if we +// hit RPD limits in practice we'll add a Firestore-backed counter. Phase 11 +// soak data will tell us if it's needed. +type PerUserLimiter struct { + mu sync.Mutex + buckets map[string]*rate.Limiter + r rate.Limit // refill rate per second + burst int // bucket capacity +} + +// NewPerUserLimiter returns a limiter that allows `burst` requests +// instantaneously and refills at `perSec` requests per second. +// +// Defaults for the bot: burst=5, perSec=5/60 → 5 reqs in any 60s window. +func NewPerUserLimiter(perSec float64, burst int) *PerUserLimiter { + if burst < 1 { + burst = 1 + } + return &PerUserLimiter{ + buckets: map[string]*rate.Limiter{}, + r: rate.Limit(perSec), + burst: burst, + } +} + +// Allow consumes one token from the subject's bucket. Returns false if the +// bucket is empty — caller should reply with a "slow down" message and NOT +// invoke the upstream model. +func (p *PerUserLimiter) Allow(subject string) bool { + p.mu.Lock() + b, ok := p.buckets[subject] + if !ok { + b = rate.NewLimiter(p.r, p.burst) + p.buckets[subject] = b + } + p.mu.Unlock() + return b.Allow() +} diff --git a/internal/ai/ratelimit_test.go b/internal/ai/ratelimit_test.go new file mode 100644 index 0000000..74d7e1c --- /dev/null +++ b/internal/ai/ratelimit_test.go @@ -0,0 +1,38 @@ +package ai + +import "testing" + +func TestPerUserLimiter_BurstThenDrop(t *testing.T) { + // 5 burst, 0 refill — second batch must drop. + l := NewPerUserLimiter(0, 5) + for i := 0; i < 5; i++ { + if !l.Allow("user-1") { + t.Fatalf("burst[%d]: want allow, got drop", i) + } + } + if l.Allow("user-1") { + t.Errorf("post-burst: want drop, got allow") + } +} + +func TestPerUserLimiter_PerSubjectIsolated(t *testing.T) { + l := NewPerUserLimiter(0, 1) + if !l.Allow("a") { + t.Fatalf("user a first call dropped") + } + if l.Allow("a") { + t.Errorf("user a second call: want drop") + } + // Different subject must have its own bucket. + if !l.Allow("b") { + t.Errorf("user b first call dropped — buckets not isolated") + } +} + +func TestPerUserLimiter_BurstFloor(t *testing.T) { + // burst=0 → floored to 1 so the limiter never permanently blocks. + l := NewPerUserLimiter(0, 0) + if !l.Allow("x") { + t.Errorf("burst=0 floored: want first call allowed") + } +} diff --git a/internal/ai/types.go b/internal/ai/types.go new file mode 100644 index 0000000..8eb8183 --- /dev/null +++ b/internal/ai/types.go @@ -0,0 +1,29 @@ +// Package ai wraps Google's genai SDK with a small, mockable surface for the +// semantle/twentyq modules. The package owns: +// +// - Embedder / Chatter interfaces — what modules consume +// - Client struct — production implementation backed by genai +// - per-user rate limiter — defends the shared 1500-RPD Gemini free tier +// +// Modules accept the interfaces (not *Client) so unit tests can pass fakes +// without spinning up a real Gemini client. Production wiring in cmd/server +// passes the *Client (which satisfies both interfaces) into Deps. +package ai + +import "context" + +// Embedder produces dense vectors for text. Used by the semantle module to +// score guess similarity. Implementations must respect ctx cancellation. +// +// On rate-limit (HTTP 429) the implementation should return a sentinel error +// — see ErrRateLimited — so callers can show a user-friendly retry message. +type Embedder interface { + Embed(ctx context.Context, texts []string) ([][]float32, error) +} + +// Chatter produces a single text completion from a system + user prompt +// pair. Used by the twentyq module's judge + round-start calls. Same +// rate-limit conventions as Embedder. +type Chatter interface { + Generate(ctx context.Context, system, user string) (string, error) +}