From f525627d7cefcf9f0567cdf80b2ea0ef46372a54 Mon Sep 17 00:00:00 2001
From: tiennm99 <tiennm99@outlook.com>
Date: Sun, 10 May 2026 02:30:04 +0700
Subject: [PATCH] feat(modules): add Gemini AI integration module

---
 internal/ai/client.go         | 141 ++++++++++++++++++++++++++++++++++
 internal/ai/ratelimit.go      |  54 +++++++++++++
 internal/ai/ratelimit_test.go |  38 +++++++++
 internal/ai/types.go          |  29 +++++++
 4 files changed, 262 insertions(+)
 create mode 100644 internal/ai/client.go
 create mode 100644 internal/ai/ratelimit.go
 create mode 100644 internal/ai/ratelimit_test.go
 create mode 100644 internal/ai/types.go

diff --git a/internal/ai/client.go b/internal/ai/client.go
new file mode 100644
index 0000000..085c1ef
--- /dev/null
+++ b/internal/ai/client.go
@@ -0,0 +1,141 @@
+package ai
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"strings"
+
+	"google.golang.org/genai"
+)
+
+// Default model identifiers. Pinned strings rather than constants exposed to
+// callers — modules should not pick their own model. If we ever need to A/B
+// test, a higher-level config wins, not a per-module override.
+const (
+	embeddingModel = "text-embedding-004" // 768-dim, free tier
+	chatModel      = "gemini-2.5-flash"   // newest flash; 15 RPM / 1500 RPD free
+)
+
+// ErrRateLimited is returned when the upstream rejected with 429 (or our
+// in-process per-user bucket dropped the call). Modules show a friendly
+// "AI is rate-limited, try again in N minutes" message on this sentinel.
+var ErrRateLimited = errors.New("ai: rate limited")
+
+// ErrNotConfigured is returned when GEMINI_API_KEY was empty at startup.
+// The Client is nil in that case; modules using AI must check before
+// invoking and refuse the command with a config-error reply.
+var ErrNotConfigured = errors.New("ai: GEMINI_API_KEY not set")
+
+// Client wraps a *genai.Client with the small surface the bot needs. The
+// underlying gRPC connection is reused across requests — Cloud Run cold-start
+// budget makes a per-request handshake intolerable.
+//
+// Safe for concurrent use; *genai.Client is itself goroutine-safe.
+type Client struct {
+	g *genai.Client
+}
+
+// NewClient constructs a *Client backed by the Gemini API (not Vertex AI —
+// Vertex requires a service-account flow incompatible with the free-tier
+// Cloud Run baseline). A blank apiKey returns ErrNotConfigured so callers
+// can decide whether to skip AI-dependent module loading.
+func NewClient(ctx context.Context, apiKey string) (*Client, error) {
+	if strings.TrimSpace(apiKey) == "" {
+		return nil, ErrNotConfigured
+	}
+	g, err := genai.NewClient(ctx, &genai.ClientConfig{
+		APIKey:  apiKey,
+		Backend: genai.BackendGeminiAPI,
+	})
+	if err != nil {
+		return nil, fmt.Errorf("ai: genai.NewClient: %w", err)
+	}
+	return &Client{g: g}, nil
+}
+
+// Embed batches `texts` into a single EmbedContent call and returns
+// dense vectors in the same order. Empty input → (nil, nil).
+//
+// Errors are wrapped; rate-limit (HTTP 429) is mapped to ErrRateLimited
+// so callers can branch on errors.Is.
+func (c *Client) Embed(ctx context.Context, texts []string) ([][]float32, error) {
+	if c == nil || c.g == nil {
+		return nil, ErrNotConfigured
+	}
+	if len(texts) == 0 {
+		return nil, nil
+	}
+	contents := make([]*genai.Content, 0, len(texts))
+	for _, t := range texts {
+		contents = append(contents, genai.NewContentFromText(t, genai.RoleUser))
+	}
+	resp, err := c.g.Models.EmbedContent(ctx, embeddingModel, contents, nil)
+	if err != nil {
+		if isRateLimit(err) {
+			return nil, ErrRateLimited
+		}
+		return nil, fmt.Errorf("ai: EmbedContent: %w", err)
+	}
+	if resp == nil {
+		return nil, fmt.Errorf("ai: EmbedContent: nil response")
+	}
+	if len(resp.Embeddings) != len(texts) {
+		return nil, fmt.Errorf("ai: EmbedContent returned %d embeddings, want %d",
+			len(resp.Embeddings), len(texts))
+	}
+	out := make([][]float32, len(texts))
+	for i, e := range resp.Embeddings {
+		if e == nil || len(e.Values) == 0 {
+			return nil, fmt.Errorf("ai: EmbedContent: empty embedding at index %d", i)
+		}
+		out[i] = e.Values
+	}
+	return out, nil
+}
+
+// Generate runs a single-turn chat with `system` as the system instruction
+// and `user` as the user message. Returns the model's text reply.
+//
+// The output cap matches what the JS twentyq prompt expects (≤200 tokens,
+// single-line JSON). Temperature 0.7 mirrors the JS code path.
+func (c *Client) Generate(ctx context.Context, system, user string) (string, error) {
+	if c == nil || c.g == nil {
+		return "", ErrNotConfigured
+	}
+	cfg := &genai.GenerateContentConfig{
+		Temperature:     ptrFloat32(0.7),
+		MaxOutputTokens: 256,
+	}
+	if system != "" {
+		cfg.SystemInstruction = genai.NewContentFromText(system, genai.RoleUser)
+	}
+	resp, err := c.g.Models.GenerateContent(ctx, chatModel, genai.Text(user), cfg)
+	if err != nil {
+		if isRateLimit(err) {
+			return "", ErrRateLimited
+		}
+		return "", fmt.Errorf("ai: GenerateContent: %w", err)
+	}
+	if resp == nil {
+		return "", fmt.Errorf("ai: GenerateContent: nil response")
+	}
+	return resp.Text(), nil
+}
+
+func ptrFloat32(v float32) *float32 { return &v }
+
+// isRateLimit returns true if err looks like a Gemini 429. The genai SDK
+// surfaces 429 as a typed error with Code=429 in some paths and as a wrapped
+// HTTP status string in others — we sniff both.
+func isRateLimit(err error) bool {
+	if err == nil {
+		return false
+	}
+	var apiErr genai.APIError
+	if errors.As(err, &apiErr) && apiErr.Code == 429 {
+		return true
+	}
+	msg := err.Error()
+	return strings.Contains(msg, "429") || strings.Contains(strings.ToLower(msg), "resource_exhausted")
+}
diff --git a/internal/ai/ratelimit.go b/internal/ai/ratelimit.go
new file mode 100644
index 0000000..b58407a
--- /dev/null
+++ b/internal/ai/ratelimit.go
@@ -0,0 +1,54 @@
+package ai
+
+import (
+	"sync"
+
+	"golang.org/x/time/rate"
+)
+
+// PerUserLimiter is a tiny wrapper around x/time/rate.Limiter that gives each
+// subject (user-id or chat-id, formatted as a string) its own token bucket.
+//
+// Why per-user: the Gemini free tier is 15 RPM / 1500 RPD shared across the
+// entire bot. A single chatty user could exhaust the daily quota in minutes;
+// a soft per-user cap (default 5/min) cushions everyone else.
+//
+// Why we don't enforce daily caps here: x/time/rate is a token bucket, not
+// a fixed-window counter. Per-day caps need a different abstraction; if we
+// hit RPD limits in practice we'll add a Firestore-backed counter. Phase 11
+// soak data will tell us if it's needed.
+type PerUserLimiter struct {
+	mu      sync.Mutex
+	buckets map[string]*rate.Limiter
+	r       rate.Limit // refill rate per second
+	burst   int        // bucket capacity
+}
+
+// NewPerUserLimiter returns a limiter that allows `burst` requests
+// instantaneously and refills at `perSec` requests per second.
+//
+// Defaults for the bot: burst=5, perSec=5/60 → 5 reqs in any 60s window.
+func NewPerUserLimiter(perSec float64, burst int) *PerUserLimiter {
+	if burst < 1 {
+		burst = 1
+	}
+	return &PerUserLimiter{
+		buckets: map[string]*rate.Limiter{},
+		r:       rate.Limit(perSec),
+		burst:   burst,
+	}
+}
+
+// Allow consumes one token from the subject's bucket. Returns false if the
+// bucket is empty — caller should reply with a "slow down" message and NOT
+// invoke the upstream model.
+func (p *PerUserLimiter) Allow(subject string) bool {
+	p.mu.Lock()
+	b, ok := p.buckets[subject]
+	if !ok {
+		b = rate.NewLimiter(p.r, p.burst)
+		p.buckets[subject] = b
+	}
+	p.mu.Unlock()
+	return b.Allow()
+}
diff --git a/internal/ai/ratelimit_test.go b/internal/ai/ratelimit_test.go
new file mode 100644
index 0000000..74d7e1c
--- /dev/null
+++ b/internal/ai/ratelimit_test.go
@@ -0,0 +1,38 @@
+package ai
+
+import "testing"
+
+func TestPerUserLimiter_BurstThenDrop(t *testing.T) {
+	// 5 burst, 0 refill — second batch must drop.
+	l := NewPerUserLimiter(0, 5)
+	for i := 0; i < 5; i++ {
+		if !l.Allow("user-1") {
+			t.Fatalf("burst[%d]: want allow, got drop", i)
+		}
+	}
+	if l.Allow("user-1") {
+		t.Errorf("post-burst: want drop, got allow")
+	}
+}
+
+func TestPerUserLimiter_PerSubjectIsolated(t *testing.T) {
+	l := NewPerUserLimiter(0, 1)
+	if !l.Allow("a") {
+		t.Fatalf("user a first call dropped")
+	}
+	if l.Allow("a") {
+		t.Errorf("user a second call: want drop")
+	}
+	// Different subject must have its own bucket.
+	if !l.Allow("b") {
+		t.Errorf("user b first call dropped — buckets not isolated")
+	}
+}
+
+func TestPerUserLimiter_BurstFloor(t *testing.T) {
+	// burst=0 → floored to 1 so the limiter never permanently blocks.
+	l := NewPerUserLimiter(0, 0)
+	if !l.Allow("x") {
+		t.Errorf("burst=0 floored: want first call allowed")
+	}
+}
diff --git a/internal/ai/types.go b/internal/ai/types.go
new file mode 100644
index 0000000..8eb8183
--- /dev/null
+++ b/internal/ai/types.go
@@ -0,0 +1,29 @@
+// Package ai wraps Google's genai SDK with a small, mockable surface for the
+// semantle/twentyq modules. The package owns:
+//
+//   - Embedder / Chatter interfaces — what modules consume
+//   - Client struct           — production implementation backed by genai
+//   - per-user rate limiter   — defends the shared 1500-RPD Gemini free tier
+//
+// Modules accept the interfaces (not *Client) so unit tests can pass fakes
+// without spinning up a real Gemini client. Production wiring in cmd/server
+// passes the *Client (which satisfies both interfaces) into Deps.
+package ai
+
+import "context"
+
+// Embedder produces dense vectors for text. Used by the semantle module to
+// score guess similarity. Implementations must respect ctx cancellation.
+//
+// On rate-limit (HTTP 429) the implementation should return a sentinel error
+// — see ErrRateLimited — so callers can show a user-friendly retry message.
+type Embedder interface {
+	Embed(ctx context.Context, texts []string) ([][]float32, error)
+}
+
+// Chatter produces a single text completion from a system + user prompt
+// pair. Used by the twentyq module's judge + round-start calls. Same
+// rate-limit conventions as Embedder.
+type Chatter interface {
+	Generate(ctx context.Context, system, user string) (string, error)
+}