refactor(doantu): swap ConceptNet for Workers AI bge-m3 embeddings

Mirror the semantle migration but with @cf/baai/bge-m3 — BAAI's
multilingual embedding model — because the English-only BGE variants
can't produce meaningful Vietnamese vectors (their tokenizer shreds
diacritics into noisy byte-level subwords).

bge-m3 is trained across 194 languages incl. Vietnamese and is
actually cheaper in Neurons (1,075 per M input tokens vs 1,841 for
bge-small-en-v1.5). Vocab check reuses the local Viet22K wordlist as
an in-memory Set — O(1) OOV detection, no upstream call.

Also add a test file for the module (mirrors semantle coverage plus
Vietnamese-specific cases: diacritics, multi-syllable compounds).
This commit is contained in:
2026-04-22 23:53:36 +07:00
parent 31ced88b78
commit 0740dffd6b
3 changed files with 214 additions and 108 deletions
+63 -102
View File
@@ -1,21 +1,29 @@
/**
* @file ConceptNet API client for the doantu module (Vietnamese).
* @file Cloudflare Workers AI client for the doantu module (Vietnamese semantle).
*
* Mirrors semantle/api-client.js one-for-one — same endpoints, same
* response shape — with two Vietnamese-specific changes:
* 1. Concept URIs use `/c/vi/…` instead of `/c/en/…`.
* 2. Multi-word terms are joined with an underscore for URL building
* (`con chó` → `/c/vi/con_chó`), so the board can still display the
* space-separated form while ConceptNet resolves the canonical URI.
* Mirrors semantle/api-client.js but uses `@cf/baai/bge-m3` — BAAI's
* multilingual embedding model — because the English-only BGE variants
* can't produce meaningful Vietnamese vectors (their tokenizer is
* English-centric and Vietnamese diacritics get shredded into noisy
* byte-level subwords).
*
* Vocabulary: the curated `words-data.js` list (duyet/vietnamese-wordlist
* Viet22K) doubles as the in/out-of-vocabulary set. Lookups are O(1) via
* Set.has(), so OOV detection needs no extra round-trip.
*
* The returned `similarity(a, b)` shape matches the semantle sibling so
* handlers/render/state can be reused unchanged.
*/
import { randomLine } from "./wordlist.js";
import WORDS from "./words-data.js";
const DEFAULT_API_BASE = "https://api.conceptnet.io";
const DEFAULT_TIMEOUT_MS = 5000;
const USER_AGENT = "miti99bot/doantu";
const MAX_RANDOM_ATTEMPTS = 5;
const LANG = "vi";
// BGE-M3: multilingual (194 languages incl. Vietnamese), 1024 dimensions,
// ~1,075 Neurons per M input tokens — cheaper than bge-small-en-v1.5.
const DEFAULT_MODEL = "@cf/baai/bge-m3";
// O(1) membership lookup for OOV detection. Built once per isolate.
const VOCAB = new Set(WORDS);
export class UpstreamError extends Error {
/** @param {string} message @param {{status?: number, body?: string, cause?: unknown}} [meta] */
@@ -28,117 +36,70 @@ export class UpstreamError extends Error {
}
}
/**
 * Convert a display term to ConceptNet's concept-URI form: trim, then
 * collapse internal whitespace runs into single underscores
 * (`con chó` → `con_chó`).
 */
function toConceptTerm(word) {
  const trimmed = String(word).trim();
  return trimmed.replace(/\s+/g, "_");
}
function buildUrl(base, path, params = {}) {
const normalized = String(base).replace(/\/+$/, "");
const url = new URL(`${normalized}${path}`);
for (const [k, v] of Object.entries(params)) {
if (v === undefined || v === null) continue;
url.searchParams.set(k, String(v));
function cosineSimilarity(a, b) {
if (!a || !b || a.length !== b.length) return null;
let dot = 0;
let normA = 0;
let normB = 0;
for (let i = 0; i < a.length; i++) {
dot += a[i] * b[i];
normA += a[i] * a[i];
normB += b[i] * b[i];
}
return url.toString();
}
/**
 * GET `url` with a hard timeout, expecting a JSON body.
 * Every failure mode — network error/abort, non-2xx status, unparseable
 * body — is surfaced as an UpstreamError so callers have one catch point.
 * @param {string} url
 * @param {number} timeoutMs — aborts the request via AbortController.
 * @returns {Promise<unknown>} the parsed JSON payload.
 * @throws {UpstreamError}
 */
async function fetchJson(url, timeoutMs) {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);
  let res;
  try {
    res = await fetch(url, {
      headers: { "User-Agent": USER_AGENT, Accept: "application/json" },
      signal: controller.signal,
    });
  } catch (err) {
    throw new UpstreamError("conceptnet fetch failed", { cause: err });
  } finally {
    // Cleared on both paths — same effect as the two explicit calls,
    // but impossible to miss one.
    clearTimeout(timer);
  }
  const body = await res.text();
  if (!res.ok) {
    throw new UpstreamError(`conceptnet HTTP ${res.status}`, {
      status: res.status,
      body: body.slice(0, 500),
    });
  }
  try {
    return JSON.parse(body);
  } catch (err) {
    throw new UpstreamError("conceptnet non-JSON response", { cause: err });
  }
}
function hasEdges(concept) {
return Array.isArray(concept?.edges) && concept.edges.length > 0;
const denom = Math.sqrt(normA) * Math.sqrt(normB);
return denom === 0 ? null : dot / denom;
}
/**
* @param {string} [apiBase] — override for mirrors/tests.
* @param {{ timeoutMs?: number }} [opts]
* @param {{ run: (model: string, input: { text: string[] }) => Promise<{ data: number[][] }> }} ai
* — Workers AI binding (`env.AI`). Tests pass a fake with the same `.run()` shape.
* @param {{ model?: string }} [opts]
*/
export function createClient(apiBase = DEFAULT_API_BASE, { timeoutMs = DEFAULT_TIMEOUT_MS } = {}) {
/** @param {string} term */
function concept(term) {
const cn = toConceptTerm(term);
return fetchJson(buildUrl(apiBase, `/c/${LANG}/${encodeURIComponent(cn)}`), timeoutMs);
export function createClient(ai, { model = DEFAULT_MODEL } = {}) {
if (!ai || typeof ai.run !== "function") {
throw new TypeError("createClient: ai binding with .run(model, input) is required");
}
/** @param {string} a @param {string} b */
function relatedness(a, b) {
return fetchJson(
buildUrl(apiBase, "/relatedness", {
node1: `/c/${LANG}/${toConceptTerm(a)}`,
node2: `/c/${LANG}/${toConceptTerm(b)}`,
}),
timeoutMs,
);
/**
 * Embed target + guess in a single Workers AI call (one round-trip).
 * @param {string} a target word
 * @param {string} b guess word
 * @returns {Promise<[number[], number[]]>} vectors for `a` and `b`.
 * @throws {UpstreamError} when the binding rejects or the payload lacks
 *   the expected `data: number[][]` shape with at least two rows.
 */
async function embedPair(a, b) {
  let response;
  try {
    response = await ai.run(model, { text: [a, b] });
  } catch (err) {
    throw new UpstreamError("workers-ai embedding failed", { cause: err });
  }
  const vectors = response?.data;
  if (!Array.isArray(vectors) || vectors.length < 2) {
    throw new UpstreamError("workers-ai returned malformed embedding payload");
  }
  const [vecA, vecB] = vectors;
  return [vecA, vecB];
}
return {
concept,
relatedness,
/**
* Pick a target word from the local pool, verify via the concept endpoint,
* fall back to an unverified pick after a few misses.
* Pick a target word from the local Vietnamese pool. The pool IS the
* vocabulary, so every pick is trivially verified.
* @returns {Promise<{ word: string, verified: boolean }>}
*/
async randomWord() {
for (let i = 0; i < MAX_RANDOM_ATTEMPTS; i++) {
const candidate = randomLine();
try {
const c = await concept(candidate);
if (hasEdges(c)) return { word: candidate, verified: true };
} catch {
// swallow — try the next candidate
}
}
return { word: randomLine(), verified: false };
return { word: randomLine(), verified: true };
},
/**
* Cosine-like similarity. Runs concept edge-check for `b` in parallel
* with the relatedness call so OOV guesses are caught on the same round-trip.
* Shape mirrors the semantle sibling.
* Cosine similarity between `a` (target) and `b` (guess). Uses the local
* Vietnamese wordlist as vocabulary — unknown words return
* `in_vocab_b: false` with `similarity: null` and skip inference.
*
* @param {string} a
* @param {string} b
*/
async similarity(a, b) {
const [conceptB, rel] = await Promise.all([concept(b), relatedness(a, b)]);
const inVocabB = hasEdges(conceptB);
const value = typeof rel?.value === "number" ? rel.value : null;
return {
a,
b,
canonical_a: a,
canonical_b: b,
in_vocab_a: true,
in_vocab_b: inVocabB,
similarity: inVocabB ? value : null,
};
const base = { a, b, canonical_a: a, canonical_b: b, in_vocab_a: true };
if (!VOCAB.has(b)) {
return { ...base, in_vocab_b: false, similarity: null };
}
const [vecA, vecB] = await embedPair(a, b);
const sim = cosineSimilarity(vecA, vecB);
return { ...base, in_vocab_b: true, similarity: sim };
},
};
}
+7 -6
View File
@@ -1,10 +1,11 @@
/**
* @file Doantu module — Vietnamese semantle.
*
* Targets from a curated local wordlist (duyet/vietnamese-wordlist Viet22K);
* similarity scores from api.conceptnet.io's `/relatedness` endpoint against
* `/c/vi/<term>` concept URIs. All commands are `protected` — listed in
* /help but hidden from Telegram's native / autocomplete menu.
* Targets from a curated local wordlist (duyet/vietnamese-wordlist Viet22K;
* the same list doubles as the vocabulary for OOV detection). Similarity scores
* come from cosine distance between `@cf/baai/bge-m3` multilingual embeddings
* produced by the `env.AI` binding. All commands are `protected` — listed
* in /help but hidden from Telegram's native / autocomplete menu.
*/
import { createClient } from "./api-client.js";
@@ -18,9 +19,9 @@ let client = null;
/** @type {import("../registry.js").BotModule} */
const doantuModule = {
name: "doantu",
init: async ({ db: store }) => {
init: async ({ db: store, env }) => {
db = store;
client = createClient();
client = createClient(env.AI);
},
commands: [
{
+144
View File
@@ -0,0 +1,144 @@
import { describe, expect, it, vi } from "vitest";
import { UpstreamError, createClient } from "../../../src/modules/doantu/api-client.js";
/**
 * Deterministic pseudo-embedding: element i is sin(seed * (i + 1)), so
 * cosine scores are reproducible in tests without hardcoding floats.
 * Defaults to 1024 dimensions to match bge-m3's output width.
 */
function fakeVector(seed, dim = 1024) {
  return Array.from({ length: dim }, (_, i) => Math.sin(seed * (i + 1)));
}
/**
 * Minimal stand-in for the `env.AI` binding: `.run()` is a vitest spy
 * that delegates to `impl(model, input)` and records its calls.
 */
function fakeAi(impl) {
  const run = vi.fn(impl);
  return { run };
}
describe("doantu/api-client", () => {
  describe("UpstreamError", () => {
    it("stores status and body metadata", () => {
      const err = new UpstreamError("test", { status: 404, body: "not found" });
      expect(err.name).toBe("UpstreamError");
      expect(err.message).toBe("test");
      expect(err.status).toBe(404);
      expect(err.body).toBe("not found");
    });
    it("stores cause when provided", () => {
      const underlying = new Error("underlying");
      const wrapped = new UpstreamError("wrapper", { cause: underlying });
      expect(wrapped.cause).toBe(underlying);
    });
  });
  describe("createClient", () => {
    it("throws without a valid AI binding", () => {
      // Missing binding, binding without run, and non-callable run all reject.
      for (const bad of [null, {}, { run: "not a function" }]) {
        expect(() => createClient(bad)).toThrow(TypeError);
      }
    });
    it("similarity batches target + guess in a single run() call with bge-m3", async () => {
      const aiStub = fakeAi(async (_model, { text }) => ({
        shape: [text.length, 1024],
        data: text.map((_, i) => fakeVector(i + 1)),
      }));
      await createClient(aiStub).similarity("chó", "mèo");
      expect(aiStub.run).toHaveBeenCalledTimes(1);
      const [model, input] = aiStub.run.mock.calls[0];
      expect(model).toBe("@cf/baai/bge-m3");
      expect(input).toEqual({ text: ["chó", "mèo"] });
    });
    it("similarity returns cosine score for an in-vocab Vietnamese guess", async () => {
      const aiStub = fakeAi(async (_model, { text }) => ({
        data: text.map((_, i) => fakeVector(i + 1)),
      }));
      const result = await createClient(aiStub).similarity("chó", "mèo");
      expect(result.in_vocab_a).toBe(true);
      expect(result.in_vocab_b).toBe(true);
      expect(result.canonical_a).toBe("chó");
      expect(result.canonical_b).toBe("mèo");
      expect(typeof result.similarity).toBe("number");
      expect(result.similarity).toBeGreaterThan(-1);
      expect(result.similarity).toBeLessThanOrEqual(1);
    });
    it('similarity accepts multi-syllable Vietnamese words in vocab ("a dua")', async () => {
      const aiStub = fakeAi(async () => ({ data: [fakeVector(1), fakeVector(2)] }));
      const result = await createClient(aiStub).similarity("chó", "a dua");
      expect(result.in_vocab_b).toBe(true);
      expect(result.similarity).not.toBeNull();
    });
    it("similarity returns 1 for identical vectors", async () => {
      const shared = fakeVector(7);
      const aiStub = fakeAi(async () => ({ data: [shared, shared] }));
      const result = await createClient(aiStub).similarity("chó", "mèo");
      expect(result.similarity).toBeCloseTo(1, 10);
    });
    it("similarity skips AI call for OOV guess and flags in_vocab_b:false", async () => {
      const aiStub = fakeAi(async () => ({ data: [fakeVector(1), fakeVector(2)] }));
      const result = await createClient(aiStub).similarity("chó", "zzzkhôngcótrongtừđiển");
      expect(result.in_vocab_b).toBe(false);
      expect(result.similarity).toBe(null);
      expect(aiStub.run).not.toHaveBeenCalled();
    });
    it("similarity wraps AI.run rejection as UpstreamError", async () => {
      const aiStub = fakeAi(async () => {
        throw new Error("boom");
      });
      await expect(createClient(aiStub).similarity("chó", "mèo")).rejects.toMatchObject({
        name: "UpstreamError",
      });
    });
    it("similarity throws UpstreamError on malformed payload", async () => {
      // Only one vector back for a two-word request.
      const aiStub = fakeAi(async () => ({ data: [fakeVector(1)] }));
      await expect(createClient(aiStub).similarity("chó", "mèo")).rejects.toMatchObject({
        name: "UpstreamError",
      });
    });
    it("similarity returns null score when a vector norm is zero", async () => {
      const zeroVec = new Array(1024).fill(0);
      const aiStub = fakeAi(async () => ({ data: [zeroVec, fakeVector(1)] }));
      const result = await createClient(aiStub).similarity("chó", "mèo");
      expect(result.in_vocab_b).toBe(true);
      expect(result.similarity).toBe(null);
    });
    it("randomWord returns a verified pick from the local pool", async () => {
      const aiStub = fakeAi(async () => ({ data: [] }));
      const pick = await createClient(aiStub).randomWord();
      expect(typeof pick.word).toBe("string");
      expect(pick.word.length).toBeGreaterThan(0);
      expect(pick.verified).toBe(true);
      expect(aiStub.run).not.toHaveBeenCalled();
    });
    it("supports model override via options", async () => {
      const aiStub = fakeAi(async () => ({ data: [fakeVector(1), fakeVector(2)] }));
      const overridden = createClient(aiStub, { model: "@cf/baai/bge-large-en-v1.5" });
      await overridden.similarity("chó", "mèo");
      expect(aiStub.run.mock.calls[0][0]).toBe("@cf/baai/bge-large-en-v1.5");
    });
  });
});