From 0887a07367e1df5b25cc323ebf7c1e9bbc141fb3 Mon Sep 17 00:00:00 2001
From: tiennm99
Date: Fri, 24 Apr 2026 14:57:58 +0700
Subject: [PATCH] fix(twentyq): drop function calling, use JSON-in-content for
 Gemma 4 compat
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Gemma 4 likely rejects the flat "traditional" tools schema we were
sending (the docs show the OpenAI-wrapped tools shape for this model) —
causing env.AI.run to throw and users to see the "AI service hiccup"
reply every turn.

Switch to the universal approach:

- system prompt asks the model for a one-line JSON {is_guess, answer, hint}
- ai-client.extractText handles both Workers-AI and OpenAI response shapes
- parseJudgementJson walks brace depth to extract JSON from stray prose /
  accidental code fences
- logs twentyq_ai_throw / twentyq_ai_unparseable with a preview on failure
  so future issues surface in wrangler tail immediately

Tests: 7 new (parser + extractText); 444 total pass.
---
 src/modules/twentyq/ai-client.js        | 134 +++++++++++++++---------
 src/modules/twentyq/prompts.js          |  65 ++++--------
 tests/fakes/fake-ai.js                  |  16 ++-
 tests/modules/twentyq/ai-client.test.js | 107 +++++++++++--------
 4 files changed, 179 insertions(+), 143 deletions(-)

diff --git a/src/modules/twentyq/ai-client.js b/src/modules/twentyq/ai-client.js
index 97401ee..66f4998 100644
--- a/src/modules/twentyq/ai-client.js
+++ b/src/modules/twentyq/ai-client.js
@@ -1,16 +1,17 @@
 /**
  * @file Workers AI client — wraps env.AI.run for the twentyq judge.
- * Uses traditional function calling on @cf/google/gemma-4-26b-a4b-it.
  *
- * Returns a structured { is_guess, answer, hint } per turn. On any AI
- * failure, throws UpstreamError so handlers can show a friendly retry
- * message instead of a 500.
+ * Plain JSON-in-content approach: the system prompt instructs the model to
+ * emit one-line JSON that we grep out of the response. No function calling,
+ * no tools array — maximum compatibility across Workers AI models.
  *
- * Defensive: tolerates both Cloudflare's "traditional" tool-call shape
- * ({ name, arguments }) and OpenAI-style ({ function: { name, arguments } }).
+ * On AI failure, throws UpstreamError; on unparseable output, returns
+ * defensive defaults. Either way handlers can show a friendly retry message.
+ *
+ * Rich logging on failure paths so wrangler tail surfaces the actual cause.
  */

-import { ANSWER_FUNCTION_SCHEMA, buildSystemPrompt } from "./prompts.js";
+import { buildSystemPrompt } from "./prompts.js";

 export const MODEL_ID = "@cf/google/gemma-4-26b-a4b-it";

@@ -31,47 +32,71 @@ export class UpstreamError extends Error {
 }

 /**
- * Extract the structured tool-call payload from a Workers AI response.
- * Handles both shapes: `tool_calls[].arguments` (traditional) and
- * `tool_calls[].function.arguments` (OpenAI-style, possibly stringified).
+ * Pull the response text out of whatever shape env.AI.run returned.
+ * Handles { response: "..." } (traditional Workers AI), OpenAI-compatible
+ * { choices: [{ message: { content } }] }, and plain strings.
 *
 * @param {any} response
+ * @returns {string}
+ */
+export function extractText(response) {
+  if (typeof response === "string") return response;
+  if (!response || typeof response !== "object") return "";
+  if (typeof response.response === "string") return response.response;
+  const choice = response.choices?.[0];
+  const content = choice?.message?.content;
+  if (typeof content === "string") return content;
+  if (Array.isArray(content)) {
+    return content.map((part) => (typeof part === "string" ? part : (part?.text ?? ""))).join("");
+  }
+  return "";
+}
+
+/**
+ * Extract the first `{...}` JSON object from a text blob. Tolerates
+ * leading/trailing prose, backtick fences, explanations — even though the
+ * system prompt forbids them.
+ *
+ * @param {string} text
 * @returns {object | null}
 */
-export function parseToolCall(response) {
-  const calls = response?.tool_calls;
-  if (!Array.isArray(calls) || calls.length === 0) return null;
-  const first = calls[0];
-  if (!first) return null;
-
-  // Cloudflare "traditional" shape: { name, arguments: { ... } }
-  if (first.arguments && typeof first.arguments === "object") {
-    return first.arguments;
-  }
-  // OpenAI-style: { function: { name, arguments: "..." | { ... } } }
-  const fnArgs = first.function?.arguments;
-  if (fnArgs && typeof fnArgs === "object") return fnArgs;
-  if (typeof fnArgs === "string") {
-    try {
-      return JSON.parse(fnArgs);
-    } catch {
-      return null;
+export function parseJudgementJson(text) {
+  if (!text || typeof text !== "string") return null;
+  // Strip code fences if the model disobeyed and wrapped the JSON.
+  const unfenced = text.replace(/```(?:json)?\s*/gi, "").replace(/```/g, "");
+  const start = unfenced.indexOf("{");
+  if (start === -1) return null;
+  // Walk brace depth to find the matching close.
+  let depth = 0;
+  let inString = false;
+  let isEscaped = false;
+  for (let i = start; i < unfenced.length; i++) {
+    const ch = unfenced[i];
+    if (inString) {
+      if (isEscaped) isEscaped = false;
+      else if (ch === "\\") isEscaped = true;
+      else if (ch === '"') inString = false;
+      continue;
     }
-  }
-  // Some models return stringified arguments at the top level too.
-  if (typeof first.arguments === "string") {
-    try {
-      return JSON.parse(first.arguments);
-    } catch {
-      return null;
+    if (ch === '"') inString = true;
+    else if (ch === "{") depth++;
+    else if (ch === "}") {
+      depth--;
+      if (depth === 0) {
+        const slice = unfenced.slice(start, i + 1);
+        try {
+          return JSON.parse(slice);
+        } catch {
+          return null;
+        }
+      }
     }
   }
   return null;
 }

 /**
- * Coerce any tool-call payload into the canonical { is_guess, answer, hint }
- * shape, applying defaults if fields are missing or malformed.
+ * Coerce any parsed payload into the canonical shape with defaults.
  *
  * @param {any} payload
  * @returns {{ is_guess: boolean, answer: "yes"|"no", hint: string }}
@@ -89,8 +114,9 @@ export function normalizeJudgement(payload) {
 }

 /**
- * Strip any case-insensitive substring of the secret from the hint.
- * Defense-in-depth — the system prompt forbids it but we don't trust the model.
+ * Strip any case-insensitive whole-word occurrence of the secret from the
+ * hint. Defense-in-depth — the system prompt forbids it but we don't trust
+ * the model.
  *
  * @param {string} hint
  * @param {string} target
@@ -105,11 +131,11 @@ export function redactSecret(hint, target) {
 }

 /**
- * Judge a single user turn.
+ * Judge a single user turn. Throws UpstreamError on AI failure.
 *
 * @param {{ AI: { run: (model: string, body: object) => Promise } }} env
 * @param {import("./state.js").TwentyqGameState} state
- * @param {string} userInput — already validated/normalized by validate-input.js
+ * @param {string} userInput — validated/normalized
 * @returns {Promise<{ is_guess: boolean, answer: "yes"|"no", hint: string }>}
 */
 export async function judge(env, state, userInput) {
@@ -122,15 +148,29 @@ export async function judge(env, state, userInput) {
   ];
   let response;
   try {
-    response = await env.AI.run(MODEL_ID, {
-      messages,
-      tools: [ANSWER_FUNCTION_SCHEMA],
-      temperature: 0.3,
-    });
+    response = await env.AI.run(MODEL_ID, { messages });
   } catch (err) {
+    console.log(
+      JSON.stringify({
+        msg: "twentyq_ai_throw",
+        model: MODEL_ID,
+        err: err instanceof Error ? { name: err.name, message: err.message } : String(err),
+      }),
+    );
     throw new UpstreamError("env.AI.run threw", { cause: err });
   }
-  const payload = parseToolCall(response);
+
+  const text = extractText(response);
+  const payload = parseJudgementJson(text);
+  if (!payload) {
+    console.log(
+      JSON.stringify({
+        msg: "twentyq_ai_unparseable",
+        model: MODEL_ID,
+        preview: text.slice(0, 200),
+      }),
+    );
+  }
   const judgement = normalizeJudgement(payload);
   judgement.hint = redactSecret(judgement.hint, state.target);
   return judgement;
diff --git a/src/modules/twentyq/prompts.js b/src/modules/twentyq/prompts.js
index 8062c41..7de8728 100644
--- a/src/modules/twentyq/prompts.js
+++ b/src/modules/twentyq/prompts.js
@@ -1,9 +1,11 @@
 /**
- * @file System prompt + function-calling schema for the twentyq judge.
+ * @file System prompt + JSON response schema for the twentyq judge.
  *
- * The model receives the secret target + history each turn and emits a single
- * structured `submit_answer({ is_guess, answer, hint })` call. We never let
- * the model reply in free prose — function calling guarantees parseable shape.
+ * We ask the model to emit a SINGLE-LINE JSON object with the keys
+ * {is_guess, answer, hint} — no function calling, no tools array.
+ * Function-calling shapes differ between models, and Workers AI models
+ * sometimes reject unknown params; a plain JSON instruction is universal
+ * and battle-tested. ai-client.parseJudgementJson does the parse.
  */

 /** @typedef {import("./state.js").TwentyqGameState} TwentyqGameState */
@@ -35,44 +37,23 @@ ${historyText}

 The user will send a single message — either a yes/no question (e.g. "is it
 big?", "does it have wheels?") or a final guess of a specific noun (e.g. "is
 it an organ?", "is it a piano?").

-You MUST call the submit_answer function with:
-  - is_guess (boolean): true ONLY when the user is naming a specific concrete object that is the same as, a synonym of, or extremely close to the secret. Vague descriptors like "is it big?", "is it round?" are NOT guesses. Saying "is it a string instrument?" when the secret is "guitar" is NOT a guess (too broad). Saying "is it a guitar?" IS a guess.
-  - answer ("yes" or "no"): truthful answer about the secret.
-    * If is_guess is true: "yes" only if the named object matches the secret (allowing for synonyms / minor wording). Otherwise "no".
-    * If is_guess is false: "yes" or "no" based on whether the property holds for the secret.
-  - hint (string, max 120 chars): a NEW useful clue. Vary it from prior hints. Never include the secret word, its plural, or its base form. Never reveal the answer in the hint.
+You MUST reply with exactly ONE line of JSON and NOTHING else — no prose, no backticks, no code fences, no explanation.
+
+Schema:
+{"is_guess": boolean, "answer": "yes" | "no", "hint": string}
+
+Field meanings:
+- is_guess: true ONLY when the user is naming a specific concrete object equal to, a synonym of, or extremely close to the secret. Vague descriptors ("is it big?", "is it round?") are NOT guesses. Saying "is it a string instrument?" when the secret is "guitar" is NOT a guess (too broad). Saying "is it a guitar?" IS a guess.
+- answer: truthful "yes" or "no" about the secret.
+  * If is_guess is true: "yes" only if the named object matches the secret (allowing for synonyms / minor wording). Otherwise "no".
+  * If is_guess is false: "yes" or "no" based on whether the property holds for the secret.
+- hint: a NEW useful clue in plain text, max 120 characters. Vary it from prior hints. Never include the secret word, its plural, or its base form. Never reveal the answer in the hint.

 Rules:
-- ALWAYS call submit_answer exactly once. Never reply in free text.
-- Stay consistent with prior answers above.
-- If the user input is not a yes/no question and not a guess (e.g. open-ended), still call submit_answer with answer="no", is_guess=false, and a hint asking them to rephrase as a yes/no question.`;
-}
+- Output ONLY the JSON line. No markdown fences. No prose before or after.
+- If the user input is not a valid yes/no question and not a guess, still return JSON with answer="no", is_guess=false, and a short hint asking them to rephrase.

-/**
- * Function-calling schema. Traditional Workers-AI format
- * (https://developers.cloudflare.com/workers-ai/features/function-calling/traditional/).
- */
-export const ANSWER_FUNCTION_SCHEMA = {
-  name: "submit_answer",
-  description: "Submit the truthful yes/no answer to the user's question along with a fresh hint.",
-  parameters: {
-    type: "object",
-    properties: {
-      is_guess: {
-        type: "boolean",
-        description:
-          "True ONLY if the user named a specific concrete object that matches or is a synonym of the secret.",
-      },
-      answer: {
-        type: "string",
-        enum: ["yes", "no"],
-        description: "Truthful yes/no answer about the secret.",
-      },
-      hint: {
-        type: "string",
-        description: "A new useful clue (max 120 chars) that does not contain the secret word.",
-      },
-    },
-    required: ["is_guess", "answer", "hint"],
-  },
-};
+Example outputs:
+{"is_guess": false, "answer": "yes", "hint": "it is taller than a person"}
+{"is_guess": true, "answer": "no", "hint": "its body is mostly metal pipes"}`;
+}
diff --git a/tests/fakes/fake-ai.js b/tests/fakes/fake-ai.js
index b219b92..c1667fa 100644
--- a/tests/fakes/fake-ai.js
+++ b/tests/fakes/fake-ai.js
@@ -2,8 +2,9 @@
  * @file fake-ai — minimal stub for the Workers AI binding (env.AI).
  *
  * Real shape: `{ run(modelId, body) -> Promise }`. Tests configure the
- * mock via `mockJudgement(ai, { is_guess, answer, hint })` to return the
- * structured tool-call response that ai-client.parseToolCall consumes.
+ * mock via `mockJudgement(ai, { is_guess, answer, hint })` to return a
+ * Workers-AI-traditional { response: "..." } payload that
+ * ai-client.extractText + parseJudgementJson consume.
  */

 import { vi } from "vitest";
@@ -13,17 +14,12 @@ export function makeFakeAi() {
 }

 /**
- * Configure the next ai.run call to return a Cloudflare-traditional
- * tool_calls response with the given submit_answer arguments.
+ * Configure the next ai.run call to return a response whose `.response`
+ * string contains the canonical one-line JSON the judge expects.
 */
 export function mockJudgement(ai, { is_guess = false, answer = "no", hint = "default hint" } = {}) {
   ai.run.mockResolvedValueOnce({
-    tool_calls: [
-      {
-        name: "submit_answer",
-        arguments: { is_guess, answer, hint },
-      },
-    ],
+    response: JSON.stringify({ is_guess, answer, hint }),
   });
 }

diff --git a/tests/modules/twentyq/ai-client.test.js b/tests/modules/twentyq/ai-client.test.js
index c5e7d16..56554ee 100644
--- a/tests/modules/twentyq/ai-client.test.js
+++ b/tests/modules/twentyq/ai-client.test.js
@@ -2,9 +2,10 @@ import { describe, expect, it } from "vitest";
 import {
   MODEL_ID,
   UpstreamError,
+  extractText,
   judge,
   normalizeJudgement,
-  parseToolCall,
+  parseJudgementJson,
   redactSecret,
 } from "../../../src/modules/twentyq/ai-client.js";
 import { makeFakeAi, mockFailure, mockJudgement } from "../../fakes/fake-ai.js";
@@ -19,55 +20,64 @@ const baseState = () => ({
 });

 describe("twentyq/ai-client", () => {
-  describe("parseToolCall", () => {
-    it("extracts traditional Cloudflare shape", () => {
-      const r = parseToolCall({
-        tool_calls: [
-          { name: "submit_answer", arguments: { is_guess: false, answer: "yes", hint: "x" } },
-        ],
-      });
-      expect(r).toEqual({ is_guess: false, answer: "yes", hint: "x" });
+  describe("extractText", () => {
+    it("reads traditional Workers-AI { response } shape", () => {
+      expect(extractText({ response: "hello" })).toBe("hello");
     });

-    it("extracts OpenAI-style nested function shape", () => {
-      const r = parseToolCall({
-        tool_calls: [
-          {
-            function: {
-              name: "submit_answer",
-              arguments: { is_guess: true, answer: "no", hint: "y" },
-            },
-          },
-        ],
-      });
-      expect(r).toEqual({ is_guess: true, answer: "no", hint: "y" });
+    it("reads OpenAI-compatible choices[0].message.content", () => {
+      expect(extractText({ choices: [{ message: { content: "world" } }] })).toBe("world");
     });

-    it("parses stringified JSON arguments", () => {
-      const r = parseToolCall({
-        tool_calls: [
-          {
-            function: {
-              name: "submit_answer",
-              arguments: '{"is_guess":false,"answer":"no","hint":"z"}',
-            },
-          },
-        ],
-      });
-      expect(r?.hint).toBe("z");
+    it("concatenates array content parts", () => {
+      expect(
+        extractText({
+          choices: [{ message: { content: [{ text: "a" }, { text: "b" }] } }],
+        }),
+      ).toBe("ab");
     });

-    it("returns null when no tool_calls present", () => {
-      expect(parseToolCall({})).toBeNull();
-      expect(parseToolCall({ tool_calls: [] })).toBeNull();
-      expect(parseToolCall(null)).toBeNull();
+    it("passes through strings", () => {
+      expect(extractText("direct")).toBe("direct");
     });

-    it("returns null on malformed stringified args", () => {
-      const r = parseToolCall({
-        tool_calls: [{ function: { name: "submit_answer", arguments: "not json" } }],
-      });
-      expect(r).toBeNull();
+    it("returns an empty string on unknown shapes", () => {
+      expect(extractText(null)).toBe("");
+      expect(extractText({})).toBe("");
+    });
+  });
+
+  describe("parseJudgementJson", () => {
+    it("parses clean one-line JSON", () => {
+      const r = parseJudgementJson('{"is_guess":false,"answer":"yes","hint":"big"}');
+      expect(r).toEqual({ is_guess: false, answer: "yes", hint: "big" });
+    });
+
+    it("pulls JSON out of surrounding prose", () => {
+      const r = parseJudgementJson(
+        'Sure, here is my answer: {"is_guess":true,"answer":"no","hint":"x"} — hope that helps!',
+      );
+      expect(r?.is_guess).toBe(true);
+    });
+
+    it("strips code fences", () => {
+      const r = parseJudgementJson('```json\n{"is_guess":false,"answer":"yes","hint":"h"}\n```');
+      expect(r?.hint).toBe("h");
+    });
+
+    it("handles nested braces inside strings", () => {
+      const r = parseJudgementJson('{"is_guess":false,"answer":"no","hint":"has {braces}"}');
+      expect(r?.hint).toBe("has {braces}");
+    });
+
+    it("returns null when no JSON object present", () => {
+      expect(parseJudgementJson("no json here")).toBeNull();
+      expect(parseJudgementJson("")).toBeNull();
+      expect(parseJudgementJson(null)).toBeNull();
+    });
+
+    it("returns null on malformed JSON", () => {
+      expect(parseJudgementJson("{not: valid}")).toBeNull();
     });
   });

@@ -141,12 +151,21 @@ describe("twentyq/ai-client", () => {
     await expect(judge({}, baseState(), "is it big?")).rejects.toBeInstanceOf(UpstreamError);
   });

-  it("uses default fallback when tool_calls absent", async () => {
+  it("uses default fallback when response is empty", async () => {
     const ai = makeFakeAi();
-    ai.run.mockResolvedValueOnce({}); // no tool_calls
+    ai.run.mockResolvedValueOnce({ response: "" });
     const r = await judge({ AI: ai }, baseState(), "is it big?");
     expect(r.is_guess).toBe(false);
     expect(r.answer).toBe("no");
   });
+
+  it("does NOT send a tools array (drop function calling for Gemma compatibility)", async () => {
+    const ai = makeFakeAi();
+    mockJudgement(ai, { is_guess: false, answer: "yes", hint: "h" });
+    await judge({ AI: ai }, baseState(), "is it big?");
+    const [, body] = ai.run.mock.calls[0];
+    expect(body.tools).toBeUndefined();
+    expect(body.messages).toBeDefined();
+  });
 });
});
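
--
Appendix (reviewer sketch, not part of the patch): a minimal end-to-end run
of the new parse path. The module path is the one from the diff above; the
deliberately messy model reply is hypothetical, and the logged result is the
expected pass-through of normalizeJudgement for already-valid fields.
Illustrative only.

    // A worst-case model reply: OpenAI-compatible envelope, conversational
    // prose, a code fence, and braces inside the hint string.
    import {
      extractText,
      normalizeJudgement,
      parseJudgementJson,
    } from "./src/modules/twentyq/ai-client.js";

    const response = {
      choices: [
        {
          message: {
            content:
              'Sure! ```json\n{"is_guess": true, "answer": "no", "hint": "it has {hidden} braces"}\n```',
          },
        },
      ],
    };

    // extractText unwraps the envelope; parseJudgementJson strips the fence
    // and walks brace depth, skipping the {hidden} braces inside the string.
    const payload = parseJudgementJson(extractText(response));
    console.log(normalizeJudgement(payload));
    // Expected: { is_guess: true, answer: "no", hint: "it has {hidden} braces" }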