Files
miti99bot/scripts/build-doantu-words.js
T
tiennm99 c0315574c0 feat(doantu): add Vietnamese semantle module (protected visibility)
Near-clone of the semantle module, adapted for Vietnamese:
- Targets from duyet/vietnamese-wordlist Viet22K (~22k entries, GPL).
  Regenerate via scripts/build-doantu-words.js; chained into npm run build.
- ConceptNet client uses /c/vi/<term> URIs; multi-word guesses (e.g.
  "con chó") are space-to-underscore converted at URL build time so the
  board keeps the natural display.
- lookup.js permits Unicode letters + combining marks + single internal
  spaces; rejects digits/punctuation.
- All three commands (/doantu, /doantu_giveup, /doantu_stats) are
  visibility=protected — shown in /help, hidden from Telegram's native /
  autocomplete menu while the module is still experimental.

Wired into src/modules/index.js, wrangler.toml MODULES, .env.deploy(.example),
and package.json build chain.

Separate module rather than a shared base with semantle — matches the
repo's one-module-per-game convention (see loldle vs wordle); factor later
if a third language appears.
2026-04-22 23:29:36 +07:00

63 lines
2.3 KiB
JavaScript

#!/usr/bin/env node
/**
* @file build-doantu-words — fetches the middle-sized Vietnamese wordlist
* (Viet22K) from duyet/vietnamese-wordlist and writes it to
* src/modules/doantu/words-data.js as a static ES-module array.
*
* Source is an alphabetically-sorted Unicode dictionary, one word or phrase
* per line. We normalize only (trim, lowercase, dedupe); NO length/char
* filtering — ConceptNet's verify-and-fallback in api-client handles
* unusable picks at round start.
*
* Source: https://github.com/duyet/vietnamese-wordlist
* Credits: Ho Ngoc Duc — Vietnamese word list (GPL).
*
* Usage:
* node scripts/build-doantu-words.js
*/
import { writeFileSync } from "node:fs";
import { resolve } from "node:path";
// Size options from the upstream repo: Viet11K / Viet22K / Viet39K / Viet74K.
// Viet22K is a good balance — enough variety, not overwhelmed by archaic terms.
const SOURCE_URL = "https://raw.githubusercontent.com/duyet/vietnamese-wordlist/master/Viet22K.txt";

// Upper bound (ms) for the whole download. The signal stays attached while the
// body is read, so a stalled connection fails with a timeout error instead of
// leaving the script waiting.
const FETCH_TIMEOUT_MS = 30_000;

/**
 * Turns the raw upstream text into the list of entries to emit.
 *
 * Normalize only: trim, lowercase, drop blanks, dedupe. Source order is
 * preserved (a Set iterates in insertion order). Multi-word entries keep
 * their spaces — the api-client converts to underscore only at URL-build
 * time, so the board still displays them naturally.
 *
 * @param {string} text - Raw file contents, one word or phrase per line.
 * @returns {string[]} Unique normalized entries in first-seen order.
 */
function parseWordList(text) {
  const seen = new Set();
  for (const line of text.split(/\r?\n/)) {
    const word = line.trim().toLowerCase();
    if (word.length > 0) seen.add(word);
  }
  return [...seen];
}

/**
 * Renders the word list as the source text of a static ES module whose
 * default export is the array of words.
 *
 * @param {string[]} words - Normalized entries to embed.
 * @returns {string} Complete file contents, ending with a newline.
 */
function renderModule(words) {
  // JSON.stringify each word — safer than manual quoting for Vietnamese
  // diacritics and the occasional character that needs escaping.
  const body = words.map((w) => ` ${JSON.stringify(w)},`).join("\n");
  return [
    "// Auto-generated from https://github.com/duyet/vietnamese-wordlist (Viet22K.txt)",
    "// Credits: Ho Ngoc Duc — Vietnamese word list (GPL).",
    "// Normalized (lowercased + deduped) but otherwise unfiltered.",
    "// Regenerate with: node scripts/build-doantu-words.js",
    "export default [",
    body,
    "];",
    "",
  ].join("\n");
}

// The destination is resolved relative to this script's parent directory, so
// the script works regardless of the current working directory.
const root = resolve(import.meta.dirname, "..");
const dst = resolve(root, "src/modules/doantu/words-data.js");

const res = await fetch(SOURCE_URL, { signal: AbortSignal.timeout(FETCH_TIMEOUT_MS) });
if (!res.ok) throw new Error(`fetch failed: ${res.status} ${res.statusText}`);

const words = parseWordList(await res.text());
// Refuse to overwrite the existing data file with an empty list.
if (words.length === 0) throw new Error("no words parsed from source");

writeFileSync(dst, renderModule(words));
console.log(`wrote ${dst} (${words.length} words)`);