Files
miti99bot/scripts/build-semantle-words.js
T
tiennm99 9b331fc24d refactor(semantle,doantu): drop ConceptNet vestiges, trim wordlist API
Now that both modules run on Workers AI embeddings, drop the legacy
Word2SimError alias, the unused wordlist helpers (getLine, LINE_COUNT,
pickFromPool), and every comment/README section still describing the
removed ConceptNet backend. Fix the bge-small doc typo in semantle/index.js
and align the semantle api-client test fake-vector dim with the real
384-dim output.
2026-04-23 00:19:28 +07:00

60 lines
2.0 KiB
JavaScript

#!/usr/bin/env node
/**
 * @file build-semantle-words — fetches the google-10000-english (no-swears)
 * word list and writes it verbatim (lowercased + deduped, no filtering) to
 * src/modules/semantle/words-data.js.
 *
 * The list is sorted by Google Ngram frequency. Degenerate picks (`a`,
 * `dvd`, etc.) are presumably handled downstream at round-start in
 * api-client.js, so there's no need to pre-filter here.
 *
 * Source: https://github.com/first20hours/google-10000-english
 * Credits: Josh Kaufman (first20hours) — list derived from Peter Norvig's
 * Google Ngram analysis.
 *
 * Usage:
 *   node scripts/build-semantle-words.js
 */
import { writeFileSync } from "node:fs";
import { resolve } from "node:path";

const SOURCE_URL =
  "https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english-no-swears.txt";
// import.meta.dirname is scripts/; the module root is one level up.
const root = resolve(import.meta.dirname, "..");
const dst = resolve(root, "src/modules/semantle/words-data.js");

const res = await fetch(SOURCE_URL);
if (!res.ok) throw new Error(`fetch failed: ${res.status} ${res.statusText}`);
const text = await res.text();

// Normalize only: trim whitespace, lowercase, drop blanks, dedupe.
// Preserve original frequency order — source is ranked by Google Ngram,
// and Set preserves insertion order.
const words = Array.from(
  new Set(
    text
      .split(/\r?\n/)
      .map((w) => w.trim().toLowerCase())
      .filter((w) => w.length > 0),
  ),
);
if (words.length === 0) throw new Error("no words parsed from source");

// JSON.stringify escapes quotes/backslashes, so a hostile or malformed
// source line can never break the syntax of the generated module.
// For plain lowercase ASCII words the output is identical to `"word"`.
const body = words.map((w) => `  ${JSON.stringify(w)},`).join("\n");
const out = [
  "// Auto-generated from https://github.com/first20hours/google-10000-english",
  "// Credits: Josh Kaufman (first20hours) — common English words by Google Ngram frequency.",
  "// Normalized (lowercased, trimmed, deduped) but otherwise unfiltered.",
  "// Regenerate with: node scripts/build-semantle-words.js",
  "export default [",
  body,
  "];",
  "", // trailing newline
].join("\n");
writeFileSync(dst, out);
console.log(`wrote ${dst} (${words.length} words)`);