Files
miti99bot/scripts/build-semantle-words.js
T
tiennm99 4c2890ba25 refactor(semantle): drop word filter, expose line-based wordlist API
Use the full google-10000-english list verbatim (normalize only —
lowercase + dedupe, no length or alpha filtering). Pool goes from 7953
to 9894 entries; rare/short/long picks are still sieved by ConceptNet's
verify-and-fallback at round start.

Replaces TARGET_POOL/pickFromPool with a clearer line-based API:
  LINE_COUNT    — how many entries
  randomLine()  — uniform pick
  getLine(n)    — nth entry (n = frequency rank)

pickFromPool retained as a back-compat re-export so existing callers
don't break.
2026-04-22 23:19:51 +07:00

60 lines
2.0 KiB
JavaScript

#!/usr/bin/env node
/**
* @file build-semantle-words — fetches the google-10000-english (no-swears)
* word list and writes it verbatim (lowercased + deduped, no filtering) to
* src/modules/semantle/words-data.js.
*
* The list is sorted by Google Ngram frequency. ConceptNet's verify-and-
* fallback in api-client.js handles weird picks (`a`, `dvd`, etc.) by
* rejecting them at round-start if they have no concept edges, so there's
* no need to pre-filter here.
*
* Source: https://github.com/first20hours/google-10000-english
* Credits: Josh Kaufman (first20hours) — list derived from Peter Norvig's
* Google Ngram analysis.
*
* Usage:
* node scripts/build-semantle-words.js
*/
import { writeFileSync } from "node:fs";
import { resolve } from "node:path";
const SOURCE_URL =
  "https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english-no-swears.txt";

// Script lives in scripts/; the generated module lives under src/.
const root = resolve(import.meta.dirname, "..");
const dst = resolve(root, "src/modules/semantle/words-data.js");

const res = await fetch(SOURCE_URL);
if (!res.ok) throw new Error(`fetch failed: ${res.status} ${res.statusText}`);
// Strip a UTF-8 BOM if the upstream file ever grows one, so the first
// word isn't silently corrupted into "\ufeffthe".
const text = (await res.text()).replace(/^\uFEFF/, "");

// Normalize only: trim whitespace, lowercase, drop blanks, dedupe.
// Preserve original frequency order so `getLine(n)` stays a frequency rank
// (Set iteration order is insertion order, so dedupe keeps first occurrence).
const words = Array.from(
  new Set(
    text
      .split(/\r?\n/)
      .map((w) => w.trim().toLowerCase())
      .filter((w) => w.length > 0),
  ),
);
if (words.length === 0) throw new Error("no words parsed from source");

// JSON.stringify escapes quotes/backslashes/control characters, so a
// malformed or hostile upstream entry cannot break out of the generated
// string literal. For the expected lowercase-ASCII words the output is
// identical to a plain quoted string.
const body = words.map((w) => `  ${JSON.stringify(w)},`).join("\n");
const out = [
  "// Auto-generated from https://github.com/first20hours/google-10000-english",
  "// Credits: Josh Kaufman (first20hours) — common English words by Google Ngram frequency.",
  "// Normalized (lowercased, trimmed, deduped) but otherwise unfiltered.",
  "// Regenerate with: node scripts/build-semantle-words.js",
  "export default [",
  body,
  "];",
  "", // trailing newline at EOF
].join("\n");
writeFileSync(dst, out);
console.log(`wrote ${dst} (${words.length} words)`);