mirror of
https://github.com/tiennm99/miti99bot.git
synced 2026-04-27 20:20:32 +00:00
9b331fc24d
Now that both modules run on Workers AI embeddings, drop the legacy Word2SimError alias, the unused wordlist helpers (getLine, LINE_COUNT, pickFromPool), and every comment/README section still describing the removed ConceptNet backend. Fix the bge-small doc typo in semantle/index.js and align the semantle api-client test fake-vector dim with the real 384-dim output.
60 lines
2.0 KiB
JavaScript
60 lines
2.0 KiB
JavaScript
#!/usr/bin/env node
|
|
/**
|
|
* @file build-semantle-words — fetches the google-10000-english (no-swears)
|
|
* word list and writes it verbatim (lowercased + deduped, no filtering) to
|
|
* src/modules/semantle/words-data.js.
|
|
*
|
|
 * The list is sorted by Google Ngram frequency. The Workers AI embedding
 * verify-and-fallback in api-client.js handles weird picks (`a`, `dvd`,
 * etc.) by rejecting them at round-start, so there's
 * no need to pre-filter here.
|
|
*
|
|
* Source: https://github.com/first20hours/google-10000-english
|
|
* Credits: Josh Kaufman (first20hours) — list derived from Peter Norvig's
|
|
* Google Ngram analysis.
|
|
*
|
|
* Usage:
|
|
* node scripts/build-semantle-words.js
|
|
*/
|
|
|
|
import { writeFileSync } from "node:fs";
|
|
import { resolve } from "node:path";
|
|
|
|
// Upstream ranked word list (no-swears variant), one word per line.
const SOURCE_URL =
  "https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english-no-swears.txt";

// Resolve the output module path relative to the repo root
// (import.meta.dirname requires Node 20.11+, same as the rest of the repo).
const repoRoot = resolve(import.meta.dirname, "..");
const outputPath = resolve(repoRoot, "src/modules/semantle/words-data.js");

// Download the source list; bail out loudly on any non-2xx response.
const response = await fetch(SOURCE_URL);
if (!response.ok) {
  throw new Error(`fetch failed: ${response.status} ${response.statusText}`);
}
const rawText = await response.text();

// Normalize only: trim whitespace, lowercase, drop blanks, dedupe.
// Preserve original frequency order — source is ranked by Google Ngram.
// (Set insertion order keeps the first occurrence's rank.)
const unique = new Set();
for (const line of rawText.split(/\r?\n/)) {
  const word = line.trim().toLowerCase();
  if (word.length > 0) {
    unique.add(word);
  }
}
const words = [...unique];

if (words.length === 0) throw new Error("no words parsed from source");

// Render the generated module: header comments, then one quoted word per line.
const wordLines = words.map((w) => ` "${w}",`);
const generated = [
  "// Auto-generated from https://github.com/first20hours/google-10000-english",
  "// Credits: Josh Kaufman (first20hours) — common English words by Google Ngram frequency.",
  "// Normalized (lowercased, trimmed, deduped) but otherwise unfiltered.",
  "// Regenerate with: node scripts/build-semantle-words.js",
  "export default [",
  ...wordLines,
  "];",
  "", // trailing newline
].join("\n");

writeFileSync(outputPath, generated);
console.log(`wrote ${outputPath} (${words.length} words)`);
|