Files
thptqg2017/scripts/build-database-old2.js
tiennm99 622e7383ef feat: build 3 parallel site variants, one per dataset
Sites:
  /thptqg2017/      → data/      861,068 students (baotintuc.vn .xls)
  /thptqg2017/old/  → data-old/  847,348 students (original xlsx)
  /thptqg2017/old2/ → data-old2/ 679,764 students (update/ overrides)

Build pipeline:
- scripts/build-lib.js: shared schema, score regex, toAscii, insert
  builder — keeps the 3 DBs drop-in compatible for the same frontend
- scripts/build-database.js: multi-sheet walk for baotintuc .xls (HN/HCM
  overflow past the 65k-per-sheet .xls cap)
- scripts/build-database-old.js: single-sheet xlsx + strict numeric SBD
  guard (rejects the header-row leak present in data-old)
- scripts/build-database-old2.js: multi-sheet + blank-row pre-filter for
  the 54 update files (HCM overflow present, HN not in this subset)
- Each build prints a source-vs-DB audit; verified 0 parse errors and
  exact row-count match across all 3 datasets

Frontend plumbing:
- vite.config.js: VARIANT env selects base path, publicDir, outDir
- package.json: build:db:*, build:db:all, build:old, build:old2,
  build:all
- .github/workflows/deploy.yml: CI builds all 3 DBs + sites
- .gitignore: cover public-old/ and public-old2/ outputs
- eslint.config.js: add Node globals for config/script files
2026-04-14 22:12:25 +07:00

87 lines
3.1 KiB
JavaScript

// Build DB from data-old2/ — the 54 "update/" corrected-export files.
// Quirks:
// • 54 provinces only (no Hà Nội, Bình Phước etc.)
// • Mostly 2 sheets (one main + one empty trailing) — safe to iterate all
// • HCM (24.HCM_UTLQ.xlsx) overflows into Sheet2 with +6,446 students —
// multi-sheet walk is REQUIRED for it
// • Some headers have extended columns beyond DIEM_THI (pre-parsed per-
// subject cols). We still use only the DIEM_THI text at index 3.
import XLSX from "xlsx";
import fs from "fs";
import path from "path";
import { fileURLToPath } from "url";
import {
createDb,
parseScores,
isHeaderRow,
buildRow,
} from "./build-lib.js";
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const SRC_DIR = path.join(__dirname, "..", "data-old2");
const DB_PATH = path.join(__dirname, "..", "public-old2", "thptqg2017.db");
function collectFiles() {
return fs
.readdirSync(SRC_DIR)
.filter((f) => f.endsWith(".xlsx") || f.endsWith(".xls"))
.map((f) => path.join(SRC_DIR, f));
}
function main() {
const { db, insert } = createDb(DB_PATH);
const files = collectFiles();
let sourceRows = 0, skipped = 0, errors = 0;
const run = db.transaction(() => {
for (const file of files) {
const base = path.basename(file);
let fileRows = 0;
const wb = XLSX.readFile(file);
for (const sheetName of wb.SheetNames) {
const rows = XLSX.utils.sheet_to_json(wb.Sheets[sheetName], {
header: 1,
});
for (let i = 0; i < rows.length; i++) {
if (i === 0 && isHeaderRow(rows[i])) continue;
const r = rows[i];
// Skip completely blank rows (common in xlsx tails) before counting
if (!r || r.every((c) => c === "" || c == null)) continue;
sourceRows++;
const hoTen = String(r?.[0] || "").trim();
const ngaySinh = String(r?.[1] || "").trim();
const soBaoDanh = String(r?.[2] || "").trim();
const diemThi = String(r?.[3] || "");
if (!soBaoDanh || !hoTen) { skipped++; continue; }
if (!/^\d+$/.test(soBaoDanh)) { skipped++; continue; }
try {
insert.run(buildRow({ hoTen, ngaySinh, soBaoDanh, scores: parseScores(diemThi) }));
fileRows++;
} catch (err) {
errors++;
if (errors <= 5) console.warn(` [warn] ${base}: ${err.message}`);
}
}
}
console.log(` ${base}: ${fileRows} rows`);
}
});
console.log(`[build] data-old2/ → ${DB_PATH} (${files.length} files)`);
run();
db.exec("VACUUM");
const dbCount = db.prepare("SELECT COUNT(*) c FROM student").get().c;
console.log(`\nSource non-blank data rows: ${sourceRows}`);
console.log(` skipped (empty/non-numeric SBD): ${skipped}`);
console.log(` insertable: ${sourceRows - skipped}`);
console.log(` insert errors: ${errors}`);
console.log(`DB rows (distinct SBD): ${dbCount}`);
const sz = fs.statSync(DB_PATH).size;
console.log(`Size: ${(sz / 1024 / 1024).toFixed(1)} MB`);
db.close();
}
main();