Mirror of https://github.com/tiennm99/thptqg2017.git
Synced 2026-05-13 22:58:31 +00:00
Commit 622e7383ef
Sites:
- /thptqg2017/      → data/      861,068 students (baotintuc.vn .xls)
- /thptqg2017/old/  → data-old/  847,348 students (original xlsx)
- /thptqg2017/old2/ → data-old2/ 679,764 students (update/ overrides)

Build pipeline:
- scripts/build-lib.js: shared schema, score regex, toAscii, insert builder — keeps the 3 DBs drop-in compatible for the same frontend (a hedged sketch of these helpers follows below)
- scripts/build-database.js: multi-sheet walk for baotintuc .xls (HN/HCM overflow past the 65k-per-sheet .xls cap)
- scripts/build-database-old.js: single-sheet xlsx + strict numeric SBD guard (rejects the header-row leak present in data-old)
- scripts/build-database-old2.js: multi-sheet + blank-row pre-filter for the 54 update files (HCM overflow present, HN not in this subset)
- Each build prints a source-vs-DB audit; verified 0 parse errors and exact row-count match across all 3 datasets

Frontend plumbing:
- vite.config.js: VARIANT env selects base path, publicDir, outDir (also sketched below)
- package.json: build:db:*, build:db:all, build:old, build:old2, build:all
- .github/workflows/deploy.yml: CI builds all 3 DBs + sites
- .gitignore: cover public-old/ and public-old2/ outputs
- eslint.config.js: add Node globals for config/script files
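build-lib.js itself is not reproduced on this page, so the following is only a hedged sketch of what the shared helpers could look like. The export names (createDb, parseScores, isHeaderRow, buildRow, toAscii) and the student table are visible in the commit message and the script below; the better-sqlite3 dependency (suggested by the script's db.transaction()/db.prepare() usage), the column set, and the "Subject: score" cell format are assumptions.

// Hypothetical sketch of scripts/build-lib.js, NOT the repo's actual code.
// Assumes better-sqlite3 and invented column names; only the `student` table,
// the export names, and overwrite-on-duplicate-SBD are confirmed elsewhere.
import Database from "better-sqlite3";

export function createDb(dbPath) {
  const db = new Database(dbPath);
  db.exec(`
    DROP TABLE IF EXISTS student;
    CREATE TABLE student (
      sbd TEXT PRIMARY KEY, -- so bao danh (candidate number)
      ho_ten TEXT,          -- full name
      ho_ten_ascii TEXT,    -- accent-stripped copy for search (via toAscii)
      ngay_sinh TEXT,       -- date of birth
      scores TEXT           -- JSON of subject -> score
    );
  `);
  // OR REPLACE lets a duplicate SBD overwrite the earlier row instead of
  // throwing, which matches the "row(s) collapsed" audit message.
  const insert = db.prepare(
    `INSERT OR REPLACE INTO student (sbd, ho_ten, ho_ten_ascii, ngay_sinh, scores)
     VALUES (?, ?, ?, ?, ?)`
  );
  return { db, insert };
}

// Strip Vietnamese diacritics so accent-free queries still match names.
export function toAscii(s) {
  return s.normalize("NFD").replace(/[\u0300-\u036f]/g, "")
    .replace(/đ/g, "d").replace(/Đ/g, "D");
}

// Assumed cell format: "Toán: 8.20 Ngữ văn: 7.50 ..."; capture every
// "label: number" pair and tolerate comma decimals.
const SCORE_RE = /([^\s:][^:]*?):\s*(\d+(?:[.,]\d+)?)/g;
export function parseScores(text) {
  const scores = {};
  for (const m of String(text).matchAll(SCORE_RE)) {
    scores[m[1].trim()] = Number(m[2].replace(",", "."));
  }
  return scores;
}

// Treat a row as a header when its SBD column is not purely numeric.
export function isHeaderRow(row) {
  return !/^\d+$/.test(String(row?.[2] ?? "").trim());
}

// Bind-parameter array in the column order of the INSERT above.
export function buildRow({ hoTen, ngaySinh, soBaoDanh, scores }) {
  return [soBaoDanh, hoTen, toAscii(hoTen), ngaySinh, JSON.stringify(scores)];
}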
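In the same spirit, the VARIANT switch in vite.config.js might look like the sketch below. The base paths and the public-old/ and public-old2/ directory names come from the commit message; the VARIANT values ("old", "old2") and the outDir naming are guesses.

// Hypothetical sketch of vite.config.js, NOT the repo's actual config.
import { defineConfig } from "vite";

const variant = process.env.VARIANT || ""; // "", "old", or "old2" (assumed)
const suffix = variant ? `-${variant}` : "";

export default defineConfig({
  base: variant ? `/thptqg2017/${variant}/` : "/thptqg2017/",
  publicDir: `public${suffix}`,            // public/, public-old/, public-old2/
  build: { outDir: `dist${suffix}` },      // outDir naming is an assumption
});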
91 lines
3.1 KiB
JavaScript
// Build DB from data/ (baotintuc.vn .xls source).
// Quirks: .xls has a 65,536-row-per-sheet limit. Hà Nội and HCM overflow into
// Sheet2, so we MUST iterate every sheet. Header row may or may not be
// present on each sheet.
import XLSX from "xlsx";
import fs from "fs";
import path from "path";
import { fileURLToPath } from "url";
import {
  createDb,
  parseScores,
  isHeaderRow,
  buildRow,
} from "./build-lib.js";

const __dirname = path.dirname(fileURLToPath(import.meta.url));
const SRC_DIR = path.join(__dirname, "..", "data");
const DB_PATH = path.join(__dirname, "..", "public", "thptqg2017.db");

function collectFiles() {
  return fs
    .readdirSync(SRC_DIR)
    .filter((f) => f.endsWith(".xls") || f.endsWith(".xlsx"))
    .map((f) => path.join(SRC_DIR, f));
}

function main() {
  const { db, insert } = createDb(DB_PATH);

  const files = collectFiles();
  let sourceRows = 0; // total data rows observed across ALL sheets (post-header)
  let skipped = 0; // empty/invalid rows skipped
  let errors = 0;

  const run = db.transaction(() => {
    for (const file of files) {
      const base = path.basename(file);
      let fileRows = 0;
      const wb = XLSX.readFile(file);

      for (const sheetName of wb.SheetNames) {
        const rows = XLSX.utils.sheet_to_json(wb.Sheets[sheetName], {
          header: 1,
        });
        for (let i = 0; i < rows.length; i++) {
          if (i === 0 && isHeaderRow(rows[i])) continue;
          sourceRows++;
          const r = rows[i];
          const hoTen = String(r?.[0] || "").trim();
          const ngaySinh = String(r?.[1] || "").trim();
          const soBaoDanh = String(r?.[2] || "").trim();
          const diemThi = String(r?.[3] || "");
          if (!soBaoDanh || !hoTen) { skipped++; continue; }
          try {
            insert.run(buildRow({ hoTen, ngaySinh, soBaoDanh, scores: parseScores(diemThi) }));
            fileRows++;
          } catch (err) {
            errors++;
            if (errors <= 5) console.warn(`  [warn] ${base}: ${err.message}`);
          }
        }
      }
      console.log(`  ${base}: ${fileRows} rows`);
    }
  });

  console.log(`[build] data/ → ${DB_PATH} (${files.length} files)`);
  run();
  db.exec("VACUUM");

  const dbCount = db.prepare("SELECT COUNT(*) c FROM student").get().c;
  const distinctSbd = sourceRows - skipped;
  console.log(`\nSource data rows (post-header): ${sourceRows}`);
  console.log(`  skipped (empty/invalid): ${skipped}`);
  console.log(`  insertable: ${distinctSbd}`);
  console.log(`  insert errors: ${errors}`);
  console.log(`DB rows (distinct SBD): ${dbCount}`);
  // data/ comes from a single source, so every SBD should be unique after
  // header skip — no duplicates expected.
  if (errors === 0) {
    const gap = distinctSbd - dbCount;
    if (gap === 0) console.log(`Audit: OK — every source row made it in.`);
    else console.log(`Audit: ${gap} row(s) collapsed (duplicate SBDs overwriting).`);
  }
  const sz = fs.statSync(DB_PATH).size;
  console.log(`Size: ${(sz / 1024 / 1024).toFixed(1)} MB`);
  db.close();
}

main();
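Assuming the package is set up for ESM (the script uses import syntax), this can be run directly with node scripts/build-database.js, or via the build:db:* scripts the commit adds to package.json; for this dataset the closing audit should report 861,068 DB rows and 0 parse errors, matching the commit message.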