Files
tiennm99 8f86e4dc3b feat: refresh data from baotintuc.vn source, fix overflow sheet loss
Dataset update:
- Crawl all 63 .xls province files from baotintuc.vn CDN (original source)
- Old xlsx dataset moved to data-old/ for reference
- Net: +13,719 students (Hà Nội +7,275, HCM +6,445) — the old .xls → xlsx
  conversion silently dropped rows beyond the 65,536 per-sheet cap
- Also removes 1 bogus header row that had leaked into the old DB
- 100% identical scores on the 847,348 SBDs present in both datasets

Build pipeline:
- build-database.js: iterate ALL sheets per workbook (fixes the overflow
  loss) and accept .xls in addition to .xlsx

Audit tooling:
- scripts/crawl-baotintuc.js: idempotent 63-province downloader
- scripts/diff-datasets.js: compares two DBs by SBD set and per-column
  score deltas
2026-04-14 21:42:29 +07:00

114 lines
3.9 KiB
JavaScript

// Compare new DB (public/thptqg2017.db) vs backup old DB (backup/thptqg2017.old.db)
import Database from "better-sqlite3";
const NEW = new Database("public/thptqg2017.db", { readonly: true });
const OLD = new Database("backup/thptqg2017.old.db", { readonly: true });
const SCORE_COLS = [
"toan", "ngu_van", "vat_ly", "hoa_hoc", "sinh_hoc", "khtn",
"lich_su", "dia_ly", "gdcd", "khxh",
"tieng_anh", "tieng_phap", "tieng_nga", "tieng_trung",
];
// Old schema may lack some newer columns — detect and project a subset
const oldCols = new Set(
OLD.prepare("PRAGMA table_info(student)").all().map((c) => c.name),
);
const commonCols = SCORE_COLS.filter((c) => oldCols.has(c));
const newCount = NEW.prepare("SELECT COUNT(*) c FROM student").get().c;
const oldCount = OLD.prepare("SELECT COUNT(*) c FROM student").get().c;
console.log(`\n=== Row counts ===`);
console.log(` new: ${newCount}`);
console.log(` old: ${oldCount}`);
console.log(` diff: ${newCount - oldCount >= 0 ? "+" : ""}${newCount - oldCount}`);
// Build SBD sets (memory: 861k strings ~50 MB, fine)
const newSbd = new Set(
NEW.prepare("SELECT so_bao_danh FROM student").all().map((r) => r.so_bao_danh),
);
const oldSbd = new Set(
OLD.prepare("SELECT so_bao_danh FROM student").all().map((r) => r.so_bao_danh),
);
let newOnly = 0, oldOnly = 0, common = 0;
for (const s of newSbd) (oldSbd.has(s) ? common++ : newOnly++);
for (const s of oldSbd) if (!newSbd.has(s)) oldOnly++;
console.log(`\n=== SBD membership ===`);
console.log(` new-only (added): ${newOnly}`);
console.log(` old-only (removed): ${oldOnly}`);
console.log(` common: ${common}`);
// Sample 5 new-only SBDs to peek
const sampleNewOnly = [];
for (const s of newSbd) {
if (!oldSbd.has(s)) sampleNewOnly.push(s);
if (sampleNewOnly.length >= 5) break;
}
if (sampleNewOnly.length) {
const rows = NEW.prepare(
`SELECT so_bao_danh, ho_ten FROM student WHERE so_bao_danh IN (${sampleNewOnly.map(() => "?").join(",")})`,
).all(...sampleNewOnly);
console.log(` sample new-only rows:`);
rows.forEach((r) => console.log(` ${r.so_bao_danh} ${r.ho_ten}`));
}
// Score comparison on common SBDs
console.log(`\n=== Score comparison (common SBDs) ===`);
console.log(` columns compared: ${commonCols.join(", ")}`);
const newStmt = NEW.prepare(
`SELECT so_bao_danh, ${commonCols.join(",")} FROM student`,
);
const oldStmt = OLD.prepare(
`SELECT so_bao_danh, ${commonCols.join(",")} FROM student`,
);
// Build old map
const oldMap = new Map();
for (const r of oldStmt.iterate()) oldMap.set(r.so_bao_danh, r);
let identical = 0,
differ = 0;
const colChanges = Object.fromEntries(commonCols.map((c) => [c, { changed: 0, newNull: 0, oldNull: 0 }]));
const sampleDiffs = [];
for (const nr of newStmt.iterate()) {
const or = oldMap.get(nr.so_bao_danh);
if (!or) continue;
let rowDiffers = false;
const rowChanges = [];
for (const c of commonCols) {
const a = nr[c], b = or[c];
if (a === b) continue;
// treat very-close floats as equal (xlsx may emit 5.5 vs 5.50)
if (a !== null && b !== null && Math.abs(a - b) < 1e-9) continue;
rowDiffers = true;
colChanges[c].changed++;
if (a === null) colChanges[c].newNull++;
if (b === null) colChanges[c].oldNull++;
rowChanges.push(`${c}: ${b}${a}`);
}
if (rowDiffers) {
differ++;
if (sampleDiffs.length < 5) sampleDiffs.push({ sbd: nr.so_bao_danh, changes: rowChanges });
} else {
identical++;
}
}
console.log(` identical: ${identical}`);
console.log(` differing: ${differ}`);
console.log(` per-column changes:`);
for (const [c, info] of Object.entries(colChanges)) {
if (info.changed === 0) continue;
console.log(` ${c.padEnd(14)} changed=${info.changed} (new-null=${info.newNull}, old-null=${info.oldNull})`);
}
if (sampleDiffs.length) {
console.log(` sample diffs:`);
sampleDiffs.forEach((d) => {
console.log(` SBD ${d.sbd}: ${d.changes.slice(0, 4).join("; ")}${d.changes.length > 4 ? " ..." : ""}`);
});
}