mirror of
https://github.com/tiennm99/thptqg2017.git
synced 2026-05-14 04:58:52 +00:00
8f86e4dc3b
Dataset update: - Crawl all 63 .xls province files from baotintuc.vn CDN (original source) - Old xlsx dataset moved to data-old/ for reference - Net: +13,719 students (Hà Nội +7,275, HCM +6,445) — the old .xls → xlsx conversion silently dropped rows beyond the 65,536 per-sheet cap - Also removes 1 bogus header row that had leaked into the old DB - 100% identical scores on the 847,348 SBDs present in both datasets Build pipeline: - build-database.js: iterate ALL sheets per workbook (fixes the overflow loss) and accept .xls in addition to .xlsx Audit tooling: - scripts/crawl-baotintuc.js: idempotent 63-province downloader - scripts/diff-datasets.js: compares two DBs by SBD set and per-column score deltas
114 lines
3.9 KiB
JavaScript
114 lines
3.9 KiB
JavaScript
// Compare new DB (public/thptqg2017.db) vs backup old DB (backup/thptqg2017.old.db)
|
|
import Database from "better-sqlite3";
|
|
|
|
const NEW = new Database("public/thptqg2017.db", { readonly: true });
|
|
const OLD = new Database("backup/thptqg2017.old.db", { readonly: true });
|
|
|
|
const SCORE_COLS = [
|
|
"toan", "ngu_van", "vat_ly", "hoa_hoc", "sinh_hoc", "khtn",
|
|
"lich_su", "dia_ly", "gdcd", "khxh",
|
|
"tieng_anh", "tieng_phap", "tieng_nga", "tieng_trung",
|
|
];
|
|
|
|
// Old schema may lack some newer columns — detect and project a subset
|
|
const oldCols = new Set(
|
|
OLD.prepare("PRAGMA table_info(student)").all().map((c) => c.name),
|
|
);
|
|
const commonCols = SCORE_COLS.filter((c) => oldCols.has(c));
|
|
|
|
const newCount = NEW.prepare("SELECT COUNT(*) c FROM student").get().c;
|
|
const oldCount = OLD.prepare("SELECT COUNT(*) c FROM student").get().c;
|
|
console.log(`\n=== Row counts ===`);
|
|
console.log(` new: ${newCount}`);
|
|
console.log(` old: ${oldCount}`);
|
|
console.log(` diff: ${newCount - oldCount >= 0 ? "+" : ""}${newCount - oldCount}`);
|
|
|
|
// Build SBD sets (memory: 861k strings ~50 MB, fine)
|
|
const newSbd = new Set(
|
|
NEW.prepare("SELECT so_bao_danh FROM student").all().map((r) => r.so_bao_danh),
|
|
);
|
|
const oldSbd = new Set(
|
|
OLD.prepare("SELECT so_bao_danh FROM student").all().map((r) => r.so_bao_danh),
|
|
);
|
|
|
|
let newOnly = 0, oldOnly = 0, common = 0;
|
|
for (const s of newSbd) (oldSbd.has(s) ? common++ : newOnly++);
|
|
for (const s of oldSbd) if (!newSbd.has(s)) oldOnly++;
|
|
|
|
console.log(`\n=== SBD membership ===`);
|
|
console.log(` new-only (added): ${newOnly}`);
|
|
console.log(` old-only (removed): ${oldOnly}`);
|
|
console.log(` common: ${common}`);
|
|
|
|
// Sample 5 new-only SBDs to peek
|
|
const sampleNewOnly = [];
|
|
for (const s of newSbd) {
|
|
if (!oldSbd.has(s)) sampleNewOnly.push(s);
|
|
if (sampleNewOnly.length >= 5) break;
|
|
}
|
|
if (sampleNewOnly.length) {
|
|
const rows = NEW.prepare(
|
|
`SELECT so_bao_danh, ho_ten FROM student WHERE so_bao_danh IN (${sampleNewOnly.map(() => "?").join(",")})`,
|
|
).all(...sampleNewOnly);
|
|
console.log(` sample new-only rows:`);
|
|
rows.forEach((r) => console.log(` ${r.so_bao_danh} ${r.ho_ten}`));
|
|
}
|
|
|
|
// Score comparison on common SBDs
|
|
console.log(`\n=== Score comparison (common SBDs) ===`);
|
|
console.log(` columns compared: ${commonCols.join(", ")}`);
|
|
|
|
const newStmt = NEW.prepare(
|
|
`SELECT so_bao_danh, ${commonCols.join(",")} FROM student`,
|
|
);
|
|
const oldStmt = OLD.prepare(
|
|
`SELECT so_bao_danh, ${commonCols.join(",")} FROM student`,
|
|
);
|
|
|
|
// Build old map
|
|
const oldMap = new Map();
|
|
for (const r of oldStmt.iterate()) oldMap.set(r.so_bao_danh, r);
|
|
|
|
let identical = 0,
|
|
differ = 0;
|
|
const colChanges = Object.fromEntries(commonCols.map((c) => [c, { changed: 0, newNull: 0, oldNull: 0 }]));
|
|
const sampleDiffs = [];
|
|
|
|
for (const nr of newStmt.iterate()) {
|
|
const or = oldMap.get(nr.so_bao_danh);
|
|
if (!or) continue;
|
|
let rowDiffers = false;
|
|
const rowChanges = [];
|
|
for (const c of commonCols) {
|
|
const a = nr[c], b = or[c];
|
|
if (a === b) continue;
|
|
// treat very-close floats as equal (xlsx may emit 5.5 vs 5.50)
|
|
if (a !== null && b !== null && Math.abs(a - b) < 1e-9) continue;
|
|
rowDiffers = true;
|
|
colChanges[c].changed++;
|
|
if (a === null) colChanges[c].newNull++;
|
|
if (b === null) colChanges[c].oldNull++;
|
|
rowChanges.push(`${c}: ${b} → ${a}`);
|
|
}
|
|
if (rowDiffers) {
|
|
differ++;
|
|
if (sampleDiffs.length < 5) sampleDiffs.push({ sbd: nr.so_bao_danh, changes: rowChanges });
|
|
} else {
|
|
identical++;
|
|
}
|
|
}
|
|
|
|
console.log(` identical: ${identical}`);
|
|
console.log(` differing: ${differ}`);
|
|
console.log(` per-column changes:`);
|
|
for (const [c, info] of Object.entries(colChanges)) {
|
|
if (info.changed === 0) continue;
|
|
console.log(` ${c.padEnd(14)} changed=${info.changed} (new-null=${info.newNull}, old-null=${info.oldNull})`);
|
|
}
|
|
if (sampleDiffs.length) {
|
|
console.log(` sample diffs:`);
|
|
sampleDiffs.forEach((d) => {
|
|
console.log(` SBD ${d.sbd}: ${d.changes.slice(0, 4).join("; ")}${d.changes.length > 4 ? " ..." : ""}`);
|
|
});
|
|
}
|