Files
thptqg2017/scripts/check-duplicates.js
tiennm99 718e2e9117 refactor: flatten data layout to data/, drop update/ overrides
- Move 63 Excel files from data/raw/ to data/ (single flat dir)
- Remove all 53 files in data/raw/update/: verified identical SBD
  coverage to raw/ (847349 rows either way), so they added no new
  students — only potential score corrections that can be reintroduced
  later if source is recovered
- Update build-database.js to read data/ directly
- Add scripts/audit-row-counts.js: compares source row count to DB row
  count to verify zero-loss parsing
- Point check-duplicates.js at new data/ location
2026-04-14 21:02:47 +07:00

27 lines
905 B
JavaScript

// One-off audit: detect content-identical Excel files via md5
import crypto from "crypto";
import fs from "fs";
import path from "path";
// Usage: node check-duplicates.js [dir ...]
// Directories to scan come from the command line; when none are given the
// script falls back to the original hard-coded data directory.
const DEFAULT_DIRS = ["D:/tiennm99/thptqg2017/data"];

/**
 * Compute the hex md5 digest of a file's bytes.
 * md5 serves only as a content fingerprint for spotting identical files.
 *
 * @param {string} file - Path of the file to hash.
 * @returns {string} Hex-encoded md5 digest of the file contents.
 */
function md5OfFile(file) {
  return crypto.createHash("md5").update(fs.readFileSync(file)).digest("hex");
}

/**
 * Group the Excel files located directly inside each directory by content hash.
 * Sub-directories are not descended into.
 *
 * @param {string[]} dirs - Directories to scan.
 * @returns {Map<string, string[]>} md5 digest -> paths of files with that content.
 * @throws {Error} Propagates fs errors (e.g. ENOENT when a directory is missing).
 */
function groupByHash(dirs) {
  const byHash = new Map();
  for (const dir of dirs) {
    for (const name of fs.readdirSync(dir)) {
      // Case-insensitive match so files saved as ".XLSX" are audited too.
      if (!name.toLowerCase().endsWith(".xlsx")) continue;
      const full = path.join(dir, name);
      // Extension is checked first so unrelated entries are never stat'ed.
      if (!fs.statSync(full).isFile()) continue;
      const hash = md5OfFile(full);
      const group = byHash.get(hash);
      if (group) {
        group.push(full);
      } else {
        byHash.set(hash, [full]);
      }
    }
  }
  return byHash;
}

/**
 * Print the totals followed by every group of content-identical files.
 *
 * @param {Map<string, string[]>} byHash - Result of groupByHash().
 * @returns {void}
 */
function report(byHash) {
  const groups = [...byHash];
  const total = groups.reduce((sum, [, files]) => sum + files.length, 0);
  const dupes = groups.filter(([, files]) => files.length > 1);
  console.log(`Total files: ${total}`);
  console.log(`Unique by md5: ${byHash.size}`);
  console.log(`Duplicate groups: ${dupes.length}`);
  for (const [hash, files] of dupes) {
    // Only a 12-character prefix of the digest is shown to keep lines short.
    console.log(` ${hash.slice(0, 12)}:`);
    for (const file of files) {
      console.log(` ${file}`);
    }
  }
}

// argv[0] is the node binary and argv[1] the script path; the rest are dirs.
const cliDirs = process.argv.slice(2);
const dirs = cliDirs.length > 0 ? cliDirs : DEFAULT_DIRS;
report(groupByHash(dirs));