mirror of
https://github.com/tiennm99/thptqg2017.git
synced 2026-05-29 20:23:37 +00:00
718e2e9117
- Move 63 Excel files from data/raw/ to data/ (single flat dir) - Remove all 53 files in data/raw/update/: verified identical SBD coverage to raw/ (847349 rows either way), so they added no new students — only potential score corrections that can be reintroduced later if source is recovered - Update build-database.js to read data/ directly - Add scripts/audit-row-counts.js: compares source row count to DB row count to verify zero-loss parsing - Point check-duplicates.js at new data/ location
27 lines
905 B
JavaScript
27 lines
905 B
JavaScript
// One-off audit: detect content-identical Excel files via md5
|
|
import crypto from "crypto";
|
|
import fs from "fs";
|
|
import path from "path";
|
|
|
|
// Directories to scan for content-identical .xlsx files.
// Usage: node <this-script> [dir ...]
// Any directories given on the command line replace the default below, so the
// audit can run on a machine where the hard-coded path does not exist.
const DEFAULT_DIRS = ["D:/tiennm99/thptqg2017/data"];
const cliDirs = process.argv.slice(2);
const dirs = cliDirs.length > 0 ? cliDirs : DEFAULT_DIRS;

// md5 hex digest -> full paths of every .xlsx file whose bytes hash to it.
const byHash = new Map();
for (const d of dirs) {
  // Non-recursive: only direct children of each directory are inspected.
  for (const f of fs.readdirSync(d)) {
    // Check the extension first so entries that are not .xlsx are never stat'ed.
    if (!f.endsWith(".xlsx")) continue;
    const full = path.join(d, f);
    if (!fs.statSync(full).isFile()) continue;

    // The whole file is read into memory and hashed; md5 is used here only as
    // a content fingerprint, not for anything security-sensitive.
    const h = crypto.createHash("md5").update(fs.readFileSync(full)).digest("hex");
    const group = byHash.get(h);
    if (group) {
      group.push(full);
    } else {
      byHash.set(h, [full]);
    }
  }
}

// Summary: files seen, distinct contents, and every group of identical files.
const groups = [...byHash.entries()];
const total = groups.reduce((s, [, a]) => s + a.length, 0);
const dupes = groups.filter(([, a]) => a.length > 1);
console.log(`Total files: ${total}`);
console.log(`Unique by md5: ${byHash.size}`);
console.log(`Duplicate groups: ${dupes.length}`);
for (const [h, a] of dupes) {
  // A 12-character digest prefix is enough to label a group in the report.
  console.log(` ${h.slice(0, 12)}:`);
  a.forEach((p) => console.log(` ${p}`));
}
|