Files
thptqg2017/scripts/check-duplicates.js
T
tiennm99 f10046f63d chore: remove duplicate Excel files, add md5 audit script
- Drop 10_LamDong_GNFT (1) and 2.BacKan_YQNX(1): identical row content to
  siblings (Excel metadata differs but file size & sheet rows match)
- Add scripts/check-duplicates.js to detect byte-identical and row-identical
  files across data/raw and data/raw/update
2026-04-14 20:49:41 +07:00

30 lines
958 B
JavaScript

// One-off audit: detect byte-identical Excel files via md5 of the raw file bytes
import crypto from "crypto";
import fs from "fs";
import path from "path";
import { fileURLToPath } from "url";
// Directories scanned when no CLI arguments are given. They are resolved
// relative to this script file rather than through a machine-specific absolute
// path, so the audit works from any checkout location.
// NOTE(review): assumes this script lives in <repo>/scripts and the data in
// <repo>/data/raw — confirm against the repository layout.
const scriptDir = path.dirname(fileURLToPath(import.meta.url));
const defaultDirs = [
  path.join(scriptDir, "..", "data", "raw"),
  path.join(scriptDir, "..", "data", "raw", "update"),
];

/**
 * Compute the md5 digest of a file's raw bytes.
 *
 * @param {string} file - Path of the file to hash.
 * @returns {string} Hex-encoded md5 digest.
 */
function md5OfFile(file) {
  return crypto.createHash("md5").update(fs.readFileSync(file)).digest("hex");
}

/**
 * Group the .xlsx files found directly inside the given directories by the
 * md5 of their bytes. The scan is not recursive, and the extension match is
 * case-insensitive so that "report.XLSX" is audited too.
 *
 * @param {string[]} dirs - Directories to scan.
 * @returns {Map<string, string[]>} md5 hex digest -> paths of the files that
 *   hash to it, in scan order.
 * @throws {Error} Any filesystem error other than a missing directory.
 */
function groupXlsxByHash(dirs) {
  const groups = new Map();
  for (const dir of dirs) {
    let names;
    try {
      names = fs.readdirSync(dir);
    } catch (err) {
      // A missing directory should not abort the audit of the remaining
      // ones; report it and move on. Anything else is a real failure.
      if (err.code !== "ENOENT" && err.code !== "ENOTDIR") throw err;
      console.warn(`Skipping missing directory: ${dir}`);
      continue;
    }
    for (const name of names) {
      // Check the extension first so non-Excel entries are never stat'ed.
      if (!name.toLowerCase().endsWith(".xlsx")) continue;
      const full = path.join(dir, name);
      if (!fs.statSync(full).isFile()) continue;
      const hash = md5OfFile(full);
      const files = groups.get(hash);
      if (files) {
        files.push(full);
      } else {
        groups.set(hash, [full]);
      }
    }
  }
  return groups;
}

// Usage: node check-duplicates.js [dir ...]
// Directories given on the command line replace the defaults.
const cliDirs = process.argv.slice(2);
const dirs = cliDirs.length > 0 ? cliDirs : defaultDirs;

const byHash = groupXlsxByHash(dirs);
const total = [...byHash.values()].reduce((sum, files) => sum + files.length, 0);
const dupes = [...byHash].filter(([, files]) => files.length > 1);

console.log(`Total files: ${total}`);
console.log(`Unique by md5: ${byHash.size}`);
console.log(`Duplicate groups: ${dupes.length}`);
for (const [hash, files] of dupes) {
  // Only the first 12 hex characters of the digest are printed as a label.
  console.log(` ${hash.slice(0, 12)}:`);
  for (const file of files) {
    console.log(` ${file}`);
  }
}