Files
thptqg2017/scripts/build-database.js
T
tiennm99 8f86e4dc3b feat: refresh data from baotintuc.vn source, fix overflow sheet loss
Dataset update:
- Crawl all 63 .xls province files from baotintuc.vn CDN (original source)
- Old xlsx dataset moved to data-old/ for reference
- Net: +13,719 students (Hà Nội +7,275, HCM +6,445) — the old .xls → xlsx
  conversion silently dropped rows beyond the 65,536 per-sheet cap
- Also removes 1 bogus header row that had leaked into the old DB
- 100% identical scores on the 847,348 SBDs present in both datasets

Build pipeline:
- build-database.js: iterate ALL sheets per workbook (fixes the overflow
  loss) and accept .xls in addition to .xlsx

Audit tooling:
- scripts/crawl-baotintuc.js: idempotent 63-province downloader
- scripts/diff-datasets.js: compares two DBs by SBD set and per-column
  score deltas
2026-04-14 21:42:29 +07:00

211 lines
6.7 KiB
JavaScript

import XLSX from "xlsx";
import Database from "better-sqlite3";
import fs from "fs";
import path from "path";
import { fileURLToPath } from "url";
// ES modules have no built-in __dirname, so derive it from this module's URL.
// Every path below is anchored to the script's own location, not to the
// current working directory.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Parent directory of the folder containing this script.
const PROJECT_ROOT = path.join(__dirname, "..");
// Directory scanned for the raw Excel workbooks.
const RAW_DIR = path.join(PROJECT_ROOT, "data");
// Location of the SQLite database file produced by the build.
const DB_PATH = path.join(PROJECT_ROOT, "public", "thptqg2017.db");
// Capture group for one score value. The source data writes scores both as
// "X.YZ" and as a bare "X", so the fractional part is optional.
const NUM = "(\\d+(?:\\.\\d+)?)";
// [score field name, subject label as written in the score text].
// The order here is the key order of SCORE_PATTERNS.
const SUBJECT_LABELS = [
  ["toan", "Toán"],
  ["ngu_van", "Ngữ văn"],
  ["vat_ly", "Vật lí"],
  ["hoa_hoc", "Hóa học"],
  ["sinh_hoc", "Sinh học"],
  ["khtn", "KHTN"],
  ["lich_su", "Lịch sử"],
  ["dia_ly", "Địa lí"],
  ["gdcd", "GDCD"],
  ["khxh", "KHXH"],
  ["tieng_anh", "Tiếng Anh"],
  ["tieng_phap", "Tiếng Pháp"],
  ["tieng_nga", "Tiếng Nga"],
  ["tieng_trung", "Tiếng Trung"],
];
// Score field name -> RegExp matching "<label>:<optional whitespace><number>",
// with the number available as capture group 1.
const SCORE_PATTERNS = Object.fromEntries(
  SUBJECT_LABELS.map(([field, label]) => [
    field,
    new RegExp(`${label}:\\s*${NUM}`),
  ]),
);
/**
 * Fold a string to lowercase text without diacritics, for accent-insensitive
 * name search.
 *
 * @param {string} str - Text to fold (e.g. a Vietnamese full name).
 * @returns {string} `str` with combining marks removed, "đ"/"Đ" mapped to
 *   "d", and everything lowercased.
 */
function toAscii(str) {
  // Decompose so each accent becomes a separate combining mark...
  const decomposed = str.normalize("NFD");
  // ...then drop the combining marks (U+0300..U+036F).
  const withoutMarks = decomposed.replace(/[\u0300-\u036f]/g, "");
  // "đ" (U+0111) / "Đ" (U+0110) have no decomposition; map them explicitly.
  const withPlainD = withoutMarks.replace(/[\u0111\u0110]/g, "d");
  return withPlainD.toLowerCase();
}
/**
 * List the Excel workbooks (.xls / .xlsx) located directly inside a directory.
 *
 * The extension check is case-insensitive, so "HANOI.XLS" is picked up just
 * like "hanoi.xls". Sub-directories are not descended into, and entries that
 * are not regular files are ignored even if their name looks like a workbook.
 * The result is sorted by file name so that the processing order does not
 * depend on the order in which the file system lists entries.
 *
 * @param {string} [dir=RAW_DIR] - Directory to scan.
 * @returns {string[]} Paths of the matching files (`dir` joined with each
 *   name), sorted by file name.
 * @throws {Error} Whatever `fs.readdirSync` / `fs.statSync` throw, e.g. when
 *   `dir` does not exist.
 */
function collectExcelFiles(dir = RAW_DIR) {
  const excelName = /\.xlsx?$/i;
  return fs
    .readdirSync(dir)
    .filter((name) => excelName.test(name))
    .sort()
    .map((name) => path.join(dir, name))
    .filter((full) => fs.statSync(full).isFile());
}
/**
 * Extract the individual subject scores from a score text.
 *
 * Each pattern in SCORE_PATTERNS is tried against the text; a subject appears
 * in the result only when its pattern matches.
 *
 * @param {string} diemThi - Score text to scan.
 * @returns {Object<string, number>} Matched scores keyed by SCORE_PATTERNS
 *   field name, each parsed from the pattern's first capture group.
 */
function parseScores(diemThi) {
  return Object.keys(SCORE_PATTERNS).reduce((found, subject) => {
    const hit = diemThi.match(SCORE_PATTERNS[subject]);
    if (hit !== null) {
      found[subject] = Number.parseFloat(hit[1]);
    }
    return found;
  }, {});
}
/**
 * Tell whether a worksheet row is a column-header row rather than student data.
 *
 * A row counts as a header when it has at least three cells and its first
 * cell is one of the known header labels. The comparison ignores case,
 * surrounding whitespace and Unicode composition (NFC vs. NFD), so a padded
 * or differently-encoded label is still recognised instead of being treated
 * as a student row.
 *
 * @param {Array<*>|null|undefined} row - Row as an array of cell values.
 * @returns {boolean} true when the row is a header row.
 */
function isHeaderRow(row) {
  if (!row || row.length < 3) return false;
  // Canonical form used on both sides of the comparison.
  const canonical = (text) => text.trim().toUpperCase().normalize("NFC");
  const first = canonical(String(row[0] || ""));
  return ["HO_TEN", "HỌ TÊN", "STT"].some((label) => canonical(label) === first);
}
/**
 * Map one worksheet row to the named parameters of the student INSERT.
 *
 * Cells are read positionally: 0 -> ho_ten, 1 -> ngay_sinh, 2 -> so_bao_danh,
 * 3 -> score text handed to parseScores(). Subjects missing from the score
 * text are bound as NULL.
 *
 * @param {Array<*>|undefined} row - Row as an array of cell values.
 * @returns {Object|null} Parameters for the INSERT, or null when the row has
 *   no name or no so_bao_danh (e.g. empty tail padding in .xls files).
 */
function toStudentParams(row) {
  const cell = (index) => String(row?.[index] || "");
  const hoTen = cell(0).trim();
  const ngaySinh = cell(1).trim();
  const soBaoDanh = cell(2).trim();
  if (!soBaoDanh || !hoTen) return null;
  const scores = parseScores(cell(3));
  return {
    so_bao_danh: soBaoDanh,
    ho_ten: hoTen,
    ho_ten_ascii: toAscii(hoTen),
    ngay_sinh: ngaySinh || null,
    toan: scores.toan ?? null,
    ngu_van: scores.ngu_van ?? null,
    vat_ly: scores.vat_ly ?? null,
    hoa_hoc: scores.hoa_hoc ?? null,
    sinh_hoc: scores.sinh_hoc ?? null,
    khtn: scores.khtn ?? null,
    lich_su: scores.lich_su ?? null,
    dia_ly: scores.dia_ly ?? null,
    gdcd: scores.gdcd ?? null,
    khxh: scores.khxh ?? null,
    tieng_anh: scores.tieng_anh ?? null,
    tieng_phap: scores.tieng_phap ?? null,
    tieng_nga: scores.tieng_nga ?? null,
    tieng_trung: scores.tieng_trung ?? null,
  };
}

/**
 * Rebuild the SQLite database at DB_PATH from the Excel workbooks returned by
 * collectExcelFiles().
 *
 * Any existing database file is deleted first. Every sheet of every workbook
 * is read; a leading header row (per isHeaderRow) is skipped on each sheet.
 * Rows are written with INSERT OR REPLACE, so a later row with the same
 * so_bao_danh overwrites an earlier one. A row that fails to insert and a
 * workbook that fails to load are logged and counted, and the build continues.
 * The database handle is closed even when the build throws.
 */
function main() {
  // Ensure output directory exists
  fs.mkdirSync(path.dirname(DB_PATH), { recursive: true });
  // Remove old database if exists
  if (fs.existsSync(DB_PATH)) fs.unlinkSync(DB_PATH);
  const db = new Database(DB_PATH);
  try {
    db.exec(`
      CREATE TABLE student (
        so_bao_danh TEXT PRIMARY KEY,
        ho_ten TEXT NOT NULL,
        ho_ten_ascii TEXT NOT NULL,
        ngay_sinh TEXT,
        toan REAL,
        ngu_van REAL,
        vat_ly REAL,
        hoa_hoc REAL,
        sinh_hoc REAL,
        khtn REAL,
        lich_su REAL,
        dia_ly REAL,
        gdcd REAL,
        khxh REAL,
        tieng_anh REAL,
        tieng_phap REAL,
        tieng_nga REAL,
        tieng_trung REAL
      );
      CREATE INDEX idx_ho_ten ON student(ho_ten);
      CREATE INDEX idx_ho_ten_ascii ON student(ho_ten_ascii);
    `);
    const insert = db.prepare(`
      INSERT OR REPLACE INTO student
        (so_bao_danh, ho_ten, ho_ten_ascii, ngay_sinh,
         toan, ngu_van, vat_ly, hoa_hoc, sinh_hoc, khtn,
         lich_su, dia_ly, gdcd, khxh,
         tieng_anh, tieng_phap, tieng_nga, tieng_trung)
      VALUES
        (@so_bao_danh, @ho_ten, @ho_ten_ascii, @ngay_sinh,
         @toan, @ngu_van, @vat_ly, @hoa_hoc, @sinh_hoc, @khtn,
         @lich_su, @dia_ly, @gdcd, @khxh,
         @tieng_anh, @tieng_phap, @tieng_nga, @tieng_trung)
    `);
    const files = collectExcelFiles();
    let errorCount = 0; // rows that threw while being parsed or inserted
    let failedFiles = 0; // workbooks that could not be loaded at all
    // Wrap all inserts in a single transaction for speed
    const insertAll = db.transaction((workbookPaths) => {
      for (const file of workbookPaths) {
        const basename = path.basename(file);
        let fileRows = 0;
        try {
          const wb = XLSX.readFile(file);
          // Iterate ALL sheets — .xls files over 65,536 rows (Hà Nội, HCM)
          // split into continuation sheets that we must not miss.
          for (const sheetName of wb.SheetNames) {
            const rows = XLSX.utils.sheet_to_json(wb.Sheets[sheetName], {
              header: 1,
            });
            // Continuation sheets may repeat the header or start directly
            // with data; isHeaderRow() tells the two apart.
            const firstDataRow = isHeaderRow(rows[0]) ? 1 : 0;
            for (let i = firstDataRow; i < rows.length; i++) {
              try {
                const params = toStudentParams(rows[i]);
                if (params === null) continue;
                insert.run(params);
                fileRows++;
              } catch (err) {
                errorCount++;
                // Only the first few failures are printed to keep logs short.
                if (errorCount <= 5) {
                  // Row number is the 1-based position in the array that
                  // sheet_to_json returned for this sheet.
                  console.warn(
                    ` [warn] ${basename} [${sheetName}] row ${i + 1}: ${err.message}`,
                  );
                }
              }
            }
          }
        } catch (err) {
          failedFiles++;
          console.error(`Failed to read ${basename}: ${err.message}`);
        }
        console.log(` ${basename}: ${fileRows} rows`);
      }
    });
    console.log(`Processing ${files.length} Excel files...\n`);
    insertAll(files);
    // Compact the database
    db.exec("VACUUM");
    const count = db.prepare("SELECT COUNT(*) as cnt FROM student").get();
    console.log(`\nDone! ${count.cnt} students in database.`);
    console.log(`Errors skipped: ${errorCount}`);
    console.log(`Unreadable files: ${failedFiles}`);
    console.log(`Output: ${DB_PATH}`);
    const stat = fs.statSync(DB_PATH);
    console.log(`Size: ${(stat.size / 1024 / 1024).toFixed(1)} MB`);
  } finally {
    // Release the database handle on success and on failure alike.
    db.close();
  }
}

// Run the build when this script is executed.
main();