// File: thptqg2016/scripts/build-database.js — JavaScript (listing metadata: 272 lines, 8.5 KiB)

import XLSX from "xlsx";
import Database from "better-sqlite3";
import fs from "fs";
import path from "path";
import { fileURLToPath } from "url";
// Absolute directory of this script; ES modules do not provide __dirname themselves.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Both the input and the output folders live one level above this script's directory.
const PROJECT_ROOT = path.resolve(__dirname, "..");
// Folder that is scanned for the input Excel workbooks.
const DATA_DIR = path.join(PROJECT_ROOT, "data");
// Path of the SQLite database file produced by the build.
const DB_PATH = path.join(PROJECT_ROOT, "public", "thptqg2016.db");
// Subject label as it is written inside the DIEM_THI text, keyed by database column name.
const SUBJECT_LABELS = {
  toan: "Toán",
  ngu_van: "Ngữ văn",
  vat_ly: "Vật lí",
  hoa_hoc: "Hóa học",
  sinh_hoc: "Sinh học",
  lich_su: "Lịch sử",
  dia_ly: "Địa lí",
  tieng_anh: "Tiếng Anh",
  tieng_phap: "Tiếng Pháp",
  tieng_duc: "Tiếng Đức",
  tieng_nhat: "Tiếng Nhật",
  tieng_trung: "Tiếng Trung",
};
// One regex per subject: "<label>:", optional whitespace, then a captured run of digits and dots.
const SCORE_PATTERNS = Object.fromEntries(
  Object.entries(SUBJECT_LABELS).map(([field, label]) => [
    field,
    new RegExp(`${label}:\\s*([\\d.]+)`),
  ]),
);
// Score column names, in the same order as the pattern table.
const ALL_SCORE_FIELDS = Object.keys(SCORE_PATTERNS);
// Lowercase a string and remove Vietnamese diacritics,
// e.g. "NGUYỄN BŨU LỘC" → "nguyen buu loc".
function toAscii(str) {
  // NFD splits each accented letter into a base letter followed by combining marks.
  const decomposed = str.normalize("NFD");
  // Remove the combining marks (U+0300–U+036F) so only base letters remain.
  const unmarked = decomposed.replace(/[\u0300-\u036f]/g, "");
  // "đ"/"Đ" are not touched by the step above, so they are mapped explicitly.
  const latinOnly = unmarked.replace(/[đĐ]/g, (ch) => (ch === "đ" ? "d" : "D"));
  return latinOnly.toLowerCase();
}
// Turn a DIEM_THI text such as "Toán: 3.75 Ngữ văn: 5.00 ..." into an object
// like { toan: 3.75, ngu_van: 5 }. Subjects whose pattern does not match are
// left out of the result entirely.
function parseScoreString(diemThi) {
  return Object.entries(SCORE_PATTERNS).reduce((scores, [field, pattern]) => {
    const found = diemThi.match(pattern);
    if (found) {
      // Capture group 1 is the numeric text that follows the subject label.
      scores[field] = parseFloat(found[1]);
    }
    return scores;
  }, {});
}
// Column names that mark a header row. Every lookup against this set is done
// on a trimmed, upper-cased value, so entries must not carry stray whitespace
// (the former "SINH " entry could never match for that reason).
const KNOWN_HEADERS = new Set([
  "SOBAODANH", "SBD", "HO_TEN", "HOTEN", "HỌ TÊN",
  "NGAY_SINH", "TEN_CUMTHI", "GIOI_TINH", "DIEM_THI", "STT",
  "TOAN", "VAN", "LY", "HOA", "SINH", "SU", "DIA",
]);
// Report whether `row` looks like a header line.
// Only the first cell is inspected: the row counts as a header when that cell,
// trimmed and upper-cased, is one of KNOWN_HEADERS. Missing rows and rows with
// fewer than two cells are never treated as headers.
function isHeaderRow(row) {
  if (!row || row.length < 2) return false;
  const firstCell = String(row[0] || "").trim().toUpperCase();
  return KNOWN_HEADERS.has(firstCell);
}
// Work out the layout of a file from its header row.
// Returns:
//   - "separate-scores" when the first header is SBD and the third is TOAN
//     (one column per subject),
//   - { type: "mapped", map } when both an SBD column and a DIEM_THI column
//     are present; `map` holds the column index of every recognized header,
//   - null when there is no header row or the layout is not recognized.
function detectFormat(headerRow) {
  if (!headerRow) return null;

  const names = headerRow.map((cell) => String(cell || "").trim().toUpperCase());
  if (names[0] === "SBD" && names[2] === "TOAN") return "separate-scores";

  // Accepted header spellings → key used in the column map.
  const fieldByHeader = new Map([
    ["SOBAODANH", "sbd"],
    ["SBD", "sbd"],
    ["HO_TEN", "ho_ten"],
    ["HOTEN", "ho_ten"],
    ["HỌ TÊN", "ho_ten"],
    ["NGAY_SINH", "ngay_sinh"],
    ["TEN_CUMTHI", "ten_cum_thi"],
    ["GIOI_TINH", "gioi_tinh"],
    ["DIEM_THI", "diem_thi"],
  ]);

  // Columns may come in any order; when a header repeats, the last one wins.
  const map = {};
  names.forEach((name, index) => {
    const field = fieldByHeader.get(name);
    if (field !== undefined) map[field] = index;
  });

  const hasRequired = map.sbd !== undefined && map.diem_thi !== undefined;
  return hasRequired ? { type: "mapped", map } : null;
}
// Convert one row of a file that has a separate column per subject
// (dhhanghai format) into a student record.
// Column layout: 0 = SBD, 1 = full name, 2..8 = TOAN, VAN, LY, HOA, SINH, SU, DIA,
// 9 = NGOAINGUTN, 10 = NGOAINGUTL, 11 = NGOAINGU (total).
// Returns null when the candidate number or the name is empty.
// Birth date, exam cluster and gender are not available in this layout and
// are stored as null.
function processSeparateScoresRow(row) {
  // Blank or non-numeric cells become null, but a genuine score of 0 is kept.
  // The previous `parseFloat(x) || null` turned 0 into null, unlike the
  // DIEM_THI-based path, which preserves zero scores.
  // NOTE(review): if this source uses 0 as a "did not sit the subject"
  // placeholder, that needs explicit handling here — confirm against the data.
  const toScore = (cell) => {
    const value = parseFloat(cell);
    return Number.isFinite(value) ? value : null;
  };

  const sbd = String(row[0] || "").trim();
  const hoTen = String(row[1] || "").trim();
  if (!sbd || !hoTen) return null;

  return {
    so_bao_danh: sbd,
    ho_ten: hoTen,
    ho_ten_ascii: toAscii(hoTen),
    ngay_sinh: null,
    ten_cum_thi: null,
    gioi_tinh: null,
    toan: toScore(row[2]),
    ngu_van: toScore(row[3]),
    vat_ly: toScore(row[4]),
    hoa_hoc: toScore(row[5]),
    sinh_hoc: toScore(row[6]),
    lich_su: toScore(row[7]),
    dia_ly: toScore(row[8]),
    // row[9]=NGOAINGUTN, row[10]=NGOAINGUTL, row[11]=NGOAINGU (total).
    // The foreign-language total is stored in the English column.
    tieng_anh: toScore(row[11]),
    tieng_phap: null,
    tieng_duc: null,
    tieng_nhat: null,
    tieng_trung: null,
  };
}
// Convert one data row into a student record, using `map` to locate each column.
// `map` holds column indexes under the keys sbd, ho_ten, ngay_sinh, ten_cum_thi,
// gioi_tinh and diem_thi; any of the optional ones may be absent.
// Returns null when the candidate number or the name is empty, or when the row
// is a header line repeated inside the data.
function processMappedRow(row, map) {
  // Trimmed text of an optional column; null when the column is not in the map.
  const readText = (index) => (index !== undefined ? String(row[index] || "").trim() : null);

  const sbd = String(row[map.sbd] || "").trim();
  const hoTen = String(row[map.ho_ten] || "").trim();
  if (sbd === "" || hoTen === "") return null;

  // A leaked header shows up as a known column name in either key field.
  const looksLikeHeader = [sbd, hoTen].some((text) => KNOWN_HEADERS.has(text.toUpperCase()));
  if (looksLikeHeader) return null;

  const ngaySinh = readText(map.ngay_sinh);
  const tenCumThi = readText(map.ten_cum_thi);

  // Gender is kept only when it is exactly "Nam" or "Nữ"; anything else becomes null.
  const rawGioiTinh = readText(map.gioi_tinh);
  const gioiTinh = ["Nam", "Nữ"].includes(rawGioiTinh) ? rawGioiTinh : null;

  const diemThi = map.diem_thi === undefined ? "" : String(row[map.diem_thi] || "");
  const scores = parseScoreString(diemThi);

  const record = {
    so_bao_danh: sbd,
    ho_ten: hoTen,
    ho_ten_ascii: toAscii(hoTen),
    // Empty strings are stored as null.
    ngay_sinh: ngaySinh || null,
    ten_cum_thi: tenCumThi || null,
    gioi_tinh: gioiTinh,
  };
  // Every score column is always present; subjects without a parsed score get null.
  for (const field of ALL_SCORE_FIELDS) {
    record[field] = scores[field] ?? null;
  }
  return record;
}
// Column map for files without a header row. The position of each key in the
// list is its column index: SBD, HO_TEN, NGAY_SINH, TEN_CUMTHI, GIOI_TINH, DIEM_THI.
const DEFAULT_MAP = Object.fromEntries(
  ["sbd", "ho_ten", "ngay_sinh", "ten_cum_thi", "gioi_tinh", "diem_thi"].map(
    (key, column) => [key, column],
  ),
);
// Rebuild the SQLite database at DB_PATH from every Excel workbook in DATA_DIR.
// Steps: delete any previous database, create the `student` table and its
// indexes, import the first sheet of each workbook inside one transaction,
// VACUUM, then print a summary.
// A workbook that cannot be read is reported and skipped. A row that fails to
// convert or insert is counted, and the first few per file are logged.
// The database handle is closed even when the build throws.
function main() {
  // Cap on row-level error messages per file so one bad workbook cannot flood the log.
  const MAX_LOGGED_ROW_ERRORS = 5;

  fs.mkdirSync(path.dirname(DB_PATH), { recursive: true });
  // Always start from an empty file so rows from an earlier build never survive.
  if (fs.existsSync(DB_PATH)) fs.unlinkSync(DB_PATH);

  const db = new Database(DB_PATH);
  try {
    db.exec(`
      CREATE TABLE student (
        so_bao_danh TEXT PRIMARY KEY,
        ho_ten TEXT NOT NULL,
        ho_ten_ascii TEXT NOT NULL,
        ngay_sinh TEXT,
        ten_cum_thi TEXT,
        gioi_tinh TEXT,
        toan REAL,
        ngu_van REAL,
        vat_ly REAL,
        hoa_hoc REAL,
        sinh_hoc REAL,
        lich_su REAL,
        dia_ly REAL,
        tieng_anh REAL,
        tieng_phap REAL,
        tieng_duc REAL,
        tieng_nhat REAL,
        tieng_trung REAL
      );
      CREATE INDEX idx_ho_ten ON student(ho_ten);
      CREATE INDEX idx_ho_ten_ascii ON student(ho_ten_ascii);
      CREATE INDEX idx_ten_cum_thi ON student(ten_cum_thi);
    `);

    // OR REPLACE: a later row with the same so_bao_danh overwrites the earlier one.
    const insert = db.prepare(`
      INSERT OR REPLACE INTO student
        (so_bao_danh, ho_ten, ho_ten_ascii, ngay_sinh, ten_cum_thi, gioi_tinh,
         toan, ngu_van, vat_ly, hoa_hoc, sinh_hoc, lich_su, dia_ly,
         tieng_anh, tieng_phap, tieng_duc, tieng_nhat, tieng_trung)
      VALUES
        (@so_bao_danh, @ho_ten, @ho_ten_ascii, @ngay_sinh, @ten_cum_thi, @gioi_tinh,
         @toan, @ngu_van, @vat_ly, @hoa_hoc, @sinh_hoc, @lich_su, @dia_ly,
         @tieng_anh, @tieng_phap, @tieng_duc, @tieng_nhat, @tieng_trung)
    `);

    // Collect all Excel files (.xlsx and .xls). The extension is matched
    // case-insensitively so "FILE.XLS" is not silently ignored, and Excel's
    // "~$..." lock files (left behind while a workbook is open) are skipped.
    const files = fs.readdirSync(DATA_DIR)
      .filter((f) => /\.xlsx?$/i.test(f) && !f.startsWith("~$"))
      .map((f) => path.join(DATA_DIR, f));

    let totalRows = 0;
    let errorCount = 0;

    const insertAll = db.transaction((workbookPaths) => {
      for (const file of workbookPaths) {
        const basename = path.basename(file);
        let fileRows = 0;
        let fileErrors = 0;
        try {
          const wb = XLSX.readFile(file);
          // Only the first sheet of each workbook is imported.
          const ws = wb.Sheets[wb.SheetNames[0]];
          // header: 1 makes each row a plain array of cell values.
          const rows = XLSX.utils.sheet_to_json(ws, { header: 1 });
          if (rows.length === 0) continue;

          let startRow = 0;
          let format = null;
          if (isHeaderRow(rows[0])) {
            format = detectFormat(rows[0]);
            startRow = 1;
          }

          for (let i = startRow; i < rows.length; i++) {
            const row = rows[i];
            if (!row || row.length < 2) continue;
            try {
              let record;
              if (format === "separate-scores") {
                record = processSeparateScoresRow(row);
              } else if (format && format.type === "mapped") {
                record = processMappedRow(row, format.map);
              } else {
                // No header or unrecognized: assume standard 6-column order
                record = processMappedRow(row, DEFAULT_MAP);
              }
              if (!record) continue;
              insert.run(record);
              fileRows++;
            } catch (err) {
              // Keep going on a bad row, but leave a trace of what went wrong.
              errorCount++;
              fileErrors++;
              if (fileErrors <= MAX_LOGGED_ROW_ERRORS) {
                console.error(`  ${basename} row ${i + 1}: ${err?.message ?? err}`);
              }
            }
          }
        } catch (err) {
          console.error(`Failed to read ${basename}: ${err.message}`);
        }
        if (fileErrors > MAX_LOGGED_ROW_ERRORS) {
          console.error(`  ${basename}: ${fileErrors - MAX_LOGGED_ROW_ERRORS} more row errors not shown`);
        }
        totalRows += fileRows;
        console.log(`  ${basename}: ${fileRows} rows`);
      }
    });

    console.log(`Processing ${files.length} Excel files...\n`);
    insertAll(files);
    db.exec("VACUUM");

    const count = db.prepare("SELECT COUNT(*) as cnt FROM student").get();
    console.log(`\nDone! ${count.cnt} students in database.`);
    // A gap between rows inserted and students stored means duplicate
    // so_bao_danh values were replaced.
    console.log(`Rows inserted: ${totalRows}`);
    console.log(`Errors skipped: ${errorCount}`);
    console.log(`Output: ${DB_PATH}`);
    const stat = fs.statSync(DB_PATH);
    console.log(`Size: ${(stat.size / 1024 / 1024).toFixed(1)} MB`);
  } finally {
    // Release the file handle even when the build fails part-way.
    db.close();
  }
}
// Entry point: run the build as soon as this module is loaded.
main();