mirror of
https://github.com/tiennm99/thptqg2016.git
synced 2026-05-14 08:58:47 +00:00
272 lines
8.5 KiB
JavaScript
272 lines
8.5 KiB
JavaScript
import XLSX from "xlsx";
|
|
import Database from "better-sqlite3";
|
|
import fs from "fs";
|
|
import path from "path";
|
|
import { fileURLToPath } from "url";
|
|
|
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
const DATA_DIR = path.join(__dirname, "..", "data");
|
|
const DB_PATH = path.join(__dirname, "..", "public", "thptqg2016.db");
|
|
|
|
// Score patterns for the DIEM_THI string format
|
|
const SCORE_PATTERNS = {
|
|
toan: /Toán:\s*([\d.]+)/,
|
|
ngu_van: /Ngữ văn:\s*([\d.]+)/,
|
|
vat_ly: /Vật lí:\s*([\d.]+)/,
|
|
hoa_hoc: /Hóa học:\s*([\d.]+)/,
|
|
sinh_hoc: /Sinh học:\s*([\d.]+)/,
|
|
lich_su: /Lịch sử:\s*([\d.]+)/,
|
|
dia_ly: /Địa lí:\s*([\d.]+)/,
|
|
tieng_anh: /Tiếng Anh:\s*([\d.]+)/,
|
|
tieng_phap: /Tiếng Pháp:\s*([\d.]+)/,
|
|
tieng_duc: /Tiếng Đức:\s*([\d.]+)/,
|
|
tieng_nhat: /Tiếng Nhật:\s*([\d.]+)/,
|
|
tieng_trung: /Tiếng Trung:\s*([\d.]+)/,
|
|
};
|
|
|
|
const ALL_SCORE_FIELDS = Object.keys(SCORE_PATTERNS);
|
|
|
|
// Strip Vietnamese diacritics: "NGUYỄN BŨU LỘC" → "nguyen buu loc"
|
|
function toAscii(str) {
|
|
return str
|
|
.normalize("NFD")
|
|
.replace(/[\u0300-\u036f]/g, "")
|
|
.replace(/đ/g, "d")
|
|
.replace(/Đ/g, "D")
|
|
.toLowerCase();
|
|
}
|
|
|
|
// Parse score text "Toán: 3.75 Ngữ văn: 5.00 ..." into { toan: 3.75, ... }
|
|
function parseScoreString(diemThi) {
|
|
const scores = {};
|
|
for (const [field, pattern] of Object.entries(SCORE_PATTERNS)) {
|
|
const match = diemThi.match(pattern);
|
|
if (match) scores[field] = parseFloat(match[1]);
|
|
}
|
|
return scores;
|
|
}
|
|
|
|
// Detect header row by checking for known column names
|
|
const KNOWN_HEADERS = new Set([
|
|
"SOBAODANH", "SBD", "HO_TEN", "HOTEN", "HỌ TÊN",
|
|
"NGAY_SINH", "TEN_CUMTHI", "GIOI_TINH", "DIEM_THI", "STT",
|
|
"TOAN", "VAN", "LY", "HOA", "SINH ", "SU", "DIA",
|
|
]);
|
|
|
|
function isHeaderRow(row) {
|
|
if (!row || row.length < 2) return false;
|
|
const first = String(row[0] || "").trim().toUpperCase();
|
|
return KNOWN_HEADERS.has(first);
|
|
}
|
|
|
|
// Detect which format a file uses based on its header row
|
|
function detectFormat(headerRow) {
|
|
if (!headerRow) return null;
|
|
const cols = headerRow.map((c) => String(c || "").trim().toUpperCase());
|
|
|
|
// Format: SBD, HOTEN, TOAN, VAN, LY, HOA, SINH, SU, DIA, ...
|
|
if (cols[0] === "SBD" && cols[2] === "TOAN") return "separate-scores";
|
|
|
|
// Build a column index map for flexible column ordering
|
|
const map = {};
|
|
for (let i = 0; i < cols.length; i++) {
|
|
const c = cols[i];
|
|
if (c === "SOBAODANH" || c === "SBD") map.sbd = i;
|
|
else if (c === "HO_TEN" || c === "HOTEN" || c === "HỌ TÊN") map.ho_ten = i;
|
|
else if (c === "NGAY_SINH") map.ngay_sinh = i;
|
|
else if (c === "TEN_CUMTHI") map.ten_cum_thi = i;
|
|
else if (c === "GIOI_TINH") map.gioi_tinh = i;
|
|
else if (c === "DIEM_THI") map.diem_thi = i;
|
|
}
|
|
|
|
if (map.sbd !== undefined && map.diem_thi !== undefined) {
|
|
return { type: "mapped", map };
|
|
}
|
|
|
|
return null;
|
|
}
|
|
|
|
// Process a file with separate score columns (dhhanghai format)
|
|
function processSeparateScoresRow(row) {
|
|
const sbd = String(row[0] || "").trim();
|
|
const hoTen = String(row[1] || "").trim();
|
|
if (!sbd || !hoTen) return null;
|
|
|
|
return {
|
|
so_bao_danh: sbd,
|
|
ho_ten: hoTen,
|
|
ho_ten_ascii: toAscii(hoTen),
|
|
ngay_sinh: null,
|
|
ten_cum_thi: null,
|
|
gioi_tinh: null,
|
|
toan: parseFloat(row[2]) || null,
|
|
ngu_van: parseFloat(row[3]) || null,
|
|
vat_ly: parseFloat(row[4]) || null,
|
|
hoa_hoc: parseFloat(row[5]) || null,
|
|
sinh_hoc: parseFloat(row[6]) || null,
|
|
lich_su: parseFloat(row[7]) || null,
|
|
dia_ly: parseFloat(row[8]) || null,
|
|
// row[9]=NGOAINGUTN, row[10]=NGOAINGUTL, row[11]=NGOAINGU (total)
|
|
tieng_anh: parseFloat(row[11]) || null,
|
|
tieng_phap: null,
|
|
tieng_duc: null,
|
|
tieng_nhat: null,
|
|
tieng_trung: null,
|
|
};
|
|
}
|
|
|
|
// Process a row using the column map
|
|
function processMappedRow(row, map) {
|
|
const sbd = String(row[map.sbd] || "").trim();
|
|
const hoTen = String(row[map.ho_ten] || "").trim();
|
|
if (!sbd || !hoTen) return null;
|
|
|
|
// Skip leaked header rows
|
|
const sbdUpper = sbd.toUpperCase();
|
|
if (KNOWN_HEADERS.has(sbdUpper) || KNOWN_HEADERS.has(hoTen.toUpperCase())) return null;
|
|
|
|
const ngaySinh = map.ngay_sinh !== undefined ? String(row[map.ngay_sinh] || "").trim() : null;
|
|
const tenCumThi = map.ten_cum_thi !== undefined ? String(row[map.ten_cum_thi] || "").trim() : null;
|
|
const rawGioiTinh = map.gioi_tinh !== undefined ? String(row[map.gioi_tinh] || "").trim() : null;
|
|
// Normalize gender: only accept "Nam" or "Nữ"
|
|
const gioiTinh = (rawGioiTinh === "Nam" || rawGioiTinh === "Nữ") ? rawGioiTinh : null;
|
|
const diemThi = map.diem_thi !== undefined ? String(row[map.diem_thi] || "") : "";
|
|
|
|
const scores = parseScoreString(diemThi);
|
|
|
|
return {
|
|
so_bao_danh: sbd,
|
|
ho_ten: hoTen,
|
|
ho_ten_ascii: toAscii(hoTen),
|
|
ngay_sinh: ngaySinh || null,
|
|
ten_cum_thi: tenCumThi || null,
|
|
gioi_tinh: gioiTinh || null,
|
|
...Object.fromEntries(ALL_SCORE_FIELDS.map((f) => [f, scores[f] ?? null])),
|
|
};
|
|
}
|
|
|
|
// Standard 6-column format without header: SBD, HO_TEN, NGAY_SINH, TEN_CUMTHI, GIOI_TINH, DIEM_THI
|
|
const DEFAULT_MAP = {
|
|
sbd: 0, ho_ten: 1, ngay_sinh: 2, ten_cum_thi: 3, gioi_tinh: 4, diem_thi: 5,
|
|
};
|
|
|
|
function main() {
|
|
fs.mkdirSync(path.dirname(DB_PATH), { recursive: true });
|
|
if (fs.existsSync(DB_PATH)) fs.unlinkSync(DB_PATH);
|
|
|
|
const db = new Database(DB_PATH);
|
|
|
|
db.exec(`
|
|
CREATE TABLE student (
|
|
so_bao_danh TEXT PRIMARY KEY,
|
|
ho_ten TEXT NOT NULL,
|
|
ho_ten_ascii TEXT NOT NULL,
|
|
ngay_sinh TEXT,
|
|
ten_cum_thi TEXT,
|
|
gioi_tinh TEXT,
|
|
toan REAL,
|
|
ngu_van REAL,
|
|
vat_ly REAL,
|
|
hoa_hoc REAL,
|
|
sinh_hoc REAL,
|
|
lich_su REAL,
|
|
dia_ly REAL,
|
|
tieng_anh REAL,
|
|
tieng_phap REAL,
|
|
tieng_duc REAL,
|
|
tieng_nhat REAL,
|
|
tieng_trung REAL
|
|
);
|
|
CREATE INDEX idx_ho_ten ON student(ho_ten);
|
|
CREATE INDEX idx_ho_ten_ascii ON student(ho_ten_ascii);
|
|
CREATE INDEX idx_ten_cum_thi ON student(ten_cum_thi);
|
|
`);
|
|
|
|
const insert = db.prepare(`
|
|
INSERT OR REPLACE INTO student
|
|
(so_bao_danh, ho_ten, ho_ten_ascii, ngay_sinh, ten_cum_thi, gioi_tinh,
|
|
toan, ngu_van, vat_ly, hoa_hoc, sinh_hoc, lich_su, dia_ly,
|
|
tieng_anh, tieng_phap, tieng_duc, tieng_nhat, tieng_trung)
|
|
VALUES
|
|
(@so_bao_danh, @ho_ten, @ho_ten_ascii, @ngay_sinh, @ten_cum_thi, @gioi_tinh,
|
|
@toan, @ngu_van, @vat_ly, @hoa_hoc, @sinh_hoc, @lich_su, @dia_ly,
|
|
@tieng_anh, @tieng_phap, @tieng_duc, @tieng_nhat, @tieng_trung)
|
|
`);
|
|
|
|
// Collect all Excel files (.xlsx and .xls)
|
|
const files = fs.readdirSync(DATA_DIR)
|
|
.filter((f) => f.endsWith(".xlsx") || f.endsWith(".xls"))
|
|
.map((f) => path.join(DATA_DIR, f));
|
|
|
|
let totalRows = 0;
|
|
let errorCount = 0;
|
|
|
|
const insertAll = db.transaction((files) => {
|
|
for (const file of files) {
|
|
const basename = path.basename(file);
|
|
let fileRows = 0;
|
|
|
|
try {
|
|
const wb = XLSX.readFile(file);
|
|
const ws = wb.Sheets[wb.SheetNames[0]];
|
|
const rows = XLSX.utils.sheet_to_json(ws, { header: 1 });
|
|
if (rows.length === 0) continue;
|
|
|
|
let startRow = 0;
|
|
let format = null;
|
|
|
|
if (isHeaderRow(rows[0])) {
|
|
format = detectFormat(rows[0]);
|
|
startRow = 1;
|
|
}
|
|
|
|
for (let i = startRow; i < rows.length; i++) {
|
|
const row = rows[i];
|
|
if (!row || row.length < 2) continue;
|
|
|
|
try {
|
|
let record;
|
|
|
|
if (format === "separate-scores") {
|
|
record = processSeparateScoresRow(row);
|
|
} else if (format && format.type === "mapped") {
|
|
record = processMappedRow(row, format.map);
|
|
} else {
|
|
// No header or unrecognized: assume standard 6-column order
|
|
record = processMappedRow(row, DEFAULT_MAP);
|
|
}
|
|
|
|
if (!record) continue;
|
|
insert.run(record);
|
|
fileRows++;
|
|
} catch {
|
|
errorCount++;
|
|
}
|
|
}
|
|
} catch (err) {
|
|
console.error(`Failed to read ${basename}: ${err.message}`);
|
|
}
|
|
|
|
totalRows += fileRows;
|
|
console.log(` ${basename}: ${fileRows} rows`);
|
|
}
|
|
});
|
|
|
|
console.log(`Processing ${files.length} Excel files...\n`);
|
|
insertAll(files);
|
|
|
|
db.exec("VACUUM");
|
|
|
|
const count = db.prepare("SELECT COUNT(*) as cnt FROM student").get();
|
|
console.log(`\nDone! ${count.cnt} students in database.`);
|
|
console.log(`Errors skipped: ${errorCount}`);
|
|
console.log(`Output: ${DB_PATH}`);
|
|
|
|
const stat = fs.statSync(DB_PATH);
|
|
console.log(`Size: ${(stat.size / 1024 / 1024).toFixed(1)} MB`);
|
|
|
|
db.close();
|
|
}
|
|
|
|
main();
|