/*
 * Mirror of https://github.com/tiennm99/thptqg2017.git
 * (synced 2026-06-04 10:13:55 +00:00), commit 1cf65be51c:
 *   - Node script parses 119 Excel files into SQLite (847K students)
 *   - Vite + React frontend with sql.js for client-side querying
 *   - Search by exam ID (số báo danh) or student name
 *   - Gzipped DB (36MB) with download progress bar
 *   - GitHub Actions workflow for GitHub Pages deployment
 * This file: 186 lines, 5.4 KiB, JavaScript.
 */
const XLSX = require("xlsx");
|
|
const Database = require("better-sqlite3");
|
|
const fs = require("fs");
|
|
const path = require("path");
|
|
|
|
// Directory holding the raw Excel workbooks, located relative to this script.
const RAW_DIR = path.resolve(
  __dirname,
  "..",
  "src",
  "main",
  "resources",
  "raw",
);

// Destination of the generated SQLite database, also relative to this script.
const DB_PATH = path.resolve(
  __dirname,
  "..",
  "web",
  "public",
  "thptqg2017.db",
);
|
|
|
|
// Score patterns, keyed by database column name. Each one finds
// "<subject label>: <number>" in the score text and captures the number.
//
// The original pattern (\d*\.\d*) mirrored the Java Converter.java regex, but
// it required a decimal point (so an integer score such as "10" was missed)
// and it accepted a lone "." (which parses to NaN). The number part below
// accepts "10", "9.", "9.25" and ".5", and nothing without a digit.
const SCORE_PATTERNS = Object.fromEntries(
  Object.entries({
    toan: "Toán",
    ngu_van: "Ngữ văn",
    vat_ly: "Vật lí",
    hoa_hoc: "Hóa học",
    sinh_hoc: "Sinh học",
    khtn: "KHTN",
    lich_su: "Lịch sử",
    dia_ly: "Địa lí",
    gdcd: "GDCD",
    khxh: "KHXH",
    tieng_anh: "Tiếng Anh",
  }).map(([field, label]) => [
    field,
    // Labels contain no regex metacharacters, so they are embedded verbatim.
    new RegExp(`${label}:\\s*(\\d+(?:\\.\\d*)?|\\.\\d+)`),
  ]),
);
|
|
|
|
/**
 * Collect the .xlsx workbooks to import, in processing order.
 *
 * Files directly inside `rawDir` come first, followed by files inside its
 * optional "(update)" sub-directory, so that rows from update files are
 * written last and replace earlier rows with the same key (the importer uses
 * INSERT OR REPLACE).
 *
 * Within each directory the names are sorted: `fs.readdirSync` does not
 * promise an order, and because later rows overwrite earlier ones the order
 * must be reproducible across machines.
 *
 * @param {string} [rawDir=RAW_DIR] - Directory to scan (not recursive).
 * @returns {string[]} Full paths of the workbooks, main files before updates.
 * @throws {Error} If `rawDir` cannot be read.
 */
function collectExcelFiles(rawDir = RAW_DIR) {
  // Workbooks directly inside one directory, sorted by name.
  const listWorkbooks = (dir) =>
    fs
      .readdirSync(dir)
      .filter(
        (name) =>
          // Accept ".XLSX" as well as ".xlsx".
          name.toLowerCase().endsWith(".xlsx") &&
          // "~$name.xlsx" is the lock file Excel leaves next to an open
          // workbook; it is not a readable workbook.
          !name.startsWith("~$"),
      )
      .sort()
      .map((name) => path.join(dir, name))
      // Skip directories that merely have an .xlsx-looking name.
      .filter((full) => fs.statSync(full).isFile());

  const updateDir = path.join(rawDir, "(update)");
  const updates = fs.existsSync(updateDir) ? listWorkbooks(updateDir) : [];

  return [...listWorkbooks(rawDir), ...updates];
}
|
|
|
|
/**
 * Parse a free-text score cell into an object of numeric score fields.
 *
 * Each pattern is tried against the text; when it matches, its first capture
 * group is converted with `parseFloat` and stored under the pattern's key.
 * Subjects that do not appear in the text are simply absent from the result.
 *
 * @param {*} diemThi - Score text; null/undefined are treated as empty text,
 *   any other value is converted with `String`.
 * @param {Object<string, RegExp>} [patterns=SCORE_PATTERNS] - Field name to
 *   regex whose first capture group is the number.
 * @returns {Object<string, number>} Finite scores keyed by field name.
 */
function parseScores(diemThi, patterns = SCORE_PATTERNS) {
  // Calling .match on null/undefined would throw; treat them as "no scores".
  const text = diemThi == null ? "" : String(diemThi);
  const scores = {};

  for (const [field, pattern] of Object.entries(patterns)) {
    const match = text.match(pattern);
    if (!match) continue;

    const value = Number.parseFloat(match[1]);
    // A capture such as "." parses to NaN; leave the field out instead of
    // handing NaN to the caller.
    if (Number.isFinite(value)) {
      scores[field] = value;
    }
  }

  return scores;
}
|
|
|
|
/**
 * Detect whether a sheet row is a header row rather than student data.
 *
 * A row counts as a header when it has at least three cells and its first
 * cell, ignoring case and surrounding whitespace, is one of the known header
 * labels. The cell and the labels are both normalized to Unicode NFC so that
 * a composed and a decomposed spelling of the Vietnamese label compare equal.
 *
 * @param {Array|undefined|null} row - One row as an array of cell values.
 * @returns {boolean} True if the row looks like a header.
 */
function isHeaderRow(row) {
  if (!row || row.length < 3) return false;

  const first = String(row[0] ?? "")
    .normalize("NFC")
    .trim()
    .toUpperCase();

  return ["HO_TEN", "HỌ TÊN", "STT"].some(
    (label) => label.normalize("NFC") === first,
  );
}
|
|
|
|
/**
 * Build the SQLite database from the raw Excel workbooks.
 *
 * Steps: recreate the database file at DB_PATH, create the `student` table
 * and its name index, import every workbook returned by collectExcelFiles()
 * inside a single transaction, VACUUM, and print a summary.
 *
 * Only the first sheet of each workbook is read. Columns are taken by
 * position: name, birth date, exam ID, score text. Rows are written with
 * INSERT OR REPLACE, so a later row with the same exam ID replaces an
 * earlier one.
 *
 * A workbook that cannot be read is reported and skipped. A row that fails
 * is counted, and the first few failures are logged. The database handle is
 * closed even if the import throws.
 */
function main() {
  // Ensure output directory exists
  fs.mkdirSync(path.dirname(DB_PATH), { recursive: true });

  // Remove old database if exists, so the build always starts empty
  if (fs.existsSync(DB_PATH)) fs.unlinkSync(DB_PATH);

  const db = new Database(DB_PATH);

  try {
    // Create table
    db.exec(`
      CREATE TABLE student (
        so_bao_danh TEXT PRIMARY KEY,
        ho_ten TEXT NOT NULL,
        ngay_sinh TEXT,
        toan REAL,
        ngu_van REAL,
        vat_ly REAL,
        hoa_hoc REAL,
        sinh_hoc REAL,
        khtn REAL,
        lich_su REAL,
        dia_ly REAL,
        gdcd REAL,
        khxh REAL,
        tieng_anh REAL
      );
      CREATE INDEX idx_ho_ten ON student(ho_ten);
    `);

    const insert = db.prepare(`
      INSERT OR REPLACE INTO student
        (so_bao_danh, ho_ten, ngay_sinh, toan, ngu_van, vat_ly, hoa_hoc, sinh_hoc, khtn, lich_su, dia_ly, gdcd, khxh, tieng_anh)
      VALUES
        (@so_bao_danh, @ho_ten, @ngay_sinh, @toan, @ngu_van, @vat_ly, @hoa_hoc, @sinh_hoc, @khtn, @lich_su, @dia_ly, @gdcd, @khxh, @tieng_anh)
    `);

    const files = collectExcelFiles();

    // Cap on individually logged row failures, to keep the log readable.
    const MAX_LOGGED_ROW_ERRORS = 20;
    let totalRows = 0;
    let errorCount = 0;

    // Wrap all inserts in a single transaction for speed
    const insertAll = db.transaction((workbooks) => {
      for (const file of workbooks) {
        const basename = path.basename(file);
        let fileRows = 0;

        try {
          const wb = XLSX.readFile(file);
          const ws = wb.Sheets[wb.SheetNames[0]];
          // header: 1 yields each row as an array of cell values.
          const rows = XLSX.utils.sheet_to_json(ws, { header: 1 });

          for (let i = 0; i < rows.length; i++) {
            const row = rows[i];

            // Nothing to read from a missing row.
            if (!Array.isArray(row)) continue;

            // Skip header rows
            if (i === 0 && isHeaderRow(row)) continue;

            try {
              const hoTen = String(row[0] || "").trim();
              const ngaySinh = String(row[1] || "").trim();
              const soBaoDanh = String(row[2] || "").trim();
              const diemThi = String(row[3] || "");

              // Skip rows with no exam ID or no name (e.g. blank rows).
              // Only emptiness is checked; the ID format is not validated.
              if (!soBaoDanh || !hoTen) continue;

              const scores = parseScores(diemThi);

              insert.run({
                so_bao_danh: soBaoDanh,
                ho_ten: hoTen,
                ngay_sinh: ngaySinh || null,
                toan: scores.toan ?? null,
                ngu_van: scores.ngu_van ?? null,
                vat_ly: scores.vat_ly ?? null,
                hoa_hoc: scores.hoa_hoc ?? null,
                sinh_hoc: scores.sinh_hoc ?? null,
                khtn: scores.khtn ?? null,
                lich_su: scores.lich_su ?? null,
                dia_ly: scores.dia_ly ?? null,
                gdcd: scores.gdcd ?? null,
                khxh: scores.khxh ?? null,
                tieng_anh: scores.tieng_anh ?? null,
              });
              fileRows++;
            } catch (err) {
              // One bad row must not abort the import, but say what went
              // wrong instead of only counting it.
              errorCount++;
              if (errorCount <= MAX_LOGGED_ROW_ERRORS) {
                console.error(`  ${basename} row ${i + 1}: ${err.message}`);
              } else if (errorCount === MAX_LOGGED_ROW_ERRORS + 1) {
                console.error("  (further row errors are counted but not logged)");
              }
            }
          }
        } catch (err) {
          console.error(`Failed to read ${basename}: ${err.message}`);
        }

        totalRows += fileRows;
        console.log(`  ${basename}: ${fileRows} rows`);
      }
    });

    console.log(`Processing ${files.length} Excel files...\n`);
    insertAll(files);

    // Compact the database
    db.exec("VACUUM");

    const count = db.prepare("SELECT COUNT(*) as cnt FROM student").get();
    console.log(`\nDone! ${count.cnt} students in database.`);
    // totalRows can exceed the student count when later rows replaced
    // earlier ones with the same exam ID.
    console.log(`Rows written (including replacements): ${totalRows}`);
    console.log(`Errors skipped: ${errorCount}`);
    console.log(`Output: ${DB_PATH}`);

    const stat = fs.statSync(DB_PATH);
    console.log(`Size: ${(stat.size / 1024 / 1024).toFixed(1)} MB`);
  } finally {
    // Release the file handle even when the import fails part-way.
    db.close();
  }
}

main();
|