mirror of
https://github.com/tiennm99/phow2sim.git
synced 2026-05-20 10:24:53 +00:00
54eaf95fc4
'Waiting for application startup.' was the last line visible for several minutes while the lifespan hook silently downloaded 1.2GB and parsed the text vectors — looks like a hang. - Print milestones for each load phase (cache hit / download / extract / parse / cache-write) with timings. - During download, print every ~50 MiB with running percent if the server sent Content-Length. - PYTHONUNBUFFERED=1 in Dockerfile so the prints flush to 'docker compose logs' in real time. Uses plain print (not logging) because uvicorn's default log config filters INFO on non-uvicorn loggers, and wrestling with that for six operator-facing status lines isn't worth the surface area.
210 lines
7.8 KiB
Python
210 lines
7.8 KiB
Python
"""PhoW2V model loader and similarity primitives.

PhoW2V (VinAI) ships as word2vec-text format (.txt), available in four
variants: word/syllable × 100/300 dims. Text format is slow to parse on
first boot (~30-60s for the 300d word model), so we cache a binary .bin
alongside the .txt after the first successful load — subsequent starts
use the fast binary path.

Tokenization matters. The "word" variant expects underscore-joined
compounds ("sinh_viên"); the "syllable" variant expects single syllables
("sinh", "viên"). Callers must normalize to match before querying.

Model source: PhoW2V's research license forbids public redistribution,
so MODEL_URL is expected to point at a mirror the operator controls
(e.g. a Nextcloud share with an unguessable token in the URL, a signed
cloud-storage URL, or any HTTP(S) endpoint that serves the zip with a
GET). The service does a plain GET — any auth must be baked into the
URL itself.
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import os
|
||
import random as _random
|
||
import sys
|
||
import time
|
||
import unicodedata
|
||
import urllib.request
|
||
import zipfile
|
||
from pathlib import Path
|
||
from typing import Optional
|
||
|
||
from gensim.models import KeyedVectors
|
||
|
||
# Process-wide singleton; stays None until load_model() has populated it.
_MODEL: Optional[KeyedVectors] = None

# Streaming chunk size of 1 MiB; keeps peak RAM flat for ~1GB downloads.
_DOWNLOAD_CHUNK = 1024 * 1024

# Print a progress line every ~50 MiB so the operator knows it's alive.
_LOG_EVERY_MB = 50
|
||
|
||
|
||
def _log(msg: str) -> None:
    """Write one operator-facing status line to stdout and flush it at once.

    A plain ``print`` is used rather than ``logging``: Uvicorn doesn't surface
    our logger at INFO by default, and the lifespan hook runs before any log
    config the operator might add. Flushing on every call makes progress
    visible in 'docker compose logs' in real time.

    Args:
        msg: Text of the status line; it is emitted with a ``[phow2sim]`` prefix.
    """
    line = f"[phow2sim] {msg}"
    print(line, file=sys.stdout, flush=True)
|
||
|
||
|
||
def _download_and_extract(url: str, target_txt: Path) -> None:
    """Fetch a PhoW2V zip (streamed) and extract its .txt into target_txt.

    The zip is streamed to ``<target>.zip`` in ``_DOWNLOAD_CHUNK`` pieces, with
    a progress line roughly every ``_LOG_EVERY_MB`` MiB. The vectors are then
    extracted to ``<target>.txt.part`` and moved onto ``target_txt`` with
    ``os.replace`` only once extraction has fully succeeded, so a crash or a
    corrupt archive never leaves a truncated ``target_txt`` behind for the
    next start to pick up. The zip and any leftover ``.part`` file are removed
    on both success and failure.

    When the archive holds several ``.txt`` members (e.g. a LICENSE.txt next
    to the vectors) the largest one is taken; with a single member this is
    the same file the first-match rule would have picked.

    Args:
        url: HTTP(S) (or any urllib-supported) URL serving the zip via GET.
        target_txt: Destination path of the extracted text vectors; parent
            directories are created if missing.

    Raises:
        RuntimeError: The archive contains no ``.txt`` member.
        OSError: Download or filesystem failure (``urllib.error.URLError`` is
            an ``OSError`` subclass).
        zipfile.BadZipFile: The downloaded payload is not a valid zip.
    """
    target_txt.parent.mkdir(parents=True, exist_ok=True)
    zip_path = target_txt.with_suffix(".zip")
    part_path = target_txt.with_name(target_txt.name + ".part")
    mib = 1 << 20

    try:
        _log(f"downloading model zip from {url}")
        t0 = time.monotonic()
        with urllib.request.urlopen(url) as resp, open(zip_path, "wb") as dst:
            # Content-Length is optional; 0 means "unknown" and drops the percent.
            total = int(resp.headers.get("Content-Length") or 0)
            total_mb = total / mib if total else 0.0
            downloaded = 0
            log_step = _LOG_EVERY_MB << 20
            next_log = log_step
            while chunk := resp.read(_DOWNLOAD_CHUNK):
                dst.write(chunk)
                downloaded += len(chunk)
                if downloaded >= next_log:
                    mb = downloaded / mib
                    if total_mb:
                        _log(f" downloaded {mb:.0f} / {total_mb:.0f} MiB ({downloaded * 100.0 / total:.0f}%)")
                    else:
                        _log(f" downloaded {mb:.0f} MiB")
                    next_log += log_step
        _log(f"download complete in {time.monotonic() - t0:.1f}s ({downloaded / (1 << 20):.0f} MiB)")

        _log(f"extracting .txt from {zip_path.name}")
        with zipfile.ZipFile(zip_path) as zf:
            txt_members = [info for info in zf.infolist() if info.filename.endswith(".txt")]
            if not txt_members:
                raise RuntimeError(f"no .txt file inside {url}")
            # The vectors dwarf any README/LICENSE text shipped alongside them.
            member = max(txt_members, key=lambda info: info.file_size)
            # Flatten into target_txt regardless of archive's internal layout.
            with zf.open(member) as src, open(part_path, "wb") as dst:
                while chunk := src.read(_DOWNLOAD_CHUNK):
                    dst.write(chunk)
        # Publish atomically: target_txt either does not exist or is complete.
        os.replace(part_path, target_txt)
    finally:
        zip_path.unlink(missing_ok=True)
        part_path.unlink(missing_ok=True)
    _log(f"extracted to {target_txt}")
|
||
|
||
|
||
def load_model() -> KeyedVectors:
    """Return the singleton KeyedVectors, loading (and downloading) on first call.

    Resolution order:

    1. the in-process singleton, if a previous call already loaded it;
    2. the binary cache (``MODEL_PATH`` with its suffix swapped to ``.bin``),
       if that file exists;
    3. the text vectors at ``MODEL_PATH``, fetched from ``MODEL_URL`` first
       when the file is missing.

    After a text parse the vectors are saved as a binary cache next to the
    .txt. The cache is written to a temporary ``.bin.tmp`` file and moved into
    place with ``os.replace`` only after the save succeeded, so an interrupted
    or failed write (e.g. disk full) can never leave a truncated ``.bin`` that
    the next start would try to load. A failed cache write is only a warning.

    Returns:
        The loaded (and now cached in-process) KeyedVectors.

    Raises:
        KeyError: ``MODEL_PATH`` is not set in the environment.
        FileNotFoundError: No vectors at ``MODEL_PATH`` and ``MODEL_URL`` is
            unset or blank.
    """
    global _MODEL
    if _MODEL is not None:
        return _MODEL

    txt_path = Path(os.environ["MODEL_PATH"])
    bin_cache = txt_path.with_suffix(".bin")

    # Prefer the cached binary form for ~5x faster cold start.
    if bin_cache.exists():
        _log(f"loading cached binary vectors from {bin_cache}")
        t0 = time.monotonic()
        _MODEL = KeyedVectors.load_word2vec_format(str(bin_cache), binary=True)
        _log(f"loaded {len(_MODEL)} keys in {time.monotonic() - t0:.1f}s")
        return _MODEL

    if not txt_path.exists():
        url = os.environ.get("MODEL_URL", "").strip()
        if not url:
            raise FileNotFoundError(
                f"no vectors at MODEL_PATH={txt_path}; set MODEL_URL in .env "
                f"(or mount a local .txt into {txt_path.parent}) and retry"
            )
        _download_and_extract(url, txt_path)

    _log(f"parsing text-format vectors from {txt_path} (typically ~60s for word-300d)")
    t0 = time.monotonic()
    _MODEL = KeyedVectors.load_word2vec_format(str(txt_path), binary=False)
    _log(f"parsed {len(_MODEL)} keys in {time.monotonic() - t0:.1f}s")

    # Persist the fast-load cache next to the source .txt. Write to a sibling
    # temp file first so bin_cache only ever appears fully written.
    tmp_cache = bin_cache.with_name(bin_cache.name + ".tmp")
    try:
        _log(f"writing binary cache to {bin_cache} for faster future starts")
        _MODEL.save_word2vec_format(str(tmp_cache), binary=True)
        os.replace(tmp_cache, bin_cache)
    except OSError as e:
        _log(f"warning: could not write binary cache ({e}); will re-parse .txt next time")
        try:
            tmp_cache.unlink(missing_ok=True)
        except OSError:
            # Best-effort cleanup; a stray .tmp is never read by the loader.
            pass
    return _MODEL
|
||
|
||
|
||
def _variant_candidates(word: str) -> list[str]:
    """List the spellings of `word` worth looking up, most specific first.

    The order is: stripped input, its lowercase form, the stripped input with
    spaces turned into underscores, and the lowercase underscore form.
    PhoW2V-word uses underscores for compounds while PhoW2V-syllable has no
    multi-token entries, so offering both spellings serves either model
    without the caller branching.

    Returns:
        The distinct, non-empty candidates in the order above (empty list for
        blank input).
    """
    base = word.strip()
    folded = base.lower()
    forms = (
        base,
        folded,
        base.replace(" ", "_"),
        folded.replace(" ", "_"),
    )
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return [form for form in dict.fromkeys(forms) if form]
|
||
|
||
|
||
def canonicalize(kv: KeyedVectors, word: str) -> Optional[str]:
    """Map `word` onto the spelling under which it appears in `kv`.

    Candidates come from `_variant_candidates` (exact → lower →
    space→underscore) and the first one that is a key of `kv` wins.

    Returns:
        The matching in-vocab key, or None when no candidate is in `kv`.
    """
    matches = (form for form in _variant_candidates(word) if form in kv)
    return next(matches, None)
|
||
|
||
|
||
def similarity(kv: KeyedVectors, a: str, b: str) -> float:
    """Return the cosine similarity of keys `a` and `b` as a plain float.

    Both keys are passed to `kv.similarity` unchanged, so the caller must
    canonicalize them to in-vocab forms first.
    """
    score = kv.similarity(a, b)
    return float(score)
|
||
|
||
|
||
def neighbors(kv: KeyedVectors, word: str, topn: int) -> list[tuple[str, float]]:
    """Return the `topn` nearest keys to `word` as (key, cosine score) pairs.

    `word` is handed to `kv.most_similar` unchanged, so the caller must
    canonicalize it first. Scores are converted to plain floats.
    """
    ranked = kv.most_similar(word, topn=topn)
    return [(key, float(score)) for key, score in ranked]
|
||
|
||
|
||
def _is_vietnamese_wordlike(word: str) -> bool:
    """Reject digits and punctuation; accept Latin letters, Vietnamese diacritics, `_`.

    A character passes when it is the compound separator `_`, a combining
    mark (category Mn, i.e. diacritics on decomposed input), or a letter whose
    Unicode name starts with "LATIN " — which covers every precomposed
    Vietnamese letter (e.g. đ, ư, ệ). Letters of other scripts (Cyrillic,
    CJK, ...) are rejected; a category check alone would let them through.

    Returns:
        True when every character passes (vacuously True for the empty
        string), False as soon as one does not.
    """
    for ch in word:
        if ch == "_":
            continue
        cat = unicodedata.category(ch)
        # Mn = combining marks (diacritics on decomposed input).
        if cat == "Mn":
            continue
        if cat not in ("Ll", "Lu", "Lo", "Lt"):
            return False
        # Category says "letter" but not which script; the name does.
        if not unicodedata.name(ch, "").startswith("LATIN "):
            return False
    return True
|
||
|
||
|
||
def random_word(
    kv: KeyedVectors,
    *,
    min_rank: int = 0,
    max_rank: Optional[int] = None,
    alpha_only: bool = True,
    min_len: int = 1,
    max_len: int = 64,
    max_attempts: int = 1000,
) -> Optional[str]:
    """Return a random vocab key matching filters, or None within attempt budget.

    `index_to_key` is frequency-ordered for word2vec-text files, so rank bounds
    behave as a frequency window. `alpha_only=True` keeps only keys accepted
    by `_is_vietnamese_wordlike`.

    Sampling is by rejection: up to `max_attempts` uniform draws are made from
    the rank window and the first key that passes the filters is returned.

    Args:
        kv: Vectors whose `index_to_key` list is sampled.
        min_rank: Inclusive lower rank bound. Negative values are clamped to 0
            so they cannot turn into negative list indices (which would
            sample from the tail of the vocabulary or raise IndexError).
        max_rank: Exclusive upper rank bound; None or anything beyond the
            vocabulary size means "up to the end".
        alpha_only: Apply the `_is_vietnamese_wordlike` filter.
        min_len: Minimum key length in characters (inclusive).
        max_len: Maximum key length in characters (inclusive).
        max_attempts: Number of draws before giving up.

    Returns:
        A matching key, or None when the window is empty or no draw matched.
    """
    vocab = kv.index_to_key
    upper = min(max_rank, len(vocab)) if max_rank is not None else len(vocab)
    lower = max(min_rank, 0)
    if lower >= upper:
        return None
    for _ in range(max_attempts):
        word = vocab[_random.randrange(lower, upper)]
        if not (min_len <= len(word) <= max_len):
            continue
        if alpha_only and not _is_vietnamese_wordlike(word):
            continue
        return word
    return None
|