mirror of
https://github.com/tiennm99/phow2sim.git
synced 2026-05-31 02:20:39 +00:00
8dd17acd4f
Tiny FastAPI service over PhoW2V Vietnamese word vectors. Mirrors word2sim's endpoint shapes (/similarity /neighbors /vocab /random) so clients can swap URLs without code changes. - Auto-downloads VinAI's PhoW2V on first boot, caches binary .bin for ~5x faster restarts - Viet-aware canonicalizer: exact -> lowercase -> space-to-underscore - Supports both word (compound) and syllable variants via env - Unicode-aware random-word filter accepts diacritics, rejects digits/punct
27 lines
786 B
Docker
27 lines
786 B
Docker
# Image for the phow2sim FastAPI service, served by uvicorn (app.main:app).
FROM python:3.11-slim

WORKDIR /app

# curl is required by the HEALTHCHECK below. The apt package lists are removed
# in the same layer so they never end up in the image.
# NOTE(review): unzip is presumably used to unpack the downloaded model
# archive at runtime -- confirm against the app before dropping it.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        unzip \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies before copying the application code so this
# layer stays cached when only files under app/ change.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY app ./app

# Defaults point at PhoW2V word-300d from VinAI's public mirror.
# Override MODEL_URL/MODEL_PATH to switch variants (syllables, 100d).
# PORT is read at container start by both the HEALTHCHECK and the CMD below.
ENV MODEL_URL=https://public.vinai.io/word2vec_vi_words_300dims.zip \
    MODEL_PATH=/data/phow2v/word2vec_vi_words_300dims.txt \
    MODEL_VARIANT=word \
    PORT=8000

# Documents the default port only; it does not publish anything by itself.
EXPOSE 8000

# NOTE(review): the image runs as root. Switching to a non-root USER requires
# the MODEL_PATH directory (/data/phow2v by default) to be writable by that
# user, including on volumes that already exist -- verify before changing.

# The start period is long (10 min) to leave room for start-up work before
# failed probes count against the container.
# NOTE(review): presumably sized for the first-boot model download -- confirm.
# Shell form is used on purpose so ${PORT} is expanded when the probe runs.
HEALTHCHECK --interval=30s --timeout=5s --start-period=600s --retries=3 \
    CMD curl -fsS "http://localhost:${PORT:-8000}/health" || exit 1

# Exec-form CMD does no variable expansion, so a shell is used to resolve
# ${PORT}; `exec` replaces that shell so uvicorn is PID 1 and receives SIGTERM.
CMD ["sh", "-c", "exec uvicorn app.main:app --host 0.0.0.0 --port \"${PORT:-8000}\""]