Files
tiennm99 2e3e61dcbb feat: initial scaffold for word2sim similarity API
Stateless FastAPI service exposing word2vec cosine similarity,
nearest neighbors, vocab lookup, and random-word picker.
Dockerized with gensim GoogleNews pretrained model support.
2026-04-22 21:18:02 +07:00

139 lines
3.8 KiB
Python

"""FastAPI entry point for word2sim.
Exposes pure read-only endpoints over a word2vec model: pairwise similarity,
nearest neighbors, vocab lookup, and a random-word picker, plus a health check.
Stateless — no sessions, no persistence.
"""
from __future__ import annotations
from contextlib import asynccontextmanager
from fastapi import FastAPI, HTTPException, Query
from pydantic import BaseModel
from app.vectors import canonicalize, load_model, neighbors, random_word, similarity
@asynccontextmanager
async def lifespan(_app: FastAPI):
    """Warm the word2vec model as part of application startup.

    ``load_model()`` is called before the ``yield`` so the load happens
    before the app accepts traffic (GoogleNews cold start is ~30s). Its
    return value is discarded here. Nothing follows the ``yield``, so no
    shutdown work is performed.
    """
    load_model()
    yield
# ASGI application object; `lifespan` is registered so the model load runs
# at startup.
app = FastAPI(
    title="word2sim",
    version="0.1.0",
    lifespan=lifespan,
)
class SimilarityResponse(BaseModel):
    # Response body of GET /similarity.
    # (Kept as comments, not a docstring, so the generated schema is unchanged.)

    # Query words as supplied by the caller.
    a: str
    b: str

    # Result of canonicalize() for each word; None when it found no match.
    canonical_a: str | None
    canonical_b: str | None

    # True exactly when the corresponding canonical form is not None.
    in_vocab_a: bool
    in_vocab_b: bool

    similarity: float | None  # null iff either side is out-of-vocab
class NeighborEntry(BaseModel):
    # One (word, score) pair in a GET /neighbors result list.

    word: str  # the neighboring word
    similarity: float  # score paired with that word by neighbors()
class NeighborsResponse(BaseModel):
    # Response body of GET /neighbors.

    word: str  # query word as supplied by the caller
    canonical: str | None  # result of canonicalize(); None when no match
    in_vocab: bool  # True exactly when `canonical` is not None
    neighbors: list[NeighborEntry]  # empty when the word is out-of-vocab
class VocabResponse(BaseModel):
    # Response body of GET /vocab.

    word: str  # query word as supplied by the caller
    canonical: str | None  # result of canonicalize(); None when no match
    in_vocab: bool  # True exactly when `canonical` is not None
class RandomWordResponse(BaseModel):
    # Response body of GET /random.

    word: str  # the word that was picked
    rank: int  # position in frequency-sorted vocab (0 = most frequent)
@app.get("/health")
def health() -> dict[str, str]:
    # Health probe: returns a constant payload and never calls load_model().
    payload = {"status": "ok"}
    return payload
@app.get("/similarity", response_model=SimilarityResponse)
def get_similarity(
    a: str = Query(min_length=1, max_length=64),
    b: str = Query(min_length=1, max_length=64),
) -> SimilarityResponse:
    # Score the similarity between two words.
    #
    # Parameters:
    #   a, b -- query words, each validated to 1..64 characters.
    # Returns:
    #   SimilarityResponse echoing both inputs with their canonical forms and
    #   in-vocab flags. `similarity` is None unless both words resolved.
    #
    # (Comments rather than a docstring so the OpenAPI description is unchanged.)
    kv = load_model()
    canonical_a = canonicalize(kv, a)
    canonical_b = canonicalize(kv, b)

    # Derive the flags once and reuse them for both the scoring decision and
    # the response. The previous truthiness test (`if canonical_a and
    # canonical_b`) could disagree with the `is not None` flags for a falsy,
    # non-None canonical form, reporting in_vocab=True with similarity=None.
    in_vocab_a = canonical_a is not None
    in_vocab_b = canonical_b is not None

    score = (
        similarity(kv, canonical_a, canonical_b)
        if in_vocab_a and in_vocab_b
        else None
    )
    return SimilarityResponse(
        a=a,
        b=b,
        canonical_a=canonical_a,
        canonical_b=canonical_b,
        in_vocab_a=in_vocab_a,
        in_vocab_b=in_vocab_b,
        similarity=score,
    )
@app.get("/neighbors", response_model=NeighborsResponse)
def get_neighbors(
    word: str = Query(min_length=1, max_length=64),
    topn: int = Query(default=10, ge=1, le=1000),
) -> NeighborsResponse:
    # List the nearest neighbors of a word.
    #
    # Parameters:
    #   word -- query word, validated to 1..64 characters.
    #   topn -- number of neighbors requested from neighbors(), 1..1000
    #           (default 10).
    # Returns:
    #   NeighborsResponse; when the word does not canonicalize, the list is
    #   empty and neighbors() is never called.
    kv = load_model()
    canonical = canonicalize(kv, word)
    found = canonical is not None

    entries = (
        [
            NeighborEntry(word=neighbor, similarity=score)
            for neighbor, score in neighbors(kv, canonical, topn)
        ]
        if found
        else []
    )
    return NeighborsResponse(
        word=word,
        canonical=canonical,
        in_vocab=found,
        neighbors=entries,
    )
@app.get("/vocab", response_model=VocabResponse)
def get_vocab(word: str = Query(min_length=1, max_length=64)) -> VocabResponse:
    # Report whether a word resolves to a canonical form.
    #
    # Parameters:
    #   word -- query word, validated to 1..64 characters.
    # Returns:
    #   VocabResponse with the canonical form (or None) and a matching flag.
    kv = load_model()
    canonical = canonicalize(kv, word)
    in_vocab = canonical is not None
    return VocabResponse(word=word, canonical=canonical, in_vocab=in_vocab)
@app.get("/random", response_model=RandomWordResponse)
def get_random(
    min_rank: int = Query(default=100, ge=0),
    max_rank: int = Query(default=50000, ge=1),
    alpha_only: bool = Query(default=True),
    min_len: int = Query(default=3, ge=1, le=64),
    max_len: int = Query(default=12, ge=1, le=64),
) -> RandomWordResponse:
    # Pick a random word subject to rank and length filters.
    #
    # Parameters:
    #   min_rank, max_rank -- rank bounds passed to random_word(); min_rank
    #                         must be strictly below max_rank.
    #   alpha_only         -- passed through to random_word() unchanged.
    #   min_len, max_len   -- length bounds (1..64); min_len may equal max_len.
    # Returns:
    #   RandomWordResponse with the word and its index in kv.key_to_index.
    # Raises:
    #   HTTPException 400 -- inconsistent rank or length bounds.
    #   HTTPException 503 -- random_word() returned None.

    # Cross-field validation runs before load_model() is called.
    if max_rank <= min_rank:
        raise HTTPException(status_code=400, detail="min_rank must be < max_rank")
    if max_len < min_len:
        raise HTTPException(status_code=400, detail="min_len must be <= max_len")

    filters = {
        "min_rank": min_rank,
        "max_rank": max_rank,
        "alpha_only": alpha_only,
        "min_len": min_len,
        "max_len": max_len,
    }
    kv = load_model()
    word = random_word(kv, **filters)

    if word is not None:
        return RandomWordResponse(word=word, rank=kv.key_to_index[word])
    raise HTTPException(
        status_code=503,
        detail="no word matched filter; loosen the constraints",
    )