lpetrl committed on
Commit
94e8fb8
1 Parent(s): 264a09d

feat(API): Implemented basic functionality.

Browse files
Dockerfile ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base image: official CPython 3.11.
FROM python:3.11

# Copy the dependency manifest on its own, ahead of the application files,
# so the install step below does not depend on the source tree.
COPY requirements.txt ./requirements.txt

# Upgrade pip, install the pinned dependencies, then purge pip's cache
# inside the same RUN step so the cache is not kept in the resulting layer.
RUN python -m pip install -U pip && \
    python -m pip install -r requirements.txt && \
    python -m pip cache purge

# Application payload: vocabulary file, prebuilt LanceDB table, API code.
COPY ./data /app/data
COPY ./database /app/database
COPY ./src /app/src

# The settings module uses the relative paths "data/..." and "database",
# so the process has to start from /app.
WORKDIR /app

# Serve the FastAPI application with uvicorn on all interfaces, port 7860.
CMD ["uvicorn", "src.app:app", "--host", "0.0.0.0", "--port", "7860"]
data/ukrainian_nouns.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ лікар
2
+ програміст
3
+ пілот
4
+ літак
5
+ висота
6
+ тиск
7
+ барометр
8
+ вітер
9
+ медицина
10
+ комп'ютер
11
+ рука
12
+ око
13
+ ніс
14
+ книга
15
+ папір
16
+ олівець
17
+ Франція
18
+ Париж
19
+ Германія
database/MiniLM-L12-v.lance/_latest.manifest ADDED
Binary file (497 Bytes). View file
 
database/MiniLM-L12-v.lance/_transactions/0-5e898e5f-189c-473e-892d-8b5947b6a369.txn ADDED
@@ -0,0 +1 @@
 
 
1
+ $5e898e5f-189c-473e-892d-8b5947b6a369�Uword ���������*string084vector ���������*fixed_size_list:float:38408
database/MiniLM-L12-v.lance/_transactions/1-9077628d-c42c-4413-8bd1-cc31ea726bce.txn ADDED
Binary file (97 Bytes). View file
 
database/MiniLM-L12-v.lance/_versions/1.manifest ADDED
Binary file (443 Bytes). View file
 
database/MiniLM-L12-v.lance/_versions/2.manifest ADDED
Binary file (497 Bytes). View file
 
database/MiniLM-L12-v.lance/data/a1b860b1-c7a3-4314-9c51-38ad78b5de8b.lance ADDED
Binary file (30.2 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ duckdb==0.10.1
2
+ fastapi==0.110.0
3
+ pandas==2.2.1
4
+ lancedb==0.6.4
5
+ sentence-transformers==2.5.1
6
+ uvicorn==0.29.0
src/.DS_Store ADDED
Binary file (6.15 kB). View file
 
src/__pycache__/api_models.cpython-311.pyc ADDED
Binary file (2.32 kB). View file
 
src/__pycache__/app.cpython-311.pyc ADDED
Binary file (600 Bytes). View file
 
src/__pycache__/handlers.cpython-311.pyc ADDED
Binary file (3.7 kB). View file
 
src/__pycache__/setting.cpython-311.pyc ADDED
Binary file (1.33 kB). View file
 
src/__pycache__/vector_db.cpython-311.pyc ADDED
Binary file (2.91 kB). View file
 
src/api_models.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field
2
+
3
+
4
class ResponseModel(BaseModel):
    """Generic response envelope: a text message, a payload and a numeric code."""

    # Human-readable message.
    message: str
    # Arbitrary payload; no schema is enforced beyond "is a dict".
    data: dict
    # Numeric code accompanying the message.
    code: int
8
+
9
+
10
class ResponseGuessWord(BaseModel):
    """Response body carrying a single vocabulary word."""

    word: str
12
+
13
+
14
class RequestSemanticCalculation(BaseModel):
    """Input of the semantic calculation: the target word and the user's guess."""

    supposed_word: str = Field(
        description="The word that the user is trying to guess",
        example="ніс",
    )
    guessed_word: str = Field(
        description="The word that the user guessed",
        example="око",
    )
23
+
24
+
25
class SemanticCalculation(BaseModel):
    """Numeric outcome of comparing a guessed word with the target word."""

    # Distance value reported for the comparison.
    score: float
    # Integer rank derived from that distance.
    rating: int
    # The rank expressed as a percentage.
    percentage: float
    # Vocabulary word reported as the closest one.
    closest_word: str
30
+
31
+
32
class ResponseSemanticCalculation(BaseModel):
    """Response of the semantic calculation endpoint."""

    # Whether the submitted word was recognised.
    word_exist: bool
    # Calculation details, or None when there is nothing to report.
    metadata: SemanticCalculation | None
35
+
36
+
37
class ResponseMessage(BaseModel):
    """Minimal response body holding only a text message."""

    message: str
src/app.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+
3
+ from src.handlers import router
4
+
5
+
6
def get_application() -> FastAPI:
    """Create the FastAPI instance and attach the API router to it."""
    api = FastAPI()
    api.include_router(router)
    return api


# Module-level ASGI application object (served as ``src.app:app``).
app = get_application()
src/create-embedding.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import pandas as pd
4
+ import lancedb
5
+ from lancedb.embeddings import with_embeddings
6
+ from sentence_transformers import SentenceTransformer
7
+
8
+ from setting import CFG, AVAILABLE_WORDS
9
+
10
+
11
# Build the embedding table for the vocabulary and store it in LanceDB.

# One row per vocabulary word; the "word" column is the text that gets embedded.
df = pd.DataFrame(AVAILABLE_WORDS, columns=['word'])

model = SentenceTransformer(CFG.model.name)

# Adds the embedding of each word to the frame, computed with the model.
data = with_embeddings(
    func=model.encode,
    data=df, column="word", show_progress=True
)

# The settings expose the storage directory as ``CFG.db.folder_path``.
# The previously used ``lance_db_folder_path`` attribute is not defined
# there, so reading it raised AttributeError before anything was written.
db_folder = CFG.db.folder_path

# exist_ok avoids the check-then-create race of a separate exists() test.
os.makedirs(db_folder, exist_ok=True)

db = lancedb.connect(db_folder)
table = db.create_table(CFG.db.table_name, data)
print("Table created")
src/handlers.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+
3
+ from fastapi import APIRouter, Depends
4
+ from fastapi.responses import JSONResponse
5
+
6
+ from src.api_models import (
7
+ ResponseGuessWord, ResponseSemanticCalculation,
8
+ RequestSemanticCalculation, ResponseMessage,
9
+ SemanticCalculation
10
+ )
11
+ from src.setting import AVAILABLE_WORDS, CFG
12
+ from src.vector_db import VectorDatabaseHandler
13
+
14
# Router that collects every endpoint declared in this module.
router = APIRouter()

# Extra response documentation shared by all endpoints.
DEFAULT_RESPONSES = {
    500: dict(description="Internal Server Error", model=ResponseMessage),
}
19
+
20
+
21
@router.get(
    "/v1/service/status",
    tags=["Service Status"],
    description="Description: The endpoint is used to check the service status.",
    responses={**DEFAULT_RESPONSES},
    response_model=ResponseMessage,
)
async def status() -> ResponseMessage:
    """Health endpoint: always answers with a fixed success message."""
    ok_reply = ResponseMessage(message="Success.")
    return ok_reply
31
+
32
+
33
@router.get(
    "/v1/service/get_guess_word",
    response_model=ResponseGuessWord,
    responses={**DEFAULT_RESPONSES},
    description="Description: The endpoint is used to get a random word from the list of available words.",
    tags=["Get Word"]
)
async def get_guess_word() -> ResponseGuessWord:
    """Return one uniformly chosen word from the loaded vocabulary.

    Returns:
        ``ResponseGuessWord`` with the chosen word, or a 500 ``JSONResponse``
        with a ``message`` field when the vocabulary is empty.
    """
    try:
        # random.choice is the direct way to draw a single element; it raises
        # IndexError on an empty sequence, which is the only expected failure.
        guess_word = random.choice(AVAILABLE_WORDS)
    except IndexError as e:
        return JSONResponse(status_code=500, content={"message": str(e)})
    return ResponseGuessWord(word=guess_word)
46
+
47
+
48
@router.get(
    "/v1/service/semantic_calculation",
    response_model=ResponseSemanticCalculation,
    responses={**DEFAULT_RESPONSES},
    # Implicit concatenation instead of a backslash continuation, which
    # embedded the next line's indentation in the published description.
    description=(
        "Description: The endpoint is used to calculate the semantic similarity "
        "between the guessed word and the supposed word."
    ),
    tags=["Semantic Analysis"]
)
async def semantic_calculation(
        request: RequestSemanticCalculation = Depends(RequestSemanticCalculation)
) -> ResponseSemanticCalculation:
    """Compare the user's guess with the target word.

    Args:
        request: The target word (``supposed_word``) and the user's guess
            (``guessed_word``).

    Returns:
        ``ResponseSemanticCalculation`` with ``word_exist=False`` and no
        metadata when either word is outside the vocabulary; otherwise
        ``word_exist=True`` with the calculation result. A 500
        ``JSONResponse`` with a ``message`` field is returned when the
        vector database cannot be opened or queried.
    """
    supposed_word = request.supposed_word
    guessed_word = request.guessed_word

    # Both words are looked up in the embedding table. An unknown guess
    # previously reached the database lookup and surfaced as a 500 error
    # instead of the "word does not exist" answer.
    if supposed_word not in AVAILABLE_WORDS or guessed_word not in AVAILABLE_WORDS:
        return ResponseSemanticCalculation(
            word_exist=False,
            metadata=None
        )

    try:
        # Opening the table can fail too, so it belongs inside the guarded
        # section together with the query itself.
        vector_db = VectorDatabaseHandler(
            db_path=CFG.db.folder_path,
            table_name=CFG.db.table_name,
            metrics_cfg=CFG.db.metrics
        )
        result = vector_db(guessed_word, supposed_word)
    except Exception as e:
        # Endpoint boundary: report the failure in the documented 500 shape.
        return JSONResponse(status_code=500, content={"message": str(e)})
    return ResponseSemanticCalculation(
        word_exist=True,
        metadata=SemanticCalculation(**result)
    )
src/setting.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from types import SimpleNamespace
2
+
3
# Similarity-search settings.
metrics_cfg = SimpleNamespace(**{
    "metric": "cosine",
    "threshold": 0.5,
})

# Vector-database settings: where the table is stored and what it is called.
db_cfg = SimpleNamespace(**{
    "db_name": "lancedb",
    "table_name": "MiniLM-L12-v",
    "folder_path": "database",
    "metrics": metrics_cfg,
})

# Embedding-model settings.
model_cfg = SimpleNamespace(**{
    "language": "ukr",
    "name": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    "task": "sentence-transformers",
})

# Root configuration object grouping the sections above.
CFG = SimpleNamespace(**{
    "vocab_path": "data/ukrainian_nouns.txt",
    "model": model_cfg,
    "db": db_cfg,
})
26
+
27
# Load the vocabulary once at import time, one word per line.
# The file holds Cyrillic text, so the encoding is stated explicitly rather
# than left to the platform default. Blank lines are skipped so that an
# empty string never becomes a vocabulary entry.
with open(CFG.vocab_path, "r", encoding="utf-8") as file:
    AVAILABLE_WORDS = [word for word in (line.strip() for line in file) if word]
src/vector_db.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import duckdb
2
+ import lancedb
3
+
4
+ from src.setting import AVAILABLE_WORDS
5
+
6
+
7
class VectorDatabaseHandler:
    """Rank a guessed word against a target word using a LanceDB embedding table.

    The table is opened on construction. Calling the instance searches the
    table around the guessed word's stored vector and reports where the
    target word falls in the resulting distance ordering.
    """

    QUERY_TEMPLATE = "SELECT word, vector FROM {table_name} WHERE word = '{user_word}'"

    def __init__(self, db_path: str, table_name: str, metrics_cfg: dict):
        """Connect to the database folder and open the embeddings table.

        Args:
            db_path: Folder of the LanceDB database.
            table_name: Name of the table to open.
            metrics_cfg: Search settings. NOTE(review): annotated as ``dict``
                but read by attribute (``metrics_cfg.metric``), so a
                namespace-like object is what actually works here.
        """
        db = lancedb.connect(db_path)

        self.metrics_cfg = metrics_cfg
        self.embeddings_tbl = db.open_table(table_name)

    def __call__(self, guessed_word: str, supposed_word: str) -> dict:
        """Compare ``guessed_word`` with ``supposed_word``.

        Args:
            guessed_word: Word submitted by the user; must be in the table.
            supposed_word: Target word; must be in the table.

        Returns:
            Dict with ``score``, ``rating``, ``percentage`` and
            ``closest_word`` (see ``_summarize``).

        Raises:
            IndexError: If either word is missing from the table.
        """
        # Kept as a local on purpose: get_word_vector() refers to it by the
        # name "arrow_table" inside the SQL text (presumably resolved through
        # DuckDB's lookup of Python variables - confirm before renaming).
        arrow_table = self.embeddings_tbl.to_arrow()  # noqa: F841
        word_embedding = self.get_word_vector(guessed_word, "arrow_table")

        df_emb = (
            self.embeddings_tbl.search(word_embedding)
            .metric(self.metrics_cfg.metric)
            .limit(len(AVAILABLE_WORDS))
            .to_df()
        )
        return self._summarize(df_emb, guessed_word, supposed_word)

    @staticmethod
    def _summarize(df_emb, guessed_word: str, supposed_word: str) -> dict:
        """Turn a distance-ordered search result into the response metrics.

        Args:
            df_emb: Frame with ``word`` and ``_distance`` columns, ordered by
                ascending distance from the guessed word.
            guessed_word: Word the search was centred on.
            supposed_word: Target word to locate in the result.

        Returns:
            ``score``: distance of the target word; ``rating``: number of rows
            strictly closer than the target; ``percentage``: ``100`` minus the
            rating as a share of all rows; ``closest_word``: first row that is
            not the guess, or the target itself when nothing is closer.

        Raises:
            IndexError: If ``supposed_word`` is not in ``df_emb``.
        """
        target_rows = df_emb[df_emb['word'] == supposed_word]
        if target_rows.empty:
            raise IndexError(f"Word {supposed_word!r} is not present in the search result")
        distance = target_rows.iloc[0]['_distance']

        # Plain Python numbers keep numpy scalars out of the response payload.
        words_between_count = int((df_emb['_distance'] < distance).sum())

        other_words = df_emb[df_emb['word'] != guessed_word]
        if words_between_count and not other_words.empty:
            closest_word = other_words.iloc[0]['word']
        else:
            closest_word = supposed_word

        return {
            "score": float(distance),
            "rating": words_between_count,
            "percentage": 100 - words_between_count / len(df_emb) * 100,
            "closest_word": closest_word
        }

    @classmethod
    def _build_query(cls, table_name: str, word: str) -> str:
        """Fill QUERY_TEMPLATE, escaping ``word`` as a SQL string literal."""
        # A single quote inside a SQL string literal is written as two quotes.
        # Without this, words such as "комп'ютер" break the statement and
        # arbitrary input could alter the query.
        escaped_word = word.replace("'", "''")
        return cls.QUERY_TEMPLATE.format(table_name=table_name, user_word=escaped_word)

    def get_word_vector(self, word: str, table_name: str):
        """Fetch the stored embedding of ``word``.

        Args:
            word: Word to look up.
            table_name: Name used in the ``FROM`` clause of the query.

        Returns:
            The ``vector`` value of the first matching row.

        Raises:
            IndexError: If no row matches ``word``.
        """
        matches = duckdb.query(self._build_query(table_name, word)).to_df()
        if matches.empty:
            raise IndexError(f"Word {word!r} is not present in table {table_name!r}")
        return matches["vector"].values[0]