Delete utils.py
Browse files
utils.py
DELETED
|
@@ -1,42 +0,0 @@
|
|
| 1 |
-
import openai
|
| 2 |
-
import numpy as np
|
| 3 |
-
import re
|
| 4 |
-
from typing import List, Tuple
|
| 5 |
-
from config import EMBED_MODEL
|
| 6 |
-
|
| 7 |
-
def get_embedding(text: str) -> List[float]:
    """Return the embedding vector for ``text``.

    Newlines are replaced with spaces and surrounding whitespace is stripped
    before the text is sent, as a single-element batch, to
    ``openai.embeddings.create`` using the ``EMBED_MODEL`` model.

    Args:
        text: The text to embed.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    flattened = text.replace("\n", " ")
    payload = [flattened.strip()]
    result = openai.embeddings.create(input=payload, model=EMBED_MODEL)
    # Only one input was sent, so the first data item is the one we want.
    first_item = result.data[0]
    return first_item.embedding
|
| 12 |
-
|
| 13 |
-
def cosine_similarity(a: List[float], b: List[float]) -> float:
    """Calculate cosine similarity between two vectors.

    Args:
        a: First vector.
        b: Second vector; must be compatible with ``a`` for ``np.dot``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))`` as a built-in ``float``, or
        ``0.0`` when either vector has zero norm (which includes empty
        vectors), so the division is never attempted with a zero denominator.
    """
    vec_a = np.asarray(a, dtype=float)
    vec_b = np.asarray(b, dtype=float)

    # Compute each norm once and reuse it for both the guard and the division.
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)
    if norm_a == 0 or norm_b == 0:
        return 0.0

    # Convert the NumPy scalar so callers get the annotated built-in float.
    return float(np.dot(vec_a, vec_b) / (norm_a * norm_b))
|
| 20 |
-
|
| 21 |
-
# Hour (1-2 digits), then optional minutes, then AM/PM.
# Minutes are either ":" followed by up to two digits, or exactly two digits
# with no colon. Requiring two digits in the colon-less form stops the hour
# from greedily swallowing a minute digit (e.g. "930pm" -> 9:30, not 93:0).
_CLEAN_TIME_RE = re.compile(
    r'(?P<hour>\d{1,2})'
    r'(?::(?P<colon_min>\d{0,2})|(?P<bare_min>\d{2}))?'
    r'\s*(?P<ampm>AM|PM)',
    re.IGNORECASE,
)


def clean_time(time_str: str) -> str:
    """Clean up time string.

    Searches ``time_str`` for the first ``H[:MM] AM/PM``-like fragment
    (case-insensitive, colon and whitespace optional) and normalizes it to
    ``"<hour>:<minute> <AM|PM>"``. Missing minutes become ``"00"``.

    Args:
        time_str: Raw time text; may be empty or ``None``.

    Returns:
        ``""`` for falsy input, the normalized time when a fragment is found,
        otherwise the input with surrounding whitespace stripped.
    """
    if not time_str:
        return ""

    found = _CLEAN_TIME_RE.search(time_str)
    if found is None:
        return time_str.strip()

    # At most one of the two minute groups is set; an empty match counts as
    # "no minutes given".
    minute = found.group("colon_min") or found.group("bare_min") or "00"
    return f"{found.group('hour')}:{minute} {found.group('ampm').upper()}"
|
| 34 |
-
|
| 35 |
-
def find_top_k_matches(user_embedding, dataset, k=3):
    """Find top k matching entries from a dataset.

    Args:
        user_embedding: Query vector passed to ``cosine_similarity``.
        dataset: Iterable of ``(entry_id, text, embedding)`` triples.
        k: Maximum number of results. ``None`` returns every entry; zero or
            a negative value returns an empty list.

    Returns:
        A list of ``(score, entry_id, text)`` tuples ordered by descending
        score. Entries with equal scores keep their order from ``dataset``.
    """
    # A negative k would otherwise slice entries off the tail (scored[:-1])
    # instead of returning nothing.
    if k is not None and k <= 0:
        return []

    scored = [
        (cosine_similarity(user_embedding, emb), entry_id, text)
        for entry_id, text, emb in dataset
    ]
    # Sort on the score alone: comparing whole tuples falls through to
    # entry_id/text on ties, which raises TypeError for non-orderable ids.
    scored.sort(key=lambda item: item[0], reverse=True)
    return scored[:k]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|