Spaces:
Sleeping
Sleeping
Create embedding.py
Browse files- src/embedding.py +20 -0
src/embedding.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# embedding.py
|
| 2 |
+
import os
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import faiss
|
| 6 |
+
from sentence_transformers import SentenceTransformer
|
| 7 |
+
|
| 8 |
+
# --- Load data ---
|
| 9 |
+
def load_data():
    """Load the question/answer pairs from the CSV stored beside this module.

    Reads ``train_data.csv`` from the directory containing this file and
    pulls out its ``question`` and ``answer`` columns.

    Returns:
        A ``(questions, answers)`` tuple of two lists, taken row by row
        from the same DataFrame.
    """
    # Resolve relative to this module rather than the working directory.
    module_dir = os.path.dirname(__file__)
    frame = pd.read_csv(os.path.join(module_dir, 'train_data.csv'))

    questions = frame['question'].tolist()
    answers = frame['answer'].tolist()
    return questions, answers
|
| 13 |
+
|
| 14 |
+
# --- Embedding model and FAISS index ---
|
| 15 |
+
def setup_embeddings(answers):
    """Embed the answers and build an exact L2 FAISS index over them.

    Args:
        answers: The answer strings to embed (passed straight to
            ``SentenceTransformer.encode``).

    Returns:
        An ``(embedder, index)`` tuple: the loaded ``SentenceTransformer``
        model and a ``faiss.IndexFlatL2`` containing the answer embeddings.
        When ``answers`` is empty the index is returned empty (but with the
        model's embedding dimension) instead of raising ``IndexError``.
    """
    embedder = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
    answer_embeddings = embedder.encode(answers, show_progress_bar=True)

    # FAISS expects a C-contiguous float32 matrix. ascontiguousarray enforces
    # that and, unlike np.array, does not copy when the layout already matches.
    vectors = np.ascontiguousarray(answer_embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        # encode() returns a 1-D array for an empty list, so shape[1] would
        # fail; fall back to the dimension reported by the model itself.
        vectors = vectors.reshape(-1, embedder.get_sentence_embedding_dimension())

    index = faiss.IndexFlatL2(vectors.shape[1])
    if vectors.shape[0]:
        index.add(vectors)
    return embedder, index
|