Princess3 committed
Commit 2457015
1 Parent(s): 3cb7033

Upload m3.py

Files changed (1)
  1. m3.py +12 -9
m3.py CHANGED
@@ -1,9 +1,10 @@
-import os, xml.etree.ElementTree as ET, torch, torch.nn as nn, torch.nn.functional as F, faiss, numpy as np
+import os, xml.etree.ElementTree as ET, torch, torch.nn as nn, torch.nn.functional as F, numpy as np
 from typing import List, Dict, Any, Optional
 from collections import defaultdict
 from accelerate import Accelerator
 from transformers import AutoTokenizer, AutoModel
 from termcolor import colored
+from sklearn.metrics.pairwise import cosine_similarity
 
 class DM(nn.Module):
     def __init__(self, s: Dict[str, List[Dict[str, Any]]]):
@@ -108,7 +109,7 @@ def cmf(folder_path: str) -> DM:
 def ceas(folder_path: str, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
     t = AutoTokenizer.from_pretrained(model_name)
     m = AutoModel.from_pretrained(model_name)
-    vs = faiss.IndexFlatL2(384)
+    embeddings = []
     ds = []
     for r, d, f in os.walk(folder_path):
         for file in f:
@@ -123,20 +124,22 @@ def ceas(folder_path: str, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
                 i = t(text, return_tensors="pt", truncation=True, padding=True)
                 with torch.no_grad():
                     emb = m(**i).last_hidden_state.mean(dim=1).numpy()
-                vs.add(emb)
+                embeddings.append(emb)
                 ds.append(text)
             except Exception as e:
                 print(colored(f"Error processing {fp}: {str(e)}", 'red'))
-    return vs, ds
+    embeddings = np.vstack(embeddings)
+    return embeddings, ds
 
-def qvs(query: str, vs, ds, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
+def qvs(query: str, embeddings, ds, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
     t = AutoTokenizer.from_pretrained(model_name)
     m = AutoModel.from_pretrained(model_name)
     i = t(query, return_tensors="pt", truncation=True, padding=True)
     with torch.no_grad():
         qe = m(**i).last_hidden_state.mean(dim=1).numpy()
-    D, I = vs.search(qe, k=5)
-    return [ds[i] for i in I[0]]
+    similarities = cosine_similarity(qe, embeddings)
+    top_k_indices = similarities[0].argsort()[-5:][::-1]
+    return [ds[i] for i in top_k_indices]
 
 def main():
     fp = 'data'
@@ -148,7 +151,7 @@ def main():
     si = torch.randn(1, ife)
     o = m(si)
     print(colored(f"Sample output shape: {o.shape}", 'green'))
-    vs, ds = ceas(fp)
+    embeddings, ds = ceas(fp)
     a = Accelerator()
     o = torch.optim.Adam(m.parameters(), lr=0.001)
     c = nn.CrossEntropyLoss()
@@ -169,7 +172,7 @@ def main():
         al = tl / len(td)
         print(colored(f"Epoch {e+1}/{ne}, Average Loss: {al:.4f}", 'blue'))
     uq = "example query text"
-    r = qvs(uq, vs, ds)
+    r = qvs(uq, embeddings, ds)
     print(colored(f"Query results: {r}", 'magenta'))
 
 if __name__ == "__main__":
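
For reference, a minimal standalone sketch of the retrieval path this commit switches to: mean-pooled transformer embeddings stacked with np.vstack and scored with sklearn's cosine_similarity, mirroring the new ceas/qvs logic. The embed helper, the sample documents, and the query string below are illustrative assumptions, not part of the repository.

# Sketch of the new retrieval path (assumed helper names and sample texts).
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def embed(text: str) -> np.ndarray:
    # Mean-pool the last hidden state, as ceas and qvs both do.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        return model(**inputs).last_hidden_state.mean(dim=1).numpy()

# Build the corpus matrix the way ceas now does: collect per-document
# embeddings in a list, then stack them into a single (n_docs, dim) array.
docs = ["first sample document", "second sample document", "third sample document"]
embeddings = np.vstack([embed(d) for d in docs])

# Query the way qvs now does: cosine similarity, then argsort for the top 5.
query_emb = embed("sample query")                     # shape (1, dim)
similarities = cosine_similarity(query_emb, embeddings)  # shape (1, n_docs)
top_k_indices = similarities[0].argsort()[-5:][::-1]     # highest similarity first
print([docs[i] for i in top_k_indices])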