Spaces:
Running
Running
import os | |
import pickle | |
import tempfile | |
from langchain.document_loaders import PyPDFLoader, TextLoader | |
from langchain.document_loaders.csv_loader import CSVLoader | |
from langchain.embeddings.openai import OpenAIEmbeddings | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.vectorstores import FAISS | |
class Embedder: | |
def __init__(self): | |
self.PATH = "embeddings" | |
self.createEmbeddingsDir() | |
def createEmbeddingsDir(self): | |
""" | |
Creates a directory to store the embeddings vectors | |
""" | |
if not os.path.exists(self.PATH): | |
os.mkdir(self.PATH) | |
def storeDocEmbeds(self, file, original_filename): | |
""" | |
Stores document embeddings using Langchain and FAISS | |
""" | |
with tempfile.NamedTemporaryFile(mode="wb", delete=False) as tmp_file: | |
tmp_file.write(file) | |
tmp_file_path = tmp_file.name | |
def get_file_extension(uploaded_file): | |
file_extension = os.path.splitext(uploaded_file)[1].lower() | |
return file_extension | |
text_splitter = RecursiveCharacterTextSplitter( | |
chunk_size=2000, | |
chunk_overlap=100, | |
length_function=len, | |
) | |
file_extension = get_file_extension(original_filename) | |
if file_extension == ".csv": | |
loader = CSVLoader( | |
file_path=tmp_file_path, | |
encoding="utf-8", | |
csv_args={ | |
"delimiter": ",", | |
}, | |
) | |
data = loader.load() | |
elif file_extension == ".pdf": | |
loader = PyPDFLoader(file_path=tmp_file_path) | |
data = loader.load_and_split(text_splitter) | |
elif file_extension == ".txt": | |
loader = TextLoader(file_path=tmp_file_path, encoding="utf-8") | |
data = loader.load_and_split(text_splitter) | |
# embeddings = OpenAIEmbeddings() | |
from langchain.embeddings import HuggingFaceEmbeddings | |
modelpath = "intfloat/e5-large-v2" | |
embeddings = HuggingFaceEmbeddings(model_name=modelpath) | |
vectors = FAISS.from_documents(data, embeddings) | |
os.remove(tmp_file_path) | |
# Save the vectors to a pickle file | |
with open(f"{self.PATH}/{original_filename}.pkl", "wb") as f: | |
pickle.dump(vectors, f) | |
def getDocEmbeds(self, file, original_filename): | |
""" | |
Retrieves document embeddings | |
""" | |
if not os.path.isfile(f"{self.PATH}/{original_filename}.pkl"): | |
self.storeDocEmbeds(file, original_filename) | |
# Load the vectors from the pickle file | |
with open(f"{self.PATH}/{original_filename}.pkl", "rb") as f: | |
vectors = pickle.load(f) | |
return vectors | |