import os

from langchain.vectorstores import FAISS
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def create_vector_store_index(file_path, embedding_model_repo_id="sentence-transformers/all-roberta-large-v1"):
    """Build a FAISS vector store from a CSV or PDF file and save it under ./db/faiss_index."""

    # Derive the file type from the extension; splitting the whole path on "."
    # misfires on extensionless paths such as "./data/report".
    file_type = os.path.splitext(file_path)[1].lstrip(".").lower()

    if file_type == "csv":
        # CSVLoader yields one document per row, which is already chunk-sized,
        # so no further splitting is applied.
        loader = CSVLoader(file_path=file_path)
        documents = loader.load()

    elif file_type == "pdf":
        loader = PyPDFLoader(file_path)
        pages = loader.load()

        # PDF pages can be long, so split them into overlapping chunks.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1024,
            chunk_overlap=128,
        )
        documents = text_splitter.split_documents(pages)

    else:
        # Guard against other file types; without this branch, `documents`
        # would be unbound and FAISS.from_documents would raise a NameError.
        raise ValueError(f"Unsupported file type: {file_type!r} (expected 'csv' or 'pdf')")

    # Embed the chunks and build the FAISS index.
    embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_repo_id)
    vectordb = FAISS.from_documents(documents, embedding_model)

    file_output = "./db/faiss_index"
    vectordb.save_local(file_output)

    return "Vector store index is created."