|
from pathlib import Path |
|
from langchain.text_splitter import CharacterTextSplitter |
|
import faiss |
|
from langchain.vectorstores import FAISS |
|
from langchain.embeddings import OpenAIEmbeddings |
|
import pickle |
|
|
|
def create_vector_store(suffix: str, paper_text: str) -> None:
    """Build a FAISS vector store from a paper's text and save it to disk.

    The text is cut into sections at every "§" marker ("§.§" markers are
    treated exactly like "§").  Each section is split further with
    ``CharacterTextSplitter(chunk_size=1500, separator="\\n")``, the resulting
    chunks are embedded with ``OpenAIEmbeddings`` and indexed with FAISS.

    Every chunk gets a ``{"source": <title>}`` metadata entry, where the
    title is the stripped first line of the section the chunk came from.
    The text before the first marker is labelled "Beginning of paper".

    Two files are written:

    * ``{suffix}/docs.index`` -- the FAISS index itself.
    * ``{suffix}/faiss_store.pkl`` -- the pickled store with its ``index``
      attribute set to ``None``; whoever loads it has to re-attach the index
      read from ``docs.index``.

    Args:
        suffix: Despite the name, this is used as the directory prefix for
            both output files.  The directory is created if it is missing.
        paper_text: Full text of the paper, with "§" / "§.§" marking the
            start of each section.
    """
    # Collapse the "§.§" marker into "§" so a single split handles both.
    sections = paper_text.replace("§.§", "§").split("§")

    # The text on the same line as the section marker serves as the title.
    titles = [section.split("\n")[0].strip() for section in sections]
    # str.split always returns at least one element, so index 0 exists.
    titles[0] = "Beginning of paper"

    text_splitter = CharacterTextSplitter(chunk_size=1500, separator="\n")
    docs = []
    metadatas = []
    for title, section in zip(titles, sections):
        chunks = text_splitter.split_text(section)
        docs.extend(chunks)
        # One fresh dict per chunk: ``[{...}] * n`` would repeat the *same*
        # dict object n times, so editing one chunk's metadata later would
        # silently change all of its siblings.
        metadatas.extend({"source": title} for _ in chunks)

    index_path = f"{suffix}/docs.index"
    store_path = f"{suffix}/faiss_store.pkl"
    # Make sure the target directory exists *before* calling the embedding
    # service, so a missing directory cannot waste that call.
    Path(index_path).parent.mkdir(parents=True, exist_ok=True)

    store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)
    faiss.write_index(store.index, index_path)
    # The index is persisted separately above; detach it before pickling
    # (presumably because the native FAISS index object does not pickle --
    # TODO confirm).
    store.index = None
    with open(store_path, "wb") as f:
        pickle.dump(store, f)
|
|