ElectionProgramChatbot / Vectorstore.py
phisinger's picture
update paths
7d6d012
from langchain.embeddings import GPT4AllEmbeddings
from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import GPT4AllEmbeddings
from langchain.vectorstores import Chroma
import chromadb
class Vectorstore_client:
    """Persistent Chroma client holding one collection per election.

    On construction, a collection named after each election is fetched or
    created. Collections that are still empty are filled with overlapping
    text chunks taken from the cleaned program files of that election.
    """

    def __init__(
        self,
        persist_directory="data/vectorstore",
        data_directory="data/clean/",
        elections=("2013", "2017", "2021"),
    ):
        """Open the persistent store and populate every empty collection.

        Args:
            persist_directory: Directory handed to
                ``chromadb.PersistentClient`` as its storage path.
            data_directory: Directory searched for the cleaned text files.
            elections: Election identifiers (strings). Each one is used both
                as the collection name and as the file-name suffix, i.e.
                files matching ``*<election>.txt`` are loaded.
        """
        self.persist_directory = persist_directory
        self.client = chromadb.PersistentClient(path=self.persist_directory)

        # The splitter settings are identical for every election, so the
        # splitter is built once instead of once per loop iteration.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=200)

        for election in elections:
            collection = self.client.get_or_create_collection(name=election)

            # Check the collection first: when it already holds data there is
            # no reason to read and split the source files again.
            if collection.count() != 0:
                continue

            # Load all files of this election from the cleaned data set.
            loader = DirectoryLoader(
                data_directory,
                glob=f"*{election}.txt",
                use_multithreading=True,
                loader_cls=TextLoader,
            )
            all_splits = text_splitter.split_documents(loader.load())
            all_texts = [split.page_content for split in all_splits]

            # Ids are 1-based and only need to be unique within a collection.
            ids_list = [f"id{i}" for i in range(1, len(all_texts) + 1)]

            # Store the chunks in the database.
            collection.add(documents=all_texts, ids=ids_list)

    def get_client(self):
        """Return the Chroma client created in ``__init__``."""
        return self.client
# class Vectorstore:
# def __init__(self) -> None:
# self.persist_directory = "/home/phisinger/Programmieren/wahlprogramm_analyse/data/vectorstore"
# if False:
# # load data from data persist_directory
# print("use persisted db.")
# self.vectordb = Chroma(persist_directory=persist_directory,
# embedding_function=GPT4AllEmbeddings())
# else:
# print("Build new vector DB")
# self.build_vectorstore()
# return self.vectordb
# def build_vectorstore(self):
# elections = ["2013", "2017", "2021"]
# for election in elections:
# # load all files from cleaned data set
# glob = "*" + election + ".txt"
# loader = DirectoryLoader(
# '../data/clean/', glob=glob, use_multithreading=True, loader_cls=TextLoader)
# docs_list = loader.load()
# # split documents
# text_splitter = RecursiveCharacterTextSplitter(
# chunk_size=1000, chunk_overlap=200)
# all_splits = text_splitter.split_documents(docs_list)
# # store documents in vector store
# self.vectordb = Chroma.from_documents(
# documents=all_splits, embedding=GPT4AllEmbeddings(), persist_directory=self.persist_directory)
# self.vectordb.persist()
# def get(self):
# return self.vectordb