|
import gradio as gr |
|
from langchain.vectorstores import Chroma |
|
from langchain.document_loaders import PyPDFLoader |
|
from langchain.embeddings import HuggingFaceInstructEmbeddings |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
|
|
|
|
# Embedding model handed to the Chroma index built later in this file;
# inference is pinned to the CPU.
# NOTE(review): the checkpoint is a plain sentence-transformers model while the
# wrapper is the Instructor one — confirm this pairing is intentional.
hf = HuggingFaceInstructEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs=dict(device="cpu"),
)
|
|
|
|
|
from langchain.document_loaders import PyPDFDirectoryLoader |
|
|
|
# Load the PDF files found under new_papers/ into document objects.
loader = PyPDFDirectoryLoader("new_papers/")
documents = loader.load()

# Break the documents into chunks of up to 1000 characters, with 200
# characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(documents)

# Embed the chunks with `hf` and store them in the "my-collection" Chroma
# collection.
db = Chroma.from_documents(
    documents=texts,
    embedding=hf,
    collection_name="my-collection",
)
|
|
|
class VectoreStoreRetrievalTool:
    """Callable tool that returns the stored text most relevant to a query.

    By default the tool wraps a retriever over the module-level Chroma store
    ``db`` that returns the single best match (``k=1``).
    """

    def __init__(self, retriever=None):
        """Set up the retriever used by ``__call__``.

        Args:
            retriever: Optional object exposing
                ``get_relevant_documents(query)`` and returning objects with
                a ``page_content`` attribute. When omitted, a retriever over
                the module-level ``db`` with ``k=1`` is created.
        """
        if retriever is None:
            retriever = db.as_retriever(search_kwargs={"k": 1})
        self.retriever = retriever

    def __call__(self, query):
        """Return the text of the documents retrieved for *query*.

        Args:
            query: Natural-language search string.

        Returns:
            The ``page_content`` of every retrieved document joined by blank
            lines, or an empty string when nothing is retrieved.
        """
        # A retriever has no ``run`` method and yields a list of documents,
        # not a dict with a "result" key (that is the QA-chain API), so the
        # documents are fetched and their text extracted explicitly.
        documents = self.retriever.get_relevant_documents(query)
        return "\n\n".join(doc.page_content for doc in documents)
|
|