CAPStone / src /helper.py
Rohan246's picture
Upload 20 files
503a7f1 verified
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_classic.text_splitter import RecursiveCharacterTextSplitter
from typing import List
from langchain_classic.schema import Document
import torch
from langchain_groq import ChatGroq
from langchain_classic.chains import create_retrieval_chain
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
def load_file(path):
    """Load the PDF files under ``path`` that match the ``*.pdf`` glob.

    A ``DirectoryLoader`` is built with ``PyPDFLoader`` as the per-file
    loader class, and the result of its ``load()`` call is returned as-is.
    """
    pdf_loader = DirectoryLoader(path, glob='*.pdf', loader_cls=PyPDFLoader)
    return pdf_loader.load()
def filtering(documents):
    """Rebuild each document keeping only its text and ``source`` metadata.

    For every item in ``documents`` a new ``Document`` is created with the
    same ``page_content`` and a metadata dict holding a single ``"source"``
    key, whose value is ``item.metadata.get("source")``. All other metadata
    entries are dropped. The new documents are returned as a list in input
    order.
    """
    slimmed: List[Document] = [
        Document(
            page_content=item.page_content,
            metadata={"source": item.metadata.get("source")},
        )
        for item in documents
    ]
    return slimmed
def chunking(docs, chunk_size=1000, chunk_overlap=200):
    """Split documents into overlapping chunks.

    Args:
        docs: Documents handed to the splitter's ``split_documents``.
        chunk_size: Target maximum chunk length, measured with ``len``.
            Defaults to 1000, the value previously hard-coded here.
        chunk_overlap: Overlap between consecutive chunks, measured with
            ``len``. Defaults to 200, the value previously hard-coded here.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns
        for ``docs``.
    """
    # The sizes are parameters so callers can tune chunking without editing
    # this helper; calling chunking(docs) behaves exactly as before.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return splitter.split_documents(docs)
def download_embeddings():
    """Create the ``BAAI/bge-small-en-v1.5`` HuggingFace embeddings object.

    The model is asked to run on ``"cuda"`` when
    ``torch.cuda.is_available()`` reports true, otherwise on ``"cpu"``.
    """
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"

    return HuggingFaceEmbeddings(
        model_name="BAAI/bge-small-en-v1.5",
        model_kwargs={"device": target_device},
    )