File size: 1,003 Bytes
e394a78
 
 
 
1f89808
e394a78
1f89808
 
e394a78
1f89808
 
e394a78
 
 
 
 
 
 
 
 
 
 
 
 
 
1f89808
 
5ae483b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# * Helpers for the RAG pipeline: PDF ingestion, chunking, embedding, and similarity search.
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
import os
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# Expose the value stored under OPENROUTE_API_KEY as OPENAI_API_KEY.
# os.getenv() returns None when the variable is unset, and assigning None into
# os.environ raises TypeError, so the value is only copied when it exists.
_openroute_api_key = os.getenv('OPENROUTE_API_KEY')
if _openroute_api_key is not None:
    os.environ['OPENAI_API_KEY'] = _openroute_api_key


def dataIngestion(document):
    """Load a PDF through ``PyPDFLoader`` and return the loaded documents.

    Args:
        document: Location of the PDF; passed unchanged to ``PyPDFLoader``.

    Returns:
        The result of calling ``load()`` on the loader.
    """
    return PyPDFLoader(document).load()


def transform(ingested_docs, chunk_size=1000, chunk_overlap=200):
    """Split loaded documents into overlapping chunks.

    Args:
        ingested_docs: Documents to split, handed to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum chunk size given to the splitter. Defaults to
            1000, the value this function previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks given to the
            splitter. Defaults to 200, the previously hard-coded value.

    Returns:
        The chunked documents returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(ingested_docs)


def vectorStoreAndEmbeddings(
    docs,
    query,
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    k=4,
):
    """Embed ``docs`` into a Chroma store and run a similarity search.

    The embedding model is instantiated and ``Chroma.from_documents`` is
    called on every invocation, so each query pays the full embedding cost.
    NOTE(review): callers issuing many queries over the same documents would
    benefit from building the store once and reusing it.

    Args:
        docs: Documents to embed and index.
        query: Query text passed to ``similarity_search``.
        model_name: Hugging Face model used for the embeddings. Defaults to
            the model this function previously hard-coded.
        k: Number of results to request. Defaults to 4, which matches the
            documented default of ``similarity_search``, so existing callers
            see the same behavior.

    Returns:
        The result of ``db.similarity_search(query, k=k)``.
    """
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    db = Chroma.from_documents(documents=docs, embedding=embeddings)
    return db.similarity_search(query, k=k)