|
import os
|
|
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
from langchain.embeddings.openai import OpenAIEmbeddings
|
|
from langchain_community.vectorstores import FAISS
|
|
from langchain.chains import RetrievalQA
|
|
from langchain.llms import OpenAI
|
|
from dotenv import load_dotenv
|
|
|
|
|
|
# Populate the process environment from a .env file (python-dotenv).
load_dotenv()

# Abort immediately when the credential is absent or empty instead of
# failing later at the first OpenAI request.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
    raise ValueError("Missing OPENAI_API_KEY in environment variables.")
|
|
|
|
|
|
def load_pdf_file(data_path):
    """Load the files matching ``*.pdf`` under ``data_path``.

    Args:
        data_path: Directory handed to ``DirectoryLoader`` as its root.

    Returns:
        Whatever ``DirectoryLoader.load()`` produces, with each matched
        file parsed by ``PyPDFLoader``.
    """
    return DirectoryLoader(
        data_path,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
    ).load()
|
|
|
|
|
|
def text_split(docs, chunk_size=500, chunk_overlap=20):
    """Split documents into overlapping chunks.

    Args:
        docs: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500,
            the value that was previously hard-coded.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults
            to 20, the value that was previously hard-coded.

    Returns:
        The result of ``split_documents(docs)``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)
|
|
|
|
|
|
# Model clients, both authenticated with OPENAI_API_KEY.
# NOTE(review): "gpt-4o-mini" is a chat model, while `OpenAI` is LangChain's
# completion-style wrapper -- confirm whether `ChatOpenAI` was intended.
llm = OpenAI(
    model_name="gpt-4o-mini",
    temperature=0.5,
    openai_api_key=OPENAI_API_KEY,
)
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Ingest the PDFs and cut them into chunks for embedding.
pdf_docs = load_pdf_file("/kaggle/input/rag-test")
chunks = text_split(pdf_docs)

# Building a FAISS index needs at least one chunk. Stop here with a clear
# message rather than letting the index construction fail on an empty list.
if not chunks:
    raise ValueError(
        "No text chunks were produced from the PDFs in /kaggle/input/rag-test."
    )

# Embed the chunks, build the index, and persist it to a local directory.
vectorstore = FAISS.from_documents(chunks, embeddings)
vectorstore.save_local("faiss_index_sysml")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|