|
import os
|
|
from dotenv import load_dotenv
|
|
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
from langchain.agents import Tool, AgentExecutor
|
|
from langchain.tools.retriever import create_retriever_tool
|
|
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
|
|
from langchain_community.vectorstores import FAISS
|
|
from langchain_community.embeddings import AzureOpenAIEmbeddings
|
|
from langchain_community.chat_models import AzureChatOpenAI
|
|
from openai import AzureOpenAI
|
|
import warnings
|
|
|
|
|
|
# Load variables from a local .env file (if present) into the process
# environment before reading the Azure OpenAI settings below.
load_dotenv()

# Azure OpenAI connection settings, all read from the environment.
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_LLM_DEPLOYMENT = os.getenv("AZURE_OPENAI_LLM_DEPLOYMENT")
AZURE_OPENAI_EMBEDDING_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")

# Fail fast, and say exactly which settings are unset or empty, instead of
# failing later inside a client constructor. This single check also covers
# the API key, so no separate key-only check is needed afterwards.
_missing_env_vars = [
    name
    for name, value in (
        ("AZURE_OPENAI_API_KEY", AZURE_OPENAI_API_KEY),
        ("AZURE_OPENAI_ENDPOINT", AZURE_OPENAI_ENDPOINT),
        ("AZURE_OPENAI_LLM_DEPLOYMENT", AZURE_OPENAI_LLM_DEPLOYMENT),
        ("AZURE_OPENAI_EMBEDDING_DEPLOYMENT", AZURE_OPENAI_EMBEDDING_DEPLOYMENT),
    )
    if not value
]
if _missing_env_vars:
    raise ValueError(
        "Missing one or more Azure OpenAI environment variables: "
        + ", ".join(_missing_env_vars)
    )

# NOTE: this silences *every* warning category for the rest of the process.
warnings.filterwarnings("ignore")

# Chunk size shared by the text splitter and the embeddings client.
chunk_size = 500
|
|
|
|
|
|
def load_pdf_file(data_path):
    """Load the PDF files found in a directory.

    Args:
        data_path: Directory passed to ``DirectoryLoader``; files matching
            ``*.pdf`` are read with ``PyPDFLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files.
    """
    return DirectoryLoader(
        data_path,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
    ).load()
|
|
|
|
|
|
def text_split(docs):
    """Split documents into overlapping chunks.

    The chunk size comes from the module-level ``chunk_size``; the overlap
    between chunks is fixed at 20.

    Args:
        docs: Documents to hand to the splitter's ``split_documents``.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter``.
    """
    chunker = RecursiveCharacterTextSplitter(
        chunk_overlap=20,
        chunk_size=chunk_size,
    )
    pieces = chunker.split_documents(docs)
    return pieces
|
|
|
|
|
|
# Connection settings common to both Azure OpenAI clients below.
_azure_connection = {
    "azure_endpoint": AZURE_OPENAI_ENDPOINT,
    "openai_api_key": AZURE_OPENAI_API_KEY,
    "openai_api_version": "2023-12-01-preview",
}

# Chat model client bound to the configured LLM deployment.
llm = AzureChatOpenAI(
    deployment_name=AZURE_OPENAI_LLM_DEPLOYMENT,
    **_azure_connection,
)

# Embeddings client bound to the configured embedding deployment.
# NOTE(review): the same module-level ``chunk_size`` is also given to the
# text splitter; confirm the embeddings client interprets this value the
# way it is intended here.
embeddings = AzureOpenAIEmbeddings(
    azure_deployment=AZURE_OPENAI_EMBEDDING_DEPLOYMENT,
    chunk_size=chunk_size,
    **_azure_connection,
)
|
|
|
|
|
|
# Build the FAISS index: load the PDFs under Dataset/, split them into
# chunks, embed the chunks, and persist the index to disk.
pdf_docs = load_pdf_file("Dataset/")
chunks = text_split(pdf_docs)

# Stop with a clear message when nothing was loaded (no PDFs, or PDFs with no
# extractable text) rather than passing an empty list on to
# FAISS.from_documents.
if not chunks:
    raise ValueError(
        "No text chunks were produced from the PDF files in 'Dataset/'; "
        "cannot build the FAISS index."
    )

vectorstore = FAISS.from_documents(chunks, embeddings)

# Persist the index to the local folder "faiss_index_sysml".
vectorstore.save_local("faiss_index_sysml")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|