import os

from dotenv import load_dotenv
from bs4.filter import SoupStrainer
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.load import dumps, loads


def get_retriever(
    link: str,
    *,
    chunk_size: int = 300,
    chunk_overlap: int = 50,
    persist_directory: str = "chroma_db",
    collection_name: str = "lilian_weng_agent_blog",
):
    """Build a Chroma-backed retriever over the blog post found at *link*.

    The page is fetched with ``WebBaseLoader``, split into overlapping
    chunks, embedded with ``OpenAIEmbeddings`` and stored in a Chroma
    collection, which is then exposed as a retriever.

    Args:
        link: URL of the page to load.
        chunk_size: Maximum chunk size handed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks.
        persist_directory: Directory passed to Chroma for its storage.
        collection_name: Name of the Chroma collection the chunks are
            written to. Pass a different name per source so unrelated
            pages do not end up in the same collection.

    Returns:
        The object returned by ``vectorstore.as_retriever()``.

    NOTE(review): ``Chroma.from_documents`` is invoked on every call, so
    calling this repeatedly with the same ``persist_directory`` and
    ``collection_name`` presumably re-embeds the page and adds the chunks
    again -- confirm whether de-duplication is needed by the callers.
    NOTE(review): ``OpenAIEmbeddings()`` is built without explicit
    credentials; presumably they come from the environment -- confirm
    that ``load_dotenv()`` is called somewhere before this function.
    """
    # Only parse the parts of the page carrying the post itself; the CSS
    # class names are the ones the original implementation filtered on.
    post_only = SoupStrainer(
        class_=("post-content", "post-title", "post-header")
    )
    loader = WebBaseLoader(
        web_path=(link,),
        bs_kwargs=dict(parse_only=post_only),
    )
    blog_docs = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    splits = text_splitter.split_documents(blog_docs)  # type: ignore

    vectorstore = Chroma.from_documents(
        documents=splits,  # type: ignore
        embedding=OpenAIEmbeddings(),
        persist_directory=persist_directory,
        collection_name=collection_name,
    )
    return vectorstore.as_retriever()


def get_llm(temperature: float = 0) -> ChatOpenAI:
    """Return a ``ChatOpenAI`` chat model.

    Args:
        temperature: Sampling temperature forwarded to ``ChatOpenAI``.
            Defaults to ``0``, the value the function always used before
            it became configurable.

    Returns:
        A newly constructed ``ChatOpenAI`` instance.
    """
    return ChatOpenAI(temperature=temperature)