import os
import streamlit as st

from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.indexes import VectorstoreIndexCreator
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.callbacks import get_openai_callback

# variables
db_folder = "db"

# set your OpenAI API key
os.environ["OPENAI_API_KEY"] = "..."

# initialize the language model
llm = OpenAI(model_name="text-ada-001", n=2, best_of=2)

with get_openai_callback() as cb:
    # load the documents
    loader = DirectoryLoader('./docs', glob="**/*.md")
    documents = loader.load()
    # print(documents[0])

    # split the documents into chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = text_splitter.split_documents(documents)

    # create the embeddings
    embeddings = OpenAIEmbeddings()

    # create the vectorstore and retriever
    db = Chroma.from_documents(texts, embeddings, persist_directory=db_folder)
    retriever = db.as_retriever(search_type="mmr")

    # initialize the chain
    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

    # store the embeddings and index on disk
    db.persist()

    # report token usage and cost captured by the callback
    print(f"Total Tokens: {cb.total_tokens}")
    print(f"Prompt Tokens: {cb.prompt_tokens}")
    print(f"Completion Tokens: {cb.completion_tokens}")
    print(f"Successful Requests: {cb.successful_requests}")
    print(f"Total Cost (USD): ${cb.total_cost}")
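
# A minimal sketch of how the chain built above could be queried; the question
# string is a hypothetical placeholder and not part of the original script.
answer = qa.run("What do the docs say about installation?")
print(answer)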