"""Gradio demo for summarizing and querying Alphabet's Q1 2023 10-Q MD&A.

The MD&A text file is loaded and split into token-bounded chunks at import
time. The UI then offers two tabs: one that summarizes the chunks with an
OpenAI LLM, and one that answers questions over a Chroma vector index built
from the same chunks.
"""
from functools import cache
import os

import gradio as gr
from langchain.llms import OpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains.question_answering import load_qa_chain
import tiktoken


@cache
def tiktoken_len_builder(model_name):
    """Return a function that counts tokens using `model_name`'s tokenizer.

    The result is cached per model name, so the tokenizer lookup happens
    only once for each model.
    """
    tokenizer = tiktoken.encoding_for_model(model_name)

    def token_len(text):
        """Return the number of tokens `text` encodes to."""
        # disallowed_special=() turns off tiktoken's special-token check, so
        # text that happens to contain a special-token string is still
        # encoded instead of raising.
        tokens = tokenizer.encode(text, disallowed_special=())
        return len(tokens)

    return token_len


def split_documents(docs, length_function, chunk_size=400, chunk_overlap=20):
    """Split `docs` into chunks measured by `length_function`.

    Args:
        docs: Documents to split.
        length_function: Callable mapping a string to its length; chunk
            sizes and overlaps are expressed in this unit.
        chunk_size: Maximum chunk length.
        chunk_overlap: Length shared between consecutive chunks.

    Returns:
        The list of chunked documents produced by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=length_function,
    )
    return text_splitter.split_documents(docs)


def summarize_docs(llm, docs):
    """Run a "map_reduce" summarize chain over `docs` and return its output."""
    chain = load_summarize_chain(llm, chain_type="map_reduce")
    return chain.run(docs)


class MdnaQA:
    """Question answering over document chunks indexed in a Chroma store."""

    def __init__(self, llm, docs):
        """Build the QA chain and embed `docs` into a Chroma index.

        The embeddings client reuses the API key configured on `llm`.
        """
        self.docs = docs
        self.chain = load_qa_chain(llm, chain_type="stuff")
        embeddings = OpenAIEmbeddings(openai_api_key=llm.openai_api_key)
        self.docsearch = Chroma.from_documents(docs, embeddings)

    def ask(self, question):
        """Answer `question` from the chunks most similar to it."""
        input_documents = self.docsearch.similarity_search(question)
        return self.chain.run(input_documents=input_documents, question=question)


# The source document is loaded and chunked once, at import time.
filename = "2023-05-12_2023_q1_goog_mdna.txt"
loader = TextLoader(filename)
documents = loader.load()

model_name = "text-davinci-003"
tiktoken_len = tiktoken_len_builder(model_name)
docs = split_documents(documents, tiktoken_len)
# Shown in the UI so the user can see how many input tokens the chunks total.
tokens_sum = sum(tiktoken_len(d.page_content) for d in docs)

title = "Alphabet's Q1 2023 10-Q MD&A"
video = ""

with gr.Blocks(title=title) as demo:
    gr.Markdown(f"# {title}")
    gr.HTML(video)
    gr.Markdown("Blog post https://blog.experienced.dev")
    gr.Markdown(
        "You can get an API key [from OpenAI](https://platform.openai.com/account/api-keys)"
    )
    openai_api_key = gr.Text(
        value=os.getenv("OPENAI_API_KEY"),
        type="password",
        label="OpenAI API key",
    )
    temperature = gr.Slider(
        0,
        2,
        value=0,
        step=0.1,
        label="Temperature",
        info="adjusts a model's output from predictable to random",
    )
    mdna = gr.State(docs)
    tokens_total = gr.Textbox(
        label="Total input tokens",
        value=tokens_sum,
        info="how many tokens will be spent on input / embeddings",
    )
    with gr.Tabs(visible=True) as tabs:
        with gr.TabItem("Summary"):
            summarize = gr.Button(
                "Summarize MD&A",
                variant="primary",
                info="On click you spent tokens on input, instructions and output",
            )
            summary = gr.TextArea(label="Summary")

            def summarize_mdna(docs, api_key, temp):
                """Summarize the MD&A chunks with an LLM built from the UI settings."""
                llm = OpenAI(temperature=temp, openai_api_key=api_key)
                return summarize_docs(llm, docs)

            summarize.click(
                summarize_mdna,
                inputs=[mdna, openai_api_key, temperature],
                outputs=[summary],
            )
        with gr.TabItem("QA with MD&A"):
            start_qa = gr.Button("Start QA with MD&A", variant="primary")
            # The chat widgets stay hidden until the index has been built.
            chatbot = gr.Chatbot(label="QA with MD&A", visible=False)
            question = gr.Textbox(
                label="Your question", interactive=True, visible=False
            )
            qa_chat = gr.State()
            send = gr.Button("Ask question", variant="primary", visible=False)

            def start_chat(docs, api_key, temp):
                """Build the QA session and reveal the chat widgets.

                Returns the session object followed by one update per output
                component (chatbot, question box, send button).
                """
                llm = OpenAI(temperature=temp, openai_api_key=api_key)
                qa = MdnaQA(llm, docs)
                # gr.update is component-agnostic, so each output receives an
                # update of its own kind instead of a Textbox update being
                # applied to the Chatbot.
                return (
                    qa,
                    gr.update(visible=True),
                    gr.update(visible=True),
                    gr.update(visible=True),
                )

            start_qa.click(
                start_chat,
                [mdna, openai_api_key, temperature],
                [qa_chat, chatbot, question, send],
            )

            def respond(qa_chat, question, chat_history):
                """Answer `question` and append the exchange to the chat history.

                Returns an empty string (which clears the question box) and
                the updated history.
                """
                chat_history = chat_history or []
                # Skip blank submissions so no tokens are spent on them.
                if not question or not question.strip():
                    return "", chat_history
                answer = qa_chat.ask(question)
                chat_history.append((question, answer))
                return "", chat_history

            send.click(respond, [qa_chat, question, chatbot], [question, chatbot])
            question.submit(respond, [qa_chat, question, chatbot], [question, chatbot])

demo.launch()