# %%
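# Simple Gradio app: upload a PDF, split it into chunks, embed the chunks into
# a Chroma vector store, and answer questions about the document with a
# LangChain ConversationalRetrievalChain backed by the OpenAI API.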
import gradio as gr
import sys, os
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
# from dotenv import load_dotenv, find_dotenv
# _ = load_dotenv(find_dotenv())
# OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
OPENAI_API_KEY = 'your own API key'
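# NOTE: replace the placeholder above with a real key, or uncomment the dotenv
# lines to load OPENAI_API_KEY from a .env file instead of hard-coding it.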
# %%
def load_documents(file_path):
    loader = UnstructuredPDFLoader(file_path)
    return loader.load()


def chunk_documents(data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    return text_splitter.split_documents(data)

def answer_questions(query, docsearch, chain):
    # Helper for a plain QA chain (e.g. load_qa_chain): retrieve the most
    # similar chunks and run the chain over them; chain.run returns the
    # answer as a string, so append it directly.
    results = [query]
    docs = docsearch.similarity_search(query, include_metadata=True)
    ans = chain.run(input_documents=docs, question=query)
    results.append(ans)
    return results

def run_model(file, question):
    # Load PDF file
    data = load_documents(file.name)
    # print(f'You have {len(data)} document(s) in your data')
    # print(f'There are {len(data[0].page_content)} characters in your document')
    # Chunk documents
    texts = chunk_documents(data)
    # print(f'Now you have {len(texts)} documents')
    # Set up embeddings and vector store
    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    vectorstore = Chroma.from_documents(texts, embeddings)
    # Set up memory and conversational retrieval chain
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    qa = ConversationalRetrievalChain.from_llm(
        OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY),
        vectorstore.as_retriever(search_kwargs={"k": 3}),
        memory=memory,
    )
    # Answer question
    result = qa({"question": question})
    return result["answer"]

# Create Gradio interface
file_upload = gr.inputs.File(label="Upload PDF file")
question = gr.inputs.Textbox(label="Question")
output = gr.outputs.Textbox()
gr.Interface(
    fn=run_model,
    inputs=[file_upload, question],
    outputs=output,
    title="Conversational Retrieval Chain",
    description="Upload a PDF file and ask a question related to its content.",
    # examples=[["./data/fulltext.pdf", "What is the paper about?"], ["./data/fulltext.pdf", "How is the cwsi defined?"]]
).launch()  # share=True
# %%