# %%
import gradio as gr
import os
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

# from dotenv import load_dotenv, find_dotenv
# _ = load_dotenv(find_dotenv())
# OPENAI_API_KEY  = os.getenv('OPENAI_API_KEY')

OPENAI_API_KEY = 'your own API key'  # replace with your OpenAI API key
# %%
def load_documents(file_path):
    loader = UnstructuredPDFLoader(file_path)
    return loader.load()

def chunk_documents(data):
    # chunk_size counts characters (the splitter's default length function);
    # 1000-character chunks with no overlap keep the QA prompt compact
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    return text_splitter.split_documents(data)

def answer_questions(query, docsearch, chain):
    # Retrieve the most relevant chunks, then run the QA chain over them
    results = [query]
    docs = docsearch.similarity_search(query)
    ans = chain.run(input_documents=docs, question=query)  # chain.run returns a string
    results.append(ans)
    return results
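
# answer_questions() is never called by run_model() below. A minimal sketch of
# how it could be wired up, assuming the same legacy langchain APIs imported
# above (variable names and the PDF path are hypothetical):
#
#   embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
#   docsearch = Chroma.from_documents(chunk_documents(load_documents("paper.pdf")), embeddings)
#   chain = load_qa_chain(OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY), chain_type="stuff")
#   print(answer_questions("What is the paper about?", docsearch, chain))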


def run_model(file, question):
    # Load PDF file

    data = load_documents(file.name)
    # print(f'You have {len(data)} document(s) in your data')
    # print(f'There are {len(data[0].page_content)} characters in your document')
    
    # Chunk documents
    texts = chunk_documents(data)
    # print(f'Now you have {len(texts)} documents')
    
    # Set up embeddings and vector store
    # (Chroma.from_documents expects an Embeddings object, not a model-name string)
    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    vectorstore = Chroma.from_documents(texts, embeddings)
    
    # Set up memory and conversational retrieval chain.
    # Note: both are recreated on every call, so chat history does not
    # persist across Gradio requests.
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    qa = ConversationalRetrievalChain.from_llm(
        OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY), 
        vectorstore.as_retriever(search_kwargs={"k": 3}),
        memory=memory
    )
    
    # Answer question
    result = qa({"question": question})
    return result["answer"]

# Create Gradio interface (gr.inputs/gr.outputs is the legacy Gradio API,
# removed in Gradio 4, consistent with the legacy langchain imports above)
file_upload = gr.inputs.File(label="Upload PDF file")
question = gr.inputs.Textbox(label="Question")
output = gr.outputs.Textbox()

gr.Interface(
    fn=run_model,
    inputs=[file_upload, question],
    outputs=output,
    title="Conversational Retrieval Chain",
    description="Upload a PDF file and ask a question related to its content.",
    # examples=[["./data/fulltext.pdf", "What is the paper about?"], ["./data/fulltext.pdf", "How is the cwsi defined?"]]
).launch()  # pass share=True for a public link
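
# A quick sanity check without the UI (the PDF path is hypothetical; the small
# class below mimics the file object Gradio hands to run_model):
#
#   class _Upload:
#       name = "./data/fulltext.pdf"
#   print(run_model(_Upload(), "What is the paper about?"))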



# %%