File size: 3,481 Bytes
9958185
 
 
 
 
 
 
 
 
 
 
 
 
a262d57
9958185
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe0d56f
9958185
 
fe0d56f
9958185
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import gradio as gr
import openai, os
import tqdm
import time
from langchain.vectorstores import Chroma
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain import VectorDBQA
from langchain.llms import AzureOpenAI

# Azure OpenAI configuration.  Each setting is written both to the process
# environment (read by langchain's OpenAI integrations) and to the `openai`
# module itself (used by the direct openai.Embedding.create calls below),
# so the two client paths stay in sync.
os.environ["OPENAI_API_TYPE"] = openai.api_type = "azure"
os.environ["OPENAI_API_VERSION"] = openai.api_version = "2022-12-01"
os.environ["OPENAI_API_BASE"] = openai.api_base = "https://openai-endpoint.openai.azure.com/"
# The key must already be set in the environment; a missing key raises KeyError at startup.
openai.api_key = os.environ["OPENAI_API_KEY"]


def upload_pdf(file, pdf_text, embeddings, vectorstore, azure_embeddings, qa, progress = gr.Progress(track_tqdm=True)):
    """Extract text from an uploaded PDF, embed it, and build a QA chain.

    Args:
        file: Uploaded PDF (path / file object accepted by ``PdfReader``).
        pdf_text, embeddings, vectorstore, azure_embeddings, qa: gradio
            State values passed through and returned updated.
        progress: gradio progress tracker; ``track_tqdm=True`` mirrors the
            tqdm loop below into the UI.

    Returns:
        Tuple matching the gradio outputs: extracted text (for the
        TextArea), the five updated state values, then three visibility
        updates (show chat row, show submit row, hide upload column).
    """
    # 1. Extract all page text.  ("".join avoids the quadratic cost of
    # repeated string += in a loop.)
    reader = PdfReader(file)
    pdf_text = "".join(page.extract_text() for page in reader.pages)

    # 2. Split into overlapping chunks sized for the embedding model.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    texts = text_splitter.split_text(pdf_text)

    def _embed_with_retry(chunk):
        # One retry after a pause: Azure OpenAI rate-limits bursts of
        # embedding calls.  A second failure propagates to the caller.
        try:
            response = openai.Embedding.create(
                input=chunk,
                engine="text-embedding-ada-002")
        except Exception as e:
            print(e)
            time.sleep(8)
            response = openai.Embedding.create(
                input=chunk,
                engine="text-embedding-ada-002")
        return response['data'][0]['embedding']

    # 3. Embed every chunk; tqdm drives the gradio progress bar.
    for text in tqdm.tqdm(texts):
        embeddings.append(_embed_with_retry(text))

    # 4. Index chunks + precomputed embeddings in an in-memory Chroma
    # collection.  The OpenAIEmbeddings object is only used at query time.
    azure_embeddings = OpenAIEmbeddings(document_model_name="text-embedding-ada-002", query_model_name="text-embedding-ada-002")
    vectorstore = Chroma("collection", embedding_function=azure_embeddings)

    vectorstore._collection.add(
        ids=[f"doc_{i}" for i in range(len(texts))],
        documents=texts,
        embeddings=embeddings,
        metadatas=[{"source": "source"} for _ in texts])

    # 5. Retrieval-QA chain over the vector store, answered by davinci-003.
    qa = VectorDBQA.from_chain_type(
        llm=AzureOpenAI(deployment_name="davinci003", model_name="text-davinci-003"),
        chain_type="stuff",
        vectorstore=vectorstore)

    return (pdf_text, pdf_text, embeddings, vectorstore, azure_embeddings, qa,
            gr.update(visible=True), gr.update(visible=True), gr.update(visible=False))


def add_text(chatstate, query, qa):
    """Answer *query* with the QA chain and append the exchange to history.

    Returns the updated history twice — once for the Chatbot component and
    once for the chatstate State — plus the unchanged qa object.
    """
    answer = qa.run(query)
    chatstate = chatstate + [(query, answer)]
    return chatstate, chatstate, qa

with gr.Blocks(css="footer {visibility: hidden}") as demo:
    # BUG FIX: the original chained assignment
    #   qa = pdf_text = embeddings = vectorstore = azure_embeddings = gr.State([])
    # bound all five names to ONE shared gr.State instance, so every event
    # handler wired the same component in as five distinct inputs/outputs.
    # Each piece of per-session state needs its own gr.State.
    pdf_text = gr.State([])
    embeddings = gr.State([])
    vectorstore = gr.State([])
    azure_embeddings = gr.State([])
    qa = gr.State([])

    # Chat UI — hidden until a PDF has been uploaded and indexed.
    with gr.Row(visible=False) as chat_row:
        chatbot = gr.Chatbot()
    with gr.Row(visible=False) as submit_row:
        text = gr.Textbox(show_label=False, placeholder="Enter text and press enter").style(container=False)
    chatstate = gr.State([])
    text.submit(add_text, [chatstate, text, qa], [chatbot, chatstate, qa])

    # Upload UI — upload_pdf hides this column and reveals the chat rows
    # via the gr.update(...) values in its return tuple.
    with gr.Column() as upload_column:
        file = gr.File()
        upload_btn = gr.Button("Upload")
        output_text = gr.TextArea()
        upload_btn.click(
            upload_pdf,
            inputs=[file, pdf_text, embeddings, vectorstore, azure_embeddings, qa],
            outputs=[output_text, pdf_text, embeddings, vectorstore, azure_embeddings, qa,
                     chat_row, submit_row, upload_column])


demo.launch(enable_queue=True)