File size: 7,865 Bytes
3dff4cb
e9840df
46a768d
2c230be
49dbc00
e9840df
3dff4cb
e9840df
ef9e1ba
 
e9840df
ef9e1ba
e9840df
2783dc6
8ae1bd1
d2237c8
8ae1bd1
e2b4917
e9840df
6814430
e9840df
 
 
3dff4cb
 
 
 
 
 
 
 
 
688f875
49dbc00
90fc7ac
e9840df
 
8da2c81
e9840df
 
8da2c81
e9840df
 
8da2c81
57005dc
6814430
 
57005dc
 
6814430
e9840df
fafce48
e9840df
 
3edae51
e9840df
3edae51
e9840df
3edae51
e9840df
3edae51
31f4dd5
7232b90
 
3b8e35c
7232b90
31f4dd5
7232b90
 
 
 
 
 
31f4dd5
 
7232b90
de8093f
e9840df
1c52547
 
e9840df
 
 
1c52547
 
e9840df
 
 
1c52547
 
e9840df
 
 
5a2a128
 
90fc7ac
36b9066
49dbc00
e9840df
a5b8a59
 
 
46a768d
f95de96
 
 
 
46a768d
f95de96
ee9bd35
2968e66
3b8e35c
 
 
 
bcc7659
46a768d
 
b04aa25
46a768d
 
 
 
 
 
 
 
 
 
 
e9840df
 
 
 
 
 
 
 
c2e3e8e
 
 
31c19b2
e9840df
 
 
 
 
 
 
b04aa25
 
 
f9e518b
b04aa25
f9e518b
b04aa25
 
 
 
 
a5b8a59
b04aa25
 
 
 
 
 
 
 
 
 
 
8da2c81
b04aa25
 
 
 
 
 
 
 
 
 
4d8a5d0
fdcda98
bc0dc94
 
46a768d
a5b8a59
 
 
 
3b8e35c
 
 
31c19b2
a5b8a59
2af3209
571b70a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
import os 
import gradio as gr
import time 
from langchain.document_loaders import PDFMinerLoader,CSVLoader ,UnstructuredWordDocumentLoader,TextLoader,OnlinePDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain import HuggingFaceHub
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Device handed to the sentence-transformer embedding model via model_kwargs.
DEVICE = 'cpu'
# Choices offered in the "File Extensions" dropdown and passed as file_types
# to the upload widget.
# NOTE(review): 'wav' is listed here but document_loader has no branch for it — confirm.
FILE_EXT = ['pdf','text','csv','word','wav']
# NOTE(review): not referenced anywhere in the visible code — confirm it is still needed.
DEFAULT_SYSTEM_PROMPT = "You are a chatbot and answering questions point wise"
# Upper bound of the "Max new tokens" slider.
MAX_NEW_TOKENS = 4096
# Initial value of the "Temperature" slider.
DEFAULT_TEMPERATURE = 0.1
# Initial value of the "Max new tokens" slider.
DEFAULT_MAX_NEW_TOKENS = 2048
# NOTE(review): not referenced anywhere in the visible code — confirm it is still needed.
MAX_INPUT_TOKEN_LENGTH = 4000

def loading_file():
    """Return the interim status text displayed when processing is triggered."""
    status_text = "Loading..."
    return status_text


def get_openai_chat_model(API_key):
    """Create a LangChain ``OpenAI`` LLM using the supplied API key.

    The key is exported as the ``OPENAI_API_KEY`` environment variable before
    the client is constructed.

    Args:
        API_key: OpenAI API key entered by the user.

    Returns:
        An ``OpenAI`` LLM instance built with its default arguments.

    Raises:
        ImportError: if ``langchain.llms.OpenAI`` cannot be imported.
    """
    try:
        from langchain.llms import OpenAI
    except ImportError as err:
        # Raising a plain string is itself a TypeError in Python 3; raise a
        # real exception and keep the original failure as the cause.
        raise ImportError(
            f"{err}, unable to load openAI. Please install openai and add OPENAIAPI_KEY"
        ) from err
    os.environ["OPENAI_API_KEY"] = API_key
    return OpenAI()

def process_documents(documents,data_chunk=1500,chunk_overlap=100):
    """Split loaded documents into chunks using a newline separator.

    Args:
        documents: Documents to split, as produced by one of the loaders.
        data_chunk: Chunk size handed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        The result of ``CharacterTextSplitter.split_documents``.
    """
    splitter = CharacterTextSplitter(
        separator='\n',
        chunk_size=data_chunk,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)

def get_hugging_face_model(model_id,API_key,temperature=0.1,max_tokens=4096):
    """Create a ``HuggingFaceHub`` LLM for the given repository.

    Args:
        model_id: Hugging Face Hub repository id of the model.
        API_key: Hugging Face Hub API token.
        temperature: Sampling temperature forwarded in ``model_kwargs``.
        max_tokens: Forwarded in ``model_kwargs`` as ``max_new_tokens``.

    Returns:
        The configured ``HuggingFaceHub`` instance.
    """
    generation_kwargs = {"temperature": temperature, "max_new_tokens": max_tokens}
    return HuggingFaceHub(
        repo_id=model_id,
        huggingfacehub_api_token=API_key,
        model_kwargs=generation_kwargs,
    )

def chat_application(llm_service,key,temperature=0.1,max_tokens=1024):
    """Return the LLM for the selected service.

    Args:
        llm_service: ``'HuggingFace'`` selects the Hugging Face Hub model;
            any other value selects OpenAI.
        key: API key/token for the selected service.
        temperature: Sampling temperature for the Hugging Face model.
        max_tokens: Maximum new tokens for the Hugging Face model.

    Returns:
        The LLM object built by the service-specific factory.
    """
    if llm_service == 'HuggingFace':
        # Forward the generation settings; they were previously accepted but
        # dropped, so the caller's values never reached the model.
        return get_hugging_face_model(
            model_id='tiiuae/falcon-7b-instruct',
            API_key=key,
            temperature=temperature,
            max_tokens=max_tokens,
        )
    # get_openai_chat_model only takes the API key, so the generation
    # settings do not apply on this path.
    return get_openai_chat_model(API_key=key)


def document_loader(file_path,api_key,doc_type='pdf',llm='HuggingFace',temperature=0.1,max_tokens=4096):
    """Load an uploaded file, index it, and build the retrieval QA chain.

    On success the module-level globals ``vector_db`` (FAISS index) and ``qa``
    (RetrievalQA chain) are (re)assigned.

    Args:
        file_path: Uploaded file object; the loaders read its ``.name``.
        api_key: API key/token for the selected LLM service.
        doc_type: One of ``'pdf'``, ``'text'``, ``'csv'``, ``'word'``.
        llm: LLM service name passed to ``chat_application``.
        temperature: Sampling temperature passed to ``chat_application``.
        max_tokens: Max new tokens passed to ``chat_application``.

    Returns:
        A status message: a completion notice on success, or an explanation
        when no file was supplied or the document type is unsupported.
    """
    global vector_db
    global qa

    # Without these guards a missing upload or an unknown/unselected type
    # left the document as None and crashed later in the text splitter.
    if file_path is None:
        return "Please upload a file before generating embeddings."

    if doc_type == 'pdf':
        document = process_pdf_document(document_file=file_path)
    elif doc_type == 'text':
        document = process_text_document(document_file=file_path)
    elif doc_type == 'csv':
        document = process_csv_document(document_file=file_path)
    elif doc_type == 'word':
        document = process_word_document(document_file=file_path)
    else:
        return f"Unsupported file type: {doc_type!r}. Please select pdf, text, csv or word."

    texts = process_documents(documents=document)
    embedding_model = SentenceTransformerEmbeddings(model_name='thenlper/gte-base',model_kwargs={"device": DEVICE})
    vector_db = FAISS.from_documents(documents=texts, embedding=embedding_model)

    chat_llm = chat_application(
        llm_service=llm,
        key=api_key,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    qa = RetrievalQA.from_chain_type(
        llm=chat_llm,
        chain_type='stuff',
        retriever=vector_db.as_retriever(),
        return_source_documents=True,
    )
    return "Document Processing completed ..."
        
def process_text_document(document_file):
    """Load a text upload with ``TextLoader`` and return what the loader yields.

    Args:
        document_file: Uploaded file object whose ``.name`` is the path to read.
    """
    return TextLoader(document_file.name).load()

def process_csv_document(document_file):
    """Load a CSV upload with ``CSVLoader`` and return what the loader yields.

    Args:
        document_file: Uploaded file object whose ``.name`` is the path to read.
    """
    csv_path = document_file.name
    return CSVLoader(file_path=csv_path).load()

def process_word_document(document_file):
    """Load a Word upload with ``UnstructuredWordDocumentLoader``.

    Args:
        document_file: Uploaded file object whose ``.name`` is the path to read.

    Returns:
        The result of the loader's ``load()`` call.
    """
    word_path = document_file.name
    return UnstructuredWordDocumentLoader(file_path=word_path).load()

def process_pdf_document(document_file):
    """Load a PDF upload with ``PDFMinerLoader``, logging the file name first.

    Args:
        document_file: Uploaded file object whose ``.name`` is the path to read.

    Returns:
        The result of the loader's ``load()`` call.
    """
    pdf_path = document_file.name
    print("Document File Name :", pdf_path)
    return PDFMinerLoader(pdf_path).load()

def clear_chat():
    """Return a fresh empty history, used to reset the chatbot component."""
    empty_history = list()
    return empty_history

def infer(question, history):
    """Answer ``question`` with the retrieval QA chain built by ``document_loader``.

    Args:
        question: The user's question text.
        history: Chat history; accepted for the caller's convenience but not
            used when querying the chain.

    Returns:
        The chain's ``"result"`` text, or a guidance message when no document
        has been processed yet.
    """
    print("Question in infer :",question)

    # ``qa`` and ``vector_db`` are module globals that only exist after a
    # document has been processed; look them up defensively so asking a
    # question too early yields a message instead of a NameError.
    module_state = globals()
    chain = module_state.get("qa")
    store = module_state.get("vector_db")
    if chain is None or store is None:
        return "Please upload a file and generate embeddings before asking questions."

    result = chain({"query": question})

    # Debug aid: log the best-matching chunks and their similarity scores.
    matching_docs_score = store.similarity_search_with_score(question)
    print(" Matching_doc ",matching_docs_score)

    return result["result"]

def bot(history):
    """Stream the answer to the latest question into the chat history.

    The answer for the last history entry's question is obtained from
    ``infer`` and appended to that entry one character at a time; the history
    is yielded after every character, with a short pause in between.

    Args:
        history: Chat history whose last entry is ``[question, answer_slot]``.

    Yields:
        The same history object after each appended character.
    """
    answer = infer(history[-1][0], history)
    latest_turn = history[-1]
    latest_turn[1] = ""

    for ch in answer:
        latest_turn[1] = latest_turn[1] + ch
        time.sleep(0.05)
        yield history

def add_text(history, text):
    """Append a new user turn with an empty answer slot and clear the input.

    Args:
        history: Existing chat history list (left unmodified).
        text: The user's message.

    Returns:
        A ``(new_history, "")`` tuple; the empty string resets the textbox.
    """
    pending_turn = (text, None)
    extended = history + [pending_turn]
    return extended, ""

# Custom CSS handed to gr.Blocks: caps the width of the element with id
# "col-container" at 700px and centres it horizontally.
css="""
#col-container {max-width: 700px; margin-left: auto; margin-right: auto;}
"""

# HTML header (heading plus short usage instructions) rendered with gr.HTML
# at the top of the page.
title = """
<div style="text-align: center;max-width: 700px;">
    <h1>Chat with Data • OpenAI/HuggingFace</h1>
    <p style="text-align: center;">Upload a file from system,UpLoad file and generate embeddings, <br />
    once status is ready, you can start asking questions about the data you uploaded without chat history <br />
    and gives you option to use HuggingFace/OpenAI as LLM's, make sure to add your key.
    </p>
</div>
"""

# Build the Gradio UI: header, chat area, question row, LLM/key selection,
# file upload with advanced generation options, and the event wiring.
# NOTE(review): gr.Box, .style(...) and gr.File(type="file") look like the
# Gradio 3.x API — confirm the pinned Gradio version before upgrading.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML(title)

    with gr.Group():
        chatbot = gr.Chatbot(height=300)
    with gr.Row():
        question = gr.Textbox(label="Type your question !",lines=1).style(full_width=True)
        submit_btn = gr.Button(value="Send message", variant="primary", scale = 1)
        clean_chat_btn =  gr.Button("Delete Chat")

    with gr.Column():
        with gr.Box():
            LLM_option = gr.Dropdown(['HuggingFace','OpenAI'],label='Large Language Model Selection',info='LLM Service')
            API_key = gr.Textbox(label="Add API key", type="password")

        with gr.Column():
            with gr.Box():
                file_extension = gr.Dropdown(FILE_EXT, label="File Extensions", info="Select type of file to upload !")
                pdf_doc = gr.File(label="Upload File to start QA", file_types=FILE_EXT, type="file")
                with gr.Accordion(label='Advanced options', open=False):
                    max_new_tokens = gr.Slider(
                        label='Max new tokens',
                        minimum=2048,
                        maximum=MAX_NEW_TOKENS,
                        step=1,
                        value=DEFAULT_MAX_NEW_TOKENS,
                        )
                    temperature = gr.Slider(
                    label='Temperature',
                    minimum=0.1,
                    maximum=4.0,
                    step=0.1,
                    value=DEFAULT_TEMPERATURE,
                    )
                with gr.Row():
                    langchain_status = gr.Textbox(label="Status", placeholder="", interactive = False)
                    load_pdf = gr.Button("Upload File & Generate Embeddings",).style(full_width = False)

    # Show "Loading..." first, then run the (slow) document processing.
    # Chaining with .then() guarantees the final status is written after the
    # interim one; the former `if pdf_doc:` guard tested a component object,
    # which is always truthy, so it has been dropped.
    load_pdf.click(loading_file, None, langchain_status, queue=False).then(
        document_loader,
        inputs=[pdf_doc,API_key,file_extension,LLM_option,temperature,max_new_tokens],
        outputs=[langchain_status],
        queue=False,
    )

    # Enter key and the send button do the same thing: record the question,
    # clear the textbox, then stream the answer into the chatbot.
    question.submit(add_text, inputs=[chatbot, question], outputs=[chatbot, question]).then(bot, chatbot, chatbot)
    submit_btn.click(add_text, inputs=[chatbot, question], outputs=[chatbot, question]).then(bot, chatbot, chatbot)
    clean_chat_btn.click(clear_chat, [], chatbot)


# `bot` is a generator handler; streaming handlers need the queue enabled.
demo.queue()
demo.launch()