File size: 7,917 Bytes
3dff4cb
e9840df
46a768d
2c230be
49dbc00
e9840df
3dff4cb
e9840df
ef9e1ba
 
e9840df
ef9e1ba
e9840df
e2b4917
8ae1bd1
d2237c8
8ae1bd1
e2b4917
e9840df
6814430
e9840df
 
 
3dff4cb
 
 
 
 
 
 
 
 
688f875
49dbc00
90fc7ac
e9840df
 
8da2c81
e9840df
 
8da2c81
e9840df
 
8da2c81
57005dc
6814430
 
57005dc
 
6814430
4d8a5d0
 
 
e9840df
8da2c81
e9840df
 
3edae51
e9840df
3edae51
e9840df
3edae51
e9840df
3edae51
31f4dd5
 
 
7232b90
 
 
31f4dd5
7232b90
 
 
 
 
 
31f4dd5
 
7232b90
de8093f
6814430
e9840df
1c52547
 
e9840df
 
 
1c52547
 
e9840df
 
 
 
1c52547
 
e9840df
 
 
 
5a2a128
 
90fc7ac
36b9066
49dbc00
e9840df
46a768d
 
 
 
 
 
 
 
2968e66
bcc7659
46a768d
 
 
 
 
 
 
 
 
 
 
 
 
e9840df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6814430
8da2c81
 
 
e9840df
2b30343
8ae1bd1
6814430
8da2c81
 
 
8ae1bd1
 
8da2c81
 
 
 
 
8ae1bd1
8da2c81
 
8ae1bd1
8da2c81
2b30343
fdcda98
 
4d8a5d0
fdcda98
bc0dc94
 
46a768d
6814430
8da2c81
bc0dc94
e2b4917
 
 
 
2af3209
46a768d
e2b4917
de8093f
 
2af3209
8da2c81
 
571b70a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
import os 
import gradio as gr
import time 
from langchain.document_loaders import PDFMinerLoader,CSVLoader ,UnstructuredWordDocumentLoader,TextLoader,OnlinePDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain import HuggingFaceHub
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Device handed to the sentence-transformer embedding model.
DEVICE = "cpu"

# Choices offered in the "File Extensions" dropdown and passed as the
# upload widget's accepted file types.
FILE_EXT = [
    "pdf",
    "text",
    "csv",
    "word",
    "wav",
]

DEFAULT_SYSTEM_PROMPT = (
    "As a chatbot you are answering set of questions being requested ."
)

# Bounds and defaults for the "Advanced options" sliders.
DEFAULT_TEMPERATURE = 0.1
DEFAULT_MAX_NEW_TOKENS = 2048
MAX_NEW_TOKENS = 4096

MAX_INPUT_TOKEN_LENGTH = 4000

def loading_file():
    """Return the interim status text written to the status box on click."""
    status_message = "Loading..."
    return status_message


def get_openai_chat_model(API_key):
    """Build a LangChain ``OpenAI`` LLM authenticated with ``API_key``.

    The key is exported through the ``OPENAI_API_KEY`` environment variable
    before the model object is created.

    Args:
        API_key: OpenAI API key string.

    Returns:
        The constructed ``langchain.llms.OpenAI`` instance.

    Raises:
        ImportError: if ``langchain.llms.OpenAI`` cannot be imported.
    """
    try:
        from langchain.llms import OpenAI
    except ImportError as err:
        # Raising a bare string is itself a TypeError in Python 3, which hid
        # the real cause; raise a proper exception and chain the original.
        raise ImportError(
            f"{err}, unable to load openAI. "
            "Please install openai and add OPENAI_API_KEY"
        ) from err
    os.environ["OPENAI_API_KEY"] = API_key
    llm = OpenAI()
    return llm

def process_documents(documents,data_chunk=1500,chunk_overlap=100):
    """Split loaded documents into chunks on newline boundaries.

    Args:
        documents: documents to split, as produced by a LangChain loader.
        data_chunk: value passed as the splitter's ``chunk_size``.
        chunk_overlap: value passed as the splitter's ``chunk_overlap``.

    Returns:
        Whatever ``CharacterTextSplitter.split_documents`` returns for the
        given documents.
    """
    splitter = CharacterTextSplitter(
        separator='\n',
        chunk_size=data_chunk,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)

def get_hugging_face_model(model_id,API_key,temperature=0.1,max_tokens=4096):
    """Create a ``HuggingFaceHub`` LLM for the given repository.

    Args:
        model_id: Hub repository id of the model.
        API_key: Hugging Face Hub API token.
        temperature: sent to the model as ``temperature``.
        max_tokens: sent to the model as ``max_new_tokens``.

    Returns:
        The configured ``HuggingFaceHub`` instance.
    """
    generation_kwargs = {"temperature": temperature, "max_new_tokens": max_tokens}
    return HuggingFaceHub(
        repo_id=model_id,
        huggingfacehub_api_token=API_key,
        model_kwargs=generation_kwargs,
    )

def chat_application(llm_service,key,temperature=0.1,max_tokens=1024):
    """Return the LLM object for the selected provider.

    Args:
        llm_service: provider name. ``"HuggingFace"`` (compared
            case-insensitively) selects the Hugging Face Hub model; any
            other value, including ``None``, falls back to OpenAI.
        key: API key/token for the selected provider.
        temperature: sampling temperature forwarded to the Hugging Face model.
        max_tokens: maximum new tokens forwarded to the Hugging Face model.

    Returns:
        The LLM built by ``get_hugging_face_model`` or
        ``get_openai_chat_model``.
    """
    # Compare case-insensitively so spellings such as 'Huggingface' (the
    # default used by document_loader) do not silently select OpenAI.
    service = str(llm_service).strip().lower()
    if service == 'huggingface':
        # Forward the generation settings; previously they were accepted
        # here but dropped, so the UI sliders had no effect.
        return get_hugging_face_model(
            model_id='tiiuae/falcon-7b-instruct',
            API_key=key,
            temperature=temperature,
            max_tokens=max_tokens,
        )
    # get_openai_chat_model only takes the API key, so the generation
    # settings are not applied on this path.
    return get_openai_chat_model(API_key=key)

def summarize_contents():
    """Ask the global retrieval-QA chain for a summary of the loaded document.

    Returns:
        The text stored under the chain's ``"result"`` key.

    Raises:
        RuntimeError: if no chain has been built yet (the module-level
            ``qa`` is only assigned by ``document_loader``).
    """
    chain = globals().get("qa")
    if chain is None:
        raise RuntimeError(
            "No document has been processed yet; load a document before "
            "requesting a summary."
        )
    question = "Generate a summary of the contents. Do not return the response in json format"
    # The chain is built with return_source_documents=True, so it has two
    # output keys and Chain.run() refuses it; call the chain directly and
    # pick the answer, exactly as infer() does.
    return chain({"query": question})["result"]

def document_loader(file_path,api_key,doc_type='pdf',llm='Huggingface',temperature=0.1,max_tokens=4096):
    """Load an uploaded file, index it and build the global retrieval-QA chain.

    The file is loaded with the loader matching ``doc_type``, split into
    chunks, embedded into a FAISS index, and a ``RetrievalQA`` chain over that
    index is stored in the module-level ``qa`` variable.

    Args:
        file_path: uploaded file object; the loaders read its ``.name``.
        api_key: API key/token for the selected LLM provider.
        doc_type: one of ``'pdf'``, ``'text'``, ``'csv'`` or ``'word'``.
        llm: provider name passed to ``chat_application``.
        temperature: sampling temperature passed to ``chat_application``.
        max_tokens: max new tokens passed to ``chat_application``.

    Returns:
        A status string: the completion message on success, or a short
        explanation when nothing could be processed.
    """
    global qa

    # Validate up front: without these checks a missing upload or an
    # unsupported type (e.g. 'wav') left `document` as None and crashed
    # inside the text splitter.
    if file_path is None:
        return "No file uploaded. Please upload a file before generating embeddings."

    supported_types = ('pdf', 'text', 'csv', 'word')
    if doc_type not in supported_types:
        return (
            f"Unsupported file type {doc_type!r}. "
            f"Supported types: {', '.join(supported_types)}."
        )

    loaders = {
        'pdf': process_pdf_document,
        'text': process_text_document,
        'csv': process_csv_document,
        'word': process_word_document,
    }
    document = loaders[doc_type](document_file=file_path)

    print("Document :",document)

    texts = process_documents(documents=document)
    if not texts:
        # An empty chunk list cannot be indexed; report it instead of failing.
        return "No text could be extracted from the uploaded file."

    embedding_model = SentenceTransformerEmbeddings(model_name='thenlper/gte-base',model_kwargs={"device": DEVICE})
    vector_db = FAISS.from_documents(documents=texts, embedding=embedding_model)

    qa = RetrievalQA.from_chain_type(
        llm=chat_application(
            llm_service=llm,
            key=api_key,
            temperature=temperature,
            max_tokens=max_tokens,
        ),
        chain_type='stuff',
        retriever=vector_db.as_retriever(),
        return_source_documents=True,
    )
    return "Document Processing completed ..."

        
def process_text_document(document_file):
    """Load a text file with ``TextLoader``.

    Args:
        document_file: file object whose ``.name`` is the path to read.

    Returns:
        The result of ``TextLoader.load()``.
    """
    text_path = document_file.name
    return TextLoader(text_path).load()

def process_csv_document(document_file):
    """Load a CSV file with ``CSVLoader``.

    Args:
        document_file: file object whose ``.name`` is the path to read.

    Returns:
        The result of ``CSVLoader.load()``.
    """
    csv_path = document_file.name
    return CSVLoader(file_path=csv_path).load()


def process_word_document(document_file):
    """Load a Word document with ``UnstructuredWordDocumentLoader``.

    Args:
        document_file: file object whose ``.name`` is the path to read.

    Returns:
        The result of ``UnstructuredWordDocumentLoader.load()``.
    """
    word_path = document_file.name
    return UnstructuredWordDocumentLoader(file_path=word_path).load()


def process_pdf_document(document_file):
    """Load a PDF file with ``PDFMinerLoader``, logging its path first.

    Args:
        document_file: file object whose ``.name`` is the path to read.

    Returns:
        The result of ``PDFMinerLoader.load()``.
    """
    pdf_path = document_file.name
    print("Document File Name :",pdf_path)
    return PDFMinerLoader(pdf_path).load()

def infer(question, history):
    """Answer ``question`` using the global retrieval-QA chain.

    Args:
        question: the user's query text.
        history: chat history. Kept for interface compatibility but not
            used: the chain is invoked with the query only.

    Returns:
        The text under the chain's ``"result"`` key, or a short instruction
        to load a document when no chain has been built yet.
    """
    # `qa` only exists after document_loader has run; answer with a hint
    # instead of failing with a NameError in the chat window.
    chain = globals().get("qa")
    if chain is None:
        return "Please upload a file and generate embeddings before asking questions."
    result = chain({"query": question})
    return result["result"]

def bot(history):
    """Stream the answer to the latest question one character at a time.

    The last history entry's question is passed to ``infer``; its answer slot
    is reset to an empty string and then grown character by character, with
    the updated ``history`` yielded after each addition.
    """
    latest_turn = history[-1]
    answer = infer(latest_turn[0], history)
    latest_turn[1] = ""

    for ch in answer:
        latest_turn[1] += ch
        # Short pause between characters, as in a typing effect.
        time.sleep(0.05)
        yield history

def add_text(history, text):
    """Append a new user turn to the chat history.

    Returns:
        A tuple of the extended history (a new list ending with
        ``(text, None)``) and an empty string.
    """
    new_turn = (text, None)
    extended_history = history + [new_turn]
    return extended_history, ""


# Centers the main column and caps its width.
css="""
#col-container {max-width: 700px; margin-left: auto; margin-right: auto;}
"""

# Header HTML shown at the top of the page. The instructions mirror the
# actual widgets: an LLM selector, an API key box and the
# "Upload File & Generate Embeddings" button.
title = """
<div style="text-align: center;max-width: 700px;">
    <h1>Chat with Data • OpenAI/HuggingFace</h1>
    <p style="text-align: center;">Select an LLM service and add its API key, upload a file from your computer, <br />
    then click the "Upload File &amp; Generate Embeddings" button. <br />
    When everything is ready, you can start asking questions about the data you uploaded ;) <br />
    This version is just for QA retrieval, so it does not use the chat history.</p>
</div>
"""

# Gradio UI: provider/key selection, file upload with advanced generation
# options, a status box, and a chat panel wired to add_text/bot.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML(title)
        
        with gr.Column():
            with gr.Box():
                LLM_option = gr.Dropdown(['HuggingFace','OpenAI'],label='Large Language Model Selection',info='LLM Service')
                API_key = gr.Textbox(label="Add API key", type="password")
                
                
            with gr.Column():
                with gr.Box():
                    file_extension = gr.Dropdown(FILE_EXT, label="File Extensions", info="Select type of file to upload !")
                    pdf_doc = gr.File(label="Upload File to start QA", file_types=FILE_EXT, type="file")
                    with gr.Accordion(label='Advanced options', open=False):
                        max_new_tokens = gr.Slider(
                            label='Max new tokens',
                            minimum=2048,
                            maximum=MAX_NEW_TOKENS,
                            step=1,
                            value=DEFAULT_MAX_NEW_TOKENS,
                            )
                        temperature = gr.Slider(
                        label='Temperature',
                        minimum=0.1,
                        maximum=4.0,
                        step=0.1,
                        value=DEFAULT_TEMPERATURE,
                        )
                    with gr.Row():
                        langchain_status = gr.Textbox(label="Status", placeholder="", interactive = False)
                        load_pdf = gr.Button("Upload File & Generate Embeddings",).style(full_width = False)

    # Show "Loading..." first, then run the loader. Chaining with .then()
    # guarantees the order; two independent click handlers raced to write
    # the same status box.
    load_pdf.click(loading_file, None, langchain_status, queue=False).then(
        document_loader,
        inputs=[pdf_doc,API_key,file_extension,LLM_option,temperature,max_new_tokens],
        outputs=[langchain_status],
        queue=False,
    )
        
    with gr.Group():
        chatbot = gr.Chatbot(height=300)
        # TODO: optionally show the source paragraphs used for the answer:
        # with gr.Row():
            # sources = gr.HTML(value = "Source paragraphs where I looked for answers will appear here", height=300)
        with gr.Row():
            question = gr.Textbox(label="Type your question?",lines=1).style(full_width=False)
            submit_btn = gr.Button(value="Send message", variant="primary", scale = 1)
    # Both Enter and the button add the user's turn, then stream the reply.
    question.submit(add_text, [chatbot, question], [chatbot, question]).then(bot, chatbot, chatbot)
    submit_btn.click(add_text, [chatbot, question], [chatbot, question]).then(bot, chatbot, chatbot)

    

# `bot` is a generator (it streams the reply), which needs the request
# queue; enable it explicitly so streaming also works where the queue is
# not on by default.
demo.queue().launch()