File size: 6,030 Bytes
3dff4cb
e9840df
 
2c230be
49dbc00
e9840df
3dff4cb
e9840df
ef9e1ba
 
e9840df
ef9e1ba
e9840df
 
 
6814430
e9840df
 
 
3dff4cb
 
 
 
 
 
 
 
 
e9840df
49dbc00
90fc7ac
e9840df
 
 
 
 
 
 
 
ef9e1ba
57005dc
6814430
 
57005dc
 
6814430
4d8a5d0
 
 
e9840df
3edae51
e9840df
 
3edae51
e9840df
3edae51
e9840df
3edae51
e9840df
3edae51
6814430
2af3209
6814430
ef9e1ba
 
 
 
 
 
730c378
 
3dff4cb
6814430
 
2af3209
6814430
e9840df
1c52547
 
e9840df
 
 
1c52547
 
e9840df
 
 
 
1c52547
 
e9840df
 
 
 
5a2a128
 
90fc7ac
36b9066
49dbc00
e9840df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b30343
6814430
2b30343
 
e9840df
2b30343
6814430
2b30343
 
4d8a5d0
 
bc0dc94
 
 
 
6814430
4727b07
bc0dc94
2af3209
 
 
 
 
 
 
 
 
571b70a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import os 
import gradio as gr

from langchain.document_loaders import PDFMinerLoader,CSVLoader ,UnstructuredWordDocumentLoader,TextLoader,OnlinePDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain import HuggingFaceHub
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

DEVICE = 'cpu'
FILE_EXT = ['pdf','text','csv','word','wav']


def loading_file():
    """Return the placeholder status text displayed while a file is processed."""
    status_message = "Loading..."
    return status_message


def get_openai_chat_model(API_key):
    """Create a LangChain ``OpenAI`` LLM using the supplied API key.

    Args:
        API_key: OpenAI API key. It is written to the ``OPENAI_API_KEY``
            environment variable before the model object is constructed.

    Returns:
        An ``OpenAI`` instance built with its default arguments.

    Raises:
        ImportError: if ``langchain.llms.OpenAI`` cannot be imported.
    """
    try:
        from langchain.llms import OpenAI
    except ImportError as err:
        # Exceptions must derive from BaseException: raising a bare string
        # would itself fail with TypeError and hide the real import problem.
        raise ImportError(
            f"{err}, unable to load openAI. "
            "Please install openai and add OPENAI_API_KEY"
        ) from err
    os.environ["OPENAI_API_KEY"] = API_key
    return OpenAI()

def process_documents(documents,data_chunk=1000,chunk_overlap=50):
    """Split loaded documents into chunks using a newline separator.

    Args:
        documents: documents to split, as returned by a loader's ``load()``.
        data_chunk: value passed to the splitter as ``chunk_size``.
        chunk_overlap: value passed to the splitter as ``chunk_overlap``.

    Returns:
        The result of ``CharacterTextSplitter.split_documents``.
    """
    splitter_options = {
        "chunk_size": data_chunk,
        "chunk_overlap": chunk_overlap,
        "separator": '\n',
    }
    return CharacterTextSplitter(**splitter_options).split_documents(documents)

def get_hugging_face_model(model_id,API_key,temperature=0.1):
    """Create a ``HuggingFaceHub`` LLM for the given repository.

    Args:
        model_id: repository id passed as ``repo_id``.
        API_key: token passed as ``huggingfacehub_api_token``.
        temperature: sampling temperature placed in ``model_kwargs``.

    Returns:
        The constructed ``HuggingFaceHub`` object.
    """
    generation_settings = {"temperature": temperature, "max_new_tokens": 2048}
    return HuggingFaceHub(
        repo_id=model_id,
        huggingfacehub_api_token=API_key,
        model_kwargs=generation_settings,
    )

def chat_application(llm_service,key):
    """Return the LLM object for the selected service.

    Args:
        llm_service: name of the service. Any capitalisation of
            "HuggingFace" selects the Hugging Face model; every other value
            (including ``None``) selects OpenAI.
        key: API key/token forwarded to the chosen model factory.

    Returns:
        The LLM created by ``get_hugging_face_model`` or
        ``get_openai_chat_model``.
    """
    # Compare case-insensitively: callers in this file spell the service both
    # 'HuggingFace' and 'Huggingface', and an exact match would silently send
    # the latter to the OpenAI branch.
    service = str(llm_service or "").strip().lower()
    if service == 'huggingface':
        return get_hugging_face_model(model_id='tiiuae/falcon-7b-instruct',API_key=key)
    return get_openai_chat_model(API_key=key)

def summarize_contents():
    """Ask the module-level ``qa`` chain for a summary of the loaded content.

    Returns:
        The string returned by ``qa.run`` for the summary prompt, or an
        explanatory message when no ``qa`` chain has been created yet.
    """
    # ``qa`` only exists as a global after document_loader has succeeded;
    # look it up defensively instead of failing with NameError.
    chain = globals().get("qa")
    if chain is None:
        return "No document loaded. Upload a file and generate embeddings first."
    question = "Generate a summary of the contents. Do not return the response in json format"
    return chain.run(question)

def document_loader(file_path,api_key,doc_type='pdf',llm='Huggingface'):
    """Load an uploaded file, embed it and build the global ``qa`` chain.

    Args:
        file_path: uploaded file object; the loaders read its ``.name``.
        api_key: key/token forwarded to ``chat_application``.
        doc_type: one of 'pdf', 'text', 'csv' or 'word'; any other value is
            reported as a loading error.
        llm: service name forwarded to ``chat_application``.

    Returns:
        A status string: a completion message on success, or
        "Error in loading Documents " when nothing could be loaded.

    Side effects:
        On success, rebinds the module-level ``qa`` to a new ``RetrievalQA``.
    """
    global qa

    # Without an upload there is nothing to load, and every loader below
    # would fail dereferencing ``.name`` on None.
    if file_path is None:
        return "Error in loading Documents "

    loaders = {
        'pdf': process_pdf_document,
        'text': process_text_document,
        'csv': process_csv_document,
        'word': process_word_document,
    }
    loader = loaders.get(doc_type)
    document = loader(document_file=file_path) if loader is not None else None
    if not document:
        return "Error in loading Documents "

    embedding_model = SentenceTransformerEmbeddings(
        model_name='thenlper/gte-base',
        model_kwargs={"device": DEVICE},
    )
    texts = process_documents(documents=document)
    vector_db = FAISS.from_documents(documents=texts, embedding=embedding_model)
    qa = RetrievalQA.from_chain_type(
        llm=chat_application(llm_service=llm,key=api_key),
        chain_type='stuff',
        retriever=vector_db.as_retriever(),
    )
    return "Document processing complete-Embeddings Created "

        
def process_text_document(document_file):
    """Load a plain-text upload via ``TextLoader`` and return its documents."""
    return TextLoader(document_file.name).load()

def process_csv_document(document_file):
    """Load a CSV upload via ``CSVLoader`` and return its documents."""
    csv_path = document_file.name
    return CSVLoader(file_path=csv_path).load()


def process_word_document(document_file):
    """Load a Word upload via ``UnstructuredWordDocumentLoader``."""
    word_path = document_file.name
    return UnstructuredWordDocumentLoader(file_path=word_path).load()


def process_pdf_document(document_file):
    """Print the upload's file name, then load it via ``PDFMinerLoader``."""
    pdf_path = document_file.name
    print("Document File Name :",pdf_path)
    return PDFMinerLoader(pdf_path).load()



# Page-level CSS: centres the element with id "col-container" and caps its width.
css="""
#col-container {max-width: 700px; margin-left: auto; margin-right: auto;}
"""

# HTML banner rendered at the top of the page. The instructions name the
# actual upload button and mention the API key the form asks for.
title = """
<div style="text-align: center;max-width: 700px;">
    <h1>Chat with Data • OpenAI/HuggingFace</h1>
    <p style="text-align: center;">Upload a file from your computer, click the "Upload File &amp; Generate Embeddings" button, <br />
    when everything is ready, you can start asking questions about the data you uploaded ;) <br />
    This version is just for QA retrieval so it will not use chat history. Select Hugging Face or OpenAI as the LLM service
    and enter the matching API key</p>
</div>
"""

# Build the Gradio page: a settings/upload column (LLM choice, file type,
# API key, file upload, status box) followed by a chat area, then launch it.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML(title)
        
        with gr.Column():
            with gr.Box():
                # NOTE(review): this Row is created but not entered with
                # `with`, so the two dropdowns below are children of the Box,
                # not of the Row — confirm whether a side-by-side layout was
                # intended.
                gr.Row()
                LLM_option = gr.Dropdown(['HuggingFace','OpenAI'],label='Large Language Model Selection',info='LLM Service')
                file_extension = gr.Dropdown(FILE_EXT, label="File Extensions", info="Select your files extensions!")
            API_key = gr.Textbox(label="Add API key", type="password")
            with gr.Column():
                with gr.Box():
                    pdf_doc = gr.File(label="Upload File to start QA", file_types=FILE_EXT, type="file")
                    with gr.Row():
                        langchain_status = gr.Textbox(label="Status", placeholder="", interactive=True)
                        load_pdf = gr.Button("Upload File & Generate Embeddings",).style(full_width=False)

        # chatbot = gr.Chatbot()
        # question = gr.Textbox(label="Question", placeholder="Type your question and hit Enter")
        # submit_button = gr.Button("Send Message")
        
    # Two listeners on the same button: the first writes the "Loading..."
    # placeholder into the status box, the second runs document_loader and
    # writes its returned status string into the same box.
    load_pdf.click(loading_file, None, langchain_status, queue=False)    
    load_pdf.click(document_loader, inputs=[pdf_doc,API_key,file_extension,LLM_option], outputs=[langchain_status], queue=False)
        
    # Chat area.
    # NOTE(review): no event handler in this block is attached to `chatbot`,
    # `sources`, `message` or `submit_query`, so sending a message does
    # nothing as far as this code shows — confirm whether wiring is missing.
    with gr.Column():
        with gr.Row():
            chatbot = gr.Chatbot(height=300)
            sources = gr.HTML(value = "Source paragraphs where I looked for answers will appear here", height=300)

        with gr.Row():
            message = gr.Textbox(label="Type your question?",lines=1).style(full_width=False)
        submit_query = gr.Button(value="Send message", variant="secondary", scale = 1)

# Start the app when the module is executed/imported (no __main__ guard).
demo.launch()