File size: 4,975 Bytes
3dff4cb
e9840df
 
 
 
 
3dff4cb
e9840df
 
 
 
 
 
 
6814430
e9840df
 
 
3dff4cb
 
 
 
 
 
 
 
 
e9840df
 
 
 
 
 
 
 
 
 
 
6814430
 
 
 
 
 
e9840df
446dbbb
6814430
e9840df
 
 
 
 
 
 
 
 
 
6814430
 
 
 
3dff4cb
6814430
 
 
 
e9840df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6814430
 
e9840df
6814430
 
 
e9840df
 
6814430
e9840df
be312e0
 
 
6814430
446dbbb
571b70a
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import os 
import gradio as gr

from langchain.document_loaders import PDFMinerLoader,CSVLoader ,UnstructuredWordDocumentLoader,TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain import HuggingFaceHub


# Device string handed to the sentence-transformers embedding model.
# It must be an exact device name: a trailing space ('cpu ') is not a valid
# torch device string.
DEVICE = 'cpu'
# Choices offered in the "File Extensions" dropdown and passed to gr.File as
# file_types. Only 'pdf', 'text', 'csv' and 'word' have a loader in
# document_loader; 'wav' currently has none.
FILE_EXT = ['pdf','text','csv','word','wav']


def loading_file():
    """Return the placeholder status text shown while a file is being loaded."""
    status_message = "Loading..."
    return status_message


def get_openai_chat_model(API_key):
    """Create a LangChain ``OpenAI`` LLM using the given API key.

    The key is exported through the ``OPENAI_API_KEY`` environment variable
    before the model object is constructed.

    Args:
        API_key: OpenAI API key to store in ``OPENAI_API_KEY``.

    Returns:
        The ``OpenAI`` instance created with default arguments.

    Raises:
        ImportError: if ``langchain.llms.OpenAI`` cannot be imported.
    """
    try:
        # Imported lazily so the module still loads when OpenAI support
        # is not installed.
        from langchain.llms import OpenAI
    except ImportError as err:
        # Raise a real exception: raising a bare string is itself a TypeError
        # and would hide the actual import problem.
        raise ImportError(
            "{}, unable to load openAI. Please install openai and add OPENAI_API_KEY".format(err)
        ) from err
    os.environ["OPENAI_API_KEY"] = API_key
    return OpenAI()

def process_documents(documents,data_chunk=1000,chunk_overlap=50):
    """Split loaded documents into overlapping text chunks.

    Args:
        documents: a single document object, a list/tuple of documents, or
            (legacy shape) a list whose first element is itself a list of
            documents.
        data_chunk: value passed as ``chunk_size`` to the splitter.
        chunk_overlap: value passed as ``chunk_overlap`` to the splitter.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    if not isinstance(documents, (list, tuple)):
        # A lone document (e.g. what process_pdf_document returns) is wrapped
        # so the splitter always receives an iterable of documents.
        documents = [documents]
    elif documents and isinstance(documents[0], (list, tuple)):
        # Legacy nested shape: keep using only the first inner list, as the
        # previous implementation did.
        documents = documents[0]
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=data_chunk, chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(documents)

def get_hugging_face_model(model_id,API_key,temperature=0.1):
    """Build a ``HuggingFaceHub`` LLM for the given repository.

    Args:
        model_id: passed to ``HuggingFaceHub`` as ``repo_id``.
        API_key: passed as ``huggingfacehub_api_token``.
        temperature: sampling temperature placed in ``model_kwargs``.

    Returns:
        The constructed ``HuggingFaceHub`` object.
    """
    # max_new_tokens is fixed at 2048; only the temperature is configurable.
    generation_settings = {"temperature": temperature, "max_new_tokens": 2048}
    return HuggingFaceHub(
        huggingfacehub_api_token=API_key,
        repo_id=model_id,
        model_kwargs=generation_settings,
    )

def chat_application(llm_model,key):
    """Create the LLM backend selected by ``llm_model``.

    Args:
        llm_model: ``'HuggingFace'`` selects the Hugging Face Hub model
            ``tiiuae/falcon-7b-instruct``; any other value selects OpenAI.
        key: API key forwarded to the chosen backend.

    Returns:
        The LLM object built by ``get_hugging_face_model`` or
        ``get_openai_chat_model``.
    """
    if llm_model == 'HuggingFace':
        llm = get_hugging_face_model(model_id='tiiuae/falcon-7b-instruct',API_key=key)
    else:
        # Bind to the same name as the other branch; previously this result
        # was stored in the `llm_model` parameter and then dropped.
        llm = get_openai_chat_model(API_key=key)
    return llm


def document_loader(file_data,doc_type='pdf',key=None):
    """Load an uploaded file, embed its chunks and store them in ``vectordb``.

    Args:
        file_data: a filesystem path, or an uploaded-file object exposing the
            path through a ``name`` attribute; ``None`` when nothing was
            uploaded.
        doc_type: one of ``'pdf'``, ``'text'``, ``'csv'`` or ``'word'``; any
            other value is reported as a loading error.
        key: accepted for the UI wiring; not used by this function.

    Returns:
        A status string for the UI: ``"Document loaded - Embeddings ready "``
        on success, ``"Error in loading Documents "`` otherwise.

    Side effects:
        On success, rebinds the module-level ``vectordb`` to a new FAISS index.
    """
    global vectordb

    if file_data is None:
        # Nothing was uploaded, so there is nothing to hand to a loader.
        return "Error in loading Documents "

    # The loaders need a path; an upload object carries it in `.name`,
    # while a plain path string is used unchanged.
    file_path = getattr(file_data, "name", file_data)

    document = None
    if doc_type == 'pdf':
        document = process_pdf_document(document_file_name=file_path)
    elif doc_type == 'text':
        document = process_text_document(document_file_name=file_path)
    elif doc_type == 'csv':
        document = process_csv_document(document_file_name=file_path)
    elif doc_type == 'word':
        document = process_word_document(document_file_name=file_path)

    if not document:
        return "Error in loading Documents "

    # Build the embedding model only once there is something to embed.
    embedding_model = SentenceTransformerEmbeddings(model_name='all-mpnet-base-v2',model_kwargs={"device": DEVICE})
    texts = process_documents(documents=document)
    vectordb = FAISS.from_documents(documents=texts, embedding= embedding_model)
    return "Document loaded - Embeddings ready "

        
def process_text_document(document_file_name):
    """Load ``document_file_name`` with ``TextLoader`` and return the result of ``load()``."""
    return TextLoader(document_file_name).load()


def process_csv_document(document_file_name):
    """Load ``document_file_name`` with ``CSVLoader`` and return the result of ``load()``."""
    csv_loader = CSVLoader(file_path=document_file_name)
    return csv_loader.load()


def process_word_document(document_file_name):
    """Load ``document_file_name`` with ``UnstructuredWordDocumentLoader`` and return the result of ``load()``."""
    word_loader = UnstructuredWordDocumentLoader(file_path=document_file_name)
    return word_loader.load()


def process_pdf_document(document_file_name):
    """Load ``document_file_name`` with ``PDFMinerLoader`` and return the first loaded item.

    Unlike the other ``process_*_document`` helpers, this returns a single
    element of ``load()`` rather than the whole result.
    """
    loaded_items = PDFMinerLoader(document_file_name).load()
    first_item = loaded_items[0]
    return first_item



# Custom CSS passed to gr.Blocks: caps the width of the element with id
# "col-container" at 700px and centres it horizontally.
css="""
#col-container {max-width: 700px; margin-left: auto; margin-right: auto;}
"""

# HTML header rendered at the top of the page via gr.HTML.
# NOTE(review): the text says no key is needed, but the UI below shows an
# API-key textbox - confirm which is intended.
title = """
<div style="text-align: center;max-width: 700px;">
    <h1>Chat with Data • OpenAI/HuggingFace</h1>
    <p style="text-align: center;">Upload a file from your computer, click the "Load data to LangChain" button, <br />
    when everything is ready, you can start asking questions about the data you uploaded ;) <br />
    This version is just for QA retrival so it will not use chat history, and uses Hugging face as LLM, 
    so you don't need any key</p>
</div>
"""

# Gradio UI: LLM selection + API key, file upload with a status box, and the
# chat widgets. Only the "Load file to langchain" button is wired to a handler.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML(title)
        
        with gr.Column():
            with gr.Box():
                LLM_option = gr.Dropdown(['HuggingFace','OpenAI'],label='Large Language Model Selection',info='LLM Service')
                # Static label: formatting the Dropdown component into the
                # string would embed the component object's text form, not
                # the selected value.
                API_key = gr.Textbox(label="Add API key", type="password")
            with gr.Column():
                # The layout class is gr.Row; gradio has no lowercase `row`.
                with gr.Row():
                    file_extension = gr.Dropdown(FILE_EXT, label="File Extensions", info="Select your files extensions!")
                    pdf_doc = gr.File(label="Upload File to start QA", file_types=FILE_EXT, type="file")
                with gr.Row():
                    load_pdf = gr.Button("Load file to langchain")
                    langchain_status = gr.Textbox(label="Status", placeholder="", interactive=True)
        
        chatbot = gr.Chatbot()
        question = gr.Textbox(label="Question", placeholder="Type your question and hit Enter")
        submit_button = gr.Button("Send Message")
    # Two handlers on the same click: the first writes the "Loading..."
    # placeholder to the status box, the second loads the file and writes
    # the resulting status message.
    load_pdf.click(loading_file, None, langchain_status, queue=False)    
    load_pdf.click(document_loader, inputs=[pdf_doc,file_extension,API_key], outputs=[langchain_status], queue=False)
    # TODO: wire up the question box / send button. `add_text` and `bot` are
    # not defined in this file yet.
    # question.submit(add_text, [chatbot, question], [chatbot, question]).then(
    #     bot, chatbot, chatbot
    # )

demo.launch()