Roger Condori committed on
Commit
a9c396e
1 Parent(s): a7536a1

Add files via upload

Browse files
Files changed (4) hide show
  1. app.py +123 -0
  2. conversadocs/bones.py +216 -0
  3. demo_docs/demo.txt +25 -0
  4. requirements.txt +12 -0
app.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from langchain.embeddings.openai import OpenAIEmbeddings
3
+ from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
4
+ from langchain.vectorstores import DocArrayInMemorySearch
5
+ from langchain.chains import RetrievalQA, ConversationalRetrievalChain
6
+ from langchain.memory import ConversationBufferMemory
7
+ from langchain.chat_models import ChatOpenAI
8
+ from langchain.embeddings import HuggingFaceEmbeddings
9
+ from langchain import HuggingFaceHub
10
+ from langchain.llms import LlamaCpp
11
+ from huggingface_hub import hf_hub_download
12
+ from langchain.document_loaders import (
13
+ EverNoteLoader,
14
+ TextLoader,
15
+ UnstructuredEPubLoader,
16
+ UnstructuredHTMLLoader,
17
+ UnstructuredMarkdownLoader,
18
+ UnstructuredODTLoader,
19
+ UnstructuredPowerPointLoader,
20
+ UnstructuredWordDocumentLoader,
21
+ PyPDFLoader,
22
+ )
23
+ import param
24
+ import os
25
+ import torch
26
+ from conversadocs.bones import DocChat
27
+
28
# Shared chat backend used by every Gradio callback defined below.
dc = DocChat()

##### GRADIO CONFIG ####

# llama-cpp-python is (re)installed at start-up so the build matches the
# hardware: a cuBLAS-enabled build when CUDA is present, the stock wheel
# otherwise.  pip is invoked through the running interpreter with an argument
# list (no shell) so the package lands in the environment this app runs in.
import subprocess
import sys

_pip_install = [sys.executable, "-m", "pip", "install", "llama-cpp-python"]

if torch.cuda.is_available():
    print("CUDA is available on this system.")
    subprocess.run(
        _pip_install + ["--force-reinstall", "--upgrade", "--no-cache-dir", "--verbose"],
        env={**os.environ, "CMAKE_ARGS": "-DLLAMA_CUBLAS=on", "FORCE_CMAKE": "1"},
        check=False,  # best effort, as before: a failed install must not stop the app
    )
else:
    print("CUDA is not available on this system.")
    subprocess.run(_pip_install, check=False)
38
+
39
# Page-level CSS: constrains and centres the element with id "col-container".
css = """
#col-container {max-width: 700px; margin-left: auto; margin-right: auto;}
"""

# HTML header rendered at the top of the chat tab.  The extension list names
# the PowerPoint formats as "ppt, pptx" (previously misspelled "ptt, pttx").
title = """
<div style="text-align: center;max-width: 700px;">
<h1>Chat with Documents 📚 - Falcon, Llama-2</h1>
<p style="text-align: center;">Upload txt, pdf, doc, docx, enex, epub, html, md, odt, ppt, pptx; click the "Click to Upload Files" button, <br />
Wait for the Status to show Loaded documents, start typing your questions. <br />
The app is set to store chat-history</p>
</div>
"""

# Name of the Gradio theme passed to gr.Blocks.
theme = 'aliabid94/new-theme'
53
+
54
def flag():
    """Return the interim status text shown before a chained action finishes."""
    status = "PROCESSING..."
    return status
56
+
57
def upload_file(files, max_docs):
    """Rebuild the document database from the uploaded files.

    Args:
        files: Uploaded file objects; only their ``name`` attribute is read.
        max_docs: Value forwarded to ``dc.call_load_db`` as its ``k`` argument.

    Returns:
        The status message produced by ``dc.call_load_db``.
    """
    uploaded_paths = []
    for uploaded in files:
        uploaded_paths.append(uploaded.name)
    status = dc.call_load_db(uploaded_paths, max_docs)
    return status
60
+
61
def predict(message, chat_history, max_k):
    """Answer one question and return the refreshed chat transcript.

    Args:
        message: The question typed by the user.
        chat_history: Current chatbot value supplied by the UI; not read here
            because the transcript is fetched from ``dc``.
        max_k: Value forwarded to ``dc.convchain`` as ``k_max``.

    Returns:
        A pair ``("", chats)``: the empty string clears the input box and
        ``chats`` is whatever ``dc.get_chats()`` returns.
    """
    print(message)
    answer = dc.convchain(message, max_k)
    print(answer)
    transcript = dc.get_chats()
    return "", transcript
66
+
67
def convert():
    """Render the sources behind the last answer as an HTML fragment.

    The list returned by ``dc.get_sources()`` is consumed two entries at a
    time: the second item of the first entry is used as the passage text and
    the second item of the following entry as a metadata mapping that must
    contain ``"source"`` and may contain ``"page"``.

    Returns:
        One ``<hr>``-separated section per source, or ``""`` when there are
        no sources to show.
    """
    import html  # local import keeps this block self-contained

    docs = dc.get_sources()
    if not docs:
        # Nothing retrieved yet (get_sources() may return None or be empty).
        return ""

    sections = []
    for text_entry, meta_entry in zip(docs[::2], docs[1::2]):
        # Document text is untrusted: escape it before turning newlines into
        # <br> so that markup inside a document cannot alter the page.
        txt = html.escape(text_entry[1]).replace("\n", "<br>")
        metadata = meta_entry[1]
        sc = "Archive: " + html.escape(str(metadata["source"]))
        try:
            pg = "Page: " + html.escape(str(metadata["page"]))
        except KeyError:
            # No page number in the metadata for this source.
            pg = "Document Data"
        sections.append(
            f"<hr><h3 style='color:red;'>{pg}</h3><p>{txt}</p><p>{sc}</p>"
        )
    return "".join(sections)
79
+
80
+
81
+
82
# Gradio UI: three tabs (chat, chat options, model selection) plus event wiring.
# NOTE(review): the nesting below was reconstructed from a flattened listing;
# it is constrained by data flow (e.g. `max_docs` must exist before the upload
# handler is registered) — confirm against the repository copy.
with gr.Blocks(theme=theme, css=css) as demo:
    with gr.Tab("Chat"):

        with gr.Column(elem_id="col-container"):
            gr.HTML(title)
            upload_button = gr.UploadButton("Click to Upload Files", file_types=["pdf"], file_count="multiple")
            file_output = gr.HTML()
            chatbot = gr.Chatbot([], elem_id="chatbot").style(height=300)
            msg = gr.Textbox(label="Question", placeholder="Type your question and hit Enter ")

        with gr.Column():
            # Shows the sources behind the most recent answer (filled by `convert`).
            sou = gr.HTML("")

    with gr.Tab("Chat Options"):
        max_docs = gr.inputs.Slider(1, 10, default=3, label="Maximum querys to the DB.", step=1)
        row_table = gr.HTML("<hr><h4> </h4>")
        clear_button = gr.Button("CLEAR CHAT HISTORY", )
        link_output = gr.HTML("")
        # Show the interim status, clear the backend history, then empty the chatbot.
        clear_button.click(flag, [], [link_output]).then(dc.clr_history, [], [link_output]).then(lambda: None, None, chatbot, queue=False)

    upload_button.upload(flag, [], [file_output]).then(upload_file, [upload_button, max_docs], file_output)

    with gr.Tab("Change model"):
        gr.HTML("<h3>Only models from the GGML library are accepted.</h3>")
        repo_ = gr.Textbox(label="Repository", value="TheBloke/Llama-2-7B-Chat-GGML")
        file_ = gr.Textbox(label="File name", value="llama-2-7b-chat.ggmlv3.q2_K.bin")
        max_tokens = gr.inputs.Slider(1, 2048, default=16, label="Max new tokens", step=1)
        temperature = gr.inputs.Slider(0.1, 1., default=0.2, label="Temperature", step=0.1)
        # The two ranges below were previously attached to the wrong slider:
        # top-k is a count (default 50) and top-p a fraction (default 0.95),
        # matching the defaults in the signature of DocChat.change_llm.
        top_k = gr.inputs.Slider(0, 100, default=50, label="Top K", step=1)
        top_p = gr.inputs.Slider(0.01, 1., default=0.95, label="Top P", step=0.01)
        repeat_penalty = gr.inputs.Slider(0.1, 100., default=1.2, label="Repeat penalty", step=0.1)
        change_model_button = gr.Button("Load Model")
        model_verify = gr.HTML("Loaded model Falcon 7B-instruct")
        default_model = gr.HTML("<hr><h4>Default Model</h4>")
        falcon_button = gr.Button("FALCON 7B-Instruct")

    # Answer the question, then refresh the sources panel.
    msg.submit(predict, [msg, chatbot, max_docs], [msg, chatbot]).then(convert, [], [sou])

    # Input order follows the positional parameters of DocChat.change_llm.
    change_model_button.click(dc.change_llm, [repo_, file_, max_tokens, temperature, top_p, top_k, repeat_penalty, max_docs], [model_verify])
    falcon_button.click(dc.default_falcon_model, [], [model_verify])

demo.launch(enable_queue=True)
conversadocs/bones.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from langchain.embeddings.openai import OpenAIEmbeddings
3
+ from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
4
+ from langchain.vectorstores import DocArrayInMemorySearch
5
+ from langchain.chains import RetrievalQA, ConversationalRetrievalChain
6
+ from langchain.memory import ConversationBufferMemory
7
+ from langchain.chat_models import ChatOpenAI
8
+ from langchain.embeddings import HuggingFaceEmbeddings
9
+ from langchain import HuggingFaceHub
10
+ from langchain.llms import LlamaCpp
11
+ from huggingface_hub import hf_hub_download
12
+ import param
13
+ import os
14
+ import torch
15
+ from langchain.document_loaders import (
16
+ EverNoteLoader,
17
+ TextLoader,
18
+ UnstructuredEPubLoader,
19
+ UnstructuredHTMLLoader,
20
+ UnstructuredMarkdownLoader,
21
+ UnstructuredODTLoader,
22
+ UnstructuredPowerPointLoader,
23
+ UnstructuredWordDocumentLoader,
24
+ PyPDFLoader,
25
+ )
26
+
27
+ #YOUR_HF_TOKEN = os.getenv("My_hf_token")
28
# Default hosted model (Falcon 7B-instruct via the Hugging Face Hub).  The API
# token is read from the "My_hf_token" environment variable.
_falcon_model_kwargs = {
    "temperature": 0.2,
    "max_new_tokens": 500,
    "top_k": 50,
    "top_p": 0.95,
    "repetition_penalty": 1.2,
}
_falcon_llm = HuggingFaceHub(
    huggingfacehub_api_token=os.getenv("My_hf_token"),
    repo_id="tiiuae/falcon-7b-instruct",
    model_kwargs=_falcon_model_kwargs,
)
# Deliberately a one-element tuple: the rest of the module reads `llm_api[0]`.
llm_api = (_falcon_llm,)
38
+
39
+
40
+ #alter
41
def load_db(files, chain_type, k, llm):
    """Build a conversational retrieval chain over the given files.

    Each file is loaded with the loader registered for its extension, split
    into overlapping chunks, embedded, and stored in an in-memory vector
    index.  Files with an unregistered extension are ignored; if nothing is
    loaded at all, ``demo_docs/demo.txt`` is used instead.

    Args:
        files: A path string or an iterable of path strings.
        chain_type: Forwarded to ``ConversationalRetrievalChain.from_llm``.
        k: Number of chunks the similarity retriever is asked for.
        llm: Language model forwarded to the chain.

    Returns:
        The ``ConversationalRetrievalChain``; it returns source documents and
        the generated question alongside each answer.
    """
    EXTENSIONS = {
        ".txt": (TextLoader, {"encoding": "utf8"}),
        ".pdf": (PyPDFLoader, {}),
        ".doc": (UnstructuredWordDocumentLoader, {}),
        ".docx": (UnstructuredWordDocumentLoader, {}),
        ".enex": (EverNoteLoader, {}),
        ".epub": (UnstructuredEPubLoader, {}),
        ".html": (UnstructuredHTMLLoader, {}),
        ".md": (UnstructuredMarkdownLoader, {}),
        ".odt": (UnstructuredODTLoader, {}),
        ".ppt": (UnstructuredPowerPointLoader, {}),
        ".pptx": (UnstructuredPowerPointLoader, {}),
    }

    # A single path given as a string would otherwise be iterated character
    # by character and never match any extension.
    if isinstance(files, str):
        files = [files]

    # select the loader for each file by its (case-insensitive) extension
    documents = []
    for file in files:
        ext = os.path.splitext(file)[1].lower()
        if ext not in EXTENSIONS:
            continue
        loader_class, loader_args = EXTENSIONS[ext]
        documents.extend(loader_class(file, **loader_args).load())

    # fall back to the bundled demo document when nothing was loaded
    if not documents:
        loader_class, loader_args = EXTENSIONS[".txt"]
        documents = loader_class("demo_docs/demo.txt", **loader_args).load()

    # split documents
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    docs = text_splitter.split_documents(documents)

    # define embedding
    embeddings = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')

    # create vector database from data
    db = DocArrayInMemorySearch.from_documents(docs, embeddings)
    # define retriever
    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": k})
    # create a chatbot chain. Memory is managed externally.
    qa = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type=chain_type,
        retriever=retriever,
        return_source_documents=True,
        return_generated_question=True,
    )
    return qa
95
+
96
+
97
class DocChat(param.Parameterized):
    """Chat session state around a document retrieval chain.

    Holds the active LLM, the retrieval chain built by ``load_db``, the
    running chat history, and the query/sources of the most recent answer.
    """

    # (question, answer) pairs, passed back to the chain on every call.
    chat_history = param.List([])
    # Most recent answer text.
    answer = param.String("")
    # Question generated by the chain for the most recent DB lookup.
    db_query = param.String("")
    # Source documents returned with the most recent answer.
    db_response = param.List([])
    # Active language model; starts as the module-level default.
    llm = llm_api[0]
    # Number of chunks the current chain's retriever was built with.
    k_value = param.Integer(3)

    def __init__(self, **params):
        """Create the session and build a chain over the bundled demo file."""
        super().__init__(**params)
        # Kept as a list so it has the same shape as the uploaded path lists
        # later stored by call_load_db.
        self.loaded_file = ["demo_docs/demo.txt"]
        self.qa = load_db(self.loaded_file, "stuff", self.k_value, self.llm)

    def call_load_db(self, path_file, k):
        """Rebuild the chain from ``path_file`` and reset the chat history.

        Args:
            path_file: List of file paths; the first one must exist on disk.
            k: Number of chunks the retriever should return.

        Returns:
            A status message describing the outcome.
        """
        if not path_file or not os.path.exists(path_file[0]):  # init or no file specified
            return "No file loaded"

        k = int(k)  # k_value is a param.Integer and rejects non-integers
        try:
            self.qa = load_db(path_file, "stuff", k, self.llm)
        except Exception as err:
            print(f"Could not build DB from {path_file}: {err}")
            return 'No valid file'

        self.loaded_file = path_file
        # Record k so the next question does not trigger a redundant rebuild.
        self.k_value = k
        self.clr_history()
        return f"New DB created | Loaded File: {self.loaded_file}"

    # chat
    def convchain(self, query, k_max):
        """Answer ``query`` with the current chain and record the exchange.

        The chain is rebuilt first when ``k_max`` differs from the value the
        current chain was built with.

        Returns:
            The answer text.
        """
        k_max = int(k_max)
        if k_max != self.k_value:
            print("Maximum querys changed, reloading DB")
            self.qa = load_db(self.loaded_file, "stuff", k_max, self.llm)
            self.k_value = k_max

        result = self.qa({"question": query, "chat_history": self.chat_history})
        self.chat_history.append((query, result["answer"]))
        self.db_query = result["generated_question"]
        self.db_response = result["source_documents"]
        self.answer = result['answer']
        return self.answer

    def change_llm(self, repo_, file_, max_tokens=16, temperature=0.2, top_p=0.95, top_k=50, repeat_penalty=1.2, k=3):
        """Download a model file and switch the session to it.

        Args:
            repo_: Hugging Face repository id.
            file_: File name inside the repository.
            max_tokens, temperature, top_p, top_k, repeat_penalty: Forwarded
                to ``LlamaCpp``.
            k: Number of chunks the rebuilt retriever should return.

        Returns:
            ``"Loaded <file_>"`` on success, ``"No valid model"`` otherwise.
        """
        llama_kwargs = {
            "n_ctx": 1000,
            "max_tokens": max_tokens,
            "verbose": False,
            "temperature": temperature,
            "top_p": top_p,
            "top_k": top_k,
            "repeat_penalty": repeat_penalty,
        }
        # Only the batch size and GPU offload differ between the two setups.
        if torch.cuda.is_available():
            llama_kwargs.update(n_batch=512, n_gpu_layers=35)
        else:
            llama_kwargs.update(n_batch=8)

        k = int(k)
        try:
            model_path = hf_hub_download(repo_id=repo_, filename=file_)
            new_llm = LlamaCpp(model_path=model_path, **llama_kwargs)
            new_qa = load_db(self.loaded_file, "stuff", k, new_llm)
        except Exception as err:
            print(f"Could not load model {repo_}/{file_}: {err}")
            return "No valid model"

        # Commit only after everything succeeded so llm and qa stay in sync.
        self.llm = new_llm
        self.qa = new_qa
        self.k_value = k
        return f"Loaded {file_}"

    def default_falcon_model(self):
        """Switch back to the module-level default model and rebuild the chain."""
        self.llm = llm_api[0]
        self.qa = load_db(self.loaded_file, "stuff", self.k_value, self.llm)
        return "Loaded model Falcon 7B-instruct"

    @param.depends('db_query')
    def get_lquest(self):
        """Return the last generated DB question, or print a notice and return None."""
        if not self.db_query:
            print("Last question to DB: no DB accesses so far")
            return None
        return self.db_query

    @param.depends('db_response')
    def get_sources(self):
        """Return the elements of all last-retrieved documents as one flat list.

        Each document is iterated and its elements appended in order
        (presumably ``(field_name, value)`` pairs — confirm against the
        document class in use).  An empty list is returned when there is
        nothing to show.
        """
        if not self.db_response:
            return []
        return [element for doc in self.db_response for element in doc]

    @param.depends('convchain', 'clr_history')
    def get_chats(self):
        """Return a copy of the chat history, or a placeholder string if empty."""
        if not self.chat_history:
            return "No History Yet"
        return list(self.chat_history)

    def clr_history(self, count=0):
        """Discard the chat history; ``count`` is accepted but unused."""
        self.chat_history = []
        return "HISTORY CLEARED"
demo_docs/demo.txt ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Title: Moon's Legacy - Three Perspectives through Time
2
+
3
+ 1. The Ancient Astronomer: Thoth, 3000 BCE
4
+
5
+ In the ancient land of Egypt, Thoth, an esteemed astronomer and scribe, gazes at the night sky filled with stars. Among them, the mysterious and radiant moon captivates his attention. Thoth believes that the moon is a celestial deity, guiding the seasons and tides, influencing the lives of mortals below. He diligently records his observations and wisdom in hieroglyphs, attributing mystical qualities to the moon, believing it to be a link between the earthly and divine realms.
6
+
7
+ Thoth's insights pass through generations, shaping early lunar beliefs and lunar calendars. The lunar phases become symbols of rebirth and renewal in various ancient cultures, weaving a spiritual connection between humans and the moon. As the centuries pass, Thoth's legacy lives on, influencing astronomical studies and cultural practices.
8
+
9
+ 2. The Space Explorer: Dr. Maria Rodriguez, 1969 CE
10
+
11
+ Fast forward to the 20th century, Dr. Maria Rodriguez is a brilliant astrophysicist working at NASA during the Apollo era. It is July 20, 1969, and Dr. Rodriguez joins millions around the world as they anxiously watch the televised images of the first manned moon landing. As Neil Armstrong takes that historic step, Maria feels an overwhelming mix of emotions, knowing she is witnessing a turning point in human history.
12
+
13
+ Inspired by the Apollo missions, Dr. Rodriguez dedicates her life to lunar research, studying the moon's geology, composition, and its impact on Earth. With the advancements in technology, she becomes a pioneer in space exploration, leading missions to establish lunar bases, conduct experiments, and search for resources to support humanity's expansion beyond Earth.
14
+
15
+ Maria's contributions not only push the boundaries of scientific knowledge but also ignite dreams of colonizing the moon and beyond, leaving an indelible mark on humanity's understanding of space and our place within it.
16
+
17
+ 3. The Lunar Colonist: Chang Min-Joon, 2045 CE
18
+
19
+ In the mid-21st century, the vision of lunar colonization becomes a reality. Chang Min-Joon, a resourceful engineer, is among the first settlers on the moon. Born on Earth, he now finds himself in awe of the barren lunar landscape, Earth rising majestically on the horizon. As part of the lunar colony, Chang Min-Joon works diligently to establish sustainable habitats, tapping into the moon's resources for energy, water, and shelter.
20
+
21
+ Over time, the lunar colony grows, becoming a melting pot of cultures and scientific pursuits. Min-Joon and his fellow colonists encounter challenges, adapt to life in a low-gravity environment, and develop a unique lunar culture. Their existence exemplifies humanity's ability to thrive beyond our home planet.
22
+
23
+ As the moon becomes a stepping stone for future interplanetary missions, Chang Min-Joon envisions a future where the moon is a crucial launch pad for journeys to Mars and beyond. He dreams of humanity spreading throughout the solar system, propelled by the legacy of ancient beliefs, scientific exploration, and the determination of countless individuals who contributed to our understanding and relationship with the moon.
24
+
25
+ Together, the stories of Thoth, Dr. Maria Rodriguez, and Chang Min-Joon embody the multifaceted history of the moon, from a celestial deity to a stepping stone for humanity's interplanetary journey, leaving an enduring legacy for generations to come.
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ transformers
2
+ numpy
3
+ torch
4
+ pypdf
5
+ langchain
6
+ langchain[docarray]
7
+ tiktoken
8
+ sentence_transformers
9
+ chromadb
10
+ huggingface_hub
11
+ unstructured[local-inference]
12
+ gradio