ChenyuRabbitLove committed on
Commit a2f42ca
Parent: 9ef3a1d

add upload feature and optimize user experience

app.py CHANGED
@@ -1,100 +1,130 @@
  import json
  import time
  import random
+ import os

+ import openai
  import gradio as gr
  import pandas as pd
+ import numpy as np
+ from openai.embeddings_utils import distances_from_embeddings

  from utils.gpt_processor import QuestionAnswerer
+ from utils.work_flow_controller import WorkFlowController

  qa_processor = QuestionAnswerer()
- current_file = None
- context = None
-
- with open("final_result.json", 'r', encoding='UTF-8') as fp:
-     db = json.load(fp)
-
- def read_examples():
-     df = pd.read_csv(r'examples.csv')
-     return [f"{keyword}" for keyword in df['word'].tolist()]
+ CSV_FILE_PATHS = ''
+ JSON_FILE_PATHS = ''
+ KNOWLEDGE_BASE = None
+ CONTEXT = None
+ CONTEXT_PAGE_NUM = None
+ CONTEXT_FILE_NAME = None
+
+ def build_knowledge_base(files):
+     global CSV_FILE_PATHS
+     global JSON_FILE_PATHS
+     global KNOWLEDGE_BASE
+
+     work_flow_controller = WorkFlowController(files)
+     CSV_FILE_PATHS = work_flow_controller.csv_result_path
+     JSON_FILE_PATHS = work_flow_controller.result_path
+     with open(CSV_FILE_PATHS, 'r', encoding='UTF-8') as fp:
+         knowledge_base = pd.read_csv(fp)
+         knowledge_base['page_embedding'] = knowledge_base['page_embedding'].apply(eval).apply(np.array)
+     KNOWLEDGE_BASE = knowledge_base
+
+ def construct_summary():
+     with open(JSON_FILE_PATHS, 'r', encoding='UTF-8') as fp:
+         knowledge_base = json.load(fp)
+
+     context = """"""
+     for key in knowledge_base.keys():
+         file_name = knowledge_base[key]['file_name']
+         total_page = knowledge_base[key]['total_pages']
+         summary = knowledge_base[key]['summarized_content']
+         file_context = f"""
+         ### 文件摘要
+         {file_name} (共 {total_page} 頁)<br><br>
+         {summary}<br><br>
+         """
+         context += file_context
+     return context
+
+ def change_md():
+     content = construct_summary()
+     return gr.Markdown.update(content, visible=True)

  def user(message, history):
-     #return gr.update(value="", interactive=False), history + [[message, None]]
      return "", history + [[message, None]]

+ def system_notification(action):
+     if action == 'upload':
+         return [['已上傳文件', '文件處理中(摘要、翻譯等),結束後將自動回覆']]
+     else:
+         return [['已上傳文件', '文件處理完成,請開始提問']]
+
+ def get_index_file(user_message):
+     global KNOWLEDGE_BASE
+     global CONTEXT
+     global CONTEXT_PAGE_NUM
+     global CONTEXT_FILE_NAME
+
+     user_message_embedding = openai.Embedding.create(input=user_message, engine='text-embedding-ada-002')['data'][0]['embedding']
+     KNOWLEDGE_BASE['distance'] = distances_from_embeddings(user_message_embedding, KNOWLEDGE_BASE['page_embedding'].values, distance_metric='cosine')
+     KNOWLEDGE_BASE = KNOWLEDGE_BASE.sort_values(by='distance', ascending=True).head(1)
+     if KNOWLEDGE_BASE['distance'].values[0] > 0.2:
+         CONTEXT = None
+     else:
+         CONTEXT = KNOWLEDGE_BASE['page_content'].values[0]
+         CONTEXT_PAGE_NUM = KNOWLEDGE_BASE['page_num'].values[0]
+         CONTEXT_FILE_NAME = KNOWLEDGE_BASE['file_name'].values[0]
+
  def bot(history):
      user_message = history[-1][0]
-     global current_file
-     global context
-     #check if user input has "我想了解"
-     if "我想了解" in user_message:
-         # get keyword from "「」"
-         keyword = user_message.split("「")[1].split("」")[0]
-         # check if keyword is in db
-         file_list = []
-         for key in db.keys():
-             if keyword in db[key]['keywords']:
-                 file_list.append(key)
-         if len(file_list) == 0:
-             response = [
-                 [user_message, "Sorry, I can't find any documents about this topic. Please try again."],
-             ]
-         else:
-             bot_message = "以下是我所找到的文件:"
-             for file in file_list:
-                 bot_message += "\n" + file
-             bot_message += "\n\n" + "請複製貼上想要了解的文件,我會給你該文件的摘要"
-             response = [
-                 [user_message, bot_message],
-             ]
+     global CONTEXT
+     print(f'user_message: {user_message}')
+
+     if KNOWLEDGE_BASE is None:
+         response = [
+             [user_message, "請先上傳文件"],
+         ]
          history = response
-         # history[-1][1] = ""
-         # for character in bot_message:
-         #     history[-1][1] += character
-         #     time.sleep(random.uniform(0.01, 0.05))
-         #     yield history
          return history
-
-     # check if user input has a pdf file name
-     if ".pdf" in user_message or ".docx" in user_message:
-         current_file = user_message
-         context = db[current_file]['file_full_content']
-         # check if file name is in db
-         if user_message in db.keys():
-             bot_message = f"文件 {user_message} 的摘要如下:"
-             bot_message += "\n\n" + db[user_message]['summarized_content']
-             bot_message += "\n\n" + "可以透過詢問來了解更多這個文件的內容"
-             response = [
-                 [user_message, bot_message],
-             ]
-         else:
+     elif CONTEXT is None:
+         get_index_file(user_message)
+         print(f'CONTEXT: {CONTEXT}')
+         if CONTEXT is None:
              response = [
-                 [user_message, "Sorry, I can't find this file. Please try again."],
+                 [user_message, "無法找到相關文件,請重新提問"],
              ]
-         history[-1] = response[0]
-         # history[-1][1] = ""
-         # for character in bot_message:
-         #     history[-1][1] += character
-         #     time.sleep(random.uniform(0.01, 0.05))
-         #     yield history
-         return history
-     if context is None:
-         response = [
-             [user_message, "請輸入一個文件名稱或是點選下方的範例"],
-         ]
-         history[-1] = response[0]
-         return history
-
-     if context is not None:
-         bot_message = qa_processor.answer_question(context, user_message)
+             history = response
+             return history
+         else:
+             pass
+
+     if CONTEXT is not None:
+         bot_message = qa_processor.answer_question(CONTEXT, CONTEXT_PAGE_NUM, CONTEXT_FILE_NAME, history)
+         print(f'bot_message: {bot_message}')
          response = [
              [user_message, bot_message],
          ]
          history[-1] = response[0]
          return history
+
+ def clear_state():
+     global CONTEXT
+     global CONTEXT_PAGE_NUM
+     global CONTEXT_FILE_NAME
+
+     CONTEXT = None
+     CONTEXT_PAGE_NUM = None
+     CONTEXT_FILE_NAME = None

  with gr.Blocks() as demo:
      history = gr.State([])
+     upload_state = gr.State("upload")
+     finished = gr.State("finished")
      user_question = gr.State("")
      with gr.Row():
          gr.HTML('Junyi Academy Chatbot')
@@ -114,7 +144,9 @@ with gr.Blocks() as demo:
      # with gr.Column(min_width=70, scale=1):
      #     submit_btn = gr.Button("Send")
      with gr.Column(min_width=70, scale=1):
-         clear_btn = gr.Button("Clear")
+         clear_btn = gr.Button("清除")
+     with gr.Column(min_width=70, scale=1):
+         submit_btn = gr.Button("傳送")

      response = user_input.submit(user,
                                   [user_input, chatbot],
@@ -122,11 +154,40 @@ with gr.Blocks() as demo:
                                   queue=False,
                                   ).then(bot, chatbot, chatbot)
      response.then(lambda: gr.update(interactive=True), None, [user_input], queue=False)
+
      clear_btn.click(lambda: None, None, chatbot, queue=False)

+     submit_btn.click(user,
+                      [user_input, chatbot],
+                      [user_input, chatbot],
+                      chatbot,
+                      queue=False).then(bot, chatbot, chatbot).then(lambda: gr.update(interactive=True), None, [user_input], queue=False)
+
+     clear_btn.click(clear_state, None, None, queue=False)
+
+     with gr.Row():
+         index_file = gr.File(file_count="multiple", file_types=["pdf"], label="Upload PDF file")
+
+     with gr.Row():
+         instruction = gr.Markdown("""
+         ## 使用說明
+         1. 上傳一個或多個 PDF 檔案,系統將自動進行摘要、翻譯等處理後建立知識庫
+         2. 在上方輸入欄輸入問題,系統將自動回覆
+         3. 可以根據下方的摘要內容來提問
+         4. 每次對話會根據第一個問題的內容來檢索所有文件,並挑選最能回答問題的文件來回覆
+         5. 要切換檢索的文件,請點選「清除對話記錄」按鈕後再重新提問
+         """)
+
+     with gr.Row():
+         describe = gr.Markdown('', visible=True)

-     examples = gr.Examples(examples=read_examples(),
-                            inputs=[user_input])
+     index_file.upload(system_notification, [upload_state], chatbot) \
+               .then(lambda: gr.update(interactive=True), None, None, queue=False) \
+               .then(build_knowledge_base, [index_file]) \
+               .then(system_notification, [finished], chatbot) \
+               .then(lambda: gr.update(interactive=True), None, None, queue=False) \
+               .then(change_md, None, describe)
+

  if __name__ == "__main__":
      demo.launch()
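A note on the retrieval flow added above: `get_index_file` embeds the question with `text-embedding-ada-002`, ranks every page of the knowledge base by cosine distance, and keeps only the single closest page, rejecting it when its distance exceeds 0.2. A minimal standalone sketch of that step (the `pages` DataFrame and `question_embedding` argument are illustrative stand-ins, not part of this commit):

```python
import numpy as np
import pandas as pd

def cosine_distance(a, b):
    # 1 - cosine similarity, the same metric as distance_metric='cosine'
    return 1.0 - float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

def retrieve_page(question_embedding, pages, threshold=0.2):
    # Rank every page by distance to the question and keep the closest one.
    pages = pages.copy()
    pages['distance'] = pages['page_embedding'].apply(
        lambda emb: cosine_distance(np.asarray(question_embedding), emb)
    )
    best = pages.sort_values(by='distance').iloc[0]
    # Mirror the > 0.2 cutoff in get_index_file: no match if even the
    # closest page is too far from the question.
    if best['distance'] > threshold:
        return None
    return best['file_name'], best['page_num'], best['page_content']
```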
examples.csv DELETED
@@ -1,8 +0,0 @@
- word,count
- _KTX CARES.Non-Negotiables.docx,0
- 🄣 3.5小時 Getter Better Faster Rubric.pdf,0
- 02 - IP.Internalization and Planning Mastery Rubric.docx,0
- KTX Houst First 21 Days Rubric 2019 Final-2 (2).docx,0
- KTX Rubric for Equity _ Excellence.pdf,0
- Leading and Coaching through States of Being 1 pager and integration guides.pdf,0
- SEAMS Tool.FINAL (1).pdf,0
final_result.json DELETED
The diff for this file is too large to render. See raw diff
 
utils/pdf_processor.py CHANGED
@@ -3,7 +3,6 @@ import unicodedata
  import re
  import logging

- from datamodel.data_model import PDFRawData
  from .gpt_processor import Translator

  class PDFProcessor:
@@ -15,6 +14,7 @@ class PDFProcessor:
              'total_pages': 0,
              'file_content': {},
              'file_full_content': '',
+             'is_chinese': '',
          }
          self.__build_info()

@@ -31,14 +31,12 @@ class PDFProcessor:
                  text = re.sub(' +', ' ', text)
                  self.file_info['is_chinese'] = self.__is_chinese(text)

-                 temp = {}
+                 page_info = {}
                  logging.info(f"Processing page {i + 1}...")
-                 temp['page_num'] = i + 1
-                 tranlator = Translator()
-                 temp['page_content'] = tranlator.translate_to_chinese(text) if not self.file_info['is_chinese'] else text
-                 self.file_info['file_content'][i + 1] = temp
-                 self.file_info['file_full_content'] = self.file_info['file_full_content'] + temp['page_content']
-
+                 page_info['page_num'] = i + 1
+                 page_info['page_content'] = text
+                 self.file_info['file_content'][i + 1] = page_info
+                 self.file_info['file_full_content'] = self.file_info['file_full_content'] + page_info['page_content']
          except FileNotFoundError:
              print(f"File not found: {self.file_path}")
          except Exception as e:
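The loop above now stores raw page text and defers translation to WorkFlowController; language detection stays in the private `__is_chinese` helper, which this diff does not show. Purely as an illustration of its likely shape (an assumption, not the repo's actual implementation), a heuristic that classifies by the share of CJK codepoints:

```python
# Hypothetical stand-in for PDFProcessor.__is_chinese, which is not
# included in this diff: treat text as Chinese when CJK characters
# make up at least half of it.
def is_chinese(text: str, threshold: float = 0.5) -> bool:
    if not text:
        return False
    cjk = sum(1 for ch in text if '\u4e00' <= ch <= '\u9fff')
    return cjk / len(text) >= threshold
```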
utils/work_flow_controller.py CHANGED
@@ -1,31 +1,132 @@
+ import os
  import json
+ import logging
+ import hashlib

+ import pandas as pd
+
+ from .gpt_processor import (EmbeddingGenerator, KeywordsGenerator, Summarizer,
+                             TopicsGenerator, Translator)
  from .pdf_processor import PDFProcessor
- from .gpt_processor import Translator, EmbeddingGenerator, KeywordsGenerator, TopicsGenerator

  processors = {
      'pdf': PDFProcessor,
  }

  class WorkFlowController():
-     def __init__(self, file_path: str, file_name: str) -> None:
-         # get file raw content
-         self.file_name = file_name
-         file_format = file_path.split('.')[-1]
-         self.file_processor = processors[file_format]
-         self.file_info = self.file_processor(file_path).file_info
-
-     def process_file(self):
+     def __init__(self, file_src) -> None:
+         # check if the file_path is list
+         # self.file_paths = self.__get_file_name(file_src)
+         self.file_paths = [x.name for x in file_src]
+
+         print(self.file_paths)
+
+         self.files_info = {}
+
+         for file_path in self.file_paths:
+             file_name = file_path.split('/')[-1]
+             file_format = file_path.split('.')[-1]
+             self.file_processor = processors[file_format]
+             file = self.file_processor(file_path).file_info
+             file = self.__process_file(file)
+             self.files_info[file_name] = file
+
+         self.__dump_to_json()
+         self.__dump_to_csv()
+
+
+     def __get_summary(self, file: dict):
+         # get summary from file content
+
+         summarizer = Summarizer()
+         file['summarized_content'] = summarizer.summarize(file['file_full_content'])
+         return file
+
+     def __get_keywords(self, file: dict):
+         # get keywords from file content
+         keywords_generator = KeywordsGenerator()
+         file['keywords'] = keywords_generator.extract_keywords(file['file_full_content'])
+         return file
+
+     def __get_topics(self, file: dict):
+         # get topics from file content
+         topics_generator = TopicsGenerator()
+         file['topics'] = topics_generator.extract_topics(file['file_full_content'])
+         return file
+
+     def __get_embedding(self, file):
+         # get embedding from file content
+         # return embedding
+         embedding_generator = EmbeddingGenerator()
+
+         for i, _ in enumerate(file['file_content']):
+             # use i+1 to meet the index of file_content
+             file['file_content'][i+1]['page_embedding'] = embedding_generator.get_embedding(file['file_content'][i+1]['page_content'])
+         return file
+
+
+     def __translate_to_chinese(self, file: dict):
+         # translate file content to chinese
+         translator = Translator()
+         # reset the file full content
+         file['file_full_content'] = ''
+
+         for i, _ in enumerate(file['file_content']):
+             # use i+1 to meet the index of file_content
+             file['file_content'][i+1]['page_content'] = translator.translate_to_chinese(file['file_content'][i+1]['page_content'])
+             file['file_full_content'] = file['file_full_content'] + file['file_content'][i+1]['page_content']
+         return file
+
+     def __process_file(self, file: dict):
          # process file content
          # return processed data
-         if not self.file_info['is_chinese']:
-             translator = Translator()
-             self.file_info[1]['file_content'] = translator.translate_to_chinese(self.file_info[1]['file_content'])
-
-     # save file_info data to json file
-     def dump_to_json(self) -> None:
-         with open(f'{self.file_name}.json', 'w', encoding='utf-8') as f:
-             json.dump(self.file_info, f, indent=4, ensure_ascii=False)
+         if not file['is_chinese']:
+             file = self.__translate_to_chinese(file)
+         file = self.__get_embedding(file)
+         file = self.__get_summary(file)
+         # file = self.__get_keywords(file)
+         # file = self.__get_topics(file)
+         return file
+
+     def __dump_to_json(self):
+         with open(os.path.join(os.getcwd(), 'knowledge_base.json'), 'w', encoding='utf-8') as f:
+             print("Dumping to json, the path is: " + os.path.join(os.getcwd(), 'knowledge_base.json'))
+             self.result_path = os.path.join(os.getcwd(), 'knowledge_base.json')
+             json.dump(self.files_info, f, indent=4, ensure_ascii=False)
+
+     def __construct_knowledge_base_dataframe(self):
+
+         rows = []
+         for file_path, content in self.files_info.items():
+             file_full_content = content["file_full_content"]
+             for page_num, page_details in content["file_content"].items():
+                 row = {
+                     "file_name": content["file_name"],
+                     "page_num": page_details["page_num"],
+                     "page_content": page_details["page_content"],
+                     "page_embedding": page_details["page_embedding"],
+                     "file_full_content": file_full_content,
+                 }
+                 rows.append(row)
+
+         columns = ["file_name", "page_num", "page_content", "page_embedding", "file_full_content"]
+         df = pd.DataFrame(rows, columns=columns)
+         return df

+     def __dump_to_csv(self):
+         df = self.__construct_knowledge_base_dataframe()
+         df.to_csv(os.path.join(os.getcwd(), 'knowledge_base.csv'), index=False)
+         print("Dumping to csv, the path is: " + os.path.join(os.getcwd(), 'knowledge_base.csv'))
+         self.csv_result_path = os.path.join(os.getcwd(), 'knowledge_base.csv')

+     def __get_file_name(self, file_src):
+         file_paths = [x.name for x in file_src]
+         file_paths.sort(key=lambda x: os.path.basename(x))

+         md5_hash = hashlib.md5()
+         for file_path in file_paths:
+             with open(file_path, "rb") as f:
+                 while chunk := f.read(8192):
+                     md5_hash.update(chunk)

+         return md5_hash.hexdigest()
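One detail that ties this file back to app.py: `__dump_to_csv` serializes each `page_embedding` vector as its string representation, which is why `build_knowledge_base` rebuilds the arrays with `.apply(eval).apply(np.array)` after `read_csv`. A small sketch of that round-trip (file name reused from the code above):

```python
import numpy as np
import pandas as pd

# Writing: a list-valued column is stringified, e.g. "[0.1, 0.2, 0.3]".
df = pd.DataFrame({'page_num': [1], 'page_embedding': [[0.1, 0.2, 0.3]]})
df.to_csv('knowledge_base.csv', index=False)

# Reading: eval turns the string back into a list, np.array into a vector,
# the same pattern build_knowledge_base uses in app.py.
restored = pd.read_csv('knowledge_base.csv')
restored['page_embedding'] = restored['page_embedding'].apply(eval).apply(np.array)
assert isinstance(restored['page_embedding'].iloc[0], np.ndarray)
```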