JanDalhuysen committed on
Commit
7391d1d
1 Parent(s): 3ac435e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -9
app.py CHANGED
@@ -44,7 +44,8 @@ def doc_emb(doc: str):
44
  # emb_list.append(f.result())
45
  print('\n'.join(texts))
46
  return texts, emb_list, gr.Textbox.update(visible=True), gr.Button.update(visible=True), gr.Markdown.update(
47
- value="""操作说明 step 3:PDF解析提交成功! 🙋 可以开始对话啦~"""), gr.Chatbot.update(visible=True)
 
48
 
49
 
50
  def get_response(msg, bot, doc_text_list, doc_embeddings):
@@ -72,6 +73,7 @@ def get_response(msg, bot, doc_text_list, doc_embeddings):
72
  index_set.add(s_i[1])
73
  now_len += len(doc)
74
  # 可能段落截断错误,所以把上下段也加入进来
 
75
  if s_i[1] > 0 and s_i[1] -1 not in index_set:
76
  doc = doc_text_list[s_i[1]-1]
77
  if now_len + len(doc) > all_max_len:
@@ -108,11 +110,13 @@ def up_file(files):
108
  with pdfplumber.open(file.name) as pdf:
109
  for i in range(len(pdf.pages)):
110
  # 读取PDF文档第i+1页
 
111
  page = pdf.pages[i]
112
  res_list = page.extract_text().split('\n')[:-1]
113
 
114
  for j in range(len(page.images)):
115
  # 获取图片的二进制流
 
116
  img = page.images[j]
117
  file_name = '{}-{}-{}.png'.format(str(time.time()), str(i), str(j))
118
  with open(file_name, mode='wb') as f:
@@ -126,7 +130,8 @@ def up_file(files):
126
 
127
  tables = page.extract_tables()
128
  for table in tables:
129
- # 第一列当成表头:
 
130
  df = pd.DataFrame(table[1:], columns=table[0])
131
  try:
132
  records = json.loads(df.to_json(orient="records", force_ascii=False))
@@ -140,22 +145,29 @@ def up_file(files):
140
  print(doc_text_list)
141
  return gr.Textbox.update(value='\n'.join(doc_text_list), visible=True), gr.Button.update(
142
  visible=True), gr.Markdown.update(
143
- value="操作说明 step 2:确认PDF解析结果(可修正),点击“提交解析结果”,随后进行对话")
 
144
 
145
 
146
  with gr.Blocks() as demo:
147
  with gr.Row():
148
  with gr.Column():
149
- file = gr.File(file_types=['.pdf'], label='点击上传PDF,进行解析(支持多文档、表格、OCR)', file_count='multiple')
150
- doc_bu = gr.Button(value='提交解析结果', visible=False)
151
- txt = gr.Textbox(label='PDF解析结果', visible=False)
 
 
 
152
  doc_text_state = gr.State([])
153
  doc_emb_state = gr.State([])
154
  with gr.Column():
155
- md = gr.Markdown("""操作说明 step 1:点击左侧区域,上传PDF,进行解析""")
 
156
  chat_bot = gr.Chatbot(visible=False)
157
- msg_txt = gr.Textbox(label='消息框', placeholder='输入消息,点击发送', visible=False)
158
- chat_bu = gr.Button(value='发送', visible=False)
 
 
159
 
160
  file.change(up_file, [file], [txt, doc_bu, md])
161
  doc_bu.click(doc_emb, [txt], [doc_text_state, doc_emb_state, msg_txt, chat_bu, md, chat_bot])
 
44
  # emb_list.append(f.result())
45
  print('\n'.join(texts))
46
  return texts, emb_list, gr.Textbox.update(visible=True), gr.Button.update(visible=True), gr.Markdown.update(
47
+ # value="""操作说明 step 3:PDF解析提交成功! 🙋 可以开始对话啦~"""), gr.Chatbot.update(visible=True)
48
+ value="""Step 3: PDF analysis and submission successful! 🙋 You can start the conversation"""), gr.Chatbot.update(visible=True)
49
 
50
 
51
  def get_response(msg, bot, doc_text_list, doc_embeddings):
 
73
  index_set.add(s_i[1])
74
  now_len += len(doc)
75
  # 可能段落截断错误,所以把上下段也加入进来
76
+ # Paragraphs may have been split incorrectly, so also include the preceding and following paragraphs
77
  if s_i[1] > 0 and s_i[1] -1 not in index_set:
78
  doc = doc_text_list[s_i[1]-1]
79
  if now_len + len(doc) > all_max_len:
 
110
  with pdfplumber.open(file.name) as pdf:
111
  for i in range(len(pdf.pages)):
112
  # 读取PDF文档第i+1页
113
+ # Read page i+1 of PDF document
114
  page = pdf.pages[i]
115
  res_list = page.extract_text().split('\n')[:-1]
116
 
117
  for j in range(len(page.images)):
118
  # 获取图片的二进制流
119
+ # Get the binary stream of the image
120
  img = page.images[j]
121
  file_name = '{}-{}-{}.png'.format(str(time.time()), str(i), str(j))
122
  with open(file_name, mode='wb') as f:
 
130
 
131
  tables = page.extract_tables()
132
  for table in tables:
133
+ # 第一列当成表头:
134
+ # The first row is used as the header:
135
  df = pd.DataFrame(table[1:], columns=table[0])
136
  try:
137
  records = json.loads(df.to_json(orient="records", force_ascii=False))
 
145
  print(doc_text_list)
146
  return gr.Textbox.update(value='\n'.join(doc_text_list), visible=True), gr.Button.update(
147
  visible=True), gr.Markdown.update(
148
+ # value="操作说明 step 2:确认PDF解析结果(可修正),点击“提交解析结果”,随后进行对话")
149
+ value="Step 2: Confirm the PDF analysis result (can be revised), click “Submit analysis result”, and then chat")
150
 
151
 
152
  with gr.Blocks() as demo:
153
  with gr.Row():
154
  with gr.Column():
155
+ # file = gr.File(file_types=['.pdf'], label='点击上传PDF,进行解析(支持多文档、表格、OCR)', file_count='multiple')
156
+ file = gr.File(file_types=['.pdf'], label='Click to upload PDF and analyze it (support multiple documents, forms, OCR)', file_count='multiple')
157
+ # doc_bu = gr.Button(value='提交解析结果', visible=False)
158
+ doc_bu = gr.Button(value='Submit analysis results', visible=False)
159
+ # txt = gr.Textbox(label='PDF解析结果', visible=False)
160
+ txt = gr.Textbox(label='PDF analysis result', visible=False)
161
  doc_text_state = gr.State([])
162
  doc_emb_state = gr.State([])
163
  with gr.Column():
164
+ # md = gr.Markdown("""操作说明 step 1:点击左侧区域,上传PDF,进行解析""")
165
+ md = gr.Markdown("""Step 1: Click on the area on the left, upload the PDF and analyze it""")
166
  chat_bot = gr.Chatbot(visible=False)
167
+ # msg_txt = gr.Textbox(label='消息框', placeholder='输入消息,点击发送', visible=False)
168
+ msg_txt = gr.Textbox(label='message box', placeholder='enter message and click to send', visible=False)
169
+ # chat_bu = gr.Button(value='发送', visible=False)
170
+ chat_bu = gr.Button(value='send', visible=False)
171
 
172
  file.change(up_file, [file], [txt, doc_bu, md])
173
  doc_bu.click(doc_emb, [txt], [doc_text_state, doc_emb_state, msg_txt, chat_bu, md, chat_bot])