ChatPDF

Runtime error

App Files Files Community

JanDalhuysen committed on Apr 6, 2023

Commit

7391d1d

1 Parent(s): 3ac435e

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -9

app.py CHANGED Viewed

@@ -44,7 +44,8 @@ def doc_emb(doc: str):
     #     emb_list.append(f.result())
     print('\n'.join(texts))
     return texts, emb_list, gr.Textbox.update(visible=True), gr.Button.update(visible=True), gr.Markdown.update(
-        value="""操作说明 step 3：PDF解析提交成功！ 🙋 可以开始对话啦~"""), gr.Chatbot.update(visible=True)
 def get_response(msg, bot, doc_text_list, doc_embeddings):
@@ -72,6 +73,7 @@ def get_response(msg, bot, doc_text_list, doc_embeddings):
         index_set.add(s_i[1])
         now_len += len(doc)
         # 可能段落截断错误，所以把上下段也加入进来
         if s_i[1] > 0 and s_i[1] -1 not in index_set:
             doc = doc_text_list[s_i[1]-1]
             if now_len + len(doc) > all_max_len:
@@ -108,11 +110,13 @@ def up_file(files):
         with pdfplumber.open(file.name) as pdf:
             for i in range(len(pdf.pages)):
                 # 读取PDF文档第i+1页
                 page = pdf.pages[i]
                 res_list = page.extract_text().split('\n')[:-1]
                 for j in range(len(page.images)):
                     # 获取图片的二进制流
                     img = page.images[j]
                     file_name = '{}-{}-{}.png'.format(str(time.time()), str(i), str(j))
                     with open(file_name, mode='wb') as f:
@@ -126,7 +130,8 @@ def up_file(files):
                 tables = page.extract_tables()
                 for table in tables:
-                    # 第一列当成表头：
                     df = pd.DataFrame(table[1:], columns=table[0])
                     try:
                         records = json.loads(df.to_json(orient="records", force_ascii=False))
@@ -140,22 +145,29 @@ def up_file(files):
     print(doc_text_list)
     return gr.Textbox.update(value='\n'.join(doc_text_list), visible=True), gr.Button.update(
         visible=True), gr.Markdown.update(
-        value="操作说明 step 2：确认PDF解析结果（可修正），点击“提交解析结果”，随后进行对话")
 with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
-            file = gr.File(file_types=['.pdf'], label='点击上传PDF，进行解析(支持多文档、表格、OCR)', file_count='multiple')
-            doc_bu = gr.Button(value='提交解析结果', visible=False)
-            txt = gr.Textbox(label='PDF解析结果', visible=False)
             doc_text_state = gr.State([])
             doc_emb_state = gr.State([])
         with gr.Column():
-            md = gr.Markdown("""操作说明 step 1：点击左侧区域，上传PDF，进行解析""")
             chat_bot = gr.Chatbot(visible=False)
-            msg_txt = gr.Textbox(label='消息框', placeholder='输入消息，点击发送', visible=False)
-            chat_bu = gr.Button(value='发送', visible=False)
     file.change(up_file, [file], [txt, doc_bu, md])
     doc_bu.click(doc_emb, [txt], [doc_text_state, doc_emb_state, msg_txt, chat_bu, md, chat_bot])

     #     emb_list.append(f.result())
     print('\n'.join(texts))
     return texts, emb_list, gr.Textbox.update(visible=True), gr.Button.update(visible=True), gr.Markdown.update(
+        # value="""操作说明 step 3：PDF解析提交成功！ 🙋 可以开始对话啦~"""), gr.Chatbot.update(visible=True)
+        value="""Step 3: PDF analysis and submission successful！ 🙋 You can start the conversation"""), gr.Chatbot.update(visible=True)
 def get_response(msg, bot, doc_text_list, doc_embeddings):
         index_set.add(s_i[1])
         now_len += len(doc)
         # 可能段落截断错误，所以把上下段也加入进来
+        # Paragraphs may have been split incorrectly, so also include the preceding and following paragraphs
         if s_i[1] > 0 and s_i[1] -1 not in index_set:
             doc = doc_text_list[s_i[1]-1]
             if now_len + len(doc) > all_max_len:
         with pdfplumber.open(file.name) as pdf:
             for i in range(len(pdf.pages)):
                 # 读取PDF文档第i+1页
+                # Read page i+1 of PDF document
                 page = pdf.pages[i]
                 res_list = page.extract_text().split('\n')[:-1]
                 for j in range(len(page.images)):
                     # 获取图片的二进制流
+                    # Get the binary stream of the image
                     img = page.images[j]
                     file_name = '{}-{}-{}.png'.format(str(time.time()), str(i), str(j))
                     with open(file_name, mode='wb') as f:
                 tables = page.extract_tables()
                 for table in tables:
+                    # 第一列当成表头:
+                    # The first row is used as the header:
                     df = pd.DataFrame(table[1:], columns=table[0])
                     try:
                         records = json.loads(df.to_json(orient="records", force_ascii=False))
     print(doc_text_list)
     return gr.Textbox.update(value='\n'.join(doc_text_list), visible=True), gr.Button.update(
         visible=True), gr.Markdown.update(
+        # value="操作说明 step 2：确认PDF解析结果（可修正），点击“提交解析结果”，随后进行对话")
+        value="Step 2: Confirm the PDF analysis result (can be revised), click “Submit analysis result”, and then chat")
 with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
+            # file = gr.File(file_types=['.pdf'], label='点击上传PDF，进行解析(支持多文档、表格、OCR)', file_count='multiple')
+            file = gr.File(file_types=['.pdf'], label='Click to upload PDF and analyze it (support multiple documents, forms, OCR)', file_count='multiple')
+            # doc_bu = gr.Button(value='提交解析结果', visible=False)
+            doc_bu = gr.Button(value='Submit analysis results', visible=False)
+            # txt = gr.Textbox(label='PDF解析结果', visible=False)
+            txt = gr.Textbox(label='PDF analysis result', visible=False)
             doc_text_state = gr.State([])
             doc_emb_state = gr.State([])
         with gr.Column():
+            # md = gr.Markdown("""操作说明 step 1：点击左侧区域，上传PDF，进行解析""")
+            md = gr.Markdown("""Step 1: Click on the area on the left, upload the PDF and analyze it""")
             chat_bot = gr.Chatbot(visible=False)
+            # msg_txt = gr.Textbox(label='消息框', placeholder='输入消息，点击发送', visible=False)
+            msg_txt = gr.Textbox(label='message box', placeholder='enter message and click to send', visible=False)
+            # chat_bu = gr.Button(value='发送', visible=False)
+            chat_bu = gr.Button(value='send', visible=False)
     file.change(up_file, [file], [txt, doc_bu, md])
     doc_bu.click(doc_emb, [txt], [doc_text_state, doc_emb_state, msg_txt, chat_bu, md, chat_bot])