Spaces:
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -4,12 +4,15 @@ import gradio as gr
|
|
4 |
# from concurrent.futures import ThreadPoolExecutor
|
5 |
import pdfplumber
|
6 |
import pandas as pd
|
|
|
|
|
7 |
from sentence_transformers import SentenceTransformer, models, util
|
8 |
word_embedding_model = models.Transformer('uer/sbert-base-chinese-nli', do_lower_case=True)
|
9 |
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode='cls')
|
10 |
embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
|
11 |
-
|
12 |
-
|
|
|
13 |
headers = {
|
14 |
'Content-Type': 'application/json',
|
15 |
}
|
@@ -41,7 +44,7 @@ def doc_emb(doc: str):
|
|
41 |
# emb_list.append(f.result())
|
42 |
print('\n'.join(texts))
|
43 |
return texts, emb_list, gr.Textbox.update(visible=True), gr.Button.update(visible=True), gr.Markdown.update(
|
44 |
-
value="""操作说明 step 3:PDF
|
45 |
|
46 |
|
47 |
def get_response(msg, bot, doc_text_list, doc_embeddings):
|
@@ -89,7 +92,7 @@ def get_response(msg, bot, doc_text_list, doc_embeddings):
|
|
89 |
req_json['doc'] = '' if len(sub_doc_list) == 0 else '\n'.join(sub_doc_list)
|
90 |
data = {"content": json.dumps(req_json)}
|
91 |
print('data:\n', req_json)
|
92 |
-
result = requests.post(url=
|
93 |
data=json.dumps(data),
|
94 |
headers=headers
|
95 |
)
|
@@ -107,6 +110,17 @@ def up_file(files):
|
|
107 |
# 读取PDF文档第i+1页
|
108 |
page = pdf.pages[i]
|
109 |
res_list = page.extract_text().split('\n')[:-1]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
tables = page.extract_tables()
|
111 |
for table in tables:
|
112 |
# 第一列当成表头:
|
@@ -124,7 +138,7 @@ def up_file(files):
|
|
124 |
print(i)
|
125 |
return gr.Textbox.update(value='\n'.join(doc_text_list), visible=True), gr.Button.update(
|
126 |
visible=True), gr.Markdown.update(
|
127 |
-
value="操作说明 step 2:确认PDF
|
128 |
|
129 |
|
130 |
with gr.Blocks() as demo:
|
|
|
4 |
# from concurrent.futures import ThreadPoolExecutor
|
5 |
import pdfplumber
|
6 |
import pandas as pd
|
7 |
+
import time
|
8 |
+
from cnocr import CnOcr
|
9 |
from sentence_transformers import SentenceTransformer, models, util
|
10 |
word_embedding_model = models.Transformer('uer/sbert-base-chinese-nli', do_lower_case=True)
|
11 |
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode='cls')
|
12 |
embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
|
13 |
+
ocr = CnOcr()
|
14 |
+
# chat_url = 'https://souljoy-my-api.hf.space/sale'
|
15 |
+
chat_url = 'https://souljoy-my-api.hf.space/chatpdf'
|
16 |
headers = {
|
17 |
'Content-Type': 'application/json',
|
18 |
}
|
|
|
44 |
# emb_list.append(f.result())
|
45 |
print('\n'.join(texts))
|
46 |
return texts, emb_list, gr.Textbox.update(visible=True), gr.Button.update(visible=True), gr.Markdown.update(
|
47 |
+
value="""操作说明 step 3:PDF解析提交成功! 🙋 可以开始对话啦~"""), gr.Chatbot.update(visible=True)
|
48 |
|
49 |
|
50 |
def get_response(msg, bot, doc_text_list, doc_embeddings):
|
|
|
92 |
req_json['doc'] = '' if len(sub_doc_list) == 0 else '\n'.join(sub_doc_list)
|
93 |
data = {"content": json.dumps(req_json)}
|
94 |
print('data:\n', req_json)
|
95 |
+
result = requests.post(url=chat_url,
|
96 |
data=json.dumps(data),
|
97 |
headers=headers
|
98 |
)
|
|
|
110 |
# 读取PDF文档第i+1页
|
111 |
page = pdf.pages[i]
|
112 |
res_list = page.extract_text().split('\n')[:-1]
|
113 |
+
|
114 |
+
for j in range(len(page.images)):
|
115 |
+
# 获取图片的二进制流
|
116 |
+
img = page.images[j]
|
117 |
+
file_name = '{}-{}-{}.png'.format(str(time.time()), str(i), str(j))
|
118 |
+
with open(file_name, mode='wb') as f:
|
119 |
+
f.write(img['stream'].get_data())
|
120 |
+
res = ocr.ocr(file_name)
|
121 |
+
if len(res) > 0:
|
122 |
+
res_list.append(' '.join([re['text'] for re in res]))
|
123 |
+
|
124 |
tables = page.extract_tables()
|
125 |
for table in tables:
|
126 |
# 第一列当成表头:
|
|
|
138 |
print(i)
|
139 |
return gr.Textbox.update(value='\n'.join(doc_text_list), visible=True), gr.Button.update(
|
140 |
visible=True), gr.Markdown.update(
|
141 |
+
value="操作说明 step 2:确认PDF解析结果(可修正),点击“提交解析结果”,随后进行对话")
|
142 |
|
143 |
|
144 |
with gr.Blocks() as demo:
|