Spaces:
Runtime error
Runtime error
JanDalhuysen
committed on
Commit
•
7391d1d
1
Parent(s):
3ac435e
Update app.py
Browse files
app.py
CHANGED
@@ -44,7 +44,8 @@ def doc_emb(doc: str):
|
|
44 |
# emb_list.append(f.result())
|
45 |
print('\n'.join(texts))
|
46 |
return texts, emb_list, gr.Textbox.update(visible=True), gr.Button.update(visible=True), gr.Markdown.update(
|
47 |
-
value="""操作说明 step 3:PDF解析提交成功! 🙋 可以开始对话啦~"""), gr.Chatbot.update(visible=True)
|
|
|
48 |
|
49 |
|
50 |
def get_response(msg, bot, doc_text_list, doc_embeddings):
|
@@ -72,6 +73,7 @@ def get_response(msg, bot, doc_text_list, doc_embeddings):
|
|
72 |
index_set.add(s_i[1])
|
73 |
now_len += len(doc)
|
74 |
# 可能段落截断错误,所以把上下段也加入进来
|
|
|
75 |
if s_i[1] > 0 and s_i[1] -1 not in index_set:
|
76 |
doc = doc_text_list[s_i[1]-1]
|
77 |
if now_len + len(doc) > all_max_len:
|
@@ -108,11 +110,13 @@ def up_file(files):
|
|
108 |
with pdfplumber.open(file.name) as pdf:
|
109 |
for i in range(len(pdf.pages)):
|
110 |
# 读取PDF文档第i+1页
|
|
|
111 |
page = pdf.pages[i]
|
112 |
res_list = page.extract_text().split('\n')[:-1]
|
113 |
|
114 |
for j in range(len(page.images)):
|
115 |
# 获取图片的二进制流
|
|
|
116 |
img = page.images[j]
|
117 |
file_name = '{}-{}-{}.png'.format(str(time.time()), str(i), str(j))
|
118 |
with open(file_name, mode='wb') as f:
|
@@ -126,7 +130,8 @@ def up_file(files):
|
|
126 |
|
127 |
tables = page.extract_tables()
|
128 |
for table in tables:
|
129 |
-
#
|
|
|
130 |
df = pd.DataFrame(table[1:], columns=table[0])
|
131 |
try:
|
132 |
records = json.loads(df.to_json(orient="records", force_ascii=False))
|
@@ -140,22 +145,29 @@ def up_file(files):
|
|
140 |
print(doc_text_list)
|
141 |
return gr.Textbox.update(value='\n'.join(doc_text_list), visible=True), gr.Button.update(
|
142 |
visible=True), gr.Markdown.update(
|
143 |
-
value="操作说明 step 2:确认PDF解析结果(可修正),点击“提交解析结果”,随后进行对话")
|
|
|
144 |
|
145 |
|
146 |
with gr.Blocks() as demo:
|
147 |
with gr.Row():
|
148 |
with gr.Column():
|
149 |
-
file = gr.File(file_types=['.pdf'], label='点击上传PDF,进行解析(支持多文档、表格、OCR)', file_count='multiple')
|
150 |
-
|
151 |
-
|
|
|
|
|
|
|
152 |
doc_text_state = gr.State([])
|
153 |
doc_emb_state = gr.State([])
|
154 |
with gr.Column():
|
155 |
-
md = gr.Markdown("""操作说明 step 1:点击左侧区域,上传PDF,进行解析""")
|
|
|
156 |
chat_bot = gr.Chatbot(visible=False)
|
157 |
-
msg_txt = gr.Textbox(label='消息框', placeholder='输入消息,点击发送', visible=False)
|
158 |
-
|
|
|
|
|
159 |
|
160 |
file.change(up_file, [file], [txt, doc_bu, md])
|
161 |
doc_bu.click(doc_emb, [txt], [doc_text_state, doc_emb_state, msg_txt, chat_bu, md, chat_bot])
|
|
|
44 |
# emb_list.append(f.result())
|
45 |
print('\n'.join(texts))
|
46 |
return texts, emb_list, gr.Textbox.update(visible=True), gr.Button.update(visible=True), gr.Markdown.update(
|
47 |
+
# value="""操作说明 step 3:PDF解析提交成功! 🙋 可以开始对话啦~"""), gr.Chatbot.update(visible=True)
|
48 |
+
value="""Step 3: PDF analysis and submission successful! 🙋 You can start the conversation"""), gr.Chatbot.update(visible=True)
|
49 |
|
50 |
|
51 |
def get_response(msg, bot, doc_text_list, doc_embeddings):
|
|
|
73 |
index_set.add(s_i[1])
|
74 |
now_len += len(doc)
|
75 |
# 可能段落截断错误,所以把上下段也加入进来
|
76 |
+
# Paragraphs may have been split in the wrong place, so also include the preceding and following paragraphs
|
77 |
if s_i[1] > 0 and s_i[1] -1 not in index_set:
|
78 |
doc = doc_text_list[s_i[1]-1]
|
79 |
if now_len + len(doc) > all_max_len:
|
|
|
110 |
with pdfplumber.open(file.name) as pdf:
|
111 |
for i in range(len(pdf.pages)):
|
112 |
# 读取PDF文档第i+1页
|
113 |
+
# Read page i+1 of PDF document
|
114 |
page = pdf.pages[i]
|
115 |
res_list = page.extract_text().split('\n')[:-1]
|
116 |
|
117 |
for j in range(len(page.images)):
|
118 |
# 获取图片的二进制流
|
119 |
+
# Get the binary stream of the image
|
120 |
img = page.images[j]
|
121 |
file_name = '{}-{}-{}.png'.format(str(time.time()), str(i), str(j))
|
122 |
with open(file_name, mode='wb') as f:
|
|
|
130 |
|
131 |
tables = page.extract_tables()
|
132 |
for table in tables:
|
133 |
+
# 第一列当成表头:
|
134 |
+
# The first row is used as the header:
|
135 |
df = pd.DataFrame(table[1:], columns=table[0])
|
136 |
try:
|
137 |
records = json.loads(df.to_json(orient="records", force_ascii=False))
|
|
|
145 |
print(doc_text_list)
|
146 |
return gr.Textbox.update(value='\n'.join(doc_text_list), visible=True), gr.Button.update(
|
147 |
visible=True), gr.Markdown.update(
|
148 |
+
# value="操作说明 step 2:确认PDF解析结果(可修正),点击“提交解析结果”,随后进行对话")
|
149 |
+
value="Step 2: Confirm the PDF analysis result (can be revised), click “Submit analysis result”, and then chat")
|
150 |
|
151 |
|
152 |
with gr.Blocks() as demo:
|
153 |
with gr.Row():
|
154 |
with gr.Column():
|
155 |
+
# file = gr.File(file_types=['.pdf'], label='点击上传PDF,进行解析(支持多文档、表格、OCR)', file_count='multiple')
|
156 |
+
file = gr.File(file_types=['.pdf'], label='Click to upload PDF and analyze it (support multiple documents, forms, OCR)', file_count='multiple')
|
157 |
+
# doc_bu = gr.Button(value='提交解析结果', visible=False)
|
158 |
+
doc_bu = gr.Button(value='Submit analysis results', visible=False)
|
159 |
+
# txt = gr.Textbox(label='PDF解析结果', visible=False)
|
160 |
+
txt = gr.Textbox(label='PDF analysis result', visible=False)
|
161 |
doc_text_state = gr.State([])
|
162 |
doc_emb_state = gr.State([])
|
163 |
with gr.Column():
|
164 |
+
# md = gr.Markdown("""操作说明 step 1:点击左侧区域,上传PDF,进行解析""")
|
165 |
+
md = gr.Markdown("""Step 1: Click on the area on the left, upload the PDF and analyze it""")
|
166 |
chat_bot = gr.Chatbot(visible=False)
|
167 |
+
# msg_txt = gr.Textbox(label='消息框', placeholder='输入消息,点击发送', visible=False)
|
168 |
+
msg_txt = gr.Textbox(label='message box', placeholder='enter message and click to send', visible=False)
|
169 |
+
# chat_bu = gr.Button(value='发送', visible=False)
|
170 |
+
chat_bu = gr.Button(value='send', visible=False)
|
171 |
|
172 |
file.change(up_file, [file], [txt, doc_bu, md])
|
173 |
doc_bu.click(doc_emb, [txt], [doc_text_state, doc_emb_state, msg_txt, chat_bu, md, chat_bot])
|