souljoy committed on
Commit
7fc63ef
1 Parent(s): 94cd51e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -5
app.py CHANGED
@@ -4,12 +4,15 @@ import gradio as gr
4
  # from concurrent.futures import ThreadPoolExecutor
5
  import pdfplumber
6
  import pandas as pd
 
 
7
  from sentence_transformers import SentenceTransformer, models, util
8
  word_embedding_model = models.Transformer('uer/sbert-base-chinese-nli', do_lower_case=True)
9
  pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode='cls')
10
  embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
11
-
12
- url = 'https://souljoy-my-api.hf.space/qa_maker'
 
13
  headers = {
14
  'Content-Type': 'application/json',
15
  }
@@ -41,7 +44,7 @@ def doc_emb(doc: str):
41
  # emb_list.append(f.result())
42
  print('\n'.join(texts))
43
  return texts, emb_list, gr.Textbox.update(visible=True), gr.Button.update(visible=True), gr.Markdown.update(
44
- value="""操作说明 step 3:PDF解析提交成功 🙋 可以开始对话啦~"""), gr.Chatbot.update(visible=True)
45
 
46
 
47
  def get_response(msg, bot, doc_text_list, doc_embeddings):
@@ -89,7 +92,7 @@ def get_response(msg, bot, doc_text_list, doc_embeddings):
89
  req_json['doc'] = '' if len(sub_doc_list) == 0 else '\n'.join(sub_doc_list)
90
  data = {"content": json.dumps(req_json)}
91
  print('data:\n', req_json)
92
- result = requests.post(url='https://souljoy-my-api.hf.space/chatpdf',
93
  data=json.dumps(data),
94
  headers=headers
95
  )
@@ -107,6 +110,17 @@ def up_file(files):
107
  # 读取PDF文档第i+1页
108
  page = pdf.pages[i]
109
  res_list = page.extract_text().split('\n')[:-1]
 
 
 
 
 
 
 
 
 
 
 
110
  tables = page.extract_tables()
111
  for table in tables:
112
  # 第一列当成表头:
@@ -124,7 +138,7 @@ def up_file(files):
124
  print(i)
125
  return gr.Textbox.update(value='\n'.join(doc_text_list), visible=True), gr.Button.update(
126
  visible=True), gr.Markdown.update(
127
- value="操作说明 step 2:确认PDF解析结果(可修正),点击“提交结果”,进行对话")
128
 
129
 
130
  with gr.Blocks() as demo:
 
4
  # from concurrent.futures import ThreadPoolExecutor
5
  import pdfplumber
6
  import pandas as pd
7
+ import time
8
+ from cnocr import CnOcr
9
  from sentence_transformers import SentenceTransformer, models, util
10
  word_embedding_model = models.Transformer('uer/sbert-base-chinese-nli', do_lower_case=True)
11
  pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode='cls')
12
  embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
13
+ ocr = CnOcr()
14
+ # chat_url = 'https://souljoy-my-api.hf.space/sale'
15
+ chat_url = 'https://souljoy-my-api.hf.space/chatpdf'
16
  headers = {
17
  'Content-Type': 'application/json',
18
  }
 
44
  # emb_list.append(f.result())
45
  print('\n'.join(texts))
46
  return texts, emb_list, gr.Textbox.update(visible=True), gr.Button.update(visible=True), gr.Markdown.update(
47
+ value="""操作说明 step 3:PDF解析提交成功! 🙋 可以开始对话啦~"""), gr.Chatbot.update(visible=True)
48
 
49
 
50
  def get_response(msg, bot, doc_text_list, doc_embeddings):
 
92
  req_json['doc'] = '' if len(sub_doc_list) == 0 else '\n'.join(sub_doc_list)
93
  data = {"content": json.dumps(req_json)}
94
  print('data:\n', req_json)
95
+ result = requests.post(url=chat_url,
96
  data=json.dumps(data),
97
  headers=headers
98
  )
 
110
  # 读取PDF文档第i+1页
111
  page = pdf.pages[i]
112
  res_list = page.extract_text().split('\n')[:-1]
113
+
114
+ for j in range(len(page.images)):
115
+ # 获取图片的二进制流
116
+ img = page.images[j]
117
+ file_name = '{}-{}-{}.png'.format(str(time.time()), str(i), str(j))
118
+ with open(file_name, mode='wb') as f:
119
+ f.write(img['stream'].get_data())
120
+ res = ocr.ocr(file_name)
121
+ if len(res) > 0:
122
+ res_list.append(' '.join([re['text'] for re in res]))
123
+
124
  tables = page.extract_tables()
125
  for table in tables:
126
  # 第一列当成表头:
 
138
  print(i)
139
  return gr.Textbox.update(value='\n'.join(doc_text_list), visible=True), gr.Button.update(
140
  visible=True), gr.Markdown.update(
141
+ value="操作说明 step 2:确认PDF解析结果(可修正),点击“提交解析结果”,随后进行对话")
142
 
143
 
144
  with gr.Blocks() as demo: