jackkuo committed on
Commit
f704336
·
verified ·
1 Parent(s): efe0300

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -50
app.py CHANGED
@@ -1,88 +1,140 @@
1
  import gradio as gr
2
  import base64
3
- import os
 
 
4
  api_key = os.getenv('API_KEY')
 
5
 
6
- def predict(input, file_input):
7
- print("input:", input)
8
- print("file_input:", file_input.name)
9
- from gradio_client import Client
10
-
11
- client = Client(api_key)
12
- extract_result = client.predict(
13
- input,
14
- file_input.name,
15
- fn_index=1
16
- )
17
- if extract_result:
18
- print(extract_result)
19
- return extract_result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  else:
21
- return "Too many user, please wait a monument!"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
 
24
  def view_pdf(pdf_file):
 
 
 
25
  with open(pdf_file.name, 'rb') as f:
26
  pdf_data = f.read()
27
- # print("pdf_file", pdf_file)
28
- # pdf_data = pdf_file
29
  b64_data = base64.b64encode(pdf_data).decode('utf-8')
30
- # print("b64_data", b64_data)
31
  return f"<embed src='data:application/pdf;base64,{b64_data}' type='application/pdf' width='100%' height='700px' />"
32
 
33
 
34
- en_1 = ["""could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a markdown table format?
35
- If any of this information was not available in the paper, please replaced it with the string `""`, If the property contains multiple entities, please use a list to contain.
36
- """]
37
 
38
- en_2 = ["""could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a json format?
39
- If any of this information was not available in the paper, please replaced it with the string `""`, If the property contains multiple entities, please use a list to contain.
40
- """]
41
 
42
- examples = [en_1, en_2]
43
 
44
- with gr.Blocks(title="ChatPaperGPT") as demo:
45
  gr.Markdown(
46
- '''<p align="center" width="100%">
47
- <img src="https://big-cheng.com/img/pdf.png" alt="pdf-logo" width="50"/>
48
- <p>
49
-
50
  <h1 align="center"> Paper Extract GPT </h1>
51
  <p> How to use:
52
- <br> <strong>#1</strong>: Upload your pdf.
53
- <br> <strong>#2</strong>: Click the View PDF button to view it.
54
- <br> <strong>#3</strong>: Enter your extraction prompt in the input box (of course, you can click example to test).
55
- <br> <strong>#4</strong>: Click Generate to extract, and the extracted information will be displayed in markdown form.
56
  </p>
57
  '''
58
  )
59
  with gr.Row():
60
  with gr.Column():
61
  gr.Markdown('## Upload PDF')
62
- file_input = gr.File(type="filepath")
63
  viewer_button = gr.Button("View PDF")
64
- file_out = gr.HTML()
 
65
  with gr.Column():
66
- with gr.Row():
67
- model_input = gr.Textbox(lines=7, placeholder='Input prompt about extract information from paper',
68
- label='Input')
69
  with gr.Row():
70
  gen = gr.Button("Generate")
71
  clr = gr.Button("Clear")
72
- example = gr.Examples(examples=examples, inputs=model_input)
73
-
74
- with gr.Row():
75
- outputs = gr.Markdown(label='Output', show_label=True, value="""| Title | Journal | Year | Author | Institution | Email |
76
  |---------------------------------------------|--------------------|------|-----------------------------------------------|-------------------------------------------------------|-----------------------|
77
  | Paleomagnetic Study of Deccan Traps from Jabalpur to Amarkantak, Central India | J. Geomag. Geoelectr. | 1973 | R. K. VERMA, G. PULLAIAH, G.R. ANJANEYULU, P. K. MALLIK | National Geophysical Research Institute, Hyderabad, and Indian School o f Mines, Dhanbad | "" |
78
  """)
79
 
80
- inputs = [model_input, file_input]
81
- gen.click(fn=predict, inputs=inputs, outputs=outputs)
82
- clr.click(fn=lambda value: [gr.update(value=""), gr.update(value="")], inputs=clr,
83
- outputs=[model_input, outputs])
84
-
85
  viewer_button.click(view_pdf, inputs=file_input, outputs=file_out)
86
- # parser_button.click(extract_text, inputs=file_input, outputs=[xml_out, md_out, rich_md_out])
87
 
88
  demo.launch()
 
1
  import gradio as gr
2
  import base64
3
+ import os
4
+ from openai import OpenAI
5
+
6
  api_key = os.getenv('API_KEY')
7
+ base_url = os.getenv("BASE_URL")
8
 
9
+ client = OpenAI(
10
+ api_key=api_key,
11
+ base_url=base_url,
12
+ )
13
+
14
+
15
def extract_pdf_pypdf(pdf_dir):
    """Extract the plain text of every page of a PDF file.

    Despite the function name, the file is read with the ``fitz`` module
    (the import name of PyMuPDF), which is imported lazily on first call.

    Args:
        pdf_dir: Path of the PDF file to read (a file path, not a directory).

    Returns:
        The text of all pages, each page followed by a blank line, or
        ``None`` if the document could not be opened.
    """
    import fitz

    try:
        doc = fitz.open(pdf_dir)
    except Exception as ex:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt/SystemExit are not swallowed.
        print("can not read pdf: %s" % ex)
        return None

    # NOTE: no table-of-contents / "References" filtering is done here;
    # the text of every page is kept as-is.
    try:
        return "".join(
            doc.load_page(page_no).get_text("text") + "\n\n"
            for page_no in range(doc.page_count)
        )
    finally:
        # Release the file handle even if text extraction fails.
        doc.close()
33
+
34
+
35
def openai_api(messages):
    """Send chat messages to the model and return the complete reply text.

    The request goes through the module-level ``client`` in streaming mode;
    the streamed chunks are collected and concatenated before returning.

    Args:
        messages: Chat messages, passed unchanged as ``messages`` to
            ``client.chat.completions.create``.

    Returns:
        The concatenated reply text, or ``None`` if creating the request or
        reading the stream raised an exception.
    """
    try:
        completion = client.chat.completions.create(
            model="claude-3-5-sonnet-20240620",
            messages=messages,
            temperature=0.1,
            max_tokens=8192,
            stream=True,
        )
    except Exception as ex:
        print("api 出现如下异常%s" % ex)
        return None

    if not completion:
        print("第二轮出现异常")
        return None

    pieces = []
    try:
        for chunk in completion:
            # A stream chunk may carry an empty ``choices`` list; skip it
            # instead of letting an IndexError discard the whole reply.
            if not chunk.choices:
                continue
            pieces.append(chunk.choices[0].delta.content or "")
    except Exception as ex:
        print("第二轮 出现如下异常%s" % ex)
        return None

    # Counts collected stream chunks (the label is kept from the original).
    print("response tokens:", len(pieces))
    return "".join(pieces)
63
+
64
+
65
def predict(input_text, pdf_file):
    """Run the extraction prompt against the text of the uploaded PDF.

    Args:
        input_text: The user's extraction prompt; ``None`` is treated as an
            empty prompt.
        pdf_file: The uploaded file, either a path string or an object whose
            ``name`` attribute is the path; ``None`` when nothing is uploaded.

    Returns:
        The model's reply, or a user-facing message when no file was
        uploaded, the PDF could not be read, or the model returned nothing.
    """
    if pdf_file is None:
        return "Please upload a PDF file to proceed."

    # The file component may hand over a plain path string or a file-like
    # wrapper exposing the path as ``.name``; accept both.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    file_content = extract_pdf_pypdf(pdf_path)
    if file_content is None:
        # The extractor signals an unreadable PDF with None; concatenating
        # it into the prompt would raise TypeError.
        return "Could not read the uploaded PDF. Please check the file and try again."

    user_prompt = (
        "Provided Text:\n'''\n{{" + file_content + "}}\n'''\n" + (input_text or "")
    )
    messages = [
        {
            "role": "system",
            "content": "You are an expert in information extraction from scientific literature.",
        },
        {"role": "user", "content": user_prompt},
    ]
    extract_result = openai_api(messages)

    return extract_result or "Too many users. Please wait a moment!"
84
 
85
 
86
def view_pdf(pdf_file):
    """Return an HTML ``<embed>`` snippet that displays the uploaded PDF inline.

    The file is read in binary mode and embedded as a base64 ``data:`` URI.

    Args:
        pdf_file: The uploaded file, either a path string or an object whose
            ``name`` attribute is the path; ``None`` when nothing is uploaded.

    Returns:
        The HTML snippet, or a prompt to upload a file when ``pdf_file`` is
        ``None``.
    """
    if pdf_file is None:
        return "Please upload a PDF file to view."

    # Accept both a plain path string and a file-like wrapper with ``.name``.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    with open(pdf_path, 'rb') as f:
        pdf_data = f.read()

    b64_data = base64.b64encode(pdf_data).decode('utf-8')
    return f"<embed src='data:application/pdf;base64,{b64_data}' type='application/pdf' width='100%' height='700px' />"
94
 
95
 
96
+ en_1 = """Could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a markdown table format?
97
+ If any of this information was not available in the paper, please replace it with the string `""`. If the property contains multiple entities, please use a list to contain.
98
+ """
99
 
100
+ en_2 = """Could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a JSON format?
101
+ If any of this information was not available in the paper, please replace it with the string `""`. If the property contains multiple entities, please use a list to contain.
102
+ """
103
 
104
+ examples = [[en_1], [en_2]]
105
 
106
+ with gr.Blocks(title="PaperExtractGPT") as demo:
107
  gr.Markdown(
108
+ '''<p align="center">
 
 
 
109
  <h1 align="center"> Paper Extract GPT </h1>
110
  <p> How to use:
111
+ <br> <strong>1</strong>: Upload your PDF.
112
+ <br> <strong>2</strong>: Click "View PDF" to preview it.
113
+ <br> <strong>3</strong>: Enter your extraction prompt in the input box.
114
+ <br> <strong>4</strong>: Click "Generate" to extract, and the extracted information will display below.
115
  </p>
116
  '''
117
  )
118
  with gr.Row():
119
  with gr.Column():
120
  gr.Markdown('## Upload PDF')
121
+ file_input = gr.File(label="Upload your PDF", type="filepath")
122
  viewer_button = gr.Button("View PDF")
123
+ file_out = gr.HTML(label="PDF Preview")
124
+
125
  with gr.Column():
126
+ model_input = gr.Textbox(lines=7, placeholder='Enter your extraction prompt here', label='Input Prompt')
127
+ example = gr.Examples(examples=examples, inputs=model_input)
 
128
  with gr.Row():
129
  gen = gr.Button("Generate")
130
  clr = gr.Button("Clear")
131
+ outputs = gr.Markdown(label='Output', show_label=True, value="""| Title | Journal | Year | Author | Institution | Email |
 
 
 
132
  |---------------------------------------------|--------------------|------|-----------------------------------------------|-------------------------------------------------------|-----------------------|
133
  | Paleomagnetic Study of Deccan Traps from Jabalpur to Amarkantak, Central India | J. Geomag. Geoelectr. | 1973 | R. K. VERMA, G. PULLAIAH, G.R. ANJANEYULU, P. K. MALLIK | National Geophysical Research Institute, Hyderabad, and Indian School o f Mines, Dhanbad | "" |
134
  """)
135
 
136
+ gen.click(fn=predict, inputs=[model_input, file_input], outputs=outputs)
137
+ clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
 
 
 
138
  viewer_button.click(view_pdf, inputs=file_input, outputs=file_out)
 
139
 
140
  demo.launch()