import gradio as gr
import base64
import os
import fitz  # PyMuPDF: used for PDF text extraction and preview rendering
from openai import OpenAI
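# API credentials are read from environment variables; set them before launching, for example:
#   export API_KEY=...   (your provider's API key)
#   export BASE_URL=...  (an OpenAI-compatible endpoint URL)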
api_key = os.getenv('API_KEY')
base_url = os.getenv("BASE_URL")
client = OpenAI(
    api_key=api_key,
    base_url=base_url,
)
def extract_pdf_pypdf(pdf_path):
    """Extract plain text from every page of a PDF using PyMuPDF."""
    try:
        doc = fitz.open(pdf_path)
    except Exception as ex:
        print("Cannot read PDF: %s" % ex)
        return None
    page_count = doc.page_count
    file_content = ""
    for page in range(page_count):
        text = doc.load_page(page).get_text("text")
        # Guard against 'References' appearing in the table of contents
        file_content += text + "\n\n"
    return file_content
def openai_api(messages):
    """Send a streaming chat completion request and return the concatenated response text."""
    try:
        completion = client.chat.completions.create(
            model="claude-3-5-sonnet-20240620",
            messages=messages,
            temperature=0.1,
            max_tokens=8192,
            # timeout=300,
            stream=True
        )
    except Exception as ex:
        print("The API raised the following exception: %s" % ex)
        return None
    if completion:
        try:
            # Join the streamed chunks into a single response string
            response_list = [chunk.choices[0].delta.content or "" for chunk in completion]
            print("response tokens:", len(response_list))
            return ''.join(response_list)
        except Exception as ex:
            print("Reading the streamed response raised the following exception: %s" % ex)
            return None
    else:
        print("The streamed response was empty")
        return None
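# predict() ties the pieces together: extract the PDF text, wrap it together with the
# user's prompt in the OpenAI chat-message format, and return the model's answer.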
def predict(input_text, pdf_file):
    if pdf_file is None:
        return "Please upload a PDF file to proceed."
    # gr.File may hand back either a path string or an object exposing .name
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    file_content = extract_pdf_pypdf(pdf_path)
    if not file_content:
        return "Failed to read the uploaded PDF. Please try re-uploading."
    messages = [
        {
            "role": "system",
            "content": "You are an expert in information extraction from scientific literature.",
        },
        {
            "role": "user",
            "content": "Provided Text:\n'''\n{{" + file_content + "}}\n'''\n" + input_text,
        },
    ]
    extract_result = openai_api(messages)
    return extract_result or "Too many users. Please wait a moment!"
def view_pdf(pdf_file, max_pages=3):
    if pdf_file is None:
        return "Please upload a PDF file to view."
    try:
        # Open the PDF file (gr.File may pass a path string or an object with .name)
        pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
        doc = fitz.open(pdf_path)
        # Only read up to `max_pages` pages to reduce size for large PDFs
        preview_pdf = fitz.open()  # Create an empty PDF for the preview
        for page_num in range(min(max_pages, doc.page_count)):
            preview_pdf.insert_pdf(doc, from_page=page_num, to_page=page_num)
        # Save the preview as a temporary in-memory file
        pdf_data = preview_pdf.tobytes()
        # Encode as base64 for embedding in HTML
        b64_data = base64.b64encode(pdf_data).decode('utf-8')
        return f"<embed src='data:application/pdf;base64,{b64_data}' type='application/pdf' width='100%' height='700px' />"
    except Exception as e:
        print(f"Error displaying PDF: {e}")
        return "Error displaying PDF. Please try re-uploading."
en_1 = """Could you please help me extract the 'title'/'journal'/'year'/'author'/'institution'/'email' information from the previous content in a Markdown table format?
If any of this information is not available in the paper, please replace it with the string `""`. If a property contains multiple entities, please use a list to hold them.
"""
en_2 = """Could you please help me extract the 'title'/'journal'/'year'/'author'/'institution'/'email' information from the previous content in a JSON format?
If any of this information is not available in the paper, please replace it with the string `""`. If a property contains multiple entities, please use a list to hold them.
"""
examples = [[en_1], [en_2]]
with gr.Blocks(title="PaperExtractGPT") as demo:
    gr.Markdown(
        '''<p align="center">
        <h1 align="center"> Paper Extract GPT </h1>
        <p> How to use:
        <br> <strong>1</strong>: Upload your PDF.
        <br> <strong>2</strong>: Click "View PDF" to preview it.
        <br> <strong>3</strong>: Enter your extraction prompt in the input box.
        <br> <strong>4</strong>: Click "Generate" to run the extraction; the results will be displayed below.
        </p>
        '''
    )
    with gr.Row():
        with gr.Column():
            gr.Markdown('## Upload PDF')
            file_input = gr.File(label="Upload your PDF", type="filepath")
            viewer_button = gr.Button("View PDF")
            file_out = gr.HTML(label="PDF Preview")
        with gr.Column():
            model_input = gr.Textbox(lines=7, placeholder='Enter your extraction prompt here', label='Input Prompt')
            example = gr.Examples(examples=examples, inputs=model_input)
            with gr.Row():
                gen = gr.Button("Generate")
                clr = gr.Button("Clear")
            outputs = gr.Markdown(label='Output', show_label=True,  value="""| Title                                       | Journal            | Year | Author                                        | Institution                                           | Email                 |
|---------------------------------------------|--------------------|------|-----------------------------------------------|-------------------------------------------------------|-----------------------|
| Paleomagnetic Study of Deccan Traps from Jabalpur to Amarkantak, Central India | J. Geomag. Geoelectr. | 1973 | R. K. VERMA, G. PULLAIAH, G.R. ANJANEYULU, P. K. MALLIK | National Geophysical Research Institute, Hyderabad, and Indian School of Mines, Dhanbad | "" |
""")
    gen.click(fn=predict, inputs=[model_input, file_input], outputs=outputs)
    clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
    viewer_button.click(view_pdf, inputs=file_input, outputs=file_out)
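# launch() serves the app on a local URL; a temporary public link can be requested
# with demo.launch(share=True) if needed.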
demo.launch()