"""Gradio demo that converts a PDF (uploaded or downloaded) to markup with Nougat.

The script loads the ``nielsr/nougat`` processor and model once at import time,
rasterizes every page of the PDF, transcribes each page with the model and
shows the concatenated result in a Markdown component.
"""
import os
import re
import uuid

import gradio as gr
import requests
import torch
from huggingface_hub import hf_hub_download
from nougat.dataset.rasterize import rasterize_paper
from PIL import Image
from transformers import NougatProcessor, VisionEncoderDecoderModel

processor = NougatProcessor.from_pretrained("nielsr/nougat")
model = VisionEncoderDecoderModel.from_pretrained("nielsr/nougat")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Pairs of (LaTeX delimiter, dollar delimiter) applied to the model output.
_MATH_DELIMITERS = (
    (r'\(', '$'),
    (r'\)', '$'),
    (r'\[', '$$'),
    (r'\]', '$$'),
)


def get_pdf(pdf_link, timeout=60):
    """Download the PDF at *pdf_link* into the current working directory.

    Args:
        pdf_link: URL of the PDF to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The path of the downloaded file, or ``None`` when the request fails
        or the server does not answer with status 200. (Previously the path
        was returned even though no file had been written.)
    """
    unique_filename = f"{os.getcwd()}/downloaded_paper_{uuid.uuid4().hex}.pdf"

    try:
        response = requests.get(pdf_link, timeout=timeout)
    except requests.RequestException as err:
        print(f"Failed to download the PDF: {err}")
        return None

    if response.status_code != 200:
        print("Failed to download the PDF.")
        return None

    with open(unique_filename, 'wb') as pdf_file:
        pdf_file.write(response.content)
    print("PDF downloaded successfully.")
    return unique_filename


def predict(image, max_new_tokens=30):
    """Transcribe a single rasterized page with the Nougat model.

    Args:
        image: Anything ``PIL.Image.open`` accepts (path or file-like object).
        max_new_tokens: Upper bound on the number of generated tokens. The
            default of 30 keeps the original behaviour, which truncates the
            transcription of each page after 30 tokens.

    Returns:
        The post-processed transcription of the page.
    """
    # prepare PDF image for the model
    with Image.open(image) as page:
        pixel_values = processor(page, return_tensors="pt").pixel_values

    # generate transcription (at most `max_new_tokens` tokens)
    outputs = model.generate(
        pixel_values.to(device),
        min_length=1,
        max_new_tokens=max_new_tokens,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
    )

    sequence = processor.batch_decode(outputs, skip_special_tokens=True)[0]
    return processor.post_process_generation(sequence, fix_markdown=False)


def inference(pdf_file, pdf_link):
    """Run Nougat on an uploaded PDF or, if none was uploaded, on a PDF link.

    Args:
        pdf_file: Uploaded file object exposing a ``name`` path, or ``None``.
        pdf_link: URL of a PDF; only used when *pdf_file* is ``None``.

    Returns:
        The transcription of all pages with ``\\(...\\)`` / ``\\[...\\]`` math
        delimiters replaced by ``$`` / ``$$``, or an explanatory message when
        no input was given or the download failed.
    """
    if pdf_file is not None:
        file_name = pdf_file.name
    else:
        # The link may be None (e.g. after "Clear") or blank, not only ''.
        if not pdf_link or not pdf_link.strip():
            print("No file is uploaded and No link is provided")
            return "No data provided. Upload a pdf file or provide a pdf link and try again!"
        file_name = get_pdf(pdf_link.strip())
        if file_name is None:
            return "Failed to download the PDF. Check the link and try again!"

    images = rasterize_paper(file_name, return_pil=True)

    # infer for every page and concat
    content = "".join(predict(image) for image in images)
    for latex_delimiter, dollar_delimiter in _MATH_DELIMITERS:
        content = content.replace(latex_delimiter, dollar_delimiter)
    return content


css = """
  #mkd {
    height: 500px;
    overflow: auto;
    border: 1px solid #ccc;
  }
"""

# NOTE(review): the literals below contained raw line breaks inside ordinary
# quotes, which is a syntax error. The line breaks are kept as "\n" escapes so
# the text is unchanged; any HTML tags that may once have wrapped the text are
# not visible here and have not been reconstructed.
with gr.Blocks(css=css) as demo:
    gr.HTML("\n\nNougat: Neural Optical Understanding for Academic Documents 🍫\n\n")
    gr.HTML("\n\nLukas Blecher et al. Paper, Project\n\n")
    gr.HTML("\n\nThis demo is based on transformers implementation of Nougat 🤗\n\n")

    with gr.Row():
        gr.Markdown('\n\nUpload a PDF\n\n', scale=1)
        gr.Markdown('\n\nOR\n\n', scale=1)
        gr.Markdown('\n\nProvide a PDF link\n\n', scale=1)

    with gr.Row(equal_height=True):
        pdf_file = gr.File(label='PDF 📑', file_count='single', scale=1)
        pdf_link = gr.Textbox(
            placeholder='Enter an arxiv link here',
            label='Link to Paper🔗',
            scale=1,
        )

    with gr.Row():
        btn = gr.Button('Run Nougat 🍫')
        clr = gr.Button('Clear 🧼')

    output_headline = gr.Markdown("PDF converted to markup language through Nougat-OCR👇")
    parsed_output = gr.Markdown(elem_id='mkd', value='OCR Output 📝')

    btn.click(inference, [pdf_file, pdf_link], parsed_output)
    # Reset both inputs and the output panel.
    clr.click(
        lambda: (gr.update(value=None), gr.update(value=None), gr.update(value=None)),
        [],
        [pdf_file, pdf_link, parsed_output],
    )

    gr.Examples(
        [["nougat.pdf", ""], [None, "https://arxiv.org/pdf/2308.08316.pdf"]],
        inputs=[pdf_file, pdf_link],
        outputs=parsed_output,
        fn=inference,
        cache_examples=True,
        label='Click on any Examples below to get Nougat OCR results quickly:',
    )

demo.queue()
demo.launch(debug=True)