"""Gradio demo that runs the Arabic Small Nougat OCR model on images and PDFs."""

from transformers import NougatProcessor, VisionEncoderDecoderModel
import gradio as gr
import torch
from pdf2image import convert_from_path

# Load the model and processor
processor = NougatProcessor.from_pretrained("MohamedRashad/arabic-small-nougat")
model = VisionEncoderDecoderModel.from_pretrained("MohamedRashad/arabic-small-nougat")

device = "cuda" if torch.cuda.is_available() else "cpu"
# The inputs are moved to `device` before generation, so the weights must
# live on the same device; otherwise generate() fails on CUDA hosts.
model.to(device)

# Upper bound on the number of tokens generated per page.
context_length = 2048


def extract_text_from_image(image) -> str:
    """
    Extract text from PIL image

    Args:
        image (PIL.Image): Input image

    Returns:
        str: Extracted text from the image

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # The UI passes None when "Submit" is clicked without an upload.
        raise gr.Error("Please provide an input image.")

    # prepare PDF image for the model
    pixel_values = processor(image, return_tensors="pt").pixel_values

    # generate transcription
    outputs = model.generate(
        pixel_values.to(device),
        min_length=1,
        max_new_tokens=context_length,
        # never emit the tokenizer's unknown token
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
    )

    page_sequence = processor.batch_decode(outputs, skip_special_tokens=True)[0]
    page_sequence = processor.post_process_generation(page_sequence, fix_markdown=False)
    return page_sequence


def extract_text_from_pdf(pdf_path, progress=gr.Progress()) -> str:
    """
    Extract text from PDF

    Each page is rendered to an image, transcribed with
    ``extract_text_from_image`` and the page texts are joined with newlines.

    Args:
        pdf_path (str): Path to the PDF file
        progress (gr.Progress): Progress bar

    Returns:
        str: Extracted text from the PDF

    Raises:
        gr.Error: If no PDF file was provided.
    """
    if pdf_path is None:
        # The UI passes None when "Submit" is clicked without an upload.
        raise gr.Error("Please provide an input PDF.")

    progress(0, desc="Starting...")
    images = convert_from_path(pdf_path)

    texts = [extract_text_from_image(image) for image in progress.tqdm(images)]
    return "\n".join(texts)


model_description = """
This is a demo for the Arabic Small Nougat model. It is an end-to-end OCR model that can extract text from images and PDFs.

- The model is trained on the [Khatt dataset](https://huggingface.co/datasets/Fakhraddin/khatt) and custom made dataset.
- The model is a finetune of [facebook/nougat-small](https://huggingface.co/facebook/nougat-small) model.

**Note**: The model is a prototype in my book and may not work well on all types of images and PDFs.
**Check the output carefully before using it for any serious work.**
"""

with gr.Blocks(title="Arabic Small Nougat") as demo:
    # NOTE(review): the heading's original HTML markup was lost; the <h1>
    # wrapper is a reconstruction -- confirm tag/styling against the source app.
    gr.HTML("<h1>Arabic End-to-End Structured OCR for textbooks</h1>")
    gr.Markdown(model_description)

    with gr.Tab("Extract Text from Image"):
        with gr.Row():
            with gr.Column():
                image = gr.Image(label="Input Image", type="pil")
                image_submit_button = gr.Button(value="Submit", variant="primary")
            image_output = gr.Markdown(label="Output Markdown", rtl=True)
        image_submit_button.click(
            extract_text_from_image, inputs=[image], outputs=image_output
        )

    with gr.Tab("Extract Text from PDF"):
        with gr.Row():
            with gr.Column():
                pdf = gr.File(label="Input PDF", type="filepath")
                pdf_submit_button = gr.Button(value="Submit", variant="primary")
            pdf_output = gr.Markdown(label="Output Markdown", rtl=True)
        pdf_submit_button.click(
            extract_text_from_pdf, inputs=[pdf], outputs=pdf_output
        )

demo.queue().launch(share=False)