MohamedRashad committed on
Commit
688353f
1 Parent(s): 8d1c352

Add app.py and others

Browse files
Files changed (2) hide show
  1. app.py +77 -0
  2. requirements.txt +1 -0
app.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import NougatProcessor, VisionEncoderDecoderModel
2
+ import gradio as gr
3
+ from pdf2image import convert_from_path
4
+
5
# Checkpoint identifier shared by the processor and the model.
_CHECKPOINT = "MohamedRashad/arabic-small-nougat"

# Load the processor (image preprocessing + tokenizer) and the
# encoder-decoder model once at import time so every request reuses them.
processor = NougatProcessor.from_pretrained(_CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(_CHECKPOINT)

# Device the input tensors are moved to before generation.
device = "cpu"

# Upper bound on the number of tokens generated per page.
context_length = 2048
11
+
12
def extract_text_from_image(image):
    """
    Extract text from PIL image

    Args:
        image (PIL.Image): Input image. May be ``None`` when the caller has
            no image to process (e.g. the form was submitted empty).

    Returns:
        str: Extracted text from the image, or an empty string when
            ``image`` is ``None``.
    """
    # Nothing to transcribe: return early instead of letting the processor
    # fail on a missing image.
    if image is None:
        return ""

    # prepare the page image for the model
    pixel_values = processor(image, return_tensors="pt").pixel_values

    # generate transcription; the unknown token is banned from the output
    outputs = model.generate(
        pixel_values.to(device),
        min_length=1,
        max_new_tokens=context_length,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
    )

    # decode the single generated sequence, then apply the processor's
    # post-processing with markdown fixing disabled
    page_sequence = processor.batch_decode(outputs, skip_special_tokens=True)[0]
    page_sequence = processor.post_process_generation(page_sequence, fix_markdown=False)
    return page_sequence
36
+
37
def extract_text_from_pdf(pdf_path, progress=gr.Progress()):
    """
    Extract text from PDF

    Args:
        pdf_path (str): Path to the PDF file. May be ``None`` or empty when
            no file was provided.
        progress (gr.Progress): Progress bar

    Returns:
        str: Extracted text of all pages joined by newlines, or an empty
            string when no PDF path was given.
    """
    # No file uploaded: return early instead of failing inside pdf2image.
    if not pdf_path:
        return ""

    progress(0, desc="Starting...")

    # Rasterise every PDF page into an image.
    images = convert_from_path(pdf_path)

    # Transcribe page by page; progress.tqdm advances the progress bar
    # as the pages are consumed.
    texts = [extract_text_from_image(page) for page in progress.tqdm(images)]

    return "\n".join(texts)
57
+
58
# Two-tab Gradio UI: one tab transcribes a single image, the other a PDF.
with gr.Blocks(title="Arabic Small Nougat") as demo:
    gr.HTML("<h1 style='text-align: center'>Arabic End-to-End Structured OCR for textbooks</h1>")

    # --- Tab 1: single image -> markdown ---
    with gr.Tab("Extract Text from Image"):
        with gr.Row():
            with gr.Column():
                image = gr.Image(label="Input Image", type="pil")
                image_submit_button = gr.Button(value="Submit", variant="primary")
            # Output is rendered right-to-left.
            output = gr.Markdown(label="Output Markdown", rtl=True)
        image_submit_button.click(
            fn=extract_text_from_image,
            inputs=[image],
            outputs=output,
        )

    # --- Tab 2: PDF -> markdown ---
    with gr.Tab("Extract Text from PDF"):
        with gr.Row():
            with gr.Column():
                pdf = gr.File(label="Input PDF", type="filepath")
                pdf_submit_button = gr.Button(value="Submit", variant="primary")
            # Rebinds `output` to this tab's own markdown panel.
            output = gr.Markdown(label="Output Markdown", rtl=True)
        pdf_submit_button.click(
            fn=extract_text_from_pdf,
            inputs=[pdf],
            outputs=output,
        )

# Enable the request queue, then start the server without a public share link.
demo.queue()
demo.launch(share=False)
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ pdf2image