Ayushnangia committed on
Commit ea5f05b
1 Parent(s): b8b3256

updated with spell check and grammar

Files changed (1)
  1. app.py +75 -44
app.py CHANGED
@@ -1,58 +1,89 @@
  import os
- os.environ['USE_TORCH'] = '1'
-
  from doctr.io import DocumentFile
  from doctr.models import ocr_predictor
  import gradio as gr
  from PIL import Image
- import base64
- from utils import HocrParser

- predictor = ocr_predictor(det_arch='db_mobilenet_v3_large', reco_arch='crnn_vgg16_bn',pretrained=True)

- title="DocTR OCR (PDL Demo)"
- description="Upload an image to get the OCR results !"

- def greet(img):
      img.save("out.jpg")
      doc = DocumentFile.from_images("out.jpg")
-     output=predictor(doc)

-     xml_outputs = output.export_as_xml()
-     parser = HocrParser()
-
-     res=""
      for obj in output.pages:
-         for obj1 in obj.blocks:
-             for obj2 in obj1.lines:
-                 for obj3 in obj2.words:
-                     res=res + " " + obj3.value
-                 res=res + "\n"
-             res=res + "\n"

      _output_name = "RESULT_OCR.txt"
-     _output_name_pdf="RESULT_OCR.pdf"
-
-     open(_output_name, 'w').close() # clear file
-     with open(_output_name, "w", encoding="utf-8", errors="ignore") as f:
-         f.write(res)
-         print("Writing into file")
-
-     base64_encoded_pdfs = list()
-     for i, (xml, img) in enumerate(zip(xml_outputs, doc)):
-         xml_element_tree = xml[1]
-         parser.export_pdfa(_output_name_pdf,
-                            hocr=xml_element_tree, image=img)
-         with open(_output_name_pdf, 'rb') as f:
-             base64_encoded_pdfs.append(base64.b64encode(f.read()))
-     return res, _output_name, _output_name_pdf
-
- demo = gr.Interface(fn=greet,
-                     inputs=gr.Image(type="pil"),
-                     outputs=["text", "file","file"],
-                     title=title,
-                     description=description,
-                     examples=[["Examples/Book.png"],["Examples/News.png"],["Examples/Manuscript.jpg"],["Examples/Files.jpg"]]
-                     )
-
- demo.launch(debug=True)
 
  import os
  from doctr.io import DocumentFile
  from doctr.models import ocr_predictor
  import gradio as gr
  from PIL import Image
+ from happytransformer import HappyTextToText, TTSettings
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ import re
+
+ # OCR Predictor initialization
+ predictor = ocr_predictor(det_arch='db_mobilenet_v3_large', reco_arch='crnn_vgg16_bn', pretrained=True)
+
+ # Grammar Correction Model initialization
+ happy_tt = HappyTextToText("T5", "vennify/t5-base-grammar-correction")
+ grammar_args = TTSettings(num_beams=5, min_length=1)
+
+ # Spell Check Model initialization
+ tokenizer = AutoTokenizer.from_pretrained("Bhuvana/t5-base-spellchecker", use_fast=False)
+ model = AutoModelForSeq2SeqLM.from_pretrained("Bhuvana/t5-base-spellchecker")
+
+ def correct_spell(inputs):
+     input_ids = tokenizer.encode(inputs, return_tensors='pt')
+     sample_output = model.generate(
+         input_ids,
+         do_sample=True,
+         max_length=512,
+         top_p=0.99,
+         num_return_sequences=1
+     )
+     res = tokenizer.decode(sample_output[0], skip_special_tokens=True)
+     return res
+
+ def process_text_in_chunks(text, process_function, max_chunk_size=256):
+     # Split text into sentences
+     sentences = re.split(r'(?<=[.!?])\s+', text)
+     processed_text = ""

+     for sentence in sentences:
+         # Further split long sentences into smaller chunks
+         chunks = [sentence[i:i + max_chunk_size] for i in range(0, len(sentence), max_chunk_size)]
+         for chunk in chunks:
+             processed_text += process_function(chunk)
+         processed_text += " "  # Add space after each processed sentence

+     return processed_text.strip()

+ def greet(img, apply_grammar_correction, apply_spell_check):
      img.save("out.jpg")
      doc = DocumentFile.from_images("out.jpg")
+     output = predictor(doc)

+     res = ""
      for obj in output.pages:
+         for obj1 in obj.blocks:
+             for obj2 in obj1.lines:
+                 for obj3 in obj2.words:
+                     res += " " + obj3.value
+                 res += "\n"
+             res += "\n"
+
+     # Process in chunks for grammar correction
+     if apply_grammar_correction:
+         res = process_text_in_chunks(res, lambda x: happy_tt.generate_text("grammar: " + x, args=grammar_args).text)
+
+     # Process in chunks for spell check
+     if apply_spell_check:
+         res = process_text_in_chunks(res, correct_spell)

      _output_name = "RESULT_OCR.txt"
+     open(_output_name, 'w').write(res)
+     return res, _output_name
+
+ # Gradio Interface
+ title = "DocTR OCR with Grammar and Spell Check"
+ description = "Upload an image to get the OCR results. Optionally, apply grammar and spell check."
+
+ demo = gr.Interface(
+     fn=greet,
+     inputs=[
+         gr.Image(type="pil"),
+         gr.Checkbox(label="Apply Grammar Correction"),
+         gr.Checkbox(label="Apply Spell Check")
+     ],
+     outputs=["text", "file"],
+     title=title,
+     description=description,
+ )
+
+ demo.launch(debug=True)
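
For reviewers who want to exercise the new correction path without launching the Gradio demo, below is a minimal standalone sketch built from the helpers this commit adds (correct_spell, process_text_in_chunks, and the two Hub models). The sample sentence and the __main__ wrapper are illustrative additions, not part of app.py; the sketch assumes the happytransformer and transformers dependencies are installed and the models can be downloaded.

import re

from happytransformer import HappyTextToText, TTSettings
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Same models as app.py in this commit.
happy_tt = HappyTextToText("T5", "vennify/t5-base-grammar-correction")
grammar_args = TTSettings(num_beams=5, min_length=1)
tokenizer = AutoTokenizer.from_pretrained("Bhuvana/t5-base-spellchecker", use_fast=False)
model = AutoModelForSeq2SeqLM.from_pretrained("Bhuvana/t5-base-spellchecker")

def correct_spell(text):
    # Mirrors app.py: sampled generation, single sequence, capped at 512 tokens.
    input_ids = tokenizer.encode(text, return_tensors='pt')
    output_ids = model.generate(input_ids, do_sample=True, max_length=512,
                                top_p=0.99, num_return_sequences=1)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

def process_text_in_chunks(text, process_function, max_chunk_size=256):
    # Split on sentence boundaries, then cap each piece at max_chunk_size
    # characters so neither model receives an over-long input.
    sentences = re.split(r'(?<=[.!?])\s+', text)
    processed = ""
    for sentence in sentences:
        for i in range(0, len(sentence), max_chunk_size):
            processed += process_function(sentence[i:i + max_chunk_size])
        processed += " "
    return processed.strip()

if __name__ == "__main__":
    ocr_text = "Ths is a smaple OCR outpt with som errors."  # made-up example text
    corrected = process_text_in_chunks(
        ocr_text,
        lambda x: happy_tt.generate_text("grammar: " + x, args=grammar_args).text,
    )
    corrected = process_text_in_chunks(corrected, correct_spell)
    print(corrected)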