# Hugging Face Spaces status when this file was captured: Runtime error
import os
import re

import gradio as gr
def correct_spell(inputs):
    """Return the spell-checked form of *inputs*.

    The current body ignores *inputs* and always yields the literal
    string "res".
    """
    # NOTE(review): looks like a stub left in place of a real spell-check
    # model call -- confirm before relying on the "Apply Spell Check" option.
    placeholder_result = "res"
    return placeholder_result
def process_text_in_chunks(text, process_function, max_chunk_size=256):
    """Apply *process_function* to *text* piece by piece and rejoin the result.

    The text is split into sentences after ``.``, ``!`` or ``?`` followed by
    whitespace. Each sentence is then cut into consecutive slices of at most
    *max_chunk_size* characters, every slice is passed to *process_function*,
    and the processed slices of one sentence are concatenated back together.
    Processed sentences are joined with a single space.

    Args:
        text: The string to process.
        process_function: Callable taking one string chunk and returning the
            processed string.
        max_chunk_size: Maximum number of characters per chunk; must be
            positive.

    Returns:
        The processed text with leading and trailing whitespace stripped.

    Raises:
        ValueError: If *max_chunk_size* is not positive. A negative size used
            to produce no chunks at all, silently discarding the whole text.
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be a positive integer")

    # Split on whitespace that follows sentence-ending punctuation; the
    # look-behind keeps the punctuation attached to its sentence.
    sentences = re.split(r'(?<=[.!?])\s+', text)

    processed_sentences = []
    for sentence in sentences:
        # Slices are fixed-width, so a long sentence may be cut mid-word.
        processed_chunks = [
            process_function(sentence[start:start + max_chunk_size])
            for start in range(0, len(sentence), max_chunk_size)
        ]
        processed_sentences.append("".join(processed_chunks))

    return " ".join(processed_sentences).strip()
def _write_ocr_result(text, output_path="RESULT_OCR.txt"):
    """Write *text* to *output_path* as UTF-8 and return the path."""
    # Explicit UTF-8 so Hindi/Punjabi output does not depend on the platform
    # default encoding; the context manager guarantees the handle is closed.
    with open(output_path, "w", encoding="utf-8") as handle:
        handle.write(text)
    return output_path


def _ocr_output_to_text(output):
    """Flatten an OCR result (pages -> blocks -> lines -> words) to a string."""
    parts = []
    for page in output.pages:
        for block in page.blocks:
            for line in block.lines:
                # Every word is prefixed with a space, including the first.
                parts.extend(" " + word.value for word in line.words)
                parts.append("\n")  # end of a text line
            parts.append("\n")  # blank line between blocks
    return "".join(parts)


def greet(img, apply_grammar_correction, apply_spell_check, lang_of_input):
    """Run OCR on *img* and return the text plus the path of a text file.

    For "Hindi" and "Punjabi" the image is passed to ``pt.image_to_string``
    with the matching language code and the raw result is returned; the
    grammar and spell-check options are not applied on that path. Any other
    language value goes through ``OCRpredictor`` and the optional
    post-processing steps.

    Args:
        img: Image object; must provide ``save`` on the non-Hindi/Punjabi
            path (presumably a PIL image -- verify against the UI input).
        apply_grammar_correction: When truthy, run the grammar model over the
            recognised text in chunks.
        apply_spell_check: When truthy, run ``correct_spell`` over the text in
            chunks (after grammar correction, if both are enabled).
        lang_of_input: Language label selected by the user.

    Returns:
        A ``(text, path)`` tuple where *path* is "RESULT_OCR.txt", the file
        the text was written to.
    """
    # Language labels handled by pt.image_to_string, with their lang codes.
    direct_lang_code = {"Hindi": "hin", "Punjabi": "pan"}.get(lang_of_input)
    if direct_lang_code is not None:
        res = pt.image_to_string(img, lang=direct_lang_code)
        return res, _write_ocr_result(res)

    # The document loader is given a file path, so the image is saved first.
    img.save("out.jpg")
    doc = DocumentFile.from_images("out.jpg")
    res = _ocr_output_to_text(OCRpredictor(doc))

    # Process in chunks for grammar correction
    if apply_grammar_correction:
        res = process_text_in_chunks(
            res,
            lambda x: happy_tt.generate_text("grammar: " + x, args=grammar_args).text,
        )

    # Process in chunks for spell check
    if apply_spell_check:
        res = process_text_in_chunks(res, correct_spell)

    return res, _write_ocr_result(res)
# OCR tab: an image upload, two post-processing toggles and a language selector.
demo_ocr = gr.Interface(
    title="DocTR OCR with Grammar and Spell Check",
    description="Upload an image to get the OCR results. Optionally, apply grammar and spell check.",
    fn=greet,
    # Listed in the same order as greet's parameters:
    # img, apply_grammar_correction, apply_spell_check, lang_of_input.
    inputs=[
        gr.Image(type="pil"),
        gr.Checkbox(label="Apply Grammar Correction"),
        gr.Checkbox(label="Apply Spell Check"),
        gr.Dropdown(choices=["English", "Hindi", "Punjabi"], label="Select Language"),
    ],
    # greet returns the recognised text and the path of the written text file.
    outputs=["text", "file"],
)
# To debug this tab on its own: demo_ocr.launch(debug=True)
def split_text_into_batches(text, max_tokens_per_batch):
    """Group the sentences of *text* into batches of bounded length.

    Sentences are packed greedily, in order, and joined with single spaces so
    that each batch is at most *max_tokens_per_batch* characters long. Despite
    the parameter name, the limit is measured in characters, not model tokens.

    Args:
        text: Either a string, which is split into sentences after ``.``,
            ``!``, ``?`` or the danda ``\u0964`` followed by whitespace, or an
            iterable of already-split sentence strings.
        max_tokens_per_batch: Maximum length of a batch, in characters.

    Returns:
        A list of non-empty batch strings. A single sentence longer than the
        limit is not split further and becomes a batch of its own.
    """
    if isinstance(text, str):
        # Iterating a str directly would yield single characters, so split on
        # sentence boundaries first. The danda covers Hindi/Punjabi text.
        sentences = re.split(r'(?<=[.!?\u0964])\s+', text.strip())
    else:
        sentences = list(text)

    batches = []
    current_sentences = []
    current_length = 0
    for sentence in sentences:
        if not sentence:
            continue
        # A separating space is only needed when the batch already has content.
        added_length = len(sentence) + (1 if current_sentences else 0)
        if current_sentences and current_length + added_length > max_tokens_per_batch:
            batches.append(" ".join(current_sentences))
            current_sentences = [sentence]
            current_length = len(sentence)
        else:
            current_sentences.append(sentence)
            current_length += added_length

    if current_sentences:
        batches.append(" ".join(current_sentences))  # flush the last batch
    return batches
def run_t2tt(file_uploader, input_text: str, source_language: str, target_language: str) -> tuple:
    """Handle a text-to-text translation request from the Translate tab.

    Args:
        file_uploader: Path of an uploaded text file, or ``None``. When given,
            its content replaces *input_text*.
        input_text: Text typed into the input box.
        source_language: Selected source language label.
        target_language: Selected target language label.

    Returns:
        A ``(translated_text, translated_file)`` tuple, one value per output
        component wired to this handler. The file slot is ``None`` because no
        output file is produced.
    """
    if file_uploader is not None:
        # Read as UTF-8 so non-Latin scripts load regardless of the platform
        # default encoding.
        with open(file_uploader, "r", encoding="utf-8") as file:
            input_text = file.read()

    # TODO: translation itself is not implemented; the handler still returns
    # the placeholder text. It is returned as a pair because the event wiring
    # declares two outputs, and a single value does not satisfy it.
    return "hello", None
# Translate tab: inputs in the left column, results in the right column.
with gr.Blocks() as demo_t2tt:
    with gr.Row():
        with gr.Column():
            with gr.Group():
                file_uploader = gr.File(label="Upload a text file (Optional)")
                input_text = gr.Textbox(label="Input text")
                with gr.Row():
                    # NOTE(review): both dropdowns have empty `choices`, and
                    # the preset values ("Punjabi" / an empty list) are not
                    # among them -- confirm the intended language list.
                    source_language = gr.Dropdown(
                        choices=[],
                        value="Punjabi",
                        label="Source language",
                    )
                    target_language = gr.Dropdown(
                        choices=[],
                        value=[],
                        label="Target language",
                    )
            btn = gr.Button(value="Translate")
        with gr.Column():
            output_text = gr.Textbox(label="Translated text")
            output_file = gr.File(label="Translated text file")

    # Pressing Enter in the textbox and clicking the button both run run_t2tt.
    gr.on(
        triggers=[input_text.submit, btn.click],
        fn=run_t2tt,
        inputs=[file_uploader, input_text, source_language, target_language],
        outputs=[output_text, output_file],
        api_name="t2tt",
    )
# Top-level app: one tab per sub-demo defined above.
with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.Tab("OCR"):
            demo_ocr.render()
        with gr.Tab("Translate"):
            demo_t2tt.render()


# Start the server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()