Spaces: Running on Zero
vteam27 committed
Commit • 4e9395b
1 Parent(s): 4d734fe

Added merged base
Browse files
- .gitattributes +2 -0
- Examples/Book.png +3 -0
- Examples/Files.jpg +3 -0
- Examples/Manuscript.jpg +3 -0
- Examples/News.png +3 -0
- app.py +189 -0
- lang_list.py +163 -0
- packages.txt +3 -0
- requirements.txt +11 -0
- utils.py +163 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
Examples/Book.png ADDED (Git LFS Details)
Examples/Files.jpg ADDED (Git LFS Details)
Examples/Manuscript.jpg ADDED (Git LFS Details)
Examples/News.png ADDED (Git LFS Details)
app.py ADDED
@@ -0,0 +1,189 @@
import os
import re

import gradio as gr
from PIL import Image
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from happytransformer import HappyTextToText, TTSettings
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoProcessor,
    AutoTokenizer,
    SeamlessM4TForTextToText,
)

from lang_list import (
    LANGUAGE_NAME_TO_CODE,
    T2TT_TARGET_LANGUAGE_NAMES,
    TEXT_SOURCE_LANGUAGE_NAMES,
)

DEFAULT_TARGET_LANGUAGE = "English"

# Translation model initialization (SeamlessM4T, text-to-text only)
model = SeamlessM4TForTextToText.from_pretrained("facebook/hf-seamless-m4t-medium")
processor = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium")

# OCR predictor initialization
OCRpredictor = ocr_predictor(det_arch='db_mobilenet_v3_large', reco_arch='crnn_vgg16_bn', pretrained=True)

# Grammar correction model initialization
happy_tt = HappyTextToText("T5", "vennify/t5-base-grammar-correction")
grammar_args = TTSettings(num_beams=5, min_length=1)

# Spell check model initialization
OCRtokenizer = AutoTokenizer.from_pretrained("Bhuvana/t5-base-spellchecker", use_fast=False)
OCRmodel = AutoModelForSeq2SeqLM.from_pretrained("Bhuvana/t5-base-spellchecker")


def correct_spell(inputs):
    """Run one chunk of text through the T5 spell-checker."""
    input_ids = OCRtokenizer.encode(inputs, return_tensors='pt')
    sample_output = OCRmodel.generate(
        input_ids,
        do_sample=True,
        max_length=512,
        top_p=0.99,
        num_return_sequences=1
    )
    return OCRtokenizer.decode(sample_output[0], skip_special_tokens=True)


def process_text_in_chunks(text, process_function, max_chunk_size=256):
    """Apply process_function sentence by sentence, cutting long sentences
    into max_chunk_size pieces so model inputs stay bounded."""
    # Split text into sentences
    sentences = re.split(r'(?<=[.!?])\s+', text)
    processed_text = ""

    for sentence in sentences:
        # Further split long sentences into smaller chunks
        chunks = [sentence[i:i + max_chunk_size] for i in range(0, len(sentence), max_chunk_size)]
        for chunk in chunks:
            processed_text += process_function(chunk)
        processed_text += " "  # add a space after each processed sentence

    return processed_text.strip()


def greet(img, apply_grammar_correction, apply_spell_check):
    img.save("out.jpg")
    doc = DocumentFile.from_images("out.jpg")
    output = OCRpredictor(doc)

    # Flatten the OCR result: page -> block -> line -> word
    res = ""
    for page in output.pages:
        for block in page.blocks:
            for line in block.lines:
                for word in line.words:
                    res += " " + word.value
                res += "\n"
            res += "\n"

    # Process in chunks for grammar correction
    if apply_grammar_correction:
        res = process_text_in_chunks(res, lambda x: happy_tt.generate_text("grammar: " + x, args=grammar_args).text)

    # Process in chunks for spell check
    if apply_spell_check:
        res = process_text_in_chunks(res, correct_spell)

    _output_name = "RESULT_OCR.txt"
    with open(_output_name, 'w') as f:
        f.write(res)
    return res, _output_name


# Gradio Interface for OCR
demo_ocr = gr.Interface(
    fn=greet,
    inputs=[
        gr.Image(type="pil"),
        gr.Checkbox(label="Apply Grammar Correction"),
        gr.Checkbox(label="Apply Spell Check")
    ],
    outputs=["text", "file"],
    title="DocTR OCR with Grammar and Spell Check",
    description="Upload an image to get the OCR results. Optionally, apply grammar and spell check.",
    examples=[["Examples/Book.png"], ["Examples/News.png"], ["Examples/Manuscript.jpg"], ["Examples/Files.jpg"]]
)

# demo_ocr.launch(debug=True)


def run_t2tt(file_uploader, input_text: str, source_language: str, target_language: str) -> tuple[str, str]:
    # An uploaded text file, if any, overrides the text box
    if file_uploader is not None:
        with open(file_uploader, 'r') as file:
            input_text = file.read()
    source_language_code = LANGUAGE_NAME_TO_CODE[source_language]
    target_language_code = LANGUAGE_NAME_TO_CODE[target_language]
    text_inputs = processor(text=input_text, src_lang=source_language_code, return_tensors="pt")
    output_tokens = model.generate(**text_inputs, tgt_lang=target_language_code)
    output = processor.decode(output_tokens[0].tolist(), skip_special_tokens=True)
    _output_name = "result.txt"
    with open(_output_name, 'w') as f:
        f.write(output)
    return str(output), _output_name


with gr.Blocks() as demo_t2tt:
    with gr.Row():
        with gr.Column():
            with gr.Group():
                file_uploader = gr.File(label="Upload a text file (Optional)")
                input_text = gr.Textbox(label="Input text")
                with gr.Row():
                    source_language = gr.Dropdown(
                        label="Source language",
                        choices=TEXT_SOURCE_LANGUAGE_NAMES,
                        value="Punjabi",
                    )
                    target_language = gr.Dropdown(
                        label="Target language",
                        choices=T2TT_TARGET_LANGUAGE_NAMES,
                        value=DEFAULT_TARGET_LANGUAGE,
                    )
                btn = gr.Button("Translate")
        with gr.Column():
            output_text = gr.Textbox(label="Translated text")
            output_file = gr.File(label="Translated text file")

    gr.Examples(
        examples=[
            [
                None,
                "The sinister destruction of the holy Akal Takht and the ruthless massacre of thousands of innocent pilgrims had unmasked the deep-seated hatred and animosity that the Indian Government had been nurturing against Sikhs ever since independence",
                "English",
                "Punjabi",
            ],
            [
                None,
                "It contains. much useful information about administrative, revenue, judicial and ecclesiastical activities in various areas which, it is hoped, would supplement the information available in official records.",
                "English",
                "Hindi",
            ],
            [
                None,
                "दुनिया में बहुत सी अलग-अलग भाषाएं हैं और उनमें अपने वर्ण और शब्दों का भंडार होता है. इसमें में कुछ उनके अपने शब्द होते हैं तो कुछ ऐसे भी हैं, जो दूसरी भाषाओं से लिए जाते हैं.",
                "Hindi",
                "Punjabi",
            ],
            [
                None,
                "ਸੂੂਬੇ ਦੇ ਕਈ ਜ਼ਿਲ੍ਹਿਆਂ ’ਚ ਬੁੱਧਵਾਰ ਸਵੇਰੇ ਸੰਘਣੀ ਧੁੰਦ ਛਾਈ ਰਹੀ ਤੇ ਤੇਜ਼ ਹਵਾਵਾਂ ਨੇ ਕਾਂਬਾ ਹੋਰ ਵਧਾ ਦਿੱਤਾ। ਸੱਤ ਸ਼ਹਿਰਾਂ ’ਚ ਦਿਨ ਦਾ ਤਾਪਮਾਨ ਦਸ ਡਿਗਰੀ ਸੈਲਸੀਅਸ ਦੇ ਆਸਪਾਸ ਰਿਹਾ। ਸੂਬੇ ’ਚ ਵੱਧ ਤੋਂ ਵੱਧ ਤਾਪਮਾਨ ’ਚ ਵੀ ਦਸ ਡਿਗਰੀ ਸੈਲਸੀਅਸ ਦੀ ਗਿਰਾਵਟ ਦਰਜ ਕੀਤੀ ਗਈ",
                "Punjabi",
                "English",
            ],
        ],
        inputs=[file_uploader, input_text, source_language, target_language],
        outputs=[output_text, output_file],
        fn=run_t2tt,
        cache_examples=False,
        api_name=False,
    )

    gr.on(
        triggers=[input_text.submit, btn.click],
        fn=run_t2tt,
        inputs=[file_uploader, input_text, source_language, target_language],
        outputs=[output_text, output_file],
        api_name="t2tt",
    )

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.Tab(label="OCR"):
            demo_ocr.render()
        with gr.Tab(label="Translate"):
            demo_t2tt.render()

if __name__ == "__main__":
    demo.launch()
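Because the translate handler is registered with api_name="t2tt", the endpoint can also be called programmatically. Below is a minimal sketch using gradio_client; the Space id is a placeholder, not something defined in this commit, and the file argument is simply left as None to use the text box path:

    from gradio_client import Client

    # Placeholder Space id; substitute the actual deployment.
    client = Client("vteam27/doctr-ocr-translate")
    text, result_file = client.predict(
        None,             # file_uploader: no file uploaded
        "Good morning.",  # input_text
        "English",        # source_language
        "Punjabi",        # target_language
        api_name="/t2tt",
    )
    print(text)         # translated string
    print(result_file)  # local path to the downloaded result.txt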
lang_list.py ADDED
@@ -0,0 +1,163 @@
# Language dict
language_code_to_name = {
    "afr": "Afrikaans",
    "amh": "Amharic",
    "arb": "Modern Standard Arabic",
    "ary": "Moroccan Arabic",
    "arz": "Egyptian Arabic",
    "asm": "Assamese",
    "ast": "Asturian",
    "azj": "North Azerbaijani",
    "bel": "Belarusian",
    "ben": "Bengali",
    "bos": "Bosnian",
    "bul": "Bulgarian",
    "cat": "Catalan",
    "ceb": "Cebuano",
    "ces": "Czech",
    "ckb": "Central Kurdish",
    "cmn": "Mandarin Chinese",
    "cym": "Welsh",
    "dan": "Danish",
    "deu": "German",
    "ell": "Greek",
    "eng": "English",
    "est": "Estonian",
    "eus": "Basque",
    "fin": "Finnish",
    "fra": "French",
    "gaz": "West Central Oromo",
    "gle": "Irish",
    "glg": "Galician",
    "guj": "Gujarati",
    "heb": "Hebrew",
    "hin": "Hindi",
    "hrv": "Croatian",
    "hun": "Hungarian",
    "hye": "Armenian",
    "ibo": "Igbo",
    "ind": "Indonesian",
    "isl": "Icelandic",
    "ita": "Italian",
    "jav": "Javanese",
    "jpn": "Japanese",
    "kam": "Kamba",
    "kan": "Kannada",
    "kat": "Georgian",
    "kaz": "Kazakh",
    "kea": "Kabuverdianu",
    "khk": "Halh Mongolian",
    "khm": "Khmer",
    "kir": "Kyrgyz",
    "kor": "Korean",
    "lao": "Lao",
    "lit": "Lithuanian",
    "ltz": "Luxembourgish",
    "lug": "Ganda",
    "luo": "Luo",
    "lvs": "Standard Latvian",
    "mai": "Maithili",
    "mal": "Malayalam",
    "mar": "Marathi",
    "mkd": "Macedonian",
    "mlt": "Maltese",
    "mni": "Meitei",
    "mya": "Burmese",
    "nld": "Dutch",
    "nno": "Norwegian Nynorsk",
    "nob": "Norwegian Bokm\u00e5l",
    "npi": "Nepali",
    "nya": "Nyanja",
    "oci": "Occitan",
    "ory": "Odia",
    "pan": "Punjabi",
    "pbt": "Southern Pashto",
    "pes": "Western Persian",
    "pol": "Polish",
    "por": "Portuguese",
    "ron": "Romanian",
    "rus": "Russian",
    "slk": "Slovak",
    "slv": "Slovenian",
    "sna": "Shona",
    "snd": "Sindhi",
    "som": "Somali",
    "spa": "Spanish",
    "srp": "Serbian",
    "swe": "Swedish",
    "swh": "Swahili",
    "tam": "Tamil",
    "tel": "Telugu",
    "tgk": "Tajik",
    "tgl": "Tagalog",
    "tha": "Thai",
    "tur": "Turkish",
    "ukr": "Ukrainian",
    "urd": "Urdu",
    "uzn": "Northern Uzbek",
    "vie": "Vietnamese",
    "xho": "Xhosa",
    "yor": "Yoruba",
    "yue": "Cantonese",
    "zlm": "Colloquial Malay",
    "zsm": "Standard Malay",
    "zul": "Zulu",
}
LANGUAGE_NAME_TO_CODE = {v: k for k, v in language_code_to_name.items()}

# Source langs: S2ST / S2TT / ASR don't need source lang
# T2TT / T2ST use this
text_source_language_codes = [
    "hin",
    "pan",
    "eng",
]
TEXT_SOURCE_LANGUAGE_NAMES = sorted([language_code_to_name[code] for code in text_source_language_codes])

# Target langs:
# S2ST / T2ST
s2st_target_language_codes = [
    "eng",
    "arb",
    "ben",
    "cat",
    "ces",
    "cmn",
    "cym",
    "dan",
    "deu",
    "est",
    "fin",
    "fra",
    "hin",
    "ind",
    "ita",
    "jpn",
    "kor",
    "mlt",
    "nld",
    "pes",
    "pol",
    "por",
    "ron",
    "rus",
    "slk",
    "spa",
    "swe",
    "swh",
    "tel",
    "tgl",
    "tha",
    "tur",
    "ukr",
    "urd",
    "uzn",
    "vie",
]
S2ST_TARGET_LANGUAGE_NAMES = sorted([language_code_to_name[code] for code in s2st_target_language_codes])
T2ST_TARGET_LANGUAGE_NAMES = S2ST_TARGET_LANGUAGE_NAMES

# S2TT / T2TT / ASR
S2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
T2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
ASR_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
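Only Hindi, Punjabi, and English are wired up as text source languages, so the derived name lists are small. A quick sanity check of the mapping (a sketch, not part of the commit):

    from lang_list import (
        LANGUAGE_NAME_TO_CODE,
        TEXT_SOURCE_LANGUAGE_NAMES,
        T2TT_TARGET_LANGUAGE_NAMES,
    )

    print(TEXT_SOURCE_LANGUAGE_NAMES)        # ['English', 'Hindi', 'Punjabi']
    print(T2TT_TARGET_LANGUAGE_NAMES)        # same list; T2TT targets mirror the sources
    print(LANGUAGE_NAME_TO_CODE["Punjabi"])  # 'pan', the code handed to the SeamlessM4T processor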
packages.txt ADDED
@@ -0,0 +1,3 @@
libcairo2-dev
pkg-config
fonts-freefont-ttf -y
requirements.txt ADDED
@@ -0,0 +1,11 @@
pycairo
gradio
reportlab>=3.6.2
PyPDF2==1.26.0
happytransformer
python-doctr[torch]@git+https://github.com/mindee/doctr.git
transformers
fairseq2==0.1
pydub
yt-dlp
sentencepiece
utils.py ADDED
@@ -0,0 +1,163 @@
import base64
import re
from tempfile import TemporaryDirectory
from math import atan, cos, sin
from typing import Dict, Optional, Tuple
from xml.etree import ElementTree as ET
from xml.etree.ElementTree import Element

import numpy as np
import PyPDF2
from PyPDF2 import PdfFileMerger
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from PIL import Image
from reportlab.lib.colors import black
from reportlab.lib.units import inch
from reportlab.lib.utils import ImageReader
from reportlab.pdfgen.canvas import Canvas


class HocrParser():

    def __init__(self):
        self.box_pattern = re.compile(r'bbox((\s+\d+){4})')
        self.baseline_pattern = re.compile(r'baseline((\s+[\d\.\-]+){2})')

    def _element_coordinates(self, element: Element) -> Dict:
        """
        Returns a dict containing the coordinates of the bounding box around
        an element
        """
        out = {'x1': 0, 'y1': 0, 'x2': 0, 'y2': 0}
        if 'title' in element.attrib:
            matches = self.box_pattern.search(element.attrib['title'])
            if matches:
                coords = matches.group(1).split()
                out = {'x1': int(coords[0]), 'y1': int(coords[1]),
                       'x2': int(coords[2]), 'y2': int(coords[3])}
        return out

    def _get_baseline(self, element: Element) -> Tuple[float, float]:
        """
        Returns a tuple containing the baseline slope and intercept.
        """
        if 'title' in element.attrib:
            # search can return None when no baseline is present
            matches = self.baseline_pattern.search(element.attrib['title'])
            if matches:
                parts = matches.group(1).split()
                return float(parts[0]), float(parts[1])
        return (0.0, 0.0)

    def _pt_from_pixel(self, pxl: Dict, dpi: int) -> Dict:
        """
        Returns the quantity in PDF units (pt) given quantity in pixels
        """
        pt = [(c / dpi * inch) for c in pxl.values()]
        return {'x1': pt[0], 'y1': pt[1], 'x2': pt[2], 'y2': pt[3]}

    def _get_element_text(self, element: Element) -> str:
        """
        Return the textual content of the element and its children
        """
        text = ''
        if element.text is not None:
            text += element.text
        for child in element:
            text += self._get_element_text(child)
        if element.tail is not None:
            text += element.tail
        return text

    def export_pdfa(self,
                    out_filename: str,
                    hocr: ET.ElementTree,
                    image: Optional[np.ndarray] = None,
                    fontname: str = "Times-Roman",
                    fontsize: int = 12,
                    invisible_text: bool = True,
                    add_spaces: bool = True,
                    dpi: int = 300):
        """
        Generates a PDF/A document from a hOCR document.
        """

        width, height = None, None
        # Get the page dimensions from the first ocr_page div
        for div in hocr.findall(".//div[@class='ocr_page']"):
            coords = self._element_coordinates(div)
            pt_coords = self._pt_from_pixel(coords, dpi)
            width = pt_coords['x2'] - pt_coords['x1']
            height = pt_coords['y2'] - pt_coords['y1']
            # only the first page div is needed
            break
        if width is None or height is None:
            raise ValueError("Could not determine page size")

        pdf = Canvas(out_filename, pagesize=(width, height), pageCompression=1)

        span_elements = [element for element in hocr.iterfind(".//span")]
        for line in span_elements:
            if 'class' in line.attrib and line.attrib['class'] == 'ocr_line':
                # get information from xml
                pxl_line_coords = self._element_coordinates(line)
                line_box = self._pt_from_pixel(pxl_line_coords, dpi)

                # compute baseline
                slope, pxl_intercept = self._get_baseline(line)
                if abs(slope) < 0.005:
                    slope = 0.0
                angle = atan(slope)
                cos_a, sin_a = cos(angle), sin(angle)
                intercept = pxl_intercept / dpi * inch
                baseline_y2 = height - (line_box['y2'] + intercept)

                # configure options
                text = pdf.beginText()
                text.setFont(fontname, fontsize)
                pdf.setFillColor(black)
                if invisible_text:
                    text.setTextRenderMode(3)  # invisible text

                # transform overlaid text
                text.setTextTransform(
                    cos_a, -sin_a, sin_a, cos_a, line_box['x1'], baseline_y2)

                elements = line.findall(".//span[@class='ocrx_word']")
                for elem in elements:
                    elemtxt = self._get_element_text(elem).strip()
                    # replace unsupported ligature characters with ASCII equivalents
                    elemtxt = elemtxt.translate(str.maketrans(
                        {'ﬀ': 'ff', 'ﬃ': 'ffi', 'ﬄ': 'ffl', 'ﬁ': 'fi', 'ﬂ': 'fl'}))
                    if not elemtxt:
                        continue

                    # compute string width
                    pxl_coords = self._element_coordinates(elem)
                    box = self._pt_from_pixel(pxl_coords, dpi)
                    if add_spaces:
                        elemtxt += ' '
                        box_width = box['x2'] + pdf.stringWidth(elemtxt, fontname, fontsize) - box['x1']
                    else:
                        box_width = box['x2'] - box['x1']
                    font_width = pdf.stringWidth(elemtxt, fontname, fontsize)

                    # Adjust relative position of cursor
                    cursor = text.getStartOfLine()
                    dx = box['x1'] - cursor[0]
                    dy = baseline_y2 - cursor[1]
                    text.moveCursor(dx, dy)

                    # suppress text if it is 0 units wide
                    if font_width > 0:
                        text.setHorizScale(100 * box_width / font_width)
                        text.textOut(elemtxt)
                pdf.drawText(text)

        # overlay image if provided
        if image is not None:
            pdf.drawImage(ImageReader(Image.fromarray(image)),
                          0, 0, width=width, height=height)
        pdf.save()
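app.py never imports HocrParser, so utils.py ships as a standalone helper for producing searchable PDFs from OCR output. A minimal usage sketch, assuming doctr's Document.export_as_xml() API (which returns one (xml_bytes, ElementTree) pair per page); the file names are illustrative only:

    from doctr.io import DocumentFile
    from doctr.models import ocr_predictor
    from utils import HocrParser

    doc = DocumentFile.from_images("Examples/Files.jpg")  # list of numpy arrays, one per page
    result = ocr_predictor(pretrained=True)(doc)

    xml_outputs = result.export_as_xml()       # assumed doctr API: hOCR per page
    parser = HocrParser()
    parser.export_pdfa("RESULT.pdf",
                       hocr=xml_outputs[0][1],  # hOCR ElementTree of page 1
                       image=doc[0])            # overlay the page image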