File size: 2,787 Bytes
8b6c55a
 
 
9ebbd2b
8b6c55a
2e73fb1
 
 
9ebbd2b
2e73fb1
2769653
 
8b6c55a
 
 
 
 
 
 
 
2e73fb1
 
 
 
d8d40d6
 
858ad43
 
20e92e5
d8d40d6
d075bff
20e92e5
d8d40d6
2e73fb1
 
 
d8d40d6
bb67d5e
16858c9
 
 
 
c444b8a
269e4c2
 
fc29243
8b6c55a
 
3b925dc
9ebbd2b
 
a0f2182
9ebbd2b
 
a0f2182
 
 
 
6cf24d5
a0f2182
16858c9
 
 
 
 
 
 
 
 
9ebbd2b
6cf24d5
9ebbd2b
 
6cf24d5
 
8b6c55a
 
 
 
 
858ad43
8b6c55a
 
6cf24d5
269e4c2
e60cc65
5e10a9a
 
6cf24d5
 
8b6c55a
6cf24d5
9ebbd2b
 
3b925dc
9ebbd2b
 
8b6c55a
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
from PIL import Image
import pytesseract
import gradio as gr
from datetime import datetime
import os
from flair.data import Sentence
from flair.models import SequenceTagger
from segtok.segmenter import split_single
import pandas as pd

# tagger = SequenceTagger.load("ner-ontonotes")
# Flair sequence tagger used for named-entity recognition; loaded once at
# import time and shared by every request.
tagger = SequenceTagger.load("flair/ner-english-ontonotes")

langs = []

# Ask the Tesseract CLI which languages are installed and offer them in the UI.
# The first output line is skipped (presumably Tesseract's "List of available
# languages" header); blank lines are dropped so the result does not depend on
# whether the output ends with a newline. The pipe is closed explicitly via
# the context manager instead of being left to the garbage collector.
with os.popen("tesseract --list-langs") as _tesseract_output:
    choices = [
        line.strip()
        for line in _tesseract_output.read().splitlines()[1:]
        if line.strip()
    ]

blocks = gr.Blocks()


def get_named_entities(ocr_text: str):
    """Return the named entities found in ``ocr_text``, one per line.

    The text is split into sentences with ``split_single``, each piece is
    wrapped in a Flair ``Sentence`` and tagged by the module-level ``tagger``.
    Every "ner" span is converted with ``str()`` and the results are joined
    with newlines into a single string.
    """
    flair_sentences = [
        Sentence(chunk, use_tokenizer=True) for chunk in split_single(ocr_text)
    ]
    # predict() annotates the Sentence objects in place.
    tagger.predict(flair_sentences)

    return "\n".join(
        str(span)
        for flair_sentence in flair_sentences
        for span in flair_sentence.get_spans("ner")
    )


def run(image, lang="eng"):
    """Run Tesseract OCR on an image and extract named entities from the text.

    Args:
        image: Image to recognise, in any form ``pytesseract.image_to_string``
            accepts.
        lang: Tesseract language code(s). A list or tuple of codes is joined
            with "+"; an empty value (``None``, ``""``, ``[]``) lets Tesseract
            fall back to its default language.

    Returns:
        A ``(ocr_text, named_entities)`` tuple of strings.

    Raises:
        gr.Error: If no image was supplied.
    """
    if image is None:
        # Fail with a readable message instead of an opaque pytesseract error.
        raise gr.Error("Please upload an image before running OCR.")

    print("Image ", image)
    try:
        print("Image filename ", image.filename)
    except AttributeError:
        # Not every image object carries a filename; this is only debug output.
        print("Could not print image filename")

    if isinstance(lang, (list, tuple)):
        # Tesseract takes multiple languages as a single "a+b" string.
        lang = "+".join(lang)
    result = pytesseract.image_to_string(image, lang=lang or None)

    ner = get_named_entities(result)
    return result, ner


def download_output(ocr_text: str, named_entities: str, image_name="test"):
    """Write the OCR text and named entities to a timestamped Excel workbook.

    The workbook is created relative to the current working directory and has
    two sheets, "OCR text" and "Named entities", each holding a single column
    with the same name as the sheet.

    Args:
        ocr_text: Full OCR output; stored in a single cell.
        named_entities: Newline-separated entities; one row per non-blank line.
        image_name: Prefix of the generated file name.

    Returns:
        The name of the written ``.xlsx`` file
        (``<image_name>_<YYYYmmdd_HHMMSS>.xlsx``).

    Raises:
        gr.Error: If building or writing the workbook fails.
    """
    try:
        # Blank lines (e.g. when no entities were found) would only add empty rows.
        named_entities_list = [
            line for line in (named_entities or "").splitlines() if line.strip()
        ]

        datetime_now = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = f"{image_name}_{datetime_now}.xlsx"

        # Build frames that already carry the desired column names. Passing
        # ``columns=[...]`` to ``to_excel`` only *selects* existing columns, so
        # using it on an unnamed Series raises KeyError.
        ocr_df = pd.DataFrame({"OCR text": [ocr_text or ""]})
        ner_df = pd.DataFrame({"Named entities": named_entities_list})

        with pd.ExcelWriter(output_file) as writer:
            ocr_df.to_excel(writer, sheet_name="OCR text", index=False)
            ner_df.to_excel(writer, sheet_name="Named entities", index=False)
        return output_file

    except Exception as e:
        # UI boundary: surface any failure as a Gradio error, keeping the cause.
        raise gr.Error(f"Something went wrong: here's the error: {e}") from e


# Build the Gradio UI. Components are laid out in the order they are created,
# so the statement order below defines the page layout.
with gr.Blocks() as demo:
    gr.Markdown("## Theatre Programmer")
    # First row: input controls, OCR output and named entities side by side.
    with gr.Row():
        with gr.Column():
            # type="pil" makes Gradio hand the upload to `run` as a PIL image.
            image_in = gr.Image(type="pil")
            # Language choices come from the module-level `choices` list.
            lang = gr.Dropdown(choices, value="eng")
            btn = gr.Button("Run")
        with gr.Column():
            ocr_text = gr.TextArea(label="OCR output")
        with gr.Column():
            ner = gr.TextArea(label="Named entities")
        # with gr.Column():
        #     gr.CheckboxGroup(ner, label="Named entities")
    # Second row: button that exports the current results.
    with gr.Row():
        download_btn = gr.Button("Download output")

    # "Run" feeds the image and language to `run` and fills both text areas.
    btn.click(fn=run, inputs=[image_in, lang], outputs=[ocr_text, ner])
    # "Download output" passes the two text areas to `download_output`; the
    # returned file path is shown in a File component created inline here.
    download_btn.click(
        fn=download_output,
        inputs=[ocr_text, ner],
        outputs=[gr.components.File()],
    )

# Start the Gradio app.
demo.launch()