Spaces:
Running
Running
File size: 1,850 Bytes
8b6c55a 2e73fb1 8b6c55a 2e73fb1 d8d40d6 858ad43 20e92e5 d8d40d6 d075bff 20e92e5 d8d40d6 1654918 2e73fb1 d8d40d6 c444b8a 269e4c2 fc29243 8b6c55a 6cf24d5 8b6c55a 858ad43 8b6c55a 6cf24d5 269e4c2 e60cc65 5e10a9a 6cf24d5 8b6c55a 6cf24d5 8b6c55a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
from PIL import Image
import pytesseract
import gradio as gr
import os
from flair.data import Sentence
from flair.models import SequenceTagger
from segtok.segmenter import split_single
# Load the flair "ner-ontonotes" sequence tagger once at import time; it is
# shared by get_named_entities() below.
tagger = SequenceTagger.load("ner-ontonotes")
langs = []  # NOTE(review): not referenced anywhere in the visible code.

# Ask the tesseract CLI for its language list. The pipe is closed explicitly,
# the first line of the output is skipped (as the original slice did), and
# blank lines are dropped, so the result does not depend on whether the output
# ends with a trailing newline.
with os.popen("tesseract --list-langs") as _tesseract_pipe:
    _tesseract_listing = _tesseract_pipe.read()
choices = [
    line.strip()
    for line in _tesseract_listing.splitlines()[1:]
    if line.strip()
]

# NOTE(review): this Blocks instance is not used by the visible code; the UI
# below is built on a separate ``gr.Blocks()`` bound to ``demo``.
blocks = gr.Blocks()
def get_named_entities(ocr_text: str):
    """Run the module-level NER tagger over *ocr_text* and list the spans.

    The text is split with ``split_single``; each piece is wrapped in a
    ``Sentence`` and the whole batch is handed to ``tagger.predict``. Every
    span returned by ``get_spans("ner")`` is converted with ``str`` and the
    results are joined with newlines. The joined string is also printed.

    Args:
        ocr_text: Text to analyse.

    Returns:
        The stringified spans, one per line.
    """
    sentences = []
    for piece in split_single(ocr_text):
        sentences.append(Sentence(piece, use_tokenizer=True))

    # predict() annotates the Sentence objects in place.
    tagger.predict(sentences)

    span_texts = [
        str(span)
        for tagged_sentence in sentences
        for span in tagged_sentence.get_spans("ner")
    ]
    joined = "\n".join(span_texts)
    print("ENTITIES ", joined)
    return joined
def run(image, lang="eng"):
    """OCR *image* with Tesseract and extract named entities from the text.

    Args:
        image: Image to recognise, passed straight to
            ``pytesseract.image_to_string``. ``None`` (no image available)
            short-circuits to empty outputs instead of raising.
        lang: Tesseract language code, or a list/tuple of codes. Several
            codes are combined with ``+``, the form Tesseract's ``-l``
            option expects. An empty list, empty string or ``None`` lets
            Tesseract use its default language.

    Returns:
        A ``(ocr_text, named_entities)`` tuple of strings.
    """
    if image is None:
        # Nothing to recognise; skip both OCR and NER.
        return "", ""

    if isinstance(lang, (list, tuple)):
        lang = "+".join(lang)

    # ``lang or None`` maps every "no selection" value to Tesseract's default.
    result = pytesseract.image_to_string(image, lang=lang or None)
    ner = get_named_entities(result)
    return result, ner
def download_output(ocr_text: str, named_entities: str):
    """Print the lengths of the OCR text and the named-entity text.

    This does not write or return any file; it only logs to stdout.

    Args:
        ocr_text: Current OCR text; ``None`` is treated as empty.
        named_entities: Current entity text; ``None`` is treated as empty.

    Returns:
        Always ``True``.
    """
    print("Download output!")
    # ``or ""`` keeps len() from raising TypeError when a value is None.
    print("OCR text: ", len(ocr_text or ""))
    print("Named Entities: ", len(named_entities or ""))
    return True
# Build the UI: an input column (image, language, Run button) beside two
# output columns (OCR text, named entities), with a download button below.
with gr.Blocks() as demo:
    gr.Markdown("## Theatre Programmer")
    with gr.Row():
        with gr.Column():
            image_in = gr.Image(type="pil")
            lang = gr.Dropdown(choices, value="eng")
            btn = gr.Button("Run")
        with gr.Column():
            ocr_text = gr.TextArea(label="OCR output")
        with gr.Column():
            ner = gr.TextArea(label="Named entities")
        # TODO: consider showing the entities in a gr.CheckboxGroup column
        # instead of (or alongside) the plain text area.
    with gr.Row():
        download_btn = gr.Button("Download output")

    # "Run" feeds the image and selected language to run() and fills both
    # text areas with its (ocr_text, named_entities) result.
    btn.click(fn=run, inputs=[image_in, lang], outputs=[ocr_text, ner])
    # The download button currently only logs the text lengths; it has no
    # output components.
    download_btn.click(fn=download_output, inputs=[ocr_text, ner], outputs=[])

# Start the server only when executed as a script, so importing this module
# does not launch it.
if __name__ == "__main__":
    demo.launch()
|