File size: 4,710 Bytes
8505d54
84b8f8b
57a7aa0
151824c
 
8505d54
 
 
57a7aa0
8505d54
84b8f8b
 
 
 
8505d54
84b8f8b
57a7aa0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84b8f8b
 
 
 
 
 
 
 
 
 
8505d54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84b8f8b
8505d54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84b8f8b
9de8217
151824c
8505d54
 
84b8f8b
8505d54
5b88edd
84b8f8b
 
 
 
 
5b88edd
 
84b8f8b
 
8505d54
84b8f8b
 
 
 
 
5b88edd
 
8505d54
5b88edd
84b8f8b
 
 
5b88edd
84b8f8b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import re
import os
import gc

from cleantext import clean
import gradio as gr
from tqdm.auto import tqdm
from transformers import pipeline
from transformers import AutoModelForSequenceClassification, AutoTokenizer


checker_model_name = "textattack/roberta-base-CoLA"
corrector_model_name = "pszemraj/flan-t5-large-grammar-synthesis"

# pipelines


if os.environ.get("HF_DEMO_NO_USE_ONNX") is None:
    from optimum.bettertransformer import BetterTransformer

    model_hf = AutoModelForSequenceClassification.from_pretrained(checker_model_name)
    tokenizer = AutoTokenizer.from_pretrained(checker_model_name)
    model = BetterTransformer.transform(model_hf, keep_original_model=False)

    checker = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
    )
else:
    checker = pipeline(
        "text-classification",
        checker_model_name,
    )
gc.collect()
if os.environ.get("HF_DEMO_NO_USE_ONNX") is None:
    # load onnx runtime unless HF_DEMO_NO_USE_ONNX is set
    from optimum.pipelines import pipeline

    corrector = pipeline(
        "text2text-generation", model=corrector_model_name, accelerator="ort"
    )
else:
    corrector = pipeline("text2text-generation", corrector_model_name)


def split_text(text: str) -> list:
    # Split the text into sentences using regex
    sentences = re.split(r"(?<=[^A-Z].[.?]) +(?=[A-Z])", text)

    # Initialize a list to store the sentence batches
    sentence_batches = []

    # Initialize a temporary list to store the current batch of sentences
    temp_batch = []

    # Iterate through the sentences
    for sentence in sentences:
        # Add the sentence to the temporary batch
        temp_batch.append(sentence)

        # If the length of the temporary batch is between 2 and 3 sentences, or if it is the last batch, add it to the list of sentence batches
        if len(temp_batch) >= 2 and len(temp_batch) <= 3 or sentence == sentences[-1]:
            sentence_batches.append(temp_batch)
            temp_batch = []

    return sentence_batches


def correct_text(text: str, checker, corrector, separator: str = " ") -> str:
    # Split the text into sentence batches
    sentence_batches = split_text(text)

    # Initialize a list to store the corrected text
    corrected_text = []

    # Iterate through the sentence batches
    for batch in tqdm(
        sentence_batches, total=len(sentence_batches), desc="correcting text.."
    ):
        # Join the sentences in the batch into a single string
        raw_text = " ".join(batch)

        # Check the grammar quality of the text using the text-classification pipeline
        results = checker(raw_text)

        # Only correct the text if the results of the text-classification are not LABEL_1 or are LABEL_1 with a score below 0.9
        if results[0]["label"] != "LABEL_1" or (
            results[0]["label"] == "LABEL_1" and results[0]["score"] < 0.9
        ):
            # Correct the text using the text-generation pipeline
            corrected_batch = corrector(raw_text)
            corrected_text.append(corrected_batch[0]["generated_text"])
        else:
            corrected_text.append(raw_text)

    # Join the corrected text into a single string
    corrected_text = separator.join(corrected_text)

    return corrected_text


def update(text: str):
    text = clean(text[:4000], lower=False)
    return correct_text(text, checker, corrector)


with gr.Blocks() as demo:
    gr.Markdown("# <center>Robust Grammar Correction with FLAN-T5</center>")
    gr.Markdown(
        "**Instructions:** Enter the text you want to correct in the textbox below (_text will be truncated to 4000 characters_). Click 'Process' to run."
    )
    gr.Markdown(
        """Models:
    - `textattack/roberta-base-CoLA` for grammar quality detection
    - `pszemraj/flan-t5-large-grammar-synthesis` for grammar correction
    """
    )
    with gr.Row():
        inp = gr.Textbox(
            label="input",
            placeholder="PUT TEXT TO CHECK & CORRECT BROSKI",
            value="I wen to the store yesturday to bye some food. I needd milk, bread, and a few otter things. The store was really crowed and I had a hard time finding everyting I needed. I finaly made it to the check out line and payed for my stuff.",
        )
        out = gr.Textbox(label="output", interactive=False)
    btn = gr.Button("Process")
    btn.click(fn=update, inputs=inp, outputs=out)
    gr.Markdown("---")
    gr.Markdown(
        "- see the [model card](https://huggingface.co/pszemraj/flan-t5-large-grammar-synthesis) for more info"
    )
    gr.Markdown("- if experiencing long wait times, feel free to duplicate the space!")
demo.launch()