File size: 2,189 Bytes
2950aec
 
 
 
eb06a8d
2950aec
 
eb06a8d
2950aec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27ce743
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2950aec
 
 
 
 
 
 
 
27ce743
2950aec
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import gradio as gr
import json

from datetime import datetime
import demoji
from huggingface_hub import CommitScheduler
from pathlib import Path
import re
from transformers import pipeline
from uuid import uuid4

# Based on https://huggingface.co/spaces/Wauplin/space_to_dataset_saver/blob/main/app_json.py
# Data is saved at https://huggingface.co/datasets/MR17u/tweeteval-irony-mcc/tree/main

# Local folder that receives the JSON records; created at import time if it
# does not exist yet (parents included, no error when already present).
JSON_DATASET_DIR = Path("json_dataset")
JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)

# File the records are appended to. The name embeds a fresh uuid4 every time
# this module is executed, so each run writes to its own file.
JSON_DATASET_PATH = JSON_DATASET_DIR / f"data-{uuid4()}.json"

# Model identifier handed to the transformers pipeline below.
CLS_MODEL_NAME = "PierreEpron/tweeteval-irony-mcc"

# Scheduler bound to the local folder and to the "data" path of the
# "tweeteval-irony-mcc" dataset repo. Its `lock` attribute is taken by
# save_json() while writing.
# NOTE(review): presumably this pushes the folder to the Hub in the background
# at a default interval -- confirm against the huggingface_hub documentation.
scheduler = CommitScheduler(
    repo_id="tweeteval-irony-mcc",
    repo_type="dataset",
    folder_path=JSON_DATASET_DIR,
    path_in_repo="data",
)

# Pipeline built from CLS_MODEL_NAME with an explicitly chosen tokenizer;
# no task name is passed, so the task is left for transformers to resolve.
classifier = pipeline(model = CLS_MODEL_NAME, tokenizer = 'cardiffnlp/twitter-roberta-large-2022-154m')

def clean_brackets(text):
    """Return *text* with every curly brace swapped for the matching parenthesis."""
    brace_to_paren = str.maketrans({'{': '(', '}': ')'})
    return text.translate(brace_to_paren)

def clean_emojis(text, type:str = ''):
    """Handle the emojis found in *text* according to *type*.

    - ``'keep'``: return *text* untouched.
    - ``'rem'``: return ``demoji.replace(text, '')``.
    - anything else (including the default ``''``): return
      ``demoji.replace_with_desc(text, type)``, i.e. *type* itself is forwarded
      as the second argument of that call.
    """
    # Cheapest case first: nothing to do, demoji is not touched at all.
    if type == 'keep':
        return text
    if type == 'rem':
        return demoji.replace(text, '')
    return demoji.replace_with_desc(text, type)

def clean_hashtags(text, hashtags=('#irony', '#sarcasm', '#not')):
    """Strip the given hashtags from *text* and collapse runs of spaces.

    Matching is case-insensitive and literal: each hashtag is escaped before
    being used as a pattern, so characters such as ``.`` or ``+`` in a tag are
    not interpreted as regex syntax. A tag is removed only when it is not
    immediately followed by a word character, so ``#not`` no longer eats the
    start of ``#nothing``.

    Args:
        text: The string to clean.
        hashtags: Iterable of hashtag strings to remove. The default is an
            immutable tuple so it cannot be altered between calls.

    Returns:
        *text* without the listed hashtags, with every run of spaces reduced
        to a single space. Leading/trailing spaces are not stripped.
    """
    for hashtag in hashtags:
        # (?!\w) keeps longer tags that merely start with this one intact.
        pattern = re.escape(hashtag) + r'(?!\w)'
        text = re.sub(pattern, '', text, flags=re.I)
    return re.sub(r' +', ' ', text)

def clean_text(text):
    """Normalise *text* before it is handed to the classifier.

    Steps, in order: clean_brackets, clean_hashtags (default hashtags),
    clean_emojis (default mode), then runs of two or more spaces are collapsed
    to one and surrounding whitespace is stripped.
    """
    without_braces = clean_brackets(text)
    without_tags = clean_hashtags(without_braces)
    emoji_handled = clean_emojis(without_tags)
    single_spaced = re.sub(' {2,}', ' ', emoji_handled)
    return single_spaced.strip()

def save_json(entry: str, result) -> None:
    """Append one classification record to the local JSON-lines file.

    Args:
        entry: The text the user submitted.
        result: Text holding the classifier output, expected to be the string
            form of a list whose first item is a mapping with ``'label'`` and
            ``'score'`` keys.

    Raises:
        ValueError: If *result* can be parsed neither as a Python literal nor
            as JSON after quote swapping (``json.JSONDecodeError`` is a
            subclass of ``ValueError``).
        KeyError, IndexError: If the parsed value lacks the expected shape.
    """
    import ast  # local import so this block does not rely on a file-level change

    # The text is a Python literal (single-quoted keys), not JSON. Swapping
    # quotes breaks as soon as a value contains an apostrophe or a double
    # quote, so parse it as a literal first and keep the old quote-swapping
    # parse only as a fallback for input the literal parser rejects.
    try:
        predictions = ast.literal_eval(result)
    except (ValueError, TypeError, SyntaxError):
        predictions = json.loads(result.replace("'", '"'))
    top = predictions[0]

    # Build the record before touching the file so a malformed result never
    # opens (and thereby creates) the dataset file.
    record = {
        "entry": entry,
        "label": top['label'],
        "score": top['score'],
        "datetime": datetime.now().isoformat(),
    }

    # Hold the scheduler's lock while appending one JSON object per line.
    with scheduler.lock:
        with JSON_DATASET_PATH.open("a") as f:
            json.dump(record, f)
            f.write("\n")

def classif(text: str):
    """Clean *text* with clean_text and return what the classifier yields for it."""
    cleaned = clean_text(text)
    predictions = classifier(cleaned)
    return predictions

# UI definition: one row holding the input textbox and the classification
# textbox, followed by a submit button.
with gr.Blocks() as demo:
    with gr.Row():
        entry = gr.Textbox(label="Input")
        result = gr.Textbox(label="Classification")
    input_btn = gr.Button("Submit")
    # Clicking the button runs classif on the input text and writes its return
    # value into the classification textbox. The chained .success() call then
    # feeds both textbox values to save_json, which produces no UI output.
    # NOTE(review): going by its name, .success() should fire only when the
    # click handler finished without error -- confirm against the gradio docs.
    input_btn.click(fn=classif, inputs=entry, outputs=result).success(
        fn=save_json,
        inputs=[entry, result],
        outputs=None
        )

# Starts the app as soon as the module is executed (no __main__ guard).
demo.launch()