File size: 2,285 Bytes
f9d052c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import gradio as gr
import pandas as pd
from typing import Any
import datasets
from tqdm import tqdm
from huggingface_hub import login
import os
# Authenticate against the Hugging Face Hub with the token stored in the
# HF_Token environment variable.
# NOTE(review): when the variable is unset, None is passed to login();
# confirm that this is acceptable in the deployment environment.
login(os.environ.get("HF_Token"))

# Stream the test split instead of downloading it up front.
test = datasets.load_dataset(
    "minskiter/weibo",
    split=datasets.Split.TEST,
    streaming=True
)
# Converts integer class ids of the "labels" feature to their tag strings.
int2str = test.features["labels"].feature.int2str

# Number of dataset rows shown per page of the table.
page_size = 10
# One DataFrame per page, in dataset order.
pages = []

# Read the stream exactly once and cut it into pages. Calling
# skip(i * page_size) per page would restart the stream for every page.
rows = []
with tqdm(desc="load dataset") as bar:
    for row in test:
        rows.append(row)
        if len(rows) == page_size:
            pages.append(pd.DataFrame(rows))
            rows = []
            # One tick per completed page.
            bar.update(1)
    if rows:
        # Trailing partial page.
        pages.append(pd.DataFrame(rows))
        bar.update(1)

# Page currently displayed; replaced by show() when the user changes page.
cur = pages[0]

def show(page: float)-> pd.DataFrame:
    """Make ``page`` the current page and return its rows.

    Args:
        page: Zero-based page number coming from the number input. ``None``
            (an emptied input box) leaves the current page unchanged.
            Fractional values are truncated, and values outside the
            available range are clamped to the first or last page.

    Returns:
        The DataFrame of the page that is now current.
    """
    global cur
    # A cleared number box delivers None; int(None) would raise TypeError.
    if page is None:
        return cur
    # Clamp so that negative numbers do not wrap around to the end of the
    # list and too-large numbers do not raise IndexError.
    index = min(max(int(page), 0), len(pages) - 1)
    cur = pages[index]
    return cur

def getobj() -> dict[str, Any]:
    """Return a new, empty entity record.

    The record carries an empty ``word`` list, ``start`` and ``end`` set to
    -1, and the ``entity`` type ``"O"``. Every call builds a fresh dict and
    a fresh list, so records never share state.
    """
    blank = dict(word=[], start=-1, end=-1, entity="O")
    return blank

def _decode_entities(chars, tags):
    """Group per-position tags into entity spans.

    The first character of each tag is its position marker: ``B`` opens a
    span, ``I``/``M``/``E`` extend the open span, ``S`` is a one-position
    span and ``O`` closes whatever is open. The text after the last ``-``
    of a tag is the entity type. Tags with any other marker are ignored.

    Args:
        chars: Items of the sentence, indexable by position.
        tags: Tag strings, one per position.

    Returns:
        A list of dicts with the keys ``word`` (joined text of the span),
        ``start``, ``end`` (exclusive) and ``entity``.
    """
    entities = []
    span = None  # entity currently being extended, if any

    def close():
        nonlocal span
        span["word"] = "".join(span["word"])
        entities.append(span)
        span = None

    for pos, tag in enumerate(tags):
        marker = tag[0]
        continues = marker in ("I", "M", "E")
        # A continuation tag with nothing to continue opens a span of its
        # own, so a malformed sequence never yields an entity whose start
        # is still -1 and whose type is still "O".
        opens = marker in ("B", "S") or (continues and span is None)

        if span is not None and (opens or marker == "O"):
            close()

        if opens:
            span = getobj()
            span["start"] = pos
            span["end"] = pos + 1
            span["word"].append(chars[pos])
            span["entity"] = tag.split("-")[-1]
            if marker == "S":
                # Single-position entity: complete as soon as it starts.
                close()
        elif continues:
            span["word"].append(chars[pos])
            span["end"] = pos + 1

    # Close a span that runs up to the end of the sequence.
    if span is not None:
        close()
    return entities


def showIter(evt: gr.SelectData)->dict[str, Any]:
    """Build the highlighted-text payload for the selected table row.

    Args:
        evt: Selection event; the first element of ``evt.index`` is used as
            the row position within the current page.

    Returns:
        A dict with ``text`` (the row's text joined into one string) and
        ``entities`` (the spans decoded from the row's labels).
    """
    row_index = evt.index[0]
    # Read the two cells directly rather than converting the whole page to
    # nested lists. As before, the columns are addressed by position:
    # column 0 is taken as the text items and column 1 as the label ids.
    text = cur.iat[row_index, 0]
    labels = cur.iat[row_index, 1]
    tags = int2str(list(map(int, labels)))
    return {"text": "".join(text), "entities": _decode_entities(text, tags)}

# Layout: a column with the page table and the page selector on the left,
# the entity preview for the selected row on the right.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            output = gr.DataFrame(value=cur)
            # Start at 0 so the selector matches the page shown initially,
            # and use precision=0 so only whole page numbers are delivered.
            page = gr.Number(
                value=0,
                precision=0,
                minimum=0,
                maximum=len(pages)-1,
                label="page",
            )
            # Changing the page number swaps the rows shown in the table.
            page.change(show, page, outputs=output)
        text = gr.HighlightedText(label="preview")
        # Selecting a table cell renders that row's entities in the preview.
        output.select(showIter,inputs=[], outputs=[text])

demo.launch()