File size: 2,810 Bytes
9a7645a
 
 
 
4b67ac0
9a7645a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b67ac0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9a7645a
 
 
 
 
4b67ac0
 
f41c648
4b67ac0
 
f41c648
4b67ac0
9a7645a
 
 
 
f41c648
4b67ac0
 
9a7645a
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from typing import Iterator

import gradio as gr
import joblib
import matplotlib.pyplot as plt
import numpy as np
import spacy
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

# Resources shared by every prediction; they are loaded once, when the module is imported.
# spaCy pipeline that lemmatize() uses to tokenize the input text.
nlp = spacy.load('en_core_web_sm')
# NOTE(review): joblib.load unpickles its input, so the three artifact files below
# must come from a trusted source.
# Vectorizer applied to the lemmatized text in predict()
# (presumably a fitted TfidfVectorizer — confirm against the training code).
tfidf = joblib.load('./tfidf.joblib')
# Estimator queried through predict() and predict_proba().
model = joblib.load('./model.joblib')
# Object that turns the model's output back into tag names via inverse_transform().
tags_binarizer = joblib.load('./tags.joblib')

def lemmatize(s: str) -> Iterator[str]:
    """Return the lower-cased lemmas of the meaningful tokens found in *s*.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    flagged as whitespace, punctuation, stop words or digits are dropped.

    Args:
        s: Raw text to process.

    Returns:
        A lazy, one-shot iterator over the remaining lemmas, in document
        order. Wrap it in ``list`` if it has to be traversed more than once.
    """
    # Tokenize eagerly: only the filtering/lemmatizing below is deferred.
    doc = nlp(s)

    return (
        token.lemma_.lower()
        for token in doc
        if not (
            token.is_space
            or token.is_punct
            or token.is_stop
            or token.is_digit
        )
    )

def plot(tags, proba):
    """Display a horizontal bar chart with one bar per tag.

    Args:
        tags: Bar labels; the first one ends up at the top of the chart.
        proba: Score of each tag, aligned with *tags*.
    """
    plt.style.use('dark_background')
    plt.rcParams['font.size'] = 16

    _, axes = plt.subplots(figsize=(12, 9))

    axes.barh(tags, proba, align='center', color='darkred')
    axes.set_yticks(tags, labels=tags)
    # Flip the axis so the labels read top-to-bottom.
    axes.invert_yaxis()
    axes.set_xlabel('Score')
    axes.set_title('Score/Tag')

    # Write each rounded score slightly left of the end of its bar.
    for row, score in enumerate(proba):
        axes.text(score - 0.065, row + 0.05, str(round(score, 2)))

    axes.set_xlim(0, 1)
    plt.show()

def predict_words(X):
    """Return the tags predicted for *X* as one string.

    Args:
        X: Feature matrix handed to ``model.predict``.

    Returns:
        The tags that ``tags_binarizer.inverse_transform`` yields for the
        first row of the prediction, separated by four spaces.
    """
    prediction = model.predict(X)
    first_row_tags = tags_binarizer.inverse_transform(prediction)[0]
    return "    ".join(first_row_tags)

def proba_chart(X, min_score=0.1):
    """Build a horizontal bar chart of the tags scoring at least *min_score*.

    Args:
        X: Feature matrix handed to ``model.predict_proba``; only the scores
            of its first row are charted.
        min_score: Lowest score a tag needs in order to appear on the chart.
            Defaults to 0.1, the threshold that used to be hard-coded.

    Returns:
        The matplotlib figure holding the chart, bars sorted from highest to
        lowest score. When no tag reaches *min_score* the figure shows a
        short notice instead of bars.
    """
    y_proba = model.predict_proba(X)[0]
    # NOTE(review): this assumes the sorted keys of ``ts.count`` line up with
    # the columns returned by predict_proba — confirm against the training code.
    tags = list(dict(sorted(tags_binarizer.ts.count.items())))

    # Keep the (tag, score) pairs that reach the threshold, best score first.
    data = sorted(
        (pair for pair in zip(tags, y_proba) if pair[1] >= min_score),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # build chart
    # NOTE(review): every call registers a new figure with pyplot and changes
    # the global style/rcParams; worth revisiting for a long-running server.
    plt.style.use('dark_background')
    plt.rcParams.update({'font.size': 16})

    fig, ax = plt.subplots(figsize=(12, 9))
    ax.set_xlabel('Score')
    ax.set_title('Score/Tag')

    if data:
        # we have our two dimensions for chart
        tags, proba = zip(*data)

        ax.barh(tags, proba, align='center', color='darkred')
        ax.set_yticks(tags, labels=tags)
        ax.invert_yaxis()  # labels read top-to-bottom

        # Write each rounded score slightly left of the end of its bar.
        for i, v in enumerate(proba):
            ax.text(v - 0.065, i + 0.05, str(round(v, 2)))
    else:
        # zip(*[]) produces nothing to unpack, so the unguarded version raised
        # ValueError here; show an explanatory empty chart instead.
        ax.set_yticks([])
        ax.text(
            0.5,
            0.5,
            f'No tag scored at least {min_score}',
            ha='center',
            va='center',
            transform=ax.transAxes,
        )

    ax.set_xlim(0, 1)

    return fig

def predict(title: str, post: str):
    """Predict tags for a post and chart the per-tag scores.

    Args:
        title: Title of the post.
        post: Body of the post.

    Returns:
        A ``(words, chart)`` pair: the string produced by ``predict_words``
        and the figure produced by ``proba_chart``.
    """
    # Title and body are treated as one document.
    full_text = title + " " + post

    # The vectorizer receives a one-element array holding the lemmatized text.
    lemmatized = ' '.join(lemmatize(full_text))
    X = tfidf.transform(np.array([lemmatized]))

    # predicted words first, then the proba chart
    return predict_words(X), proba_chart(X)

# Gradio UI: two text inputs (title and post) feed predict(), whose two return
# values are shown as a text box of tags and a plot.
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(label="Title", lines=1, placeholder="Title..."),
        gr.Textbox(label="Post", lines=20, placeholder="Post..."),
    ],
    outputs=[gr.Textbox(label="Tags"), gr.Plot()],
)

# Start the server only when the file is executed as a script, so that
# importing the module (e.g. to reuse predict) does not launch the app.
if __name__ == "__main__":
    demo.launch()