# NOTE: the header below is page-scrape residue from a file viewer, not part of the program.
# File size: 1,492 bytes. Blame commits shown in the gutter: 9a7645a, f41c648.
# The viewer's gutter line numbers 1-51 correspond to the 51 lines of code that follow.
from typing import Iterator

import gradio as gr
import joblib
import numpy as np
import spacy
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

# Load the NLP pipeline and the serialized artifacts once, at import time, so that
# every call to predict() reuses the same objects. The './' paths are resolved
# against the current working directory.
# NOTE(review): joblib.load unpickles its input; only load artifact files you trust.

# spaCy pipeline used by lemmatize() to tokenize and lemmatize text.
nlp = spacy.load('en_core_web_sm')
# Object whose transform() turns the lemmatized text into model features in predict()
# (presumably a fitted TF-IDF vectorizer; verify against the training code).
tfidf = joblib.load('./tfidf.joblib')
# Estimator on which predict() calls .predict() and .predict_proba().
model = joblib.load('./model.joblib')
# Tag binarizer: predict() uses its inverse_transform() and its .ts.count mapping
# (presumably a project-specific class; its definition is not visible in this file).
tags_binarizer = joblib.load('./tags.joblib')

def lemmatize(s: str) -> Iterator[str]:
    """Return the lower-cased lemmas of the content tokens in *s*.

    The text is processed with the module-level spaCy pipeline ``nlp``.
    Tokens that spaCy flags as whitespace, punctuation, stop words or
    digits are dropped; every remaining token contributes its lemma,
    lower-cased.

    Args:
        s: Raw text to process.

    Returns:
        A lazy, single-use iterator over the lemma strings, in document
        order. Wrap it in ``list(...)`` if it must be traversed twice.
    """
    # Run the pipeline eagerly (not inside the generator) so the spaCy work
    # happens at call time, exactly as callers of the previous version expect.
    doc = nlp(s)

    # Filter and lemmatize in a single lazy pass over the tokens.
    return (
        token.lemma_.lower()
        for token in doc
        if not (token.is_space or token.is_punct or token.is_stop or token.is_digit)
    )

def predict(title: str, post: str, predict_proba: bool):
    """Predict tags for a post from its title and body.

    The title and body are joined with a single space, lemmatized with
    ``lemmatize`` and vectorized with the module-level ``tfidf`` object
    before being passed to the module-level ``model``.

    Args:
        title: Title of the post.
        post: Body of the post.
        predict_proba: When true, return per-tag scores instead of the
            decoded tags.

    Returns:
        If ``predict_proba`` is true, a list of ``(tag, score)`` pairs built
        from the key-sorted ``tags_binarizer.ts.count`` mapping and the first
        row of ``model.predict_proba``. Otherwise, whatever
        ``tags_binarizer.inverse_transform`` returns for ``model.predict``.
    """
    # The vectorizer receives a one-element array holding the space-joined lemmas.
    document = ' '.join(lemmatize(title + " " + post))
    features = tfidf.transform(np.array([document]))

    if not predict_proba:
        # Decode the binary prediction back into tag names.
        return tags_binarizer.inverse_transform(model.predict(features))

    # Only one document was submitted, so keep the first (and only) row.
    scores = model.predict_proba(features)[0]
    # NOTE(review): assumes the key-sorted tags line up with the model's output
    # columns; confirm against the training code.
    ordered_counts = dict(sorted(tags_binarizer.ts.count.items()))
    return list(zip(list(ordered_counts), scores))

# Gradio UI wiring: the three input components are passed to predict() in list
# order (title, post, predict_proba) and its return value is shown in a text box.
demo = gr.Interface(
    fn=predict,
    inputs=[
        # Single-line box for the post title.
        gr.Textbox(label="Title", lines=1, placeholder="Title..."),
        # Multi-line box for the post body.
        gr.Textbox(label="Post", lines=10, placeholder="Post..."),
        # Checked -> predict() returns (tag, score) pairs instead of decoded tags.
        gr.Checkbox(label="Proba?")],
    outputs=gr.Textbox(lines=10))

# Start the app. This runs at module level, so importing this file also launches it.
demo.launch()