mikachou committed on
Commit
9a7645a
1 Parent(s): 821e7db

first working app with Tf-Idf

Browse files
Files changed (7) hide show
  1. .gitignore +5 -0
  2. app.py +42 -0
  3. model.joblib +3 -0
  4. requirements.txt +77 -0
  5. stack_overflow_functions.py +54 -0
  6. tags.joblib +3 -0
  7. tfidf.joblib +3 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ venv
2
+ node_modules/
3
+ package-lock.json
4
+ package.json
5
+ __pycache__
app.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import joblib
3
+ import spacy
4
+ import numpy as np
5
+ from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
6
+ from sklearn.preprocessing import MultiLabelBinarizer
7
+ from sklearn.base import BaseEstimator, TransformerMixin
8
+
9
# Artifacts are loaded once, at import time, and reused by every call to
# `predict` below.
# NOTE(review): joblib.load is pickle-based; only load files you trust.

# spaCy pipeline used for tokenization and lemmatization.
nlp = spacy.load("en_core_web_sm")

# Persisted objects: text vectorizer, classifier and tag binarizer
# (presumably fitted at training time -- confirm against the training code).
tfidf = joblib.load("./tfidf.joblib")
model = joblib.load("./model.joblib")
tags_binarizer = joblib.load("./tags.joblib")
13
+
14
def lemmatize(s: str) -> iter:
    """Return a lazy iterator over the lower-cased lemmas of ``s``.

    The text is parsed right away with the module-level spaCy pipeline
    ``nlp``; the lemmas themselves are produced on demand. Tokens flagged
    by spaCy as whitespace, punctuation, stop words or digits are skipped.

    Args:
        s: Raw text to process.

    Returns:
        An iterator of lower-cased lemma strings, in document order.
    """
    doc = nlp(s)

    def is_noise(token) -> bool:
        # Anything that carries no topical information for the classifier.
        return (
            token.is_space
            or token.is_punct
            or token.is_stop
            or token.is_digit
        )

    return (token.lemma_.lower() for token in doc if not is_noise(token))
23
+
24
def predict(title: str, post: str):
    """Predict tags for a question from its title and body.

    The title and post are concatenated, lemmatized, vectorized with the
    module-level ``tfidf`` object and classified with ``model``; the
    binary prediction is mapped back to labels by ``tags_binarizer``.

    Args:
        title: Title of the question.
        post: Body of the question.

    Returns:
        Whatever ``tags_binarizer.inverse_transform`` yields for the single
        input document (presumably a one-element list holding a tuple of
        tag names -- confirm against the persisted binarizer).
    """
    # One space-separated string of lemmas, wrapped in a 1-element array
    # because the vectorizer works on a collection of documents.
    document = " ".join(lemmatize(title + " " + post))
    features = tfidf.transform(np.array([document]))

    predicted = model.predict(features)
    return tags_binarizer.inverse_transform(predicted)
34
+
35
# Web UI: a one-line title box and a ten-line post box feed `predict`;
# its return value is shown in a ten-line output box.
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(lines=1, placeholder="Title..."),
        gr.Textbox(lines=10, placeholder="Post..."),
    ],
    outputs=gr.Textbox(lines=10),
)

# Start serving the interface as soon as the module is executed.
demo.launch()
model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df6f5341aa2cc2d2223bbe960deadfc9f42de174040415429b2ca5e9fb0c5ba7
3
+ size 2355322
requirements.txt ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.8.1
2
+ aiosignal==1.2.0
3
+ analytics-python==1.4.0
4
+ anyio==3.6.1
5
+ asgiref==3.5.2
6
+ async-timeout==4.0.2
7
+ attrs==21.4.0
8
+ backoff==1.10.0
9
+ bcrypt==3.2.2
10
+ blis==0.7.7
11
+ catalogue==2.0.7
12
+ certifi==2022.5.18
13
+ cffi==1.15.0
14
+ charset-normalizer==2.0.12
15
+ click==8.1.3
16
+ cryptography==37.0.2
17
+ cycler==0.11.0
18
+ cymem==2.0.6
19
+ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl
20
+ fastapi==0.78.0
21
+ ffmpy==0.3.0
22
+ fonttools==4.33.3
23
+ frozenlist==1.3.0
24
+ gradio==3.0.2
25
+ h11==0.13.0
26
+ idna==3.3
27
+ Jinja2==3.1.2
28
+ joblib==1.1.0
29
+ kiwisolver==1.4.2
30
+ langcodes==3.3.0
31
+ linkify-it-py==1.0.3
32
+ markdown-it-py==2.1.0
33
+ MarkupSafe==2.1.1
34
+ matplotlib==3.5.2
35
+ mdit-py-plugins==0.3.0
36
+ mdurl==0.1.1
37
+ monotonic==1.6
38
+ multidict==6.0.2
39
+ murmurhash==1.0.7
40
+ numpy==1.22.3
41
+ orjson==3.6.8
42
+ packaging==21.3
43
+ pandas==1.4.2
44
+ paramiko==2.11.0
45
+ pathy==0.6.1
46
+ Pillow==9.1.1
47
+ preshed==3.0.6
48
+ pycparser==2.21
49
+ pycryptodome==3.14.1
50
+ pydantic==1.8.2
51
+ pydub==0.25.1
52
+ PyNaCl==1.5.0
53
+ pyparsing==3.0.9
54
+ python-dateutil==2.8.2
55
+ python-multipart==0.0.5
56
+ pytz==2022.1
57
+ requests==2.27.1
58
+ scikit-learn==1.0.2
59
+ scipy==1.8.1
60
+ six==1.16.0
61
+ smart-open==5.2.1
62
+ sniffio==1.2.0
63
+ spacy==3.3.0
64
+ spacy-legacy==3.0.9
65
+ spacy-loggers==1.0.2
66
+ srsly==2.4.3
67
+ starlette==0.19.1
68
+ thinc==8.0.16
69
+ threadpoolctl==3.1.0
70
+ tqdm==4.64.0
71
+ typer==0.4.1
72
+ typing_extensions==4.2.0
73
+ uc-micro-py==1.0.1
74
+ urllib3==1.26.9
75
+ uvicorn==0.17.6
76
+ wasabi==0.9.1
77
+ yarl==1.7.2
stack_overflow_functions.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from sklearn.feature_extraction.text import CountVectorizer
3
+ from sklearn.base import BaseEstimator, TransformerMixin
4
+ from sklearn.preprocessing import MultiLabelBinarizer
5
+
6
def top_topics(tags_list: iter, part: float) -> dict:
    """Count tag occurrences and keep only the sufficiently frequent tags.

    Args:
        tags_list: Iterable of strings, each holding the
            whitespace-separated tags of one document.
        part: Minimum frequency, expressed as a fraction of the number of
            documents, that a tag must reach to be kept.

    Returns:
        A dict mapping each retained tag (as tokenized by
        ``CountVectorizer``) to its total count, ordered by decreasing
        count. A tag is retained when ``count >= part * n_documents``.
    """
    # Materialize the input once: a one-shot iterator would be exhausted by
    # fit_transform, which made the threshold below silently collapse to 0.
    documents = list(tags_list)

    # Raw string: '\S' in a plain literal is an invalid escape sequence.
    cv = CountVectorizer(token_pattern=r'\S+')
    tags_vect = cv.fit_transform(documents)

    # Column sums straight from the sparse matrix; densifying the whole
    # document-term matrix first is needlessly memory hungry.
    totals = np.asarray(tags_vect.sum(axis=0)).ravel().tolist()

    # Computed once instead of once per candidate tag.
    threshold = part * len(documents)

    ranked = sorted(
        zip(cv.get_feature_names_out(), totals),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return {tag: count for tag, count in ranked if count >= threshold}
11
+
12
def simplified_tags(orig_tags: list, allowed_tags: list, alternative: str = None, only_empty: bool = False) -> list:
    """Restrict a list of tags to an allowed vocabulary.

    Args:
        orig_tags: Tags attached to one document.
        allowed_tags: Iterable of tags that may be kept.
        alternative: Optional placeholder tag (e.g. ``"other"``) standing in
            for the tags that were dropped. ``None`` disables it.
        only_empty: When true, ``alternative`` is added only if no original
            tag survived; otherwise it is added as soon as at least one
            distinct original tag was dropped.

    Returns:
        The distinct allowed tags, in their order of first appearance in
        ``orig_tags``, followed by ``alternative`` when applicable.
    """
    allowed = set(allowed_tags)

    # De-duplicate while preserving order, so the result is deterministic
    # (a raw set intersection is not) and repeated tags are not mistaken
    # for dropped ones in the length comparison below.
    unique_tags = list(dict.fromkeys(orig_tags))
    kept = [tag for tag in unique_tags if tag in allowed]

    if alternative is not None:
        dropped_any = len(kept) < len(unique_tags)
        if (only_empty and not kept) or (not only_empty and dropped_any):
            kept.append(alternative)  # default = "other"

    return kept
23
+
24
class TagsSimplifier(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the frequent tags of each document.

    ``fit`` stores the result of ``top_topics`` in ``self.count``;
    ``transform`` filters every tag string against the keys of that dict.
    """

    def __init__(self, part=0.01):
        # Frequency threshold forwarded to `top_topics` when fitting.
        self.part = part

    def fit(self, X, y=None):
        """Compute the frequent-tag counts from ``X`` and return ``self``."""
        self.count = top_topics(X, self.part)
        return self

    def transform(self, X, y=None):
        """Split each tag string and drop the tags absent from ``self.count``.

        Relies on ``X.apply`` and ``.values``, so ``X`` is presumably a
        pandas Series of whitespace-separated tag strings -- confirm with
        callers.
        """
        def keep_frequent(tags):
            return simplified_tags(tags.split(), self.count.keys())

        return X.apply(keep_frequent).values

    def inverse_transform(self, X, y=None):
        """Return ``X`` unchanged; the simplification is not reversible."""
        return X
37
+
38
class TagsBinarizer(BaseEstimator, TransformerMixin):
    """Simplify tag strings, then encode them as a multi-label indicator.

    Chains a ``TagsSimplifier`` (``self.ts``) with a
    ``MultiLabelBinarizer`` (``self.mlb``).
    """

    def __init__(self, part=0.01):
        self.part = part
        # Both sub-transformers are created up front and fitted in `fit`.
        self.ts = TagsSimplifier(part=part)
        self.mlb = MultiLabelBinarizer()

    def fit(self, X, y=None):
        """Fit the simplifier on ``X``, then the binarizer on its output."""
        self.mlb.fit(self.ts.fit_transform(X))
        return self

    def transform(self, X, y=None):
        """Return the binarized form of the simplified tags of ``X``."""
        return self.mlb.transform(self.ts.transform(X))

    def inverse_transform(self, X, y=None):
        """Map a binary indicator back to tags via the binarizer only."""
        return self.mlb.inverse_transform(X)
tags.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b499dfe2b050eff9f02a6eb42567fbcdeb64c1b259038e6226781c2cbcffc5b
3
+ size 1107
tfidf.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95ca7956c176afbb3de6eddad6c0079ca542129f8d779e8b767a1d224ef482e6
3
+ size 268451