Rubens committed on
Commit
6350524
1 Parent(s): fcec16f

first commit valinhos

Files changed (4)
  1. README.md +5 -6
  2. app.py +267 -0
  3. requirements.txt +9 -0
  4. valinhos_vagas_portugues_pt-BR.csv +0 -0
README.md CHANGED
@@ -1,13 +1,12 @@
  ---
- title: Spaces Valinhos
- emoji: 🏃
- colorFrom: red
- colorTo: blue
+ title: Recruiting
+ emoji: 🐢
+ colorFrom: pink
+ colorTo: indigo
  sdk: gradio
- sdk_version: 3.19.1
+ sdk_version: 3.15.0
  app_file: app.py
  pinned: false
- license: apache-2.0
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,267 @@
+ import os
+ from typing import Dict, Text
+ 
+ import gradio as gr
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+ import scann
+ import tensorflow as tf
+ import tensorflow_recommenders as tfrs  # scann 1.2.7 + tensorflow-recommenders 0.7.0 + TF 2.9.1 (see requirements.txt)
+ 
+ # Load the Valinhos job-postings dataset (semicolon-separated, pt-BR).
+ df = pd.read_csv(
+     '/home/user/app/valinhos_vagas_portugues_pt-BR.csv', sep=';', header=0)
+ 
+ df = df.drop_duplicates()
+ df = df.dropna()
+ 
+ # Normalize job titles; truncate requirement text to 1000 chars and
+ # localize it ("espanhol" -> "portugues", "colombia" -> "valinhos").
+ df["nome_vaga"] = df["nome_vaga"].map(lambda x: x.lower().title())
+ df["requisito"] = df["requisito"].map(lambda x: x[0:1000].lower().replace(
+     "espanhol", "portugues").replace("colombia", "valinhos"))
+ 
+ # 90/10 split: the last 10% of rows is held out as a "blind" set.
+ my_dict = dict(df.iloc[0:int(df.shape[0]*0.9), :])
+ my_dict_cego = dict(df.iloc[int(df.shape[0]*0.9):, :])
+ 
+ # Each example pairs a job code with its tokenized requirement text.
+ ratings = tf.data.Dataset.from_tensor_slices(my_dict).map(lambda x: {
+     "code": x["code"],
+     "nome_vaga": x["nome_vaga"],
+     "requisito": tf.strings.split(x["requisito"], maxsplit=101)
+ })
+ 
+ # Sanity check: length of the shortest tokenized requirement.
+ lengths = [len(x['requisito']) for x in ratings.as_numpy_iterator()]
+ min(lengths)
+ 
+ # Candidate set: the job codes.
+ movies = tf.data.Dataset.from_tensor_slices(dict(df)).map(lambda x: {
+     "code": x["code"],
+     "nome_vaga": x["nome_vaga"]
+ })
+ movies = movies.map(lambda x: x["code"])
+ 
+ ratings_cego = tf.data.Dataset.from_tensor_slices(my_dict_cego).map(lambda x: {
+     "code": x["code"],
+     "requisito": tf.strings.split(x["requisito"], maxsplit=101)
+ })
+ 
+ tf.random.set_seed(42)
+ shuffled = ratings.shuffle(
+     int(df.shape[0]*0.9), seed=42, reshuffle_each_iteration=False)
+ shuffled2 = ratings_cego.shuffle(
+     int(df.shape[0]*0.1), seed=42, reshuffle_each_iteration=False)
+ 
+ train = shuffled.take(int(df.shape[0]*0.9))
+ # Note: this test slice overlaps train, which takes the whole 90% split.
+ test = shuffled.take(int(df.shape[0]*0.1))
+ cego = shuffled2  # blind hold-out; not used further below
+ 
+ movie_titles = movies
+ user_ids = ratings.map(lambda x: x["requisito"])
+ 
+ xx = list(user_ids.as_numpy_iterator())
+ 
+ # Vocabularies for the two towers: job codes and requirement tokens.
+ unique_movie_titles = np.unique(list(movie_titles.as_numpy_iterator()))
+ unique_user_ids = np.unique(np.concatenate(xx))
+ 
+ user_ids = user_ids.batch(int(df.shape[0]*0.9))
+ 
+ embedding_dimension = 768
+ 
+ # Query tower: requirement-token lookup + embedding.
+ user_model = tf.keras.Sequential([
+     tf.keras.layers.StringLookup(
+         vocabulary=unique_user_ids, mask_token=None),
+     # One extra embedding row accounts for unknown tokens.
+     tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension),
+ ])
+ 
+ # Candidate tower: job-code lookup + embedding.
+ movie_model = tf.keras.Sequential([
+     tf.keras.layers.StringLookup(
+         vocabulary=unique_movie_titles, mask_token=None),
+     tf.keras.layers.Embedding(
+         len(unique_movie_titles) + 1, embedding_dimension)
+ ])
+ 
+ metrics = tfrs.metrics.FactorizedTopK(
+     candidates=movies.batch(df.shape[0]).map(movie_model)
+ )
+ 
+ task = tfrs.tasks.Retrieval(
+     metrics=metrics
+ )
+ 
+ 
+ class MovielensModel(tfrs.Model):
+ 
+     def __init__(self, user_model, movie_model):
+         super().__init__()
+         self.movie_model: tf.keras.Model = movie_model
+         self.user_model: tf.keras.Model = user_model
+         self.task: tf.keras.layers.Layer = task
+ 
+     def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
+         # Embed the requirement tokens and sum them into one query vector.
+         user_embeddings = self.user_model(features["requisito"])
+         # Embed the positive (matching) job codes.
+         positive_movie_embeddings = self.movie_model(features["code"])
+         # The retrieval task computes the loss and the metrics.
+         return self.task(tf.reduce_sum(user_embeddings, axis=1), positive_movie_embeddings)
+ 
+ 
+ # Equivalent model without the TFRS base class; kept for reference, unused below.
+ class NoBaseClassMovielensModel(tf.keras.Model):
+ 
+     def __init__(self, user_model, movie_model):
+         super().__init__()
+         self.movie_model: tf.keras.Model = movie_model
+         self.user_model: tf.keras.Model = user_model
+         self.task: tf.keras.layers.Layer = task
+ 
+     def train_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
+         # Set up a gradient tape to record gradients.
+         with tf.GradientTape() as tape:
+             # Loss computation, mirroring compute_loss() above.
+             user_embeddings = self.user_model(features["requisito"])
+             positive_movie_embeddings = self.movie_model(features["code"])
+             loss = self.task(
+                 tf.reduce_sum(user_embeddings, axis=1), positive_movie_embeddings)
+ 
+             # Handle regularization losses as well.
+             regularization_loss = sum(self.losses)
+             total_loss = loss + regularization_loss
+ 
+         gradients = tape.gradient(total_loss, self.trainable_variables)
+         self.optimizer.apply_gradients(
+             zip(gradients, self.trainable_variables))
+ 
+         metrics = {metric.name: metric.result() for metric in self.metrics}
+         metrics["loss"] = loss
+         metrics["regularization_loss"] = regularization_loss
+         metrics["total_loss"] = total_loss
+         return metrics
+ 
+     def test_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
+         # Loss computation only; no gradient updates.
+         user_embeddings = self.user_model(features["requisito"])
+         positive_movie_embeddings = self.movie_model(features["code"])
+         loss = self.task(
+             tf.reduce_sum(user_embeddings, axis=1), positive_movie_embeddings)
+ 
+         # Handle regularization losses as well.
+         regularization_loss = sum(self.losses)
+         total_loss = loss + regularization_loss
+ 
+         metrics = {metric.name: metric.result() for metric in self.metrics}
+         metrics["loss"] = loss
+         metrics["regularization_loss"] = regularization_loss
+         metrics["total_loss"] = total_loss
+         return metrics
+ 
+ 
+ model = MovielensModel(user_model, movie_model)
+ model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.08))
+ 
+ cached_train = train.shuffle(
+     int(df.shape[0]*0.9)).batch(int(df.shape[0]*0.9)).cache()
+ cached_test = test.batch(int(df.shape[0]*0.1)).cache()
+ 
+ path = os.path.join("/home/user/app/", "model/")
+ 
+ cp_callback = tf.keras.callbacks.ModelCheckpoint(
+     filepath=path,
+     verbose=1,
+     save_weights_only=True,
+     save_freq=2)
+ 
+ model.fit(cached_train, callbacks=[cp_callback], epochs=120)
+ model.evaluate(cached_test, return_dict=True)
+ 
+ # Embed every job code so ScaNN can index the candidates.
+ index = df["code"].map(lambda x: [model.movie_model(tf.constant(x))])
+ 
+ indice = []
+ for i in range(df.shape[0]):  # was hard-coded to 1633 rows
+     indice.append(np.array(index)[i][0])
+ 
+ # Approximate nearest-neighbour search over the job embeddings.
+ # score_brute_force() takes only a quantize flag.
+ searcher = scann.scann_ops_pybind.builder(np.array(indice), 10, "dot_product").tree(
+     num_leaves=1500, num_leaves_to_search=500,
+     training_sample_size=df.shape[0]).score_brute_force(quantize=True).build()
+ 
+ 
+ def predict(text):
+     campos = str(text).lower()
+     # Sum the token embeddings of the input into a single query vector.
+     query = np.sum([model.user_model(tf.constant(tok))
+                     for tok in campos.split()], axis=0)
+     neighbors, distances = searcher.search_batched([query])
+     xx = df.iloc[neighbors[0], :].nome_vaga
+     fig = plt.figure(figsize=(14, 9))
+     plt.bar(list(xx), distances[0]*0.8*10)  # scores scaled for display
+     plt.title('Degree of match')
+     plt.xlabel('Labels')
+     plt.xticks(rotation=270)
+     plt.ylabel('Distances')
+     for x, y in zip(range(10), distances[0]*0.8*10):
+         plt.text(x, y, y, ha='center', va='bottom', fontsize=12, color='black')
+     return xx, fig
+ 
+ 
+ demo = gr.Interface(
+     fn=predict,
+     inputs=gr.Textbox(
+         label='SUAS COMPETÊNCIAS E EXPERIÊNCIA - Clique *Clear* antes de entrar o Input'),
+     outputs=[gr.Textbox(label='VAGAS SUGERIDAS'), gr.Plot()],
+     css='div {margin-left: auto; margin-right: auto; width: 100%; '
+         'background: url("https://drive.google.com/uc?export=view&id=1KNnISAUcvh2Pt08f-EJZJYCIgkrKw3PI") repeat 0 0;}'
+ ).launch(share=False)
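A quick local smoke test of the retrieval path (hypothetical query text; assumes app.py has already run and built `model` and `searcher`):

    vagas, fig = predict("python tensorflow vendas atendimento")
    print(list(vagas))        # ten suggested job titles (nome_vaga)
    fig.savefig("match.png")  # bar chart of the match scores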
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ nltk==3.6.5
+ pandas==1.3.4
+ numpy==1.22.4
+ matplotlib==3.4.3
+ unidecode==1.2.0
+ tensorflow==2.9.1
+ scann==1.2.7
+ tensorflow-recommenders==0.7.0
+ 
valinhos_vagas_portugues_pt-BR.csv ADDED
The diff for this file is too large to render. See raw diff