Rubens committed on
Commit 7acabc8
Parent: aa90667

first commit ours

Files changed (3)
  1. Qatar_translated_best_2500.csv +0 -0
  2. app.py +185 -0
  3. requirements.txt +9 -0
Qatar_translated_best_2500.csv ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,185 @@
+ import os
+ import pprint
+ import tempfile
+ from typing import Dict, Text
+ import numpy as np
+ import tensorflow as tf
+ import tensorflow_recommenders as tfrs
+ import unidecode
+ from nltk import word_tokenize
+ import re
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ from nltk.util import ngrams
+ import base64
+ import hashlib
+ import gradio as gr
+ import scann
+
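+ # --- Data preparation ---
+ # Load the translated Qatar vacancies CSV, drop duplicate and missing rows,
+ # title-case the job names ("nome_vaga") and lower-case/truncate the
+ # requirements text ("requisito"); the last 10% of rows are held out as a
+ # blind set ("cego" is Portuguese for blind).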
+ df = pd.read_csv("/home/user/app/Qatar_translated_best_2500.csv", sep=",", header=0)
+ df = df.drop_duplicates()
+ df = df.dropna()
+ df["nome_vaga"] = df["nome_vaga"].map(lambda x: x.lower().title())
+ df["requisito"] = df["requisito"].map(lambda x: x[0:1000].lower())
+ my_dict = dict(df.iloc[0:int(df.shape[0] * 0.9), :])
+ my_dict_cego = dict(df.iloc[int(df.shape[0] * 0.9):, :])
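+
+ # --- tf.data datasets ---
+ # "ratings" pairs each vacancy code with its tokenised requirements text;
+ # "movies" is the candidate corpus of vacancy codes. The MovieLens naming is
+ # kept from the TFRS retrieval tutorial this script follows.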
+ ratings = tf.data.Dataset.from_tensor_slices(my_dict).map(lambda x: {
+     "code": x["code"],
+     "nome_vaga": x["nome_vaga"],
+     "requisito": tf.strings.split(x["requisito"], maxsplit=106)
+ })
+ movies = tf.data.Dataset.from_tensor_slices(dict(df)).map(lambda x: {
+     "code": x["code"],
+     "nome_vaga": x["nome_vaga"]
+ })
+ movies = movies.map(lambda x: x["code"])
+ ratings_cego = tf.data.Dataset.from_tensor_slices(my_dict_cego).map(lambda x: {
+     "code": x["code"],
+     "requisito": tf.strings.split(x["requisito"], maxsplit=106)
+ })
+ tf.random.set_seed(42)
+ shuffled = ratings.shuffle(int(df.shape[0] * 0.9), seed=42, reshuffle_each_iteration=False)
+ shuffled2 = ratings_cego.shuffle(int(df.shape[0] * 0.1), seed=42, reshuffle_each_iteration=False)
+ # Disjoint train/test split: skip the training rows before taking the test set.
+ train = shuffled.take(int(df.shape[0] * 0.8))
+ test = shuffled.skip(int(df.shape[0] * 0.8)).take(int(df.shape[0] * 0.1))
+ cego = shuffled2
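+
+ # --- Vocabularies ---
+ # Each requirement token serves as a "user" id and each vacancy code as a
+ # candidate id for the StringLookup layers below.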
+ movie_titles = movies
+ user_ids = ratings.map(lambda x: x["requisito"])
+ xx = list(user_ids.as_numpy_iterator())
+ unique_movie_titles = np.unique(list(movie_titles.as_numpy_iterator()))
+ unique_user_ids = np.unique(np.concatenate(xx))
+ user_ids = user_ids.batch(int(df.shape[0] * 0.9))
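+
+ # --- Two-tower model ---
+ # 768-dimensional embedding towers: one over requirement tokens, one over
+ # vacancy codes; each reserves an extra row for out-of-vocabulary tokens.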
+ embedding_dimension = 768
+ user_model = tf.keras.Sequential([
+     tf.keras.layers.StringLookup(
+         vocabulary=unique_user_ids, mask_token=None),
+     # We add an additional embedding to account for unknown tokens.
+     tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension),
+ ])
+ movie_model = tf.keras.Sequential([
+     tf.keras.layers.StringLookup(
+         vocabulary=unique_movie_titles, mask_token=None),
+     tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
+ ])
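+
+ # FactorizedTopK measures retrieval quality against the full candidate set.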
+ metrics = tfrs.metrics.FactorizedTopK(
+     candidates=movies.batch(df.shape[0]).map(movie_model)
+ )
+ task = tfrs.tasks.Retrieval(
+     metrics=metrics
+ )
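+
+ # Retrieval model: per-token embeddings are summed into a single query vector
+ # before the retrieval loss. NoBaseClassMovielensModel is a plain-Keras
+ # equivalent with explicit train/test steps; it is defined but not used below.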
+ class MovielensModel(tfrs.Model):
+
+     def __init__(self, user_model, movie_model):
+         super().__init__()
+         self.movie_model: tf.keras.Model = movie_model
+         self.user_model: tf.keras.Model = user_model
+         self.task: tf.keras.layers.Layer = task
+
+     def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
+         user_embeddings = self.user_model(features["requisito"])
+         positive_movie_embeddings = self.movie_model(features["code"])
+         return self.task(tf.reduce_sum(user_embeddings, axis=1), positive_movie_embeddings)
+
+ class NoBaseClassMovielensModel(tf.keras.Model):
+
+     def __init__(self, user_model, movie_model):
+         super().__init__()
+         self.movie_model: tf.keras.Model = movie_model
+         self.user_model: tf.keras.Model = user_model
+         self.task: tf.keras.layers.Layer = task
+
+     def train_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
+         with tf.GradientTape() as tape:
+             user_embeddings = self.user_model(features["requisito"])
+             positive_movie_embeddings = self.movie_model(features["code"])
+             loss = self.task(user_embeddings, positive_movie_embeddings)
+             regularization_loss = sum(self.losses)
+             total_loss = loss + regularization_loss
+
+         gradients = tape.gradient(total_loss, self.trainable_variables)
+         self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
+
+         metrics = {metric.name: metric.result() for metric in self.metrics}
+         metrics["loss"] = loss
+         metrics["regularization_loss"] = regularization_loss
+         metrics["total_loss"] = total_loss
+         return metrics
+
+     def test_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
+         user_embeddings = self.user_model(features["requisito"])
+         positive_movie_embeddings = self.movie_model(features["code"])
+         loss = self.task(user_embeddings, positive_movie_embeddings)
+         regularization_loss = sum(self.losses)
+         total_loss = loss + regularization_loss
+
+         metrics = {metric.name: metric.result() for metric in self.metrics}
+         metrics["loss"] = loss
+         metrics["regularization_loss"] = regularization_loss
+         metrics["total_loss"] = total_loss
+         return metrics
+
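+ # --- Training ---
+ # One large batch per epoch for 110 epochs; weights are checkpointed under
+ # /home/user/app/model/ every 2 batches.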
+ model = MovielensModel(user_model, movie_model)
+ model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.08))
+ cached_train = train.shuffle(int(df.shape[0] * 0.9)).batch(int(df.shape[0] * 0.9)).cache()
+ cached_test = test.batch(int(df.shape[0] * 0.15)).cache()
+ path = os.path.join("/home/user/app/", "model/")
+ cp_callback = tf.keras.callbacks.ModelCheckpoint(
+     filepath=path,
+     verbose=1,
+     save_weights_only=True,
+     save_freq=2)
+
+ model.fit(cached_train, callbacks=[cp_callback], epochs=110)
+
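+ # --- ScaNN index ---
+ # Embed every vacancy code with the trained candidate tower and build an
+ # approximate nearest-neighbour index (dot product, top-10 neighbours).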
+ index = df["code"].map(lambda x: [model.movie_model(tf.constant(x))])
+
+ indice = []
+ for i in range(df.shape[0]):  # one embedding per vacancy row
+     indice.append(np.array(index)[i][0])
+
+ searcher = scann.scann_ops_pybind.builder(np.array(indice), 10, "dot_product").tree(
+     num_leaves=1500, num_leaves_to_search=500, training_sample_size=df.shape[0]).score_brute_force(
+     2, quantize=True).build()
+
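+ # --- Inference ---
+ # Embed each whitespace token of the candidate's competences, sum them into a
+ # query vector, retrieve the 10 closest vacancies, and plot scaled match
+ # scores as a bar chart.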
+ def predict(text):
+     campos = str(text).lower()
+     query = np.sum([model.user_model(tf.constant(token)) for token in campos.split()], axis=0)
+     neighbors, distances = searcher.search_batched([query])
+     xx = df.iloc[neighbors[0], :].nome_vaga
+     fig = plt.figure(figsize=(14, 9))
+     plt.bar(list(xx), distances[0] * 0.8 * 10)
+     plt.title('Degree of match')
+     plt.xlabel('Labels')
+     plt.xticks(rotation=270)
+     plt.ylabel('Distances')
+     for x, y in zip(list(range(0, 10)), distances[0] * 0.8 * 10):
+         plt.text(x, y, y, ha='center', va='bottom', fontsize=12, color='black')
+     return xx, fig
+
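+ # --- Gradio UI: legacy gr.inputs/gr.outputs API ---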
+ demo = gr.Interface(fn=predict,
+                     inputs=gr.inputs.Textbox(label='CANDIDATE COMPETENCES - Click *Clear* before adding new input'),
+                     outputs=[gr.outputs.Textbox(label='SUGGESTED VACANCIES'),
+                              gr.Plot()],
+                     css='div {margin-left: auto; margin-right: auto; width: 100%; background-image: url("https://drive.google.com/uc?export=view&id=1KNnISAUcvh2Pt08f-EJZJYCIgkrKw3PI"); repeat 0 0;}')
+ demo.launch(share=False)
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ nltk==3.6.5
+ pandas==1.3.4
+ numpy==1.22.4
+ matplotlib==3.4.3
+ unidecode==1.2.0
+ tensorflow==2.9.1
+ scann==1.2.7
+ tensorflow-recommenders==0.7.0
+