# spaces-valinhos / app.py
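# Job-recommendation demo for Valinhos (pt-BR job postings): a TensorFlow
# Recommenders two-tower retrieval model matches free-text skills/experience
# ("requisito") to job codes, a ScaNN index serves the nearest-neighbour
# lookup, and a Gradio interface exposes the search.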
import base64
import hashlib
import os
import pprint
import re
import tempfile
from typing import Dict, Text

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scann
import tensorflow as tf
import tensorflow_recommenders as tfrs  # scann 1.2.7 + tensorflow-recommenders 0.7.0 + TF 2.8.0
import unidecode
from nltk import word_tokenize
from nltk.util import ngrams
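# Load the Valinhos job-postings CSV ("code", "nome_vaga", "requisito"),
# drop duplicates/NaNs, and normalize the text columns.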
df = pd.read_csv(
'/home/user/app/valinhos_vagas_portugues_pt-BR.csv', sep=';', header=0)
df = df.drop_duplicates()
df = df.dropna()
df["nome_vaga"] = df["nome_vaga"].map(lambda x: x.lower().title())
df["requisito"] = df["requisito"].map(lambda x: x[0:1000].lower().replace(
"espanhol", "portugues").replace("colombia", "valinhos"))
tf.strings.split(df['requisito'].iloc[-1])
my_dict = dict(df.iloc[0:int(df.shape[0]*0.9), :])
my_dict_cego = dict(df.iloc[int(df.shape[0]*0.9):, :])
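# Build tf.data pipelines: "ratings" pairs each job code with its tokenized
# requirement text (maxsplit=101 limits the number of tokens), "movies" holds
# the candidate job codes, and "ratings_cego" mirrors the held-out slice.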
ratings = tf.data.Dataset.from_tensor_slices(my_dict).map(lambda x: {
"code": x["code"],
"nome_vaga": x["nome_vaga"],
"requisito": tf.strings.split(x["requisito"], maxsplit=101)
})
l = []
for x in ratings.as_numpy_iterator():
#pprint.pprint(len(x['requisito']))
l.append(len(x['requisito']))
min(l)
movies = tf.data.Dataset.from_tensor_slices(dict(df)).map(lambda x: {
"code": x["code"],
"nome_vaga": x["nome_vaga"]
})
movies = movies.map(lambda x: x["code"])
ratings_cego = tf.data.Dataset.from_tensor_slices(my_dict_cego).map(lambda x: {
"code": x["code"],
"requisito": tf.strings.split(x["requisito"], maxsplit=101)
})
tf.random.set_seed(42)
shuffled = ratings.shuffle(
    int(df.shape[0]*0.9), seed=42, reshuffle_each_iteration=False)
shuffled2 = ratings_cego.shuffle(
    int(df.shape[0]*0.1), seed=42, reshuffle_each_iteration=False)
# Split the shuffled 90% slice into disjoint train (80% of df) and test (10% of df) subsets.
train = shuffled.take(int(df.shape[0]*0.8))
test = shuffled.skip(int(df.shape[0]*0.8)).take(int(df.shape[0]*0.1))
cego = shuffled2
# Build the vocabularies: candidate job codes on one side, requisito tokens on the other.
movie_titles = movies  # already mapped to the "code" field above
user_ids = ratings.map(lambda x: x["requisito"])
xx = []
for x in user_ids.as_numpy_iterator():
    xx.append(x)
unique_movie_titles = np.unique(list(movie_titles.as_numpy_iterator()))
unique_user_ids = np.unique(np.concatenate(xx))
user_ids = user_ids.batch(int(df.shape[0]*0.9))
layer = tf.keras.layers.StringLookup(vocabulary=unique_user_ids)  # not used below
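# Two-tower retrieval model: the query tower embeds individual requisito
# tokens, the candidate tower embeds job codes; both map into a shared
# 768-dimensional space.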
embedding_dimension = 768
user_model = tf.keras.Sequential([
tf.keras.layers.StringLookup(
vocabulary=unique_user_ids, mask_token=None),
# We add an additional embedding to account for unknown tokens.
tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension),
])
movie_model = tf.keras.Sequential([
tf.keras.layers.StringLookup(
vocabulary=unique_movie_titles, mask_token=None),
tf.keras.layers.Embedding(
len(unique_movie_titles) + 1, embedding_dimension)
])
metrics = tfrs.metrics.FactorizedTopK(
    candidates=movies.batch(df.shape[0]).map(movie_model)
)
task = tfrs.tasks.Retrieval(
metrics=metrics
)
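# TFRS model: compute_loss sums the per-token query embeddings into a single
# query vector and scores it against the positive job-code embedding with the
# in-batch-negatives retrieval loss.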
class MovielensModel(tfrs.Model):
def __init__(self, user_model, movie_model):
super().__init__()
self.movie_model: tf.keras.Model = movie_model
self.user_model: tf.keras.Model = user_model
self.task: tf.keras.layers.Layer = task
def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
# We pick out the user features and pass them into the user model.
user_embeddings = self.user_model(features["requisito"])
# And pick out the movie features and pass them into the movie model,
# getting embeddings back.
positive_movie_embeddings = self.movie_model(features["code"])
# The task computes the loss and the metrics.
return self.task(tf.reduce_sum(user_embeddings, axis=1), positive_movie_embeddings)
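# Equivalent implementation without the tfrs.Model base class (custom
# train_step/test_step); defined for reference but not used below. Note that,
# unlike MovielensModel.compute_loss, it passes the per-token query embeddings
# to the task without summing them first.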
class NoBaseClassMovielensModel(tf.keras.Model):
def __init__(self, user_model, movie_model):
super().__init__()
self.movie_model: tf.keras.Model = movie_model
self.user_model: tf.keras.Model = user_model
self.task: tf.keras.layers.Layer = task
def train_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
# Set up a gradient tape to record gradients.
with tf.GradientTape() as tape:
# Loss computation.
user_embeddings = self.user_model(features["requisito"])
positive_movie_embeddings = self.movie_model(features["code"])
loss = self.task(user_embeddings, positive_movie_embeddings)
# Handle regularization losses as well.
regularization_loss = sum(self.losses)
total_loss = loss + regularization_loss
gradients = tape.gradient(total_loss, self.trainable_variables)
self.optimizer.apply_gradients(
zip(gradients, self.trainable_variables))
metrics = {metric.name: metric.result() for metric in self.metrics}
metrics["loss"] = loss
metrics["regularization_loss"] = regularization_loss
metrics["total_loss"] = total_loss
return metrics
def test_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
# Loss computation.
user_embeddings = self.user_model(features["requisito"])
positive_movie_embeddings = self.movie_model(features["code"])
loss = self.task(user_embeddings, positive_movie_embeddings)
# Handle regularization losses as well.
regularization_loss = sum(self.losses)
total_loss = loss + regularization_loss
metrics = {metric.name: metric.result() for metric in self.metrics}
metrics["loss"] = loss
metrics["regularization_loss"] = regularization_loss
metrics["total_loss"] = total_loss
return metrics
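# Train the retrieval model: the training slice is cached as a single batch,
# weights are checkpointed under /home/user/app/model/, and the test batch is
# used for evaluation.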
model = MovielensModel(user_model, movie_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.08))
cached_train = train.shuffle(
int(df.shape[0]*0.9)).batch(int(df.shape[0]*0.9)).cache()
cached_test = test.batch(int(df.shape[0]*0.1)).cache()
path = os.path.join("/home/user/app/", "model/")
cp_callback = tf.keras.callbacks.ModelCheckpoint(
filepath=path,
verbose=1,
save_weights_only=True,
save_freq=2)
model.fit(cached_train, callbacks=[cp_callback], epochs=120)
model.evaluate(cached_test, return_dict=True)
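# Serving index: embed every job code with the trained candidate tower and
# build a ScaNN searcher over those vectors (dot-product similarity, top 10
# neighbours).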
index = df["code"].map(lambda x: [model.movie_model(tf.constant(x))])
indice = []
for i in range(df.shape[0]):  # iterate over every row rather than a hard-coded count
    indice.append(np.array(index)[i][0])
searcher = scann.scann_ops_pybind.builder(np.array(indice), 10, "dot_product").tree(
    num_leaves=1500, num_leaves_to_search=500, training_sample_size=df.shape[0]).score_brute_force(
    2, quantize=True).build()
def predict(text):
    # Normalize the free-text input: strip accents, lowercase, and replace
    # non-word characters with spaces.
    campos = re.sub(r'[\W\s]', ' ', unidecode.unidecode(str(text).lower()))
    # Query vector = sum of the embeddings of the individual input tokens.
    query = np.sum([model.user_model(tf.constant(token)) for token in campos.split()], axis=0)
    neighbors, distances = searcher.search_batched([query])
    xx = df.iloc[neighbors[0], :].nome_vaga
    # Bar chart of the (rescaled) match scores for the 10 suggested jobs.
    fig = plt.figure(figsize=(14, 9))
    plt.bar(list(xx), distances[0]*0.8*10)
    plt.title('Degree of match')
    plt.xlabel('Labels')
    plt.xticks(rotation=270)
    plt.ylabel('Distances')
    for x, y in zip(range(10), distances[0]*0.8*10):
        plt.text(x, y, y, ha='center', va='bottom', fontsize=12, color='black')
    return xx, fig
# Gradio UI: one textbox in (skills/experience), suggested jobs plus the match
# chart out. Uses the legacy gr.inputs/gr.outputs namespaces of older Gradio.
demo = gr.Interface(
    fn=predict,
    inputs=gr.inputs.Textbox(
        label='YOUR SKILLS AND EXPERIENCE - Click *Clear* before entering input'),
    outputs=[gr.outputs.Textbox(label='SUGGESTED JOBS'),
             gr.Plot()],
    css='div {margin-left: auto; margin-right: auto; width: 100%;'
        ' background-image: url("https://drive.google.com/uc?export=view&id=1KNnISAUcvh2Pt08f-EJZJYCIgkrKw3PI");'
        ' repeat 0 0;}'
).launch(share=False)