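# Job-recommendation demo (Hugging Face Space): a two-tower TensorFlow
# Recommenders retrieval model that matches free-text skill descriptions to
# job postings from Valinhos (pt-BR), with a ScaNN index for nearest-neighbor
# lookup and a Gradio front end.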
import os
import re
from typing import Dict, Text

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scann
import tensorflow as tf
import tensorflow_recommenders as tfrs  # scann 1.2.7 + tensorflow-recommenders 0.7.0 + TF 2.8.0
import unidecode
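# --- Data loading and cleaning ---
# The CSV is assumed to carry at least three columns: "code" (job id),
# "nome_vaga" (job title) and "requisito" (free-text requirements).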
df = pd.read_csv(
    '/home/user/app/valinhos_vagas_portugues_pt-BR.csv', sep=';', header=0)
df = df.drop_duplicates()
df = df.dropna()
df["nome_vaga"] = df["nome_vaga"].map(lambda x: x.lower().title())
df["requisito"] = df["requisito"].map(lambda x: x[0:1000].lower().replace(
    "espanhol", "portugues").replace("colombia", "valinhos"))

# The first 90% of the rows feed training/evaluation; the last 10% are held
# out as a "blind" (cego) set the model never sees during training.
my_dict = dict(df.iloc[0:int(df.shape[0] * 0.9), :])
my_dict_cego = dict(df.iloc[int(df.shape[0] * 0.9):, :])
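# --- tf.data pipelines ---
# The names below ("ratings", "movies", "user_ids", "movie_titles") follow
# the TFRS MovieLens retrieval tutorial this script adapts: here the queries
# are tokenized requirement texts and the candidates are job codes.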
# Each training example pairs a job code with its tokenized requirements
# (at most 102 tokens per row, via maxsplit=101).
ratings = tf.data.Dataset.from_tensor_slices(my_dict).map(lambda x: {
    "code": x["code"],
    "nome_vaga": x["nome_vaga"],
    "requisito": tf.strings.split(x["requisito"], maxsplit=101)
})

# Candidate set: every job code in the full dataframe.
movies = tf.data.Dataset.from_tensor_slices(dict(df)).map(lambda x: {
    "code": x["code"],
    "nome_vaga": x["nome_vaga"]
})
movies = movies.map(lambda x: x["code"])

# Held-out queries from the blind set.
ratings_cego = tf.data.Dataset.from_tensor_slices(my_dict_cego).map(lambda x: {
    "code": x["code"],
    "requisito": tf.strings.split(x["requisito"], maxsplit=101)
})
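# --- Train/test split and vocabularies ---
# A seeded shuffle keeps the split reproducible across runs.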
tf.random.set_seed(42)
shuffled = ratings.shuffle(
    int(df.shape[0] * 0.9), seed=42, reshuffle_each_iteration=False)
shuffled2 = ratings_cego.shuffle(
    int(df.shape[0] * 0.1), seed=42, reshuffle_each_iteration=False)

train = shuffled.take(int(df.shape[0] * 0.8))
# skip() keeps the test rows disjoint from the training rows; taking both
# splits from the head of the same shuffled stream would overlap them.
test = shuffled.skip(int(df.shape[0] * 0.8)).take(int(df.shape[0] * 0.1))
cego = shuffled2

# Vocabularies: every candidate job code, and every token that appears in
# any requirements text.
movie_titles = movies
user_ids = ratings.map(lambda x: x["requisito"])
xx = list(user_ids.as_numpy_iterator())
unique_movie_titles = np.unique(list(movie_titles.as_numpy_iterator()))
unique_user_ids = np.unique(np.concatenate(xx))
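# --- Two-tower model ---
# Both towers embed strings into the same 768-dimensional space: the query
# tower embeds individual requirement tokens, the candidate tower embeds job
# codes, and a match is scored by the dot product of the two vectors.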
embedding_dimension = 768

# Query tower: token -> embedding. The extra row in the embedding table
# accounts for unknown (out-of-vocabulary) tokens.
user_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(
        vocabulary=unique_user_ids, mask_token=None),
    tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension),
])

# Candidate tower: job code -> embedding.
movie_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(
        vocabulary=unique_movie_titles, mask_token=None),
    tf.keras.layers.Embedding(
        len(unique_movie_titles) + 1, embedding_dimension)
])

metrics = tfrs.metrics.FactorizedTopK(
    candidates=movies.batch(df.shape[0]).map(movie_model)
)

task = tfrs.tasks.Retrieval(
    metrics=metrics
)
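# The query tower returns one embedding per token, so compute_loss sums over
# the token axis to get a single bag-of-words query vector per example.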
class MovielensModel(tfrs.Model):

    def __init__(self, user_model, movie_model):
        super().__init__()
        self.movie_model: tf.keras.Model = movie_model
        self.user_model: tf.keras.Model = user_model
        self.task: tf.keras.layers.Layer = task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        # One embedding per requirement token.
        user_embeddings = self.user_model(features["requisito"])
        # Embedding of the matching job code (the positive candidate).
        positive_movie_embeddings = self.movie_model(features["code"])
        # The retrieval task computes the loss and the metrics.
        return self.task(tf.reduce_sum(user_embeddings, axis=1), positive_movie_embeddings)
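# NoBaseClassMovielensModel rebuilds the same model on plain tf.keras.Model,
# with the train and test steps written out by hand as in the TFRS tutorial.
# It is kept for reference only and is never instantiated below.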
class NoBaseClassMovielensModel(tf.keras.Model):

    def __init__(self, user_model, movie_model):
        super().__init__()
        self.movie_model: tf.keras.Model = movie_model
        self.user_model: tf.keras.Model = user_model
        self.task: tf.keras.layers.Layer = task

    def train_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
        # Set up a gradient tape to record gradients.
        with tf.GradientTape() as tape:
            # Loss computation, with the same token-sum as compute_loss above.
            user_embeddings = tf.reduce_sum(
                self.user_model(features["requisito"]), axis=1)
            positive_movie_embeddings = self.movie_model(features["code"])
            loss = self.task(user_embeddings, positive_movie_embeddings)

            # Handle regularization losses as well.
            regularization_loss = sum(self.losses)
            total_loss = loss + regularization_loss

        gradients = tape.gradient(total_loss, self.trainable_variables)
        self.optimizer.apply_gradients(
            zip(gradients, self.trainable_variables))

        metrics = {metric.name: metric.result() for metric in self.metrics}
        metrics["loss"] = loss
        metrics["regularization_loss"] = regularization_loss
        metrics["total_loss"] = total_loss
        return metrics

    def test_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
        # Loss computation, with the same token-sum as compute_loss above.
        user_embeddings = tf.reduce_sum(
            self.user_model(features["requisito"]), axis=1)
        positive_movie_embeddings = self.movie_model(features["code"])
        loss = self.task(user_embeddings, positive_movie_embeddings)

        # Handle regularization losses as well.
        regularization_loss = sum(self.losses)
        total_loss = loss + regularization_loss

        metrics = {metric.name: metric.result() for metric in self.metrics}
        metrics["loss"] = loss
        metrics["regularization_loss"] = regularization_loss
        metrics["total_loss"] = total_loss
        return metrics
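# --- Training ---
# Full-batch Adagrad for 120 epochs; weights are checkpointed to disk.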
model = MovielensModel(user_model, movie_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.08))

# Each split fits in a single cached batch.
cached_train = train.shuffle(
    int(df.shape[0] * 0.9)).batch(int(df.shape[0] * 0.9)).cache()
cached_test = test.batch(int(df.shape[0] * 0.1)).cache()

path = os.path.join("/home/user/app/", "model/")
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=path,
    verbose=1,
    save_weights_only=True,
    save_freq=2)

model.fit(cached_train, callbacks=[cp_callback], epochs=120)
model.evaluate(cached_test, return_dict=True)
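# --- ScaNN index over the candidate embeddings ---
# With a candidate set this small, brute-force scoring is exact and still
# fast; the tree partitioning follows the upstream example configuration.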
# Embed every candidate job code once; iterating over df covers the whole
# candidate set without hardcoding its size.
indice = np.array([model.movie_model(tf.constant(code)).numpy()
                   for code in df["code"]])

# score_brute_force() accepts only `quantize` in the scann 1.2.x pybind API.
searcher = scann.scann_ops_pybind.builder(indice, 10, "dot_product").tree(
    num_leaves=1500, num_leaves_to_search=500,
    training_sample_size=df.shape[0]).score_brute_force(quantize=True).build()
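# --- Inference ---
# predict() turns free text into the same bag-of-words embedding used at
# training time and retrieves the 10 closest postings, e.g. (hypothetical
# input) predict("experiencia em vendas e excel") returns ten suggested job
# titles plus a matplotlib bar chart of the match scores.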
def predict(text):
    # Normalize the free-text input: strip accents, lowercase, and collapse
    # anything that is not a word character into single spaces.
    campos = re.sub(r'[\W\s]', ' ', unidecode.unidecode(str(text).lower()))
    campos = re.sub(r'\s+', ' ', campos).strip()

    # Bag-of-words query embedding: sum the token embeddings, mirroring the
    # reduce_sum in MovielensModel.compute_loss.
    query = np.sum([model.user_model(tf.constant(token))
                    for token in campos.split()], axis=0)

    # Retrieve the 10 nearest postings by dot product.
    neighbors, distances = searcher.search_batched([query])
    xx = df.iloc[neighbors[0], :].nome_vaga

    # Bar chart of the (heuristically rescaled) match scores.
    scores = distances[0] * 0.8 * 10
    fig = plt.figure(figsize=(14, 9))
    plt.bar(list(xx), scores)
    plt.title('Degree of match')
    plt.xlabel('Labels')
    plt.xticks(rotation=270)
    plt.ylabel('Distances')
    for x, y in zip(range(0, 10), scores):
        plt.text(x, y, f'{y:.2f}', ha='center', va='bottom',
                 fontsize=12, color='black')
    return xx, fig
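# --- Gradio UI ---
# gr.inputs/gr.outputs is the legacy Gradio namespace, consistent with the
# old pinned stack noted in the imports (TF 2.8.0, TFRS 0.7.0, scann 1.2.7).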
demo = gr.Interface(
    fn=predict,
    inputs=gr.inputs.Textbox(
        label='SUAS COMPETÊNCIAS E EXPERIÊNCIA - Clique *Clear* antes de inserir o input'),
    outputs=[gr.outputs.Textbox(label='VAGAS SUGERIDAS'),
             gr.Plot()],
    css='div {margin-left: auto; margin-right: auto; width: 100%; '
        'background-image: url("https://drive.google.com/uc?export=view&id=1KNnISAUcvh2Pt08f-EJZJYCIgkrKw3PI"); repeat 0 0;}'
).launch(share=False)