Spaces:

stinoco
/

satia

Runtime error

App Files Files Community

stinoco committed on Jan 12, 2023

Commit

a87c588

•

1 Parent(s): 1abe99a

Added classification models for subcategories

Browse files

Files changed (23) hide show

app.py +134 -25
production_models/cliente/cliente_model.pkl +0 -0
production_models/cliente/lstm_atencionalcliente.pt +0 -0
production_models/conforme/conforme_model.pkl +0 -0
production_models/conforme/lstm_conforme.pt +0 -0
production_models/devoluciones/devoluciones_model.pkl +0 -0
production_models/devoluciones/lstm_devoluciones.pt +0 -0
production_models/entrega/entrega_model.pkl +0 -0
production_models/entrega/lstm_entrega.pt +0 -0
production_models/financiamiento/financiamiento_model.pkl +0 -0
production_models/financiamiento/lstm_financiamiento.pt +0 -0
production_models/marketing/lstm_trademarketing.pt +0 -0
production_models/marketing/marketing_model.pkl +0 -0
production_models/otros/lstm_otros.pt +0 -0
production_models/otros/otros_model.pkl +0 -0
production_models/stock/lstm_stock.pt +0 -0
production_models/stock/stock_model.pkl +0 -0
production_models/ventas/lstm_ventas.pt +0 -0
production_models/ventas/ventas_model.pkl +0 -0
utils/load_model.py +27 -0
utils/lstm.py +37 -0
utils/production_model +36 -0
utils/tokenizer.py +167 -0

app.py CHANGED Viewed

@@ -1,40 +1,149 @@
 import gradio as gr
 import numpy as np
 from transformers import pipeline
-# pipelines
 pipeline_clf = pipeline("text-classification", model = "stinoco/beto-sentiment-analysis-finetuned", return_all_scores = True)
 pipeline_pos = pipeline("token-classification", model = "sagorsarker/codeswitch-spaeng-pos-lince")
-def predict(text: str):
-  '''
-  Función que recibe texto como input, devuelve la clasificación de texto para ser recibida por el demo.
-  text: texto a clasificar (str)
-  '''
   # Text Classification
   classes = pipeline_clf(text)[0]
-  # POS
-  classes = {element['label']: element['score'] for element in classes}
-  labeled_text = {'text': text, 'entities': pipeline_pos(text)}
-  return classes, labeled_text
-demo = gr.Interface(fn = predict,
-                         inputs = [gr.Textbox(placeholder = "Ingresa el reclamo acá", label = 'Reclamo')],
-                         outputs = [gr.outputs.Label(label = 'Categorías'),
-                                    gr.Highlightedtext(label = 'Part of Speech')],
-                         examples = [
-                             ['al ser de region simpre esta con quiebre de stock'],
-                             ['que tienen que tener vendedores que conozcan el rubro y que sepan lo que estan vendiendo'],
-                             ['un solo vendedor no pude estar encargado de miles de articulos debe especificarse en cerveza'],
-                             ['no hay mercaderia']
-                         ],
-                         title = 'Demo Clasificación NPS'
-                         )
-demo.launch()

 import gradio as gr
 import numpy as np
 from transformers import pipeline
+from utils.tokenizer import tokenizer
+from utils.lstm import lstm
+from utils.load_model import load_model
+from utils.production_model import ProductionModel
+# Cargamos modelos
+## Transformers
 pipeline_clf = pipeline("text-classification", model = "stinoco/beto-sentiment-analysis-finetuned", return_all_scores = True)
 pipeline_pos = pipeline("token-classification", model = "sagorsarker/codeswitch-spaeng-pos-lince")
+## LSTM
+clf_marketing = load_model('marketing')
+clf_cliente = load_model('cliente')
+clf_conforme = load_model('conforme')
+clf_devoluciones = load_model('devoluciones')
+clf_entrega = load_model('entrega')
+clf_financiamiento = load_model('financiamiento')
+clf_otros = load_model('otros')
+clf_stock = load_model('stock')
+clf_ventas = load_model('ventas')
# PREDICT
def predict(text):
  '''
  Classify a complaint and build the update for every output component of the demo.

  text: complaint text to classify (str)

  Returns a dict keyed by Gradio component:
    - macro_output: the four highest macro-category scores plus a 'Resto' remainder.
    - each subcategory row: a visibility update (shown only when its category was selected).
    - each subcategory label component: that classifier's prediction, or None when hidden.
  '''
  # Macro-category classification; the first element of the pipeline result is used.
  classes = pipeline_clf(text)[0]
  scores = {element['label']: element['score'] for element in classes}

  # Keep the four most likely macro categories and lump everything else into 'Resto'.
  macro_probas = dict(sorted(scores.items(), key = lambda item: item[1], reverse = True)[:4])
  # Clamp at 0: floating-point rounding can push 1 - sum(...) slightly below zero.
  macro_probas['Resto'] = max(0.0, 1 - sum(macro_probas.values()))

  # The winning label is split on ' - ', so a composite label selects several subcategory models.
  macro_label = max(macro_probas, key = macro_probas.get)
  macro_labels = macro_label.split(' - ')

  # Built at call time because the rows/components are created after this function is defined.
  subcategories = {
      'Atención al cliente': (row_cliente, cliente_component, clf_cliente),
      'Conforme': (row_conforme, conforme_component, clf_conforme),
      'Devoluciones': (row_devoluciones, devoluciones_component, clf_devoluciones),
      'Entrega': (row_entrega, entrega_component, clf_entrega),
      'Financiamiento': (row_financiamiento, financiamiento_component, clf_financiamiento),
      'Otros': (row_otros, otros_component, clf_otros),
      'Stock': (row_stock, stock_component, clf_stock),
      'Trade Marketing': (row_marketing, marketing_component, clf_marketing),
      'Ventas': (row_ventas, ventas_component, clf_ventas),
  }

  output = {macro_output: macro_probas}
  for label, (row, component, clf) in subcategories.items():
    selected = label in macro_labels
    output[row] = gr.update(visible = selected)
    # Only run the subcategory model for the selected categories.
    output[component] = clf.predict([text]) if selected else None
  return output
# DEMO
# Output components use gr.Label: the gr.outputs namespace is the deprecated
# Interface-era API, while gr.Label is the component class meant for Blocks.
with gr.Blocks(title = 'Modelo NPS') as demo:
  gr.Markdown(
    '''
    # <center>Modelo de Clasificación NPS</center>
    Este es un modelo para categorizar reclamos de NPS, prueba escribiendo reclamos abajo!
    ''')

  # Complaint input
  with gr.Column() as text_col:
    with gr.Row():
      text_input = gr.Textbox(placeholder = "Ingresa el reclamo acá", label = 'Reclamo')

  # Macro-category scores (always visible)
  with gr.Row():
    macro_output = gr.Label(label = 'Categorías Generales')

  # One hidden row per subcategory classifier; predict() returns the visibility updates.
  with gr.Row():
    with gr.Row(visible = False) as row_cliente:
      cliente_component = gr.Label(label = 'Categorías Atención al Cliente')
    with gr.Row(visible = False) as row_conforme:
      conforme_component = gr.Label(label = 'Categorías Conforme')
    with gr.Row(visible = False) as row_devoluciones:
      devoluciones_component = gr.Label(label = 'Categorías Devoluciones')
    with gr.Row(visible = False) as row_entrega:
      entrega_component = gr.Label(label = 'Categorías Entrega')
    with gr.Row(visible = False) as row_financiamiento:
      financiamiento_component = gr.Label(label = 'Categorías Financiamiento')
    with gr.Row(visible = False) as row_otros:
      otros_component = gr.Label(label = 'Categorías Otros')
    with gr.Row(visible = False) as row_stock:
      stock_component = gr.Label(label = 'Categorías Stock')
    with gr.Row(visible = False) as row_marketing:
      marketing_component = gr.Label(label = 'Categorías Trade Marketing')
    with gr.Row(visible = False) as row_ventas:
      ventas_component = gr.Label(label = 'Categorías Ventas')

  # Every component that predict() may update must be listed as an output.
  outputs = [
      macro_output, cliente_component, conforme_component, devoluciones_component,
      entrega_component, financiamiento_component, otros_component, stock_component,
      marketing_component, ventas_component, row_cliente, row_conforme,
      row_devoluciones, row_entrega, row_financiamiento, row_otros,
      row_stock, row_marketing, row_ventas, ]

  button = gr.Button('Submit')
  button.click(fn = predict, inputs = text_input, outputs = outputs)

  gr.Examples(
      examples = [['sale mas a cuenta comprar en los supermercados que a la cervecería'],
                  ['llega las latas abolladas sucias'],
                  ['vendedor no viene presencialmente solo por whatsapp'],
                  ['mejorar la atención de los repartidores porque roban'],
                  ['seria bueno mas promociones y publicidad']],
      inputs = text_input)

demo.launch(share = True)

production_models/cliente/cliente_model.pkl ADDED Viewed

Binary file (48.1 kB). View file

production_models/cliente/lstm_atencionalcliente.pt ADDED Viewed

Binary file (701 kB). View file

production_models/conforme/conforme_model.pkl ADDED Viewed

Binary file (16.9 kB). View file

production_models/conforme/lstm_conforme.pt ADDED Viewed

Binary file (329 kB). View file

production_models/devoluciones/devoluciones_model.pkl ADDED Viewed

Binary file (10.4 kB). View file

production_models/devoluciones/lstm_devoluciones.pt ADDED Viewed

Binary file (248 kB). View file

production_models/entrega/entrega_model.pkl ADDED Viewed

Binary file (30.4 kB). View file

production_models/entrega/lstm_entrega.pt ADDED Viewed

Binary file (491 kB). View file

production_models/financiamiento/financiamiento_model.pkl ADDED Viewed

Binary file (12.4 kB). View file

production_models/financiamiento/lstm_financiamiento.pt ADDED Viewed

Binary file (275 kB). View file

production_models/marketing/lstm_trademarketing.pt ADDED Viewed

Binary file (695 kB). View file

production_models/marketing/marketing_model.pkl ADDED Viewed

Binary file (47.9 kB). View file

production_models/otros/lstm_otros.pt ADDED Viewed

Binary file (406 kB). View file

production_models/otros/otros_model.pkl ADDED Viewed

Binary file (23.3 kB). View file

production_models/stock/lstm_stock.pt ADDED Viewed

Binary file (364 kB). View file

production_models/stock/stock_model.pkl ADDED Viewed

Binary file (19.8 kB). View file

production_models/ventas/lstm_ventas.pt ADDED Viewed

Binary file (473 kB). View file

production_models/ventas/ventas_model.pkl ADDED Viewed

Binary file (29 kB). View file

utils/load_model.py ADDED Viewed

	@@ -0,0 +1,27 @@

+import pickle
+from glob import glob
+import torch
+import os
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def _find_single_file(folder_path, pattern):
  '''
  Return the path of the file inside folder_path that matches pattern.

  Raises FileNotFoundError when nothing matches. If several files match, the
  alphabetically first one is returned so the choice is deterministic.
  '''
  matches = sorted(glob(os.path.join(folder_path, pattern)))
  if not matches:
    raise FileNotFoundError(f"No se encontró ningún archivo '{pattern}' en '{folder_path}'")
  return matches[0]


def load_model(folder):
  '''
  Load a prediction model from production_models/<folder>.

  The folder must contain a pickled wrapper object (*.pkl) and a serialized
  torch model (*.pt); the torch model is attached to the wrapper as `.model`.

  folder: name of the sub-folder to load the model from (str)

  Returns the unpickled wrapper object with its `model` attribute set.
  Raises FileNotFoundError if the .pt or .pkl file is missing.
  '''
  base_folder = 'production_models'
  model_folder = os.path.join(base_folder, folder)
  model_path = _find_single_file(model_folder, '*.pt')
  clf_path = _find_single_file(model_folder, '*.pkl')
  # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
  with open(clf_path, 'rb') as file:
    clf = pickle.load(file)
  # map_location moves the stored tensors onto the device available at runtime.
  clf.model = torch.load(model_path, map_location = device)
  return clf

utils/lstm.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class LSTM(nn.Module):
  '''
  Sequence classifier: embedding -> (bi)directional LSTM -> dropout -> linear layer.

  vocab_size: number of rows of the embedding table (int)
  n_classes: number of output classes (int)
  hidden_dim: hidden size of the LSTM (int)
  embedding_dim: size of the token embeddings (int)
  n_layers: number of stacked LSTM layers (int)
  dropout: dropout probability, used between LSTM layers and before the linear layer (float)
  bidirectional: whether the LSTM is bidirectional (bool)
  '''
  def __init__(self, vocab_size, n_classes, hidden_dim, embedding_dim, n_layers, dropout, bidirectional = True):
    super().__init__()
    self.n_layers = n_layers
    self.hidden_dim = hidden_dim
    self.embedding_dim = embedding_dim
    # Embedding and LSTM layers
    self.embedding = nn.Embedding(vocab_size, embedding_dim, device = device)
    # nn.LSTM only applies dropout between stacked layers and warns when it is
    # non-zero with a single layer, so it is passed only when it can have an effect.
    lstm_dropout = dropout if n_layers > 1 else 0
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout = lstm_dropout, batch_first = True, bidirectional = bidirectional, device = device)
    # Dropout applied to the LSTM output before classification
    self.dropout = nn.Dropout(dropout)
    # Linear layer; a bidirectional LSTM concatenates both directions
    fc_inputs = hidden_dim * 2 if bidirectional else hidden_dim
    self.fc = nn.Linear(fc_inputs, n_classes, device = device)

  def forward(self, x):
    '''
    x: batch of token-index sequences, batch dimension first (the LSTM uses batch_first = True)

    Returns (output, hidden): the class scores from the linear layer and the
    state returned by nn.LSTM.
    '''
    x = self.embedding(x)
    x, hidden = self.lstm(x)
    # Keep only the LSTM output at the last time step of every sequence
    x = x[:, -1, :]
    x = self.dropout(x)
    output = self.fc(x)
    return output, hidden

utils/production_model ADDED Viewed

	@@ -0,0 +1,36 @@

+import torch
+import torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class ProductionModel():
  '''
  Bundles a tokenizer, a label dictionary and a torch model for inference.

  tokenizer: object exposing tokenize(list_of_texts)
  dict_labels: maps class index -> label name
  The torch model is not set by the constructor; assign it to `self.model` afterwards.
  '''
  def __init__(self, tokenizer, dict_labels):
    self.model = None
    self.tokenizer = tokenizer
    self.dict_labels = dict_labels

  def predict(self, X):
    '''
    Predict class probabilities for new data.

    X: list with the data, each element is one observation (list)

    Returns a dict {label: probability} when X holds a single observation,
    otherwise a list with one such dict per observation (empty list for empty X).
    Raises ValueError if no model has been assigned to `self.model`.
    '''
    if self.model is None:
      raise ValueError('Debes cargar el modelo con self.model = torch.load(model_file.pt)')
    # Nothing to predict: skip tokenizing and running the model on an empty batch.
    if len(X) == 0:
      return []
    batch = torch.tensor(self.tokenizer.tokenize(X), device = device)
    self.model.eval()
    with torch.no_grad():
      # The model returns a tuple; its first element holds the class scores.
      scores = self.model(batch)[0]
      probabilities = F.softmax(scores, dim = 1).detach().to('cpu').numpy()
    output = [{self.dict_labels[i]: float(p) for i, p in enumerate(row)} for row in probabilities]
    if len(output) == 1:
      return output[0]
    return output

utils/tokenizer.py ADDED Viewed

	@@ -0,0 +1,167 @@

+import numpy as np
+import matplotlib.pyplot as plt
+from collections import Counter
+from string import punctuation
class Tokenizer():
  '''
  Word-level tokenizer: builds a frequency-ordered vocabulary and converts texts
  into fixed-length sequences of integer indices.

  Index 0 is reserved for padding ('<pad>') and index 1 for unknown words ('<unk>').
  `vector_size` must be assigned on the instance before calling tokenize().
  '''
  def __init__(self):
    # Everything below stays None until train() runs (vector_size: until the caller
    # sets it), so the "train first" checks raise ValueError instead of AttributeError.
    self.vocab = None
    self.counts = None
    self.vocab_to_int = None
    self.int_to_vocab = None
    self.vector_size = None
    self.pad_idx = 0
    self.unk_idx = 1

  def preprocessing(self, texts: list):
    '''
    Pre-process documents: lowercase, remove punctuation and replace "\\n" with a space.
    Returns the pre-processed texts.

    texts: documents to process (list)
    '''
    remove_punctuation = str.maketrans('', '', punctuation)
    return [text.lower().translate(remove_punctuation).replace('\n', ' ') for text in texts]

  def train(self, texts: list):
    '''
    Train the tokenizer: build the token vocabulary and its integer encoding.

    texts: documents used to build the vocabulary (list)
    '''
    texts = self.preprocessing(texts)
    # Join every document and split on whitespace to get the full word list
    words = ' '.join(texts).split()
    self.counts = Counter(words) # word -> frequency
    self.vocab = sorted(self.counts, key = self.counts.get, reverse = True) # most frequent first
    # Word indices start at 2; 0 and 1 are reserved for the special tokens
    self.vocab_to_int = {word: ii for ii, word in enumerate(self.vocab, 2)}
    self.vocab_to_int['<unk>'] = self.unk_idx # token for unknown words
    self.vocab_to_int['<pad>'] = self.pad_idx # token for padding
    self.int_to_vocab = {index: word for word, index in self.vocab_to_int.items()}

  def encode(self, texts: list):
    '''
    Encode texts with the trained vocabulary. Returns the encoded texts.
    Words that are not in the vocabulary are mapped to the unknown-word index.

    texts: documents to encode (list)
    '''
    if self.vocab_to_int is None:
      raise ValueError('Debes entrenar el tokenizador primero')
    lookup = self.vocab_to_int
    return [[lookup.get(word, self.unk_idx) for word in text.split()] for text in texts]

  def decode(self, texts: list):
    '''
    Decode texts with the trained vocabulary. Returns the decoded texts.
    Indices that are not in the vocabulary are decoded as 'unk'.

    texts: documents to decode (list)
    '''
    if self.vocab is None:
      raise ValueError('Debes entrenar el tokenizador primero')
    lookup = self.int_to_vocab
    return [[lookup.get(index, 'unk') for index in text] for text in texts]

  def filter_text(self, encoded_text: list, encoded_labels: list, min_tokens = 1, max_tokens = 1e6):
    '''
    Filter a collection of documents by number of tokens. Returns the filtered
    documents and their labels (labels as a numpy array).

    encoded_text: encoded texts to filter (list)
    encoded_labels: labels to filter along with the texts (list)
    min_tokens: minimum number of tokens allowed (int)
    max_tokens: maximum number of tokens allowed (int)
    '''
    print('Documentos antes de eliminación:', len(encoded_text))
    # Indices of the documents whose length satisfies the filters
    filter_idx = [ii for ii, text in enumerate(encoded_text) if min_tokens <= len(text) <= max_tokens]
    # Keep only those documents, and the matching labels
    encoded_text = [encoded_text[ii] for ii in filter_idx]
    encoded_labels = np.array([encoded_labels[ii] for ii in filter_idx])
    print('Documentos después de eliminación:', len(encoded_text))
    return encoded_text, encoded_labels

  def padding(self, encoded_text: list, vector_size: int):
    '''
    Fix the length of every sequence to vector_size:
      Longer sequences are truncated on the right to that length.
      Shorter sequences are filled with 0s on the left up to that length.
      Empty sequences become a row of 0s.
    Returns the modified sequences as a 2-D integer array.

    encoded_text: list with the texts to modify (list)
    vector_size: length to fix the documents to (int)
    '''
    features = np.zeros((len(encoded_text), vector_size), dtype = int)
    for i, row in enumerate(encoded_text):
      row = row[:vector_size]
      # Skip empty rows: a "-0:" slice would select the whole row and fail to broadcast
      if len(row) > 0:
        features[i, -len(row):] = row
    return features

  def tokenize(self, texts: list):
    '''
    Tokenize documents with the trained vocabulary. Returns the documents
    encoded and padded to length self.vector_size.

    texts: texts to tokenize (list)
    '''
    if self.vocab is None:
      raise ValueError('Debes entrenar el tokenizador primero')
    if self.vector_size is None:
      raise ValueError('Debes especificar vector_size en objeto Tokenizer (tokenizer.vector_size = x)')
    texts = self.preprocessing(texts)
    encoded_text = self.encode(texts)
    features = self.padding(encoded_text, self.vector_size)
    return features

  def graph_distribution(self, encoded_text):
    '''
    Plot the distribution of document lengths (in tokens) of encoded_text.
    '''
    if self.vocab is None:
      raise ValueError('Debes entrenar el tokenizador primero')
    text_lens = Counter([len(text) for text in encoded_text]) # length -> number of documents
    plt.figure(figsize = (12, 6))
    plt.bar(text_lens.keys(), text_lens.values())
    plt.title('Distribución del largo de documentos en el Dataset')
    plt.xlabel('Cantidad de tokens')
    plt.ylabel('Frecuencia')
    plt.show()

  def __len__(self):
    # Vocabulary size including the two special tokens; 0 before training
    return len(self.vocab_to_int) if self.vocab is not None else 0