# -*- coding: utf-8 -*-
"""Tweet classifier: TF-IDF features, SMOTE class balancing and an MLP.

Exported from the Colaboratory notebook "Copia de Modelo con MLP.ipynb".

Original file is located at
    https://colab.research.google.com/drive/1Tz3vMBacQRtb-nGhUrwNjr9dI3LBy29_

Pipeline:
    1. Load ``Corpus_cleaned.csv`` (text in column 1, label in column 2).
    2. Encode the labels as integers.
    3. Hold out 20% of the rows as a test set.
    4. Fit TF-IDF on the training text only, then transform both splits.
    5. Oversample the training split with SMOTE.
    6. Train an MLP, print a classification report on the untouched test
       split, and classify one sample tweet.
"""

import matplotlib.pyplot as ptl
import numpy as np
import pandas as pd
from datasets import load_dataset
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report as report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split as split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# When running on Colab with the corpus stored in Google Drive, mount the
# drive first (google.colab.drive.mount("/content/drive")) and point
# DATA_FILE at "/content/drive/MyDrive/Corpus_cleaned.csv".
DATA_FILE = "Corpus_cleaned.csv"

# One seed shared by every stochastic step so that runs are reproducible.
RANDOM_STATE = 42

# load_dataset() returns a `datasets` object, not a path, so it cannot be
# handed to pd.read_csv(). Request the single "train" split that the CSV
# builder produces and convert it to a DataFrame instead.
dataset = load_dataset("csv", data_files=DATA_FILE, split="train")
df = dataset.to_pandas()

corpus = df.iloc[:, 1]
labels = df.iloc[:, 2]
print(corpus[0])
print(labels[0])

# Missing texts become a single blank so the vectorizer never sees NaN.
corpus = corpus.fillna(' ')

# Encode the labels as consecutive integers (classes sorted by LabelEncoder).
le = preprocessing.LabelEncoder()
le.fit(labels)
y = le.transform(labels)
print(y[0])

# Use 80% for training. The split happens BEFORE vectorisation and SMOTE so
# that nothing derived from the test rows (vocabulary, IDF weights, synthetic
# neighbours) can leak into training and inflate the reported scores.
corpus_train, corpus_test, y_train_raw, y_test = split(
    corpus, y, train_size=0.8, random_state=RANDOM_STATE
)

# Vectorisation: fit on the training text only, reuse it for the test text.
# The matrices stay sparse; SMOTE and MLPClassifier both accept sparse input,
# and a dense TF-IDF matrix can exhaust memory on a large vocabulary.
transformer = TfidfVectorizer()
X_train_raw = transformer.fit_transform(corpus_train)
X_test = transformer.transform(corpus_test)

# Class balancing, applied to the training split only.
smote = SMOTE(random_state=RANDOM_STATE)
X_train, y_train = smote.fit_resample(X_train_raw, y_train_raw)

print(f'''Cambio de X antes de SMOTE: {X_train_raw.shape}
Cambio de X despues de SMOTE: {X_train.shape}''')

print('\nBalance positivo y negativo de la clases (%):')
unique, counts = np.unique(y_train, return_counts=True)
# Real percentages per class (the raw counts times 100 are not percentages).
print(counts / counts.sum() * 100)

# Build and train the MLP model.
clf = MLPClassifier(
    max_iter=100,
    activation='tanh',
    alpha=0.05,
    hidden_layer_sizes=(100,),
    learning_rate='adaptive',
    solver='adam',
    random_state=RANDOM_STATE,
)
clf.fit(X_train, y_train)

# Evaluate on the held-out, un-resampled test split.
preds = clf.predict(X_test)
print(report(y_test, preds))

# Smoke test on a single tweet.
tweet = "dia de la madre"
vectTweet = transformer.transform(np.array([tweet]))
prediction = clf.predict(vectTweet)
# NOTE(review): the 0 -> 'ciberbullying' / 1 -> 'no' mapping is hard-coded and
# assumes a particular LabelEncoder class order; confirm against le.classes_.
print('El tweet es', 'ciberbullying' if prediction[0]==0 else 'no' if prediction[0]==1 else 'otro')
print(prediction[0])