import os
import pickle

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import gradio as gr
from github import Github, GithubException

# The GitHub token is read from the environment; the original script embedded
# a personal access token, which must never be committed to a repository.
g = Github(os.environ['GITHUB_TOKEN'])
repo = g.get_repo('mss3d2008/Chat_MIA')

# dialogs.csv holds tab-separated Prompt/Answer pairs with no header row.
file_path = 'dialogs.csv'
df = pd.read_csv(file_path, sep='\t', header=None, names=['Prompt', 'Answer'])


def train_creative_model(text_data):
    """Train a small next-word LSTM on the dialog lines."""
    if not text_data:
        print("Not enough data to train the creative model.")
        return None, None, None

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text_data)
    total_words = len(tokenizer.word_index) + 1

    # Build (current word, next word) pairs from each tab-separated field.
    input_sequences = []
    for line in text_data:
        for field in line.split('\t'):
            token_list = tokenizer.texts_to_sequences([field])[0]
            for i in range(1, len(token_list)):
                input_sequences.append(token_list[i - 1:i + 1])

    if not input_sequences:
        print("Not enough sequences to train the creative model.")
        return None, None, None

    sequences = np.array(input_sequences)
    X_seq = sequences[:, :1]  # current word, shape (n_samples, 1)
    y_seq = tf.keras.utils.to_categorical(sequences[:, 1], num_classes=total_words)  # next word

    model = Sequential()
    model.add(Embedding(total_words, 50))  # no fixed input_length, so longer seeds also work
    model.add(LSTM(100))
    model.add(Dense(total_words, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(X_seq, y_seq, epochs=50, verbose=0)

    return model, tokenizer, None  # None stands in for creative_max_sequence_length


# Build a TF-IDF representation of the prompts; the answers are the labels.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Prompt']).toarray()
y = df['Answer']

# Split the data into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the base models and combine them in a hard-voting classifier.
tree_model = DecisionTreeClassifier()
nn_model = MLPClassifier(batch_size=32)
voting_clf = VotingClassifier(estimators=[('tree', tree_model), ('nn', nn_model)], voting='hard')

print("Error? Maybe")
# Train the creative model on the raw dialog lines (not on the filename,
# which is what the original code passed in).
with open('dialogs.txt') as dialogs_file:
    dialog_lines = dialogs_file.read().splitlines()
creative_model, creative_tokenizer, _ = train_creative_model(dialog_lines)
print("Error? Maybe, but it's part 2")

with open('Creative_model.pkl', 'wb') as file:
    pickle.dump(creative_model, file, protocol=pickle.HIGHEST_PROTOCOL)

# The voting classifier was trained and pickled once; the block stays
# commented out so later runs simply load the saved model.
'''
print("Error? Maybe, but it's part 2.5")
voting_clf.fit(X_train, y_train)
print("Error? Maybe, but it's part 3")
with open('Voting_model.pkl', 'wb') as file:
    pickle.dump(voting_clf, file, protocol=pickle.HIGHEST_PROTOCOL)
'''

print("Wat")
with open('Voting_model.pkl', 'rb') as file:
    voting_model = pickle.load(file)
with open('Creative_model.pkl', 'rb') as file:
    creative_model = pickle.load(file)
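
# A minimal retraining helper (sketch). It wraps the same steps as the
# commented-out block above so Voting_model.pkl can be regenerated after
# dialogs.csv changes; the function name and the `path` parameter are
# illustrative additions, not part of the original script.
def retrain_voting_model(path='Voting_model.pkl'):
    voting_clf.fit(X_train, y_train)
    with open(path, 'wb') as f:
        pickle.dump(voting_clf, f, protocol=pickle.HIGHEST_PROTOCOL)
    return voting_clf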

def get_combined_response(prompt, voting_model, creative_model, tokenizer, creative_max_sequence_length):
    print("Generating (Part 1)")
    prompt_vector = vectorizer.transform([prompt]).toarray()

    # The voting classifier predicts an answer string directly.
    print("Generating (Part 5)")
    predicted_answer = voting_model.predict(prompt_vector)[0]

    # Use the prompt that matches the predicted answer as the creative seed.
    print("Generating (Part 4)")
    seed_text = df.loc[df['Answer'] == predicted_answer, 'Prompt'].values[0]

    print("Generating (Part 6)")
    creative_response = generate_creative_text(seed_text, CREATIVE_NEXT_WORDS, creative_model,
                                               tokenizer, creative_max_sequence_length)

    print("Generating (Part 2)")
    combined = "Answer 1: " + predicted_answer + " // Answer 2: " + creative_response
    return combined, predicted_answer, creative_response


def generate_creative_text(seed_text, next_words, model, tokenizer, max_sequence_length):
    """Extend seed_text one word at a time with the creative LSTM."""
    generated_text = seed_text
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        if max_sequence_length is not None:
            token_list = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding='pre')
        else:
            token_list = np.array([token_list])
        predicted_probabilities = model.predict(token_list, verbose=0)
        predicted = np.argmax(predicted_probabilities)

        # Map the predicted index back to its word.
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
        generated_text += " " + output_word
    return generated_text


# Generation settings.
creative_max_sequence_length = 10  # replace with the value used during training
CREATIVE_NEXT_WORDS = 10           # number of words the creative model appends
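
# Quick smoke test (sketch): exercise the pipeline outside Gradio. The prompt
# string is a hypothetical example; uncomment once the models above are loaded.
# combined, logical, creative = get_combined_response(
#     "how are you", voting_model, creative_model,
#     creative_tokenizer, creative_max_sequence_length)
# print(combined)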

def push_file_to_repo(path, message):
    """Create the file on GitHub, or update it if it already exists.
    (The original called repo.create_file with an undefined `data` variable,
    and create_file fails once the file exists.)"""
    with open(path, 'r') as f:
        content = f.read()
    try:
        existing = repo.get_contents(path, ref='main')
        repo.update_file(path, message, content, existing.sha, branch='main')
    except GithubException:
        repo.create_file(path, message, content, branch='main')


def chat_interface(prompt, score, correct_response):
    print("Generating (Part 0)")
    user_input = prompt

    # Re-read the dialog file so corrections from earlier calls are visible.
    df = pd.read_csv(file_path, sep='\t', header=None, names=['Prompt', 'Answer'])

    response, Logical_response, Creative_response = get_combined_response(
        user_input, voting_model, creative_model, creative_tokenizer,
        creative_max_sequence_length)

    # Display the response to the user.
    print(f"Model Response: {response}")

    if score < 3:
        # Store the correction only if it differs from the model's logical answer.
        if correct_response.lower() != Logical_response.lower():
            new_data = {'Prompt': user_input, 'Answer': correct_response}
            df = pd.concat([df, pd.DataFrame([new_data])], ignore_index=True)

            with open('dialogs.txt', 'a') as dialogs_file:
                dialogs_file.write(f"{user_input}\t{correct_response}\n")

            new_X = vectorizer.transform([user_input]).toarray()
            new_y = [correct_response]
            global X
            global y
            # Retraining on the corrected data is currently disabled:
            # X = np.concatenate((X, new_X))
            # y = np.concatenate((y, new_y))
            # voting_clf.fit(X, y)
            print("Thanks for your correction! The model has been updated to improve. "
                  "Next time it will take your correct answers into account.")
        else:
            print("Understood. No correction needed.")
    else:
        print("Thanks for your feedback!")

    # Save the updated dialog file (same tab-separated, headerless format it
    # was read with) and push both files to the GitHub repo.
    df.to_csv('dialogs.csv', sep='\t', header=False, index=False)
    push_file_to_repo('dialogs.csv', 'upload csv')
    push_file_to_repo('dialogs.txt', 'upload txt')

    return Logical_response, Creative_response


# Create the Gradio interface: prompt, score, and the user's correction in;
# logical and creative responses out.
iface = gr.Interface(fn=chat_interface,
                     inputs=["text", "number", "text"],
                     outputs=["text", "text"])

# Launch the Gradio interface.
iface.launch()
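
# Optional (sketch): Gradio can also expose a temporary public URL for the
# demo by passing share=True to launch, i.e. iface.launch(share=True).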