mss3d committed on
Commit
9e983a3
1 Parent(s): d8af91b

Add the app.py file with the AI code

Files changed (1) hide show
  1. app.py +172 -0
app.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.feature_extraction.text import CountVectorizer
4
+ from sklearn.tree import DecisionTreeClassifier
5
+ from sklearn.model_selection import train_test_split
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
+ from sklearn.neural_network import MLPClassifier
8
+ from sklearn.ensemble import VotingClassifier
9
+ from tensorflow.keras.models import Sequential
10
+ from tensorflow.keras.layers import Embedding, LSTM, Dense
11
+ from tensorflow.keras.preprocessing.text import Tokenizer
12
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
13
+ import tensorflow as tf
14
+ from joblib import dump
15
+ from joblib import load
16
+ from google.colab import drive
17
+ import gdown
18
+ from pydrive.auth import GoogleAuth
19
+ from pydrive.drive import GoogleDrive
20
+ import pickle
21
+ import gradio as gr
22
+
23
+
24
def train_creative_model(text_data):
    """Train a small next-word LSTM language model on tab-separated text lines.

    Parameters
    ----------
    text_data : list[str]
        Training lines; each line may contain several tab-separated segments.

    Returns
    -------
    tuple
        ``(model, tokenizer, None)`` on success, or ``(None, None, None)``
        when there is not enough data.  The third element is a placeholder
        for ``creative_max_sequence_length``, which this trainer does not use.
    """
    if not text_data:  # empty list / None — nothing to learn from
        print("No hay suficientes datos para entrenar el modelo creativo.")
        return None, None, None

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text_data)
    total_words = len(tokenizer.word_index) + 1

    # Build (current_word, next_word) training pairs.
    # Bug fix: the original used the same token as both input and target
    # (y = to_categorical(X)), which teaches the model the identity mapping
    # and makes generation merely repeat the previous word.
    inputs = []
    targets = []
    for line in text_data:
        for segment in line.split('\t'):  # segments are tab-separated
            token_list = tokenizer.texts_to_sequences([segment])[0]
            for i in range(len(token_list) - 1):
                inputs.append(token_list[i])
                targets.append(token_list[i + 1])

    if not inputs:
        print("No hay suficientes secuencias para entrenar el modelo creativo.")
        return None, None, None

    X = np.array(inputs)
    y = tf.keras.utils.to_categorical(np.array(targets), num_classes=total_words)

    # One-token context (input_length=1) -> softmax over the vocabulary.
    model = Sequential()
    model.add(Embedding(total_words, 50, input_length=1))
    model.add(LSTM(100))
    model.add(Dense(total_words, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(X, y, epochs=50, verbose=0)

    return model, tokenizer, None  # None: creative_max_sequence_length unused here
58
+
59
# ----- Data loading and retrieval-model setup (module level) -----

# Conversation data: one row per (Prompt, Answer) pair — assumes these two
# columns exist in dialogs.csv (TODO confirm schema).
file_path = 'dialogs.csv'
df = pd.read_csv(file_path)

# TF-IDF vectorizer over the prompts; the raw answers serve as class labels.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Prompt']).toarray()
y = df['Answer']  # Utilizar las respuestas como etiquetas

# Train/test split (the split results are currently unused below; kept for parity).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fresh, *unfitted* estimators — note voting_clf is never fit in this script;
# the fitted model used at inference time is loaded from disk below.
tree_model = DecisionTreeClassifier()
nn_model = MLPClassifier(batch_size=32)
voting_clf = VotingClassifier(estimators=[('tree', tree_model), ('nn', nn_model)], voting='hard')

# Load the pre-trained models.
# NOTE(security): pickle.load executes arbitrary code from the file — only
# load model files from a trusted source.
with open('Voting_model.pkl', 'rb') as file:
    voting_model = pickle.load(file)
# Bug fix: the original assigned this second load to voting_model as well,
# clobbering the voting model and leaving creative_model undefined.
with open('Creative_model.pkl', 'rb') as file:
    creative_model = pickle.load(file)
83
+
84
def get_combined_response(prompt, voting_model, creative_model, tokenizer, creative_max_sequence_length):
    """Answer *prompt* with both the retrieval model and the creative model.

    Parameters
    ----------
    prompt : str
        The user's input text.
    voting_model : fitted classifier
        Predicts an Answer label from the TF-IDF vector of the prompt.
    creative_model, tokenizer, creative_max_sequence_length
        Passed through to ``generate_creative_text``.

    Returns
    -------
    str
        Both answers joined into one display string.
    """
    # Vectorize with the module-level TF-IDF vectorizer and let the voting
    # classifier pick an answer (labels are the Answer strings themselves).
    prompt_vector = vectorizer.transform([prompt]).toarray()
    response_index = voting_model.predict(prompt_vector)[0]

    # Seed the creative model with the prompt that maps to the chosen answer.
    seed_text = df.loc[df['Answer'] == response_index, 'Prompt'].values[0]
    creative_response = generate_creative_text(seed_text, CREATIVE_NEXT_WORDS, creative_model, tokenizer, creative_max_sequence_length)

    # Bug fix: user-facing typo "Awnser" -> "Answer".
    return "Answer 1: " + df.loc[df['Answer'] == response_index, 'Answer'].values[0] + " // Answer 2: " + creative_response
96
+
97
+
98
def generate_creative_text(seed_text, next_words, model, tokenizer, max_sequence_length):
    """Greedily generate up to *next_words* words following *seed_text*.

    Parameters
    ----------
    seed_text : str
        Starting text; grows by one word per step and re-feeds the model.
    next_words : int
        Number of words to generate.
    model
        Keras-style model whose ``predict`` returns next-word probabilities.
    tokenizer
        Keras ``Tokenizer`` (or compatible) with ``texts_to_sequences`` and
        ``index_word``.
    max_sequence_length : int | None
        When given, input is left-padded/truncated to ``max_sequence_length - 1``.

    Returns
    -------
    str
        The seed text followed by the generated words.
    """
    generated_text = seed_text
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        if max_sequence_length is not None:
            # Left-pad so the input matches the length the model expects.
            token_list = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding='pre')
        else:
            token_list = [token_list]

        # Greedy decoding: always take the most probable next token.
        predicted_probabilities = model.predict(token_list, verbose=0)
        predicted = np.argmax(predicted_probabilities)

        # Idiom fix: use the tokenizer's reverse index instead of scanning
        # word_index linearly on every step (O(1) vs O(vocabulary)).
        # Index 0 (padding) has no word and yields "" exactly as before.
        output_word = tokenizer.index_word.get(int(predicted), "")

        seed_text += " " + output_word
        generated_text += " " + output_word

    return generated_text
121
+
122
+
123
+
124
+
125
# Load your models and other necessary components here

# Sequence length the creative model expects at inference time.
# NOTE(review): placeholder value — must match what training actually used.
creative_max_sequence_length = 10  # Replace with the correct value used during training
# NOTE(review): not referenced anywhere in this file — kept for callers/config.
VOTING_RESPONSE_INDEX = 0  # Replace with the correct index for voting model responses
# How many words generate_creative_text appends to the retrieved seed prompt.
CREATIVE_NEXT_WORDS = 10  # Replace with the desired number of creative next words
130
+
131
def chat_interface(user_input):
    """Gradio handler: answer *user_input*, then run a console feedback loop.

    Returns the combined model response so Gradio can display it.
    """
    # df / X / y are rebound below, so they must be declared global; without
    # this the original raised UnboundLocalError on the first correction
    # (the local `df` was read before assignment).
    global df, X, y

    # Bug fix: use the fitted model loaded from Voting_model.pkl — voting_clf
    # is never fit in this script (its fit() call is commented out), so
    # calling predict on it raised NotFittedError.
    # NOTE(review): creative_model / creative_tokenizer are assumed to be
    # defined at module level before launch — TODO confirm.
    response = get_combined_response(user_input, voting_model, creative_model, creative_tokenizer, creative_max_sequence_length)

    # Display the response to the user
    print(f"Model Response: {response}")

    # HACK: input() reads from the *server* console and cannot reach remote
    # Gradio users; kept for parity with the original design — confirm the
    # deployment context.
    try:
        score = int(input(f"Puntuación para la respuesta '{response}': "))
    except ValueError:
        score = 5  # unparsable feedback: treat as "no correction needed"

    # Efficient retraining process
    if score <= 2:
        # Ask the user for the correct response
        correct_response = input(f"La respuesta actual es '{response}'. ¿Cuál es la respuesta correcta?: ")

        # Update the data only if the correction actually differs.
        if correct_response.lower() != response.lower():
            new_data = {'Prompt': user_input, 'Answer': correct_response}
            # Bug fix: DataFrame.append was removed in pandas 2.0.
            df = pd.concat([df, pd.DataFrame([new_data])], ignore_index=True)

            with open('dialogs.txt', 'a') as dialogs_file:
                dialogs_file.write(f"{user_input}\t{correct_response}\n")

            new_X = vectorizer.transform([user_input]).toarray()
            new_y = [correct_response]
            X = np.concatenate((X, new_X))
            y = np.concatenate((y, new_y))

            # Re-train the voting classifier with the new data
            #voting_clf.fit(X, y)

            print("¡Gracias por tu corrección! El modelo ha sido actualizado para mejorar. La próxima vez el modelo tendrá en cuenta tus respuestas correctas.")
        else:
            print("Entendido. No se necesita corrección.")
    else:
        print("¡Gracias por tu retroalimentación!")

    # Persist the (possibly extended) dataset.
    df.to_csv('dialogs.csv', index=False)

    # Bug fix: the original returned None, so the Gradio text output stayed
    # empty even though outputs="text" was declared.
    return response
169
+
170
# Create a Gradio interface
# Single text box in, single text box out; chat_interface does the work.
# NOTE(review): chat_interface also prompts on the server console via
# input(), which blocks per request — confirm this is only run locally.
iface = gr.Interface(fn=chat_interface, inputs="text", outputs="text")
iface.launch()