File size: 6,600 Bytes
9e983a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from joblib import dump
from joblib import load
from google.colab import drive
import gdown
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
import pickle
import gradio as gr


def train_creative_model(text_data):
    """Train a single-token LSTM "creative" model over *text_data*.

    Parameters
    ----------
    text_data : list[str]
        Lines of text; each line may contain tab-separated segments.

    Returns
    -------
    tuple
        (model, tokenizer, None) on success, or (None, None, None) when
        there is not enough data.  The third element is a placeholder for
        creative_max_sequence_length, which this trainer does not compute.
    """
    if not text_data or len(text_data) == 0:
        print("No hay suficientes datos para entrenar el modelo creativo.")
        return None, None, None

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text_data)
    total_words = len(tokenizer.word_index) + 1  # +1 for the reserved 0 index

    input_sequences = []
    for line in text_data:
        tokens = line.split('\t')  # split each line on tab separators
        for token in tokens:
            token_list = tokenizer.texts_to_sequences([token])[0]
            for i in range(len(token_list)):
                # NOTE(review): despite the name, this is a single token id,
                # not an n-gram sequence — confirm that was intended.
                n_gram_sequence = token_list[i]
                input_sequences.append(n_gram_sequence)

    if not input_sequences or len(input_sequences) == 0:
        print("No hay suficientes secuencias para entrenar el modelo creativo.")
        return None, None, None

    X = np.array(input_sequences)
    # NOTE(review): y is the one-hot encoding of X itself, so the network is
    # trained to predict each token from that same token — this looks like a
    # bug in the training-pair construction; confirm against the intended
    # next-word objective.
    y = tf.keras.utils.to_categorical(X, num_classes=total_words)

    model = Sequential()
    model.add(Embedding(total_words, 50, input_length=1))  # one token per sample
    model.add(LSTM(100))
    model.add(Dense(total_words, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(X, y, epochs=50, verbose=0)

    return model, tokenizer, None  # None placeholder for creative_max_sequence_length

# --- Module-level data loading and model setup ---
file_path = 'dialogs.csv'
df = pd.read_csv(file_path)

# TF-IDF vectorizer over the prompts; the answers are used directly as labels.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Prompt']).toarray()
y = df['Answer']

# Train/test split (the held-out portion is currently unused below).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hard-voting ensemble of a decision tree and a small MLP.
# NOTE(review): voting_clf is never fitted here — confirm whether the
# pickled voting model below should be used in its place.
tree_model = DecisionTreeClassifier()
nn_model = MLPClassifier(batch_size=32)
voting_clf = VotingClassifier(estimators=[('tree', tree_model), ('nn', nn_model)], voting='hard')

# Load the pre-trained models.  Bug fix: the original assigned BOTH pickles
# to `voting_model`, clobbering the voting model and leaving `creative_model`
# (used later by chat_interface) undefined.
with open('Voting_model.pkl', 'rb') as file:
    voting_model = pickle.load(file)
with open('Creative_model.pkl', 'rb') as file:
    creative_model = pickle.load(file)

def get_combined_response(prompt, voting_model, creative_model, tokenizer, creative_max_sequence_length):
    """Answer *prompt* using both the voting classifier and the creative LSTM.

    Parameters
    ----------
    prompt : str
        User input to answer.
    voting_model
        Fitted classifier whose prediction selects a canned answer from df.
    creative_model, tokenizer, creative_max_sequence_length
        Forwarded to generate_creative_text().

    Returns
    -------
    str
        The retrieved dataset answer and the generated continuation,
        joined into one display string.
    """
    prompt_vector = vectorizer.transform([prompt]).toarray()
    response_index = voting_model.predict(prompt_vector)[0]

    # Canned answer retrieved from the dataset, keyed by the predicted label.
    retrieved_answer = df.loc[df['Answer'] == response_index, 'Answer'].values[0]

    # Seed the creative generator with the prompt that matches that answer.
    seed_text = df.loc[df['Answer'] == response_index, 'Prompt'].values[0]
    creative_response = generate_creative_text(seed_text, CREATIVE_NEXT_WORDS, creative_model, tokenizer, creative_max_sequence_length)

    # Bug fix: corrected the "Awnser" typo in the user-facing string.
    return "Answer 1: " + retrieved_answer + " // Answer 2: " + creative_response


def generate_creative_text(seed_text, next_words, model, tokenizer, max_sequence_length):
    """Greedily extend *seed_text* by *next_words* predicted words.

    Parameters
    ----------
    seed_text : str
        Starting text; each generated word is appended before the next step.
    next_words : int
        Number of words to generate.
    model
        Keras-style model exposing predict(); returns per-word probabilities.
    tokenizer
        Keras-style tokenizer exposing texts_to_sequences() and word_index.
    max_sequence_length : int or None
        When given, inputs are pre-padded to this length minus one; when
        None, the raw token list is fed to the model directly.

    Returns
    -------
    str
        The seed text followed by the generated words.
    """
    # Build the id -> word lookup once, instead of scanning word_index for
    # every generated word (the original re-ran an O(vocab) loop per step).
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    # The original kept seed_text and generated_text in lockstep (appending
    # every word to both); a single accumulator is equivalent.
    generated_text = seed_text
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([generated_text])[0]
        if max_sequence_length is not None:
            token_list = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding='pre')
        else:
            token_list = [token_list]

        predicted_probabilities = model.predict(token_list, verbose=0)
        predicted = np.argmax(predicted_probabilities)

        # Index 0 is reserved by the Keras tokenizer, so unknown/reserved
        # ids map to the empty string, matching the original fallback.
        output_word = index_to_word.get(int(predicted), "")
        generated_text += " " + output_word

    return generated_text




# Load your models and other necessary components here

# NOTE(review): all three values below are placeholders left by the original
# author (see their inline comments) — confirm they match the values actually
# used when the creative model was trained.
creative_max_sequence_length = 10  # Replace with the correct value used during training
VOTING_RESPONSE_INDEX = 0  # Replace with the correct index for voting model responses
CREATIVE_NEXT_WORDS = 10  # Replace with the desired number of creative next words

def chat_interface(user_input):
    """Gradio handler: answer *user_input*, collect feedback, update the data.

    Prints the combined model response, asks the operator (via stdin) for a
    score, and on a low score appends the corrected pair to the dataset and
    feature matrices, then persists the DataFrame to dialogs.csv.

    Returns the combined response string so the Gradio text output shows it
    (the original returned None, leaving the output panel empty).
    """
    # Bug fix: df, X and y are module-level objects rebound below; without
    # this declaration the rebindings made them local and the earlier reads
    # raised UnboundLocalError.
    global df, X, y

    # NOTE(review): voting_clf is never fitted at module level, and
    # creative_model / creative_tokenizer are expected to be defined
    # elsewhere — confirm the intended model objects are wired in here.
    response = get_combined_response(user_input, voting_clf, creative_model, creative_tokenizer, creative_max_sequence_length)

    print(f"Model Response: {response}")

    # NOTE(review): input() blocks the serving process; a production Gradio
    # app would collect feedback through UI components instead — confirm.
    # (Mojibake in the Spanish strings below was repaired: the original text
    # was UTF-8 misdecoded as GBK.)
    score = int(input(f"Puntuación para la respuesta '{response}': "))

    # Low scores trigger the correction / retraining-data path.
    if score <= 2:
        correct_response = input(f"La respuesta actual es '{response}'. ¿Cuál es la respuesta correcta?: ")

        # Only record the correction when it actually differs.
        if correct_response.lower() != response.lower():
            # DataFrame.append was removed in pandas 2.0; pd.concat is the
            # supported equivalent.
            new_row = pd.DataFrame([{'Prompt': user_input, 'Answer': correct_response}])
            df = pd.concat([df, new_row], ignore_index=True)

            with open('dialogs.txt', 'a', encoding='utf-8') as dialogs_file:
                dialogs_file.write(f"{user_input}\t{correct_response}\n")

            new_X = vectorizer.transform([user_input]).toarray()
            new_y = [correct_response]
            X = np.concatenate((X, new_X))
            y = np.concatenate((y, new_y))

            # Re-training is intentionally deferred (kept from the original):
            # voting_clf.fit(X, y)

            print("¡Gracias por tu corrección! El modelo ha sido actualizado para mejorar. La próxima vez el modelo tendrá en cuenta tus respuestas correctas.")
        else:
            print("Entendido. No se necesita corrección.")
    else:
        print("¡Gracias por tu retroalimentación!")

    # Persist the (possibly updated) dataset.
    df.to_csv('dialogs.csv', index=False)

    return response

# Build and launch a Gradio UI wired to chat_interface (text in, text out).
iface = gr.Interface(fn=chat_interface, inputs="text", outputs="text")
iface.launch()