import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import pickle
import gradio as gr
import github
import os

# Load the tab-separated dialog dataset: one "Prompt\tAnswer" pair per line.
file_path = "dialogs.csv"
df = pd.read_csv(file_path, sep='\t', header=None, names=['Prompt', 'Answer'])

# Globals populated by train_creative_model (kept for inspection).
X = None
y = None


def train_creative_model(text_data):
    """Train the creative hemisphere: an LSTM next-word predictor over the dialog lines."""
    if not text_data:
        print("Not enough data to train the creative model.")
        return None, None, None

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text_data)
    total_words = len(tokenizer.word_index) + 1

    # Build n-gram prefixes: each utterance contributes every prefix of
    # length >= 2, so the model learns to predict the next word.
    input_sequences = []
    for line in text_data:
        for utterance in line.split('\t'):
            token_list = tokenizer.texts_to_sequences([utterance])[0]
            for i in range(1, len(token_list)):
                input_sequences.append(token_list[:i + 1])

    if not input_sequences:
        print("Not enough sequences to train the creative model.")
        return None, None, None

    max_sequence_length = max(len(seq) for seq in input_sequences)
    sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

    global X
    global y
    # Inputs are the padded prefixes minus their last token; the target is
    # that last token, one-hot encoded over the vocabulary.
    X = sequences[:, :-1]
    y = tf.keras.utils.to_categorical(sequences[:, -1], num_classes=total_words)

    model = Sequential()
    model.add(Embedding(total_words, 50))
    model.add(LSTM(100))
    model.add(Dense(total_words, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(X, y, epochs=50, verbose=0)

    # Also return the sequence length so generation can pad inputs the same way.
    return model, tokenizer, max_sequence_length
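
# A minimal usage sketch (the sample lines are made up, not from the real
# dataset): train_creative_model expects tab-separated "prompt\tanswer" lines.
#   sample_lines = ["hi there\thello", "how are you\ti am fine"]
#   model, tok, max_len = train_creative_model(sample_lines)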

# Logical hemisphere: TF-IDF features over the prompts feed a hard-voting
# ensemble of a decision tree and a small MLP that maps prompts to answers.
vectorizer = TfidfVectorizer()

X = vectorizer.fit_transform(df['Prompt']).toarray()
y = df['Answer']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

tree_model = DecisionTreeClassifier()
nn_model = MLPClassifier(batch_size=32)

voting_clf = VotingClassifier(estimators=[('tree', tree_model), ('nn', nn_model)], voting='hard')
voting_clf.fit(X_train, y_train)

# Creative hemisphere: train on the dialog lines themselves (prompt and answer
# joined by a tab, the format train_creative_model expects), not on a file name.
dialog_lines = (df['Prompt'] + '\t' + df['Answer']).tolist()
creative_model, creative_tokenizer, creative_max_sequence_length = train_creative_model(dialog_lines)

# Persist both hemispheres, then reload them the way the app does at startup.
with open('Voting_model.pkl', 'wb') as file:
    pickle.dump(voting_clf, file, protocol=pickle.HIGHEST_PROTOCOL)

with open('Creative_model.pkl', 'wb') as file:
    pickle.dump(creative_model, file, protocol=pickle.HIGHEST_PROTOCOL)

with open('Voting_model.pkl', 'rb') as file:
    voting_model = pickle.load(file)

with open('Creative_model.pkl', 'rb') as file:
    creative_model = pickle.load(file)

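# Note: pickling a Keras model works on recent TensorFlow versions but can be
# fragile across versions; creative_model.save("Creative_model.keras") is the
# more portable alternative if this ever fails.
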
def get_combined_response(prompt, voting_model, creative_model, tokenizer, creative_max_sequence_length):
    # Logical answer: the voting classifier predicts an answer string directly.
    prompt_vector = vectorizer.transform([prompt]).toarray()
    logical_response = voting_model.predict(prompt_vector)[0]

    # Creative answer: seed the generator with the prompt that maps to that answer.
    seed_text = df.loc[df['Answer'] == logical_response, 'Prompt'].values[0]
    creative_response = generate_creative_text(seed_text, CREATIVE_NEXT_WORDS, creative_model,
                                               tokenizer, creative_max_sequence_length)

    combined = "Answer 1: " + logical_response + " // Answer 2: " + creative_response
    return combined, logical_response, creative_response

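# A minimal usage sketch (the prompt string here is made up): querying both
# hemispheres for one input returns the combined string plus each answer.
#   combined, logical, creative = get_combined_response(
#       "hi, how are you doing?", voting_model, creative_model,
#       creative_tokenizer, creative_max_sequence_length)
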
def generate_creative_text(seed_text, next_words, model, tokenizer, max_sequence_length):
    """Generate text word by word: repeatedly predict and append the most likely next word."""
    generated_text = seed_text
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        if max_sequence_length is not None:
            # Match the training input shape: prefixes padded to max_sequence_length - 1.
            token_list = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding='pre')
        else:
            token_list = np.array([token_list])

        predicted_probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(predicted_probabilities))

        # Map the predicted index back to its word (index 0 is reserved for padding).
        output_word = tokenizer.index_word.get(predicted, "")
        seed_text += " " + output_word
        generated_text += " " + output_word

    return generated_text

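# A minimal usage sketch (the seed phrase is made up): extend a seed by five words.
#   print(generate_creative_text("how are", 5, creative_model,
#                                creative_tokenizer, creative_max_sequence_length))
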
# Number of words the creative hemisphere appends to its seed text. (The
# padding length now comes from train_creative_model instead of a hard-coded
# value, so it always matches the trained model.)
CREATIVE_NEXT_WORDS = 10


def append_line_to_repo_file(repo, path, line):
    """Append one corrected line to a file in the dataset repo.

    The two GitHub update blocks below were near-identical, so the logic is
    factored into this helper.
    """
    file = repo.get_contents(path, ref="main")
    repo.update_file(file.path, "Add corrected dialog pair",
                     f"{file.decoded_content.decode()}\n{line}", file.sha, branch="main")


def chat_interface(prompt, score, correct_response):
    print("Generating (Part 0)")
    user_input = prompt

    response, Logical_response, Creative_response = get_combined_response(
        user_input, voting_model, creative_model, creative_tokenizer, creative_max_sequence_length)

    print(f"Model Response: {response}")

    # Personal access token for the dataset repo, read from the "token" env var.
    token = os.environ.get('token')

    # Low-scored answers with a user-supplied correction are appended to both
    # dataset files on GitHub (and to a local copy) for future retraining.
    if score < 3:
        if correct_response.lower() != response.lower():
            g = github.Github(token)
            repo = g.get_repo("ChatMIADatabase2/Chat_MIA")

            new_line = f"{user_input}\t{correct_response}"
            append_line_to_repo_file(repo, "dialogs.txt", new_line)

            with open('MIA_Dataset.txt', 'a') as dialogs_file:
                dialogs_file.write(f"\n{new_line}")

            append_line_to_repo_file(repo, "MIA_Dataset.txt", new_line)

    return Logical_response, Creative_response

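# A minimal usage sketch (inputs made up): the Gradio callback can also be
# called directly, e.g. with the default score of 5 and no correction.
#   logical, creative = chat_interface("hi, how are you?", 5, "")
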
welcome_text = """ |
|
<h1 style="text-align: center;">Chat MHAI</h1> |
|
<p>Chat MHAI is a multi-hemispherical AI that can generate text. The project is still in development, and this is the first version of it.</p> |
|
<p>How does it work? Chat MHAI uses two AIs (called hemispheres). The logical one and the creative one. Each hemisphere is a different AI model trained on the same data. When the AI gets prompted something both models look for the best answer. Then the best answer for each model is outputted.</p> |
|
<p>Sometimes the answer of one or of both models will be incoherent or weird; this is because the AI is still in its development so results will improve in future updates. But you can help too.</p> |
|
<p>How to help: If you would like to help us it is very simple. When you generate text give it a score. The default score is 5 but if the answer is weird or incorrect give it a lower score and fill the text box where it says, “correct answer” (Correct answers will only be saved if the score is lower than 3 for practical reasons) that will be saved in the database and then used to train the model in future occasions. Another way you can help is providing data (all data is well received).</p> |
|
<p>This AI was creating as both a school and passion project this first version was made to hand in for the project and meet the deadline, but I will keep updating the model and making it better through time. And I will update the data regularly with all the inputs from the correct answers.</p> |
|
<p>The dataset used in this model is “Dataset for chatbot” by GRAF STOR in Kaggle <a href="https://www.kaggle.com/datasets/grafstor/simple-dialogs-for-chatbot/data" target="_blank">here</a>.</p> |
|
""" |
|
|
|
iface = gr.Interface(
    fn=chat_interface,
    description=welcome_text,
    inputs=[
        gr.Textbox(label="User Input", placeholder="Enter your message here..."),
        gr.Slider(label="Score", minimum=1, maximum=5, value=5, step=1),
        gr.Textbox(label="Correct Answer", placeholder="Correct Answer (if needed)")
    ],
    outputs=[
        gr.Textbox(label="Logical Response", placeholder="Logical Response"),
        gr.Textbox(label="Creative Response", placeholder="Creative Response")
    ]
)

iface.launch()
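# If a public URL is needed (for example when running from a notebook),
# Gradio can expose one with iface.launch(share=True) instead.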
|