# Chat-MIA / app.py
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import pickle
import gradio as gr
from github import Github

# Read the GitHub token from the environment instead of hard-coding a
# personal access token in source, as the original did in plain text.
g = Github(os.environ["GITHUB_TOKEN"])
repo = g.get_repo('mss3d2008/Chat_MIA')
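# The token needs write access to the repository contents: chat_interface
# below pushes updated copies of dialogs.csv and dialogs.txt via repo.update_file.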
# dialogs.csv is tab-separated with no header: one "Prompt\tAnswer" pair per line.
file_path = "dialogs.csv"
df = pd.read_csv(file_path, sep='\t', header=None, names=['Prompt', 'Answer'])

# Module-level training matrices; assigned below and reassigned inside
# train_creative_model.
X = 0
y = 0
def train_creative_model(text_data):
    if not text_data:
        print("Not enough data to train the creative model.")
        return None, None, None
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text_data)
    total_words = len(tokenizer.word_index) + 1
    # Build (current word -> next word) training pairs from each
    # tab-separated segment of each line. The original paired every token
    # with itself, which would only teach the model the identity mapping.
    inputs, targets = [], []
    for line in text_data:
        for segment in line.split('\t'):
            token_list = tokenizer.texts_to_sequences([segment])[0]
            for i in range(len(token_list) - 1):
                inputs.append(token_list[i])
                targets.append(token_list[i + 1])
    if not inputs:
        print("Not enough sequences to train the creative model.")
        return None, None, None
    # Kept from the original design: the training matrices live in the
    # module-level X and y.
    global X
    global y
    X = np.array(inputs).reshape(-1, 1)
    y = tf.keras.utils.to_categorical(targets, num_classes=total_words)
    # Single-token context model: embed one word, run it through an LSTM,
    # and predict a distribution over the next word.
    model = Sequential()
    model.add(Embedding(total_words, 50, input_length=1))
    model.add(LSTM(100))
    model.add(Dense(total_words, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(X, y, epochs=50, verbose=0)
    return model, tokenizer, None  # None stands in for creative_max_sequence_length
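# A minimal usage sketch (assuming dialogs.txt holds tab-separated
# prompt/answer lines, which is what the loop above expects):
#   with open('dialogs.txt') as f:
#       model, tok, _ = train_creative_model(f.readlines())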
# df was already loaded above with the tab separator; the original re-read
# the file here without sep='\t', which would mis-parse it, so the duplicate
# read is dropped.
# Build a TF-IDF vectorizer, vectorize the prompts, and use the answers as labels.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Prompt']).toarray()
y = df['Answer']

# Split into training and test sets (the test split is unused while
# retraining is disabled below).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize the base models and combine them in a voting classifier.
tree_model = DecisionTreeClassifier()
nn_model = MLPClassifier(batch_size=32)
voting_clf = VotingClassifier(estimators=[('tree', tree_model), ('nn', nn_model)], voting='hard')
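# With voting='hard', each estimator casts one vote and the majority label
# wins; with only two estimators, ties are broken in favor of the class that
# sorts first in the label order.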
print("Error? Maybe")
creative_model, creative_tokenizer, _ = train_creative_model('dialogs.txt')
print("Error? Maybe, but its part 2")
with open('Creative_model.pkl', 'wb') as file:
pickle.dump(creative_model, file, protocol=pickle.HIGHEST_PROTOCOL)
# Retraining the voting classifier is disabled; the pickled model loaded
# below is used instead.
'''
voting_clf.fit(X_train, y_train)
with open('Voting_model.pkl', 'wb') as file:
    pickle.dump(voting_clf, file, protocol=pickle.HIGHEST_PROTOCOL)
'''
print("Wat")
with open('Voting_model.pkl', 'rb') as file:
voting_model = pickle.load(file)
#voting_clf = pickle.load(file)
with open('Creative_model.pkl', 'rb') as file:
creative_model = pickle.load(file)
def get_combined_response(prompt, voting_model, creative_model, tokenizer, creative_max_sequence_length):
    # Vectorize the prompt and let the voting classifier pick a stored answer
    # (the classifier was trained with the answers themselves as labels).
    prompt_vector = vectorizer.transform([prompt]).toarray()
    predicted_answer = voting_model.predict(prompt_vector)[0]
    # Use the prompt that maps to that answer as the seed for the creative model.
    seed_text = df.loc[df['Answer'] == predicted_answer, 'Prompt'].values[0]
    creative_response = generate_creative_text(seed_text, CREATIVE_NEXT_WORDS, creative_model, tokenizer, creative_max_sequence_length)
    combined = "Answer 1: " + predicted_answer + " // Answer 2: " + creative_response
    return combined, predicted_answer, creative_response
def generate_creative_text(seed_text, next_words, model, tokenizer, max_sequence_length):
    # Greedy next-word generation: repeatedly predict the most likely word
    # and append it to the running text.
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        if max_sequence_length is not None:
            # Pre-padding/truncating keeps the most recent tokens; the padded
            # length must match the model's expected input length.
            token_list = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding='pre')
        else:
            token_list = np.array([token_list])
        predicted_probabilities = model.predict(token_list, verbose=0)
        predicted = np.argmax(predicted_probabilities)
        # Map the predicted index back to its word (empty string if unknown).
        output_word = tokenizer.index_word.get(predicted, "")
        seed_text += " " + output_word
    return seed_text
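# Hypothetical example: generate_creative_text("how are", 3, creative_model,
# creative_tokenizer, None) appends the model's three most likely next words
# one at a time, producing something like "how are you doing today".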
# Generation constants. The creative model trained above uses a single-token
# context, so the padded input length (max_sequence_length - 1) must be 1;
# adjust this if the pickled model was trained with a longer context.
creative_max_sequence_length = 2
CREATIVE_NEXT_WORDS = 10  # number of words the creative model appends
def chat_interface(prompt, score, correct_response):
    file_path = "dialogs.csv"
    user_input = prompt
    df = pd.read_csv(file_path, sep='\t', header=None, names=['Prompt', 'Answer'])
    response, logical_response, creative_response = get_combined_response(
        user_input, voting_model, creative_model, creative_tokenizer, creative_max_sequence_length)
    print(f"Model Response: {response}")

    # A low score (< 3) means the user is correcting the model.
    if score < 3:
        # Only store the correction if it differs from the logical response
        # (the original compared against the combined string, which could
        # never match).
        if correct_response.lower() != logical_response.lower():
            new_data = {'Prompt': user_input, 'Answer': correct_response}
            # df._append is a private pandas method; pd.concat is the
            # supported equivalent.
            df = pd.concat([df, pd.DataFrame([new_data])], ignore_index=True)
            with open('dialogs.txt', 'a') as dialogs_file:
                dialogs_file.write(f"{user_input}\t{correct_response}\n")
            # Hooks for incremental retraining of the voting classifier
            # (disabled in the original):
            # new_X = vectorizer.transform([user_input]).toarray()
            # new_y = [correct_response]
            # X = np.concatenate((X, new_X))
            # y = np.concatenate((y, new_y))
            # voting_clf.fit(X, y)
            print("Thanks for your correction! The model has been updated; next time it will take your corrected answers into account.")

            # Sync the local clone before pushing the updated data files.
            import git
            # Replace 'your/repository/path' with the actual path to your local Git repository.
            repo_path = 'your/repository/path'
            local_repo = git.Repo(repo_path)
            origin = local_repo.remote(name='origin')
            origin.fetch()
            origin.pull('main')  # use your branch name if it is not 'main'

            # Save the DataFrame in the same tab-separated, headerless layout
            # it is read with, then push it via the PyGithub handle `repo`.
            # update_file needs the file's blob SHA from get_contents (the
            # original passed a commit SHA, which PyGithub rejects).
            df.to_csv('dialogs.csv', sep='\t', index=False, header=False)
            with open('dialogs.csv', 'r') as file:
                data = file.read()
            contents = repo.get_contents('dialogs.csv', ref='main')
            repo.update_file(contents.path, 'Upload CSV', data, contents.sha, branch='main')

            # Push dialogs.txt as well (it was already appended to above; the
            # original appended the same line a second time here).
            with open('dialogs.txt', 'r') as file:
                data_txt = file.read()
            contents_txt = repo.get_contents('dialogs.txt', ref='main')
            repo.update_file(contents_txt.path, 'Upload TXT', data_txt, contents_txt.sha, branch='main')
    return logical_response, creative_response
# Build and launch the Gradio interface: a prompt, a numeric score, and an
# optional corrected response; outputs are the logical and creative answers.
iface = gr.Interface(fn=chat_interface, inputs=["text", "number", "text"], outputs=["text", "text"])
iface.launch()
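# When run locally, Gradio serves the UI at http://127.0.0.1:7860 by default.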