oscarwang2's picture
Update app.py
6663d52 verified
raw
history blame contribute delete
No virus
6.02 kB
import pandas as pd
import os
import gradio as gr
import threading
import time
from groq import Groq
# Initialize Groq client
# NOTE(review): Groq() takes no explicit key here — presumably it reads
# GROQ_API_KEY from the environment; confirm deployment config.
client = Groq()

# Constants
MAX_SIZE = 1.1 * 1024 * 1024 * 1024  # 1.1GB in bytes; CSV rotation threshold
DATA_DIRECTORY = 'data'
UPDATE_INTERVAL = 1  # Update interval in seconds between generation cycles

# Ensure the data directory exists
os.makedirs(DATA_DIRECTORY, exist_ok=True)

# Initialize variables (module-level state shared with the worker thread and UI)
file_index = 1  # numeric suffix of the CSV currently being written
current_file = os.path.join(DATA_DIRECTORY, f'data{file_index}.csv')
file_paths = [current_file]  # every CSV produced so far, in creation order
combined_tokens = 0  # running count of whitespace-separated words streamed so far
# Helper function to get file size
def get_file_size(filename):
    """Return the size of *filename* in bytes, or 0 when it is not a regular file."""
    if not os.path.isfile(filename):
        return 0
    return os.path.getsize(filename)
# Data generation and saving function
def generate_and_save_data():
    """Endless worker loop: ask the model to invent a user-style prompt,
    answer that prompt, and append the pair to size-rotated CSV files.

    Intended to run in a daemon thread (see module bottom). Mutates the
    module globals ``file_index``, ``current_file``, ``file_paths`` and
    ``combined_tokens``. Never returns; on any error it prints the message
    and retries after 5 seconds.
    """
    global file_index, current_file, file_paths, combined_tokens

    # Create the initial file (header row only) if it doesn't exist,
    # so later appends with header=False stay well-formed.
    if not os.path.isfile(current_file):
        pd.DataFrame(columns=["prompt", "response"]).to_csv(current_file, index=False)

    while True:
        try:
            # Step 1: generate a prompt (streamed response).
            completion = client.chat.completions.create(
                model="mixtral-8x7b-32768",
                messages=[
                    {
                        "role": "user",
                        "content": "give me a single prompt to prompt an ai model, simulating what users could want from you. ensure that it is diverse and high quality. for each, choose a random writing style (though it has to be a common one), random length and random clarity of the prompt. ensure that it is a single prompt, and just the prompt itself, nothing else. eg, don't close the prompt in quotation marks or say Here is a single prompt that meets your requirements or anything similar to that"
                    }
                ],
                temperature=1,
                max_tokens=1024,
                top_p=1,
                stream=True,
                stop=None,
            )
            prompt = ""
            # NOTE: "tokens" here are whitespace-separated words, not true
            # model tokens — an approximation used only for the UI counter.
            prompt_tokens = 0
            for chunk in completion:
                content = chunk.choices[0].delta.content
                if content:  # delta.content may be None on some stream chunks
                    prompt += content
                    prompt_tokens += len(content.split())

            # Step 2: use the generated prompt to query the model again.
            second_completion = client.chat.completions.create(
                model="mixtral-8x7b-32768",
                messages=[
                    {
                        "role": "user",
                        "content": prompt
                    }
                ],
                temperature=1,
                max_tokens=5000,
                top_p=1,
                stream=True,
                stop=None,
            )
            response = ""
            response_tokens = 0
            for chunk in second_completion:
                content = chunk.choices[0].delta.content
                if content:
                    response += content
                    response_tokens += len(content.split())

            # Update the combined token count shown in the UI.
            combined_tokens += (prompt_tokens + response_tokens)

            # Print the generated prompt and the response (server-side log).
            print("Generated prompt:", prompt)
            print("Response to the generated prompt:", response)

            # One-row DataFrame for this prompt/response pair.
            data = pd.DataFrame({"prompt": [prompt], "response": [response]})

            # Rotate to a fresh CSV once the current one reaches MAX_SIZE;
            # otherwise append (header was written when the file was created).
            if get_file_size(current_file) >= MAX_SIZE:
                file_index += 1
                current_file = os.path.join(DATA_DIRECTORY, f'data{file_index}.csv')
                file_paths.append(current_file)
                # Create the new file with headers
                with open(current_file, 'w') as f:
                    data.to_csv(f, header=True, index=False)
            else:
                # Append data to the current file
                with open(current_file, 'a') as f:
                    data.to_csv(f, header=False, index=False)

            # Wait for the next update interval.
            time.sleep(UPDATE_INTERVAL)
        except Exception as e:
            # Best-effort worker: log and retry rather than killing the thread.
            print(f"An error occurred: {e}. Retrying in 5 seconds...")
            time.sleep(5)
# Get available files
def get_available_files():
    """Return the subset of tracked CSV paths that currently exist on disk."""
    existing = []
    for path in file_paths:
        if os.path.isfile(path):
            existing.append(path)
    return existing
# Update file list
def update_file_list():
    """Build a Gradio update that refreshes the dropdown with on-disk CSV files."""
    available = get_available_files()
    return gr.update(choices=available)
# Update token count
def update_token_count():
    """Expose the module-level running word count to the UI layer."""
    total = combined_tokens
    return total
# Display file content
def display_file_content(selected_file):
    """Load the chosen CSV into a DataFrame; empty frame when nothing is selected."""
    if not selected_file:
        return pd.DataFrame()
    return pd.read_csv(selected_file)
# Start the data generation in a separate thread.
# daemon=True so this endless loop does not block interpreter shutdown
# when the Gradio app process exits.
thread = threading.Thread(target=generate_and_save_data)
thread.daemon = True
thread.start()
# Create Gradio interface
with gr.Blocks() as app:
    gr.Markdown("## AI Prompt and Response Generator")
    gr.Markdown("This app continuously generates AI prompts and responses, and writes them to CSV files.")

    # Dropdown seeded with whatever CSVs exist at page build time.
    file_selector = gr.Dropdown(label="Select a data file to view and download", choices=get_available_files())
    file_viewer = gr.DataFrame(label="CSV File Content")
    download_button = gr.File(label="Download Selected File")

    def download_file(selected_file):
        # Identity mapping: the selected path itself is handed to gr.File.
        return selected_file

    refresh_button = gr.Button("Refresh File List")
    refresh_button.click(update_file_list, outputs=file_selector)

    # Selecting a file both renders its contents and exposes it for download.
    file_selector.change(display_file_content, inputs=file_selector, outputs=file_viewer)
    file_selector.change(download_file, inputs=file_selector, outputs=download_button)

    token_display = gr.Textbox(label="Combined Tokens", value=str(update_token_count()), interactive=False)

    def update_token_display():
        # Textbox expects a string, so stringify the running count.
        return str(update_token_count())

    # NOTE(review): the count refreshes only when the user clicks this button —
    # no periodic timer is wired up, despite the original "every second" comment.
    token_refresh = gr.Button("Refresh Token Count")
    token_refresh.click(update_token_display, outputs=token_display)

app.launch()