Spaces:

orionai
/

training-data-collection

Running

App Files Files Community

training-data-collection / app.py

oscarwang2

Update app.py

ad19bf8 verified 3 days ago

raw

history blame contribute delete

No virus

6 kB

	import pandas as pd
	import os
	import gradio as gr
	import threading
	import time
	from groq import Groq

	# --- Storage and pacing configuration --------------------------------------
	DATA_DIRECTORY = 'data'      # folder that receives the generated CSV files
	MAX_SIZE = 1.1 * 1024 ** 3   # per-file size threshold in bytes (1.1 * 2**30)
	UPDATE_INTERVAL = 1          # seconds to pause after each saved record

	# Groq API client, constructed with no explicit arguments.
	# NOTE(review): credentials are presumably picked up from the environment -- confirm.
	client = Groq()

	# Make sure the output folder is present before any file is written.
	os.makedirs(DATA_DIRECTORY, exist_ok=True)

	# --- Mutable module state shared by the generator loop and the UI helpers ---
	file_index = 1                                                         # number of the CSV currently written
	current_file = os.path.join(DATA_DIRECTORY, f'data{file_index}.csv')  # path of that CSV
	file_paths = [current_file]                                            # every CSV path used so far
	combined_tokens = 0                                                    # running prompt + response word count

	def get_file_size(filename):
	    """Return the size of *filename* in bytes.

	    Anything that is not an existing regular file (missing path, directory)
	    is reported as 0 instead of raising.
	    """
	    if not os.path.isfile(filename):
	        return 0
	    return os.path.getsize(filename)

	# Instruction sent to the model to obtain one synthetic user prompt.
	_PROMPT_REQUEST = "give me a single prompt to prompt an ai model, simulating what users could want from you. ensure that it is diverse and high quality. for each, choose a random writing style (though it has to be a common one), random length and random clarity of the prompt. ensure that it is a single prompt, and just the prompt itself, nothing else. eg, don't close the prompt in quotation marks or say Here is a single prompt that meets your requirements or anything similar to that"


	def _stream_completion(user_message, max_tokens):
	    """Send one user message to the chat model and collect the streamed reply.

	    Args:
	        user_message: Text placed in the single ``user`` message.
	        max_tokens: Generation limit forwarded to the API.

	    Returns:
	        A ``(text, word_count)`` tuple. ``word_count`` is the sum of
	        whitespace-separated pieces in each streamed chunk; it is the
	        approximation this app reports as "tokens", not a tokenizer count.
	    """
	    stream = client.chat.completions.create(
	        model="gemma2-9b-it",
	        messages=[{"role": "user", "content": user_message}],
	        temperature=1,
	        max_tokens=max_tokens,
	        top_p=1,
	        stream=True,
	        stop=None,
	    )

	    pieces = []
	    word_count = 0
	    for chunk in stream:
	        content = chunk.choices[0].delta.content
	        if content:  # some chunks carry no text
	            pieces.append(content)
	            word_count += len(content.split())
	    # Join once at the end instead of growing a string chunk by chunk.
	    return "".join(pieces), word_count


	def generate_and_save_data():
	    """Endlessly generate prompt/response pairs and append them to CSV files.

	    Each iteration asks the model for a prompt, feeds that prompt back to the
	    model for a response, adds both word counts to ``combined_tokens`` and
	    writes the pair as one CSV row. When the current file has reached
	    ``MAX_SIZE`` bytes, a new numbered file is started and recorded in
	    ``file_paths``. Any exception is reported and the loop retries after five
	    seconds, so this function never returns.
	    """
	    global file_index, current_file, file_paths, combined_tokens

	    # Create the initial file (header only) if it doesn't exist yet.
	    if not os.path.isfile(current_file):
	        pd.DataFrame(columns=["prompt", "response"]).to_csv(current_file, index=False)

	    while True:
	        try:
	            # First call produces a prompt, second call answers it.
	            prompt, prompt_tokens = _stream_completion(_PROMPT_REQUEST, max_tokens=1024)
	            response, response_tokens = _stream_completion(prompt, max_tokens=5000)

	            combined_tokens += (prompt_tokens + response_tokens)

	            print("Generated prompt:", prompt)
	            print("Response to the generated prompt:", response)

	            data = pd.DataFrame({"prompt": [prompt], "response": [response]})

	            # Write by path with explicit UTF-8 so pandas manages newline
	            # handling itself and non-ASCII model output cannot fail on a
	            # platform whose default text encoding is not UTF-8.
	            if get_file_size(current_file) >= MAX_SIZE:
	                # Current file is full: roll over to the next numbered file.
	                file_index += 1
	                current_file = os.path.join(DATA_DIRECTORY, f'data{file_index}.csv')
	                file_paths.append(current_file)
	                data.to_csv(current_file, mode="w", header=True, index=False, encoding="utf-8")
	            else:
	                data.to_csv(current_file, mode="a", header=False, index=False, encoding="utf-8")

	            # Wait for the next update interval.
	            time.sleep(UPDATE_INTERVAL)

	        except Exception as e:
	            # Deliberate catch-all at the loop boundary: keep the worker alive.
	            print(f"An error occurred: {e}. Retrying in 5 seconds...")
	            time.sleep(5)

	def get_available_files():
	    """Return the tracked data-file paths that currently exist as regular files."""
	    return list(filter(os.path.isfile, file_paths))

	def update_file_list():
	    """Return a Gradio update whose choices are the data files present on disk."""
	    existing_files = get_available_files()
	    return gr.update(choices=existing_files)

	def update_token_count():
	    """Return the current value of the module-level ``combined_tokens`` counter."""
	    total = combined_tokens
	    return total

	def display_file_content(selected_file):
	    """Load the selected CSV file into a DataFrame.

	    Returns an empty DataFrame when nothing is selected (``None`` or an
	    empty string); otherwise the parsed content of ``selected_file``.
	    """
	    if not selected_file:
	        return pd.DataFrame()
	    return pd.read_csv(selected_file)

	# Run the generation loop in the background; as a daemon thread it does not
	# keep the process alive once the main thread exits.
	thread = threading.Thread(target=generate_and_save_data, daemon=True)
	thread.start()

	# Build the Gradio UI: a file picker with preview and download, plus a
	# manually refreshed token counter.
	with gr.Blocks() as app:

	    def download_file(selected_file):
	        """Pass the selected path straight through to the download widget."""
	        return selected_file

	    def update_token_display():
	        """Return the combined token count formatted as text for the textbox."""
	        return str(update_token_count())

	    gr.Markdown("## AI Prompt and Response Generator")
	    gr.Markdown("This app continuously generates AI prompts and responses, and writes them to CSV files.")

	    # File browsing widgets; the choices are a snapshot taken when the UI is built.
	    file_selector = gr.Dropdown(
	        label="Select a data file to view and download",
	        choices=get_available_files(),
	    )
	    file_viewer = gr.DataFrame(label="CSV File Content")
	    download_button = gr.File(label="Download Selected File")

	    refresh_button = gr.Button("Refresh File List")
	    refresh_button.click(update_file_list, outputs=file_selector)

	    # Selecting a file both previews it and offers it for download.
	    file_selector.change(display_file_content, inputs=file_selector, outputs=file_viewer)
	    file_selector.change(download_file, inputs=file_selector, outputs=download_button)

	    token_display = gr.Textbox(
	        label="Combined Tokens",
	        value=update_token_display(),
	        interactive=False,
	    )

	    # The count is refreshed only when this button is clicked.
	    token_refresh = gr.Button("Refresh Token Count")
	    token_refresh.click(update_token_display, outputs=token_display)

	app.launch()