Spaces:

Genooo12
/

ColiFormer-ui

Sleeping

App Files Files Community

ColiFormer-ui / create_model_datasets.py

Genooo12

Deploy Streamlit UI

404d784 verified 16 days ago

raw

history blame contribute delete

1.8 kB

	import pandas as pd
	import json
	import os
	from CodonTransformer.CodonData import prepare_training_data

	def main() -> None:
	    """Partition the processed gene table into fine-tuning and test sets.

	    Reads ``data/ecoli_processed_genes.csv`` and produces two files:

	    * ``data/finetune_set.json`` -- rows flagged ``is_high_cai``,
	      de-duplicated on their DNA sequence and passed to
	      ``prepare_training_data`` (imported from CodonTransformer), which is
	      given the output path and ``shuffle=True``.
	    * ``data/test_set.json`` -- a reproducible random sample (seed 42) of up
	      to 100 rows that are *not* flagged ``is_high_cai``, stored as a list
	      of ``{"codons": ..., "organism": 51}`` records.

	    If the ``data`` directory or the processed CSV is missing, an error is
	    printed and the function returns without writing anything.

	    Returns:
	        None. Results are written to disk and progress is printed.
	    """
	    if not os.path.exists('data'):
	        print("Error: 'data' directory not found. Please run prepare_ecoli_data.py first.")
	        return

	    processed_data_path = 'data/ecoli_processed_genes.csv'
	    if not os.path.exists(processed_data_path):
	        print(f"Error: Processed data file not found at {processed_data_path}")
	        return

	    df_processed = pd.read_csv(processed_data_path)
	    high_cai_flag = df_processed['is_high_cai']

	    # --- Fine-tuning set: high-CAI genes only, one row per DNA sequence ---
	    # The chained calls each return a new frame, so the loaded table is
	    # never mutated. Columns are renamed to 'dna'/'protein' before the
	    # frame is handed to prepare_training_data.
	    df_finetune = (
	        df_processed[high_cai_flag.eq(True)]
	        .drop_duplicates(subset=['dna_sequence'])
	        .rename(columns={'dna_sequence': 'dna', 'protein_sequence': 'protein'})
	        .assign(organism="Escherichia coli general")
	    )

	    finetune_output_path = 'data/finetune_set.json'
	    prepare_training_data(df_finetune, finetune_output_path, shuffle=True)
	    print(f"Fine-tuning set saved to {finetune_output_path} with {len(df_finetune)} records.")

	    # --- Test set: random sample of the remaining (non-high-CAI) genes ---
	    df_test_pool = df_processed[high_cai_flag.eq(False)]

	    # DataFrame.sample raises ValueError when n exceeds the population, so
	    # cap the request at the pool size instead of crashing on small inputs.
	    requested_test_size = 100
	    test_size = min(requested_test_size, len(df_test_pool))
	    if test_size < requested_test_size:
	        print(
	            f"Warning: only {len(df_test_pool)} non-high-CAI records available; "
	            f"test set will contain {test_size} records instead of {requested_test_size}."
	        )

	    df_test = (
	        df_test_pool
	        .sample(n=test_size, random_state=42)  # fixed seed for reproducibility
	        .assign(organism=51)  # E. coli general
	        .rename(columns={'dna_sequence': 'codons'})
	    )
	    test_records = df_test[['codons', 'organism']].to_dict(orient='records')

	    test_output_path = 'data/test_set.json'
	    with open(test_output_path, 'w', encoding='utf-8') as f:
	        json.dump(test_records, f, indent=4)
	    print(f"Test set saved to {test_output_path} with {len(df_test)} records.")

	# Script entry point: build the datasets only when this file is executed
	# directly, so importing the module has no side effects.
	if __name__ == "__main__":
	    main()