AmelieSchreiber committed on
Commit
7a4f9a7
1 Parent(s): 90c60d5

Update data_preprocessing_notebook_v1.ipynb

Browse files
data_preprocessing_notebook_v1.ipynb CHANGED
@@ -340,9 +340,12 @@
340
  "num_test_samples = len(test_sequences_by_family)\n",
341
  "num_train_samples = len(train_sequences_by_family)\n",
342
  "\n",
343
- "# Generate random indices representing 50% of each dataset\n",
344
- "random_test_indices = random.sample(range(num_test_samples), num_test_samples // 26.66)\n",
345
- "random_train_indices = random.sample(range(num_train_samples), num_train_samples // 26.66)\n",
 
 
 
346
  "\n",
347
  "# Create smaller datasets using the random indices\n",
348
  "test_sequences_small = [test_sequences_by_family[i] for i in random_test_indices]\n",
 
340
  "num_test_samples = len(test_sequences_by_family)\n",
341
  "num_train_samples = len(train_sequences_by_family)\n",
342
  "\n",
343
+ "# Define the percentage of data you want to keep\n",
344
+ "percentage_to_keep = 3.64 # for keeping 3.64% of the data\n",
345
+ "\n",
346
+ "# Generate random indices representing a percentage of each dataset\n",
347
+ "random_test_indices = random.sample(range(num_test_samples), int(num_test_samples * (percentage_to_keep / 100)))\n",
348
+ "random_train_indices = random.sample(range(num_train_samples), int(num_train_samples * (percentage_to_keep / 100)))\n",
349
  "\n",
350
  "# Create smaller datasets using the random indices\n",
351
  "test_sequences_small = [test_sequences_by_family[i] for i in random_test_indices]\n",