Alexander Watson committed
Commit faef657
1 Parent(s): 63a19c9

Add HF data support to SDK code generator

Files changed (1)
  1. app.py +27 -5
app.py CHANGED
@@ -115,7 +115,12 @@ def main():
     )
 
     df = None
+    dataset_source_type = ""
+    huggingface_dataset = ""
+    huggingface_split = ""
+
     if data_source == "Upload a file":
+        dataset_source_type = "uploaded"
         uploaded_file = st.file_uploader(
             "Upload a CSV, JSON, or JSONL file",
             type=["csv", "json", "jsonl"],
@@ -132,16 +137,19 @@ def main():
             st.success(f"File uploaded successfully: {uploaded_file.name}")
 
     elif data_source == "Select a dataset from Hugging Face":
+        dataset_source_type = "huggingface"
         huggingface_dataset = st.text_input(
             "Hugging Face Dataset Repository",
             help="Enter the name of the Hugging Face dataset repository (e.g., 'squad')",
         )
+        st.session_state.huggingface_dataset = huggingface_dataset
 
         huggingface_split = st.selectbox(
             "Dataset Split",
             options=["train", "validation", "test"],
             help="Select the dataset split to use",
         )
+        st.session_state.huggingface_split = huggingface_split
 
         if st.button("Load Hugging Face Dataset"):
             if huggingface_dataset:
@@ -160,6 +168,7 @@ def main():
                 st.warning("Please provide a Hugging Face dataset repository name.")
 
     elif data_source == "Use a sample dataset":
+        dataset_source_type = "sample"
         st.write("Try a sample dataset to get started quickly.")
         if st.button("Try Sample Dataset"):
             try:
@@ -422,14 +431,27 @@ def main():
 import logging
 import pandas as pd
 from navigator_helpers import InstructionResponseConfig, TrainingDataSynthesizer
+from datasets import load_dataset
 
 # Configure the logger
 logging.basicConfig(level=logging.INFO, format="%(message)s")
 
-DATASET = "YOUR_DATASET"
 API_KEY = "YOUR_API_KEY"
-
-df = pd.read_csv(DATASET)
+DATASET_SOURCE = "{dataset_source_type}"
+HUGGINGFACE_DATASET = "{huggingface_dataset}"
+HUGGINGFACE_SPLIT = "{huggingface_split}"
+SAMPLE_DATASET_URL = "{SAMPLE_DATASET_URL}"
+
+# Load dataset
+if DATASET_SOURCE == 'uploaded':
+    df = pd.read_csv("YOUR_UPLOADED_FILE_PATH")  # Replace with the actual file path
+elif DATASET_SOURCE == 'huggingface':
+    dataset = load_dataset(HUGGINGFACE_DATASET, split=HUGGINGFACE_SPLIT)
+    df = dataset.to_pandas()
+elif DATASET_SOURCE == 'sample':
+    df = pd.read_csv(SAMPLE_DATASET_URL)
+else:
+    raise ValueError("Invalid DATASET_SOURCE specified")
 
 # Create the instruction response configuration
 config = InstructionResponseConfig(
@@ -583,7 +605,7 @@ new_df = synthesizer.generate()
 
         time.sleep(0.1)
         logger.removeHandler(handler)
-        st.success("Data synthetic completed!")
+        st.success("Data synthesis completed!")
         st.stop()
 
     if stop_button:
@@ -638,7 +660,7 @@ new_df = synthesizer.generate()
             zip_file.write(log_file_path, "complete_logs.jsonl")
             if synthesized_data_jsonl:
                 zip_file.write(
-                    synthesized_data_file_path, "synthesized_data.jsonl"
+                    synthesized_data_file_path, "synthetic_data.jsonl"
                 )
             zip_file.write(sdk_file_path, "data_synthesis_code.py")
 
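
For reference, the Hugging Face branch of the generated snippet reduces to the standard datasets API. A minimal sketch, assuming the public "squad" repository from the UI hint and its "train" split (example values only, not part of this commit):

# Sketch of the Hugging Face loading path used by the generated SDK code.
# "squad" and "train" are illustrative; substitute any repository and split.
from datasets import load_dataset

dataset = load_dataset("squad", split="train")  # datasets.Dataset object
df = dataset.to_pandas()                        # pandas DataFrame for the synthesizer
print(df.head())

Converting to a pandas DataFrame keeps the downstream TrainingDataSynthesizer path identical across the uploaded, Hugging Face, and sample data sources.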