cstr committed
Commit 964e0c7 • Parent: 3233c26

Update app.py

Files changed (1)
  1. app.py +13 -12
app.py CHANGED
@@ -276,7 +276,7 @@ def validate_item_mix(item):
 
     return True
 
-def translate_item_orpo(item, raw_file_path, translator, tokenizer):
+def translate_item_ufb_cached(item, raw_file_path, translator, tokenizer):
     try:
         translated_texts = {} # Cache to store translated texts
 
@@ -321,7 +321,7 @@ def translate_item_orpo(item, raw_file_path, translator, tokenizer):
         logging.error(f"An error occurred during translation: {e}")
         return None
 
-def validate_item_orpo(item):
+def validate_item_ufb_cached(item):
     # Check basic required fields
     required_fields = ['source', 'prompt', 'chosen', 'rejected']
     for field in required_fields:
@@ -356,10 +356,10 @@ def process_file(input_file_path, output_file_path, raw_file_path, line_indices,
         print ("translating a mix-style model...")
         validate_item = validate_item_mix
         translate_item = translate_item_mix
-    elif model_type == "orpo":
-        print ("translating an orpo-style model...")
-        validate_item = validate_item_orpo
-        translate_item = translate_item_orpo # def translate_item_ufb(item, raw_file_path, translator, tokenizer):
+    elif model_type == "ufb_cached":
+        print ("translating an ufb_cached-style model...")
+        validate_item = validate_item_ufb_cached
+        translate_item = translate_item_ufb_cached # def translate_item_ufb(item, raw_file_path, translator, tokenizer):
     elif model_type == "ufb":
         print ("translating an ultrafeedback-style model...")
         validate_item = validate_item_ufb
@@ -458,10 +458,12 @@ def upload_output_to_huggingface(output_file_path, repo_name, token):
 
     # Check if the repository exists
     try:
+        print ("checking repo:", repo_name)
         api.repo_info(repo_id=repo_name, repo_type="dataset", token=token)
     except Exception as e:
         if "404" in str(e):
             # Create the repository if it doesn't exist
+            print ("creating it...")
             create_repo(repo_id=repo_name, repo_type="dataset", token=token)
             print(f"Created repository: {repo_name}")
         else:
@@ -470,6 +472,7 @@ def upload_output_to_huggingface(output_file_path, repo_name, token):
 
     # Upload the file to the repository
     try:
+        print ("starting dataset upload from:", output_file_path)
         upload_file(
             path_or_fileobj=output_file_path,
             path_in_repo=output_file_path,
@@ -619,14 +622,12 @@ datasets_desc = """## 📊 Dataset Types:
 - `prompt`: List of dictionaries with 'content' and 'role' fields (multi-turn conversation).
 - `chosen`: Single dictionary with 'content' and 'role' fields.
 - `rejected`: Single dictionary with 'content' and 'role' fields.
-- **orpo**:
+- **ufb_cached**:
 - `prompt`: String (user input).
 - `chosen`: List of dictionaries with 'content' and 'role' fields.
 - `rejected`: List of dictionaries with 'content' and 'role' fields.
 - **ufb**:
-- `prompt`: String (user input).
-- `chosen`: List of dictionaries with 'content' and 'role' fields.
-- `rejected`: List of dictionaries with 'content' and 'role' fields.
+- like ufb_cached, but we do not check for already translated strings
 ## 🛠️ Backend:
 The translation backend runs on the Hugging Face Hub API."""
 
@@ -646,8 +647,8 @@ with gr.Blocks(theme=theme) as demo:
     with gr.Row(equal_height=False):
         with gr.Column():
            dataset_url = gr.Textbox(label="Input Dataset URL", lines=2, placeholder = "https://huggingface.co/datasets/alvarobartt/dpo-mix-7k-simplified/resolve/main/data/train-00000-of-00001.parquet?download=true")
-            model_type = gr.Dropdown(choices=["mix", "orpo", "ufb"], label="Dataset Type")
-            output_dataset_name = gr.Textbox(label="Output Dataset Name", lines=1, placeholder = "dataset_test_translations")
+            model_type = gr.Dropdown(choices=["mix", "ufb_cached", "ufb"], label="Dataset Type")
+            output_dataset_name = gr.Textbox(label="Output Dataset Name", lines=1, placeholder = "cstr/translated_datasets")
             range_specification = gr.Textbox(label="Range Specification", lines=1, placeholder="e.g., 1-100")
 
         with gr.Column():
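For context, an item in the new `ufb_cached` format (as described in `datasets_desc` and checked by `validate_item_ufb_cached`) would look roughly like the sketch below. The values are made up for illustration, and whether the user turn is repeated inside `chosen`/`rejected` is an assumption, not something this diff specifies:

```python
# Illustrative ufb_cached item (values invented for this example):
# 'prompt' is a plain string, while 'chosen' and 'rejected' are
# lists of {'content', 'role'} dictionaries, matching the
# required_fields check in validate_item_ufb_cached.
item = {
    "source": "ultrafeedback",  # assumed value; only the field name is required
    "prompt": "What is the capital of France?",
    "chosen": [
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "The capital of France is Paris."},
    ],
    "rejected": [
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "France does not have a capital."},
    ],
}
```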
 
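Per the note added to `datasets_desc`, `ufb_cached` differs from `ufb` in that already translated strings are reused, which is what the `translated_texts = {}` cache in `translate_item_ufb_cached` is for. A minimal sketch of that caching idea, assuming a hypothetical `translator.translate(text)` call; the actual translation helper and its signature in app.py may differ:

```python
def translate_cached(text, translator, translated_texts):
    """Translate `text`, reusing a previous result if one exists.

    `translated_texts` maps source strings to their translations,
    mirroring the cache set up in translate_item_ufb_cached.
    """
    if text in translated_texts:  # string was already translated in this item
        return translated_texts[text]
    translated = translator.translate(text)  # assumed translator API
    translated_texts[text] = translated
    return translated
```

Calling this twice with the same string triggers only one actual translation.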
 
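The `print` calls added to `upload_output_to_huggingface` trace the steps the function already performs: look up the dataset repo, create it if the lookup fails with a 404, then upload the file. A condensed, standalone sketch of that flow with `huggingface_hub`; only the calls visible in the diff are taken verbatim, and the `repo_id`/`token` arguments to `upload_file` are assumed here:

```python
from huggingface_hub import HfApi, create_repo, upload_file

def upload_output_to_hub(output_file_path: str, repo_name: str, token: str) -> None:
    api = HfApi()
    # Check if the dataset repository exists
    print("checking repo:", repo_name)
    try:
        api.repo_info(repo_id=repo_name, repo_type="dataset", token=token)
    except Exception as e:
        if "404" in str(e):
            # Create the repository if it doesn't exist
            print("creating it...")
            create_repo(repo_id=repo_name, repo_type="dataset", token=token)
            print(f"Created repository: {repo_name}")
        else:
            raise  # unexpected error: re-raise instead of swallowing it
    # Upload the translated file into the dataset repository
    print("starting dataset upload from:", output_file_path)
    upload_file(
        path_or_fileobj=output_file_path,
        path_in_repo=output_file_path,
        repo_id=repo_name,
        repo_type="dataset",
        token=token,
    )
```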