cstr committed on
Commit 4ed2821
1 Parent(s): 55f037d

Update app.py

Files changed (1)
  1. app.py +16 -16
app.py CHANGED
@@ -89,7 +89,7 @@ def parse_range_specification(range_specification, file_length):
             line_indices.append(single_line)
     return line_indices
 
-def translate_text(text, translator, tokenizer):
+def translate_text(text, translator, tokenizer, target_language):
     """
     Translates the given text from English to German using CTranslate2 and the WMT21 model,
     with special handling for newlines and segmenting text longer than 500 characters.
@@ -131,7 +131,7 @@ def translate_text(text, translator, tokenizer):
         translated_segments = []
         for segment in segments:
             source = tokenizer.convert_ids_to_tokens(tokenizer.encode(segment))
-            target_prefix = [tokenizer.lang_code_to_token["de"]]
+            target_prefix = [tokenizer.lang_code_to_token[target_language]]
             results = translator.translate_batch([source], target_prefix=[target_prefix])
             target = results[0].hypotheses[0][1:]
             translated_segment = tokenizer.decode(tokenizer.convert_tokens_to_ids(target))
@@ -150,7 +150,7 @@ def translate_text(text, translator, tokenizer):
         logging.error(f"An error occurred during translation: {e}")
         return None
 
-def translate_item_ufb(item, raw_file_path, translator, tokenizer):
+def translate_item_ufb(item, raw_file_path, translator, tokenizer, target_language):
     try:
         # Translate the prompt directly since it's a string
         translated_prompt = translate_text(item['prompt'], translator, tokenizer)
@@ -158,12 +158,12 @@ def translate_item_ufb(item, raw_file_path, translator, tokenizer):
         # Translate the chosen and rejected contents
         translated_chosen = []
         for choice in item['chosen']:
-            translated_content = translate_text(choice['content'], translator, tokenizer)
+            translated_content = translate_text(choice['content'], translator, tokenizer, target_language)
             translated_chosen.append({'content': translated_content, 'role': choice['role']})
 
         translated_rejected = []
         for choice in item['rejected']:
-            translated_content = translate_text(choice['content'], translator, tokenizer)
+            translated_content = translate_text(choice['content'], translator, tokenizer, target_language)
             translated_rejected.append({'content': translated_content, 'role': choice['role']})
 
         # Write the raw response to a backup file
@@ -211,7 +211,7 @@ def validate_item_ufb(item):
 
 
 
-def translate_item_mix(item, raw_file_path, translator, tokenizer):
+def translate_item_mix(item, raw_file_path, translator, tokenizer, target_language):
     """
     Translates the relevant fields in the given item from English to German using CTranslate2 and the WMT21 model,
     and saves the raw response to a backup file.
@@ -221,12 +221,12 @@ def translate_item_mix(item, raw_file_path, translator, tokenizer):
         # Translate each part of the prompt separately and preserve the order
         translated_prompts = []
         for message in item['prompt']:
-            translated_content = translate_text(message['content'], translator, tokenizer)
+            translated_content = translate_text(message['content'], translator, tokenizer, target_language)
             translated_prompts.append({'content': translated_content, 'role': message['role']})
 
         # Translate the chosen and rejected contents
-        translated_chosen_content = translate_text(item['chosen'][0]['content'], translator, tokenizer)
-        translated_rejected_content = translate_text(item['rejected'][0]['content'], translator, tokenizer)
+        translated_chosen_content = translate_text(item['chosen'][0]['content'], translator, tokenizer, target_language)
+        translated_rejected_content = translate_text(item['rejected'][0]['content'], translator, tokenizer, target_language)
 
         # Write the raw response to a backup file
         with open(raw_file_path, 'a', encoding='utf-8') as raw_file:
@@ -276,13 +276,13 @@ def validate_item_mix(item):
 
     return True
 
-def translate_item_ufb_cached(item, raw_file_path, translator, tokenizer):
+def translate_item_ufb_cached(item, raw_file_path, translator, tokenizer, target_language):
     try:
         translated_texts = {}  # Cache to store translated texts
 
         # Translate the prompt if necessary (which is a user input and can appear again)
         if item['prompt'] not in translated_texts:
-            translated_prompt = translate_text(item['prompt'], translator, tokenizer)
+            translated_prompt = translate_text(item['prompt'], translator, tokenizer, target_language)
             translated_texts[item['prompt']] = translated_prompt
         else:
             translated_prompt = translated_texts[item['prompt']]
@@ -290,7 +290,7 @@ def translate_item_ufb_cached(item, raw_file_path, translator, tokenizer):
         # Helper function to handle content translation with caching
        def get_translated_content(content):
            if content not in translated_texts:
-                translated_texts[content] = translate_text(content, translator, tokenizer)
+                translated_texts[content] = translate_text(content, translator, tokenizer, target_language)
            return translated_texts[content]
 
         # Process translations for chosen and rejected sections
@@ -349,7 +349,7 @@ def validate_item_ufb_cached(item):
 
     return True
 
-def process_file(input_file_path, output_file_path, raw_file_path, line_indices, translator, tokenizer, model_type):
+def process_file(input_file_path, output_file_path, raw_file_path, line_indices, translator, tokenizer, model_type, target_language):
     try:
         # Assigning validation and translation functions based on model_type
         if model_type == "mix":
@@ -387,7 +387,7 @@ def process_file(input_file_path, output_file_path, raw_file_path, line_indices,
                     retry_count = 0
                     while translated_item is None and retry_count < 3:
                         print ("going to translate the item...")
-                        translated_item = translate_item(item, raw_file_path, translator, tokenizer)
+                        translated_item = translate_item(item, raw_file_path, translator, tokenizer, target_language)
                         retry_count += 1
                         if translated_item is None:
                             logging.warning(f"Translation failed for item. Retry attempt: {retry_count}")
@@ -485,7 +485,7 @@ def upload_output_to_huggingface(output_file_path, repo_name, token):
         print(f"Failed to upload {output_file_path} to Hugging Face: {e}")
         raise
 
-def translate_dataset(train_url, local_parquet_path, input_file_path, output_file_path, raw_file_path, range_specification, model_type, output_dir, output_repo_name, token, translator, tokenizer):
+def translate_dataset(train_url, local_parquet_path, input_file_path, output_file_path, raw_file_path, range_specification, model_type, output_dir, output_repo_name, token, translator, tokenizer, target_language):
     try:
         # Download the Parquet file
         download_parquet(train_url, local_parquet_path)
@@ -527,7 +527,7 @@ def translate_dataset(train_url, local_parquet_path, input_file_path, output_fil
 
     try:
         # Process the file with specified model type and line indices
-        process_file(input_file_path, output_file_path, raw_file_path, line_indices, translator, tokenizer, model_type)
+        process_file(input_file_path, output_file_path, raw_file_path, line_indices, translator, tokenizer, model_type, target_language)
     except Exception as e:
         logging.error(f"Failed to process the file {input_file_path}: {e}")
         return
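The substance of the commit: translate_text no longer hard-codes German as the output language. The decoder prefix is now looked up from the caller-supplied target_language via tokenizer.lang_code_to_token, and every wrapper (translate_item_ufb, translate_item_mix, translate_item_ufb_cached, process_file, translate_dataset) passes the new argument through. A minimal sketch of that prefix mechanism, assuming a CTranslate2-converted WMT21 model and the facebook/wmt21-dense-24-wide-en-x tokenizer; the model directory below is a placeholder, not taken from this repo:

# Minimal sketch of the prefix-based language selection used in translate_text.
# Assumptions (not from this commit): "wmt21_ct2/" is a hypothetical directory holding
# a CTranslate2-converted WMT21 model, and the tokenizer exposes lang_code_to_token.
import ctranslate2
import transformers

translator = ctranslate2.Translator("wmt21_ct2/", device="cpu")
tokenizer = transformers.AutoTokenizer.from_pretrained("facebook/wmt21-dense-24-wide-en-x")
tokenizer.src_lang = "en"

def translate_segment(segment, target_language):
    # Tokenize the English source segment into subword tokens.
    source = tokenizer.convert_ids_to_tokens(tokenizer.encode(segment))
    # Select the output language by seeding the decoder with its language token,
    # mirroring the changed line in translate_text.
    target_prefix = [tokenizer.lang_code_to_token[target_language]]
    results = translator.translate_batch([source], target_prefix=[target_prefix])
    # Drop the leading language token before decoding back to text.
    target = results[0].hypotheses[0][1:]
    return tokenizer.decode(tokenizer.convert_tokens_to_ids(target))

print(translate_segment("How are you today?", "de"))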
 
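With the parameter threaded all the way down, a caller can point the whole pipeline at any language code the tokenizer knows instead of only German. A hedged sketch of invoking the updated translate_dataset; every value is an illustrative placeholder, only the parameter list mirrors the new signature, and translator, tokenizer, and hf_token are assumed to be created beforehand (e.g. as in the sketch above):

# Hypothetical call; all paths, URLs, and names below are placeholders.
translate_dataset(
    train_url="https://huggingface.co/datasets/<org>/<dataset>/resolve/main/train.parquet",
    local_parquet_path="train.parquet",
    input_file_path="input.jsonl",
    output_file_path="output_de.jsonl",
    raw_file_path="raw_backup.jsonl",
    range_specification="1-100",
    model_type="mix",                # "mix" is one of the model types handled in process_file
    output_dir="output",
    output_repo_name="<user>/<translated-dataset>",
    token=hf_token,                  # assumed Hugging Face token variable
    translator=translator,
    tokenizer=tokenizer,
    target_language="de",            # the new argument introduced by this commit
)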