Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	Update app.py
Browse files
    	
        app.py
    CHANGED
    
    | @@ -89,7 +89,7 @@ def parse_range_specification(range_specification, file_length): | |
| 89 | 
             
                        line_indices.append(single_line)
         | 
| 90 | 
             
                return line_indices
         | 
| 91 |  | 
| 92 | 
            -
            def translate_text(text, translator, tokenizer):
         | 
| 93 | 
             
                """
         | 
| 94 | 
             
                Translates the given text from English to German using CTranslate2 and the WMT21 model,
         | 
| 95 | 
             
                with special handling for newlines and segmenting text longer than 500 characters.
         | 
| @@ -131,7 +131,7 @@ def translate_text(text, translator, tokenizer): | |
| 131 | 
             
                    translated_segments = []
         | 
| 132 | 
             
                    for segment in segments:
         | 
| 133 | 
             
                        source = tokenizer.convert_ids_to_tokens(tokenizer.encode(segment))
         | 
| 134 | 
            -
                        target_prefix = [tokenizer.lang_code_to_token["de"]]
         | 
| 135 | 
             
                        results = translator.translate_batch([source], target_prefix=[target_prefix])
         | 
| 136 | 
             
                        target = results[0].hypotheses[0][1:]
         | 
| 137 | 
             
                        translated_segment = tokenizer.decode(tokenizer.convert_tokens_to_ids(target))
         | 
| @@ -150,7 +150,7 @@ def translate_text(text, translator, tokenizer): | |
| 150 | 
             
                    logging.error(f"An error occurred during translation: {e}")
         | 
| 151 | 
             
                    return None
         | 
| 152 |  | 
| 153 | 
            -
            def translate_item_ufb(item, raw_file_path, translator, tokenizer):
         | 
| 154 | 
             
                try:
         | 
| 155 | 
             
                    # Translate the prompt directly since it's a string
         | 
| 156 | 
             
                    translated_prompt = translate_text(item['prompt'], translator, tokenizer)
         | 
| @@ -158,12 +158,12 @@ def translate_item_ufb(item, raw_file_path, translator, tokenizer): | |
| 158 | 
             
                    # Translate the chosen and rejected contents
         | 
| 159 | 
             
                    translated_chosen = []
         | 
| 160 | 
             
                    for choice in item['chosen']:
         | 
| 161 | 
            -
                        translated_content = translate_text(choice['content'], translator, tokenizer)
         | 
| 162 | 
             
                        translated_chosen.append({'content': translated_content, 'role': choice['role']})
         | 
| 163 |  | 
| 164 | 
             
                    translated_rejected = []
         | 
| 165 | 
             
                    for choice in item['rejected']:
         | 
| 166 | 
            -
                        translated_content = translate_text(choice['content'], translator, tokenizer)
         | 
| 167 | 
             
                        translated_rejected.append({'content': translated_content, 'role': choice['role']})
         | 
| 168 |  | 
| 169 | 
             
                    # Write the raw response to a backup file
         | 
| @@ -211,7 +211,7 @@ def validate_item_ufb(item): | |
| 211 |  | 
| 212 |  | 
| 213 |  | 
| 214 | 
            -
            def translate_item_mix(item, raw_file_path, translator, tokenizer):
         | 
| 215 | 
             
                """
         | 
| 216 | 
             
                Translates the relevant fields in the given item from English to German using CTranslate2 and the WMT21 model,
         | 
| 217 | 
             
                and saves the raw response to a backup file.
         | 
| @@ -221,12 +221,12 @@ def translate_item_mix(item, raw_file_path, translator, tokenizer): | |
| 221 | 
             
                    # Translate each part of the prompt separately and preserve the order
         | 
| 222 | 
             
                    translated_prompts = []
         | 
| 223 | 
             
                    for message in item['prompt']:
         | 
| 224 | 
            -
                        translated_content = translate_text(message['content'], translator, tokenizer)
         | 
| 225 | 
             
                        translated_prompts.append({'content': translated_content, 'role': message['role']})
         | 
| 226 |  | 
| 227 | 
             
                    # Translate the chosen and rejected contents
         | 
| 228 | 
            -
                    translated_chosen_content = translate_text(item['chosen'][0]['content'], translator, tokenizer)
         | 
| 229 | 
            -
                    translated_rejected_content = translate_text(item['rejected'][0]['content'], translator, tokenizer)
         | 
| 230 |  | 
| 231 | 
             
                    # Write the raw response to a backup file
         | 
| 232 | 
             
                    with open(raw_file_path, 'a', encoding='utf-8') as raw_file:
         | 
| @@ -276,13 +276,13 @@ def validate_item_mix(item): | |
| 276 |  | 
| 277 | 
             
                return True
         | 
| 278 |  | 
| 279 | 
            -
            def translate_item_ufb_cached(item, raw_file_path, translator, tokenizer):
         | 
| 280 | 
             
                try:
         | 
| 281 | 
             
                    translated_texts = {}  # Cache to store translated texts
         | 
| 282 |  | 
| 283 | 
             
                    # Translate the prompt if necessary (which is a user input and can appear again)
         | 
| 284 | 
             
                    if item['prompt'] not in translated_texts:
         | 
| 285 | 
            -
                        translated_prompt = translate_text(item['prompt'], translator, tokenizer)
         | 
| 286 | 
             
                        translated_texts[item['prompt']] = translated_prompt
         | 
| 287 | 
             
                    else:
         | 
| 288 | 
             
                        translated_prompt = translated_texts[item['prompt']]
         | 
| @@ -290,7 +290,7 @@ def translate_item_ufb_cached(item, raw_file_path, translator, tokenizer): | |
| 290 | 
             
                    # Helper function to handle content translation with caching
         | 
| 291 | 
             
                    def get_translated_content(content):
         | 
| 292 | 
             
                        if content not in translated_texts:
         | 
| 293 | 
            -
                            translated_texts[content] = translate_text(content, translator, tokenizer)
         | 
| 294 | 
             
                        return translated_texts[content]
         | 
| 295 |  | 
| 296 | 
             
                    # Process translations for chosen and rejected sections
         | 
| @@ -349,7 +349,7 @@ def validate_item_ufb_cached(item): | |
| 349 |  | 
| 350 | 
             
                return True
         | 
| 351 |  | 
| 352 | 
            -
            def process_file(input_file_path, output_file_path, raw_file_path, line_indices, translator, tokenizer, model_type):
         | 
| 353 | 
             
                try:
         | 
| 354 | 
             
                    # Assigning validation and translation functions based on model_type
         | 
| 355 | 
             
                    if model_type == "mix":
         | 
| @@ -387,7 +387,7 @@ def process_file(input_file_path, output_file_path, raw_file_path, line_indices, | |
| 387 | 
             
                        retry_count = 0
         | 
| 388 | 
             
                        while translated_item is None and retry_count < 3:
         | 
| 389 | 
             
                            print ("going to translate the item...")
         | 
| 390 | 
            -
                            translated_item = translate_item(item, raw_file_path, translator, tokenizer)
         | 
| 391 | 
             
                            retry_count += 1
         | 
| 392 | 
             
                            if translated_item is None:
         | 
| 393 | 
             
                                logging.warning(f"Translation failed for item. Retry attempt: {retry_count}")
         | 
| @@ -485,7 +485,7 @@ def upload_output_to_huggingface(output_file_path, repo_name, token): | |
| 485 | 
             
                    print(f"Failed to upload {output_file_path} to Hugging Face: {e}")
         | 
| 486 | 
             
                    raise
         | 
| 487 |  | 
| 488 | 
            -
            def translate_dataset(train_url, local_parquet_path, input_file_path, output_file_path, raw_file_path, range_specification, model_type, output_dir, output_repo_name, token, translator, tokenizer):
         | 
| 489 | 
             
                try:
         | 
| 490 | 
             
                    # Download the Parquet file
         | 
| 491 | 
             
                    download_parquet(train_url, local_parquet_path)
         | 
| @@ -527,7 +527,7 @@ def translate_dataset(train_url, local_parquet_path, input_file_path, output_fil | |
| 527 |  | 
| 528 | 
             
                try:
         | 
| 529 | 
             
                    # Process the file with specified model type and line indices
         | 
| 530 | 
            -
                    process_file(input_file_path, output_file_path, raw_file_path, line_indices, translator, tokenizer, model_type)
         | 
| 531 | 
             
                except Exception as e:
         | 
| 532 | 
             
                    logging.error(f"Failed to process the file {input_file_path}: {e}")
         | 
| 533 | 
             
                    return
         | 
|  | |
| 89 | 
             
                        line_indices.append(single_line)
         | 
| 90 | 
             
                return line_indices
         | 
| 91 |  | 
| 92 | 
            +
            def translate_text(text, translator, tokenizer, target_language):
         | 
| 93 | 
             
                """
         | 
| 94 | 
             
                Translates the given text from English to German using CTranslate2 and the WMT21 model,
         | 
| 95 | 
             
                with special handling for newlines and segmenting text longer than 500 characters.
         | 
|  | |
| 131 | 
             
                    translated_segments = []
         | 
| 132 | 
             
                    for segment in segments:
         | 
| 133 | 
             
                        source = tokenizer.convert_ids_to_tokens(tokenizer.encode(segment))
         | 
| 134 | 
            +
                        target_prefix = [tokenizer.lang_code_to_token[target_language]]
         | 
| 135 | 
             
                        results = translator.translate_batch([source], target_prefix=[target_prefix])
         | 
| 136 | 
             
                        target = results[0].hypotheses[0][1:]
         | 
| 137 | 
             
                        translated_segment = tokenizer.decode(tokenizer.convert_tokens_to_ids(target))
         | 
|  | |
| 150 | 
             
                    logging.error(f"An error occurred during translation: {e}")
         | 
| 151 | 
             
                    return None
         | 
| 152 |  | 
| 153 | 
            +
            def translate_item_ufb(item, raw_file_path, translator, tokenizer, target_language):
         | 
| 154 | 
             
                try:
         | 
| 155 | 
             
                    # Translate the prompt directly since it's a string
         | 
| 156 | 
             
                    translated_prompt = translate_text(item['prompt'], translator, tokenizer)
         | 
|  | |
| 158 | 
             
                    # Translate the chosen and rejected contents
         | 
| 159 | 
             
                    translated_chosen = []
         | 
| 160 | 
             
                    for choice in item['chosen']:
         | 
| 161 | 
            +
                        translated_content = translate_text(choice['content'], translator, tokenizer, target_language)
         | 
| 162 | 
             
                        translated_chosen.append({'content': translated_content, 'role': choice['role']})
         | 
| 163 |  | 
| 164 | 
             
                    translated_rejected = []
         | 
| 165 | 
             
                    for choice in item['rejected']:
         | 
| 166 | 
            +
                        translated_content = translate_text(choice['content'], translator, tokenizer, target_language)
         | 
| 167 | 
             
                        translated_rejected.append({'content': translated_content, 'role': choice['role']})
         | 
| 168 |  | 
| 169 | 
             
                    # Write the raw response to a backup file
         | 
|  | |
| 211 |  | 
| 212 |  | 
| 213 |  | 
| 214 | 
            +
            def translate_item_mix(item, raw_file_path, translator, tokenizer, target_language):
         | 
| 215 | 
             
                """
         | 
| 216 | 
             
                Translates the relevant fields in the given item from English to German using CTranslate2 and the WMT21 model,
         | 
| 217 | 
             
                and saves the raw response to a backup file.
         | 
|  | |
| 221 | 
             
                    # Translate each part of the prompt separately and preserve the order
         | 
| 222 | 
             
                    translated_prompts = []
         | 
| 223 | 
             
                    for message in item['prompt']:
         | 
| 224 | 
            +
                        translated_content = translate_text(message['content'], translator, tokenizer, target_language)
         | 
| 225 | 
             
                        translated_prompts.append({'content': translated_content, 'role': message['role']})
         | 
| 226 |  | 
| 227 | 
             
                    # Translate the chosen and rejected contents
         | 
| 228 | 
            +
                    translated_chosen_content = translate_text(item['chosen'][0]['content'], translator, tokenizer, target_language)
         | 
| 229 | 
            +
                    translated_rejected_content = translate_text(item['rejected'][0]['content'], translator, tokenizer, target_language)
         | 
| 230 |  | 
| 231 | 
             
                    # Write the raw response to a backup file
         | 
| 232 | 
             
                    with open(raw_file_path, 'a', encoding='utf-8') as raw_file:
         | 
|  | |
| 276 |  | 
| 277 | 
             
                return True
         | 
| 278 |  | 
| 279 | 
            +
            def translate_item_ufb_cached(item, raw_file_path, translator, tokenizer, target_language):
         | 
| 280 | 
             
                try:
         | 
| 281 | 
             
                    translated_texts = {}  # Cache to store translated texts
         | 
| 282 |  | 
| 283 | 
             
                    # Translate the prompt if necessary (which is a user input and can appear again)
         | 
| 284 | 
             
                    if item['prompt'] not in translated_texts:
         | 
| 285 | 
            +
                        translated_prompt = translate_text(item['prompt'], translator, tokenizer, target_language)
         | 
| 286 | 
             
                        translated_texts[item['prompt']] = translated_prompt
         | 
| 287 | 
             
                    else:
         | 
| 288 | 
             
                        translated_prompt = translated_texts[item['prompt']]
         | 
|  | |
| 290 | 
             
                    # Helper function to handle content translation with caching
         | 
| 291 | 
             
                    def get_translated_content(content):
         | 
| 292 | 
             
                        if content not in translated_texts:
         | 
| 293 | 
            +
                            translated_texts[content] = translate_text(content, translator, tokenizer, target_language)
         | 
| 294 | 
             
                        return translated_texts[content]
         | 
| 295 |  | 
| 296 | 
             
                    # Process translations for chosen and rejected sections
         | 
|  | |
| 349 |  | 
| 350 | 
             
                return True
         | 
| 351 |  | 
| 352 | 
            +
            def process_file(input_file_path, output_file_path, raw_file_path, line_indices, translator, tokenizer, model_type, target_language):
         | 
| 353 | 
             
                try:
         | 
| 354 | 
             
                    # Assigning validation and translation functions based on model_type
         | 
| 355 | 
             
                    if model_type == "mix":
         | 
|  | |
| 387 | 
             
                        retry_count = 0
         | 
| 388 | 
             
                        while translated_item is None and retry_count < 3:
         | 
| 389 | 
             
                            print ("going to translate the item...")
         | 
| 390 | 
            +
                            translated_item = translate_item(item, raw_file_path, translator, tokenizer, target_language)
         | 
| 391 | 
             
                            retry_count += 1
         | 
| 392 | 
             
                            if translated_item is None:
         | 
| 393 | 
             
                                logging.warning(f"Translation failed for item. Retry attempt: {retry_count}")
         | 
|  | |
| 485 | 
             
                    print(f"Failed to upload {output_file_path} to Hugging Face: {e}")
         | 
| 486 | 
             
                    raise
         | 
| 487 |  | 
| 488 | 
            +
            def translate_dataset(train_url, local_parquet_path, input_file_path, output_file_path, raw_file_path, range_specification, model_type, output_dir, output_repo_name, token, translator, tokenizer, target_language):
         | 
| 489 | 
             
                try:
         | 
| 490 | 
             
                    # Download the Parquet file
         | 
| 491 | 
             
                    download_parquet(train_url, local_parquet_path)
         | 
|  | |
| 527 |  | 
| 528 | 
             
                try:
         | 
| 529 | 
             
                    # Process the file with specified model type and line indices
         | 
| 530 | 
            +
                    process_file(input_file_path, output_file_path, raw_file_path, line_indices, translator, tokenizer, model_type, target_language)
         | 
| 531 | 
             
                except Exception as e:
         | 
| 532 | 
             
                    logging.error(f"Failed to process the file {input_file_path}: {e}")
         | 
| 533 | 
             
                    return
         |