RobCaamano committed on
Commit
c80f760
1 Parent(s): 7144462

Update opus.py

Browse files
Files changed (1) hide show
  1. opus.py +65 -63
opus.py CHANGED
@@ -1,63 +1,65 @@
1
- from transformers import MarianMTModel, MarianTokenizer
2
- from tqdm import tqdm
3
- import os
4
- import re
5
- import argparse
6
-
7
- # Load Model and Tokenizer
8
# Checkpoint identifier handed to both from_pretrained calls below.
# NOTE(review): presumably the English-to-Spanish OPUS-MT checkpoint, going by
# the CLI description at the bottom of the file - confirm.
model_name = "Helsinki-NLP/opus-mt-en-es"
# Both objects are created once at module level; translate_text reads them as
# module globals for every line it translates.
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
11
-
12
- # Extract & separate timestamp and text
13
def extract_timestamp_and_text(line):
    """Split a transcript line into its timestamp and its text.

    A timestamped line looks like ``[12.34-15.67] some text``: a bracketed
    ``start-end`` pair of decimal numbers followed by the text.

    Args:
        line: A single line of the transcript.

    Returns:
        A ``(timestamp, text)`` tuple. ``timestamp`` is the ``start-end``
        string without the brackets, or ``''`` when the line does not start
        with a timestamp, in which case ``text`` is the whole line unchanged.
    """
    # Zero-or-more whitespace after the bracket, so that a bare "[0.00-1.50]"
    # with nothing after it is still recognised as a timestamp instead of
    # being handed to the translator as text.
    match = re.match(r'\[(\d+\.\d+-\d+\.\d+)\]\s*(.*)', line)
    if match:
        return match.group(1), match.group(2)
    return '', line
18
-
19
- # Translate text
20
def translate_text(text):
    """Translate a transcript line by line, keeping timestamps and blank lines.

    Each non-blank line is split with ``extract_timestamp_and_text``; only the
    text part is run through the module-level ``tokenizer`` and ``model``.

    Args:
        text: The full transcript as one newline-separated string.

    Returns:
        The translated transcript, with exactly as many lines as the input.
        Blank lines stay empty, timestamped lines are written as
        ``[timestamp] translation`` (or just ``[timestamp]`` when there is no
        text), and lines without a timestamp are written as the bare
        translation.
    """
    translated_lines = []

    for line in tqdm(text.split('\n'), desc="Translating lines", leave=False):
        # Blank lines are passed through so the line layout is preserved.
        if not line.strip():
            translated_lines.append('')
            continue

        timestamp, line_text = extract_timestamp_and_text(line)
        parts = []

        # Emit the bracketed prefix only when the line really had a timestamp;
        # otherwise untimestamped lines would come out prefixed with "[]".
        if timestamp:
            parts.append(f'[{timestamp}]')

        if line_text.strip():
            # truncation=True cuts over-long lines down to the tokenizer's
            # maximum input length instead of raising.
            model_inputs = tokenizer(line_text, return_tensors="pt", truncation=True, padding="longest")
            translated = model.generate(**model_inputs)
            parts.append(tokenizer.batch_decode(translated, skip_special_tokens=True)[0])

        translated_lines.append(' '.join(parts))

    return '\n'.join(translated_lines)
42
-
43
- # Main function to translate a file
44
def translate_file(src_file_path, dst_file_path):
    """Translate the file at ``src_file_path`` and save it to ``dst_file_path``.

    Both files are opened explicitly as UTF-8 so that the accented characters
    in the translated output do not depend on the platform's default encoding.

    Args:
        src_file_path: Path of the text file to read.
        dst_file_path: Path the translated text is written to. It is only
            opened after the translation has finished.

    Returns:
        None. Any exception is reported on stdout instead of being raised.
    """
    try:
        with open(src_file_path, 'r', encoding='utf-8') as file:
            english_text = file.read()
        spanish_text = translate_text(english_text)

        with open(dst_file_path, 'w', encoding='utf-8') as file:
            file.write(spanish_text)
        print(f"Translation completed: {dst_file_path}")

    except Exception as e:
        # Script-level boundary: report the failure rather than crash with a
        # traceback, as the original best-effort behaviour did.
        print(f"Error processing file: {e}")
56
-
57
if __name__ == "__main__":
    # Command-line entry point: two positional paths, source first, then
    # destination, forwarded unchanged to translate_file.
    cli = argparse.ArgumentParser(description="Translate English text to Spanish")
    for arg_name, arg_help in (
        ("src_file_path", "Path to the source file with English text"),
        ("dst_file_path", "Path to save the translated Spanish text"),
    ):
        cli.add_argument(arg_name, help=arg_help)
    options = cli.parse_args()

    translate_file(options.src_file_path, options.dst_file_path)
 
 
 
1
+ from transformers import MarianMTModel, MarianTokenizer
2
+ from tqdm import tqdm
3
+ import os
4
+ import re
5
+ import argparse
6
+
7
+ # Load Model and Tokenizer
8
# Checkpoint identifier handed to both from_pretrained calls below.
# NOTE(review): presumably the English-to-Spanish OPUS-MT checkpoint, going by
# the CLI description at the bottom of the file - confirm.
model_name = "Helsinki-NLP/opus-mt-en-es"
# Both objects are created once at module level; translate_text reads them as
# module globals for every line it translates.
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
11
+
12
+ # Extract & separate timestamp and text
13
def extract_timestamp_and_text(line):
    """Split a transcript line into its timestamp and its text.

    A timestamped line looks like ``[12.34-15.67] some text``: a bracketed
    ``start-end`` pair of decimal numbers followed by the text.

    Args:
        line: A single line of the transcript.

    Returns:
        A ``(timestamp, text)`` tuple. ``timestamp`` is the ``start-end``
        string without the brackets, or ``''`` when the line does not start
        with a timestamp, in which case ``text`` is the whole line unchanged.
    """
    # Zero-or-more whitespace after the bracket, so that a bare "[0.00-1.50]"
    # with nothing after it is still recognised as a timestamp instead of
    # being handed to the translator as text.
    match = re.match(r'\[(\d+\.\d+-\d+\.\d+)\]\s*(.*)', line)
    if match:
        return match.group(1), match.group(2)
    return '', line
18
+
19
+ # Translate text
20
def translate_text(text):
    """Translate a transcript line by line, keeping timestamps and blank lines.

    Each non-blank line is split with ``extract_timestamp_and_text``; only the
    text part is run through the module-level ``tokenizer`` and ``model``.

    Args:
        text: The full transcript as one newline-separated string.

    Returns:
        The translated transcript, with exactly as many lines as the input.
        Blank lines stay empty, timestamped lines are written as
        ``[timestamp] translation`` (or just ``[timestamp]`` when there is no
        text), and lines without a timestamp are written as the bare
        translation.
    """
    translated_lines = []

    for line in tqdm(text.split('\n'), desc="Translating lines", leave=False):
        # Blank lines are passed through so the line layout is preserved.
        if not line.strip():
            translated_lines.append('')
            continue

        timestamp, line_text = extract_timestamp_and_text(line)
        parts = []

        # Emit the bracketed prefix only when the line really had a timestamp;
        # otherwise untimestamped lines would come out prefixed with "[]".
        if timestamp:
            parts.append(f'[{timestamp}]')

        if line_text.strip():
            # truncation=True cuts over-long lines down to the tokenizer's
            # maximum input length instead of raising.
            model_inputs = tokenizer(line_text, return_tensors="pt", truncation=True, padding="longest")
            translated = model.generate(**model_inputs)
            parts.append(tokenizer.batch_decode(translated, skip_special_tokens=True)[0])

        translated_lines.append(' '.join(parts))

    return '\n'.join(translated_lines)
44
+
45
+ # Main function to translate a file
46
def translate_file(src_file_path, dst_file_path):
    """Translate the file at ``src_file_path`` and save it to ``dst_file_path``.

    Both files are opened explicitly as UTF-8 so that the accented characters
    in the translated output do not depend on the platform's default encoding.

    Args:
        src_file_path: Path of the text file to read.
        dst_file_path: Path the translated text is written to. It is only
            opened after the translation has finished.

    Returns:
        None. Any exception is reported on stdout instead of being raised.
    """
    try:
        with open(src_file_path, 'r', encoding='utf-8') as file:
            english_text = file.read()
        spanish_text = translate_text(english_text)

        with open(dst_file_path, 'w', encoding='utf-8') as file:
            file.write(spanish_text)
        print(f"Translation completed: {dst_file_path}")

    except Exception as e:
        # Script-level boundary: report the failure rather than crash with a
        # traceback, as the original best-effort behaviour did.
        print(f"Error processing file: {e}")
58
+
59
if __name__ == "__main__":
    # Command-line entry point: two positional paths, source first, then
    # destination, forwarded unchanged to translate_file.
    cli = argparse.ArgumentParser(description="Translate English text to Spanish")
    for arg_name, arg_help in (
        ("src_file_path", "Path to the source file with English text"),
        ("dst_file_path", "Path to save the translated Spanish text"),
    ):
        cli.add_argument(arg_name, help=arg_help)
    options = cli.parse_args()

    translate_file(options.src_file_path, options.dst_file_path)