mjuvilla committed on
Commit
6980db2
·
1 Parent(s): 8be9040

forgot to add this file in the previous commit

Browse files
Files changed (1) hide show
  1. src/utils.py +55 -0
src/utils.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from subprocess import Popen, PIPE
3
+ import re
4
+
5
def moses_to_file(translated_moses_file: str, source_lang: str, target_lang: str, tikal_folder: str,
                  original_xliff_file_path: str) -> str:
    """
    Given a translated Moses text file, this function puts the translations into the xliff file, removes the
    <g> tags that are left in it and merges the xliff into a document again

    Parameters:
        translated_moses_file: Path to the translated Moses text file
        source_lang: Source language of the document
        target_lang: Target language of the document
        tikal_folder: Folder where tikal.sh is located
        original_xliff_file_path: Path to the xliff file, which is updated in place

    Returns:
        string: Path to the merged file, as printed by tikal after "Output:"

    Raises:
        RuntimeError: If the output of the tikal merge command contains no "Output:" line
    """
    tikal_script = os.path.join(tikal_folder, "tikal.sh")

    # put the translations into the xlf
    tikal_moses_to_xliff_command = [tikal_script, "-lm", original_xliff_file_path, "-sl",
                                    source_lang, "-tl", target_lang, "-from", translated_moses_file, "-totrg",
                                    "-noalttrans", "-to", original_xliff_file_path]
    Popen(tikal_moses_to_xliff_command).wait()

    # any tags that are still <g> have not been paired between original and translated texts by tikal so we remove
    # them. This may happen if a word in the original language has been split into more than one word with other
    # words in between, or because of an error in fastalign
    # The file is read and written explicitly as UTF-8 (the same encoding used to decode tikal's output below)
    # instead of the platform default, and the handles are closed as soon as we are done with them
    with open(original_xliff_file_path, encoding="utf-8") as xliff_file:
        text = xliff_file.read()
    result = re.sub(r'<g id="\d+">(.*?)</g>', r'\1', text)
    with open(original_xliff_file_path, "w", encoding="utf-8") as xliff_file:
        xliff_file.write(result)

    # merge into a docx again
    tikal_merge_doc_command = [tikal_script, "-m", original_xliff_file_path]
    final_process = Popen(tikal_merge_doc_command, stdout=PIPE, stderr=PIPE)
    # communicate() already waits for the process to finish, so no extra wait() is needed
    stdout, stderr = final_process.communicate()

    # get the path to the output file
    output = stdout.decode('utf-8')
    output_match = re.search(r'(?<=Output:\s)(.*)', output)
    if output_match is None:
        # fail with a readable message (including what tikal wrote to stderr) instead of an opaque TypeError
        raise RuntimeError(
            f"tikal did not report an output file when merging {original_xliff_file_path}: "
            f"{stderr.decode('utf-8', errors='replace').strip()}"
        )
    return output_match[0]
29
+
30
def file_to_moses(input_file: str, source_lang: str, target_lang: str, tikal_folder: str,
                  original_xliff_file_path: str) -> str:
    """
    Extract the text of a document with tikal: first an xliff file is generated from the document and then
    a plain text (Moses) file is generated from the xliff, where style and formatting are kept using tags
    like <g id=1> </g>

    Parameters:
        input_file: Path to the document to process
        source_lang: Source language of the document
        target_lang: Target language of the document
        tikal_folder: Folder where tikal.sh is located
        original_xliff_file_path: Path to the xliff file that is converted to plain text and that will be
            used later (presumably the file produced by the first tikal call; verify against the caller)

    Returns:
        string: Path to the plain text file (the xliff path followed by "." and the source language)
    """
    tikal_script = os.path.join(tikal_folder, "tikal.sh")
    language_flags = ["-sl", source_lang, "-tl", target_lang]

    # step 1: document -> xliff
    extraction = Popen([tikal_script, "-x", input_file, "-nocopy", *language_flags])
    extraction.wait()

    # step 2: xliff -> plain text with inline tags
    moses_export = Popen([tikal_script, "-xm", original_xliff_file_path, *language_flags])
    moses_export.wait()

    return original_xliff_file_path + f".{source_lang}"