Dy3257 committed on
Commit 7c19755
1 Parent(s): 535a983

Upload 9 files

app.py CHANGED
@@ -6,50 +6,22 @@
 
 import gradio as gr
 
-import ctranslate2
-from split import split_string
-
-translator_zh2en = ctranslate2.Translator("zh-en_model/", device="cpu")  ## model path
-translator2_zh2en = ctranslate2.Translator("zh2en_cmodel/", device="cpu")  ## model path
-translator_en2zh = ctranslate2.Translator("en-zh_model/", device="cpu")  ## model path
-translator2_en2zh = ctranslate2.Translator("en2zh_cmodel", device="cpu")  ## model path
-
-def translate(input_tokens, input_tokens2, mode):
-
-    input_tokens = input_tokens.split()
-    input_tokens2 = input_tokens2.split()
-
-    source = split_string(input_tokens)
-    length = len(source)
-
-    source2 = split_string(input_tokens2)
-    length2 = len(source2)
-
-    results = []
-    results2 = []
-
-    if mode == "汉译英":
-        results = translator_zh2en.translate_batch(source)  ## translate the split, tokenized sentences
-        results2 = translator2_zh2en.translate_batch(source2)  ## translate the split, tokenized sentences
-    else:
-        results = translator_en2zh.translate_batch(source)  ## translate the split, tokenized sentences
-        results2 = translator2_en2zh.translate_batch(source2)  ## translate the split, tokenized sentences
-
-    target = []
-    target2 = []
-
-    for i in range(length):
-        target += results[i].hypotheses[0]
-    for i in range(length2):
-        target2 += results2[i].hypotheses[0]
-
-    #print(results[0].hypotheses[0])  ## results[0] is the 0th sentence; hypotheses[0] is the best hypothesis
-    ##print(results[1].hypotheses[0])
-    #return results[0].hypotheses[0]
-    return ' '.join(target), ' '.join(target2)
-
-demo = gr.Interface(fn=translate,
-                    inputs=["text", "text", gr.Dropdown(["汉译英", "英译汉"])],
-                    outputs=["text", "text"],)
+from tokenizer import tokenize, tokenize2
+from translater import translate
+from detokenizer import detokenize, detokenize2
+
+def run(source_text, mode):
+    source_tokens = tokenize(source_text, mode)
+    source_tokens2 = tokenize2(source_text, mode)
+    source_tokenized_text = ' '.join(source_tokens)
+    target_tokens, target_tokens2 = translate(source_tokens, source_tokens2, mode)
+    target_text = detokenize(target_tokens, mode)
+    target_text2 = detokenize2(target_tokens2, mode)
+    return target_text, target_text2, source_tokenized_text
+
+
+demo = gr.Interface(fn=run,
+                    inputs=["text", gr.Dropdown(["汉译英", "英译汉"])],
+                    outputs=["text", "text", "text"],)
 
 demo.launch()
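(A gloss on the UI strings, which stay untranslated because the code matches on them: "汉译英" is Chinese-to-English and "英译汉" is English-to-Chinese. run() returns the translations from both model pairs plus the space-joined tokenized source.)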
detokenizer.py ADDED
@@ -0,0 +1,32 @@
+import re
+import sys
+from sacremoses import MosesDetokenizer
+
+md_en = MosesDetokenizer(lang='en')
+md_zh = MosesDetokenizer(lang='zh')
+
+def moses_detokenize(tokens, language='en'):
+    detokenizer = MosesDetokenizer(lang=language)
+
+    text = detokenizer.detokenize(tokens, return_str=True)
+
+    # Return the processed sentence
+    return text.strip()
+
+def detokenize(tokens, mode):
+    if mode == "汉译英":
+        text = moses_detokenize(tokens)
+        text = re.sub(r" n't", "n't", text)
+    else:
+        text = ''.join(tokens)
+
+    return text
+
+def detokenize2(tokens, mode):
+    if mode == "汉译英":
+        answer_en_bpe = md_en.detokenize(tokens, return_str=True)
+        text = re.sub(r"@@ ", "", answer_en_bpe)
+    else:
+        answer_zh_bpe = md_zh.detokenize(tokens, return_str=True)
+        text = re.sub(r"@@ ", "", answer_zh_bpe)
+    return text
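For orientation, a small sanity check of the two detokenizers (hypothetical token lists; `@@ ` is the subword-nmt continuation marker that detokenize2 strips):

from detokenizer import detokenize, detokenize2

# First model's output: plain Moses-style tokens
print(detokenize(["I", "do", "n't", "know", "."], "汉译英"))
# expected roughly: I don't know.

# Second model's output: BPE subwords with '@@' markers
print(detokenize2(["transl@@", "ation", "works", "."], "汉译英"))
# expected roughly: translation works.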
model2_data/bpecode.en ADDED
The diff for this file is too large to render. See raw diff
 
model2_data/bpecode.zh ADDED
The diff for this file is too large to render. See raw diff
 
model2_data/dict.zh.txt ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -1 +1,6 @@
-ctranslate2==4.1.0
+ctranslate2==4.1.0
+spacy==3.7.4
+nltk==3.8.1
+jieba==0.42.1
+sacremoses==0.1.1
+subword_nmt==0.3.8
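Two of these packages also need data files that pip does not install: tokenizer.py calls spacy.load('en_core_web_sm'), which requires the spaCy English pipeline, and NLTK's word_tokenize needs the punkt tokenizer data. A one-time setup sketch (assuming network access; not part of this commit):

import nltk
import spacy.cli

# Fetch the punkt data used by nltk.word_tokenize
nltk.download('punkt')

# Fetch the small English pipeline loaded in tokenizer.py
spacy.cli.download('en_core_web_sm')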
tokenizer.py ADDED
@@ -0,0 +1,78 @@
+import spacy
+from spacy.tokens import Doc
+
+# Load the English spaCy model
+nlp = spacy.load('en_core_web_sm')
+
+import nltk
+from nltk.tokenize import word_tokenize
+
+import jieba
+
+from sacremoses import MosesTokenizer
+from subword_nmt import apply_bpe
+import codecs
+
+jieba1 = jieba.Tokenizer()
+jieba2 = jieba.Tokenizer()
+jieba2.load_userdict('model2_data/dict.zh.txt')
+
+mt_zh = MosesTokenizer(lang='zh')
+with codecs.open('model2_data/bpecode.zh', 'r', 'utf-8') as f:
+    bpe_zh_f = apply_bpe.BPE(f)
+
+# English-side initialization: define the tokenizer, BPE codes, etc.
+mt_en = MosesTokenizer(lang='en')
+with codecs.open('model2_data/bpecode.en', 'r', 'utf-8') as f:
+    bpe_en_f = apply_bpe.BPE(f)
+
+def spacy_tokenize(line):
+    # Process the text with spaCy
+    doc = nlp(line)
+    # Collect the token strings
+    words = [token.text for token in doc]
+    # Join the tokens into one string, separated by single spaces
+    return ' '.join(words)
+
+
+def nltk_tokenize(line):
+    # Tokenize with NLTK's word_tokenize
+    tokens = word_tokenize(line)
+    #print(tokens)
+    return tokens
+
+
+def jieba_tokenize(line):
+    # Tokenize with jieba
+    tokens = list(jieba1.cut(line.strip()))  # strip() removes any surrounding whitespace
+    #print(tokens)
+    return tokens
+
+def tokenize(line, mode):
+    if mode == "汉译英":
+        return jieba_tokenize(line)
+    else:
+        return nltk_tokenize(spacy_tokenize(line))
+
+
+def jieba_tokenize2(line):
+    tokens = list(jieba2.cut(line.strip()))
+    return tokens
+
+def mt_bpe_zh(line):
+    zh_tok = mt_zh.tokenize(line)
+    bpe_zh = bpe_zh_f.segment_tokens(zh_tok)
+    print(bpe_zh)
+    return bpe_zh
+
+def mt_bpe_en(line):
+    en_tok = mt_en.tokenize(line)
+    bpe_en = bpe_en_f.segment_tokens(en_tok)
+    print(bpe_en)
+    return bpe_en
+
+def tokenize2(line, mode):
+    if mode == "汉译英":
+        return mt_bpe_zh(' '.join(jieba_tokenize2(line)))
+    else:
+        return mt_bpe_en(line)
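For orientation, how the two tokenization paths differ on the same input (hypothetical example; the actual BPE splits depend on the learned codes in model2_data/):

from tokenizer import tokenize, tokenize2

# Model 1 path: plain jieba word segmentation of the Chinese source
print(tokenize("我爱机器翻译", "汉译英"))
# e.g. ['我', '爱', '机器', '翻译']

# Model 2 path: jieba with a custom dict, then Moses tokenization, then BPE;
# non-final subwords carry the '@@' continuation marker
print(tokenize2("我爱机器翻译", "汉译英"))
# e.g. ['我', '爱', '机@@', '器', '翻译']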
translater.py ADDED
@@ -0,0 +1,35 @@
+import ctranslate2
+from split import split_string
+
+translator_zh2en = ctranslate2.Translator("zh-en_model/", device="cpu")  ## model path
+translator2_zh2en = ctranslate2.Translator("zh2en_cmodel/", device="cpu")  ## model path
+translator_en2zh = ctranslate2.Translator("en-zh_model/", device="cpu")  ## model path
+translator2_en2zh = ctranslate2.Translator("en2zh_cmodel", device="cpu")  ## model path
+
+def translate(input_tokens, input_tokens2, mode):
+
+    source = split_string(input_tokens)
+    length = len(source)
+
+    source2 = split_string(input_tokens2)
+    length2 = len(source2)
+
+    if mode == "汉译英":
+        results = translator_zh2en.translate_batch(source)  ## translate the split, tokenized sentences
+        results2 = translator2_zh2en.translate_batch(source2)  ## translate the split, tokenized sentences
+    else:
+        results = translator_en2zh.translate_batch(source)  ## translate the split, tokenized sentences
+        results2 = translator2_en2zh.translate_batch(source2)  ## translate the split, tokenized sentences
+
+    target = []
+    target2 = []
+
+    for i in range(length):
+        target += results[i].hypotheses[0]
+    for i in range(length2):
+        target2 += results2[i].hypotheses[0]
+
+    #print(results[0].hypotheses[0])  ## results[0] is the 0th sentence; hypotheses[0] is the best hypothesis
+    ##print(results[1].hypotheses[0])
+    #return results[0].hypotheses[0]
+    return target, target2
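translater.py imports split_string from a split module that is among the uploaded files but not shown in this view. From its use here, it takes one flat token list and returns a list of per-sentence token lists for translate_batch. A hypothetical minimal version, assuming sentences are cut at final punctuation:

# Hypothetical sketch of split.py (the real file is not rendered above):
# break a flat token list into per-sentence token lists.
SENTENCE_ENDERS = {'。', '!', '?', '.', '!', '?'}

def split_string(tokens):
    sentences, current = [], []
    for token in tokens:
        current.append(token)
        if token in SENTENCE_ENDERS:
            sentences.append(current)
            current = []
    if current:  # keep trailing tokens that lack final punctuation
        sentences.append(current)
    return sentences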