Yi Liu committed • Commit 33cbd55 • 1 Parent(s): 175a258

update README

README.md CHANGED
````diff
@@ -18,6 +18,10 @@ tags:
 
 **Online Demo**: https://mooer-speech.mthreads.com:10077/
 
+## 🔥 Update
+
+We release a new model, *MooER-80K-v2*, trained on 80K hours of data. Click [here](https://huggingface.co/mtspeech/MooER-MTL-80K) to try the new model.
+
 ## 📖 Introduction
 
 We introduce **MooER (摩耳)**: an LLM-based speech recognition and translation model developed by Moore Threads. With the *MooER* framework, you can transcribe speech into text (speech recognition, or ASR) and translate it into other languages (speech translation, or AST) in an end-to-end manner. The performance of *MooER* is demonstrated in the section below; our insights into model configurations, training strategies, and more are provided in our [technical report](https://arxiv.org/abs/2408.05101).
@@ -67,6 +71,7 @@ The performance of speech recognition is evaluated using WER/CER.
       <th>SeamlessM4T-v2</th>
       <th>MooER-5K</th>
       <th>MooER-80K</th>
+      <th>MooER-80K-v2</th>
     </tr>
     <tr>
       <td rowspan="7">Chinese</td>
@@ -78,6 +83,7 @@ The performance of speech recognition is evaluated using WER/CER.
       <td>4.09</td>
       <td>1.93</td>
       <td>1.25</td>
+      <td>1.00</td>
     </tr>
     <tr>
       <td>aishell2_ios</td>
@@ -88,6 +94,7 @@ The performance of speech recognition is evaluated using WER/CER.
       <td>4.81</td>
       <td>3.17</td>
       <td>2.67</td>
+      <td>2.62</td>
     </tr>
     <tr>
       <td>test_magicdata</td>
@@ -98,6 +105,7 @@ The performance of speech recognition is evaluated using WER/CER.
       <td>9.69</td>
       <td>3.48</td>
       <td>2.52</td>
+      <td>2.17</td>
     </tr>
     <tr>
       <td>test_thchs</td>
@@ -108,6 +116,7 @@ The performance of speech recognition is evaluated using WER/CER.
       <td>7.14</td>
       <td>4.11</td>
       <td>3.14</td>
+      <td>3.00</td>
     </tr>
     <tr>
       <td>fleurs cmn_dev</td>
@@ -118,6 +127,7 @@ The performance of speech recognition is evaluated using WER/CER.
       <td>7.12</td>
       <td>5.81</td>
       <td>5.23</td>
+      <td>5.15</td>
     </tr>
     <tr>
       <td>fleurs cmn_test</td>
@@ -128,6 +138,7 @@ The performance of speech recognition is evaluated using WER/CER.
       <td>7.66</td>
       <td>6.77</td>
       <td>6.18</td>
+      <td>6.14</td>
     </tr>
     <tr>
       <td>average</td>
@@ -138,6 +149,7 @@ The performance of speech recognition is evaluated using WER/CER.
       <td><strong>6.75</strong></td>
       <td><strong>4.21</strong></td>
       <td><strong>3.50</strong></td>
+      <td><strong>3.35</strong></td>
     </tr>
     <tr>
       <td rowspan="7">English</td>
@@ -149,6 +161,7 @@ The performance of speech recognition is evaluated using WER/CER.
       <td>2.77</td>
       <td>7.78</td>
       <td>4.11</td>
+      <td>3.57</td>
     </tr>
     <tr>
       <td>librispeech test_other</td>
@@ -159,6 +172,7 @@ The performance of speech recognition is evaluated using WER/CER.
       <td>5.25</td>
       <td>15.25</td>
       <td>9.99</td>
+      <td>9.09</td>
     </tr>
     <tr>
       <td>fleurs eng_dev</td>
@@ -169,6 +183,7 @@ The performance of speech recognition is evaluated using WER/CER.
       <td>11.36</td>
       <td>18.89</td>
       <td>13.32</td>
+      <td>13.12</td>
     </tr>
     <tr>
       <td>fleurs eng_test</td>
@@ -179,6 +194,7 @@ The performance of speech recognition is evaluated using WER/CER.
       <td>11.82</td>
       <td>20.41</td>
       <td>14.97</td>
+      <td>14.74</td>
     </tr>
     <tr>
       <td>gigaspeech dev</td>
@@ -189,6 +205,7 @@ The performance of speech recognition is evaluated using WER/CER.
       <td>28.01</td>
       <td>23.46</td>
       <td>16.92</td>
+      <td>17.34</td>
     </tr>
     <tr>
       <td>gigaspeech test</td>
@@ -199,6 +216,7 @@ The performance of speech recognition is evaluated using WER/CER.
       <td>28.65</td>
       <td>22.09</td>
       <td>16.64</td>
+      <td>16.97</td>
     </tr>
     <tr>
       <td>average</td>
@@ -209,6 +227,7 @@ The performance of speech recognition is evaluated using WER/CER.
       <td><strong>14.64</strong></td>
       <td><strong>17.98</strong></td>
       <td><strong>12.66</strong></td>
+      <td><strong>12.47</strong></td>
     </tr>
 </table>
 
@@ -239,7 +258,7 @@ If you find MooER useful for your research, please 🌟 this repo and cite our work
 
 ```bibtex
 @article{liang2024mooer,
-  title = {MooER: an LLM-based Speech Recognition and Translation Model from Moore
+  title = {MooER: an LLM-based Speech Recognition and Translation Model from Moore Threads},
   author = {Zhenlin Liang, Junhao Xu, Yi Liu, Yichao Hu, Jian Li, Yajun Zheng, Meng Cai, Hua Wang},
   journal = {arXiv preprint arXiv:2408.05101},
   url = {https://arxiv.org/abs/2408.05101},
````