TongkunGuan committed on
Commit
90c5590
·
verified ·
1 Parent(s): 4d82faf

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +3 -268
README.md CHANGED
@@ -200,274 +200,9 @@ remaining weights from the LLM-guided Token Alignment and unfreeze all parameter
200
 
201
  ### Document Understanding Results
202
 
203
- <table>
204
- <thead>
205
- <tr>
206
- <th style="background-color: #d3d3d3;"><b>8B-Model</b></th>
207
- <th>MM1.5</th>
208
- <th>InfoVQA</th>
209
- <th>DeepForm</th>
210
- <th>ChartQA</th>
211
- <th>TextVQA<sub>Val</sub></th>
212
- <th>WTQ</th>
213
- <th>TabFact</th>
214
- <th>FUNSD</th>
215
- <th>SROIE</th>
216
- <th>KLC</th>
217
- </tr>
218
- </thead>
219
- <tbody>
220
- <tr>
221
- <td style="background-color: #d3d3d3;"><b>Score</b></td>
222
- <td>71.9</td>
223
- <td>-</td>
224
- <td>-</td>
225
- <td>55.6</td>
226
- <td>74.1</td>
227
- <td>-</td>
228
- <td>-</td>
229
- <td>-</td>
230
- <td>-</td>
231
- <td>-</td>
232
- </tr>
233
- <tr>
234
- <td>MiniCPM-V</td>
235
- <td>3B</td>
236
- <td>COLM'24</td>
237
- <td>71.9</td>
238
- <td>-</td>
239
- <td>-</td>
240
- <td>55.6</td>
241
- <td>74.1</td>
242
- <td>-</td>
243
- <td>-</td>
244
- <td>-</td>
245
- </tr>
246
- <tr>
247
- <td>Mini-Monkey</td>
248
- <td>2B</td>
249
- <td>ICLR'25</td>
250
- <td>87.4</td>
251
- <td>60.1</td>
252
- <td>-</td>
253
- <td>76.5</td>
254
- <td>75.7</td>
255
- <td>-</td>
256
- <td>-</td>
257
- <td>42.9</td>
258
- <td>70.3</td>
259
- <td>-</td>
260
- </tr>
261
- <tr>
262
- <td>InternVL2.5</td>
263
- <td>2B</td>
264
- <td>arxiv'24</td>
265
- <td>88.7</td>
266
- <td>60.9</td>
267
- <td>15.2</td>
268
- <td>79.2</td>
269
- <td>74.3</td>
270
- <td>38.7</td>
271
- <td>58.1</td>
272
- <td>37.9</td>
273
- <td>68.1</td>
274
- <td>16.1</td>
275
- </tr>
276
- <tr style="background-color: #d3d3d3;">
277
- <td style="background-color: #d3d3d3;"><b>TokenVL w/o TA</b></td>
278
- <td style="background-color: #d3d3d3;">8B</td>
279
- <td style="background-color: #d3d3d3;">-</td>
280
- <td style="background-color: #d3d3d3;"><u>93.8</u></td>
281
- <td style="background-color: #d3d3d3;">75.3</td>
282
- <td style="background-color: #d3d3d3;"><u>72.4</u></td>
283
- <td style="background-color: #d3d3d3;"><u>86.5</u></td>
284
- <td style="background-color: #d3d3d3;"><u>79.3</u></td>
285
- <td style="background-color: #d3d3d3;"><u>57.2</u></td>
286
- <td style="background-color: #d3d3d3;"><u>83.6</u></td>
287
- <td style="background-color: #d3d3d3;"><u>41.5</u></td>
288
- <td style="background-color: #d3d3d3;"><u>79.0</u></td>
289
- <td style="background-color: #d3d3d3;"><u>39.6</u></td>
290
- </tr>
291
- <tr style="background-color: #d3d3d3;">
292
- <td style="background-color: #d3d3d3;"><b>TokenVL</b></td>
293
- <td style="background-color: #d3d3d3;">8B</td>
294
- <td style="background-color: #d3d3d3;">-</td>
295
- <td style="background-color: #d3d3d3;"><b>94.2</b></td>
296
- <td style="background-color: #d3d3d3;"><u>76.5</u></td>
297
- <td style="background-color: #d3d3d3;"><b>72.9</b></td>
298
- <td style="background-color: #d3d3d3;"><b>86.6</b></td>
299
- <td style="background-color: #d3d3d3;"><b>79.9</b></td>
300
- <td style="background-color: #d3d3d3;"><b>61.4</b></td>
301
- <td style="background-color: #d3d3d3;"><b>85.2</b></td>
302
- <td style="background-color: #d3d3d3;"><b>42.2</b></td>
303
- <td style="background-color: #d3d3d3;"><b>81.9</b></td>
304
- <td style="background-color: #d3d3d3;"><b>39.9</b></td>
305
- </tr>
306
- </tbody>
307
- </table>
308
-
309
- <table>
310
- <thead>
311
- <tr>
312
- <th>Model</th>
313
- <th>size</th>
314
- <th>Venue</th>
315
- <th>DocVQA</th>
316
- <th>InfoVQA</th>
317
- <th>DeepForm</th>
318
- <th>ChartQA</th>
319
- <th>TextVQA<sub>Val</sub></th>
320
- <th>WTQ</th>
321
- <th>TabFact</th>
322
- <th>FUNSD</th>
323
- <th>SROIE</th>
324
- <th>KLC</th>
325
- </tr>
326
- </thead>
327
- <tbody>
328
- <tr>
329
- <td>Claude-3.5 Sonnet</td>
330
- <td colspan="2" style="background-color: #d3d3d3;">Closed-source model</td>
331
- <td>88.5</td>
332
- <td>59.1</td>
333
- <td>31.4</td>
334
- <td>51.8</td>
335
- <td>71.4</td>
336
- <td>47.1</td>
337
- <td>53.5</td>
338
- <td>-</td>
339
- <td>-</td>
340
- <td>24.8</td>
341
- </tr>
342
- <tr>
343
- <td>GeminiPro-1.5</td>
344
- <td colspan="2" style="background-color: #d3d3d3;">Closed-source model</td>
345
- <td>91.2</td>
346
- <td>73.9</td>
347
- <td>32.2</td>
348
- <td>34.7</td>
349
- <td>80.4</td>
350
- <td>50.3</td>
351
- <td>71.2</td>
352
- <td>-</td>
353
- <td>-</td>
354
- <td>24.1</td>
355
- </tr>
356
- <tr>
357
- <td>GPT4o 20240806</td>
358
- <td colspan="2" style="background-color: #d3d3d3;">Closed-source model</td>
359
- <td>92.8</td>
360
- <td>66.4</td>
361
- <td>38.4</td>
362
- <td>85.7</td>
363
- <td>70.5</td>
364
- <td>46.6</td>
365
- <td>81.1</td>
366
- <td>-</td>
367
- <td>-</td>
368
- <td>29.9</td>
369
- </tr>
370
- </tbody>
371
- </table>
372
-
373
- <table>
374
- <thead>
375
- <tr>
376
- <th>Model</th>
377
- <th>size</th>
378
- <th>Venue</th>
379
- <th>DocVQA</th>
380
- <th>InfoVQA</th>
381
- <th>DeepForm</th>
382
- <th>ChartQA</th>
383
- <th>TextVQA<sub>Val</sub></th>
384
- <th>WTQ</th>
385
- <th>TabFact</th>
386
- <th>FUNSD</th>
387
- <th>SROIE</th>
388
- <th>KLC</th>
389
- </tr>
390
- </thead>
391
- <tbody>
392
- <tr>
393
- <td>DocPedia</td>
394
- <td>7B</td>
395
- <td>arxiv'23</td>
396
- <td>47.1</td>
397
- <td>15.2</td>
398
- <td>-</td>
399
- <td>46.9</td>
400
- <td>60.2</td>
401
- <td>-</td>
402
- <td>-</td>
403
- <td>29.9</td>
404
- <td>21.4</td>
405
- <td>-</td>
406
- </tr>
407
- <tr>
408
- <td>DocOwl</td>
409
- <td>7B</td>
410
- <td>arxiv'23</td>
411
- <td>62.2</td>
412
- <td>38.2</td>
413
- <td>42.6</td>
414
- <td>57.4</td>
415
- <td>52.6</td>
416
- <td>26.9</td>
417
- <td>67.6</td>
418
- <td>0.5</td>
419
- <td>1.7</td>
420
- <td>30.3</td>
421
- </tr>
422
- <tr>
423
- <td>LLaVA1.5</td>
424
- <td>7B</td>
425
- <td>NeurIPS'23</td>
426
- <td>-</td>
427
- <td>-</td>
428
- <td>-</td>
429
- <td>9.3</td>
430
- <td>-</td>
431
- <td>-</td>
432
- <td>-</td>
433
- <td>0.2</td>
434
- <td>1.7</td>
435
- <td>-</td>
436
- </tr>
437
- <tr>
438
- <td>UReader</td>
439
- <td>7B</td>
440
- <td>EMNLP'23</td>
441
- <td>65.4</td>
442
- <td>42.2</td>
443
- <td>49.5</td>
444
- <td>59.3</td>
445
- <td>57.6</td>
446
- <td>29.4</td>
447
- <td>67.6</td>
448
- <td>-</td>
449
- <td>-</td>
450
- <td>32.8</td>
451
- </tr>
452
- <tr>
453
- <td>DocPedia</td>
454
- <td>7B</td>
455
- <td>arxiv'23</td>
456
- <td>47.1</td>
457
- <td>15.2</td>
458
- <td>-</td>
459
- <td>46.9</td>
460
- <td>60.2</td>
461
- <td>-</td>
462
- <td>-</td>
463
- <td>29.9</td>
464
- <td>21.4</td>
465
- <td>-</td>
466
- </tr>
467
- <!-- Add additional rows as needed -->
468
- </tbody>
469
- </table>
470
-
471
 
472
 
473
 
 
200
 
201
  ### Document Understanding Results
202
 
203
+ <div align="center">
204
+ <img width="1300" alt="Document understanding benchmark results comparing TokenVL with other models" src="https://cdn-uploads.huggingface.co/production/uploads/650d4a36cbd0c7d550d3b41b/Msfs1YkDQHq2-djhm6QqD.png">
205
+ </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
 
208