linbojunzi commited on
Commit
fd31a8c
·
verified ·
1 Parent(s): 7360ce1

Upload 30 files

Browse files
table_result/2407.00009v1_output.json ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "path": "table_paper/2407.00009v1.json",
4
+ "table_id": "1",
5
+ "section": "4.1",
6
+ "all_context": [
7
+ "The statistics of FPGA24 public benchmarks are summarized in Table 1 .",
8
+ "In FPGA24 contest, these circuits are obtained from different public benchmark suites and are then synthesized, placed, and routed on the target FPGA by using Vivado.",
9
+ "The routing solutions of all signal nets are removed for the contest task.",
10
+ "The benchmarks use the open-source FPGA Interchange Format (FPGAIF).",
11
+ "The nets in Table 1 include all signal nets to be routed and the connections represent the corresponding two-pin sub-nets to be routed.",
12
+ ""
13
+ ],
14
+ "target_context_ids": [
15
+ 0,
16
+ 4
17
+ ],
18
+ "selected_paragraphs": [
19
+ "[paragraph id = 0] The statistics of FPGA24 public benchmarks are summarized in Table 1 .",
20
+ "[paragraph id = 4] The nets in Table 1 include all signal nets to be routed and the connections represent the corresponding two-pin sub-nets to be routed."
21
+ ],
22
+ "table_html": "<figure class=\"ltx_table\" id=\"S3.T1\">\n<figcaption class=\"ltx_caption\"><span class=\"ltx_tag ltx_tag_table\"><span class=\"ltx_text\" id=\"S3.T1.2.1.1\" style=\"font-size:90%;\">Table 1</span>. </span><span class=\"ltx_text\" id=\"S3.T1.3.2\" style=\"font-size:90%;\">Statistics of FPGA24 public benchmarks</span></figcaption>\n<div class=\"ltx_inline-block ltx_transformed_outer\" id=\"S3.T1.4\" style=\"width:433.6pt;height:269pt;vertical-align:-1.0pt;\"><span class=\"ltx_transformed_inner\" style=\"transform:translate(-1.6pt,1.0pt) scale(0.99276869630907,0.99276869630907) ;\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S3.T1.4.1\">\n<tr class=\"ltx_tr\" id=\"S3.T1.4.1.1\">\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S3.T1.4.1.1.1\">Benchmark</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T1.4.1.1.2\">Nets (k)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T1.4.1.1.3\">Connections (k)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T1.4.1.1.4\">LUTs (k)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T1.4.1.1.5\">FFs (k)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T1.4.1.1.6\">DSPs</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T1.4.1.1.7\">BRAMs</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.4.1.2\">\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S3.T1.4.1.2.1\">logicnets_jscl</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T1.4.1.2.2\">28</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T1.4.1.2.3\">180</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T1.4.1.2.4\">31</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T1.4.1.2.5\">2</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T1.4.1.2.6\">0</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" 
id=\"S3.T1.4.1.2.7\">0</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.4.1.3\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S3.T1.4.1.3.1\">boom_med_pb</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.3.2\">54</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T1.4.1.3.3\">221</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.3.4\">36</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.3.5\">17</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.3.6\">24</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.3.7\">142</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.4.1.4\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S3.T1.4.1.4.1\">vtr_mcml</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.4.2\">71</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T1.4.1.4.3\">225</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.4.4\">43</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.4.5\">15</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.4.6\">105</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.4.7\">142</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.4.1.5\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S3.T1.4.1.5.1\">rosetta_fd</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.5.2\">77</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T1.4.1.5.3\">230</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.5.4\">46</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.5.5\">39</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.5.6\">72</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.5.7\">62</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.4.1.6\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S3.T1.4.1.6.1\">corundum_25g</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.6.2\">166</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T1.4.1.6.3\">495</td>\n<td 
class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.6.4\">73</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.6.5\">96</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.6.6\">0</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.6.7\">221</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.4.1.7\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S3.T1.4.1.7.1\">finn_radioml</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.7.2\">110</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T1.4.1.7.3\">405</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.7.4\">74</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.7.5\">46</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.7.6\">0</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.7.7\">25</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.4.1.8\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S3.T1.4.1.8.1\">vtr_lu64peeng</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.8.2\">143</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T1.4.1.8.3\">537</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.8.4\">90</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.8.5\">36</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.8.6\">128</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.8.7\">303</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.4.1.9\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S3.T1.4.1.9.1\">corescore_500</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.9.2\">179</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T1.4.1.9.3\">590</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.9.4\">96</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.9.5\">116</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.9.6\">0</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.9.7\">250</td>\n</tr>\n<tr class=\"ltx_tr\" 
id=\"S3.T1.4.1.10\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S3.T1.4.1.10.1\">corescore_500_pb</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.10.2\">175</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T1.4.1.10.3\">597</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.10.4\">96</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.10.5\">116</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.10.6\">0</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.10.7\">250</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.4.1.11\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S3.T1.4.1.11.1\">mlcad_d181_lefttwo3rds</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.11.2\">361</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T1.4.1.11.3\">916</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.11.4\">155</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.11.5\">203</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.11.6\">1344</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.11.7\">405</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.4.1.12\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S3.T1.4.1.12.1\">koios_dla_like_large</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.12.2\">509</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T1.4.1.12.3\">912</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.12.4\">189</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.12.5\">362</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.12.6\">2209</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.12.7\">192</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.4.1.13\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S3.T1.4.1.13.1\">boom_soc</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.13.2\">274</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" 
id=\"S3.T1.4.1.13.3\">1374</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.13.4\">227</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.13.5\">98</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.13.6\">61</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.13.7\">161</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.4.1.14\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S3.T1.4.1.14.1\">ispd16_example2</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.14.2\">449</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T1.4.1.14.3\">1455</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.14.4\">289</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.14.5\">234</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.14.6\">200</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.4.1.14.7\">384</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.4.1.15\">\n<td class=\"ltx_td ltx_align_left ltx_border_b ltx_border_r ltx_border_t\" id=\"S3.T1.4.1.15.1\">UltraScale+ xcvu3p</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_t\" id=\"S3.T1.4.1.15.2\">-</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S3.T1.4.1.15.3\">-</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_t\" id=\"S3.T1.4.1.15.4\">394</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_t\" id=\"S3.T1.4.1.15.5\">788</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_t\" id=\"S3.T1.4.1.15.6\">2280</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_t\" id=\"S3.T1.4.1.15.7\">720</td>\n</tr>\n</table>\n</span></div>\n</figure>",
23
+ "perturb_sentence_id": 0,
24
+ "output": {
25
+ "perturbed_statement": "[paragraph id = 0] The statistics of FPGA24 public benchmarks are summarized in Table 2. In FPGA24 contest, these circuits are obtained from different public benchmark suites and are then synthesized, placed, and routed on the target FPGA by using Vivado.",
26
+ "perturbed_explanation": "1. The original explanation states that the data for FPGA24 benchmarks are summarized in Table 1. 2. In the statement, it is mentioned that these statistics are summarized in Table 2, which contains an incorrect table reference as it contradicts the context provided [paragraph id = 4]."
27
+ }
28
+ },
29
+ {
30
+ "path": "table_paper/2407.00009v1.json",
31
+ "table_id": "2",
32
+ "section": "4.2",
33
+ "all_context": [
34
+ "The overall results of different methods are presented in Table 2 .",
35
+ "Compared with Vivado, RWRoute can significantly reduce the wirelength but incur considerable time overhead in some circuits, like mlcad_d181_lefttwo3rds and boom_soc.",
36
+ "Compared with both Vivado and RWRoute, our router can not only run two times faster on average but also further improve the wirelength in most cases, demonstrating the effectiveness of our proposed parallel framework.",
37
+ "In the following, we will conduct two ablation studies to discuss the contributions of different techniques in our proposed method.",
38
+ ""
39
+ ],
40
+ "target_context_ids": [
41
+ 0,
42
+ 1,
43
+ 2
44
+ ],
45
+ "selected_paragraphs": [
46
+ "[paragraph id = 0] The overall results of different methods are presented in Table 2 .",
47
+ "[paragraph id = 1] Compared with Vivado, RWRoute can significantly reduce the wirelength but incur considerable time overhead in some circuits, like mlcad_d181_lefttwo3rds and boom_soc.",
48
+ "[paragraph id = 2] Compared with both Vivado and RWRoute, our router can not only run two times faster on average but also further improve the wirelength in most cases, demonstrating the effectiveness of our proposed parallel framework."
49
+ ],
50
+ "table_html": "<figure class=\"ltx_table\" id=\"S3.T2\">\n<figcaption class=\"ltx_caption\"><span class=\"ltx_tag ltx_tag_table\"><span class=\"ltx_text\" id=\"S3.T2.2.1.1\" style=\"font-size:90%;\">Table 2</span>. </span><span class=\"ltx_text\" id=\"S3.T2.3.2\" style=\"font-size:90%;\">Overall performance. All metrics are the smaller the better.</span></figcaption><div class=\"ltx_flex_figure\">\n<div class=\"ltx_flex_cell ltx_flex_size_1\">\n<div class=\"ltx_inline-block ltx_figure_panel ltx_transformed_outer\" id=\"S3.T2.4\" style=\"width:433.6pt;height:199.5pt;vertical-align:-0.7pt;\"><span class=\"ltx_transformed_inner\" style=\"transform:translate(-97.3pt,44.6pt) scale(0.690201763473409,0.690201763473409) ;\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S3.T2.4.1\">\n<tr class=\"ltx_tr\" id=\"S3.T2.4.1.1\">\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S3.T2.4.1.1.1\" rowspan=\"2\"><span class=\"ltx_text\" id=\"S3.T2.4.1.1.1.1\">Benchmark</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" colspan=\"3\" id=\"S3.T2.4.1.1.2\">Vivado</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" colspan=\"3\" id=\"S3.T2.4.1.1.3\">RWRoute</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" colspan=\"3\" id=\"S3.T2.4.1.1.4\">Ours</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.4.1.2\">\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.2.1\">Runtime (s)</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.2.2\">Wirelength</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.4.1.2.3\">Score</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.2.4\">Runtime (s)</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.2.5\">Wirelength</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.4.1.2.6\">Score</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.2.7\">Runtime (s)</td>\n<td class=\"ltx_td ltx_align_center\" 
id=\"S3.T2.4.1.2.8\">Wirelength</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.2.9\">Score</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.4.1.3\">\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S3.T2.4.1.3.1\">logicnets_jscl</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T2.4.1.3.2\">78.33</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T2.4.1.3.3\">310</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T2.4.1.3.4\">101.50</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T2.4.1.3.5\">52.03</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T2.4.1.3.6\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.3.6.1\">226</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T2.4.1.3.7\">69.43</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T2.4.1.3.8\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.3.8.1\">35.26</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T2.4.1.3.9\">234</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T2.4.1.3.10\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.3.10.1\">55.13</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.4.1.4\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S3.T2.4.1.4.1\">boom_med_pb</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.4.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.4.2.1\">139.33</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.4.3\">823</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.4.1.4.4\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.4.4.1\">207.70</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.4.5\">230.88</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.4.6\">969</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.4.1.4.7\">304.69</td>\n<td class=\"ltx_td 
ltx_align_center\" id=\"S3.T2.4.1.4.8\">144.50</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.4.9\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.4.9.1\">806</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.4.10\">210.65</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.4.1.5\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S3.T2.4.1.5.1\">vtr_mcml</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.5.2\">490.33</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.5.3\">666</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.4.1.5.4\">507.90</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.5.5\">243.13</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.5.6\">594</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.4.1.5.7\">278.22</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.5.8\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.5.8.1\">94.29</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.5.9\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.5.9.1\">584</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.5.10\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.5.10.1\">143.26</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.4.1.6\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S3.T2.4.1.6.1\">rosetta_fd</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.6.2\">147.67</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.6.3\">888</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.4.1.6.4\">221.70</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.6.5\">161.30</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.6.6\">839</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.4.1.6.7\">229.07</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.6.8\"><span class=\"ltx_text ltx_font_bold\" 
id=\"S3.T2.4.1.6.8.1\">125.32</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.6.9\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.6.9.1\">804</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.6.10\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.6.10.1\">193.19</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.4.1.7\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S3.T2.4.1.7.1\">corundum_25g</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.7.2\">-</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.7.3\">-</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.4.1.7.4\">-</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.7.5\">249.61</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.7.6\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.7.6.1\">396</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.4.1.7.7\">264.25</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.7.8\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.7.8.1\">131.11</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.7.9\">500</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.7.10\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.7.10.1\">168.00</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.4.1.8\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S3.T2.4.1.8.1\">finn_radioml</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.8.2\">154.67</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.8.3\">338</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.4.1.8.4\">173.00</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.8.5\">119.88</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.8.6\">277</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.4.1.8.7\">135.59</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.8.8\"><span class=\"ltx_text 
ltx_font_bold\" id=\"S3.T2.4.1.8.8.1\">63.29</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.8.9\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.8.9.1\">251</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.8.10\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.8.10.1\">82.06</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.4.1.9\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S3.T2.4.1.9.1\">vtr_lu64peeng</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.9.2\">218.67</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.9.3\">1728</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.4.1.9.4\">369.60</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.9.5\">226.57</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.9.6\">1412</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.4.1.9.7\">345.12</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.9.8\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.9.8.1\">114.12</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.9.9\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.9.9.1\">1333</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.9.10\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.9.10.1\">236.01</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.4.1.10\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S3.T2.4.1.10.1\">corescore_500</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.10.2\">188.33</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.10.3\">751</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.4.1.10.4\">244.60</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.10.5\">158.84</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.10.6\">680</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.4.1.10.7\">210.96</td>\n<td class=\"ltx_td ltx_align_center\" 
id=\"S3.T2.4.1.10.8\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.10.8.1\">73.03</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.10.9\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.10.9.1\">668</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.10.10\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.10.10.1\">132.52</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.4.1.11\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S3.T2.4.1.11.1\">corescore_500_pb</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.11.2\">226.67</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.11.3\">861</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.4.1.11.4\">290.10</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.11.5\">278.30</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.11.6\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.11.6.1\">687</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.4.1.11.7\">319.17</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.11.8\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.11.8.1\">138.63</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.11.9\">739</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.11.10\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.11.10.1\">198.67</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.4.1.12\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S3.T2.4.1.12.1\">mlcad_d181_lefttwo3rds</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.12.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.12.2.1\">407.67</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.12.3\">1159</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.4.1.12.4\">482.80</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.12.5\">1,779.59</td>\n<td class=\"ltx_td ltx_align_center\" 
id=\"S3.T2.4.1.12.6\">809</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.4.1.12.7\">1,682.53</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.12.8\">409.81</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.12.9\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.12.9.1\">771</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.12.10\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.12.10.1\">445.93</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.4.1.13\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S3.T2.4.1.13.1\">koios_dla_like_large</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.13.2\">542.33</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.13.3\">927</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.4.1.13.4\">580.80</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.13.5\">392.07</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.13.6\">548</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.4.1.13.7\">407.67</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.13.8\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.13.8.1\">181.47</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.13.9\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.13.9.1\">520</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.13.10\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.13.10.1\">215.33</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.4.1.14\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S3.T2.4.1.14.1\">boom_soc</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.14.2\">711.00</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.14.3\">2235</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.4.1.14.4\">863.40</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.14.5\">1,292.74</td>\n<td class=\"ltx_td ltx_align_center\" 
id=\"S3.T2.4.1.14.6\">1698</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.4.1.14.7\">1,333.26</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.14.8\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.14.8.1\">635.33</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.14.9\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.14.9.1\">1673</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.14.10\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.14.10.1\">739.10</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.4.1.15\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S3.T2.4.1.15.1\">ispd16_example2</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.15.2\">385.00</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.15.3\">1481</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.4.1.15.4\">494.60</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.15.5\">584.94</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.15.6\">1114</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.4.1.15.7\">637.85</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.15.8\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.15.8.1\">314.65</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.15.9\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.15.9.1\">939</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.4.1.15.10\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.4.1.15.10.1\">377.09</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.4.1.16\">\n<td class=\"ltx_td ltx_align_left ltx_border_b ltx_border_r ltx_border_t\" id=\"S3.T2.4.1.16.1\">Avg. 
Ratio</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_t\" id=\"S3.T2.4.1.16.2\">2.04</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_t\" id=\"S3.T2.4.1.16.3\">1.31</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S3.T2.4.1.16.4\">1.73</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_t\" id=\"S3.T2.4.1.16.5\">2.10</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_t\" id=\"S3.T2.4.1.16.6\">1.03</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S3.T2.4.1.16.7\">1.76</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_t\" id=\"S3.T2.4.1.16.8\">1.00</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_t\" id=\"S3.T2.4.1.16.9\">1.00</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_t\" id=\"S3.T2.4.1.16.10\">1.00</td>\n</tr>\n</table>\n</span></div>\n</div>\n<div class=\"ltx_flex_break\"></div>\n<div class=\"ltx_flex_cell ltx_flex_size_1\">\n<p class=\"ltx_p ltx_figure_panel\" id=\"S3.T2.5\"><span class=\"ltx_text\" id=\"S3.T2.5.1\" style=\"font-size:80%;\">*Vivado fails to route the corundum_25g due to the failure in the DRC during the routing.</span></p>\n</div>\n</div>\n</figure>",
51
+ "perturb_sentence_id": 1,
52
+ "output": {
53
+ "perturbed_statement": "[paragraph id = 1] Compared with Vivado, RWRoute can significantly reduce the wirelength and eliminate time overhead in all circuits, like mlcad_d181_lefttwo3rds and boom_soc. Compared with both Vivado and RWRoute, our router can not only run two times faster on average but also further improve the wirelength in most cases, demonstrating the effectiveness of our proposed parallel framework.",
54
+ "perturbed_explanation": "The original explanation states: 1. RWRoute significantly reduces wirelength but can introduce time overhead in specific circuits, such as mlcad_d181_lefttwo3rds and boom_soc. 2. The proposed router enhances speed by twofold on average while improving wirelength, showcasing the framework's efficiency. The statement introduces inaccuracies by claiming RWRoute eliminates time overhead in all circuits, which contradicts the noted presence of time overhead in specific cases. By specifying that time overhead is eliminated universally, the altered statement no longer aligns with contextual evidence presented."
55
+ }
56
+ },
57
+ {
58
+ "path": "table_paper/2407.00009v1.json",
59
+ "table_id": "3",
60
+ "section": "4.3",
61
+ "all_context": [
62
+ "Firstly, we conduct an ablation study on the recursive partitioning ternary tree (RPTT) in our framework by replacing the RPTT with the single recursive partitioning tree in ParaDRo (Hoo and Kumar, 2018 ).",
63
+ "The comparison results, shown in Table 3 , reveal that the RPTT can reduce the runtime by 14% without obvious wirelength degradations.",
64
+ "Secondly, we study the effect of the hybrid updating strategy (HUS) for congestion coefficients.",
65
+ "We disable the HUS and apply the default updating strategy in RWRoute.",
66
+ "The results on the four congested designs, depicted in Figure 6 , show that our HUS can both improve the runtime and the wirelength for congested designs.",
67
+ "In particular, the runtime of mlcad_d181_lefttwo3rds is accelerated by around 4.5 times, and the wirelengths of mlcad_d181_lefttwo3rds and boom_med_pb are reduced by over 16%.",
68
+ ""
69
+ ],
70
+ "target_context_ids": [
71
+ 0,
72
+ 1
73
+ ],
74
+ "selected_paragraphs": [
75
+ "[paragraph id = 0] Firstly, we conduct an ablation study on the recursive partitioning ternary tree (RPTT) in our framework by replacing the RPTT with the single recursive partitioning tree in ParaDRo (Hoo and Kumar, 2018 ).",
76
+ "[paragraph id = 1] The comparison results, shown in Table 3 , reveal that the RPTT can reduce the runtime by 14% without obvious wirelength degradations."
77
+ ],
78
+ "table_html": "<figure class=\"ltx_table\" id=\"S4.T3\">\n<figcaption class=\"ltx_caption\"><span class=\"ltx_tag ltx_tag_table\"><span class=\"ltx_text\" id=\"S4.T3.2.1.1\" style=\"font-size:90%;\">Table 3</span>. </span><span class=\"ltx_text\" id=\"S4.T3.3.2\" style=\"font-size:90%;\">The comparison between Ours w.o. RPTT and Ours. The ratios larger than 1 represent the quality degradation.</span></figcaption>\n<div class=\"ltx_inline-block ltx_transformed_outer\" id=\"S4.T3.4\" style=\"width:433.6pt;height:404.9pt;vertical-align:-0.0pt;\"><span class=\"ltx_transformed_inner\" style=\"transform:translate(72.2pt,-67.4pt) scale(1.49954817342014,1.49954817342014) ;\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S4.T3.4.1\">\n<tr class=\"ltx_tr\" id=\"S4.T3.4.1.1\">\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T3.4.1.1.1\">Benchmark</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T3.4.1.1.2\">Runtime (s)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T3.4.1.1.3\">Wirelength</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T3.4.1.1.4\">Score</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.4.1.2\">\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T3.4.1.2.1\">logicnets_jscl</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T3.4.1.2.2\">1.02</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T3.4.1.2.3\">0.98</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T3.4.1.2.4\">1.00</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.4.1.3\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T3.4.1.3.1\">boom_med_pb</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.3.2\">1.15</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.3.3\">1.02</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.3.4\">1.10</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.4.1.4\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" 
id=\"S4.T3.4.1.4.1\">vtr_mcml</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.4.2\">1.46</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.4.3\">1.06</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.4.4\">1.30</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.4.1.5\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T3.4.1.5.1\">rosetta_fd</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.5.2\">1.11</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.5.3\">1.06</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.5.4\">1.09</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.4.1.6\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T3.4.1.6.1\">corundum_25g</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.6.2\">1.03</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.6.3\">0.76</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.6.4\">0.95</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.4.1.7\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T3.4.1.7.1\">finn_radioml</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.7.2\">1.02</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.7.3\">1.04</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.7.4\">1.03</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.4.1.8\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T3.4.1.8.1\">vtr_lu64peeng</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.8.2\">1.12</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.8.3\">1.02</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.8.4\">1.06</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.4.1.9\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T3.4.1.9.1\">corescore_500</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.9.2\">1.08</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.9.3\">1.01</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.9.4\">1.04</td>\n</tr>\n<tr 
class=\"ltx_tr\" id=\"S4.T3.4.1.10\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T3.4.1.10.1\">corescore_500_pb</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.10.2\">1.11</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.10.3\">1.08</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.10.4\">1.10</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.4.1.11\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T3.4.1.11.1\">mlcad_d181_lefttwo3rds</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.11.2\">1.16</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.11.3\">1.11</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.11.4\">1.15</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.4.1.12\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T3.4.1.12.1\">koios_dla_like_large</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.12.2\">1.14</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.12.3\">1.04</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.12.4\">1.12</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.4.1.13\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T3.4.1.13.1\">boom_soc</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.13.2\">1.42</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.13.3\">0.98</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.13.4\">1.32</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.4.1.14\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T3.4.1.14.1\">ispd16_example2</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.14.2\">1.01</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.14.3\">0.99</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.4.1.14.4\">1.00</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.4.1.15\">\n<td class=\"ltx_td ltx_align_left ltx_border_b ltx_border_r ltx_border_t\" id=\"S4.T3.4.1.15.1\">Avg. 
Ratio</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_t\" id=\"S4.T3.4.1.15.2\">1.14</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_t\" id=\"S4.T3.4.1.15.3\">1.01</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_t\" id=\"S4.T3.4.1.15.4\">1.10</td>\n</tr>\n</table>\n</span></div>\n</figure>",
79
+ "perturb_sentence_id": 0,
80
+ "output": {
81
+ "perturbed_statement": "[paragraph id = 0] Firstly, we conduct an ablation study on the recursive partitioning ternary tree (RPTT) in our framework by replacing the RPTT with the binary recursive partitioning tree in ParaDRo (Hoo and Kumar, 2018 ). The comparison results, shown in Table 3 , reveal that the RPTT can reduce the runtime by 14% without obvious wirelength degradations.",
82
+ "perturbed_explanation": "1. The original explanation highlights the superiority of the recursive partitioning ternary tree (RPTT) over the single recursive partitioning tree in terms of runtime and wirelength performance. 2. The altered statement inaccurately claims that the RPTT was compared against a binary recursive partitioning tree rather than a single one, introducing an inconsistency with the documented setup in the source."
83
+ }
84
+ }
85
+ ]
table_result/2407.00010v1_output.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "path": "table_paper/2407.00010v1.json",
4
+ "table_id": "1",
5
+ "section": "5.1",
6
+ "all_context": [
7
+ "The systems we profile are shown in Table 1 .",
8
+ "We consider these systems as they demonstrate three prominent CPU manufactures and different generations of GPUs.",
9
+ "We utilize PyTorch v2.0.1, Torchvision v0.15.2, Numpy v1.26.0, Huggingface v0.20.2, and Accelerate v0.26.1.",
10
+ "We note that the M1-Pro results only include the Llama-2 (7B) and Mistral (7B) results, as Falcon (7B) generally did not complete tasks in less than two orders of magnitude greater runtime.",
11
+ ""
12
+ ],
13
+ "target_context_ids": [
14
+ 0,
15
+ 1,
16
+ 3
17
+ ],
18
+ "selected_paragraphs": [
19
+ "[paragraph id = 0] The systems we profile are shown in Table 1 .",
20
+ "[paragraph id = 1] We consider these systems as they demonstrate three prominent CPU manufactures and different generations of GPUs.",
21
+ "[paragraph id = 3] We note that the M1-Pro results only include the Llama-2 (7B) and Mistral (7B) results, as Falcon (7B) generally did not complete tasks in less than two orders of magnitude greater runtime."
22
+ ],
23
+ "table_html": "<figure class=\"ltx_table\" id=\"S5.T1\">\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S5.T1.3\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S5.T1.3.4.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_th_row ltx_border_tt\" id=\"S5.T1.3.4.1.1\">System Name</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T1.3.4.1.2\">CPU</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T1.3.4.1.3\">GPU(s) per Node</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T1.3.4.1.4\">DRAM per Node</th>\n<th class=\"ltx_td ltx_nopad_r ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T1.3.4.1.5\">VRAM per GPU</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S5.T1.3.5.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_t\" id=\"S5.T1.3.5.1.1\">Macbook Pro</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T1.3.5.1.2\">10-core M1 Pro</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T1.3.5.1.3\">14-core M1 Pro</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T1.3.5.1.4\">32GB</td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_center ltx_border_t\" id=\"S5.T1.3.5.1.5\">-</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T1.2.2\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S5.T1.2.2.3\">Swing AMD+A100</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.1.1.1\">264-core AMD EPYC 7742</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.2.2.2\">8NVIDIA A100</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.2.2.4\">1TB</td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S5.T1.2.2.5\">40GB</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T1.3.3\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_bb\" id=\"S5.T1.3.3.2\">Palmetto Intel+V100</th>\n<td class=\"ltx_td 
ltx_align_center ltx_border_bb\" id=\"S5.T1.3.3.3\">40-Core Intel Xeon 6148G</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T1.3.3.1\">2NVIDIA V100</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T1.3.3.4\">376GB</td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_center ltx_border_bb\" id=\"S5.T1.3.3.5\">16GB</td>\n</tr>\n</tbody>\n</table>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 1. </span>Our System Configurations</figcaption>\n</figure>",
24
+ "perturb_sentence_id": 3,
25
+ "output": {
26
+ "perturbed_statement": "[paragraph id = 3] We note that the M1-Pro results only include the Falcon (7B) and Mistral (7B) results, as Llama-2 (7B) generally did not complete tasks in less than two orders of magnitude greater runtime.",
27
+ "perturbed_explanation": "1. The original explanation mentions that the M1-Pro results include the Llama-2 (7B) and Mistral (7B) results, whilst Falcon (7B) was excluded due to runtime inefficiency. 2. The statement is incorrect as it claims Falcon (7B) and Mistral (7B) results are included instead of the actual Llama-2 (7B) and Mistral (7B). Falcon (7B) was noted for its omission due to runtime issues, which does not align with the claim made in the statement."
28
+ }
29
+ },
30
+ {
31
+ "path": "table_paper/2407.00010v1.json",
32
+ "table_id": "1",
33
+ "section": "5.3",
34
+ "all_context": [
35
+ "Here, we present the impacts on runtime, energy consumption per token, and throughput for LLMs across different hardware configurations while varying the number of input tokens.",
36
+ "We perform these experiments using the suite of systems outlined in Table 1 with the models outlined in Section 4.1 .",
37
+ "In our experiments on the Palmetto Intel+V100 system, the V100 GPU had an out-of-memory error beyond 1024 output tokens for Falcon (7B).",
38
+ "Our runtime measurements show a significant increase as input tokens grow.",
39
+ "As depicted in Figure 1(a) , all systems exhibit a nonlinear escalation in runtime with increasing token counts, with the M1-Pro system showing the most significant magnitude.",
40
+ "This trend highlights the computational burden imposed by larger input sizes, particularly on smaller systems that are not as well designed to handle extensive workloads.",
41
+ "For all systems, we notice that throughput follows a “roofline model” with increasing input tokens (roofline, ).",
42
+ "Figure 1(b) illustrates these dynamics, indicating an increase in throughput for all systems until a certain point where inference becomes bound by compute and not by the overhead of the software, as described by roofline performance models (roofline, ).",
43
+ "Energy efficiency varies markedly across different systems.",
44
+ "The M1-Pro demonstrates consistently low energy consumption per token, particularly for smaller input sizes, as shown in Figure 1(c) .",
45
+ "This efficiency reflects the M1-Pro's design optimization for low-power operations.",
46
+ "In contrast, the Swing AMD+A100, while capable of handling more significant token inputs more efficiently, consumed more energy per token for small workloads yet became more energy efficient at larger input token sizes, underscoring a trade-off between workload size and energy efficiency.",
47
+ ""
48
+ ],
49
+ "target_context_ids": [
50
+ 1
51
+ ],
52
+ "selected_paragraphs": [
53
+ "[paragraph id = 1] We perform these experiments using the suite of systems outlined in Table 1 with the models outlined in Section 4.1 ."
54
+ ],
55
+ "table_html": "<figure class=\"ltx_table\" id=\"S5.T1\">\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S5.T1.3\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S5.T1.3.4.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_th_row ltx_border_tt\" id=\"S5.T1.3.4.1.1\">System Name</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T1.3.4.1.2\">CPU</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T1.3.4.1.3\">GPU(s) per Node</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T1.3.4.1.4\">DRAM per Node</th>\n<th class=\"ltx_td ltx_nopad_r ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T1.3.4.1.5\">VRAM per GPU</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S5.T1.3.5.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_t\" id=\"S5.T1.3.5.1.1\">Macbook Pro</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T1.3.5.1.2\">10-core M1 Pro</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T1.3.5.1.3\">14-core M1 Pro</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T1.3.5.1.4\">32GB</td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_center ltx_border_t\" id=\"S5.T1.3.5.1.5\">-</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T1.2.2\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S5.T1.2.2.3\">Swing AMD+A100</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.1.1.1\">264-core AMD EPYC 7742</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.2.2.2\">8NVIDIA A100</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.2.2.4\">1TB</td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S5.T1.2.2.5\">40GB</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T1.3.3\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_bb\" id=\"S5.T1.3.3.2\">Palmetto Intel+V100</th>\n<td class=\"ltx_td 
ltx_align_center ltx_border_bb\" id=\"S5.T1.3.3.3\">40-Core Intel Xeon 6148G</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T1.3.3.1\">2NVIDIA V100</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T1.3.3.4\">376GB</td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_center ltx_border_bb\" id=\"S5.T1.3.3.5\">16GB</td>\n</tr>\n</tbody>\n</table>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 1. </span>Our System Configurations</figcaption>\n</figure>",
56
+ "perturb_sentence_id": 1,
57
+ "output": {
58
+ "perturbed_statement": "[paragraph id = 1] We perform these experiments using the suite of systems outlined in Table 1 with the models outlined in Section 4.1 . In our experiments on the Palmetto Intel+V100 system, the V100 GPU had an out-of-memory error beyond 512 output tokens for Falcon (7B).",
59
+ "perturbed_explanation": "1. The original context states that, on the Palmetto Intel+V100 system, the V100 GPU ran out of memory beyond 1024 output tokens for Falcon (7B). 2. The statement instead gives 512 output tokens as the limit, which contradicts the reported 1024-token threshold for this model. Hence, this difference establishes the inaccuracy of the statement's claim."
60
+ }
61
+ }
62
+ ]
table_result/2407.00013v1_output.json ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "path": "table_paper/2407.00013v1.json",
4
+ "table_id": "1",
5
+ "section": "4.1",
6
+ "all_context": [
7
+ "In this subsection, the performance of the caching system is evaluated in terms of cache hit and cache miss ratios, with the key variable being the threshold value set for the “sliding window algorithm”, which means after the threshold is reached, the IoT data corresponding to the context attribute will be considered as stale and evicted from cache.",
8
+ "The threshold is systematically varied from 10 minutes to 25 minutes, in increments of 5 minutes, as indicated in Table 1 .",
9
+ "The results of this variation, visualized in Figure 4 , suggest a trend of increasing cache hits as the threshold value rises.",
10
+ "After analysis, a 20-minute threshold has been selected for the experiments conducted in the subsequent sections.",
11
+ "It's important to note that post a threshold of 22 minutes, no significant impact or changes were observed in the system's performance.",
12
+ "This threshold selection ensures an optimal balance between cached context freshness and computational efficiency.",
13
+ ""
14
+ ],
15
+ "target_context_ids": [
16
+ 0,
17
+ 1,
18
+ 2,
19
+ 3,
20
+ 4,
21
+ 5
22
+ ],
23
+ "selected_paragraphs": [
24
+ "[paragraph id = 0] In this subsection, the performance of the caching system is evaluated in terms of cache hit and cache miss ratios, with the key variable being the threshold value set for the “sliding window algorithm”, which means after the threshold is reached, the IoT data corresponding to the context attribute will be considered as stale and evicted from cache.",
25
+ "[paragraph id = 1] The threshold is systematically varied from 10 minutes to 25 minutes, in increments of 5 minutes, as indicated in Table 1 .",
26
+ "[paragraph id = 2] The results of this variation, visualized in Figure 4 , suggest a trend of increasing cache hits as the threshold value rises.",
27
+ "[paragraph id = 3] After analysis, a 20-minute threshold has been selected for the experiments conducted in the subsequent sections.",
28
+ "[paragraph id = 4] It's important to note that post a threshold of 22 minutes, no significant impact or changes were observed in the system's performance.",
29
+ "[paragraph id = 5] This threshold selection ensures an optimal balance between cached context freshness and computational efficiency."
30
+ ],
31
+ "table_html": "<figure class=\"ltx_table\" id=\"S4.T1\">\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S4.T1.1\">\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_l ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1\" rowspan=\"2\" style=\"padding:2.5pt 8.0pt;\"><span class=\"ltx_text\" id=\"S4.T1.1.1.1.1.1\">Threshold</span></th>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" colspan=\"4\" id=\"S4.T1.1.1.1.2\" style=\"padding:2.5pt 8.0pt;\">Value</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.2.2\">\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T1.1.2.2.1\" style=\"padding:2.5pt 8.0pt;\">10</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T1.1.2.2.2\" style=\"padding:2.5pt 8.0pt;\">15</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T1.1.2.2.3\" style=\"padding:2.5pt 8.0pt;\">20</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T1.1.2.2.4\" style=\"padding:2.5pt 8.0pt;\">25</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.3.3\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_l ltx_border_r ltx_border_t\" id=\"S4.T1.1.3.3.1\" style=\"padding:2.5pt 8.0pt;\">Cache hit</th>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T1.1.3.3.2\" style=\"padding:2.5pt 8.0pt;\">174</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T1.1.3.3.3\" style=\"padding:2.5pt 8.0pt;\">179</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T1.1.3.3.4\" style=\"padding:2.5pt 8.0pt;\">184</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T1.1.3.3.5\" style=\"padding:2.5pt 8.0pt;\">186</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.4.4\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_l ltx_border_r ltx_border_t\" 
id=\"S4.T1.1.4.4.1\" style=\"padding:2.5pt 8.0pt;\">Cache miss</th>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T1.1.4.4.2\" style=\"padding:2.5pt 8.0pt;\">26</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T1.1.4.4.3\" style=\"padding:2.5pt 8.0pt;\">21</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T1.1.4.4.4\" style=\"padding:2.5pt 8.0pt;\">16</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T1.1.4.4.5\" style=\"padding:2.5pt 8.0pt;\">14</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.5.5\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_b ltx_border_l ltx_border_r ltx_border_t\" id=\"S4.T1.1.5.5.1\" style=\"padding:2.5pt 8.0pt;\">Ratio</th>\n<td class=\"ltx_td ltx_align_left ltx_border_b ltx_border_r ltx_border_t\" id=\"S4.T1.1.5.5.2\" style=\"padding:2.5pt 8.0pt;\">6.7</td>\n<td class=\"ltx_td ltx_align_left ltx_border_b ltx_border_r ltx_border_t\" id=\"S4.T1.1.5.5.3\" style=\"padding:2.5pt 8.0pt;\">8.5</td>\n<td class=\"ltx_td ltx_align_left ltx_border_b ltx_border_r ltx_border_t\" id=\"S4.T1.1.5.5.4\" style=\"padding:2.5pt 8.0pt;\">11.5</td>\n<td class=\"ltx_td ltx_align_left ltx_border_b ltx_border_r ltx_border_t\" id=\"S4.T1.1.5.5.5\" style=\"padding:2.5pt 8.0pt;\">13.3</td>\n</tr>\n</tbody>\n</table>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 1: </span>Cache hit and cache miss ratios at different thresholds.</figcaption>\n</figure>",
32
+ "perturb_sentence_id": 1,
33
+ "output": {
34
+ "perturbed_statement": "[paragraph id = 1] The threshold is systematically varied from 15 minutes to 30 minutes, in increments of 5 minutes, as indicated in Table 1. The results of this variation, visualized in Figure 4, suggest a trend of decreasing cache hits as the threshold value rises.",
35
+ "perturbed_explanation": "1. The original statement accurately describes the range of variation for the threshold, which is from 10 minutes to 25 minutes, and the observed trend of cache hits increasing with the threshold value, supported by visual references in the table and figure.\n2. The statement is inaccurate because it alters the threshold range to 15–30 minutes, which is not the analyzed range, and misrepresents the trend by claiming a decrease in cache hits rather than an increase. Therefore, this statement contradicts the data analysis results provided."
36
+ }
37
+ },
38
+ {
39
+ "path": "table_paper/2407.00013v1.json",
40
+ "table_id": "2",
41
+ "section": "4.2",
42
+ "all_context": [
43
+ "In this subsection, the size of the incoming entries load was systematically varied, testing with 150, 250, 350, and 500 for each threshold from 10 to 25 minutes in increments of 5 minutes as shown in Table 2 .",
44
+ "The findings reveal a consistent pattern across all test cases.",
45
+ "With an increasing number of entries, both cache hit and cache miss counts increase, but the cache hit ratio remains relatively consistent indicating that the “hybrid approach” also supports scalability.",
46
+ "From the Cache Hit Ratio heatmap (Figure 5 ), we can observe a pattern of increasing cache hit ratio with an increasing threshold for all entry sets.",
47
+ "For a threshold of 10, the cache hit ratio remains relatively steady around 7.33 to 7.36 across all entries.",
48
+ "As the threshold increases to 15, there is a notable improvement in the ratio, reaching up to 9.47 for 500 entries.",
49
+ "When the threshold is increased further to 20 minutes, the ratio experiences an additional boost to a range of approximately 12.33 to 12.46.",
50
+ "Interestingly, upon reaching a 25-minute threshold, the ratio increases to around 14.38 for all query sets, except for 250 queries where it marginally drops to 14.15.",
51
+ "This discrepancy could be attributed to various factors including caching policies, size of the cache, or variability in the access patterns.",
52
+ "These findings, illustrated in the heatmap, affirm the choice of a 20-minute threshold as a suitable point.",
53
+ "While the cache hit ratio generally improves with an increase in threshold, the gains beyond the 20-minute mark are relatively minor.",
54
+ "This confirms the trade-off between context freshness and computational efficiency, and indicates the diminishing returns of increasing the threshold beyond 20 minutes.",
55
+ "Therefore, a 20-minute threshold appears to be the optimal point for maintaining an efficient cache system, given the current configuration and workload.",
56
+ ""
57
+ ],
58
+ "target_context_ids": [
59
+ 0,
60
+ 2,
61
+ 3,
62
+ 4,
63
+ 5,
64
+ 6,
65
+ 7,
66
+ 8,
67
+ 9,
68
+ 10,
69
+ 11,
70
+ 12
71
+ ],
72
+ "selected_paragraphs": [
73
+ "[paragraph id = 0] In this subsection, the size of the incoming entries load was systematically varied, testing with 150, 250, 350, and 500 for each threshold from 10 to 25 minutes in increments of 5 minutes as shown in Table 2 .",
74
+ "[paragraph id = 2] With an increasing number of entries, both cache hit and cache miss counts increase, but the cache hit ratio remains relatively consistent indicating that the “hybrid approach” also supports scalability.",
75
+ "[paragraph id = 3] From the Cache Hit Ratio heatmap (Figure 5 ), we can observe a pattern of increasing cache hit ratio with an increasing threshold for all entry sets.",
76
+ "[paragraph id = 4] For a threshold of 10, the cache hit ratio remains relatively steady around 7.33 to 7.36 across all entries.",
77
+ "[paragraph id = 5] As the threshold increases to 15, there is a notable improvement in the ratio, reaching up to 9.47 for 500 entries.",
78
+ "[paragraph id = 6] When the threshold is increased further to 20 minutes, the ratio experiences an additional boost to a range of approximately 12.33 to 12.46.",
79
+ "[paragraph id = 7] Interestingly, upon reaching a 25-minute threshold, the ratio increases to around 14.38 for all query sets, except for 250 queries where it marginally drops to 14.15.",
80
+ "[paragraph id = 8] This discrepancy could be attributed to various factors including caching policies, size of the cache, or variability in the access patterns.",
81
+ "[paragraph id = 9] These findings, illustrated in the heatmap, affirm the choice of a 20-minute threshold as a suitable point.",
82
+ "[paragraph id = 10] While the cache hit ratio generally improves with an increase in threshold, the gains beyond the 20-minute mark are relatively minor.",
83
+ "[paragraph id = 11] This confirms the trade-off between context freshness and computational efficiency, and indicates the diminishing returns of increasing the threshold beyond 20 minutes.",
84
+ "[paragraph id = 12] Therefore, a 20-minute threshold appears to be the optimal point for maintaining an efficient cache system, given the current configuration and workload."
85
+ ],
86
+ "table_html": "<figure class=\"ltx_table\" id=\"S4.T2\">\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S4.T2.1\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_border_l ltx_border_r ltx_border_t\" id=\"S4.T2.1.1.1.1\" style=\"padding:2.5pt 8.0pt;\">No. of queries</th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_border_r ltx_border_t\" id=\"S4.T2.1.1.1.2\" style=\"padding:2.5pt 8.0pt;\">Threshold</th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_border_r ltx_border_t\" id=\"S4.T2.1.1.1.3\" style=\"padding:2.5pt 8.0pt;\">Cache Hit</th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_border_r ltx_border_t\" id=\"S4.T2.1.1.1.4\" style=\"padding:2.5pt 8.0pt;\">Cache miss</th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_border_r ltx_border_t\" id=\"S4.T2.1.1.1.5\" style=\"padding:2.5pt 8.0pt;\">Cache Hit Ratio</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S4.T2.1.2.1\">\n<td class=\"ltx_td ltx_align_left ltx_border_l ltx_border_r ltx_border_t\" id=\"S4.T2.1.2.1.1\" rowspan=\"4\" style=\"padding:2.5pt 8.0pt;\"><span class=\"ltx_text\" id=\"S4.T2.1.2.1.1.1\">150</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.2.1.2\" style=\"padding:2.5pt 8.0pt;\">10</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.2.1.3\" style=\"padding:2.5pt 8.0pt;\">528</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.2.1.4\" style=\"padding:2.5pt 8.0pt;\">72</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.2.1.5\" style=\"padding:2.5pt 8.0pt;\">7.33</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.3.2\">\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.3.2.1\" style=\"padding:2.5pt 8.0pt;\">15</td>\n<td class=\"ltx_td 
ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.3.2.2\" style=\"padding:2.5pt 8.0pt;\">542</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.3.2.3\" style=\"padding:2.5pt 8.0pt;\">58</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.3.2.4\" style=\"padding:2.5pt 8.0pt;\">9.34</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.4.3\">\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.4.3.1\" style=\"padding:2.5pt 8.0pt;\">20</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.4.3.2\" style=\"padding:2.5pt 8.0pt;\">555</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.4.3.3\" style=\"padding:2.5pt 8.0pt;\">45</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.4.3.4\" style=\"padding:2.5pt 8.0pt;\">12.33</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.5.4\">\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.5.4.1\" style=\"padding:2.5pt 8.0pt;\">25</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.5.4.2\" style=\"padding:2.5pt 8.0pt;\">561</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.5.4.3\" style=\"padding:2.5pt 8.0pt;\">39</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.5.4.4\" style=\"padding:2.5pt 8.0pt;\">14.38</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.6.5\">\n<td class=\"ltx_td ltx_align_left ltx_border_l ltx_border_r ltx_border_t\" id=\"S4.T2.1.6.5.1\" rowspan=\"4\" style=\"padding:2.5pt 8.0pt;\"><span class=\"ltx_text\" id=\"S4.T2.1.6.5.1.1\">250</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.6.5.2\" style=\"padding:2.5pt 8.0pt;\">10</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.6.5.3\" style=\"padding:2.5pt 8.0pt;\">880</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r 
ltx_border_t\" id=\"S4.T2.1.6.5.4\" style=\"padding:2.5pt 8.0pt;\">120</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.6.5.5\" style=\"padding:2.5pt 8.0pt;\">7.33</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.7.6\">\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.7.6.1\" style=\"padding:2.5pt 8.0pt;\">15</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.7.6.2\" style=\"padding:2.5pt 8.0pt;\">904</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.7.6.3\" style=\"padding:2.5pt 8.0pt;\">96</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.7.6.4\" style=\"padding:2.5pt 8.0pt;\">9.41</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.8.7\">\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.8.7.1\" style=\"padding:2.5pt 8.0pt;\">20</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.8.7.2\" style=\"padding:2.5pt 8.0pt;\">925</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.8.7.3\" style=\"padding:2.5pt 8.0pt;\">75</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.8.7.4\" style=\"padding:2.5pt 8.0pt;\">12.33</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.9.8\">\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.9.8.1\" style=\"padding:2.5pt 8.0pt;\">25</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.9.8.2\" style=\"padding:2.5pt 8.0pt;\">934</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.9.8.3\" style=\"padding:2.5pt 8.0pt;\">66</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.9.8.4\" style=\"padding:2.5pt 8.0pt;\">14.15</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.10.9\">\n<td class=\"ltx_td ltx_align_left ltx_border_l ltx_border_r ltx_border_t\" id=\"S4.T2.1.10.9.1\" rowspan=\"4\" 
style=\"padding:2.5pt 8.0pt;\"><span class=\"ltx_text\" id=\"S4.T2.1.10.9.1.1\">350</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.10.9.2\" style=\"padding:2.5pt 8.0pt;\">10</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.10.9.3\" style=\"padding:2.5pt 8.0pt;\">1232</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.10.9.4\" style=\"padding:2.5pt 8.0pt;\">168</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.10.9.5\" style=\"padding:2.5pt 8.0pt;\">7.33</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.11.10\">\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.11.10.1\" style=\"padding:2.5pt 8.0pt;\">15</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.11.10.2\" style=\"padding:2.5pt 8.0pt;\">1266</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.11.10.3\" style=\"padding:2.5pt 8.0pt;\">134</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.11.10.4\" style=\"padding:2.5pt 8.0pt;\">9.44</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.12.11\">\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.12.11.1\" style=\"padding:2.5pt 8.0pt;\">20</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.12.11.2\" style=\"padding:2.5pt 8.0pt;\">1296</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.12.11.3\" style=\"padding:2.5pt 8.0pt;\">104</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.12.11.4\" style=\"padding:2.5pt 8.0pt;\">12.46</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.13.12\">\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.13.12.1\" style=\"padding:2.5pt 8.0pt;\">25</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.13.12.2\" style=\"padding:2.5pt 
8.0pt;\">1309</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.13.12.3\" style=\"padding:2.5pt 8.0pt;\">91</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.13.12.4\" style=\"padding:2.5pt 8.0pt;\">14.38</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.14.13\">\n<td class=\"ltx_td ltx_align_left ltx_border_b ltx_border_l ltx_border_r ltx_border_t\" id=\"S4.T2.1.14.13.1\" rowspan=\"4\" style=\"padding:2.5pt 8.0pt;\"><span class=\"ltx_text\" id=\"S4.T2.1.14.13.1.1\">500</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.14.13.2\" style=\"padding:2.5pt 8.0pt;\">10</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.14.13.3\" style=\"padding:2.5pt 8.0pt;\">1761</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.14.13.4\" style=\"padding:2.5pt 8.0pt;\">239</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.14.13.5\" style=\"padding:2.5pt 8.0pt;\">7.36</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.15.14\">\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.15.14.1\" style=\"padding:2.5pt 8.0pt;\">15</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.15.14.2\" style=\"padding:2.5pt 8.0pt;\">1809</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.15.14.3\" style=\"padding:2.5pt 8.0pt;\">191</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.15.14.4\" style=\"padding:2.5pt 8.0pt;\">9.47</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.16.15\">\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.16.15.1\" style=\"padding:2.5pt 8.0pt;\">20</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.16.15.2\" style=\"padding:2.5pt 8.0pt;\">1851</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.16.15.3\" 
style=\"padding:2.5pt 8.0pt;\">149</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T2.1.16.15.4\" style=\"padding:2.5pt 8.0pt;\">12.42</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.17.16\">\n<td class=\"ltx_td ltx_align_left ltx_border_b ltx_border_r ltx_border_t\" id=\"S4.T2.1.17.16.1\" style=\"padding:2.5pt 8.0pt;\">25</td>\n<td class=\"ltx_td ltx_align_left ltx_border_b ltx_border_r ltx_border_t\" id=\"S4.T2.1.17.16.2\" style=\"padding:2.5pt 8.0pt;\">1870</td>\n<td class=\"ltx_td ltx_align_left ltx_border_b ltx_border_r ltx_border_t\" id=\"S4.T2.1.17.16.3\" style=\"padding:2.5pt 8.0pt;\">130</td>\n<td class=\"ltx_td ltx_align_left ltx_border_b ltx_border_r ltx_border_t\" id=\"S4.T2.1.17.16.4\" style=\"padding:2.5pt 8.0pt;\">14.38</td>\n</tr>\n</tbody>\n</table>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 2: </span>Cache hit and cache miss ratios at different number of entries.</figcaption>\n</figure>",
87
+ "perturb_sentence_id": 0,
88
+ "output": {
89
+ "perturbed_statement": "[paragraph id = 0] In this subsection, the size of the incoming entries load was systematically varied, testing with 200, 250, 350, and 600 for each threshold from 10 to 25 minutes in increments of 5 minutes as shown in Table 2. The findings reveal a consistent pattern across all test cases.",
90
+ "perturbed_explanation": "1. The original explanation highlights the systematic variation in the size of the incoming entries load to specific values: 150, 250, 350, and 500 entries. 2. The statement introduces different entry load sizes (200 and 600 instead of 150 and 500), which conflicts with the identified tested values. This alteration affects the understanding of the experimental setup and its methodology."
91
+ }
92
+ }
93
+ ]
table_result/2407.00014v2_output.json ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "path": "table_paper/2407.00014v2.json",
4
+ "table_id": "1",
5
+ "section": "2.1.2",
6
+ "all_context": [
7
+ "In our study, we established two anchor points based on the MVCs during maximal finger extension and maximal finger flexion (see Figure 1 .",
8
+ "Part A: Training).",
9
+ "Each MVC is associated with specific forearm muscle groups responsible for the different movements.",
10
+ "These two extreme conditions of muscle contraction allow us to capture the discernible sEMG signals that closely represent the high end of the muscle force spectrum.",
11
+ "We employed a 12-channel electromyography device to record sEMG signals from the forearm muscles responsible for finger movements in Figure 2 .",
12
+ "Each muscle group was associated with one or multiple channels.",
13
+ "Denoting the sEMG signals from per channel as , we obtained windowed data , with indexing the window number within the channel .",
14
+ "For each windowed segment , a set of 8 features were extracted, resulting in a feature vector .",
15
+ "Detailed features information can be seen in Table 1 , the reasons to choose them will be explained in feature extraction.",
16
+ "The features extraction process transformed each windowed segment into an 8-dimensional features space, hereby constructing a feature matrix for each channel: where is the total number of windows extracted from each channel.",
17
+ "The feature matrices from all 12 channels were concatenated to form a comprehensive feature tensor of size 12 .",
18
+ "The relationship between sEMG data and the finger force labels of forearm muscles is assumed to be an approximate linear model and can be expressed as: where is a matrix of coefficients, is the vectorization of the feature tensor.",
19
+ "The overall model derivation process can be seen in Figure 4 .",
20
+ "Our aim is to employ machine learning-based regression techniques to determine the parameters .",
21
+ "This two-point approach allows us to interpolate the finger force labels values by sEMG activities lying between the two MVCs conditions, thus can predict the finger force output based on Equation 1 .",
22
+ "It can be used to control the direction and approximate velocity of gesture changes, simplifying computational complexity and eliminating the dependence on kinetic and kinematic sensors.",
23
+ ""
24
+ ],
25
+ "target_context_ids": [
26
+ 8
27
+ ],
28
+ "selected_paragraphs": [
29
+ "[paragraph id = 8] Detailed features information can be seen in Table 1 , the reasons to choose them will be explained in feature extraction."
30
+ ],
31
+ "table_html": "<figure class=\"ltx_table\" id=\"S2.T1\">\n<figcaption class=\"ltx_caption ltx_centering\" style=\"font-size:90%;\"><span class=\"ltx_tag ltx_tag_table\">Table 1: </span><span class=\"ltx_text\" id=\"S2.T1.12.1\" style=\"font-size:89%;\">Selected Features with a Monotonic Relationship</span></figcaption>\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S2.T1.8\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S2.T1.8.9.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_th_row ltx_border_tt ltx_border_tt ltx_border_t\" id=\"S2.T1.8.9.1.1\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S2.T1.8.9.1.1.1\">\n<tr class=\"ltx_tr\" id=\"S2.T1.8.9.1.1.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T1.8.9.1.1.1.1.1\"><span class=\"ltx_text\" id=\"S2.T1.8.9.1.1.1.1.1.1\" style=\"font-size:90%;\">Feature name</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.8.9.1.1.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T1.8.9.1.1.1.2.1\"><span class=\"ltx_text\" id=\"S2.T1.8.9.1.1.1.2.1.1\" style=\"font-size:90%;\">and their abbreviation</span></td>\n</tr>\n</table>\n</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt ltx_border_tt ltx_border_t\" id=\"S2.T1.8.9.1.2\"><span class=\"ltx_text\" id=\"S2.T1.8.9.1.2.1\" style=\"font-size:90%;\">Formula</span></th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S2.T1.1.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_t\" id=\"S2.T1.1.1.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.1.1.2.1\" style=\"font-size:90%;\">Root Mean Square (RMS)</span></th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S2.T1.1.1.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.2.2\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row\" id=\"S2.T1.2.2.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.2.2.2.1\" style=\"font-size:90%;\">Mean 
Absolute Value (MAV)</span></th>\n<td class=\"ltx_td ltx_align_center\" id=\"S2.T1.2.2.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.3.3\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row\" id=\"S2.T1.3.3.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.3.3.2.1\" style=\"font-size:90%;\">Variance (VAR)</span></th>\n<td class=\"ltx_td ltx_align_center\" id=\"S2.T1.3.3.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.4.4\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row\" id=\"S2.T1.4.4.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.4.4.2.1\" style=\"font-size:90%;\">Standard Deviation (SD)</span></th>\n<td class=\"ltx_td ltx_align_center\" id=\"S2.T1.4.4.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.5.5\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row\" id=\"S2.T1.5.5.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.5.5.2.1\" style=\"font-size:90%;\">Integral (INT)</span></th>\n<td class=\"ltx_td ltx_align_center\" id=\"S2.T1.5.5.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.6.6\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row\" id=\"S2.T1.6.6.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.6.6.2.1\" style=\"font-size:90%;\">Wavelength (WL)</span></th>\n<td class=\"ltx_td ltx_align_center\" id=\"S2.T1.6.6.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.7.7\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row\" id=\"S2.T1.7.7.2\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S2.T1.7.7.2.1\">\n<tr class=\"ltx_tr\" id=\"S2.T1.7.7.2.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T1.7.7.2.1.1.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.7.7.2.1.1.1.1\" style=\"font-size:90%;\">Difference Absolute Standard</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.7.7.2.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T1.7.7.2.1.2.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.7.7.2.1.2.1.1\" style=\"font-size:90%;\">Deviation Value 
(DASDV)</span></td>\n</tr>\n</table>\n</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S2.T1.7.7.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.8.8\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_bb\" id=\"S2.T1.8.8.2\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S2.T1.8.8.2.1\">\n<tr class=\"ltx_tr\" id=\"S2.T1.8.8.2.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T1.8.8.2.1.1.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.8.8.2.1.1.1.1\" style=\"font-size:90%;\">Difference Absolute Mean</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.8.8.2.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T1.8.8.2.1.2.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.8.8.2.1.2.1.1\" style=\"font-size:90%;\">Value (DAMV)</span></td>\n</tr>\n</table>\n</th>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S2.T1.8.8.1\"></td>\n</tr>\n</tbody>\n</table>\n</figure>",
32
+ "perturb_sentence_id": 8,
33
+ "output": {
34
+ "perturbed_statement": "[paragraph id = 8] Detailed features information can be seen in Table 1, the reasons to choose them will be explained in feature extraction. The features extraction process transformed each segmented window into a 5-dimensional features space, hereby constructing a feature matrix for each channel, where is the total number of windows extracted from each channel.",
35
+ "perturbed_explanation": "The original explanation: The feature extraction process utilized transforms each windowed segment into an 8-dimensional feature space, constructing a comprehensive feature matrix for every channel. 1. This ensures accurate and systematic representation of the data. The incorrectness in the statement arises from indicating that the transformation results in a \"5-dimensional feature space\" instead of the correct \"8-dimensional feature space,\" inaccurately representing the dimensionality of the derived features."
36
+ }
37
+ },
38
+ {
39
+ "path": "table_paper/2407.00014v2.json",
40
+ "table_id": "1",
41
+ "section": "2.3",
42
+ "all_context": [
43
+ "After recording the sEMG signals, we combined the three sets of data from each participant to increase the overall dataset, resulting in one comprehensive dataset per person for training purpose.",
44
+ "Since the sEMG signal is extremely weak, it is easy to be disturbed by noise from various sources such as skin, sensors, and the environment.",
45
+ "In order to improve the analyzability of the electromyography signal, we must first preprocess it [28 ].",
46
+ "Firstly, we removed the direct current (DC) component from the 12-channel sEMG data to eliminate any baseline drift [29 ].",
47
+ "Then, we individually filtered each channel with a 6th order Butterworth bandpass filter from 10 Hz to 450 Hz to remove motion artifacts and high-frequency noise, ensuring that differences in electrode placement do not affect the sEMG signals [29 ].",
48
+ "A 50 Hz notch filter was also applied to each channel to eliminate power line interference [30 ].",
49
+ "After filtering, we performed full-wave rectification on the data.",
50
+ "For real-time force analysis, low latency and fast response are necessary, and smaller windows can achieve this.",
51
+ "Therefore, we processed all data using a sliding window approach (see Figure 4 ) with a 200ms window length and a 50ms step size [31 ].",
52
+ "In order to obtain useful information in sEMG and eliminate interfering components, it is necessary to carry out feature extraction.",
53
+ "Conventional sEMG signal features include time domain features, frequency domain features and time-frequency domain features [32 , 33 ].",
54
+ "In our two-point approach, the most critical is the use of linear relationship segments, so we extracted eight time-domain amplitude features shown in Table 1 from each of the 12 sEMG channels: Root Mean Square (RMS), Mean Absolute Value (MAV), Variance (VAR), Standard Deviation (SD), Integral (INT), Wavelength (WL), Difference Absolute Standard Deviation Value (DASDV), and Difference Absolute Mean Value (DAMV).",
55
+ "Therefore, the sEMG data mentioned mostly in this paper represents the time-frequency features.",
56
+ "While some features share similarities, we found that incorporating a broader set of features significantly enhanced the accuracy of our linear regression model.",
57
+ "The model input now consists of a matrix, providing a richer representation of the data.",
58
+ "Additionally, the model's adaptive nature allows for automatic adjustment of weights assigned to different input features, further minimizing error.",
59
+ "According to the formulas listed in Table 1 , we can infer that the values of these features have a monotonically increasing relationship with the values of the original signal sequence.",
60
+ "Therefore, after scaling the original sEMG signals, the scaling relationship will still be preserved in the feature values.",
61
+ "And the per-channel feature extraction processing can avoid the differences introduced by variations in electrode patch placement, providing a detailed and robust dataset for analyzing muscle force.",
62
+ ""
63
+ ],
64
+ "target_context_ids": [
65
+ 12,
66
+ 17,
67
+ 18
68
+ ],
69
+ "selected_paragraphs": [
70
+ "[paragraph id = 12] Therefore, the sEMG data mentioned mostly in this paper represents the time-frequency features.",
71
+ "[paragraph id = 17] Therefore, after scaling the original sEMG signals, the scaling relationship will still be preserved in the feature values.",
72
+ "[paragraph id = 18] And the per-channel feature extraction processing can avoid the differences introduced by variations in electrode patch placement, providing a detailed and robust dataset for analyzing muscle force."
73
+ ],
74
+ "table_html": "<figure class=\"ltx_table\" id=\"S2.T1\">\n<figcaption class=\"ltx_caption ltx_centering\" style=\"font-size:90%;\"><span class=\"ltx_tag ltx_tag_table\">Table 1: </span><span class=\"ltx_text\" id=\"S2.T1.12.1\" style=\"font-size:89%;\">Selected Features with a Monotonic Relationship</span></figcaption>\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S2.T1.8\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S2.T1.8.9.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_th_row ltx_border_tt ltx_border_tt ltx_border_t\" id=\"S2.T1.8.9.1.1\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S2.T1.8.9.1.1.1\">\n<tr class=\"ltx_tr\" id=\"S2.T1.8.9.1.1.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T1.8.9.1.1.1.1.1\"><span class=\"ltx_text\" id=\"S2.T1.8.9.1.1.1.1.1.1\" style=\"font-size:90%;\">Feature name</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.8.9.1.1.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T1.8.9.1.1.1.2.1\"><span class=\"ltx_text\" id=\"S2.T1.8.9.1.1.1.2.1.1\" style=\"font-size:90%;\">and their abbreviation</span></td>\n</tr>\n</table>\n</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt ltx_border_tt ltx_border_t\" id=\"S2.T1.8.9.1.2\"><span class=\"ltx_text\" id=\"S2.T1.8.9.1.2.1\" style=\"font-size:90%;\">Formula</span></th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S2.T1.1.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_t\" id=\"S2.T1.1.1.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.1.1.2.1\" style=\"font-size:90%;\">Root Mean Square (RMS)</span></th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S2.T1.1.1.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.2.2\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row\" id=\"S2.T1.2.2.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.2.2.2.1\" style=\"font-size:90%;\">Mean 
Absolute Value (MAV)</span></th>\n<td class=\"ltx_td ltx_align_center\" id=\"S2.T1.2.2.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.3.3\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row\" id=\"S2.T1.3.3.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.3.3.2.1\" style=\"font-size:90%;\">Variance (VAR)</span></th>\n<td class=\"ltx_td ltx_align_center\" id=\"S2.T1.3.3.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.4.4\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row\" id=\"S2.T1.4.4.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.4.4.2.1\" style=\"font-size:90%;\">Standard Deviation (SD)</span></th>\n<td class=\"ltx_td ltx_align_center\" id=\"S2.T1.4.4.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.5.5\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row\" id=\"S2.T1.5.5.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.5.5.2.1\" style=\"font-size:90%;\">Integral (INT)</span></th>\n<td class=\"ltx_td ltx_align_center\" id=\"S2.T1.5.5.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.6.6\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row\" id=\"S2.T1.6.6.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.6.6.2.1\" style=\"font-size:90%;\">Wavelength (WL)</span></th>\n<td class=\"ltx_td ltx_align_center\" id=\"S2.T1.6.6.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.7.7\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row\" id=\"S2.T1.7.7.2\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S2.T1.7.7.2.1\">\n<tr class=\"ltx_tr\" id=\"S2.T1.7.7.2.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T1.7.7.2.1.1.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.7.7.2.1.1.1.1\" style=\"font-size:90%;\">Difference Absolute Standard</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.7.7.2.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T1.7.7.2.1.2.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.7.7.2.1.2.1.1\" style=\"font-size:90%;\">Deviation Value 
(DASDV)</span></td>\n</tr>\n</table>\n</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S2.T1.7.7.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.8.8\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_bb\" id=\"S2.T1.8.8.2\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S2.T1.8.8.2.1\">\n<tr class=\"ltx_tr\" id=\"S2.T1.8.8.2.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T1.8.8.2.1.1.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.8.8.2.1.1.1.1\" style=\"font-size:90%;\">Difference Absolute Mean</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.8.8.2.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T1.8.8.2.1.2.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.8.8.2.1.2.1.1\" style=\"font-size:90%;\">Value (DAMV)</span></td>\n</tr>\n</table>\n</th>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S2.T1.8.8.1\"></td>\n</tr>\n</tbody>\n</table>\n</figure>",
75
+ "perturb_sentence_id": 12,
76
+ "output": {
77
+ "perturbed_statement": "[paragraph id = 12] Therefore, the sEMG data mentioned mostly in this paper represents the time-frequency features. However, we found incorporating a broader set of features did not improve the accuracy of our linear regression model.",
78
+ "perturbed_explanation": "1. The original explanation states that analyzing a wider range of features enhances the model's performance, which adds value to the research. 2. The statement suggests that expanding the feature set did not benefit the model's accuracy. This is incorrect because it contradicts the initial claim, which highlights the importance of a comprehensive feature selection in improving model performance."
79
+ }
80
+ },
81
+ {
82
+ "path": "table_paper/2407.00014v2.json",
83
+ "table_id": "2",
84
+ "section": "2.4",
85
+ "all_context": [
86
+ "Based on the Model Derivation section above, we need a model capable of near-linear fitting to meet our linear control requirements.",
87
+ "Therefore, we use Dendritic Net (DD) to implement the two-point approach, and use a fully linear network (LN), multi-layer perceptron (MLP) and convolutional neural network (CNN) for performance comparison.",
88
+ "The specific neural network diagram is shown in Figure 6 and Table 2 .",
89
+ ""
90
+ ],
91
+ "target_context_ids": [
92
+ 2
93
+ ],
94
+ "selected_paragraphs": [
95
+ "[paragraph id = 2] The specific neural network diagram is shown in Figure 6 and Table 2 ."
96
+ ],
97
+ "table_html": "<figure class=\"ltx_table\" id=\"S2.T2\">\n<figcaption class=\"ltx_caption ltx_centering\" style=\"font-size:90%;\"><span class=\"ltx_tag ltx_tag_table\">Table 2: </span><span class=\"ltx_text\" id=\"S2.T2.11.1\" style=\"font-size:89%;\">Models and Their Order</span></figcaption>\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S2.T2.7\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S2.T2.7.8.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt ltx_border_tt ltx_border_t\" id=\"S2.T2.7.8.1.1\"><span class=\"ltx_text\" id=\"S2.T2.7.8.1.1.1\" style=\"font-size:90%;\">Type</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt ltx_border_tt ltx_border_t\" id=\"S2.T2.7.8.1.2\"><span class=\"ltx_text\" id=\"S2.T2.7.8.1.2.1\" style=\"font-size:90%;\">Models</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt ltx_border_tt ltx_border_t\" id=\"S2.T2.7.8.1.3\"><span class=\"ltx_text\" id=\"S2.T2.7.8.1.3.1\" style=\"font-size:90%;\">Core Formulas</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt ltx_border_tt ltx_border_t\" id=\"S2.T2.7.8.1.4\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S2.T2.7.8.1.4.1\">\n<tr class=\"ltx_tr\" id=\"S2.T2.7.8.1.4.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.7.8.1.4.1.1.1\"><span class=\"ltx_text\" id=\"S2.T2.7.8.1.4.1.1.1.1\" style=\"font-size:90%;\">Systems</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.7.8.1.4.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.7.8.1.4.1.2.1\"><span class=\"ltx_text\" id=\"S2.T2.7.8.1.4.1.2.1.1\" style=\"font-size:90%;\">Fit by</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.7.8.1.4.1.3\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.7.8.1.4.1.3.1\"><span class=\"ltx_text\" id=\"S2.T2.7.8.1.4.1.3.1.1\" style=\"font-size:90%;\">the 
Model</span></td>\n</tr>\n</table>\n</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S2.T2.2.2\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S2.T2.2.2.3\" rowspan=\"2\"><span class=\"ltx_text\" id=\"S2.T2.2.2.3.1\" style=\"font-size:90%;\">\n<span class=\"ltx_tabular ltx_align_middle\" id=\"S2.T2.2.2.3.1.1\">\n<span class=\"ltx_tr\" id=\"S2.T2.2.2.3.1.1.1\">\n<span class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.2.2.3.1.1.1.1\">Near-</span></span>\n<span class=\"ltx_tr\" id=\"S2.T2.2.2.3.1.1.2\">\n<span class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.2.2.3.1.1.2.1\">linear</span></span>\n<span class=\"ltx_tr\" id=\"S2.T2.2.2.3.1.1.3\">\n<span class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.2.2.3.1.1.3.1\">model</span></span>\n</span></span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S2.T2.2.2.4\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S2.T2.2.2.4.1\">\n<tr class=\"ltx_tr\" id=\"S2.T2.2.2.4.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.2.2.4.1.1.1\"><span class=\"ltx_text\" id=\"S2.T2.2.2.4.1.1.1.1\" style=\"font-size:90%;\">DD</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.2.2.4.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.2.2.4.1.2.1\"><span class=\"ltx_text\" id=\"S2.T2.2.2.4.1.2.1.1\" style=\"font-size:90%;\">(one</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.2.2.4.1.3\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.2.2.4.1.3.1\"><span class=\"ltx_text\" id=\"S2.T2.2.2.4.1.3.1.1\" style=\"font-size:90%;\">layer)</span></td>\n</tr>\n</table>\n</td>\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S2.T2.2.2.2\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S2.T2.2.2.2.2\">\n<tr class=\"ltx_tr\" id=\"S2.T2.1.1.1.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S2.T2.1.1.1.1.1.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.2.2.2.2.2\">\n<td class=\"ltx_td 
ltx_nopad_r ltx_align_left\" id=\"S2.T2.2.2.2.2.2.1\"></td>\n</tr>\n</table>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S2.T2.2.2.5\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S2.T2.2.2.5.1\">\n<tr class=\"ltx_tr\" id=\"S2.T2.2.2.5.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.2.2.5.1.1.1\"><span class=\"ltx_text\" id=\"S2.T2.2.2.5.1.1.1.1\" style=\"font-size:90%;\">Second-</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.2.2.5.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.2.2.5.1.2.1\"><span class=\"ltx_text\" id=\"S2.T2.2.2.5.1.2.1.1\" style=\"font-size:90%;\">order</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.2.2.5.1.3\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.2.2.5.1.3.1\"><span class=\"ltx_text\" id=\"S2.T2.2.2.5.1.3.1.1\" style=\"font-size:90%;\">system</span></td>\n</tr>\n</table>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.3.3\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S2.T2.3.3.2\"><span class=\"ltx_text\" id=\"S2.T2.3.3.2.1\" style=\"font-size:90%;\">LN</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S2.T2.3.3.1\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S2.T2.3.3.1.1\">\n<tr class=\"ltx_tr\" id=\"S2.T2.3.3.1.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S2.T2.3.3.1.1.1.1\"></td>\n</tr>\n</table>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S2.T2.3.3.3\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S2.T2.3.3.3.1\">\n<tr class=\"ltx_tr\" id=\"S2.T2.3.3.3.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.3.3.3.1.1.1\"><span class=\"ltx_text\" id=\"S2.T2.3.3.3.1.1.1.1\" style=\"font-size:90%;\">First-</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.3.3.3.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.3.3.3.1.2.1\"><span class=\"ltx_text\" id=\"S2.T2.3.3.3.1.2.1.1\" 
style=\"font-size:90%;\">order</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.3.3.3.1.3\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.3.3.3.1.3.1\"><span class=\"ltx_text\" id=\"S2.T2.3.3.3.1.3.1.1\" style=\"font-size:90%;\">system</span></td>\n</tr>\n</table>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.4.4\">\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_t\" id=\"S2.T2.4.4.2\" rowspan=\"2\"><span class=\"ltx_text\" id=\"S2.T2.4.4.2.1\" style=\"font-size:90%;\">\n<span class=\"ltx_tabular ltx_align_middle\" id=\"S2.T2.4.4.2.1.1\">\n<span class=\"ltx_tr\" id=\"S2.T2.4.4.2.1.1.1\">\n<span class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.4.4.2.1.1.1.1\">Nonlinear</span></span>\n<span class=\"ltx_tr\" id=\"S2.T2.4.4.2.1.1.2\">\n<span class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.4.4.2.1.1.2.1\">complex</span></span>\n<span class=\"ltx_tr\" id=\"S2.T2.4.4.2.1.1.3\">\n<span class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.4.4.2.1.1.3.1\">model</span></span>\n</span></span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S2.T2.4.4.3\"><span class=\"ltx_text\" id=\"S2.T2.4.4.3.1\" style=\"font-size:90%;\">MLP</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S2.T2.4.4.1\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S2.T2.4.4.1.1\">\n<tr class=\"ltx_tr\" id=\"S2.T2.4.4.1.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S2.T2.4.4.1.1.1.1\"></td>\n</tr>\n</table>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S2.T2.4.4.4\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S2.T2.4.4.4.1\">\n<tr class=\"ltx_tr\" id=\"S2.T2.4.4.4.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.4.4.4.1.1.1\"><span class=\"ltx_text\" id=\"S2.T2.4.4.4.1.1.1.1\" style=\"font-size:90%;\">High-</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.4.4.4.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.4.4.4.1.2.1\"><span 
class=\"ltx_text\" id=\"S2.T2.4.4.4.1.2.1.1\" style=\"font-size:90%;\">order</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.4.4.4.1.3\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.4.4.4.1.3.1\"><span class=\"ltx_text\" id=\"S2.T2.4.4.4.1.3.1.1\" style=\"font-size:90%;\">system</span></td>\n</tr>\n</table>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.7.7\">\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_t\" id=\"S2.T2.7.7.4\"><span class=\"ltx_text\" id=\"S2.T2.7.7.4.1\" style=\"font-size:90%;\">CNN</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_bb ltx_border_t\" id=\"S2.T2.7.7.3\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S2.T2.7.7.3.3\">\n<tr class=\"ltx_tr\" id=\"S2.T2.5.5.1.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S2.T2.5.5.1.1.1.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.6.6.2.2.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S2.T2.6.6.2.2.2.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.7.7.3.3.3\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S2.T2.7.7.3.3.3.1\"></td>\n</tr>\n</table>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_t\" id=\"S2.T2.7.7.5\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S2.T2.7.7.5.1\">\n<tr class=\"ltx_tr\" id=\"S2.T2.7.7.5.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.7.7.5.1.1.1\"><span class=\"ltx_text\" id=\"S2.T2.7.7.5.1.1.1.1\" style=\"font-size:90%;\">High-</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.7.7.5.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.7.7.5.1.2.1\"><span class=\"ltx_text\" id=\"S2.T2.7.7.5.1.2.1.1\" style=\"font-size:90%;\">order</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.7.7.5.1.3\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.7.7.5.1.3.1\"><span class=\"ltx_text\" id=\"S2.T2.7.7.5.1.3.1.1\" 
style=\"font-size:90%;\">system</span></td>\n</tr>\n</table>\n</td>\n</tr>\n</tbody>\n</table>\n</figure>",
98
+ "perturb_sentence_id": 2,
99
+ "output": {
100
+ "perturbed_statement": "[paragraph id = 2] The specific neural network diagram is shown in Figure 5 and Table 2.",
101
+ "perturbed_explanation": "The original explanation: Figures and tables are commonly referenced to provide detailed information about specific components or data. By replacing 'Figure 6' with 'Figure 5', it incorrectly states the source location of the neural network diagram, potentially leading to confusion when attempting to locate the correct reference."
102
+ }
103
+ },
104
+ {
105
+ "path": "table_paper/2407.00014v2.json",
106
+ "table_id": "2",
107
+ "section": "2.4.1",
108
+ "all_context": [
109
+ "DD is a new type of white-box neural network inspired by the dendritic structure of the brain.",
110
+ "Our model uses an improved DD, which introduces some special residual connections and contains one layer of DD modules [34 , 35 ].",
111
+ "By adjusting the number of DD modules, the logical expression ability of the algorithm and the order of its fitting system can be effectively adjusted [36 , 37 , 38 ].",
112
+ "Its model is capable of fitting up to a second-order system.",
113
+ "Its formula can be seen in the Table 2 , and the one-layer DD model is capable of fitting up to a second-order system.",
114
+ "Its excellent generalization ability and low computational complexity are the main reasons for our choice.",
115
+ "Detailed structure can be seen in Figure 6 .",
116
+ "DD (one layer).",
117
+ "While the LN represents a network composed entirely of fully connected layers, and the fully connected layers do not have biases.",
118
+ ""
119
+ ],
120
+ "target_context_ids": [
121
+ 3,
122
+ 4
123
+ ],
124
+ "selected_paragraphs": [
125
+ "[paragraph id = 3] Its model is capable of fitting up to a second-order system.",
126
+ "[paragraph id = 4] Its formula can be seen in the Table 2 , and the one-layer DD model is capable of fitting up to a second-order system."
127
+ ],
128
+ "table_html": "<figure class=\"ltx_table\" id=\"S2.T2\">\n<figcaption class=\"ltx_caption ltx_centering\" style=\"font-size:90%;\"><span class=\"ltx_tag ltx_tag_table\">Table 2: </span><span class=\"ltx_text\" id=\"S2.T2.11.1\" style=\"font-size:89%;\">Models and Their Order</span></figcaption>\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S2.T2.7\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S2.T2.7.8.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt ltx_border_tt ltx_border_t\" id=\"S2.T2.7.8.1.1\"><span class=\"ltx_text\" id=\"S2.T2.7.8.1.1.1\" style=\"font-size:90%;\">Type</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt ltx_border_tt ltx_border_t\" id=\"S2.T2.7.8.1.2\"><span class=\"ltx_text\" id=\"S2.T2.7.8.1.2.1\" style=\"font-size:90%;\">Models</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt ltx_border_tt ltx_border_t\" id=\"S2.T2.7.8.1.3\"><span class=\"ltx_text\" id=\"S2.T2.7.8.1.3.1\" style=\"font-size:90%;\">Core Formulas</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt ltx_border_tt ltx_border_t\" id=\"S2.T2.7.8.1.4\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S2.T2.7.8.1.4.1\">\n<tr class=\"ltx_tr\" id=\"S2.T2.7.8.1.4.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.7.8.1.4.1.1.1\"><span class=\"ltx_text\" id=\"S2.T2.7.8.1.4.1.1.1.1\" style=\"font-size:90%;\">Systems</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.7.8.1.4.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.7.8.1.4.1.2.1\"><span class=\"ltx_text\" id=\"S2.T2.7.8.1.4.1.2.1.1\" style=\"font-size:90%;\">Fit by</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.7.8.1.4.1.3\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.7.8.1.4.1.3.1\"><span class=\"ltx_text\" id=\"S2.T2.7.8.1.4.1.3.1.1\" style=\"font-size:90%;\">the 
Model</span></td>\n</tr>\n</table>\n</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S2.T2.2.2\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S2.T2.2.2.3\" rowspan=\"2\"><span class=\"ltx_text\" id=\"S2.T2.2.2.3.1\" style=\"font-size:90%;\">\n<span class=\"ltx_tabular ltx_align_middle\" id=\"S2.T2.2.2.3.1.1\">\n<span class=\"ltx_tr\" id=\"S2.T2.2.2.3.1.1.1\">\n<span class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.2.2.3.1.1.1.1\">Near-</span></span>\n<span class=\"ltx_tr\" id=\"S2.T2.2.2.3.1.1.2\">\n<span class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.2.2.3.1.1.2.1\">linear</span></span>\n<span class=\"ltx_tr\" id=\"S2.T2.2.2.3.1.1.3\">\n<span class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.2.2.3.1.1.3.1\">model</span></span>\n</span></span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S2.T2.2.2.4\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S2.T2.2.2.4.1\">\n<tr class=\"ltx_tr\" id=\"S2.T2.2.2.4.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.2.2.4.1.1.1\"><span class=\"ltx_text\" id=\"S2.T2.2.2.4.1.1.1.1\" style=\"font-size:90%;\">DD</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.2.2.4.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.2.2.4.1.2.1\"><span class=\"ltx_text\" id=\"S2.T2.2.2.4.1.2.1.1\" style=\"font-size:90%;\">(one</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.2.2.4.1.3\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.2.2.4.1.3.1\"><span class=\"ltx_text\" id=\"S2.T2.2.2.4.1.3.1.1\" style=\"font-size:90%;\">layer)</span></td>\n</tr>\n</table>\n</td>\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S2.T2.2.2.2\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S2.T2.2.2.2.2\">\n<tr class=\"ltx_tr\" id=\"S2.T2.1.1.1.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S2.T2.1.1.1.1.1.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.2.2.2.2.2\">\n<td class=\"ltx_td 
ltx_nopad_r ltx_align_left\" id=\"S2.T2.2.2.2.2.2.1\"></td>\n</tr>\n</table>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S2.T2.2.2.5\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S2.T2.2.2.5.1\">\n<tr class=\"ltx_tr\" id=\"S2.T2.2.2.5.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.2.2.5.1.1.1\"><span class=\"ltx_text\" id=\"S2.T2.2.2.5.1.1.1.1\" style=\"font-size:90%;\">Second-</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.2.2.5.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.2.2.5.1.2.1\"><span class=\"ltx_text\" id=\"S2.T2.2.2.5.1.2.1.1\" style=\"font-size:90%;\">order</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.2.2.5.1.3\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.2.2.5.1.3.1\"><span class=\"ltx_text\" id=\"S2.T2.2.2.5.1.3.1.1\" style=\"font-size:90%;\">system</span></td>\n</tr>\n</table>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.3.3\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S2.T2.3.3.2\"><span class=\"ltx_text\" id=\"S2.T2.3.3.2.1\" style=\"font-size:90%;\">LN</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S2.T2.3.3.1\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S2.T2.3.3.1.1\">\n<tr class=\"ltx_tr\" id=\"S2.T2.3.3.1.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S2.T2.3.3.1.1.1.1\"></td>\n</tr>\n</table>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S2.T2.3.3.3\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S2.T2.3.3.3.1\">\n<tr class=\"ltx_tr\" id=\"S2.T2.3.3.3.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.3.3.3.1.1.1\"><span class=\"ltx_text\" id=\"S2.T2.3.3.3.1.1.1.1\" style=\"font-size:90%;\">First-</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.3.3.3.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.3.3.3.1.2.1\"><span class=\"ltx_text\" id=\"S2.T2.3.3.3.1.2.1.1\" 
style=\"font-size:90%;\">order</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.3.3.3.1.3\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.3.3.3.1.3.1\"><span class=\"ltx_text\" id=\"S2.T2.3.3.3.1.3.1.1\" style=\"font-size:90%;\">system</span></td>\n</tr>\n</table>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.4.4\">\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_t\" id=\"S2.T2.4.4.2\" rowspan=\"2\"><span class=\"ltx_text\" id=\"S2.T2.4.4.2.1\" style=\"font-size:90%;\">\n<span class=\"ltx_tabular ltx_align_middle\" id=\"S2.T2.4.4.2.1.1\">\n<span class=\"ltx_tr\" id=\"S2.T2.4.4.2.1.1.1\">\n<span class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.4.4.2.1.1.1.1\">Nonlinear</span></span>\n<span class=\"ltx_tr\" id=\"S2.T2.4.4.2.1.1.2\">\n<span class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.4.4.2.1.1.2.1\">complex</span></span>\n<span class=\"ltx_tr\" id=\"S2.T2.4.4.2.1.1.3\">\n<span class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.4.4.2.1.1.3.1\">model</span></span>\n</span></span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S2.T2.4.4.3\"><span class=\"ltx_text\" id=\"S2.T2.4.4.3.1\" style=\"font-size:90%;\">MLP</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S2.T2.4.4.1\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S2.T2.4.4.1.1\">\n<tr class=\"ltx_tr\" id=\"S2.T2.4.4.1.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S2.T2.4.4.1.1.1.1\"></td>\n</tr>\n</table>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S2.T2.4.4.4\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S2.T2.4.4.4.1\">\n<tr class=\"ltx_tr\" id=\"S2.T2.4.4.4.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.4.4.4.1.1.1\"><span class=\"ltx_text\" id=\"S2.T2.4.4.4.1.1.1.1\" style=\"font-size:90%;\">High-</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.4.4.4.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.4.4.4.1.2.1\"><span 
class=\"ltx_text\" id=\"S2.T2.4.4.4.1.2.1.1\" style=\"font-size:90%;\">order</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.4.4.4.1.3\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.4.4.4.1.3.1\"><span class=\"ltx_text\" id=\"S2.T2.4.4.4.1.3.1.1\" style=\"font-size:90%;\">system</span></td>\n</tr>\n</table>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.7.7\">\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_t\" id=\"S2.T2.7.7.4\"><span class=\"ltx_text\" id=\"S2.T2.7.7.4.1\" style=\"font-size:90%;\">CNN</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_bb ltx_border_t\" id=\"S2.T2.7.7.3\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S2.T2.7.7.3.3\">\n<tr class=\"ltx_tr\" id=\"S2.T2.5.5.1.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S2.T2.5.5.1.1.1.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.6.6.2.2.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S2.T2.6.6.2.2.2.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.7.7.3.3.3\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S2.T2.7.7.3.3.3.1\"></td>\n</tr>\n</table>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_t\" id=\"S2.T2.7.7.5\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S2.T2.7.7.5.1\">\n<tr class=\"ltx_tr\" id=\"S2.T2.7.7.5.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.7.7.5.1.1.1\"><span class=\"ltx_text\" id=\"S2.T2.7.7.5.1.1.1.1\" style=\"font-size:90%;\">High-</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.7.7.5.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.7.7.5.1.2.1\"><span class=\"ltx_text\" id=\"S2.T2.7.7.5.1.2.1.1\" style=\"font-size:90%;\">order</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.7.7.5.1.3\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S2.T2.7.7.5.1.3.1\"><span class=\"ltx_text\" id=\"S2.T2.7.7.5.1.3.1.1\" 
style=\"font-size:90%;\">system</span></td>\n</tr>\n</table>\n</td>\n</tr>\n</tbody>\n</table>\n</figure>",
129
+ "perturb_sentence_id": 4,
130
+ "output": {
131
+ "perturbed_statement": "[paragraph id = 4] Its formula can be seen in the Table 2 , and the one-layer DD model is capable of fitting up to a third-order system. Its excellent generalization ability and low computational complexity are the main reasons for our choice.",
132
+ "perturbed_explanation": "The original statement emphasizes the model’s ability to fit up to a second-order system. The statement is incorrect because it claims the model can fit a third-order system, which contradicts its specified capabilities of fitting only up to a second-order system. Thus, this statement misrepresents the model's accurate functionality."
133
+ }
134
+ },
135
+ {
136
+ "path": "table_paper/2407.00014v2.json",
137
+ "table_id": "3",
138
+ "section": "3.1.1",
139
+ "all_context": [
140
+ "In our model, the output value represents finger force labels exerted, and the positive and negative represent the force direction, that is, whether the finger force is flexion or extension.",
141
+ "We obtain the corresponding five sets of outputs (L1, L2, L3, L4, L5 for five fingers) of the test set for each subject, and merge them to obtain all the outputs of the corresponding five fingers across all subjects.",
142
+ "The output value 0 of the model is the threshold for distinguishing the direction of finger force, so 0 is used as the threshold for accuracy calculation.",
143
+ "We trained four different machine learning models (DD, LN, MLP and CNN) on the unscaled dataset and evaluated their performance in classifying finger force direction.",
144
+ "To verify the model's performance to learn and decode sEMG information accurately, we conducted offline analysis using the Area Under the Curve (AUC) metric.",
145
+ "After statistical testing, the analysis shown in Table 3 demonstrates the good performance of the model in the prediction of finger force direction.",
146
+ "The AUC values of these models are all over 0.9, very close to 1.",
147
+ "It is proved that the output of the models constructed by DD, LN, MLP and CNN can well estimate the direction of finger force.",
148
+ ""
149
+ ],
150
+ "target_context_ids": [
151
+ 4,
152
+ 5,
153
+ 6,
154
+ 7
155
+ ],
156
+ "selected_paragraphs": [
157
+ "[paragraph id = 4] To verify the model's performance to learn and decode sEMG information accurately, we conducted offline analysis using the Area Under the Curve (AUC) metric.",
158
+ "[paragraph id = 5] After statistical testing, the analysis shown in Table 3 demonstrates the good performance of the model in the prediction of finger force direction.",
159
+ "[paragraph id = 6] The AUC values of these models are all over 0.9, very close to 1.",
160
+ "[paragraph id = 7] It is proved that the output of the models constructed by DD, LN, MLP and CNN can well estimate the direction of finger force."
161
+ ],
162
+ "table_html": "<figure class=\"ltx_table\" id=\"S3.T3\">\n<figcaption class=\"ltx_caption ltx_centering\" style=\"font-size:90%;\"><span class=\"ltx_tag ltx_tag_table\">Table 3: </span><span class=\"ltx_text\" id=\"S3.T3.4.1\" style=\"font-size:89%;\">Offline Analyses Results</span></figcaption>\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S3.T3.5\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S3.T3.5.1.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt ltx_border_tt ltx_border_t\" id=\"S3.T3.5.1.1.1\"><span class=\"ltx_text\" id=\"S3.T3.5.1.1.1.1\" style=\"font-size:90%;\">Output</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt ltx_border_tt ltx_border_t\" id=\"S3.T3.5.1.1.2\"><span class=\"ltx_text\" id=\"S3.T3.5.1.1.2.1\" style=\"font-size:90%;\">Method</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt ltx_border_tt ltx_border_t\" id=\"S3.T3.5.1.1.3\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S3.T3.5.1.1.3.1\">\n<tr class=\"ltx_tr\" id=\"S3.T3.5.1.1.3.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S3.T3.5.1.1.3.1.1.1\"><span class=\"ltx_text\" id=\"S3.T3.5.1.1.3.1.1.1.1\" style=\"font-size:90%;\">Area Under the</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.5.1.1.3.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S3.T3.5.1.1.3.1.2.1\"><span class=\"ltx_text\" id=\"S3.T3.5.1.1.3.1.2.1.1\" style=\"font-size:90%;\">Curve (AUC)</span></td>\n</tr>\n</table>\n</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt ltx_border_tt ltx_border_t\" id=\"S3.T3.5.1.1.4\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S3.T3.5.1.1.4.1\">\n<tr class=\"ltx_tr\" id=\"S3.T3.5.1.1.4.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S3.T3.5.1.1.4.1.1.1\"><span class=\"ltx_text\" id=\"S3.T3.5.1.1.4.1.1.1.1\" 
style=\"font-size:90%;\">Standard</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.5.1.1.4.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S3.T3.5.1.1.4.1.2.1\"><span class=\"ltx_text\" id=\"S3.T3.5.1.1.4.1.2.1.1\" style=\"font-size:90%;\">Error (SE)</span></td>\n</tr>\n</table>\n</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt ltx_border_tt ltx_border_t\" id=\"S3.T3.5.1.1.5\"><span class=\"ltx_text\" id=\"S3.T3.5.1.1.5.1\" style=\"font-size:90%;\">Accuracy</span></th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S3.T3.5.2.1\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.5.2.1.1\" rowspan=\"4\"><span class=\"ltx_text\" id=\"S3.T3.5.2.1.1.1\" style=\"font-size:90%;\">L1</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.5.2.1.2\"><span class=\"ltx_text\" id=\"S3.T3.5.2.1.2.1\" style=\"font-size:90%;\">DD</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.5.2.1.3\"><span class=\"ltx_text\" id=\"S3.T3.5.2.1.3.1\" style=\"font-size:90%;\">0.977887</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.5.2.1.4\"><span class=\"ltx_text\" id=\"S3.T3.5.2.1.4.1\" style=\"font-size:90%;\">0.000449</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.5.2.1.5\"><span class=\"ltx_text\" id=\"S3.T3.5.2.1.5.1\" style=\"font-size:90%;\">92.22%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.5.3.2\">\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.3.2.1\"><span class=\"ltx_text\" id=\"S3.T3.5.3.2.1.1\" style=\"font-size:90%;\">LN</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.3.2.2\"><span class=\"ltx_text\" id=\"S3.T3.5.3.2.2.1\" style=\"font-size:90%;\">0.929772</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.3.2.3\"><span class=\"ltx_text\" id=\"S3.T3.5.3.2.3.1\" style=\"font-size:90%;\">0.000804</span></td>\n<td class=\"ltx_td ltx_align_center\" 
id=\"S3.T3.5.3.2.4\"><span class=\"ltx_text\" id=\"S3.T3.5.3.2.4.1\" style=\"font-size:90%;\">85.01%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.5.4.3\">\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.4.3.1\"><span class=\"ltx_text\" id=\"S3.T3.5.4.3.1.1\" style=\"font-size:90%;\">MLP</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.4.3.2\"><span class=\"ltx_text\" id=\"S3.T3.5.4.3.2.1\" style=\"font-size:90%;\">0.993835</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.4.3.3\"><span class=\"ltx_text\" id=\"S3.T3.5.4.3.3.1\" style=\"font-size:90%;\">0.000250</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.4.3.4\"><span class=\"ltx_text\" id=\"S3.T3.5.4.3.4.1\" style=\"font-size:90%;\">96.63%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.5.5.4\">\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.5.4.1\"><span class=\"ltx_text\" id=\"S3.T3.5.5.4.1.1\" style=\"font-size:90%;\">CNN</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.5.4.2\"><span class=\"ltx_text\" id=\"S3.T3.5.5.4.2.1\" style=\"font-size:90%;\">0.999411</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.5.4.3\"><span class=\"ltx_text\" id=\"S3.T3.5.5.4.3.1\" style=\"font-size:90%;\">0.000073</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.5.4.4\"><span class=\"ltx_text\" id=\"S3.T3.5.5.4.4.1\" style=\"font-size:90%;\">99.15%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.5.6.5\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.5.6.5.1\" rowspan=\"4\"><span class=\"ltx_text\" id=\"S3.T3.5.6.5.1.1\" style=\"font-size:90%;\">L2</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.5.6.5.2\"><span class=\"ltx_text\" id=\"S3.T3.5.6.5.2.1\" style=\"font-size:90%;\">DD</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.5.6.5.3\"><span class=\"ltx_text\" id=\"S3.T3.5.6.5.3.1\" style=\"font-size:90%;\">0.972789</span></td>\n<td 
class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.5.6.5.4\"><span class=\"ltx_text\" id=\"S3.T3.5.6.5.4.1\" style=\"font-size:90%;\">0.000552</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.5.6.5.5\"><span class=\"ltx_text\" id=\"S3.T3.5.6.5.5.1\" style=\"font-size:90%;\">90.84%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.5.7.6\">\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.7.6.1\"><span class=\"ltx_text\" id=\"S3.T3.5.7.6.1.1\" style=\"font-size:90%;\">LN</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.7.6.2\"><span class=\"ltx_text\" id=\"S3.T3.5.7.6.2.1\" style=\"font-size:90%;\">0.942453</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.7.6.3\"><span class=\"ltx_text\" id=\"S3.T3.5.7.6.3.1\" style=\"font-size:90%;\">0.000798</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.7.6.4\"><span class=\"ltx_text\" id=\"S3.T3.5.7.6.4.1\" style=\"font-size:90%;\">86.07%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.5.8.7\">\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.8.7.1\"><span class=\"ltx_text\" id=\"S3.T3.5.8.7.1.1\" style=\"font-size:90%;\">MLP</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.8.7.2\"><span class=\"ltx_text\" id=\"S3.T3.5.8.7.2.1\" style=\"font-size:90%;\">0.988339</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.8.7.3\"><span class=\"ltx_text\" id=\"S3.T3.5.8.7.3.1\" style=\"font-size:90%;\">0.000382</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.8.7.4\"><span class=\"ltx_text\" id=\"S3.T3.5.8.7.4.1\" style=\"font-size:90%;\">94.50%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.5.9.8\">\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.9.8.1\"><span class=\"ltx_text\" id=\"S3.T3.5.9.8.1.1\" style=\"font-size:90%;\">CNN</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.9.8.2\"><span class=\"ltx_text\" id=\"S3.T3.5.9.8.2.1\" style=\"font-size:90%;\">0.998866</span></td>\n<td 
class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.9.8.3\"><span class=\"ltx_text\" id=\"S3.T3.5.9.8.3.1\" style=\"font-size:90%;\">0.000113</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.9.8.4\"><span class=\"ltx_text\" id=\"S3.T3.5.9.8.4.1\" style=\"font-size:90%;\">98.66%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.5.10.9\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.5.10.9.1\" rowspan=\"4\"><span class=\"ltx_text\" id=\"S3.T3.5.10.9.1.1\" style=\"font-size:90%;\">L3</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.5.10.9.2\"><span class=\"ltx_text\" id=\"S3.T3.5.10.9.2.1\" style=\"font-size:90%;\">DD</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.5.10.9.3\"><span class=\"ltx_text\" id=\"S3.T3.5.10.9.3.1\" style=\"font-size:90%;\">0.982602</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.5.10.9.4\"><span class=\"ltx_text\" id=\"S3.T3.5.10.9.4.1\" style=\"font-size:90%;\">0.000398</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.5.10.9.5\"><span class=\"ltx_text\" id=\"S3.T3.5.10.9.5.1\" style=\"font-size:90%;\">93.79%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.5.11.10\">\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.11.10.1\"><span class=\"ltx_text\" id=\"S3.T3.5.11.10.1.1\" style=\"font-size:90%;\">LN</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.11.10.2\"><span class=\"ltx_text\" id=\"S3.T3.5.11.10.2.1\" style=\"font-size:90%;\">0.968013</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.11.10.3\"><span class=\"ltx_text\" id=\"S3.T3.5.11.10.3.1\" style=\"font-size:90%;\">0.000541</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.11.10.4\"><span class=\"ltx_text\" id=\"S3.T3.5.11.10.4.1\" style=\"font-size:90%;\">91.45%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.5.12.11\">\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.12.11.1\"><span 
class=\"ltx_text\" id=\"S3.T3.5.12.11.1.1\" style=\"font-size:90%;\">MLP</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.12.11.2\"><span class=\"ltx_text\" id=\"S3.T3.5.12.11.2.1\" style=\"font-size:90%;\">0.992689</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.12.11.3\"><span class=\"ltx_text\" id=\"S3.T3.5.12.11.3.1\" style=\"font-size:90%;\">0.000272</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.12.11.4\"><span class=\"ltx_text\" id=\"S3.T3.5.12.11.4.1\" style=\"font-size:90%;\">96.18%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.5.13.12\">\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.13.12.1\"><span class=\"ltx_text\" id=\"S3.T3.5.13.12.1.1\" style=\"font-size:90%;\">CNN</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.13.12.2\"><span class=\"ltx_text\" id=\"S3.T3.5.13.12.2.1\" style=\"font-size:90%;\">0.999116</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.13.12.3\"><span class=\"ltx_text\" id=\"S3.T3.5.13.12.3.1\" style=\"font-size:90%;\">0.000089</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.13.12.4\"><span class=\"ltx_text\" id=\"S3.T3.5.13.12.4.1\" style=\"font-size:90%;\">98.85%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.5.14.13\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.5.14.13.1\" rowspan=\"4\"><span class=\"ltx_text\" id=\"S3.T3.5.14.13.1.1\" style=\"font-size:90%;\">L4</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.5.14.13.2\"><span class=\"ltx_text\" id=\"S3.T3.5.14.13.2.1\" style=\"font-size:90%;\">DD</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.5.14.13.3\"><span class=\"ltx_text\" id=\"S3.T3.5.14.13.3.1\" style=\"font-size:90%;\">0.967460</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.5.14.13.4\"><span class=\"ltx_text\" id=\"S3.T3.5.14.13.4.1\" style=\"font-size:90%;\">0.000506</span></td>\n<td class=\"ltx_td 
ltx_align_center ltx_border_t\" id=\"S3.T3.5.14.13.5\"><span class=\"ltx_text\" id=\"S3.T3.5.14.13.5.1\" style=\"font-size:90%;\">90.94%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.5.15.14\">\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.15.14.1\"><span class=\"ltx_text\" id=\"S3.T3.5.15.14.1.1\" style=\"font-size:90%;\">LN</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.15.14.2\"><span class=\"ltx_text\" id=\"S3.T3.5.15.14.2.1\" style=\"font-size:90%;\">0.919576</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.15.14.3\"><span class=\"ltx_text\" id=\"S3.T3.5.15.14.3.1\" style=\"font-size:90%;\">0.000812</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.15.14.4\"><span class=\"ltx_text\" id=\"S3.T3.5.15.14.4.1\" style=\"font-size:90%;\">84.78%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.5.16.15\">\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.16.15.1\"><span class=\"ltx_text\" id=\"S3.T3.5.16.15.1.1\" style=\"font-size:90%;\">MLP</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.16.15.2\"><span class=\"ltx_text\" id=\"S3.T3.5.16.15.2.1\" style=\"font-size:90%;\">0.989969</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.16.15.3\"><span class=\"ltx_text\" id=\"S3.T3.5.16.15.3.1\" style=\"font-size:90%;\">0.000292</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.16.15.4\"><span class=\"ltx_text\" id=\"S3.T3.5.16.15.4.1\" style=\"font-size:90%;\">95.55%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.5.17.16\">\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.17.16.1\"><span class=\"ltx_text\" id=\"S3.T3.5.17.16.1.1\" style=\"font-size:90%;\">CNN</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.17.16.2\"><span class=\"ltx_text\" id=\"S3.T3.5.17.16.2.1\" style=\"font-size:90%;\">0.999032</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.17.16.3\"><span class=\"ltx_text\" id=\"S3.T3.5.17.16.3.1\" 
style=\"font-size:90%;\">0.000086</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.17.16.4\"><span class=\"ltx_text\" id=\"S3.T3.5.17.16.4.1\" style=\"font-size:90%;\">98.84%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.5.18.17\">\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_t\" id=\"S3.T3.5.18.17.1\" rowspan=\"4\"><span class=\"ltx_text\" id=\"S3.T3.5.18.17.1.1\" style=\"font-size:90%;\">L5</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.5.18.17.2\"><span class=\"ltx_text\" id=\"S3.T3.5.18.17.2.1\" style=\"font-size:90%;\">DD</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.5.18.17.3\"><span class=\"ltx_text\" id=\"S3.T3.5.18.17.3.1\" style=\"font-size:90%;\">0.980862</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.5.18.17.4\"><span class=\"ltx_text\" id=\"S3.T3.5.18.17.4.1\" style=\"font-size:90%;\">0.000529</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.5.18.17.5\"><span class=\"ltx_text\" id=\"S3.T3.5.18.17.5.1\" style=\"font-size:90%;\">93.94%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.5.19.18\">\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.19.18.1\"><span class=\"ltx_text\" id=\"S3.T3.5.19.18.1.1\" style=\"font-size:90%;\">LN</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.19.18.2\"><span class=\"ltx_text\" id=\"S3.T3.5.19.18.2.1\" style=\"font-size:90%;\">0.955773</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.19.18.3\"><span class=\"ltx_text\" id=\"S3.T3.5.19.18.3.1\" style=\"font-size:90%;\">0.000797</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.19.18.4\"><span class=\"ltx_text\" id=\"S3.T3.5.19.18.4.1\" style=\"font-size:90%;\">90.28%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.5.20.19\">\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.20.19.1\"><span class=\"ltx_text\" id=\"S3.T3.5.20.19.1.1\" 
style=\"font-size:90%;\">MLP</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.20.19.2\"><span class=\"ltx_text\" id=\"S3.T3.5.20.19.2.1\" style=\"font-size:90%;\">0.992517</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.20.19.3\"><span class=\"ltx_text\" id=\"S3.T3.5.20.19.3.1\" style=\"font-size:90%;\">0.000351</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.5.20.19.4\"><span class=\"ltx_text\" id=\"S3.T3.5.20.19.4.1\" style=\"font-size:90%;\">96.65%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.5.21.20\">\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T3.5.21.20.1\"><span class=\"ltx_text\" id=\"S3.T3.5.21.20.1.1\" style=\"font-size:90%;\">CNN</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T3.5.21.20.2\"><span class=\"ltx_text\" id=\"S3.T3.5.21.20.2.1\" style=\"font-size:90%;\">0.998840</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T3.5.21.20.3\"><span class=\"ltx_text\" id=\"S3.T3.5.21.20.3.1\" style=\"font-size:90%;\">0.000131</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T3.5.21.20.4\"><span class=\"ltx_text\" id=\"S3.T3.5.21.20.4.1\" style=\"font-size:90%;\">98.67%</span></td>\n</tr>\n</tbody>\n</table>\n</figure>",
163
+ "perturb_sentence_id": 6,
164
+ "output": {
165
+ "perturbed_statement": "[paragraph id = 6] The AUC values of these models are all below 0.5, far from 1.It is proved that the output of the models constructed by DD, LN, MLP and CNN can well estimate the direction of finger force.",
166
+ "perturbed_explanation": "Original Explanation: The AUC metric ranges from 0 to 1, where values closer to 1 indicate high model performance, specifically in binary classification tasks. It was demonstrated that the models developed performed well, with AUC values exceeding 0.9, showcasing their capability in predicting finger force direction. Corrected Explanation: However, the statement claims that the AUC values are below 0.5, which directly contradicts the observed performance values. Such an alteration misrepresents the actual results and their associated implications."
167
+ }
168
+ },
169
+ {
170
+ "path": "table_paper/2407.00014v2.json",
171
+ "table_id": "4",
172
+ "section": "3.1.2",
173
+ "all_context": [
174
+ "We further investigate the ability of the model to interpolate intermediate force labels values between two extreme points (-1 and 1).",
175
+ "We utilized the previous test datasets.",
176
+ "These datasets were scaled to simulate various muscle strength levels, effectively creating test sets for evaluating model fit across the entire force range.",
177
+ "Figure 10 showcases the interpolation results for a representative subject, while the rest results of 19 subjects are shown in supplementary material.",
178
+ "This analysis allowed us to assess the models capacity to predict the sEMG-force labels relationship across the entire force spectrum.",
179
+ "Precise control of finger force is crucial for the functionality of prosthetic hands.",
180
+ "Previous studies have demonstrated a near-linear relationship between surface electromyography (sEMG) signals and muscle force.",
181
+ "This linear relationship is essential for achieving fingers force precise control, as the force generated by the muscles directly dictates the force exerted by the fingers.",
182
+ "Only can a linear relationship between sEMG and finger force enable accurate control, while non-linear relationships make it challenging to achieve.",
183
+ "Based on Equation 1 , we establish a monotonic and linear relationship between the finger force label and the actual finger force.",
184
+ "This suggests that our approach aims for a monotonic and near-linear relationship between sEMG and finger force labels.",
185
+ "This characteristic serves as a key performance metric, evaluating the model s capability for intermediate interpolation, a critical aspect of smooth and precise prosthetic hands control.",
186
+ "From the interpolation results of all subjects (see Figure 10 ), it can be inferred that the models fitted with DD and LN are nearly linear and monotonic, and they complete the supplement of the intermediate value, while MLP and CNN have difficulty in doing so, and have some typical errors, which are unable to make sEMG achieve linear and monotonic control completely for the fingers force label.",
187
+ "(For example, (c) and (d) in Figure 10 ) We counted all the results of 20 subjects and the results are shown in Table 4 .",
188
+ "Among the 100 fitting results of each machine learning algorithm, DD made 2 errors and LN made 4 errors.",
189
+ "MLP and CNN made 61 errors and 35 errors respectively (see supplementary material).",
190
+ ""
191
+ ],
192
+ "target_context_ids": [
193
+ 12,
194
+ 13,
195
+ 14,
196
+ 15,
197
+ 16
198
+ ],
199
+ "selected_paragraphs": [
200
+ "[paragraph id = 12] From the interpolation results of all subjects (see Figure 10 ), it can be inferred that the models fitted with DD and LN are nearly linear and monotonic, and they complete the supplement of the intermediate value, while MLP and CNN have difficulty in doing so, and have some typical errors, which are unable to make sEMG achieve linear and monotonic control completely for the fingers force label.",
201
+ "[paragraph id = 13] (For example, (c) and (d) in Figure 10 ) We counted all the results of 20 subjects and the results are shown in Table 4 .",
202
+ "[paragraph id = 14] Among the 100 fitting results of each machine learning algorithm, DD made 2 errors and LN made 4 errors.",
203
+ "[paragraph id = 15] MLP and CNN made 61 errors and 35 errors respectively (see supplementary material)."
204
+ ],
205
+ "table_html": "<figure class=\"ltx_table\" id=\"S3.T4\">\n<figcaption class=\"ltx_caption ltx_centering\" style=\"font-size:90%;\"><span class=\"ltx_tag ltx_tag_table\">Table 4: </span><span class=\"ltx_text\" id=\"S3.T4.4.1\" style=\"font-size:89%;\">Statistics Analysis in Fitting Result</span></figcaption>\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S3.T4.5\">\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S3.T4.5.1.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_tt ltx_border_tt ltx_border_t\" id=\"S3.T4.5.1.1.1\"><span class=\"ltx_text\" id=\"S3.T4.5.1.1.1.1\" style=\"font-size:90%;\">Network</span></th>\n<td class=\"ltx_td ltx_align_center ltx_border_tt ltx_border_tt ltx_border_t\" id=\"S3.T4.5.1.1.2\"><span class=\"ltx_text\" id=\"S3.T4.5.1.1.2.1\" style=\"font-size:90%;\">DD</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt ltx_border_tt ltx_border_t\" id=\"S3.T4.5.1.1.3\"><span class=\"ltx_text\" id=\"S3.T4.5.1.1.3.1\" style=\"font-size:90%;\">LN</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt ltx_border_tt ltx_border_t\" id=\"S3.T4.5.1.1.4\"><span class=\"ltx_text\" id=\"S3.T4.5.1.1.4.1\" style=\"font-size:90%;\">MLP</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt ltx_border_tt ltx_border_t\" id=\"S3.T4.5.1.1.5\"><span class=\"ltx_text\" id=\"S3.T4.5.1.1.5.1\" style=\"font-size:90%;\">CNN</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T4.5.2.2\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_t\" id=\"S3.T4.5.2.2.1\"><span class=\"ltx_text\" id=\"S3.T4.5.2.2.1.1\" style=\"font-size:90%;\">Error Times</span></th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T4.5.2.2.2\"><span class=\"ltx_text\" id=\"S3.T4.5.2.2.2.1\" style=\"font-size:90%;\">10</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T4.5.2.2.3\"><span class=\"ltx_text\" id=\"S3.T4.5.2.2.3.1\" 
style=\"font-size:90%;\">8</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T4.5.2.2.4\"><span class=\"ltx_text\" id=\"S3.T4.5.2.2.4.1\" style=\"font-size:90%;\">65</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T4.5.2.2.5\"><span class=\"ltx_text\" id=\"S3.T4.5.2.2.5.1\" style=\"font-size:90%;\">42</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T4.5.3.3\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_bb ltx_border_t\" id=\"S3.T4.5.3.3.1\"><span class=\"ltx_text\" id=\"S3.T4.5.3.3.1.1\" style=\"font-size:90%;\">Correct rate</span></th>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_t\" id=\"S3.T4.5.3.3.2\"><span class=\"ltx_text\" id=\"S3.T4.5.3.3.2.1\" style=\"font-size:90%;\">90%</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_t\" id=\"S3.T4.5.3.3.3\"><span class=\"ltx_text\" id=\"S3.T4.5.3.3.3.1\" style=\"font-size:90%;\">92%</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_t\" id=\"S3.T4.5.3.3.4\"><span class=\"ltx_text\" id=\"S3.T4.5.3.3.4.1\" style=\"font-size:90%;\">35%</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_t\" id=\"S3.T4.5.3.3.5\"><span class=\"ltx_text\" id=\"S3.T4.5.3.3.5.1\" style=\"font-size:90%;\">58%</span></td>\n</tr>\n</tbody>\n</table>\n</figure>",
206
+ "perturb_sentence_id": 13,
207
+ "output": {
208
+ "perturbed_statement": "[paragraph id = 13] (For example, (c) and (d) in Figure 10 ) We counted all the results of 20 subjects and the results are shown in Table 4 .Among the 100 fitting results of each machine learning algorithm, DD made 10 errors and LN made 8 errors.",
209
+ "perturbed_explanation": "Original Explanation: The information states that the DD algorithm made 2 errors while the LN algorithm made 4 errors. 2. However, the statement claims that DD made 10 errors and LN made 8 errors, which contradicts the actual error counts reported. This discrepancy misrepresents the accuracy and performance of the described models."
210
+ }
211
+ }
212
+ ]
table_result/2407.00017v1_output.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "path": "table_paper/2407.00017v1.json",
4
+ "table_id": "1",
5
+ "section": "5",
6
+ "all_context": [
7
+ "To convert between CityJSON and CityJSONSeq files (and vice-versa), we have developed the open-source software cjseq, which is available at https://github.com/cityjson/cjseq/ under a permissive open-source license.",
8
+ "The command-line program handles the conversion not only of the geometries, but also of the materials, the textures, and the geometry templates that the dataset could contain.",
9
+ "It includes three sub-commands: cat: CityJSON CityJSONSeq; collect: CityJSONSeq CityJSON; filter: to filter city objects in a CityJSONSeq, randomly or based on a bounding box.",
10
+ "It should be observed that the conversion is an efficient process: the rather large dataset Helskinki from Table 1 , which contains more than \\qty77000 buildings and whose CityJSON file is \\qty572\\mega, takes only \\qty4.7sec to be converted to a CityJSONSeq file, and the reverse operation takes \\qty5.7sec (on a standard laptop).",
11
+ ""
12
+ ],
13
+ "target_context_ids": [
14
+ 3
15
+ ],
16
+ "selected_paragraphs": [
17
+ "[paragraph id = 3] It should be observed that the conversion is an efficient process: the rather large dataset Helskinki from Table 1 , which contains more than \\qty77000 buildings and whose CityJSON file is \\qty572\\mega, takes only \\qty4.7sec to be converted to a CityJSONSeq file, and the reverse operation takes \\qty5.7sec (on a standard laptop)."
18
+ ],
19
+ "table_html": "<figure class=\"ltx_table\" id=\"S5.T1\">\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\"><span class=\"ltx_text\" id=\"S5.T1.28.1.1\" style=\"font-size:90%;\">Table 1</span>: </span><span class=\"ltx_text\" id=\"S5.T1.29.2\" style=\"font-size:90%;\">The datasets used for the benchmark. </span></figcaption><div class=\"ltx_flex_figure\">\n<div class=\"ltx_flex_cell ltx_flex_size_1\">\n<table class=\"ltx_tabular ltx_centering ltx_figure_panel ltx_guessed_headers ltx_align_middle\" id=\"S5.T1.26\">\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S5.T1.26.27.1\">\n<td class=\"ltx_td ltx_border_tt\" id=\"S5.T1.26.27.1.1\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<th class=\"ltx_td ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T1.26.27.1.2\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" colspan=\"2\" id=\"S5.T1.26.27.1.3\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T1.26.27.1.3.1\" style=\"font-size:80%;\">dataset</span></th>\n<th class=\"ltx_td ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T1.26.27.1.4\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" colspan=\"3\" id=\"S5.T1.26.27.1.5\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T1.26.27.1.5.1\" style=\"font-size:80%;\">size of file</span></th>\n<th class=\"ltx_td ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T1.26.27.1.6\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" colspan=\"3\" id=\"S5.T1.26.27.1.7\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T1.26.27.1.7.1\" style=\"font-size:80%;\">vertices</span></th>\n</tr>\n<tr class=\"ltx_tr\" 
id=\"S5.T1.4.4\">\n<td class=\"ltx_td\" id=\"S5.T1.4.4.5\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<th class=\"ltx_td ltx_th ltx_th_column\" id=\"S5.T1.4.4.6\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S5.T1.4.4.7\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.4.4.7.1\" style=\"font-size:80%;\">CityObjects</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S5.T1.1.1.1\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_text\" id=\"S5.T1.1.1.1.1\" style=\"font-size:80%;\">app.</span>\n</th>\n<th class=\"ltx_td ltx_th ltx_th_column\" id=\"S5.T1.4.4.8\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S5.T1.4.4.9\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.4.4.9.1\" style=\"font-size:80%;\">CityJSON</span></th>\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_column ltx_border_t\" id=\"S5.T1.4.4.10\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.4.4.10.1\" style=\"font-size:80%;\">CityJSONSeq</span></th>\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_column ltx_border_t\" id=\"S5.T1.2.2.2\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_text\" id=\"S5.T1.2.2.2.1\" style=\"font-size:80%;\">compr.</span>\n</th>\n<th class=\"ltx_td ltx_th ltx_th_column\" id=\"S5.T1.4.4.11\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S5.T1.4.4.12\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.4.4.12.1\" style=\"font-size:80%;\">total</span></th>\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_column ltx_border_t\" id=\"S5.T1.3.3.3\" 
style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_text\" id=\"S5.T1.3.3.3.1\" style=\"font-size:80%;\">largest</span>\n</th>\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_column ltx_border_t\" id=\"S5.T1.4.4.4\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_text\" id=\"S5.T1.4.4.4.1\" style=\"font-size:80%;\">shared</span>\n</th>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T1.6.6\">\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S5.T1.6.6.3\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T1.6.6.3.1\" style=\"font-size:80%;\">3DBAG</span></td>\n<td class=\"ltx_td ltx_border_t\" id=\"S5.T1.6.6.4\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T1.6.6.5\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.6.6.5.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.6.6.5.2\" style=\"font-size:80%;\">1110 bldgs</span>\n</td>\n<td class=\"ltx_td ltx_border_t\" id=\"S5.T1.6.6.6\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_border_t\" id=\"S5.T1.6.6.7\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T1.6.6.8\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.6.6.8.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.6.6.8.2\" style=\"font-size:80%;\">6.7</span><span class=\"ltx_ERROR undefined\" id=\"S5.T1.6.6.8.3\">\\mega</span>\n</td>\n<td class=\"ltx_td ltx_align_right ltx_border_t\" id=\"S5.T1.6.6.9\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.6.6.9.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.6.6.9.2\" style=\"font-size:80%;\">5.9</span><span class=\"ltx_ERROR undefined\" id=\"S5.T1.6.6.9.3\">\\mega</span>\n</td>\n<td class=\"ltx_td 
ltx_align_right ltx_border_t\" id=\"S5.T1.6.6.10\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.6.6.10.1\" style=\"font-size:80%;\">12%</span></td>\n<td class=\"ltx_td ltx_border_t\" id=\"S5.T1.6.6.11\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T1.5.5.1\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_right ltx_border_t\" id=\"S5.T1.6.6.2\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_right ltx_border_t\" id=\"S5.T1.6.6.12\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.6.6.12.1\" style=\"font-size:80%;\">0.1%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T1.8.8\">\n<td class=\"ltx_td ltx_align_left\" id=\"S5.T1.8.8.3\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T1.8.8.3.1\" style=\"font-size:80%;\">3DBV</span></td>\n<td class=\"ltx_td\" id=\"S5.T1.8.8.4\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.8.8.5\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.8.8.5.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.8.8.5.2\" style=\"font-size:80%;\">71634 misc</span>\n</td>\n<td class=\"ltx_td\" id=\"S5.T1.8.8.6\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td\" id=\"S5.T1.8.8.7\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.8.8.8\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.8.8.8.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.8.8.8.2\" style=\"font-size:80%;\">378</span><span class=\"ltx_ERROR undefined\" id=\"S5.T1.8.8.8.3\">\\mega</span>\n</td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.8.8.9\" 
style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.8.8.9.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.8.8.9.2\" style=\"font-size:80%;\">317</span><span class=\"ltx_ERROR undefined\" id=\"S5.T1.8.8.9.3\">\\mega</span>\n</td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.8.8.10\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.8.8.10.1\" style=\"font-size:80%;\">16%</span></td>\n<td class=\"ltx_td\" id=\"S5.T1.8.8.11\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.7.7.1\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.8.8.2\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.8.8.12\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.8.8.12.1\" style=\"font-size:80%;\">21.0%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T1.10.10\">\n<td class=\"ltx_td ltx_align_left\" id=\"S5.T1.10.10.3\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T1.10.10.3.1\" style=\"font-size:80%;\">Helsinki</span></td>\n<td class=\"ltx_td\" id=\"S5.T1.10.10.4\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.10.10.5\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.10.10.5.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.10.10.5.2\" style=\"font-size:80%;\">77231 bldgs</span>\n</td>\n<td class=\"ltx_td\" id=\"S5.T1.10.10.6\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td\" id=\"S5.T1.10.10.7\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.10.10.8\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" 
id=\"S5.T1.10.10.8.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.10.10.8.2\" style=\"font-size:80%;\">572</span><span class=\"ltx_ERROR undefined\" id=\"S5.T1.10.10.8.3\">\\mega</span>\n</td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.10.10.9\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.10.10.9.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.10.10.9.2\" style=\"font-size:80%;\">412</span><span class=\"ltx_ERROR undefined\" id=\"S5.T1.10.10.9.3\">\\mega</span>\n</td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.10.10.10\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.10.10.10.1\" style=\"font-size:80%;\">28%</span></td>\n<td class=\"ltx_td\" id=\"S5.T1.10.10.11\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.9.9.1\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.10.10.2\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.10.10.12\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.10.10.12.1\" style=\"font-size:80%;\">0.0%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T1.12.12\">\n<td class=\"ltx_td ltx_align_left\" id=\"S5.T1.12.12.3\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T1.12.12.3.1\" style=\"font-size:80%;\">Helsinki_tex</span></td>\n<td class=\"ltx_td\" id=\"S5.T1.12.12.4\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.12.12.5\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.12.12.5.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.12.12.5.2\" style=\"font-size:80%;\">77231 bldgs</span>\n</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.12.12.6\" 
style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.12.12.6.1\" style=\"font-size:80%;\">tex</span></td>\n<td class=\"ltx_td\" id=\"S5.T1.12.12.7\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.12.12.8\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.12.12.8.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.12.12.8.2\" style=\"font-size:80%;\">713</span><span class=\"ltx_ERROR undefined\" id=\"S5.T1.12.12.8.3\">\\mega</span>\n</td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.12.12.9\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.12.12.9.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.12.12.9.2\" style=\"font-size:80%;\">644</span><span class=\"ltx_ERROR undefined\" id=\"S5.T1.12.12.9.3\">\\mega</span>\n</td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.12.12.10\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.12.12.10.1\" style=\"font-size:80%;\">10%</span></td>\n<td class=\"ltx_td\" id=\"S5.T1.12.12.11\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.11.11.1\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.12.12.2\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.12.12.12\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.12.12.12.1\" style=\"font-size:80%;\">0.0%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T1.14.14\">\n<td class=\"ltx_td ltx_align_left\" id=\"S5.T1.14.14.3\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T1.14.14.3.1\" style=\"font-size:80%;\">Ingolstadt</span></td>\n<td class=\"ltx_td\" id=\"S5.T1.14.14.4\" 
style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.14.14.5\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.14.14.5.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.14.14.5.2\" style=\"font-size:80%;\">55 bldgs</span>\n</td>\n<td class=\"ltx_td\" id=\"S5.T1.14.14.6\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td\" id=\"S5.T1.14.14.7\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.14.14.8\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.14.14.8.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.14.14.8.2\" style=\"font-size:80%;\">4.8</span><span class=\"ltx_ERROR undefined\" id=\"S5.T1.14.14.8.3\">\\mega</span>\n</td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.14.14.9\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.14.14.9.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.14.14.9.2\" style=\"font-size:80%;\">3.8</span><span class=\"ltx_ERROR undefined\" id=\"S5.T1.14.14.9.3\">\\mega</span>\n</td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.14.14.10\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.14.14.10.1\" style=\"font-size:80%;\">25%</span></td>\n<td class=\"ltx_td\" id=\"S5.T1.14.14.11\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.13.13.1\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.14.14.2\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.14.14.12\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.14.14.12.1\" style=\"font-size:80%;\">0.0%</span></td>\n</tr>\n<tr class=\"ltx_tr\" 
id=\"S5.T1.16.16\">\n<td class=\"ltx_td ltx_align_left\" id=\"S5.T1.16.16.3\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T1.16.16.3.1\" style=\"font-size:80%;\">Montréal</span></td>\n<td class=\"ltx_td\" id=\"S5.T1.16.16.4\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.16.16.5\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.16.16.5.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.16.16.5.2\" style=\"font-size:80%;\">294 bldgs</span>\n</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.16.16.6\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.16.16.6.1\" style=\"font-size:80%;\">tex</span></td>\n<td class=\"ltx_td\" id=\"S5.T1.16.16.7\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.16.16.8\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.16.16.8.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.16.16.8.2\" style=\"font-size:80%;\">5.4</span><span class=\"ltx_ERROR undefined\" id=\"S5.T1.16.16.8.3\">\\mega</span>\n</td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.16.16.9\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.16.16.9.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.16.16.9.2\" style=\"font-size:80%;\">4.6</span><span class=\"ltx_ERROR undefined\" id=\"S5.T1.16.16.9.3\">\\mega</span>\n</td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.16.16.10\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.16.16.10.1\" style=\"font-size:80%;\">15%</span></td>\n<td class=\"ltx_td\" id=\"S5.T1.16.16.11\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.15.15.1\" 
style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.16.16.2\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.16.16.12\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.16.16.12.1\" style=\"font-size:80%;\">2.0%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T1.18.18\">\n<td class=\"ltx_td ltx_align_left\" id=\"S5.T1.18.18.3\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T1.18.18.3.1\" style=\"font-size:80%;\">NYC</span></td>\n<td class=\"ltx_td\" id=\"S5.T1.18.18.4\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.18.18.5\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.18.18.5.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.18.18.5.2\" style=\"font-size:80%;\">23777 bldgs</span>\n</td>\n<td class=\"ltx_td\" id=\"S5.T1.18.18.6\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td\" id=\"S5.T1.18.18.7\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.18.18.8\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.18.18.8.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.18.18.8.2\" style=\"font-size:80%;\">105</span><span class=\"ltx_ERROR undefined\" id=\"S5.T1.18.18.8.3\">\\mega</span>\n</td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.18.18.9\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.18.18.9.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.18.18.9.2\" style=\"font-size:80%;\">95</span><span class=\"ltx_ERROR undefined\" id=\"S5.T1.18.18.9.3\">\\mega</span>\n</td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.18.18.10\" 
style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.18.18.10.1\" style=\"font-size:80%;\">10%</span></td>\n<td class=\"ltx_td\" id=\"S5.T1.18.18.11\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.17.17.1\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.18.18.2\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.18.18.12\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.18.18.12.1\" style=\"font-size:80%;\">0.8%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T1.20.20\">\n<td class=\"ltx_td ltx_align_left\" id=\"S5.T1.20.20.3\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T1.20.20.3.1\" style=\"font-size:80%;\">Railway</span></td>\n<td class=\"ltx_td\" id=\"S5.T1.20.20.4\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.20.20.5\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.20.20.5.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.20.20.5.2\" style=\"font-size:80%;\">50 misc</span>\n</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.20.20.6\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.20.20.6.1\" style=\"font-size:80%;\">tex+mat</span></td>\n<td class=\"ltx_td\" id=\"S5.T1.20.20.7\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.20.20.8\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.20.20.8.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.20.20.8.2\" style=\"font-size:80%;\">4.3</span><span class=\"ltx_ERROR undefined\" id=\"S5.T1.20.20.8.3\">\\mega</span>\n</td>\n<td class=\"ltx_td ltx_align_right\" 
id=\"S5.T1.20.20.9\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.20.20.9.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.20.20.9.2\" style=\"font-size:80%;\">4.0</span><span class=\"ltx_ERROR undefined\" id=\"S5.T1.20.20.9.3\">\\mega</span>\n</td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.20.20.10\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.20.20.10.1\" style=\"font-size:80%;\">8%</span></td>\n<td class=\"ltx_td\" id=\"S5.T1.20.20.11\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.19.19.1\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.20.20.2\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.20.20.12\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.20.20.12.1\" style=\"font-size:80%;\">0.4%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T1.22.22\">\n<td class=\"ltx_td ltx_align_left\" id=\"S5.T1.22.22.3\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T1.22.22.3.1\" style=\"font-size:80%;\">Rotterdam</span></td>\n<td class=\"ltx_td\" id=\"S5.T1.22.22.4\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.22.22.5\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.22.22.5.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.22.22.5.2\" style=\"font-size:80%;\">853 bldgs</span>\n</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.22.22.6\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.22.22.6.1\" style=\"font-size:80%;\">tex</span></td>\n<td class=\"ltx_td\" id=\"S5.T1.22.22.7\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td 
class=\"ltx_td ltx_align_center\" id=\"S5.T1.22.22.8\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.22.22.8.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.22.22.8.2\" style=\"font-size:80%;\">2.6</span><span class=\"ltx_ERROR undefined\" id=\"S5.T1.22.22.8.3\">\\mega</span>\n</td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.22.22.9\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.22.22.9.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.22.22.9.2\" style=\"font-size:80%;\">2.7</span><span class=\"ltx_ERROR undefined\" id=\"S5.T1.22.22.9.3\">\\mega</span>\n</td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.22.22.10\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.22.22.10.1\" style=\"font-size:80%;\">-4%</span></td>\n<td class=\"ltx_td\" id=\"S5.T1.22.22.11\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.21.21.1\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.22.22.2\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.22.22.12\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.22.22.12.1\" style=\"font-size:80%;\">20.0%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T1.24.24\">\n<td class=\"ltx_td ltx_align_left\" id=\"S5.T1.24.24.3\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T1.24.24.3.1\" style=\"font-size:80%;\">Vienna</span></td>\n<td class=\"ltx_td\" id=\"S5.T1.24.24.4\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.24.24.5\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.24.24.5.1\">\\qty</span><span class=\"ltx_text\" 
id=\"S5.T1.24.24.5.2\" style=\"font-size:80%;\">307 bldgs</span>\n</td>\n<td class=\"ltx_td\" id=\"S5.T1.24.24.6\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td\" id=\"S5.T1.24.24.7\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.24.24.8\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.24.24.8.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.24.24.8.2\" style=\"font-size:80%;\">5.4</span><span class=\"ltx_ERROR undefined\" id=\"S5.T1.24.24.8.3\">\\mega</span>\n</td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.24.24.9\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.24.24.9.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.24.24.9.2\" style=\"font-size:80%;\">4.8</span><span class=\"ltx_ERROR undefined\" id=\"S5.T1.24.24.9.3\">\\mega</span>\n</td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.24.24.10\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.24.24.10.1\" style=\"font-size:80%;\">11%</span></td>\n<td class=\"ltx_td\" id=\"S5.T1.24.24.11\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T1.23.23.1\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.24.24.2\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_right\" id=\"S5.T1.24.24.12\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.24.24.12.1\" style=\"font-size:80%;\">0.0%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T1.26.26\">\n<td class=\"ltx_td ltx_align_left ltx_border_bb\" id=\"S5.T1.26.26.3\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T1.26.26.3.1\" style=\"font-size:80%;\">Zürich</span></td>\n<td class=\"ltx_td 
ltx_border_bb\" id=\"S5.T1.26.26.4\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T1.26.26.5\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.26.26.5.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.26.26.5.2\" style=\"font-size:80%;\">52834 bldgs</span>\n</td>\n<td class=\"ltx_td ltx_border_bb\" id=\"S5.T1.26.26.6\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_border_bb\" id=\"S5.T1.26.26.7\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T1.26.26.8\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.26.26.8.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.26.26.8.2\" style=\"font-size:80%;\">279</span><span class=\"ltx_ERROR undefined\" id=\"S5.T1.26.26.8.3\">\\mega</span>\n</td>\n<td class=\"ltx_td ltx_align_right ltx_border_bb\" id=\"S5.T1.26.26.9\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n<span class=\"ltx_ERROR undefined\" id=\"S5.T1.26.26.9.1\">\\qty</span><span class=\"ltx_text\" id=\"S5.T1.26.26.9.2\" style=\"font-size:80%;\">247</span><span class=\"ltx_ERROR undefined\" id=\"S5.T1.26.26.9.3\">\\mega</span>\n</td>\n<td class=\"ltx_td ltx_align_right ltx_border_bb\" id=\"S5.T1.26.26.10\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.26.26.10.1\" style=\"font-size:80%;\">11%</span></td>\n<td class=\"ltx_td ltx_border_bb\" id=\"S5.T1.26.26.11\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T1.25.25.1\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_right ltx_border_bb\" id=\"S5.T1.26.26.2\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_right ltx_border_bb\" id=\"S5.T1.26.26.12\" 
style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S5.T1.26.26.12.1\" style=\"font-size:80%;\">2.6%</span></td>\n</tr>\n</tbody>\n</table>\n</div>\n<div class=\"ltx_flex_break\"></div>\n<div class=\"ltx_flex_cell ltx_flex_size_1\">\n<ul class=\"ltx_itemize ltx_centering ltx_figure_panel\" id=\"S5.I2\">\n<li class=\"ltx_item\" id=\"S5.I2.i1\" style=\"list-style-type:none;\">\n<span class=\"ltx_tag ltx_tag_item\">•</span>\n<div class=\"ltx_para\" id=\"S5.I2.i1.p1\">\n<p class=\"ltx_p\" id=\"S5.I2.i1.p1.1\"><span class=\"ltx_text\" id=\"S5.I2.i1.p1.1.1\" style=\"font-size:80%;\"> appearance: ‘tex’ is textures stored; ‘mat’ is material stored</span></p>\n</div>\n</li>\n<li class=\"ltx_item\" id=\"S5.I2.i2\" style=\"list-style-type:none;\">\n<span class=\"ltx_tag ltx_tag_item\">•</span>\n<div class=\"ltx_para\" id=\"S5.I2.i2.p1\">\n<p class=\"ltx_p\" id=\"S5.I2.i2.p1.2\"><span class=\"ltx_text\" id=\"S5.I2.i2.p1.2.1\" style=\"font-size:80%;\"> compression factor is </span><span class=\"ltx_text\" id=\"S5.I2.i2.p1.2.2\" style=\"font-size:80%;\"></span></p>\n</div>\n</li>\n<li class=\"ltx_item\" id=\"S5.I2.i3\" style=\"list-style-type:none;\">\n<span class=\"ltx_tag ltx_tag_item\">•</span>\n<div class=\"ltx_para\" id=\"S5.I2.i3.p1\">\n<p class=\"ltx_p\" id=\"S5.I2.i3.p1.1\"><span class=\"ltx_text\" id=\"S5.I2.i3.p1.1.1\" style=\"font-size:80%;\"> number of vertices in the largest feature of the stream</span></p>\n</div>\n</li>\n<li class=\"ltx_item\" id=\"S5.I2.i4\" style=\"list-style-type:none;\">\n<span class=\"ltx_tag ltx_tag_item\">•</span>\n<div class=\"ltx_para\" id=\"S5.I2.i4.p1\">\n<p class=\"ltx_p\" id=\"S5.I2.i4.p1.1\"><span class=\"ltx_text\" id=\"S5.I2.i4.p1.1.1\" style=\"font-size:80%;\"> percentage of vertices that are used to represent different city objects</span></p>\n</div>\n</li>\n</ul>\n</div>\n</div>\n</figure>",
20
+ "perturb_sentence_id": 3,
21
+ "output": {
22
+ "perturbed_statement": "[paragraph id = 3] It should be observed that the conversion is an efficient process: the rather large dataset Helsinki from Table 1, which contains more than \\qty67000 buildings and whose CityJSON file is \\qty562\\mega, takes only \\qty14.7sec to be converted to a CityJSONSeq file, and the reverse operation takes \\qty15.7sec (on a standard laptop).",
23
+ "perturbed_explanation": "Original Explanation: The statement highlights the efficiency of the conversion process between CityJSON and CityJSONSeq, noting specific dataset characteristics and timing measurements. Altered Explanation: However, the statement introduces inaccuracies: 1. The dataset 'Helsinki' originally contains over \\qty77000 buildings, not \\qty67000. 2. The size of the CityJSON file is \\qty572\\mega, not \\qty562\\mega. 3. The conversion times are \\qty4.7sec and \\qty5.7sec, not \\qty14.7sec and \\qty15.7sec. These discrepancies lead to an incorrect portrayal of the dataset and its conversion process."
24
+ }
25
+ }
26
+ ]
table_result/2407.00023v2_output.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "path": "table_paper/2407.00023v2.json",
4
+ "table_id": "1",
5
+ "section": "4.4",
6
+ "all_context": [
7
+ "We now provide a detailed analysis of Preble, including an ablation study and global scheduler scalability test.",
8
+ "Because of H100 GPUs high cost and low availability, we run all experiments in this section with A6000 GPUs.",
9
+ "Ablation study.",
10
+ "To understand where the benefits of Preble come from, we evaluate Preble by incrementally adding features presented in Section 3 .",
11
+ "We chose the tool use workload with a Zipf-1.1 popularity distribution among the prompts in the dataset to represent real-life skewed tool popularity.",
12
+ "Other workloads and distributions benefit from a different set of techniques.",
13
+ "We start with using the SGLang round-robin baseline.",
14
+ "We first add the per-request E2 policy (Section 3.2 ), which results in an improvement on both average and p99 request latency because of E2 s dynamic load partitioning.",
15
+ "We then add the post-assignment global rebalancing and autoscaling, which successfully balances out load even more, resulting in further improvement, especially with p99.",
16
+ "Further adding the prefill/decode-aware handling results in more improvement on both average and p99, since it considers the current batch composition and is able to better utilize the GPU resources.",
17
+ "Finally, we add the local-scheduler priority-based wait-queue scheduling (§3.3 ), which, as expected, improves p99 but not average latency, as its goal is fairness.",
18
+ "Global scheduler performance and scalability.",
19
+ "We measure the maximum throughput of Preble s global scheduler by sending a large number of requests (e.g., 50,000) at once to eliminate the effect of request arrival patterns and saturate the scheduler.",
20
+ "Since the global prefix tree search is the most time-consuming task at the global scheduler, we test the Toolbench and VideoQA workloads, which have the most complex and simplest prefix tree structures in our five workloads.",
21
+ "Preble s global scheduler achieves a processing rate of 245 and 2931 requests per second for Toolbench and VideoQA.",
22
+ "We also measure the network processing speed and find it not to be the bottleneck.",
23
+ "With the peak GPU processing rate (30-150 tokens per second decoding speed with Mistral 7B on A100) and our workloads output length (Table 1 ), one Preble global scheduler can sustain at least 70 to 391 concurrent A100 GPUs.",
24
+ "If accounting for prefill time or running bigger models, our scheduler would sustain even more GPUs.",
25
+ ""
26
+ ],
27
+ "target_context_ids": [
28
+ 16
29
+ ],
30
+ "selected_paragraphs": [
31
+ "[paragraph id = 16] With the peak GPU processing rate (30-150 tokens per second decoding speed with Mistral 7B on A100) and our workloads output length (Table 1 ), one Preble global scheduler can sustain at least 70 to 391 concurrent A100 GPUs."
32
+ ],
33
+ "table_html": "<figure class=\"ltx_table\" id=\"A1.T1\">\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"A1.T1.1\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"A1.T1.1.1.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_l ltx_border_r ltx_border_t\" id=\"A1.T1.1.1.1.1\"><span class=\"ltx_text ltx_font_bold\" id=\"A1.T1.1.1.1.1.1\">Workload</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_t\" id=\"A1.T1.1.1.1.2\"><span class=\"ltx_text ltx_font_bold\" id=\"A1.T1.1.1.1.2.1\">Prompt Len</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_t\" id=\"A1.T1.1.1.1.3\"><span class=\"ltx_text ltx_font_bold\" id=\"A1.T1.1.1.1.3.1\">Output Len</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_t\" id=\"A1.T1.1.1.1.4\"><span class=\"ltx_text ltx_font_bold\" id=\"A1.T1.1.1.1.4.1\">Shared Prefix</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_t\" id=\"A1.T1.1.1.1.5\"><span class=\"ltx_text ltx_font_bold\" id=\"A1.T1.1.1.1.5.1\">KeyPort.</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_t\" id=\"A1.T1.1.1.1.6\"><span class=\"ltx_text ltx_font_bold\" id=\"A1.T1.1.1.1.6.1\">Req Share KeyPort.</span></th>\n</tr>\n<tr class=\"ltx_tr\" id=\"A1.T1.1.2.2\">\n<th class=\"ltx_td ltx_th ltx_th_column ltx_border_l ltx_border_r\" id=\"A1.T1.1.2.2.1\"></th>\n<th class=\"ltx_td ltx_th ltx_th_column ltx_border_r\" id=\"A1.T1.1.2.2.2\"></th>\n<th class=\"ltx_td ltx_th ltx_th_column ltx_border_r\" id=\"A1.T1.1.2.2.3\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r\" id=\"A1.T1.1.2.2.4\"><span class=\"ltx_text ltx_font_bold\" id=\"A1.T1.1.2.2.4.1\">in Prompt</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r\" id=\"A1.T1.1.2.2.5\"><span 
class=\"ltx_text ltx_font_bold\" id=\"A1.T1.1.2.2.5.1\">in Prompt</span></th>\n<th class=\"ltx_td ltx_th ltx_th_column ltx_border_r\" id=\"A1.T1.1.2.2.6\"></th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"A1.T1.1.3.1\">\n<td class=\"ltx_td ltx_align_center ltx_border_l ltx_border_r ltx_border_t\" id=\"A1.T1.1.3.1.1\"><span class=\"ltx_text ltx_font_bold\" id=\"A1.T1.1.3.1.1.1\">Toolbench</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"A1.T1.1.3.1.2\">(1835, 742)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"A1.T1.1.3.1.3\">(43, 16)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"A1.T1.1.3.1.4\">(85%, 13%)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"A1.T1.1.3.1.5\">(76%, 16%)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"A1.T1.1.3.1.6\">(39, 64)</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"A1.T1.1.4.2\">\n<td class=\"ltx_td ltx_align_center ltx_border_l ltx_border_r\" id=\"A1.T1.1.4.2.1\"><span class=\"ltx_text ltx_font_bold\" id=\"A1.T1.1.4.2.1.1\">Embodied Agent</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"A1.T1.1.4.2.2\">(2285, 471)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"A1.T1.1.4.2.3\">(16, 13)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"A1.T1.1.4.2.4\">(97%, 14%)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"A1.T1.1.4.2.5\">(76%, 12%)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"A1.T1.1.4.2.6\">(48, 8)</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"A1.T1.1.5.3\">\n<td class=\"ltx_td ltx_align_center ltx_border_l ltx_border_r\" id=\"A1.T1.1.5.3.1\"><span class=\"ltx_text ltx_font_bold\" id=\"A1.T1.1.5.3.1.1\">Programming</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"A1.T1.1.5.3.2\">(3871, 1656)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"A1.T1.1.5.3.3\">(190, 
343)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"A1.T1.1.5.3.4\">(97%, 7.4%)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"A1.T1.1.5.3.5\">(78%, 13%)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"A1.T1.1.5.3.6\">(126, 2157)</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"A1.T1.1.6.4\">\n<td class=\"ltx_td ltx_align_center ltx_border_l ltx_border_r\" id=\"A1.T1.1.6.4.1\"><span class=\"ltx_text ltx_font_bold\" id=\"A1.T1.1.6.4.1.1\">Video QA</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"A1.T1.1.6.4.2\">(9865, 5976)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"A1.T1.1.6.4.3\">(4, 1.5)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"A1.T1.1.6.4.4\">(88%, 32%)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"A1.T1.1.6.4.5\">(99%, 0.2%)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"A1.T1.1.6.4.6\">(8.6, 2)</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"A1.T1.1.7.5\">\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_l ltx_border_r\" id=\"A1.T1.1.7.5.1\"><span class=\"ltx_text ltx_font_bold\" id=\"A1.T1.1.7.5.1.1\">LooGLE</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r\" id=\"A1.T1.1.7.5.2\">(23474, 6105)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r\" id=\"A1.T1.1.7.5.3\">(16, 9.9)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r\" id=\"A1.T1.1.7.5.4\">(91%, 24%)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r\" id=\"A1.T1.1.7.5.5\">(94%, 15%)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r\" id=\"A1.T1.1.7.5.6\">(18, 8.6)</td>\n</tr>\n</tbody>\n</table>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 1: </span><span class=\"ltx_text ltx_font_bold\" id=\"A1.T1.4.1\">LLM Prompt Properties</span> <span class=\"ltx_text ltx_font_italic\" id=\"A1.T1.5.2\">Each cell except for number of requests shows 
(mean, standard deviation). Length represented using number of tokens. “KeyPort.” stands for Key Portion.\n</span></figcaption>\n</figure>",
34
+ "perturb_sentence_id": 16,
35
+ "output": {
36
+ "perturbed_statement": "[paragraph id = 16] With the peak GPU processing rate (25-150 tokens per second decoding speed with Mistral 7B on A100) and our workloads output length (Table 1 ), one Preble global scheduler can sustain at least 70 to 391 concurrent A100 GPUs. If accounting for prefill time or running bigger models, our scheduler would sustain even fewer GPUs.",
37
+ "perturbed_explanation": "The original explanation is as follows: The statement highlights the efficiency of a specific scheduling system. This efficiency is supported by its ability to sustain a substantial number of GPUs simultaneously, especially under varying operational scenarios. Firstly, the processing rate specified (30-150 tokens per second) allows for consistent parallel processing. Secondly, considerations such as prefill time or model complexity augment the scheduler's sustaining capacities. The altered statement introduces an incorrect processing rate range (25-150 tokens per second) and mistakenly claims fewer GPUs could be supported when using larger models. Both these modifications contradict the factual dependencies and capacities described, undermining the representation of the system's capabilities."
38
+ }
39
+ }
40
+ ]
table_result/2407.00025v1_output.json ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "path": "table_paper/2407.00025v1.json",
4
+ "table_id": "1",
5
+ "section": "3.2",
6
+ "all_context": [
7
+ "Although as one of the most popular frameworks for Python programming, Scrapy still has some feasible competitors that are other web crawling frameworks (Khder, 2021 ), such as Nutch (Shafiq and Mehmood, 2020 ) using the Java language.",
8
+ "But compared with those crawling frameworks that are developed by Python, the amount of the crawling frameworks that are developed by other languages is low.",
9
+ "To summary and further analyse the relative web crawling framework for Scrapy, we make a survey and statistics for the top 1,000 web spider frameworks that sorted by the liked starts number in a descending order, and deleted the mistaken searched items from them, the result is shown as Table 1 .",
10
+ "The parameter means the language used to program, the parameter represents the number of projects that is used for actual training.",
11
+ "The parameter represents the number of projects that are designed as a framework.",
12
+ "The parameter represents the number of projects that is designed not as a framework but a relative toolkit or project.",
13
+ "The parameter represents the number of projects that are designed with GUI user operations.",
14
+ "The parameter represents the number of projects that are designed in a distributed or high-concurrency way.",
15
+ "From the survey we can draw the conclusion that Python is the most popular language that is used to design web crawler projects or related projects.",
16
+ "Golang is also used in most of the whole projects, but focuses more on the high-concurrency development, which is based on the characteristics of native concurrency of coroutines (Cox-Buday, 2017 ).",
17
+ "Due to being same as a script language and easy to use, most important, the characteristics that native support the end operation in a browser with the web page source code (Gyimesi et al., 2019 ), Javascript is also used in most of the whole projects, most of these projects are relative project, in other way, means the JavaScript can not support the superior operations very well.",
18
+ "Having the most convenience in programming and design, supporting the files operations and superior data processing well, most importantly, being the native programming language of Scrapy, that is why we selected Python as the programming language and the stady direction of our research.",
19
+ ""
20
+ ],
21
+ "target_context_ids": [
22
+ 2,
23
+ 8,
24
+ 9,
25
+ 10,
26
+ 11
27
+ ],
28
+ "selected_paragraphs": [
29
+ "[paragraph id = 2] To summary and further analyse the relative web crawling framework for Scrapy, we make a survey and statistics for the top 1,000 web spider frameworks that sorted by the liked starts number in a descending order, and deleted the mistaken searched items from them, the result is shown as Table 1 .",
30
+ "[paragraph id = 8] From the survey we can draw the conclusion that Python is the most popular language that is used to design web crawler projects or related projects.",
31
+ "[paragraph id = 9] Golang is also used in most of the whole projects, but focuses more on the high-concurrency development, which is based on the characteristics of native concurrency of coroutines (Cox-Buday, 2017 ).",
32
+ "[paragraph id = 10] Due to being same as a script language and easy to use, most important, the characteristics that native support the end operation in a browser with the web page source code (Gyimesi et al., 2019 ), Javascript is also used in most of the whole projects, most of these projects are relative project, in other way, means the JavaScript can not support the superior operations very well.",
33
+ "[paragraph id = 11] Having the most convenience in programming and design, supporting the files operations and superior data processing well, most importantly, being the native programming language of Scrapy, that is why we selected Python as the programming language and the stady direction of our research."
34
+ ],
35
+ "table_html": "<figure class=\"ltx_table\" id=\"S3.T1\">\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 1. </span>Analysis of GitHub’s top 1,000 star sorting items.</figcaption>\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S3.T1.1\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S3.T1.1.1.1\">\n<th class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_th ltx_th_column ltx_th_row ltx_border_l ltx_border_r ltx_border_t\" id=\"S3.T1.1.1.1.1\" style=\"padding:2.5pt 1.7pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T1.1.1.1.1.1\">language</span></th>\n<th class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_t\" id=\"S3.T1.1.1.1.2\" style=\"padding:2.5pt 1.7pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T1.1.1.1.2.1\">train</span></th>\n<th class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_t\" id=\"S3.T1.1.1.1.3\" style=\"padding:2.5pt 1.7pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T1.1.1.1.3.1\">framework</span></th>\n<th class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_t\" id=\"S3.T1.1.1.1.4\" style=\"padding:2.5pt 1.7pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T1.1.1.1.4.1\">relative</span></th>\n<th class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_t\" id=\"S3.T1.1.1.1.5\" style=\"padding:2.5pt 1.7pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T1.1.1.1.5.1\">graphic</span></th>\n<th class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_t\" id=\"S3.T1.1.1.1.6\" style=\"padding:2.5pt 1.7pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T1.1.1.1.6.1\">concurrency</span></th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S3.T1.1.2.1\">\n<th class=\"ltx_td 
ltx_nopad_l ltx_nopad_r ltx_align_center ltx_th ltx_th_row ltx_border_l ltx_border_r ltx_border_t\" id=\"S3.T1.1.2.1.1\" style=\"padding:2.5pt 1.7pt;\">Python</th>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T1.1.2.1.2\" style=\"padding:2.5pt 1.7pt;\">17</td>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T1.1.2.1.3\" style=\"padding:2.5pt 1.7pt;\">6</td>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T1.1.2.1.4\" style=\"padding:2.5pt 1.7pt;\">30</td>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T1.1.2.1.5\" style=\"padding:2.5pt 1.7pt;\">2</td>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T1.1.2.1.6\" style=\"padding:2.5pt 1.7pt;\">6</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.1.3.2\">\n<th class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_th ltx_th_row ltx_border_l ltx_border_r ltx_border_t\" id=\"S3.T1.1.3.2.1\" style=\"padding:2.5pt 1.7pt;\">Golang</th>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T1.1.3.2.2\" style=\"padding:2.5pt 1.7pt;\">1</td>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T1.1.3.2.3\" style=\"padding:2.5pt 1.7pt;\">8</td>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T1.1.3.2.4\" style=\"padding:2.5pt 1.7pt;\">1</td>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T1.1.3.2.5\" style=\"padding:2.5pt 1.7pt;\">0</td>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T1.1.3.2.6\" style=\"padding:2.5pt 1.7pt;\">3</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.1.4.3\">\n<th class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_th 
ltx_th_row ltx_border_l ltx_border_r ltx_border_t\" id=\"S3.T1.1.4.3.1\" style=\"padding:2.5pt 1.7pt;\">PHP</th>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T1.1.4.3.2\" style=\"padding:2.5pt 1.7pt;\">1</td>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T1.1.4.3.3\" style=\"padding:2.5pt 1.7pt;\">3</td>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T1.1.4.3.4\" style=\"padding:2.5pt 1.7pt;\">4</td>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T1.1.4.3.5\" style=\"padding:2.5pt 1.7pt;\">0</td>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T1.1.4.3.6\" style=\"padding:2.5pt 1.7pt;\">0</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.1.5.4\">\n<th class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_th ltx_th_row ltx_border_l ltx_border_r ltx_border_t\" id=\"S3.T1.1.5.4.1\" style=\"padding:2.5pt 1.7pt;\">Java</th>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T1.1.5.4.2\" style=\"padding:2.5pt 1.7pt;\">0</td>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T1.1.5.4.3\" style=\"padding:2.5pt 1.7pt;\">2</td>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T1.1.5.4.4\" style=\"padding:2.5pt 1.7pt;\">3</td>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T1.1.5.4.5\" style=\"padding:2.5pt 1.7pt;\">1</td>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T1.1.5.4.6\" style=\"padding:2.5pt 1.7pt;\">0</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.1.6.5\">\n<th class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_th ltx_th_row ltx_border_l ltx_border_r ltx_border_t\" 
id=\"S3.T1.1.6.5.1\" style=\"padding:2.5pt 1.7pt;\">JavaScript</th>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T1.1.6.5.2\" style=\"padding:2.5pt 1.7pt;\">0</td>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T1.1.6.5.3\" style=\"padding:2.5pt 1.7pt;\">2</td>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T1.1.6.5.4\" style=\"padding:2.5pt 1.7pt;\">10</td>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T1.1.6.5.5\" style=\"padding:2.5pt 1.7pt;\">1</td>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T1.1.6.5.6\" style=\"padding:2.5pt 1.7pt;\">0</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.1.7.6\">\n<th class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_th ltx_th_row ltx_border_b ltx_border_l ltx_border_r ltx_border_t\" id=\"S3.T1.1.7.6.1\" style=\"padding:2.5pt 1.7pt;\">C#</th>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S3.T1.1.7.6.2\" style=\"padding:2.5pt 1.7pt;\">0</td>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S3.T1.1.7.6.3\" style=\"padding:2.5pt 1.7pt;\">2</td>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S3.T1.1.7.6.4\" style=\"padding:2.5pt 1.7pt;\">0</td>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S3.T1.1.7.6.5\" style=\"padding:2.5pt 1.7pt;\">0</td>\n<td class=\"ltx_td ltx_nopad_l ltx_nopad_r ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S3.T1.1.7.6.6\" style=\"padding:2.5pt 1.7pt;\">0</td>\n</tr>\n</tbody>\n</table>\n</figure>",
36
+ "perturb_sentence_id": 2,
37
+ "output": {
38
+ "perturbed_statement": "[paragraph id = 2] To summary and further analyse the relative web crawling framework for Scrapy, we make a survey and statistics for the top 1,000 web spider frameworks that sorted by the count of monthly active users in a descending order, and deleted the mistaken searched items from them, the result is shown as Table 1 .The parameter means the language used to program, the parameter represents the number of projects that is used for actual training.",
39
+ "perturbed_explanation": "The original paragraph mentions the selection criteria for web spider frameworks being based on their 'liked stars number.' This suggests the study utilized a framework's popularity measure through user endorsements or ratings. However, the statement alters this criterion to 'the count of monthly active users,' which is not indicated as a basis of metric demonstration. This change introduces an inconsistency in the interpretation of study methodology, which undermines the factual representation of the conducted analysis."
40
+ }
41
+ },
42
+ {
43
+ "path": "table_paper/2407.00025v1.json",
44
+ "table_id": "2",
45
+ "section": "5",
46
+ "all_context": [
47
+ "To evaluate the performance efficiency of the processing algorithm of Anywhere, we design the following corresponding test experiments.",
48
+ "Our work is mainly focusing on improving the native Scrapy framework in quickly generating one or multiple Scarpy projects based on specific custom templates in the coding interaction level with the corresponding configuration changing in the meantime.",
49
+ "Therefore, we mainly compared the Anywhere with the normal Scrapy framework in this task.",
50
+ "We use the time of finishing in seconds to evaluate the speed and efficiency of the performances.",
51
+ "Due to the individual difference of the testing user is big for they have different experiences of Scrapy and Anywhere, we make a big value interval in comparison part.",
52
+ "As shown in Table 2 , the count number of the multiple projects is 3.",
53
+ "The value interval of comparison is 50%.",
54
+ "From the result we can see that the framework Anywhere can improve the generation and configuration efficiency of using Scrapy at a good level.",
55
+ ""
56
+ ],
57
+ "target_context_ids": [
58
+ 0,
59
+ 2,
60
+ 3,
61
+ 4,
62
+ 5,
63
+ 6,
64
+ 7
65
+ ],
66
+ "selected_paragraphs": [
67
+ "[paragraph id = 0] To evaluate the performance efficiency of the processing algorithm of Anywhere, we design the following corresponding test experiments.",
68
+ "[paragraph id = 2] Therefore, we mainly compared the Anywhere with the normal Scrapy framework in this task.",
69
+ "[paragraph id = 3] We use the time of finishing in seconds to evaluate the speed and efficiency of the performances.",
70
+ "[paragraph id = 4] Due to the individual difference of the testing user is big for they have different experiences of Scrapy and Anywhere, we make a big value interval in comparison part.",
71
+ "[paragraph id = 5] As shown in Table 2 , the count number of the multiple projects is 3.",
72
+ "[paragraph id = 6] The value interval of comparison is 50%.",
73
+ "[paragraph id = 7] From the result we can see that the framework Anywhere can improve the generation and configuration efficiency of using Scrapy at a good level."
74
+ ],
75
+ "table_html": "<figure class=\"ltx_table\" id=\"S5.T2\">\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 2. </span>Experiments to test the performance of Anywhere.</figcaption>\n<table class=\"ltx_tabular ltx_centering ltx_align_middle\" id=\"S5.T2.1\">\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S5.T2.1.1.1\">\n<td class=\"ltx_td ltx_align_center ltx_border_l ltx_border_r ltx_border_t\" id=\"S5.T2.1.1.1.1\" style=\"padding:2.5pt 2.3pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T2.1.1.1.1.1\">framework</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.1.1.2\" style=\"padding:2.5pt 2.3pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T2.1.1.1.2.1\">task</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.1.1.3\" style=\"padding:2.5pt 2.3pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T2.1.1.1.3.1\">config</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.1.1.4\" style=\"padding:2.5pt 2.3pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T2.1.1.1.4.1\">time /s</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.1.1.5\" style=\"padding:2.5pt 2.3pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T2.1.1.1.5.1\">comparison</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T2.1.2.2\">\n<td class=\"ltx_td ltx_align_center ltx_border_l ltx_border_r ltx_border_t\" id=\"S5.T2.1.2.2.1\" style=\"padding:2.5pt 2.3pt;\">Scrapy</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.2.2.2\" style=\"padding:2.5pt 2.3pt;\">Single Project</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.2.2.3\" style=\"padding:2.5pt 2.3pt;\">No</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.2.2.4\" style=\"padding:2.5pt 2.3pt;\">4-9</td>\n<td class=\"ltx_td ltx_align_center 
ltx_border_r ltx_border_t\" id=\"S5.T2.1.2.2.5\" style=\"padding:2.5pt 2.3pt;\">100%</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T2.1.3.3\">\n<td class=\"ltx_td ltx_align_center ltx_border_l ltx_border_r ltx_border_t\" id=\"S5.T2.1.3.3.1\" style=\"padding:2.5pt 2.3pt;\">Anywhere</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.3.3.2\" style=\"padding:2.5pt 2.3pt;\">Single Project</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.3.3.3\" style=\"padding:2.5pt 2.3pt;\">No</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.3.3.4\" style=\"padding:2.5pt 2.3pt;\">2-5</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.3.3.5\" style=\"padding:2.5pt 2.3pt;\">200%</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T2.1.4.4\">\n<td class=\"ltx_td ltx_align_center ltx_border_l ltx_border_r ltx_border_t\" id=\"S5.T2.1.4.4.1\" style=\"padding:2.5pt 2.3pt;\">Scrapy</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.4.4.2\" style=\"padding:2.5pt 2.3pt;\">Single Project</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.4.4.3\" style=\"padding:2.5pt 2.3pt;\">Yes</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.4.4.4\" style=\"padding:2.5pt 2.3pt;\">6-12</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.4.4.5\" style=\"padding:2.5pt 2.3pt;\">100%</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T2.1.5.5\">\n<td class=\"ltx_td ltx_align_center ltx_border_l ltx_border_r ltx_border_t\" id=\"S5.T2.1.5.5.1\" style=\"padding:2.5pt 2.3pt;\">Anywhere</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.5.5.2\" style=\"padding:2.5pt 2.3pt;\">Single Project</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.5.5.3\" style=\"padding:2.5pt 2.3pt;\">Yes</td>\n<td class=\"ltx_td ltx_align_center 
ltx_border_r ltx_border_t\" id=\"S5.T2.1.5.5.4\" style=\"padding:2.5pt 2.3pt;\">3-6</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.5.5.5\" style=\"padding:2.5pt 2.3pt;\">200%</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T2.1.6.6\">\n<td class=\"ltx_td ltx_align_center ltx_border_l ltx_border_r ltx_border_t\" id=\"S5.T2.1.6.6.1\" style=\"padding:2.5pt 2.3pt;\">Scrapy</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.6.6.2\" style=\"padding:2.5pt 2.3pt;\">Multiple Projects</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.6.6.3\" style=\"padding:2.5pt 2.3pt;\">No</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.6.6.4\" style=\"padding:2.5pt 2.3pt;\">26-30</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.6.6.5\" style=\"padding:2.5pt 2.3pt;\">100%</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T2.1.7.7\">\n<td class=\"ltx_td ltx_align_center ltx_border_l ltx_border_r ltx_border_t\" id=\"S5.T2.1.7.7.1\" style=\"padding:2.5pt 2.3pt;\">Anywhere</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.7.7.2\" style=\"padding:2.5pt 2.3pt;\">Multiple Projects</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.7.7.3\" style=\"padding:2.5pt 2.3pt;\">No</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.7.7.4\" style=\"padding:2.5pt 2.3pt;\">9-10</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.7.7.5\" style=\"padding:2.5pt 2.3pt;\">300%</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T2.1.8.8\">\n<td class=\"ltx_td ltx_align_center ltx_border_l ltx_border_r ltx_border_t\" id=\"S5.T2.1.8.8.1\" style=\"padding:2.5pt 2.3pt;\">Scrapy</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.8.8.2\" style=\"padding:2.5pt 2.3pt;\">Multiple Projects</td>\n<td class=\"ltx_td 
ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.8.8.3\" style=\"padding:2.5pt 2.3pt;\">Yes</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.8.8.4\" style=\"padding:2.5pt 2.3pt;\">29-40</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.1.8.8.5\" style=\"padding:2.5pt 2.3pt;\">100%</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T2.1.9.9\">\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_l ltx_border_r ltx_border_t\" id=\"S5.T2.1.9.9.1\" style=\"padding:2.5pt 2.3pt;\">Anywhere</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S5.T2.1.9.9.2\" style=\"padding:2.5pt 2.3pt;\">Multiple Projects</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S5.T2.1.9.9.3\" style=\"padding:2.5pt 2.3pt;\">Yes</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S5.T2.1.9.9.4\" style=\"padding:2.5pt 2.3pt;\">10-13</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S5.T2.1.9.9.5\" style=\"padding:2.5pt 2.3pt;\">300%</td>\n</tr>\n</tbody>\n</table>\n</figure>",
76
+ "perturb_sentence_id": 5,
77
+ "output": {
78
+ "perturbed_statement": "[paragraph id = 5] As shown in Table 2 , the count number of the multiple projects is 5.The value interval of comparison is 70%.",
79
+ "perturbed_explanation": "1. The original statement provides specific values: a count of projects as 3 and a comparison value interval of 50%. 2. The statement's alteration to claim a count of projects as 5 and a comparison interval of 70% introduces inaccuracies for both figures. Since these altered details do not correspond to the context provided, they are incorrect."
80
+ }
81
+ }
82
+ ]
table_result/2407.00030v1_output.json ADDED
@@ -0,0 +1 @@
 
 
1
+ []
table_result/2407.00035v1_output.json ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "path": "table_paper/2407.00035v1.json",
4
+ "table_id": "1",
5
+ "section": "2",
6
+ "all_context": [
7
+ "This section presents relevant information on Fog Computing and Observability.",
8
+ "Fog Computing - Fog computing was presented in 2012 [7 ] with the objective of providing computing, storage and network services between end devices and cloud providers, complementing resources when it is not possible to meet the requirements with traditional cloud services.",
9
+ "In recent years, the concept of Fog Computing has been improved both by academia [13 , 38 , 52 , 53 ] and industry [25 , 39 ].",
10
+ "However, due to the lack of consensus on its definition in terms of scope, composition devices, architecture, service models, etc., there are some other similar paradigms, such as Edge Computing [13 ], Mobile Edge Computing (MEC) [14 ], and Mist Computing [43 ] that are frequently confused with fog.",
11
+ "In this work we consider Fog Computing as a broader and more complete concept that can be considered as an umbrella that encompasses all other similar paradigms [10 ].",
12
+ "The architecture most used to represent a Fog Computing environment is composed of three layers: IoT layer, Fog Layer, and Cloud Layer, as presented in Figure 1 .",
13
+ "The IoT layer represents the IoT devices connected at the edge of the network by which the end users can request the services to be processed in the above layers.",
14
+ "The Fog Layer is placed between the IoT and Cloud Layers and provides shared resources that IoT applications can use as needed, such as processing and data storage resources, before data are transferred to the Cloud [3 ].",
15
+ "This layer is made up of nodes, commonly called fog nodes, i.e.",
16
+ "any hardware device that has software and hardware resources with high communication capability[4 ].",
17
+ "Finally, the Cloud Layer is composed of cloud providers services, with more robust computational resources to deliver high-order processing and long-term storage.",
18
+ "A Fog Computing environment is characterised by having a more distributed organisation, heterogeneity of physical devices and networks, and connectivity uncertainty, caused by device mobility, network instabilities, and battery exhaustion [26 ].",
19
+ "This scenario is different from a cloud computing environment, supported by homogeneous resource-rich servers, continuous power supply, and stable redundant network connections.",
20
+ "Observability - Observability is a characteristic of systems that provide information about their internal states by means of external output[30 ].",
21
+ "The higher the observability, the easier it will be to understand the current and past behaviours of the system and actuate over it when needed.",
22
+ "Observability Instrumentation Domains - Observability can be instrumented in a system by generating outputs that inform the internal state of the system at specific points in time.",
23
+ "The different data types that compose the output are named Instrumentation Domains.",
24
+ "Each instrumentation domain contributes to the observability of a system, offering a different perspective on the system.",
25
+ "There is a consensus in the literature that the most important instrumentation domains of observability are metrics, logs, and traces [11 , 24 , 32 ].",
26
+ "Some authors also consider other instrumentation domains such as events[32 ], profiles[11 , 24 ] and crash dumps [11 ], although they are not recognised as such nowadays by most researchers.",
27
+ "As time passes, some of those new domains could be standardised and incorporated into the observability s context.",
28
+ "So we can define as the set of instrumentation domains that are outputed by a system.",
29
+ "Having more instrumentation domains available means a higher level of observability.",
30
+ "Thus, observability is directly related to the cardinality of ID ().",
31
+ "For instance, a Fog monitoring solution that manages only metrics has lower observability than one that manages metrics, logs, and traces simultaneously.",
32
+ "It is feasible to connect the instrumentation domains by the time at which each piece of information was generated.",
33
+ "When it is viable to relate two or more of them in the same analysis, more opportunities for actuation arise.",
34
+ "In addition to the independent value of each domain, there is an additional value in the cross-analysis between domains, due to their synergistic interactions [37 ], i.e., when two or more factors act as causes of a particular outcome.",
35
+ "This effect is popularly known as “The whole is more than the sum of its parts”.",
36
+ "So in addition to determining the observability level of a system by the number of its instrumentation domains, we need to also consider the synergistic interactions between them as well.",
37
+ "Synergistic interactions could be modelled as , where is an operator that filters the data from all available instrumentation domains () and returns the subset of each that matches a specific period of time.",
38
+ "Whenever more than one returns a non-empty subset after applying , the system has a potentially higher observability level for that period of time.",
39
+ "This definition shows that to increase the observability of a system it is important not only to collect information from the instrumentation domains and analyse each data set isolatedly.",
40
+ "It is also relevant to be prepared to learn from their interactions and correlate them.",
41
+ "Metrics, Logs, and Traces - There is a consensus in the literature that Metrics, Logs, and Traces are the most important instrumentation domains.",
42
+ "This study will focus on them from now on.",
43
+ "Metrics are more related to the performance of a system.",
44
+ "They are numerical values collected at a point in time and their collection can be characterised as a time series.",
45
+ "In the motivating scenario, the following metrics are available: percentage of CPU usage, speed of a truck in km/h, throughput of the 5G network in Mbps, amount of video data sent to the Cloud in MB, etc.",
46
+ "Logs are unstructured or semi-structured text files that report relevant events and contextual information, and the instrumentation is usually done at the development time.",
47
+ "Using the motivating scenario, there is information in the logs related to the quality of service of the network connection, the geographic coordinates of the truck, etc.",
48
+ "Traces are records of service calls made by the system.",
49
+ "They allow observations of the call sequence delays, from the beginning to the end of a request.",
50
+ "Trace analysis can show which service calls are taking longer in the response time composition of an application.",
51
+ "They can also show requests that do not finish correctly.",
52
+ "In the motivating scenario, the application performance information (upload throughput) is reported aggregated by suburb to the City Council.",
53
+ "This aggregation took a long time to process due to the high volume (2.5 million measurements per week).",
54
+ "After optimising the code using the point-in-polygon approach [28 ] instead of brute force, the time spent on this operation was reduced to 1% of the original time.",
55
+ "Metrics, logs, and traces carry different types of information, as can be seen in Table 1 .",
56
+ "Each of them contributes to increasing the observability of a system, allowing for complimentary actuation.",
57
+ "Metrics deliver objective information about the external interface of a system, e.g., video upload throughput.",
58
+ "Logs usually provide internal information about failure events, such as specific error messages, exception handling messages, and runtime errors.",
59
+ "This information is necessary to speed up root cause analysis, help the maintenance team improve error treatment, and return the system to a healthy state.",
60
+ "Traces provide details about the internal flow of information.",
61
+ "These data can be visualised as a graph and a critical path can be generated from it, allowing scrutiny of the dependency among the components of a distributed system [44 ].",
62
+ "The volume of data depends on the amount of requests and can be bursty.",
63
+ ""
64
+ ],
65
+ "target_context_ids": [
66
+ 36,
67
+ 37,
68
+ 38,
69
+ 39,
70
+ 40,
71
+ 41,
72
+ 42,
73
+ 43,
74
+ 44,
75
+ 45,
76
+ 46,
77
+ 47,
78
+ 48,
79
+ 49,
80
+ 50
81
+ ],
82
+ "selected_paragraphs": [
83
+ "[paragraph id = 36] Metrics are more related to the performance of a system.",
84
+ "[paragraph id = 37] They are numerical values collected at a point in time and their collection can be characterised as a time series.",
85
+ "[paragraph id = 38] In the motivating scenario, the following metrics are available: percentage of CPU usage, speed of a truck in km/h, throughput of the 5G network in Mbps, amount of video data sent to the Cloud in MB, etc.",
86
+ "[paragraph id = 39] Logs are unstructured or semi-structured text files that report relevant events and contextual information, and the instrumentation is usually done at the development time.",
87
+ "[paragraph id = 40] Using the motivating scenario, there is information in the logs related to the quality of service of the network connection, the geographic coordinates of the truck, etc.",
88
+ "[paragraph id = 41] Traces are records of service calls made by the system.",
89
+ "[paragraph id = 42] They allow observations of the call sequence delays, from the beginning to the end of a request.",
90
+ "[paragraph id = 43] Trace analysis can show which service calls are taking longer in the response time composition of an application.",
91
+ "[paragraph id = 44] They can also show requests that do not finish correctly.",
92
+ "[paragraph id = 45] In the motivating scenario, the application performance information (upload throughput) is reported aggregated by suburb to the City Council.",
93
+ "[paragraph id = 46] This aggregation took a long time to process due to the high volume (2.5 million measurements per week).",
94
+ "[paragraph id = 47] After optimising the code using the point-in-polygon approach [28 ] instead of brute force, the time spent on this operation was reduced to 1% of the original time.",
95
+ "[paragraph id = 48] Metrics, logs, and traces carry different types of information, as can be seen in Table 1 .",
96
+ "[paragraph id = 49] Each of them contributes to increasing the observability of a system, allowing for complimentary actuation.",
97
+ "[paragraph id = 50] Metrics deliver objective information about the external interface of a system, e.g., video upload throughput."
98
+ ],
99
+ "table_html": "<figure class=\"ltx_table\" id=\"S2.T1\">\n<figcaption class=\"ltx_caption\"><span class=\"ltx_tag ltx_tag_table\"><span class=\"ltx_text\" id=\"S2.T1.2.1.1\" style=\"font-size:90%;\">Table 1</span>: </span><span class=\"ltx_text\" id=\"S2.T1.3.2\" style=\"font-size:90%;\">The three most important domains of observability differ in their data characteristics.</span></figcaption>\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S2.T1.4\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S2.T1.4.1.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_border_tt\" id=\"S2.T1.4.1.1.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.4.1.1.1.1\" style=\"font-size:80%;\">Domain</span></th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_border_tt\" id=\"S2.T1.4.1.1.2\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.4.1.1.2.1\" style=\"font-size:80%;\">Type</span></th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_border_tt\" id=\"S2.T1.4.1.1.3\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.4.1.1.3.1\" style=\"font-size:80%;\">Query</span></th>\n<th class=\"ltx_td ltx_nopad_r ltx_align_left ltx_th ltx_th_column ltx_border_tt\" id=\"S2.T1.4.1.1.4\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.4.1.1.4.1\" style=\"font-size:80%;\">Storage</span></th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S2.T1.4.2.1\">\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S2.T1.4.2.1.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.2.1.1.1\" style=\"font-size:80%;\">Metric</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S2.T1.4.2.1.2\" 
style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.2.1.2.1\" style=\"font-size:80%;\">Numeric</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S2.T1.4.2.1.3\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.2.1.3.1\" style=\"font-size:80%;\">Aggregations</span></td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_left ltx_border_t\" id=\"S2.T1.4.2.1.4\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.2.1.4.1\" style=\"font-size:80%;\">Time Series Database</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.4.3.2\">\n<td class=\"ltx_td ltx_align_left\" id=\"S2.T1.4.3.2.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.3.2.1.1\" style=\"font-size:80%;\">Log</span></td>\n<td class=\"ltx_td ltx_align_left\" id=\"S2.T1.4.3.2.2\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.3.2.2.1\" style=\"font-size:80%;\">Semi/not structured strings</span></td>\n<td class=\"ltx_td ltx_align_left\" id=\"S2.T1.4.3.2.3\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.3.2.3.1\" style=\"font-size:80%;\">Approximate string search</span></td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S2.T1.4.3.2.4\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.3.2.4.1\" style=\"font-size:80%;\">Inverted Index</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.4.4.3\">\n<td class=\"ltx_td ltx_align_left ltx_border_bb\" id=\"S2.T1.4.4.3.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.4.3.1.1\" style=\"font-size:80%;\">Trace</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_bb\" id=\"S2.T1.4.4.3.2\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.4.3.2.1\" 
style=\"font-size:80%;\">DAGs of duration of execution</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_bb\" id=\"S2.T1.4.4.3.3\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.4.3.3.1\" style=\"font-size:80%;\">Disassociated graph search</span></td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_left ltx_border_bb\" id=\"S2.T1.4.4.3.4\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.4.3.4.1\" style=\"font-size:80%;\">Inverted Index</span></td>\n</tr>\n</tbody>\n</table>\n</figure>",
100
+ "perturb_sentence_id": 47,
101
+ "output": {
102
+ "perturbed_statement": "[paragraph id = 47] After optimising the code using the point-to-point approach [28] instead of brute force, the time spent on this operation was reduced to 1% of the original time. Metrics, logs, and traces carry different types of information, as can be seen in Table 1.",
103
+ "perturbed_explanation": "Original Explanation: The code optimisation through implementing the point-in-polygon approach significantly reduced processing time to 1%.\n2. The statement claims that the point-to-point approach was used for optimisation, but this is incorrect; the optimisation was achieved via the point-in-polygon method, as stated."
104
+ }
105
+ },
106
+ {
107
+ "path": "table_paper/2407.00035v1.json",
108
+ "table_id": "1",
109
+ "section": "4.1",
110
+ "all_context": [
111
+ "To obtain valuable information from each instrumentation domain and to increase the observability of an application running in a Fog environment, it is necessary to be aware of the following six-step Observability Data Life Cycle, depicted in Figure 2 : 1.",
112
+ "Collection; 2.",
113
+ "IoT storage; 3.",
114
+ "Transmission of data to the Fog; 4.",
115
+ "Fog storage; 5.",
116
+ "Data analysis and visualisation; 6.Cloud storage and analysis.",
117
+ "The first three Steps make up the Data Collection phase of the life cycle.",
118
+ "The last three Steps form the Data Analysis phase.",
119
+ "Collection - In the initial Step of the fog observability data life cycle, the data are collected.",
120
+ "This can happen in a multitude of ways depending on the instrumentation domain in place.",
121
+ "Metrics can be acquired from the operating system by means of system calls.",
122
+ "Logs are written according to the specific event flow that was instrumented to be recorded in text.",
123
+ "When previously instrumented, traces can be created by specific API calls that record the sequence and delay of each service call.",
124
+ "IoT Storage - data staging in the device awaiting transmission - Observability data are usually immutable and append-heavy [31 ].",
125
+ "In order to avoid running out of storage resources, a data removal policy should be in place.",
126
+ "The period of time that a device can handle stored observability data will depend on several factors, such as data footprint by period of time, the frequency of generation, and the available storage space reserved for the system.",
127
+ "Although metrics can be stable in terms of data volume, logs and traces have greater variability [32 ].",
128
+ "Data transmission to Fog - Observability may allow timely and proper decision making.",
129
+ "Although it is possible to make some minor decisions locally using a single device, critical decisions are expected to be made using a process that can assess a higher volume of data that came from different subcomponents of the system, granting a more comprehensive view of the system.",
130
+ "Therefore, the data collected from the IoT layer should be transmitted to the Fog Layer, where a resource-richer node will store them and allow for a more comprehensive data analysis.",
131
+ "The network connections used by the application to receive and respond to user requests may be the same as those used by the observability data flow.",
132
+ "An adaptive process may be in place to define the amount of data that can be transferred from the devices, selecting which instrumentation domains will be included in each transmission, and the period of time to which the collected data will refer.",
133
+ "Fog Storage - Specialised pre-processing and storage according to the type of data and usage - Fog nodes are expected to be resource richer compared to IoT devices [4 ].",
134
+ "Due to this, it is on the Fog Layer where observability data from several IoT devices are stored with the aim of rapid actuation and decision-making.",
135
+ "The metrics should be stored in a time series database (TSDB).",
136
+ "However, logs and traces are structured differently and will benefit from other storage solutions, such as inverted index-based storage, due to the type of queries that are usually made to retrieve meaningful information from them [32 ].",
137
+ "Therefore, an observability data ingestion service on the fog should consider the data requirements that each instrumentation domain needs (see Table 1 ), while allowing cross-analysis to be performed.",
138
+ "Data analysis and visualisation for decision making - Once the observability data are available on the Fog, it is possible to query them and make decisions and actuations accordingly.",
139
+ "Observability data tend to give more relevant answers when they are queried as soon as they arrive, which means that most queries and analysis use more recent data (less than 24 hours) [31 ].",
140
+ "Thus, it is important to guarantee fast access to this time window data.",
141
+ "In addition to that, to save resources to continue receiving IoT data, it is important to provide automated mechanisms to send the data out of this range to long-term storage in the Cloud.",
142
+ "Cloud Storage - Long-term storage and historical analysis - Cloud is the appropriate environment to store large data volumes and run heavy data processing models, such as historical analysis of observability data [23 ].",
143
+ ""
144
+ ],
145
+ "target_context_ids": [
146
+ 25
147
+ ],
148
+ "selected_paragraphs": [
149
+ "[paragraph id = 25] However, logs and traces are structured differently and will benefit from other storage solutions, such as inverted index-based storage, due to the type of queries that are usually made to retrieve meaningful information from them [32 ]."
150
+ ],
151
+ "table_html": "<figure class=\"ltx_table\" id=\"S2.T1\">\n<figcaption class=\"ltx_caption\"><span class=\"ltx_tag ltx_tag_table\"><span class=\"ltx_text\" id=\"S2.T1.2.1.1\" style=\"font-size:90%;\">Table 1</span>: </span><span class=\"ltx_text\" id=\"S2.T1.3.2\" style=\"font-size:90%;\">The three most important domains of observability differ in their data characteristics.</span></figcaption>\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S2.T1.4\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S2.T1.4.1.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_border_tt\" id=\"S2.T1.4.1.1.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.4.1.1.1.1\" style=\"font-size:80%;\">Domain</span></th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_border_tt\" id=\"S2.T1.4.1.1.2\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.4.1.1.2.1\" style=\"font-size:80%;\">Type</span></th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_border_tt\" id=\"S2.T1.4.1.1.3\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.4.1.1.3.1\" style=\"font-size:80%;\">Query</span></th>\n<th class=\"ltx_td ltx_nopad_r ltx_align_left ltx_th ltx_th_column ltx_border_tt\" id=\"S2.T1.4.1.1.4\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.4.1.1.4.1\" style=\"font-size:80%;\">Storage</span></th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S2.T1.4.2.1\">\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S2.T1.4.2.1.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.2.1.1.1\" style=\"font-size:80%;\">Metric</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S2.T1.4.2.1.2\" 
style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.2.1.2.1\" style=\"font-size:80%;\">Numeric</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S2.T1.4.2.1.3\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.2.1.3.1\" style=\"font-size:80%;\">Aggregations</span></td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_left ltx_border_t\" id=\"S2.T1.4.2.1.4\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.2.1.4.1\" style=\"font-size:80%;\">Time Series Database</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.4.3.2\">\n<td class=\"ltx_td ltx_align_left\" id=\"S2.T1.4.3.2.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.3.2.1.1\" style=\"font-size:80%;\">Log</span></td>\n<td class=\"ltx_td ltx_align_left\" id=\"S2.T1.4.3.2.2\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.3.2.2.1\" style=\"font-size:80%;\">Semi/not structured strings</span></td>\n<td class=\"ltx_td ltx_align_left\" id=\"S2.T1.4.3.2.3\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.3.2.3.1\" style=\"font-size:80%;\">Approximate string search</span></td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S2.T1.4.3.2.4\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.3.2.4.1\" style=\"font-size:80%;\">Inverted Index</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.4.4.3\">\n<td class=\"ltx_td ltx_align_left ltx_border_bb\" id=\"S2.T1.4.4.3.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.4.3.1.1\" style=\"font-size:80%;\">Trace</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_bb\" id=\"S2.T1.4.4.3.2\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.4.3.2.1\" 
style=\"font-size:80%;\">DAGs of duration of execution</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_bb\" id=\"S2.T1.4.4.3.3\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.4.3.3.1\" style=\"font-size:80%;\">Disassociated graph search</span></td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_left ltx_border_bb\" id=\"S2.T1.4.4.3.4\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.4.3.4.1\" style=\"font-size:80%;\">Inverted Index</span></td>\n</tr>\n</tbody>\n</table>\n</figure>",
152
+ "perturb_sentence_id": 25,
153
+ "output": {
154
+ "perturbed_statement": "[paragraph id = 25] However, logs and traces are structured oppositely and will benefit from conventional relational database storage, due to the type of queries that are usually made to retrieve meaningful information from them [32 ].Therefore, an observability data ingestion service on the fog should consider the data requirements that each instrumentation domain needs (see Table 1 ), while allowing cross-analysis to be performed.",
155
+ "perturbed_explanation": "The original explanation correctly notes that logs and traces are structured differently and require appropriate storage solutions, such as inverted indices, for efficient query handling. By stating that relational database storage is suitable for logs and traces, the statement introduces a factual error, as such databases may not optimally support the querying methods typically associated with these data types [32 ]. This discrepancy highlights the importance of aligned storage solutions for specific data structures."
156
+ }
157
+ },
158
+ {
159
+ "path": "table_paper/2407.00035v1.json",
160
+ "table_id": "1",
161
+ "section": "6.4",
162
+ "all_context": [
163
+ "Table 4 presents the volume of observability data that were managed by the data life cycle during the experiments.",
164
+ "NodeExporter was deployed with the default configuration.",
165
+ "Although it is a tool with a small footprint in terms of CPU and memory usage [22 ], it may have a not negligible impact in terms of the volume of data it collects.",
166
+ "The default set of metrics that it exposes accounts for 65KB of information.",
167
+ "These data are presented only when Prometheus pulls them using an HTTP call.",
168
+ "This means that there is no IoT storage for these data.",
169
+ "Prometheus is configured by default to scrape the NodeExporter page every 5s, getting all the metrics exposed and storing them in its TSDB on the Fog node.",
170
+ "Considering that there are four IoT devices exposing metrics, the data volume transmitted to and stored on the Fog node is about 8.75GB in the span of a week, the period when these data will be available for decision-making and other analysis on the Fog Layer.",
171
+ "After reaching a week of age, the information is removed from the fog node and sent to the Cloud for long-term storage and historical analysis.",
172
+ "As a matter of estimation, the volume on the Cloud will reach 75GB after 2 months of operation.",
173
+ "The default output from NodeExporter provides help text for each metric, as shown in Figure 6 .",
174
+ "This information accounts for at least 20% of the total output footprint and should be removed prior to exposing the metrics.",
175
+ "The default set of metrics is very extensive and probably not all metrics are useful for every use case.",
176
+ "For example, the node exporter exposes dozens of Go environment metrics (Figure 6 ) that are not of interest for the monitoring of Mobile IoT- RoadBot and should be removed.",
177
+ "In addition to cutting off metrics that are not of interest, machine learning over historical data can be used to figure out metric correlations and keep the target metric set at minimum [5 ].",
178
+ "Furthermore, the frequency of scraping can be decreased in the Prometheus configuration without a relevant loss of opportunities for actuation.",
179
+ "Increasing the scrap delay to 10 seconds will reduce the data volume transmitted to and stored on the Fog Layer by half.",
180
+ "Using the strategies of removing the help text and changing the configuration of Node Exporter to expose only metrics about CPU, memory, disk, network, and power supply, and increasing the scrap delay to 10 seconds on Prometheus, we could reduce the volume of metric data on the Fog node by 87%, which also positively affected CPU and memory usage by Prometheus.",
181
+ "Regarding the logs generated by Mobile IoT-Roadbot while the trucks were moving around the city, they record information about 5G network analysis, such as latency and throughput, and contextual information (GNSS coordinates, truck speed, etc.).",
182
+ "Although the application writes information in the logs every second, the volume of data written is low (0.67 GB, in the span of a week, as seen in Table 4 ), being smaller than the volume of data generated by Node Exporter after applying volume reduction strategies.",
183
+ "Filebeat was configured to harvest only the logs written by Mobile IoT-RoadBot and transmit them to the Fog Layer.",
184
+ "It was necessary to change the default configuration of Filebeat to turn off the auto-discovery feature.",
185
+ "When this feature is active, Filebeat receives from the Docker manager every status change of any container on the device, consuming more memory than necessary.",
186
+ "As the application was not originally instrumented to record trace calls, we instrumented it in a reporting feature, utilised to aggregate 5G data by the suburbs of Brimbank.",
187
+ "To make this aggregation, geographic coordinates were used to find the full address of Australia using a service called MapBox [35 ].",
188
+ "Using the data footprint of these traces, we estimate the data volume to generate the traces of regular operation of the Mobile IoT-RoadBot.",
189
+ "This use case does not have bursty behaviour in terms of request processing because it performs the same volume of operations while in service.",
190
+ "Therefore, the volume of trace data is steady.",
191
+ "The aggregated data volume, collected by the four IoT devices, transmitted and stored on the fog node for a period of one week was approximately 10GB, considering the default configuration of the open source tools used as shown in Table 4 .",
192
+ "Using the strategies described above, the volume of aggregated data was reduced to 2GB, a reduction of 80%.",
193
+ "The four trucks whose observability data are replayed by the IoT devices in this experiment transmitted 291 GB of video data using the 5G network in a week of real world operation.",
194
+ "Therefore, the observability data (2GB) would represent an overhead of less than 1% in this use case.",
195
+ "The experiments show that it is possible to collect the benefits of achieving a higher level of observability for a system in a Fog computing environment.",
196
+ "In addition, the overhead of deploying an observability data life cycle can be low, if properly managed.",
197
+ "The utilisation of Docker containers as the runtime environment for the observability tools help to address the Fog challenge of device heterogeneity.",
198
+ "Due to the resource restriction of IoT devices, observability data collection should be done by lightweight agents.",
199
+ "In addition, the data footprint should be minimised to reduce the risk of network congestion and increased overhead to collect and transmit the data to the Fog Layer.",
200
+ "Each instrumentation domain has specific data requirements (Table 1 ) that must be met to optimise storage and minimise the average delay in analysing the observability data in the Fog Layer for decision-making and actuation.",
201
+ "Leaving in the Fog Layer only a window of most recent observability data is another strategy to cope with the resource-restriction of fog nodes.",
202
+ "Data that are outside the age range are sent to the Cloud for long-term storage and historical analysis.",
203
+ "The open source tools selected to make up the experimental setup are managed independently.",
204
+ "This scenario makes more complex actuation difficult to implement.",
205
+ "For instance, in the dynamic environment of Fog Computing, a system may present errors running on specific devices while it is functioning properly on others.",
206
+ "In such cases, if there are not enough resources to transmit all observability data to the Fog Layer, a proper decision should be prioritising data from those specific devices and returning to regular operation when the issue is solved.",
207
+ "To implement such adaptive and autonomous behaviour, it might be necessary to orchestrate the observability data life cycle and its agents.",
208
+ "To our knowledge, there is no Fog solution in the literature that provides this functionality [12 ].",
209
+ ""
210
+ ],
211
+ "target_context_ids": [
212
+ 32
213
+ ],
214
+ "selected_paragraphs": [
215
+ "[paragraph id = 32] The experiments show that it is possible to collect the benefits of achieving a higher level of observability for a system in a Fog computing environment."
216
+ ],
217
+ "table_html": "<figure class=\"ltx_table\" id=\"S2.T1\">\n<figcaption class=\"ltx_caption\"><span class=\"ltx_tag ltx_tag_table\"><span class=\"ltx_text\" id=\"S2.T1.2.1.1\" style=\"font-size:90%;\">Table 1</span>: </span><span class=\"ltx_text\" id=\"S2.T1.3.2\" style=\"font-size:90%;\">The three most important domains of observability differ in their data characteristics.</span></figcaption>\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S2.T1.4\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S2.T1.4.1.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_border_tt\" id=\"S2.T1.4.1.1.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.4.1.1.1.1\" style=\"font-size:80%;\">Domain</span></th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_border_tt\" id=\"S2.T1.4.1.1.2\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.4.1.1.2.1\" style=\"font-size:80%;\">Type</span></th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_border_tt\" id=\"S2.T1.4.1.1.3\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.4.1.1.3.1\" style=\"font-size:80%;\">Query</span></th>\n<th class=\"ltx_td ltx_nopad_r ltx_align_left ltx_th ltx_th_column ltx_border_tt\" id=\"S2.T1.4.1.1.4\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.4.1.1.4.1\" style=\"font-size:80%;\">Storage</span></th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S2.T1.4.2.1\">\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S2.T1.4.2.1.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.2.1.1.1\" style=\"font-size:80%;\">Metric</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S2.T1.4.2.1.2\" 
style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.2.1.2.1\" style=\"font-size:80%;\">Numeric</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S2.T1.4.2.1.3\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.2.1.3.1\" style=\"font-size:80%;\">Aggregations</span></td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_left ltx_border_t\" id=\"S2.T1.4.2.1.4\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.2.1.4.1\" style=\"font-size:80%;\">Time Series Database</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.4.3.2\">\n<td class=\"ltx_td ltx_align_left\" id=\"S2.T1.4.3.2.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.3.2.1.1\" style=\"font-size:80%;\">Log</span></td>\n<td class=\"ltx_td ltx_align_left\" id=\"S2.T1.4.3.2.2\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.3.2.2.1\" style=\"font-size:80%;\">Semi/not structured strings</span></td>\n<td class=\"ltx_td ltx_align_left\" id=\"S2.T1.4.3.2.3\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.3.2.3.1\" style=\"font-size:80%;\">Approximate string search</span></td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S2.T1.4.3.2.4\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.3.2.4.1\" style=\"font-size:80%;\">Inverted Index</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.4.4.3\">\n<td class=\"ltx_td ltx_align_left ltx_border_bb\" id=\"S2.T1.4.4.3.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.4.3.1.1\" style=\"font-size:80%;\">Trace</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_bb\" id=\"S2.T1.4.4.3.2\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.4.3.2.1\" 
style=\"font-size:80%;\">DAGs of duration of execution</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_bb\" id=\"S2.T1.4.4.3.3\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.4.3.3.1\" style=\"font-size:80%;\">Disassociated graph search</span></td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_left ltx_border_bb\" id=\"S2.T1.4.4.3.4\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S2.T1.4.4.3.4.1\" style=\"font-size:80%;\">Inverted Index</span></td>\n</tr>\n</tbody>\n</table>\n</figure>",
218
+ "perturb_sentence_id": 32,
219
+ "output": {
220
+ "perturbed_statement": "[paragraph id = 32] The experiments show that it is impossible to collect the benefits of achieving a higher level of observability for a system in a Fog computing environment. Moreover, the overhead of deploying an observability data life cycle can be high, even if properly managed.",
221
+ "perturbed_explanation": "The original explanation: Observability in Fog computing systems allows for enhanced monitoring and performance optimization without significant additional resource demands. 1. The statement claims that achieving higher observability is unfeasible, which contradicts evidence supporting its benefits and feasibility. 2. Additionally, the statement suggests that high overhead for an observability data life cycle is inevitable, which is inaccurate since proper management can minimize this overhead."
222
+ }
223
+ },
224
+ {
225
+ "path": "table_paper/2407.00035v1.json",
226
+ "table_id": "4",
227
+ "section": "6.4",
228
+ "all_context": [
229
+ "Table 4 presents the volume of observability data that were managed by the data life cycle during the experiments.",
230
+ "NodeExporter was deployed with the default configuration.",
231
+ "Although it is a tool with a small footprint in terms of CPU and memory usage [22 ], it may have a not negligible impact in terms of the volume of data it collects.",
232
+ "The default set of metrics that it exposes accounts for 65KB of information.",
233
+ "These data are presented only when Prometheus pulls them using an HTTP call.",
234
+ "This means that there is no IoT storage for these data.",
235
+ "Prometheus is configured by default to scrape the NodeExporter page every 5s, getting all the metrics exposed and storing them in its TSDB on the Fog node.",
236
+ "Considering that there are four IoT devices exposing metrics, the data volume transmitted to and stored on the Fog node is about 8.75GB in the span of a week, the period when these data will be available for decision-making and other analysis on the Fog Layer.",
237
+ "After reaching a week of age, the information is removed from the fog node and sent to the Cloud for long-term storage and historical analysis.",
238
+ "As a matter of estimation, the volume on the Cloud will reach 75GB after 2 months of operation.",
239
+ "The default output from NodeExporter provides help text for each metric, as shown in Figure 6 .",
240
+ "This information accounts for at least 20% of the total output footprint and should be removed prior to exposing the metrics.",
241
+ "The default set of metrics is very extensive and probably not all metrics are useful for every use case.",
242
+ "For example, the node exporter exposes dozens of Go environment metrics (Figure 6 ) that are not of interest for the monitoring of Mobile IoT- RoadBot and should be removed.",
243
+ "In addition to cutting off metrics that are not of interest, machine learning over historical data can be used to figure out metric correlations and keep the target metric set at minimum [5 ].",
244
+ "Furthermore, the frequency of scraping can be decreased in the Prometheus configuration without a relevant loss of opportunities for actuation.",
245
+ "Increasing the scrap delay to 10 seconds will reduce the data volume transmitted to and stored on the Fog Layer by half.",
246
+ "Using the strategies of removing the help text and changing the configuration of Node Exporter to expose only metrics about CPU, memory, disk, network, and power supply, and increasing the scrap delay to 10 seconds on Prometheus, we could reduce the volume of metric data on the Fog node by 87%, which also positively affected CPU and memory usage by Prometheus.",
247
+ "Regarding the logs generated by Mobile IoT-Roadbot while the trucks were moving around the city, they record information about 5G network analysis, such as latency and throughput, and contextual information (GNSS coordinates, truck speed, etc.).",
248
+ "Although the application writes information in the logs every second, the volume of data written is low (0.67 GB, in the span of a week, as seen in Table 4 ), being smaller than the volume of data generated by Node Exporter after applying volume reduction strategies.",
249
+ "Filebeat was configured to harvest only the logs written by Mobile IoT-RoadBot and transmit them to the Fog Layer.",
250
+ "It was necessary to change the default configuration of Filebeat to turn off the auto-discovery feature.",
251
+ "When this feature is active, Filebeat receives from the Docker manager every status change of any container on the device, consuming more memory than necessary.",
252
+ "As the application was not originally instrumented to record trace calls, we instrumented it in a reporting feature, utilised to aggregate 5G data by the suburbs of Brimbank.",
253
+ "To make this aggregation, geographic coordinates were used to find the full address of Australia using a service called MapBox [35 ].",
254
+ "Using the data footprint of these traces, we estimate the data volume to generate the traces of regular operation of the Mobile IoT-RoadBot.",
255
+ "This use case does not have bursty behaviour in terms of request processing because it performs the same volume of operations while in service.",
256
+ "Therefore, the volume of trace data is steady.",
257
+ "The aggregated data volume, collected by the four IoT devices, transmitted and stored on the fog node for a period of one week was approximately 10GB, considering the default configuration of the open source tools used as shown in Table 4 .",
258
+ "Using the strategies described above, the volume of aggregated data was reduced to 2GB, a reduction of 80%.",
259
+ "The four trucks whose observability data are replayed by the IoT devices in this experiment transmitted 291 GB of video data using the 5G network in a week of real world operation.",
260
+ "Therefore, the observability data (2GB) would represent an overhead of less than 1% in this use case.",
261
+ "The experiments show that it is possible to collect the benefits of achieving a higher level of observability for a system in a Fog computing environment.",
262
+ "In addition, the overhead of deploying an observability data life cycle can be low, if properly managed.",
263
+ "The utilisation of Docker containers as the runtime environment for the observability tools help to address the Fog challenge of device heterogeneity.",
264
+ "Due to the resource restriction of IoT devices, observability data collection should be done by lightweight agents.",
265
+ "In addition, the data footprint should be minimised to reduce the risk of network congestion and increased overhead to collect and transmit the data to the Fog Layer.",
266
+ "Each instrumentation domain has specific data requirements (Table 1 ) that must be met to optimise storage and minimise the average delay in analysing the observability data in the Fog Layer for decision-making and actuation.",
267
+ "Leaving in the Fog Layer only a window of most recent observability data is another strategy to cope with the resource-restriction of fog nodes.",
268
+ "Data that are outside the age range are sent to the Cloud for long-term storage and historical analysis.",
269
+ "The open source tools selected to make up the experimental setup are managed independently.",
270
+ "This scenario makes more complex actuation difficult to implement.",
271
+ "For instance, in the dynamic environment of Fog Computing, a system may present errors running on specific devices while it is functioning properly on others.",
272
+ "In such cases, if there are not enough resources to transmit all observability data to the Fog Layer, a proper decision should be prioritising data from those specific devices and returning to regular operation when the issue is solved.",
273
+ "To implement such adaptive and autonomous behaviour, it might be necessary to orchestrate the observability data life cycle and its agents.",
274
+ "To our knowledge, there is no Fog solution in the literature that provides this functionality [12 ].",
275
+ ""
276
+ ],
277
+ "target_context_ids": [
278
+ 0,
279
+ 7,
280
+ 17,
281
+ 26,
282
+ 27,
283
+ 32
284
+ ],
285
+ "selected_paragraphs": [
286
+ "[paragraph id = 0] Table 4 presents the volume of observability data that were managed by the data life cycle during the experiments.",
287
+ "[paragraph id = 7] Considering that there are four IoT devices exposing metrics, the data volume transmitted to and stored on the Fog node is about 8.75GB in the span of a week, the period when these data will be available for decision-making and other analysis on the Fog Layer.",
288
+ "[paragraph id = 17] Using the strategies of removing the help text and changing the configuration of Node Exporter to expose only metrics about CPU, memory, disk, network, and power supply, and increasing the scrap delay to 10 seconds on Prometheus, we could reduce the volume of metric data on the Fog node by 87%, which also positively affected CPU and memory usage by Prometheus.",
289
+ "[paragraph id = 26] This use case does not have bursty behaviour in terms of request processing because it performs the same volume of operations while in service.",
290
+ "[paragraph id = 27] Therefore, the volume of trace data is steady.",
291
+ "[paragraph id = 32] The experiments show that it is possible to collect the benefits of achieving a higher level of observability for a system in a Fog computing environment."
292
+ ],
293
+ "table_html": "<figure class=\"ltx_table\" id=\"S6.T4\">\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\"><span class=\"ltx_text\" id=\"S6.T4.2.1.1\" style=\"font-size:90%;\">Table 4</span>: </span><span class=\"ltx_text\" id=\"S6.T4.3.2\" style=\"font-size:90%;\">Mobile IoT-Roadbot assessment of each observability domain .</span></figcaption>\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S6.T4.4\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S6.T4.4.1.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_th_row ltx_border_tt\" id=\"S6.T4.4.1.1.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S6.T4.4.1.1.1.1\" style=\"font-size:80%;\">Tool</span></th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_th_row ltx_border_tt\" id=\"S6.T4.4.1.1.2\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S6.T4.4.1.1.2.1\" style=\"font-size:80%;\">Domain</span></th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_th_row ltx_border_tt\" id=\"S6.T4.4.1.1.3\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S6.T4.4.1.1.3.1\">\n<tr class=\"ltx_tr\" id=\"S6.T4.4.1.1.3.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S6.T4.4.1.1.3.1.1.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S6.T4.4.1.1.3.1.1.1.1\" style=\"font-size:80%;\">Data</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T4.4.1.1.3.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S6.T4.4.1.1.3.1.2.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S6.T4.4.1.1.3.1.2.1.1\" style=\"font-size:80%;\">Collection</span></td>\n</tr>\n</table>\n</th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_th_row 
ltx_border_tt\" id=\"S6.T4.4.1.1.4\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S6.T4.4.1.1.4.1\" style=\"font-size:80%;\">Frequency</span></th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_border_tt\" id=\"S6.T4.4.1.1.5\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S6.T4.4.1.1.5.1\">\n<tr class=\"ltx_tr\" id=\"S6.T4.4.1.1.5.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S6.T4.4.1.1.5.1.1.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S6.T4.4.1.1.5.1.1.1.1\" style=\"font-size:80%;\">Volume</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T4.4.1.1.5.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S6.T4.4.1.1.5.1.2.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S6.T4.4.1.1.5.1.2.1.1\" style=\"font-size:80%;\">by Hour</span></td>\n</tr>\n</table>\n</th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_border_tt\" id=\"S6.T4.4.1.1.6\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S6.T4.4.1.1.6.1\">\n<tr class=\"ltx_tr\" id=\"S6.T4.4.1.1.6.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S6.T4.4.1.1.6.1.1.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S6.T4.4.1.1.6.1.1.1.1\" style=\"font-size:80%;\">IoT</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T4.4.1.1.6.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S6.T4.4.1.1.6.1.2.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S6.T4.4.1.1.6.1.2.1.1\" style=\"font-size:80%;\">Storage</span></td>\n</tr>\n</table>\n</th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_border_tt\" id=\"S6.T4.4.1.1.7\" 
style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S6.T4.4.1.1.7.1\">\n<tr class=\"ltx_tr\" id=\"S6.T4.4.1.1.7.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S6.T4.4.1.1.7.1.1.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S6.T4.4.1.1.7.1.1.1.1\" style=\"font-size:80%;\">Fog</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T4.4.1.1.7.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S6.T4.4.1.1.7.1.2.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S6.T4.4.1.1.7.1.2.1.1\" style=\"font-size:80%;\">Storage</span></td>\n</tr>\n</table>\n</th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_border_tt\" id=\"S6.T4.4.1.1.8\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S6.T4.4.1.1.8.1\">\n<tr class=\"ltx_tr\" id=\"S6.T4.4.1.1.8.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S6.T4.4.1.1.8.1.1.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S6.T4.4.1.1.8.1.1.1.1\" style=\"font-size:80%;\">Fog Volume</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T4.4.1.1.8.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S6.T4.4.1.1.8.1.2.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S6.T4.4.1.1.8.1.2.1.1\" style=\"font-size:80%;\">(1 week)</span></td>\n</tr>\n</table>\n</th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_border_tt\" id=\"S6.T4.4.1.1.9\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S6.T4.4.1.1.9.1\">\n<tr class=\"ltx_tr\" id=\"S6.T4.4.1.1.9.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S6.T4.4.1.1.9.1.1.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text 
ltx_font_bold\" id=\"S6.T4.4.1.1.9.1.1.1.1\" style=\"font-size:80%;\">Cloud</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T4.4.1.1.9.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S6.T4.4.1.1.9.1.2.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S6.T4.4.1.1.9.1.2.1.1\" style=\"font-size:80%;\">Storage</span></td>\n</tr>\n</table>\n</th>\n<th class=\"ltx_td ltx_nopad_r ltx_align_left ltx_th ltx_th_column ltx_border_tt\" id=\"S6.T4.4.1.1.10\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S6.T4.4.1.1.10.1\">\n<tr class=\"ltx_tr\" id=\"S6.T4.4.1.1.10.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S6.T4.4.1.1.10.1.1.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S6.T4.4.1.1.10.1.1.1.1\" style=\"font-size:80%;\">Cloud Vol.</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T4.4.1.1.10.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S6.T4.4.1.1.10.1.2.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S6.T4.4.1.1.10.1.2.1.1\" style=\"font-size:80%;\">(2 months)</span></td>\n</tr>\n</table>\n</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S6.T4.4.2.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_t\" id=\"S6.T4.4.2.1.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S6.T4.4.2.1.1.1\">\n<tr class=\"ltx_tr\" id=\"S6.T4.4.2.1.1.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S6.T4.4.2.1.1.1.1.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.2.1.1.1.1.1.1\" style=\"font-size:80%;\">Node Exporter</span></td>\n</tr>\n</table>\n</th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_t\" id=\"S6.T4.4.2.1.2\" 
style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.2.1.2.1\" style=\"font-size:80%;\">Metrics</span></th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_t\" id=\"S6.T4.4.2.1.3\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.2.1.3.1\" style=\"font-size:80%;\">65KB</span></th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_t\" id=\"S6.T4.4.2.1.4\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.2.1.4.1\" style=\"font-size:80%;\">each 5s</span></th>\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S6.T4.4.2.1.5\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.2.1.5.1\" style=\"font-size:80%;\">46 MB</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S6.T4.4.2.1.6\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.2.1.6.1\" style=\"font-size:80%;\">No</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S6.T4.4.2.1.7\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.2.1.7.1\" style=\"font-size:80%;\">Yes</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S6.T4.4.2.1.8\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.2.1.8.1\" style=\"font-size:80%;\">8.75 GB</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S6.T4.4.2.1.9\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.2.1.9.1\" style=\"font-size:80%;\">Yes</span></td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_left ltx_border_t\" id=\"S6.T4.4.2.1.10\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.2.1.10.1\" style=\"font-size:80%;\">75 GB</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T4.4.3.2\">\n<th class=\"ltx_td 
ltx_align_left ltx_th ltx_th_row\" id=\"S6.T4.4.3.2.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.3.2.1.1\" style=\"font-size:80%;\">Filebeat</span></th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S6.T4.4.3.2.2\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.3.2.2.1\" style=\"font-size:80%;\">Logs</span></th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S6.T4.4.3.2.3\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.3.2.3.1\" style=\"font-size:80%;\">1KB</span></th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S6.T4.4.3.2.4\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.3.2.4.1\" style=\"font-size:80%;\">each 1s</span></th>\n<td class=\"ltx_td ltx_align_left\" id=\"S6.T4.4.3.2.5\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.3.2.5.1\" style=\"font-size:80%;\">3.50 MB</span></td>\n<td class=\"ltx_td ltx_align_left\" id=\"S6.T4.4.3.2.6\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.3.2.6.1\" style=\"font-size:80%;\">Yes</span></td>\n<td class=\"ltx_td ltx_align_left\" id=\"S6.T4.4.3.2.7\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.3.2.7.1\" style=\"font-size:80%;\">Yes</span></td>\n<td class=\"ltx_td ltx_align_left\" id=\"S6.T4.4.3.2.8\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.3.2.8.1\" style=\"font-size:80%;\">0.67 GB</span></td>\n<td class=\"ltx_td ltx_align_left\" id=\"S6.T4.4.3.2.9\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.3.2.9.1\" style=\"font-size:80%;\">Yes</span></td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S6.T4.4.3.2.10\" 
style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.3.2.10.1\" style=\"font-size:80%;\">5.77 GB</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T4.4.4.3\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_bb\" id=\"S6.T4.4.4.3.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S6.T4.4.4.3.1.1\">\n<tr class=\"ltx_tr\" id=\"S6.T4.4.4.3.1.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_left\" id=\"S6.T4.4.4.3.1.1.1.1\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.4.3.1.1.1.1.1\" style=\"font-size:80%;\">Open Telemetry</span></td>\n</tr>\n</table>\n</th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_bb\" id=\"S6.T4.4.4.3.2\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.4.3.2.1\" style=\"font-size:80%;\">Traces</span></th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_bb\" id=\"S6.T4.4.4.3.3\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.4.3.3.1\" style=\"font-size:80%;\">4KB</span></th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_bb\" id=\"S6.T4.4.4.3.4\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.4.3.4.1\" style=\"font-size:80%;\">each 15s</span></th>\n<td class=\"ltx_td ltx_align_left ltx_border_bb\" id=\"S6.T4.4.4.3.5\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.4.3.5.1\" style=\"font-size:80%;\">1 MB</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_bb\" id=\"S6.T4.4.4.3.6\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.4.3.6.1\" style=\"font-size:80%;\">No</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_bb\" id=\"S6.T4.4.4.3.7\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span 
class=\"ltx_text\" id=\"S6.T4.4.4.3.7.1\" style=\"font-size:80%;\">Yes</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_bb\" id=\"S6.T4.4.4.3.8\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.4.3.8.1\" style=\"font-size:80%;\">0.2 GB</span></td>\n<td class=\"ltx_td ltx_align_left ltx_border_bb\" id=\"S6.T4.4.4.3.9\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.4.3.9.1\" style=\"font-size:80%;\">Yes</span></td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_left ltx_border_bb\" id=\"S6.T4.4.4.3.10\" style=\"padding-top:-0.65pt;padding-bottom:-0.65pt;\"><span class=\"ltx_text\" id=\"S6.T4.4.4.3.10.1\" style=\"font-size:80%;\">1.54 GB</span></td>\n</tr>\n</tbody>\n</table>\n</figure>",
294
+ "perturb_sentence_id": 17,
295
+ "output": {
296
+ "perturbed_statement": "[paragraph id = 17] Using the strategies of removing the help text and changing the configuration of Node Exporter to expose only metrics about CPU, memory, disk, network, and power supply, and increasing the scrap delay to 5 seconds on Prometheus, we could reduce the volume of metric data on the Fog node by 60%, which positively affected CPU and memory usage by Prometheus. Regarding the logs generated by Mobile IoT-Roadbot while the trucks were moving around the city, they record information about 5G network analysis, such as latency and throughput, and contextual information (GNSS coordinates, truck speed, etc.).",
297
+ "perturbed_explanation": "Original Explanation:\n1. The configuration adjustments described, including increasing the scrape delay to 10 seconds, effectively decreased the volume of metric data by 87% on the Fog node due to reduced data sampling frequency.\n2. This reduction lessened the resource load caused by Prometheus' data processing activities.\nExplanation of Incorrect Statement:\n1. The scrape delay was altered from 10 seconds to 5 seconds in the statement, which contradicts the provided context.\n2. This change would likely result in a smaller percentage reduction in data volume, and stating a 60% reduction diverges from the contextually given 87%."
298
+ }
299
+ }
300
+ ]
table_result/2407.00046v1_output.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "path": "table_paper/2407.00046v1.json",
4
+ "table_id": "1",
5
+ "section": "6.4",
6
+ "all_context": [
7
+ "We compare with the original IPC, making sure it utilizes full parallelization on the CPU by compiling CHOLMOD with Intel MKL and run the simulation on an Intel Core i9 13900K processor (24 cores), enabling a 24-thread Cholesky factorization for solving the linear systems.",
8
+ "Figure 28 illustrates the effectiveness of two different computational methods in simulating the twisting of a cylindrical mat.",
9
+ "Both methods produce visually comparable results; however, our method significantly outperforms IPC in computational efficiency, processing steps 19.3 faster on average.",
10
+ "The demonstrated efficiency indicates that our method could provide considerable benefits to industries requiring fast and accurate simulations.",
11
+ "Table 1 showcases the statistics and quantifies the speedup achieved in representative cases relative to IPC.",
12
+ "In the study by Lan et al.",
13
+ "(2023 ), a novel GPU-accelerated algorithm is introduced for FEM elastodynamic simulations, leveraging interior-point methods to effectively handle complex scenarios involving extensive contact and collisions.",
14
+ "This algorithm is notable for its use of complementary coloring and a hybrid sweep approach, which are well-suited for such applications.",
15
+ "Nonetheless, these strategies may not fully address the specific challenges posed by stiff problems, such as significantly large stress resulting from challenging boundary conditions as in the simulation of twisting rods (Figure 4 ).",
16
+ "This example underscores our method s capability by stress testing four stiff rods with a Young s modulus of 10 MPa.",
17
+ "These rods are subject to high-speed torsion from both ends, achieving an angular velocity of 5/12 revolutions per second over 18 complete turns.",
18
+ "The image captures the deformation pattern, reflecting the rods structural integrity and the material s resistance to the applied forces.",
19
+ "Our method demonstrates proficiency in handling such demanding tests with large time steps, ensuring accurate results and computational efficiency.",
20
+ "The concurrent development of another GPU-based IPC method, termed GIPC, employs a Gauss-Newton approximation for the contact Hessian matrix.",
21
+ "This method solves the IPC system without the need for numerical eigendecompositions, an operation that is not easy to parallelize on the GPU.",
22
+ "In contrast, our approach focuses on reformulating the nonlinear problem to make it easier to solve for both Newton s method and CG solvers.",
23
+ "In the comparative tests (see Figure 29 ), we used simulations of stacked armadillos and octopuses with frictional contacts (where ) and aligned the Newton tolerance for both methods.",
24
+ "Our method consistently outperforms GIPC, achieving up to in speedup and in Newton convergence.",
25
+ "Specifically, GIPC encounters challenges in large-scale simulations due to suboptimal convergence speeds.",
26
+ "While GIPC uses Newton-PCG for optimization, its performance can still be significantly affected by the conditioning of the system.",
27
+ "The Multilevel Additive Schwarz (MAS) preconditioner utilized in GIPC effectively smooths out low-frequency errors commonly found in hyperelastic materials but struggles with the high-frequency errors that are typical in scenarios involving frictional contacts, leading to difficulties in larger-scale frictional contact simulations.",
28
+ ""
29
+ ],
30
+ "target_context_ids": [
31
+ 4
32
+ ],
33
+ "selected_paragraphs": [
34
+ "[paragraph id = 4] Table 1 showcases the statistics and quantifies the speedup achieved in representative cases relative to IPC."
35
+ ],
36
+ "table_html": "<figure class=\"ltx_table\" id=\"S6.T1\">\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 1. </span><span class=\"ltx_text ltx_font_bold\" id=\"S6.T1.38.1\">Statistics for Testing Scenarios.</span> This table details the total numbers of tetrahedra (#tets), Degrees of Freedom (#DOFs), and surface triangles (#tris). Key simulation parameters such as time step (), material density, Young’s Modulus (), Poisson Ratio (), collision offset (), and frictional coefficient () are provided. Additionally, the table includes both average and maximum numbers of constraints (#cons), the total number of Newton iterations per step, the average computational cost per step, and the comparative speedup achieved against IPC. Note that we simply use the same value for the friction mollification threshold and .</figcaption>\n<table class=\"ltx_tabular ltx_centering ltx_align_middle\" id=\"S6.T1.36\">\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S6.T1.22.8\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_tt\" id=\"S6.T1.22.8.9\"><span class=\"ltx_text\" id=\"S6.T1.22.8.9.1\" style=\"font-size:90%;\">Scenario</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_tt\" id=\"S6.T1.22.8.10\"><span class=\"ltx_text\" id=\"S6.T1.22.8.10.1\" style=\"font-size:90%;\">#tets / #DOFs / #tris</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_tt\" id=\"S6.T1.15.1.1\">\n<span class=\"ltx_text\" id=\"S6.T1.15.1.1.1\" style=\"font-size:90%;\"> (s)</span>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_tt\" id=\"S6.T1.18.4.4\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S6.T1.18.4.4.3\">\n<tr class=\"ltx_tr\" id=\"S6.T1.16.2.2.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S6.T1.16.2.2.1.1.1\">\n<span class=\"ltx_text\" id=\"S6.T1.16.2.2.1.1.1.1\" style=\"font-size:90%;\">density (kg/m</span><sup class=\"ltx_sup\" 
id=\"S6.T1.16.2.2.1.1.1.2\"><span class=\"ltx_text\" id=\"S6.T1.16.2.2.1.1.1.2.1\" style=\"font-size:90%;\">3</span></sup><span class=\"ltx_text\" id=\"S6.T1.16.2.2.1.1.1.3\" style=\"font-size:90%;\">),</span>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T1.18.4.4.3.3\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S6.T1.18.4.4.3.3.2\">\n<span class=\"ltx_text\" id=\"S6.T1.18.4.4.3.3.2.1\" style=\"font-size:90%;\"> (Pa), </span>\n</td>\n</tr>\n</table>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_tt\" id=\"S6.T1.20.6.6\">\n<span class=\"ltx_text\" id=\"S6.T1.20.6.6.1\" style=\"font-size:90%;\">, </span>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_tt\" id=\"S6.T1.21.7.7\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_tt\" id=\"S6.T1.22.8.8\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S6.T1.22.8.8.1\">\n<tr class=\"ltx_tr\" id=\"S6.T1.22.8.8.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S6.T1.22.8.8.1.2.1\"><span class=\"ltx_text\" id=\"S6.T1.22.8.8.1.2.1.1\" style=\"font-size:90%;\">#cons</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T1.22.8.8.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S6.T1.22.8.8.1.1.1\">\n<span class=\"ltx_text\" id=\"S6.T1.22.8.8.1.1.1.1\" style=\"font-size:90%;\">(avg. / </span><span class=\"ltx_text\" id=\"S6.T1.22.8.8.1.1.1.2\" style=\"font-size:90%;\">)</span>\n</td>\n</tr>\n</table>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_tt\" id=\"S6.T1.22.8.11\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S6.T1.22.8.11.1\">\n<tr class=\"ltx_tr\" id=\"S6.T1.22.8.11.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S6.T1.22.8.11.1.1.1\"><span class=\"ltx_text\" id=\"S6.T1.22.8.11.1.1.1.1\" style=\"font-size:90%;\">avg. 
#iters</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T1.22.8.11.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S6.T1.22.8.11.1.2.1\"><span class=\"ltx_text\" id=\"S6.T1.22.8.11.1.2.1.1\" style=\"font-size:90%;\">(Newton)</span></td>\n</tr>\n</table>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_tt\" id=\"S6.T1.22.8.12\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S6.T1.22.8.12.1\">\n<tr class=\"ltx_tr\" id=\"S6.T1.22.8.12.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S6.T1.22.8.12.1.1.1\"><span class=\"ltx_text\" id=\"S6.T1.22.8.12.1.1.1.1\" style=\"font-size:90%;\">avg. cost</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T1.22.8.12.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S6.T1.22.8.12.1.2.1\"><span class=\"ltx_text\" id=\"S6.T1.22.8.12.1.2.1.1\" style=\"font-size:90%;\">per-step (s)</span></td>\n</tr>\n</table>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S6.T1.22.8.13\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S6.T1.22.8.13.1\">\n<tr class=\"ltx_tr\" id=\"S6.T1.22.8.13.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S6.T1.22.8.13.1.1.1\"><span class=\"ltx_text\" id=\"S6.T1.22.8.13.1.1.1.1\" style=\"font-size:90%;\">speedup</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T1.22.8.13.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S6.T1.22.8.13.1.2.1\"><span class=\"ltx_text\" id=\"S6.T1.22.8.13.1.2.1.1\" style=\"font-size:90%;\">vs. 
IPC</span></td>\n</tr>\n</table>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T1.23.9\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.23.9.2\"><span class=\"ltx_text\" id=\"S6.T1.23.9.2.1\" style=\"font-size:90%;\">Puffer Balls on Nets</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.23.9.3\"><span class=\"ltx_text\" id=\"S6.T1.23.9.3.1\" style=\"font-size:90%;\">1.76M / 801K / 1.6M</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.23.9.4\"><span class=\"ltx_text\" id=\"S6.T1.23.9.4.1\" style=\"font-size:90%;\">1/30</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.23.9.5\"><span class=\"ltx_text\" id=\"S6.T1.23.9.5.1\" style=\"font-size:90%;\">1e3, 5e5 / 1e9, 0.4</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.23.9.6\"><span class=\"ltx_text\" id=\"S6.T1.23.9.6.1\" style=\"font-size:90%;\">1e-3</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.23.9.7\"><span class=\"ltx_text\" id=\"S6.T1.23.9.7.1\" style=\"font-size:90%;\">0.3</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.23.9.8\"><span class=\"ltx_text\" id=\"S6.T1.23.9.8.1\" style=\"font-size:90%;\">228K / 292K</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.23.9.9\"><span class=\"ltx_text\" id=\"S6.T1.23.9.9.1\" style=\"font-size:90%;\">156.8</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.23.9.10\"><span class=\"ltx_text\" id=\"S6.T1.23.9.10.1\" style=\"font-size:90%;\">427</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S6.T1.23.9.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T1.26.12\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.26.12.4\"><span class=\"ltx_text\" id=\"S6.T1.26.12.4.1\" 
style=\"font-size:90%;\">Dragons-Pachinko</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.26.12.5\"><span class=\"ltx_text\" id=\"S6.T1.26.12.5.1\" style=\"font-size:90%;\">1.49M / 379K / 773K</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.26.12.6\"><span class=\"ltx_text\" id=\"S6.T1.26.12.6.1\" style=\"font-size:90%;\">1/30</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.25.11.2\">\n<span class=\"ltx_text\" id=\"S6.T1.25.11.2.3\" style=\"font-size:90%;\">1e3, </span>\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S6.T1.25.11.2.2\">\n<tr class=\"ltx_tr\" id=\"S6.T1.24.10.1.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S6.T1.24.10.1.1.1.1\">\n<span class=\"ltx_text\" id=\"S6.T1.24.10.1.1.1.1.1\" style=\"font-size:90%;\">5e5 (</span><span class=\"ltx_text\" id=\"S6.T1.24.10.1.1.1.1.2\" style=\"font-size:90%;\">)/</span>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T1.25.11.2.2.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S6.T1.25.11.2.2.2.1\">\n<span class=\"ltx_text\" id=\"S6.T1.25.11.2.2.2.1.1\" style=\"font-size:90%;\">1e6 (</span><span class=\"ltx_text\" id=\"S6.T1.25.11.2.2.2.1.2\" style=\"font-size:90%;\">)</span>\n</td>\n</tr>\n</table>\n<span class=\"ltx_text\" id=\"S6.T1.25.11.2.4\" style=\"font-size:90%;\">, 0.4</span>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.26.12.7\"><span class=\"ltx_text\" id=\"S6.T1.26.12.7.1\" style=\"font-size:90%;\">1e-3</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.26.12.8\"><span class=\"ltx_text\" id=\"S6.T1.26.12.8.1\" style=\"font-size:90%;\">0.3</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.26.12.9\"><span class=\"ltx_text\" id=\"S6.T1.26.12.9.1\" style=\"font-size:90%;\">4.9K / 18K</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r 
ltx_border_t\" id=\"S6.T1.26.12.10\"><span class=\"ltx_text\" id=\"S6.T1.26.12.10.1\" style=\"font-size:90%;\">41.4</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.26.12.11\"><span class=\"ltx_text\" id=\"S6.T1.26.12.11.1\" style=\"font-size:90%;\">29.1</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S6.T1.26.12.3\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T1.27.13\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.27.13.2\"><span class=\"ltx_text\" id=\"S6.T1.27.13.2.1\" style=\"font-size:90%;\">Staircase-Armadillos</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.27.13.3\"><span class=\"ltx_text\" id=\"S6.T1.27.13.3.1\" style=\"font-size:90%;\">300K / 94K / 187K</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.27.13.4\"><span class=\"ltx_text\" id=\"S6.T1.27.13.4.1\" style=\"font-size:90%;\">1/30</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.27.13.5\"><span class=\"ltx_text\" id=\"S6.T1.27.13.5.1\" style=\"font-size:90%;\">1e3, 7.5e5, 0.4</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.27.13.6\"><span class=\"ltx_text\" id=\"S6.T1.27.13.6.1\" style=\"font-size:90%;\">1e-3</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.27.13.7\"><span class=\"ltx_text\" id=\"S6.T1.27.13.7.1\" style=\"font-size:90%;\">0.5</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.27.13.8\"><span class=\"ltx_text\" id=\"S6.T1.27.13.8.1\" style=\"font-size:90%;\">3.2K / 3.2K</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.27.13.9\"><span class=\"ltx_text\" id=\"S6.T1.27.13.9.1\" style=\"font-size:90%;\">38</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.27.13.10\"><span class=\"ltx_text\" 
id=\"S6.T1.27.13.10.1\" style=\"font-size:90%;\">26.7</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S6.T1.27.13.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T1.28.14\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.28.14.2\"><span class=\"ltx_text\" id=\"S6.T1.28.14.2.1\" style=\"font-size:90%;\">Staircase-Dragons</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.28.14.3\"><span class=\"ltx_text\" id=\"S6.T1.28.14.3.1\" style=\"font-size:90%;\">376K / 120K / 240K</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.28.14.4\"><span class=\"ltx_text\" id=\"S6.T1.28.14.4.1\" style=\"font-size:90%;\">1/30</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.28.14.5\"><span class=\"ltx_text\" id=\"S6.T1.28.14.5.1\" style=\"font-size:90%;\">1e3, 7.5e5, 0.4</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.28.14.6\"><span class=\"ltx_text\" id=\"S6.T1.28.14.6.1\" style=\"font-size:90%;\">1e-3</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.28.14.7\"><span class=\"ltx_text\" id=\"S6.T1.28.14.7.1\" style=\"font-size:90%;\">0.5</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.28.14.8\"><span class=\"ltx_text\" id=\"S6.T1.28.14.8.1\" style=\"font-size:90%;\">3K / 5.4K</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.28.14.9\"><span class=\"ltx_text\" id=\"S6.T1.28.14.9.1\" style=\"font-size:90%;\">41.9</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.28.14.10\"><span class=\"ltx_text\" id=\"S6.T1.28.14.10.1\" style=\"font-size:90%;\">28.5</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S6.T1.28.14.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T1.29.15\">\n<td class=\"ltx_td ltx_align_center ltx_border_r 
ltx_border_t\" id=\"S6.T1.29.15.2\"><span class=\"ltx_text\" id=\"S6.T1.29.15.2.1\" style=\"font-size:90%;\">Roller Test</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.29.15.3\"><span class=\"ltx_text\" id=\"S6.T1.29.15.3.1\" style=\"font-size:90%;\">100K / 31K / 62K</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.29.15.4\"><span class=\"ltx_text\" id=\"S6.T1.29.15.4.1\" style=\"font-size:90%;\">1/30</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.29.15.5\"><span class=\"ltx_text\" id=\"S6.T1.29.15.5.1\" style=\"font-size:90%;\">1e3, 1e6, 0.4</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.29.15.6\"><span class=\"ltx_text\" id=\"S6.T1.29.15.6.1\" style=\"font-size:90%;\">1e-3</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.29.15.7\"><span class=\"ltx_text\" id=\"S6.T1.29.15.7.1\" style=\"font-size:90%;\">0.9</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.29.15.8\"><span class=\"ltx_text\" id=\"S6.T1.29.15.8.1\" style=\"font-size:90%;\">1.6K / 5.8K</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.29.15.9\"><span class=\"ltx_text\" id=\"S6.T1.29.15.9.1\" style=\"font-size:90%;\">35.4</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.29.15.10\"><span class=\"ltx_text\" id=\"S6.T1.29.15.10.1\" style=\"font-size:90%;\">12.5</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S6.T1.29.15.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T1.30.16\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.30.16.2\"><span class=\"ltx_text\" id=\"S6.T1.30.16.2.1\" style=\"font-size:90%;\">Armadillos &amp; Bowl</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.30.16.3\"><span 
class=\"ltx_text\" id=\"S6.T1.30.16.3.1\" style=\"font-size:90%;\">826K / 192K / 238K</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.30.16.4\"><span class=\"ltx_text\" id=\"S6.T1.30.16.4.1\" style=\"font-size:90%;\">1/30</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.30.16.5\"><span class=\"ltx_text\" id=\"S6.T1.30.16.5.1\" style=\"font-size:90%;\">1e3, 5e5, 0.4</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.30.16.6\"><span class=\"ltx_text\" id=\"S6.T1.30.16.6.1\" style=\"font-size:90%;\">1e-3</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.30.16.7\"><span class=\"ltx_text\" id=\"S6.T1.30.16.7.1\" style=\"font-size:90%;\">0.1</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.30.16.8\"><span class=\"ltx_text\" id=\"S6.T1.30.16.8.1\" style=\"font-size:90%;\">2.2K / 9.7K</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.30.16.9\"><span class=\"ltx_text\" id=\"S6.T1.30.16.9.1\" style=\"font-size:90%;\">8.2</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.30.16.10\"><span class=\"ltx_text\" id=\"S6.T1.30.16.10.1\" style=\"font-size:90%;\">3.4</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S6.T1.30.16.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T1.31.17\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.31.17.2\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S6.T1.31.17.2.1\">\n<tr class=\"ltx_tr\" id=\"S6.T1.31.17.2.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S6.T1.31.17.2.1.1.1\"><span class=\"ltx_text\" id=\"S6.T1.31.17.2.1.1.1.1\" style=\"font-size:90%;\">Crabs on Nets</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T1.31.17.2.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" 
id=\"S6.T1.31.17.2.1.2.1\"><span class=\"ltx_text\" id=\"S6.T1.31.17.2.1.2.1.1\" style=\"font-size:90%;\">(light crabs)</span></td>\n</tr>\n</table>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.31.17.3\"><span class=\"ltx_text\" id=\"S6.T1.31.17.3.1\" style=\"font-size:90%;\">2.2M / 810K / 1.2M</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.31.17.4\"><span class=\"ltx_text\" id=\"S6.T1.31.17.4.1\" style=\"font-size:90%;\">1/30</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.31.17.5\"><span class=\"ltx_text\" id=\"S6.T1.31.17.5.1\" style=\"font-size:90%;\">1e2 / 1e3, 5e5, 0.4</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.31.17.6\"><span class=\"ltx_text\" id=\"S6.T1.31.17.6.1\" style=\"font-size:90%;\">1e-3</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.31.17.7\"><span class=\"ltx_text\" id=\"S6.T1.31.17.7.1\" style=\"font-size:90%;\">0.3</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.31.17.8\"><span class=\"ltx_text\" id=\"S6.T1.31.17.8.1\" style=\"font-size:90%;\">32K / 52K</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.31.17.9\"><span class=\"ltx_text\" id=\"S6.T1.31.17.9.1\" style=\"font-size:90%;\">34.5</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.31.17.10\"><span class=\"ltx_text\" id=\"S6.T1.31.17.10.1\" style=\"font-size:90%;\">48.8</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S6.T1.31.17.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T1.32.18\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.32.18.2\"><span class=\"ltx_text\" id=\"S6.T1.32.18.2.1\" style=\"font-size:90%;\">Twisting Rods</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" 
id=\"S6.T1.32.18.3\"><span class=\"ltx_text\" id=\"S6.T1.32.18.3.1\" style=\"font-size:90%;\">355K / 70.4K / 51.6K</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.32.18.4\"><span class=\"ltx_text\" id=\"S6.T1.32.18.4.1\" style=\"font-size:90%;\">1/30</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.32.18.5\"><span class=\"ltx_text\" id=\"S6.T1.32.18.5.1\" style=\"font-size:90%;\">1e3, 1e7, 0.4</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.32.18.6\"><span class=\"ltx_text\" id=\"S6.T1.32.18.6.1\" style=\"font-size:90%;\">1e-3</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.32.18.7\"><span class=\"ltx_text\" id=\"S6.T1.32.18.7.1\" style=\"font-size:90%;\">0</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.32.18.8\"><span class=\"ltx_text\" id=\"S6.T1.32.18.8.1\" style=\"font-size:90%;\">617K / 5.7M</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.32.18.9\"><span class=\"ltx_text\" id=\"S6.T1.32.18.9.1\" style=\"font-size:90%;\">24.1</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.32.18.10\"><span class=\"ltx_text\" id=\"S6.T1.32.18.10.1\" style=\"font-size:90%;\">15.54</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S6.T1.32.18.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T1.33.19\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.33.19.2\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S6.T1.33.19.2.1\">\n<tr class=\"ltx_tr\" id=\"S6.T1.33.19.2.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S6.T1.33.19.2.1.1.1\"><span class=\"ltx_text\" id=\"S6.T1.33.19.2.1.1.1.1\" style=\"font-size:90%;\">Twisting</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T1.33.19.2.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" 
id=\"S6.T1.33.19.2.1.2.1\"><span class=\"ltx_text\" id=\"S6.T1.33.19.2.1.2.1.1\" style=\"font-size:90%;\">Cylindrical Mat</span></td>\n</tr>\n</table>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.33.19.3\"><span class=\"ltx_text\" id=\"S6.T1.33.19.3.1\" style=\"font-size:90%;\">64K / 20.9K / 41.8K</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.33.19.4\"><span class=\"ltx_text\" id=\"S6.T1.33.19.4.1\" style=\"font-size:90%;\">1/30</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.33.19.5\"><span class=\"ltx_text\" id=\"S6.T1.33.19.5.1\" style=\"font-size:90%;\">1e3, 1e7, 0.4</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.33.19.6\"><span class=\"ltx_text\" id=\"S6.T1.33.19.6.1\" style=\"font-size:90%;\">1e-3</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.33.19.7\"><span class=\"ltx_text\" id=\"S6.T1.33.19.7.1\" style=\"font-size:90%;\">0</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.33.19.8\"><span class=\"ltx_text\" id=\"S6.T1.33.19.8.1\" style=\"font-size:90%;\">60K / 147K</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.33.19.9\"><span class=\"ltx_text\" id=\"S6.T1.33.19.9.1\" style=\"font-size:90%;\">18.8</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.33.19.10\"><span class=\"ltx_text\" id=\"S6.T1.33.19.10.1\" style=\"font-size:90%;\">5.7</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S6.T1.33.19.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T1.34.20\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.34.20.2\"><span class=\"ltx_text\" id=\"S6.T1.34.20.2.1\" style=\"font-size:90%;\">Noodles-200</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.34.20.3\"><span 
class=\"ltx_text\" id=\"S6.T1.34.20.3.1\" style=\"font-size:90%;\">934K / 375K / 749K</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.34.20.4\"><span class=\"ltx_text\" id=\"S6.T1.34.20.4.1\" style=\"font-size:90%;\">1/30</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.34.20.5\"><span class=\"ltx_text\" id=\"S6.T1.34.20.5.1\" style=\"font-size:90%;\">1e3, 5e5, 0.4</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.34.20.6\"><span class=\"ltx_text\" id=\"S6.T1.34.20.6.1\" style=\"font-size:90%;\">1e-3</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.34.20.7\"><span class=\"ltx_text\" id=\"S6.T1.34.20.7.1\" style=\"font-size:90%;\">0.3</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.34.20.8\"><span class=\"ltx_text\" id=\"S6.T1.34.20.8.1\" style=\"font-size:90%;\">48.9K / 146.3K</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.34.20.9\"><span class=\"ltx_text\" id=\"S6.T1.34.20.9.1\" style=\"font-size:90%;\">39.7</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.34.20.10\"><span class=\"ltx_text\" id=\"S6.T1.34.20.10.1\" style=\"font-size:90%;\">49.5</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S6.T1.34.20.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T1.35.21\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.35.21.2\"><span class=\"ltx_text\" id=\"S6.T1.35.21.2.1\" style=\"font-size:90%;\">Noodles-300</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.35.21.3\"><span class=\"ltx_text\" id=\"S6.T1.35.21.3.1\" style=\"font-size:90%;\">1.4M / 562K / 1.1M</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.35.21.4\"><span class=\"ltx_text\" id=\"S6.T1.35.21.4.1\" 
style=\"font-size:90%;\">1/30</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.35.21.5\"><span class=\"ltx_text\" id=\"S6.T1.35.21.5.1\" style=\"font-size:90%;\">1e3, 5e5, 0.4</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.35.21.6\"><span class=\"ltx_text\" id=\"S6.T1.35.21.6.1\" style=\"font-size:90%;\">1e-3</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.35.21.7\"><span class=\"ltx_text\" id=\"S6.T1.35.21.7.1\" style=\"font-size:90%;\">0.3</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.35.21.8\"><span class=\"ltx_text\" id=\"S6.T1.35.21.8.1\" style=\"font-size:90%;\">132.1K / 276K</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.35.21.9\"><span class=\"ltx_text\" id=\"S6.T1.35.21.9.1\" style=\"font-size:90%;\">60.9</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S6.T1.35.21.10\"><span class=\"ltx_text\" id=\"S6.T1.35.21.10.1\" style=\"font-size:90%;\">109.6</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S6.T1.35.21.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T1.36.22\">\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_r ltx_border_t\" id=\"S6.T1.36.22.1\">\n<span class=\"ltx_text\" id=\"S6.T1.36.22.1.1\" style=\"font-size:90%;\">T-rex </span>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_r ltx_border_t\" id=\"S6.T1.36.22.2\"><span class=\"ltx_text\" id=\"S6.T1.36.22.2.1\" style=\"font-size:90%;\">9M / 2.2M / 2.9M</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_r ltx_border_t\" id=\"S6.T1.36.22.3\"><span class=\"ltx_text\" id=\"S6.T1.36.22.3.1\" style=\"font-size:90%;\">1/30</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_r ltx_border_t\" id=\"S6.T1.36.22.4\"><span class=\"ltx_text\" id=\"S6.T1.36.22.4.1\" 
style=\"font-size:90%;\">1e3, 5e5, 0.4</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_r ltx_border_t\" id=\"S6.T1.36.22.5\"><span class=\"ltx_text\" id=\"S6.T1.36.22.5.1\" style=\"font-size:90%;\">1e-3</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_r ltx_border_t\" id=\"S6.T1.36.22.6\"><span class=\"ltx_text\" id=\"S6.T1.36.22.6.1\" style=\"font-size:90%;\">0.3</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_r ltx_border_t\" id=\"S6.T1.36.22.7\"><span class=\"ltx_text\" id=\"S6.T1.36.22.7.1\" style=\"font-size:90%;\">100.5K / 308.4K</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_r ltx_border_t\" id=\"S6.T1.36.22.8\"><span class=\"ltx_text\" id=\"S6.T1.36.22.8.1\" style=\"font-size:90%;\">25.6</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_r ltx_border_t\" id=\"S6.T1.36.22.9\"><span class=\"ltx_text\" id=\"S6.T1.36.22.9.1\" style=\"font-size:90%;\">183.4</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_t\" id=\"S6.T1.36.22.10\"><span class=\"ltx_text ltx_font_typewriter\" id=\"S6.T1.36.22.10.1\" style=\"font-size:90%;\">N/A</span></td>\n</tr>\n</tbody>\n</table>\n</figure>",
37
+ "perturb_sentence_id": 4,
38
+ "output": {
39
+ "perturbed_statement": "[paragraph id = 4] Table 1 showcases the results and quantifies the speedup achieved in representative cases relative to LRU. In the study by Lan et al.",
40
+ "perturbed_explanation": "1. The original statement implies that Table 1 discusses speedup metrics comparing performance to IPC. 2. The altered statement replaces 'IPC' with 'LRU,' which changes the reference point in the comparison. This adjustment is incorrect since the comparative analysis involving Table 1, as mentioned, is relative to IPC, not LRU."
41
+ }
42
+ }
43
+ ]
table_result/2407.00056v1_output.json ADDED
The diff for this file is too large to render. See raw diff
 
table_result/2407.00062v1_output.json ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "path": "table_paper/2407.00062v1.json",
4
+ "table_id": "2",
5
+ "section": "5.1.1",
6
+ "all_context": [
7
+ "Let s say we are rating item i, we go through a node s neighbors and calculate the mean, median, or mode of their ratings for item i (if it exists).",
8
+ "If no ratings for i exist, we return a random rating.",
9
+ "Note, this algorithm mutates the underlying data source, whereby a recommended value for a node, can be used by another node in its rating inference.",
10
+ "The performance of these recommenders can be seen in Table 2 .",
11
+ "We observe that the mean and median of neighbors were the best algorithms in this range.",
12
+ "The mode version did not perform as well.",
13
+ "This is because mode cannot produce decimal-precision ratings it also defaults to randomness when a mode can t be determined.",
14
+ ""
15
+ ],
16
+ "target_context_ids": [
17
+ 3,
18
+ 4,
19
+ 5,
20
+ 6
21
+ ],
22
+ "selected_paragraphs": [
23
+ "[paragraph id = 3] The performance of these recommenders can be seen in Table 2 .",
24
+ "[paragraph id = 4] We observe that the mean and median of neighbors were the best algorithms in this range.",
25
+ "[paragraph id = 5] The mode version did not perform as well.",
26
+ "[paragraph id = 6] This is because mode cannot produce decimal-precision ratings it also defaults to randomness when a mode can t be determined."
27
+ ],
28
+ "table_html": "<figure class=\"ltx_table\" id=\"S5.T2\">\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 2: </span>Mean and Standard Deviation of MAE for Neighbourhood Algorithms</figcaption>\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S5.T2.2\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S5.T2.2.2\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_th_row ltx_border_tt\" id=\"S5.T2.2.2.3\">Algorithm</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T2.1.1.1\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T2.2.2.2\"></th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S5.T2.2.3.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_t\" id=\"S5.T2.2.3.1.1\">Mean of Neighbours</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T2.2.3.1.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T2.2.3.1.2.1\">1.27</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T2.2.3.1.3\">0.25</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T2.2.4.2\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S5.T2.2.4.2.1\">Median of Neighbours</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T2.2.4.2.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T2.2.4.2.2.1\">1.27</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T2.2.4.2.3\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T2.2.4.2.3.1\">0.19</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T2.2.5.3\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S5.T2.2.5.3.1\">Mode of Neighbours</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T2.2.5.3.2\">1.32</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T2.2.5.3.3\">0.23</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T2.2.6.4\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" 
id=\"S5.T2.2.6.4.1\">Universal Random</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T2.2.6.4.2\">1.30</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T2.2.6.4.3\">0.38</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T2.2.7.5\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S5.T2.2.7.5.1\">Random</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T2.2.7.5.2\">1.67</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T2.2.7.5.3\">0.21</td>\n</tr>\n</tbody>\n</table>\n</figure>",
29
+ "perturb_sentence_id": 3,
30
+ "output": {
31
+ "perturbed_statement": "[paragraph id = 3] The performance of these recommenders can be seen in Table 2 . We observe that the mode of neighbors was the best algorithm in this range.",
32
+ "perturbed_explanation": "1. The original statement states that the mean and median of neighbors algorithms showed the best performance in the given range.\n2. The statement introduces the factual error by replacing \"mean and median\" with \"mode,\" which the context highlights as having inferior performance due to its limitations in precision and stochastic behavior. Therefore, this modification contradicts the provided information."
33
+ }
34
+ },
35
+ {
36
+ "path": "table_paper/2407.00062v1.json",
37
+ "table_id": "4",
38
+ "section": "5.1.3",
39
+ "all_context": [
40
+ "We now move to investigate recommendation systems that consider the entire trust graph.",
41
+ "The Monte-Carlo (MC) Random Walk model works by starting at the node we are trying to produce a rating for and then performing a random walk.",
42
+ "As the distance from the original node, , increases, so does the probability that a random rating is returned.",
43
+ "This random walk continues until a node with a rating is reached, or this random rating is produced.",
44
+ "We run this experiment times, for some fixed , for each node and take the mean rating from the nodes reached via random walk.",
45
+ "We then extended this algorithm by choosing the neighbor, at each step of the random walk based on a weighted random incorporating the Jaccard index.",
46
+ "The performance of both MC algorithms, as compared with the others can be seen in Table 4 .",
47
+ "We observe that the Monte Carlo algorithm outperforms the previous models.",
48
+ "Interestingly, the Jaccard weighted version performed better.",
49
+ "This implies that Jaccard indexes seem to hold more valuable information than mere trust connections.",
50
+ ""
51
+ ],
52
+ "target_context_ids": [
53
+ 6,
54
+ 7,
55
+ 8,
56
+ 9
57
+ ],
58
+ "selected_paragraphs": [
59
+ "[paragraph id = 6] The performance of both MC algorithms, as compared with the others can be seen in Table 4 .",
60
+ "[paragraph id = 7] We observe that the Monte Carlo algorithm outperforms the previous models.",
61
+ "[paragraph id = 8] Interestingly, the Jaccard weighted version performed better.",
62
+ "[paragraph id = 9] This implies that Jaccard indexes seem to hold more valuable information than mere trust connections."
63
+ ],
64
+ "table_html": "<figure class=\"ltx_table\" id=\"S5.T4\">\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 4: </span>Mean and Standard Deviation of MAE for Random Walk Algorithms</figcaption>\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S5.T4.2\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S5.T4.2.2\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T4.2.2.3\">Algorithm</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T4.1.1.1\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T4.2.2.2\"></th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S5.T4.2.3.1\">\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S5.T4.2.3.1.1\">Jaccard Monte-Carlo Random Walk</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.2.3.1.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T4.2.3.1.2.1\">1.14</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.2.3.1.3\">0.22</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T4.2.4.2\">\n<td class=\"ltx_td ltx_align_left\" id=\"S5.T4.2.4.2.1\">Monte-Carlo Random Walk</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.4.2.2\">1.16</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.4.2.3\">0.23</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T4.2.5.3\">\n<td class=\"ltx_td ltx_align_left\" id=\"S5.T4.2.5.3.1\">Median of Neighbours</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.5.3.2\">1.27</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.5.3.3\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T4.2.5.3.3.1\">0.19</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T4.2.6.4\">\n<td class=\"ltx_td ltx_align_left\" id=\"S5.T4.2.6.4.1\">Universal Random</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.6.4.2\">1.30</td>\n<td class=\"ltx_td ltx_align_center\" 
id=\"S5.T4.2.6.4.3\">0.38</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T4.2.7.5\">\n<td class=\"ltx_td ltx_align_left\" id=\"S5.T4.2.7.5.1\">Random</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.7.5.2\">1.67</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.7.5.3\">0.21</td>\n</tr>\n</tbody>\n</table>\n</figure>",
65
+ "perturb_sentence_id": 6,
66
+ "output": {
67
+ "perturbed_statement": "[paragraph id = 6] The performance of both MC algorithms, as compared with the others can be seen in Table 4 . We observe that the Monte Carlo algorithm performs worse than the previous models.",
68
+ "perturbed_explanation": "1. The original context states that the Monte Carlo algorithm outperforms the previous models, and Table 4 agrees: both Monte-Carlo variants report a lower MAE (1.14 and 1.16) than Median of Neighbours (1.27). 2. The statement instead claims that the Monte Carlo algorithm performs worse than the previous models, which contradicts both the text and the table."
69
+ }
70
+ },
71
+ {
72
+ "path": "table_paper/2407.00062v1.json",
73
+ "table_id": "5",
74
+ "section": "5.1.4",
75
+ "all_context": [
76
+ "These recommenders work by considering every other node, which has rated item i, in the graph.",
77
+ "We then perform a summation of the Jaccard values for every node corresponding to each rating value.",
78
+ "In the Majority of Majorities version, we then take the rating that has received the highest contribution of Jaccard scores.",
79
+ "However, in the Weighted Average version, we instead take the weighted average of these ratings, weighted by the contribution of Jaccard scores.",
80
+ "We will call these techniques JMoM and WA moving forward.",
81
+ "(These methods are inspired by the popular majority based opinion diffusion models, cf.",
82
+ "[38 , 39 , 40 ].)",
83
+ "As can be observed from Table 5 , the JMoM performed reasonably well.",
84
+ "Its mean accuracy is better than the random walk.",
85
+ "However, it was less robust.",
86
+ "The WA version performed very well significantly beating both the JMoM version and the random walk.",
87
+ "By overcoming the constraint of only considering direct neighbors, the JMoM and Jaccard WA models can produce decent results, with a very simple algorithm.",
88
+ "Note that these approaches can be used with any user-to-user similarity metric.",
89
+ ""
90
+ ],
91
+ "target_context_ids": [
92
+ 7,
93
+ 8,
94
+ 9,
95
+ 10,
96
+ 11
97
+ ],
98
+ "selected_paragraphs": [
99
+ "[paragraph id = 7] As can be observed from Table 5 , the JMoM performed reasonably well.",
100
+ "[paragraph id = 8] Its mean accuracy is better than the random walk.",
101
+ "[paragraph id = 9] However, it was less robust.",
102
+ "[paragraph id = 10] The WA version performed very well significantly beating both the JMoM version and the random walk.",
103
+ "[paragraph id = 11] By overcoming the constraint of only considering direct neighbors, the JMoM and Jaccard WA models can produce decent results, with a very simple algorithm."
104
+ ],
105
+ "table_html": "<figure class=\"ltx_table\" id=\"S5.T5\">\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 5: </span>Mean and Standard Deviation of MAE for Full-graph Jaccard Algorithms</figcaption>\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S5.T5.2\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S5.T5.2.2\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_th_row ltx_border_tt\" id=\"S5.T5.2.2.3\">Algorithm</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T5.1.1.1\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T5.2.2.2\"></th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S5.T5.2.3.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_t\" id=\"S5.T5.2.3.1.1\">Jaccard WA</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T5.2.3.1.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T5.2.3.1.2.1\">1.05</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T5.2.3.1.3\">0.25</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T5.2.4.2\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S5.T5.2.4.2.1\">Jaccard Monte-Carlo Random Walk</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T5.2.4.2.2\">1.14</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T5.2.4.2.3\">0.22</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T5.2.5.3\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S5.T5.2.5.3.1\">Jaccard MoM</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T5.2.5.3.2\">1.13</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T5.2.5.3.3\">0.30</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T5.2.6.4\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S5.T5.2.6.4.1\">Universal Random</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T5.2.6.4.2\">1.30</td>\n<td class=\"ltx_td ltx_align_center\" 
id=\"S5.T5.2.6.4.3\">0.38</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T5.2.7.5\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S5.T5.2.7.5.1\">Random</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T5.2.7.5.2\">1.67</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T5.2.7.5.3\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T5.2.7.5.3.1\">0.21</span></td>\n</tr>\n</tbody>\n</table>\n</figure>",
106
+ "perturb_sentence_id": 7,
107
+ "output": {
108
+ "perturbed_statement": "[paragraph id = 7] As can be observed from Table 5 , the random walk performed significantly better than the JMoM.",
109
+ "perturbed_explanation": "1. The original explanation justified that the JMoM performed reasonably well, with its mean accuracy surpassing that of the random walk. 2. However, the statement introduces an incorrect assertion that the random walk performed significantly better, which contradicts the context's data indicating JMoM's superior mean accuracy over random walk."
110
+ }
111
+ },
112
+ {
113
+ "path": "table_paper/2407.00062v1.json",
114
+ "table_id": "6",
115
+ "section": "5.2",
116
+ "all_context": [
117
+ "In the Item Jaccard model, we used the Jaccard-Item Index which is based on items rated.",
118
+ "We also used the weighted average framework approach but with the Item-Rating Difference.",
119
+ "The performance of these alternative WA models can be seen in Table 6 .",
120
+ "We can see that the Item-Jaccard form of WA outperforms the previous Jaccard WA.",
121
+ "Item-Rating Difference doesn t seem to perform very well and is quite inconsistent in its performance.",
122
+ "A potential reason for the Item-Jaccard WA model performing so well is that a user s social circle may not be the best predictor for who is similar to them, rather relating people by their items themselves does a better job.",
123
+ ""
124
+ ],
125
+ "target_context_ids": [
126
+ 2,
127
+ 4,
128
+ 5
129
+ ],
130
+ "selected_paragraphs": [
131
+ "[paragraph id = 2] The performance of these alternative WA models can be seen in Table 6 .",
132
+ "[paragraph id = 4] Item-Rating Difference doesn t seem to perform very well and is quite inconsistent in its performance.",
133
+ "[paragraph id = 5] A potential reason for the Item-Jaccard WA model performing so well is that a user s social circle may not be the best predictor for who is similar to them, rather relating people by their items themselves does a better job."
134
+ ],
135
+ "table_html": "<figure class=\"ltx_table\" id=\"S5.T6\">\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 6: </span>Mean and Standard Deviation of MAE for Item Rating Algorithms</figcaption>\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S5.T6.2\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S5.T6.2.2\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_th_row ltx_border_tt\" id=\"S5.T6.2.2.3\">Algorithm</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T6.1.1.1\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T6.2.2.2\"></th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S5.T6.2.3.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_t\" id=\"S5.T6.2.3.1.1\">Item-Jaccard WA</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T6.2.3.1.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T6.2.3.1.2.1\">1.02</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T6.2.3.1.3\">0.23</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T6.2.4.2\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S5.T6.2.4.2.1\">Jaccard WA</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T6.2.4.2.2\">1.05</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T6.2.4.2.3\">0.25</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T6.2.5.3\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S5.T6.2.5.3.1\">Item-Rating Difference WA</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T6.2.5.3.2\">1.17</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T6.2.5.3.3\">0.49</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T6.2.6.4\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S5.T6.2.6.4.1\">Universal Random</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T6.2.6.4.2\">1.30</td>\n<td class=\"ltx_td ltx_align_center\" 
id=\"S5.T6.2.6.4.3\">0.38</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T6.2.7.5\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S5.T6.2.7.5.1\">Random</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T6.2.7.5.2\">1.67</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T6.2.7.5.3\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T6.2.7.5.3.1\">0.21</span></td>\n</tr>\n</tbody>\n</table>\n</figure>",
136
+ "perturb_sentence_id": 2,
137
+ "output": {
138
+ "perturbed_statement": "[paragraph id = 2] The performance of these alternative WA models can be seen in Table 6 . We can see that the previous Jaccard WA outperforms the Item-Jaccard form of WA.",
139
+ "perturbed_explanation": "1. The original context states that the Item-Jaccard form of WA outperforms the previous Jaccard WA, which Table 6 supports (MAE of 1.02 versus 1.05). 2. The altered statement claims the opposite, that the previous Jaccard WA outperforms the Item-Jaccard WA, which is factually incorrect given the context and the table."
140
+ }
141
+ },
142
+ {
143
+ "path": "table_paper/2407.00062v1.json",
144
+ "table_id": "7",
145
+ "section": "5.3",
146
+ "all_context": [
147
+ "The Intra-Item Information concerns itself with the relationships between items themselves.",
148
+ "In a similar fashion to the previous algorithms, we manipulated the WA approach to work with Intra-Item similarity data.",
149
+ "This is slightly different from the WA framework as we iterate through each user s items rather than using a user-user similarity metric.",
150
+ "The performance of the algorithms, one based on and one based on the Pearson , can be seen in Table 7 .",
151
+ "It is evident both of the intra-item-based models underperform our previous models.",
152
+ "The intra-item similarity using the Jaccard approach outperforms the Pearson correlation approach.",
153
+ "This is a very interesting result, potentially this could be a better metric for determining the similarity of items and could be applied in [16 ].",
154
+ "It is evident the intra-item similarity conveys some information as we perform better than random, hence this could be useful in combination models in the coming sections.",
155
+ ""
156
+ ],
157
+ "target_context_ids": [
158
+ 3,
159
+ 4,
160
+ 5,
161
+ 6,
162
+ 7
163
+ ],
164
+ "selected_paragraphs": [
165
+ "[paragraph id = 3] The performance of the algorithms, one based on and one based on the Pearson , can be seen in Table 7 .",
166
+ "[paragraph id = 4] It is evident both of the intra-item-based models underperform our previous models.",
167
+ "[paragraph id = 5] The intra-item similarity using the Jaccard approach outperforms the Pearson correlation approach.",
168
+ "[paragraph id = 6] This is a very interesting result, potentially this could be a better metric for determining the similarity of items and could be applied in [16 ].",
169
+ "[paragraph id = 7] It is evident the intra-item similarity conveys some information as we perform better than random, hence this could be useful in combination models in the coming sections."
170
+ ],
171
+ "table_html": "<figure class=\"ltx_table\" id=\"S5.T7\">\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 7: </span>Mean and Standard Deviation of MAE for Intra-Item Algorithms</figcaption>\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S5.T7.2\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S5.T7.2.2\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_th_row ltx_border_tt\" id=\"S5.T7.2.2.3\">Algorithm</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T7.1.1.1\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T7.2.2.2\"></th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S5.T7.2.3.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_t\" id=\"S5.T7.2.3.1.1\">Item-Jaccard WA</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T7.2.3.1.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T7.2.3.1.2.1\">1.02</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T7.2.3.1.3\">0.23</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T7.2.4.2\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S5.T7.2.4.2.1\">Intra-Item WA</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T7.2.4.2.2\">1.24</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T7.2.4.2.3\">0.14</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T7.2.5.3\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S5.T7.2.5.3.1\">Universal Random</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T7.2.5.3.2\">1.30</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T7.2.5.3.3\">0.38</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T7.2.6.4\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S5.T7.2.6.4.1\">Intra-Item WA (Pearson)</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T7.2.6.4.2\">1.34</td>\n<td class=\"ltx_td ltx_align_center\" 
id=\"S5.T7.2.6.4.3\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T7.2.6.4.3.1\">0.13</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T7.2.7.5\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S5.T7.2.7.5.1\">Random</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T7.2.7.5.2\">1.67</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T7.2.7.5.3\">0.21</td>\n</tr>\n</tbody>\n</table>\n</figure>",
172
+ "perturb_sentence_id": 6,
173
+ "output": {
174
+ "perturbed_statement": "[paragraph id = 6] This is an unremarkable result, potentially this might not be a better metric for determining the similarity of items nor could it be applied in [16 ]. It is claimed the intra-item similarity fails to convey important information as we do not perform better than random, hence this might not be useful in combination models in the coming sections.",
175
+ "perturbed_explanation": "The original explanation highlights the following: 1. The result presented in paragraph 6 suggests that intra-item similarity could potentially serve as a beneficial metric for determining item similarity and supports the effectiveness of combined models. 2. This is based on the observation that performance exceeds random outcomes, which indicates the conveyance of meaningful information. However, the statement contradicts this by describing the result as unremarkable and asserting that intra-item similarity fails to provide meaningful contributions, which incorrectly counters the logical deductions presented in the original analysis."
176
+ }
177
+ },
178
+ {
179
+ "path": "table_paper/2407.00062v1.json",
180
+ "table_id": "9",
181
+ "section": "6.2",
182
+ "all_context": [
183
+ "This model combines the Intra-Item graph with the Item graph.",
184
+ "Thus combining information about how nodes rated individual items and how the items are interrelated.",
185
+ "The general idea was to slightly alter the item rating similarity metric, , to provide a bias taking into account the similarity of items to the item we are recommending.",
186
+ "Note for intra-item similarity we use rather than as we found it was more performant in the previous section.",
187
+ "The results can be seen in Table 9 : Though the WIRD model performed quite well still not better than the Item-Jaccard WA model.",
188
+ "However, it performs better than just the intra-item information or item-rating difference WA alone.",
189
+ "Hence, we have successfully combined the intra-item and item-rating information to achieve a better result.",
190
+ ""
191
+ ],
192
+ "target_context_ids": [
193
+ 4,
194
+ 5,
195
+ 6
196
+ ],
197
+ "selected_paragraphs": [
198
+ "[paragraph id = 4] The results can be seen in Table 9 : Though the WIRD model performed quite well still not better than the Item-Jaccard WA model.",
199
+ "[paragraph id = 5] However, it performs better than just the intra-item information or item-rating difference WA alone.",
200
+ "[paragraph id = 6] Hence, we have successfully combined the intra-item and item-rating information to achieve a better result."
201
+ ],
202
+ "table_html": "<figure class=\"ltx_table\" id=\"S6.T9\">\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 9: </span>Mean and Standard Deviation of MAE for WIRD WA Algorithm</figcaption>\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S6.T9.2\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S6.T9.2.2\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_th_row ltx_border_tt\" id=\"S6.T9.2.2.3\">Algorithm</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S6.T9.1.1.1\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S6.T9.2.2.2\"></th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S6.T9.2.3.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_t\" id=\"S6.T9.2.3.1.1\">WIRD WA</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S6.T9.2.3.1.2\">1.05</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S6.T9.2.3.1.3\">0.28</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T9.2.4.2\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S6.T9.2.4.2.1\">Item-Jaccard WA</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S6.T9.2.4.2.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S6.T9.2.4.2.2.1\">1.02</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S6.T9.2.4.2.3\">0.23</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T9.2.5.3\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S6.T9.2.5.3.1\">Intra-Item WA</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S6.T9.2.5.3.2\">1.34</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S6.T9.2.5.3.3\"><span class=\"ltx_text ltx_font_bold\" id=\"S6.T9.2.5.3.3.1\">0.13</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T9.2.6.4\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S6.T9.2.6.4.1\">Item-Rating Difference WA</th>\n<td class=\"ltx_td ltx_align_center\" 
id=\"S6.T9.2.6.4.2\">1.17</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S6.T9.2.6.4.3\">0.49</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T9.2.7.5\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S6.T9.2.7.5.1\">Universal Random</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S6.T9.2.7.5.2\">1.30</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S6.T9.2.7.5.3\">0.38</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S6.T9.2.8.6\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S6.T9.2.8.6.1\">Random</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S6.T9.2.8.6.2\">1.67</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S6.T9.2.8.6.3\">0.21</td>\n</tr>\n</tbody>\n</table>\n</figure>",
203
+ "perturb_sentence_id": 4,
204
+ "output": {
205
+ "perturbed_statement": "[paragraph id = 4] The results can be observed in Table 9: The WIRD model significantly underperformed compared to the Item-Jaccard WA model, and also did not surpass the standalone intra-item information or item-rating difference WA approaches.",
206
+ "perturbed_explanation": "1. The original context states that the WIRD model performed quite well, although not better than the Item-Jaccard WA model, while surpassing the standalone intra-item and item-rating difference WA approaches. 2. The statement incorrectly claims that the WIRD model significantly underperformed the Item-Jaccard WA model and failed to surpass the standalone approaches, contradicting the reported results."
207
+ }
208
+ },
209
+ {
210
+ "path": "table_paper/2407.00062v1.json",
211
+ "table_id": "11",
212
+ "section": "9",
213
+ "all_context": [
214
+ "We successfully broke down the concept of a recommender system to its core principles, starting with the information it ingests.",
215
+ "Through breaking down the different information forms, we noticed that the most important feature when assigning the similarity between two users for their preferences is the way they ve rated other items and which items they have engaged with.",
216
+ "The trust and intra-item-based algorithms led to recommenders with higher MAE as seen in Table 11 .",
217
+ "All other models included the item-rating information in some way, which had significantly lower MAE.",
218
+ "This result makes logical sense as it s more likely we are similar in opinion to a person who we share interests with than a person who we share friends with.",
219
+ "However, the problem with item-rating information is the information that we lack when cold-starting a user.",
220
+ "In the challenging case of cold start users, the algorithms that only use the Trust and Intra-Item information would be very beneficial, but these algorithms, unsurprisingly, perform not as well.",
221
+ "Among these, we found that the Jaccard WA approach provided the best performance, outperforming random walk and intra-item-based approaches.",
222
+ "A further idea that could be explored is simply using an approach like the Universal Random, which was one of our baseline recommenders.",
223
+ "This recommender performed reasonably when averaged across our data sets and predicts preferences based on the distribution of current ratings for said item.",
224
+ "While combining information, we found it wasn t easy to outperform the Item-Jaccard-based recommender.",
225
+ "The one case where we managed to improve upon the base Item-Jaccard model was when we performed addition with the Jaccard Index.",
226
+ "So the similarity between the two users is based on the sum of the Jaccard Index and Item-Jaccard Index.",
227
+ "We think that the performance of the recommender is increased as the Jaccard provides some marginal information which can impact the ratings in the case when a user trusts a celebrity.",
228
+ "That way the user has a non-zero Jaccard index with many other nodes who also trust said celebrity, hence, slightly shifting the rating prediction in the direction of these users.",
229
+ "We attempted combination models of various forms, combining the similarity scores with multiplication, summation, and a maximum function.",
230
+ "However, we discovered that scaling the distributions had minimal impact on the resulting performance.",
231
+ "We also concluded that simply using an equally weighted summation of the two scores leads to the simplest and most effective model.",
232
+ "Hence, the highest-performing recommendation model combined trust information and item rating information.",
233
+ "Though, it is not suitable for cold-start users.",
234
+ "We also found that recommenders based on the WA framework outperformed random-walk methods.",
235
+ "Furthermore, the WA framework outperforms a similar approach called Majority of Majorities.",
236
+ "We believe this approach to collaborative filtering is effective because it is not constrained by the structure of the graph (as is a random walk) and can produce fine-grained recommendations that can be non-integer values.",
237
+ "Whereas, the MoM approach forces a specific integer rating to be assigned and hence is not as representative of the user s preference.",
238
+ "When it came to the intra-item information, we found it performed quite poorly from the perspective of .",
239
+ "However, an interesting observation, as per Table 11 , was that the algorithms that included intra-item information were the most consistent in their performance—featuring the lowest across all data sets.",
240
+ "Thus, it can be deduced that the intra-item information is additive from a stability perspective, making a recommender perform with similar accuracy for all users.",
241
+ "This idea was reinforced when we created the fully combined model, which utilized all information types.",
242
+ "This was the “Jaccard Item-Jaccard JII Combination WA” model.",
243
+ "We found this model performed worse than the Jaccard Item-Jaccard WA model in , however, it performed better in mean .",
244
+ "Hence, in a scenario where providing consistently good predictions for all users is of importance, the introduction of intra-item information could facilitate this.",
245
+ "Through our experimentation with opposing intra-item similarity metrics, we determined that the Intra-Item Jaccard approach outperformed the Pearson Correlation metric on downstream tasks.",
246
+ "This was determined by building models upon these metrics and comparing the resulting accuracy.",
247
+ "When further testing with the Intra-Item Jaccard similarity metric was undertaken, the resulting models performed better than random, and in combination led to stable models as described earlier.",
248
+ "A recurring theme in these experiments was the very impressive performance of the Jaccard Index when used in a variety of applications.",
249
+ "It was the go-to for drawing similarity scores using all information forms and is a simple but logical way to reason about the similarity of sets.",
250
+ "Testing the performance of our algorithms on an adversarial network provided some interesting insights.",
251
+ "The most obvious of which is that item rating information is highly susceptible to an attack with fake accounts.",
252
+ "The trust graph is relatively robust to adversaries, as one must influence individuals to shift the dynamics of this information.",
253
+ "This is represented in the data by the recommendation systems involving the item rating information suffering from the most severe reductions in their MAE.",
254
+ "We found that the least impacted algorithm was the MC Random Walk.",
255
+ "As the algorithm propagates from a user, through the network to derive the rating, it will arrive at a legitimate node with a rating before venturing into the adversaries.",
256
+ "The only case where the adversaries have an impact is when no ratings are found and the randomness means a rating is taken uniformly from the network.",
257
+ "In these cases, the volume of ratings from adversaries impacts results.",
258
+ "The next best algorithm for being robust in the face of adversaries was Jaccard WA.",
259
+ "This is a Weighted Average based entirely on the Jaccard Index.",
260
+ "This algorithm performs better due to the value it places on the trust graph, very rarely do nodes have a non-zero Jaccard similarity with the bad actors, hence the high performance.",
261
+ "Future Work: In investigating intra-item information, we found that the ”Intra-Item Jaccard” metric was more effective than Pearson Correlation in determining item similarity.",
262
+ "This result suggests further exploration of the Trust-Walker [16 ], which may lead to improved performance.",
263
+ "Further research is needed to improve the computational efficiency of the WA approach.",
264
+ "Unlike, random-walk methods, the WA-based recommenders require iterating over all nodes in the graph.",
265
+ "Optimizations such as vectorization or caching similarities between users could be explored.",
266
+ "Furthermore, this work focused on fake accounts and fake ratings, but other attacks such as bribing popular nodes or adversarial censorship should be investigated in future studies.",
267
+ ""
268
+ ],
269
+ "target_context_ids": [
270
+ 2,
271
+ 3,
272
+ 7,
273
+ 25,
274
+ 26,
275
+ 27,
276
+ 28,
277
+ 29,
278
+ 30
279
+ ],
280
+ "selected_paragraphs": [
281
+ "[paragraph id = 2] The trust and intra-item-based algorithms led to recommenders with higher MAE as seen in Table 11 .",
282
+ "[paragraph id = 3] All other models included the item-rating information in some way, which had significantly lower MAE.",
283
+ "[paragraph id = 7] Among these, we found that the Jaccard WA approach provided the best performance, outperforming random walk and intra-item-based approaches.",
284
+ "[paragraph id = 25] However, an interesting observation, as per Table 11 , was that the algorithms that included intra-item information were the most consistent in their performance—featuring the lowest across all data sets.",
285
+ "[paragraph id = 26] Thus, it can be deduced that the intra-item information is additive from a stability perspective, making a recommender perform with similar accuracy for all users.",
286
+ "[paragraph id = 27] This idea was reinforced when we created the fully combined model, which utilized all information types.",
287
+ "[paragraph id = 28] This was the “Jaccard Item-Jaccard JII Combination WA” model.",
288
+ "[paragraph id = 29] We found this model performed worse than the Jaccard Item-Jaccard WA model in mean MAE, however, it performed better in mean standard deviation.",
289
+ "[paragraph id = 30] Hence, in a scenario where providing consistently good predictions for all users is of importance, the introduction of intra-item information could facilitate this."
290
+ ],
291
+ "table_html": "<figure class=\"ltx_table\" id=\"S7.T11\">\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 11: </span>Mean and Standard Deviation of MAE for All Algorithms on Different Data Sets</figcaption>\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S7.T11.6\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S7.T11.6.7.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_border_tt\" id=\"S7.T11.6.7.1.1\" rowspan=\"2\"><span class=\"ltx_text\" id=\"S7.T11.6.7.1.1.1\">Algorithm</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" colspan=\"3\" id=\"S7.T11.6.7.1.2\">Epinions</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" colspan=\"3\" id=\"S7.T11.6.7.1.3\">FilmTrust</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" colspan=\"2\" id=\"S7.T11.6.7.1.4\">CiaoDVD</th>\n</tr>\n<tr class=\"ltx_tr\" id=\"S7.T11.6.6\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column\" id=\"S7.T11.1.1.1\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column\" id=\"S7.T11.2.2.2\"></th>\n<th class=\"ltx_td ltx_th ltx_th_column ltx_border_r\" id=\"S7.T11.6.6.7\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column\" id=\"S7.T11.3.3.3\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column\" id=\"S7.T11.4.4.4\"></th>\n<th class=\"ltx_td ltx_th ltx_th_column ltx_border_r\" id=\"S7.T11.6.6.8\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column\" id=\"S7.T11.5.5.5\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column\" id=\"S7.T11.6.6.6\"></th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S7.T11.6.8.1\">\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S7.T11.6.8.1.1\">Jaccard Item-Jaccard WA</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S7.T11.6.8.1.2\"><span class=\"ltx_text 
ltx_font_bold\" id=\"S7.T11.6.8.1.2.1\">1.00</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S7.T11.6.8.1.3\">0.26</td>\n<td class=\"ltx_td ltx_border_r ltx_border_t\" id=\"S7.T11.6.8.1.4\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S7.T11.6.8.1.5\"><span class=\"ltx_text ltx_font_bold\" id=\"S7.T11.6.8.1.5.1\">0.66</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S7.T11.6.8.1.6\">0.08</td>\n<td class=\"ltx_td ltx_border_r ltx_border_t\" id=\"S7.T11.6.8.1.7\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S7.T11.6.8.1.8\"><span class=\"ltx_text ltx_font_bold\" id=\"S7.T11.6.8.1.8.1\">0.53</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S7.T11.6.8.1.9\"><span class=\"ltx_text ltx_font_bold\" id=\"S7.T11.6.8.1.9.1\">0.28</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S7.T11.6.9.2\">\n<td class=\"ltx_td ltx_align_left\" id=\"S7.T11.6.9.2.1\">Item-Jaccard WA</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.9.2.2\">1.02</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.9.2.3\">0.23</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.9.2.4\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.9.2.5\">0.67</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.9.2.6\">0.08</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.9.2.7\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.9.2.8\">0.58</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.9.2.9\">0.32</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S7.T11.6.10.3\">\n<td class=\"ltx_td ltx_align_left\" id=\"S7.T11.6.10.3.1\">Jaccard WA</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.10.3.2\">1.05</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.10.3.3\">0.25</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.10.3.4\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.10.3.5\">1.14</td>\n<td class=\"ltx_td ltx_align_center\" 
id=\"S7.T11.6.10.3.6\">0.08</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.10.3.7\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.10.3.8\">1.73</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.10.3.9\">0.36</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S7.T11.6.11.4\">\n<td class=\"ltx_td ltx_align_left\" id=\"S7.T11.6.11.4.1\">WIRD WA</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.11.4.2\">1.05</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.11.4.3\">0.28</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.11.4.4\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.11.4.5\">0.67</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.11.4.6\"><span class=\"ltx_text ltx_font_bold\" id=\"S7.T11.6.11.4.6.1\">0.04</span></td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.11.4.7\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.11.4.8\">0.72</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.11.4.9\">0.37</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S7.T11.6.12.5\">\n<td class=\"ltx_td ltx_align_left\" id=\"S7.T11.6.12.5.1\">Jaccard Item-Jaccard JII Combination WA</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.12.5.2\">1.07</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.12.5.3\">0.22</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.12.5.4\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.12.5.5\">0.69</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.12.5.6\">0.06</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.12.5.7\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.12.5.8\">0.63</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.12.5.9\">0.30</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S7.T11.6.13.6\">\n<td class=\"ltx_td ltx_align_left\" id=\"S7.T11.6.13.6.1\">JWIRD WA</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.13.6.2\">1.09</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.13.6.3\">0.27</td>\n<td 
class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.13.6.4\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.13.6.5\">0.67</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.13.6.6\">0.07</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.13.6.7\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.13.6.8\">0.72</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.13.6.9\">0.40</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S7.T11.6.14.7\">\n<td class=\"ltx_td ltx_align_left\" id=\"S7.T11.6.14.7.1\">Jaccard MoM</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.14.7.2\">1.13</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.14.7.3\">0.30</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.14.7.4\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.14.7.5\">1.19</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.14.7.6\">0.09</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.14.7.7\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.14.7.8\">1.77</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.14.7.9\">0.37</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S7.T11.6.15.8\">\n<td class=\"ltx_td ltx_align_left\" id=\"S7.T11.6.15.8.1\">Jaccard Monte-Carlo Random Walk</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.15.8.2\">1.14</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.15.8.3\">0.22</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.15.8.4\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.15.8.5\">1.20</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.15.8.6\">0.08</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.15.8.7\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.15.8.8\">1.81</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.15.8.9\">0.49</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S7.T11.6.16.9\">\n<td class=\"ltx_td ltx_align_left\" id=\"S7.T11.6.16.9.1\">Monte-Carlo Random Walk</td>\n<td class=\"ltx_td ltx_align_center\" 
id=\"S7.T11.6.16.9.2\">1.16</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.16.9.3\">0.23</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.16.9.4\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.16.9.5\">1.20</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.16.9.6\">0.08</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.16.9.7\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.16.9.8\">1.82</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.16.9.9\">0.35</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S7.T11.6.17.10\">\n<td class=\"ltx_td ltx_align_left\" id=\"S7.T11.6.17.10.1\">Item-Rating Difference WA</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.17.10.2\">1.17</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.17.10.3\">0.49</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.17.10.4\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.17.10.5\">0.67</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.17.10.6\">0.08</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.17.10.7\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.17.10.8\">0.79</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.17.10.9\">0.41</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S7.T11.6.18.11\">\n<td class=\"ltx_td ltx_align_left\" id=\"S7.T11.6.18.11.1\">Jaccard Intra-Item WA</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.18.11.2\">1.20</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.18.11.3\"><span class=\"ltx_text ltx_font_bold\" id=\"S7.T11.6.18.11.3.1\">0.13</span></td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.18.11.4\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.18.11.5\">1.07</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.18.11.6\">0.06</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.18.11.7\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.18.11.8\">1.75</td>\n<td class=\"ltx_td ltx_align_center\" 
id=\"S7.T11.6.18.11.9\">0.32</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S7.T11.6.19.12\">\n<td class=\"ltx_td ltx_align_left\" id=\"S7.T11.6.19.12.1\">Intra-Item WA</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.19.12.2\">1.24</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.19.12.3\">0.14</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.19.12.4\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.19.12.5\">1.18</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.19.12.6\">0.08</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.19.12.7\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.19.12.8\">1.70</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.19.12.9\">0.45</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S7.T11.6.20.13\">\n<td class=\"ltx_td ltx_align_left\" id=\"S7.T11.6.20.13.1\">Median of Neighbours</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.20.13.2\">1.27</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.20.13.3\">0.19</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.20.13.4\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.20.13.5\">1.26</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.20.13.6\">0.08</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.20.13.7\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.20.13.8\">1.76</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.20.13.9\"><span class=\"ltx_text ltx_font_bold\" id=\"S7.T11.6.20.13.9.1\">0.28</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S7.T11.6.21.14\">\n<td class=\"ltx_td ltx_align_left\" id=\"S7.T11.6.21.14.1\">Mean of Neighbours</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.21.14.2\">1.27</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.21.14.3\">0.25</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.21.14.4\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.21.14.5\">1.25</td>\n<td class=\"ltx_td ltx_align_center\" 
id=\"S7.T11.6.21.14.6\">0.07</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.21.14.7\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.21.14.8\">1.67</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.21.14.9\">0.43</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S7.T11.6.22.15\">\n<td class=\"ltx_td ltx_align_left\" id=\"S7.T11.6.22.15.1\">Universal Random</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.22.15.2\">1.30</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.22.15.3\">0.38</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.22.15.4\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.22.15.5\">0.89</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.22.15.6\">0.11</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.22.15.7\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.22.15.8\">0.72</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.22.15.9\">0.58</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S7.T11.6.23.16\">\n<td class=\"ltx_td ltx_align_left\" id=\"S7.T11.6.23.16.1\">Jaccard Weighted Neighbours</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.23.16.2\">1.31</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.23.16.3\">0.32</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.23.16.4\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.23.16.5\">1.22</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.23.16.6\">0.10</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.23.16.7\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.23.16.8\">1.66</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.23.16.9\">0.44</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S7.T11.6.24.17\">\n<td class=\"ltx_td ltx_align_left\" id=\"S7.T11.6.24.17.1\">Mode of Neighbours</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.24.17.2\">1.32</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.24.17.3\">0.23</td>\n<td class=\"ltx_td ltx_border_r\" 
id=\"S7.T11.6.24.17.4\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.24.17.5\">1.23</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.24.17.6\">0.09</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.24.17.7\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.24.17.8\">1.67</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.24.17.9\">0.43</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S7.T11.6.25.18\">\n<td class=\"ltx_td ltx_align_left\" id=\"S7.T11.6.25.18.1\">Intra-Item WA (Pearson)</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.25.18.2\">1.34</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.25.18.3\"><span class=\"ltx_text ltx_font_bold\" id=\"S7.T11.6.25.18.3.1\">0.13</span></td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.25.18.4\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.25.18.5\">1.21</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.25.18.6\">0.07</td>\n<td class=\"ltx_td ltx_border_r\" id=\"S7.T11.6.25.18.7\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.25.18.8\">1.67</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S7.T11.6.25.18.9\">0.50</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S7.T11.6.26.19\">\n<td class=\"ltx_td ltx_align_left ltx_border_bb\" id=\"S7.T11.6.26.19.1\">Random</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S7.T11.6.26.19.2\">1.67</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S7.T11.6.26.19.3\">0.21</td>\n<td class=\"ltx_td ltx_border_bb ltx_border_r\" id=\"S7.T11.6.26.19.4\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S7.T11.6.26.19.5\">1.34</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S7.T11.6.26.19.6\">0.07</td>\n<td class=\"ltx_td ltx_border_bb ltx_border_r\" id=\"S7.T11.6.26.19.7\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S7.T11.6.26.19.8\">1.82</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" 
id=\"S7.T11.6.26.19.9\">0.31</td>\n</tr>\n</tbody>\n</table>\n</figure>",
292
+ "perturb_sentence_id": 25,
293
+ "output": {
294
+ "perturbed_statement": "[paragraph id = 25] However, an interesting observation, as per Table 11 , was that the algorithms that excluded intra-item information were the most consistent in their performance—featuring the lowest standard deviation across all data sets. Thus, it can be deduced that the intra-item information is counterproductive from a stability perspective, making a recommender perform with inconsistent accuracy for all users.",
295
+ "perturbed_explanation": "1. The algorithms incorporating intra-item information were observed to deliver consistent accuracy across various datasets, showcasing their additive effect on recommendation stability. 2. In the statement, it is incorrectly claimed that the algorithms excluding intra-item information were more consistent. This contradicts the analysis presented, which supports the inclusion of intra-item data for achieving consistency."
296
+ }
297
+ }
298
+ ]
table_result/2407.00064v1_output.json ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "path": "table_paper/2407.00064v1.json",
4
+ "table_id": "1",
5
+ "section": "4.4",
6
+ "all_context": [
7
+ "The final step of our approach includes the verification of the solution model based on the constraints associated with the RM.",
8
+ "This verification is performed on the RDF representations of the models.",
9
+ "An aligned ontology is generated by merging the reference ontology and the solution.",
10
+ "Using an RDFS reasoner, the knowledge resulting from the transitivity of the subclass relationship is inferred.",
11
+ "This ensures that instances of a subclass are also instances of the superordinate classes.",
12
+ "Finally, the constraints are verified.",
13
+ "Therefore, the defined queries are executed on the merged ontology and the results are evaluated accordingly.",
14
+ "The result is a listing of all conditions with their corresponding truth values and the retrieved values.",
15
+ "For our RM, 11 queries were formulated.",
16
+ "The queries and the requirements defined in Section 2 would be evaluated as shown in Table 1 .",
17
+ ""
18
+ ],
19
+ "target_context_ids": [
20
+ 8,
21
+ 9
22
+ ],
23
+ "selected_paragraphs": [
24
+ "[paragraph id = 8] For our RM, 11 queries were formulated.",
25
+ "[paragraph id = 9] The queries and the requirements defined in Section 2 would be evaluated as shown in Table 1 ."
26
+ ],
27
+ "table_html": "<figure class=\"ltx_table\" id=\"S4.T1\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S4.T1.1\">\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1\">\n<td class=\"ltx_td ltx_align_left\" id=\"S4.T1.1.1.1\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S4.T1.1.1.1.1\">\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.1.1.1\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.1.1\">q1</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.1.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.1.1.1.1.1.2.1\">\n<span class=\"ltx_p\" id=\"S4.T1.1.1.1.1.1.2.1.1\" style=\"width:170.7pt;\"><span class=\"ltx_text ltx_font_italic\" id=\"S4.T1.1.1.1.1.1.2.1.1.1\">COUNT:</span> all cars</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.1.1.1.1.1.3\">1</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.1.1.2\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.2.1\">q2</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.2.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.1.1.1.1.2.2.1\">\n<span class=\"ltx_p\" id=\"S4.T1.1.1.1.1.2.2.1.1\" style=\"width:170.7pt;\"><span class=\"ltx_text ltx_font_italic\" id=\"S4.T1.1.1.1.1.2.2.1.1.1\">COUNT:</span> cars with 4 wheels</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.1.1.1.1.2.3\">1</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.1.1.3\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.3.1\">q3</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.3.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.1.1.1.1.3.2.1\">\n<span class=\"ltx_p\" id=\"S4.T1.1.1.1.1.3.2.1.1\" style=\"width:170.7pt;\"><span class=\"ltx_text ltx_font_italic\" 
id=\"S4.T1.1.1.1.1.3.2.1.1.1\">COUNT:</span> cars with 1 transmission gear</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.1.1.1.1.3.3\">1</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.1.1.4\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.4.1\">q4</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.4.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.1.1.1.1.4.2.1\">\n<span class=\"ltx_p\" id=\"S4.T1.1.1.1.1.4.2.1.1\" style=\"width:170.7pt;\"><span class=\"ltx_text ltx_font_italic\" id=\"S4.T1.1.1.1.1.4.2.1.1.1\">ASK:</span> transmission is a manual and an automatic transmission</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.1.1.1.1.4.3\">false</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.1.1.5\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.5.1\">q5</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.5.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.1.1.1.1.5.2.1\">\n<span class=\"ltx_p\" id=\"S4.T1.1.1.1.1.5.2.1.1\" style=\"width:170.7pt;\"><span class=\"ltx_text ltx_font_italic\" id=\"S4.T1.1.1.1.1.5.2.1.1.1\">ASK:</span> cars with less than one engine</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.1.1.1.1.5.3\">false</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.1.1.6\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.6.1\">q6</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.6.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.1.1.1.1.6.2.1\">\n<span class=\"ltx_p\" id=\"S4.T1.1.1.1.1.6.2.1.1\" style=\"width:170.7pt;\"><span class=\"ltx_text ltx_font_italic\" id=\"S4.T1.1.1.1.1.6.2.1.1.1\">ASK:</span> cars with more than one 
combustion engine</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.1.1.1.1.6.3\">false</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.1.1.7\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.7.1\">q7</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.7.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.1.1.1.1.7.2.1\">\n<span class=\"ltx_p\" id=\"S4.T1.1.1.1.1.7.2.1.1\" style=\"width:170.7pt;\"><span class=\"ltx_text ltx_font_italic\" id=\"S4.T1.1.1.1.1.7.2.1.1.1\">COUNT:</span> cars with at least one combustion engine</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.1.1.1.1.7.3\">1</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.1.1.8\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.8.1\">q8</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.8.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.1.1.1.1.8.2.1\">\n<span class=\"ltx_p\" id=\"S4.T1.1.1.1.1.8.2.1.1\" style=\"width:170.7pt;\"><span class=\"ltx_text ltx_font_italic\" id=\"S4.T1.1.1.1.1.8.2.1.1.1\">COUNT:</span> cars with at least one electric engine</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.1.1.1.1.8.3\">1</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.1.1.9\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.9.1\">q9</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.9.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.1.1.1.1.9.2.1\">\n<span class=\"ltx_p\" id=\"S4.T1.1.1.1.1.9.2.1.1\" style=\"width:170.7pt;\"><span class=\"ltx_text ltx_font_italic\" id=\"S4.T1.1.1.1.1.9.2.1.1.1\">COUNT:</span> cars with a combustion engine and a fuel tank</span>\n</span>\n</td>\n<td class=\"ltx_td 
ltx_align_center ltx_border_t\" id=\"S4.T1.1.1.1.1.9.3\">1</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.1.1.10\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.10.1\">q10</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.10.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.1.1.1.1.10.2.1\">\n<span class=\"ltx_p\" id=\"S4.T1.1.1.1.1.10.2.1.1\" style=\"width:170.7pt;\"><span class=\"ltx_text ltx_font_italic\" id=\"S4.T1.1.1.1.1.10.2.1.1.1\">COUNT:</span> cars with an electric engine and a battery</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.1.1.1.1.10.3\">0</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.1.1.11\">\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.11.1\">q11</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_b ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.11.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.1.1.1.1.11.2.1\">\n<span class=\"ltx_p\" id=\"S4.T1.1.1.1.1.11.2.1.1\" style=\"width:170.7pt;\"><span class=\"ltx_text ltx_font_italic\" id=\"S4.T1.1.1.1.1.11.2.1.1.1\">COUNT:</span> cars with an electric engine and a automatic transmission</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_t\" id=\"S4.T1.1.1.1.1.11.3\">0</td>\n</tr>\n</table>\n</td>\n<td class=\"ltx_td ltx_align_right\" id=\"S4.T1.1.1.2\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S4.T1.1.1.2.1\">\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.2.1.1\">\n<td class=\"ltx_td ltx_align_center ltx_border_l ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.1.1\">r1</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.1.2\">q1 = q2</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.1.3\">true</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.2.1.2\">\n<td 
class=\"ltx_td ltx_align_center ltx_border_l ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.2.1\">r2</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.2.2\">(q1 = q3) AND NOT q4</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.2.3\">true</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.2.1.3\">\n<td class=\"ltx_td ltx_align_center ltx_border_l ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.3.1\">r3</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.3.2\">NOT(q5 AND q6)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.3.3\">true</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.2.1.4\">\n<td class=\"ltx_td ltx_align_center ltx_border_l ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.4.1\">r4</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.4.2\">(q7=q9) AND (q8=q10)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.4.3\">false</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.2.1.5\">\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_l ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.5.1\">r5</td>\n<td class=\"ltx_td ltx_align_left ltx_border_b ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.5.2\">q8=q11</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.5.3\">false</td>\n</tr>\n</table>\n</td>\n</tr>\n</table>\n<figcaption class=\"ltx_caption\"><span class=\"ltx_tag ltx_tag_table\">Table 1: </span>Queries and constraints with results for the car example</figcaption>\n</figure>",
28
+ "perturb_sentence_id": 8,
29
+ "output": {
30
+ "perturbed_statement": "[paragraph id = 8] For our RM, 15 queries were formulated. The queries and the requirements defined in Section 2 would be evaluated as shown in Table 1 .",
31
+ "perturbed_explanation": "1. The original statement indicates that 11 queries were formulated under RM. 2. The statement claims 15 queries were formulated, which inaccurately changes the number of queries from 11 to 15, thereby presenting an incorrect piece of information. It is essential to note the precise amount to ensure factual accuracy."
32
+ }
33
+ },
34
+ {
35
+ "path": "table_paper/2407.00064v1.json",
36
+ "table_id": "1",
37
+ "section": "5",
38
+ "all_context": [
39
+ "For the validation of our concept we designed a prototype as a standalone application in C#, called Reference Modeling Assistant (RMA).",
40
+ "Figure 6 shows an overview of the different RM.",
41
+ "Besides the name and the description, it is possible to assign tags to the models in order to facilitate the search for suitable models [1 ].",
42
+ "Thereby, a RM can implement other RMs.",
43
+ "Thus, a multiple inheritance can be realized, whereby the constraints are taken over.",
44
+ "For example, a RM can exist for the general modeling of vehicles and other models that are specifically designed for the development of electric or combustion vehicles.",
45
+ "The general constraints then apply to both models.",
46
+ "With RMA it is possible to create simple ontologies using dotNETRDF (https://www.dotnetrdf.org), an open source .NET library for working with RDF triples.",
47
+ "An import of existing rdf or owl files is possible as well.",
48
+ "Once the RM have been constructed, a solution model can be created on their basis or an existing model can be validated against them.",
49
+ "Figure 7 shows the Solution Builder.",
50
+ "On the left side it displays the taxonomy with the classes of the RM and below the related instances.",
51
+ "The other two tabs give a choice for the available relations and attributes.",
52
+ "The middle column shows the created solution model, as we already know it from Figure 5 .",
53
+ "Depending on which tab is selected, the respective items can be added, removed and edited.",
54
+ "On the right side the constraints of the RM are shown.",
55
+ "These are validated directly when the solution is changed.",
56
+ "We see the queries listed in Table 1 with their results and the constraints constructed from them.",
57
+ "The tree view shows the nesting of the logical statements.",
58
+ "A constrained is fulfilled when its upper node is displayed in green.",
59
+ "Unfulfilled constraints (4 and 5) are shown in red and queries which only returned values are shown in grey.",
60
+ "On the upper level, this representation allows a quick check of the extent to which a solution conforms to the constraints of a RM.",
61
+ "On the lower levels, the details of a query can be analyzed and appropriate corrections can be made to the model.",
62
+ "The result can be exported from the RMA as RDF file.",
63
+ "Afterwards, the content can be integrated into the individual modeling environment via transformation.",
64
+ "To demonstrate this step, we have developed an Add-In for the modeling tool Sparx Enterprise Architect (EA).",
65
+ "This allows the conversion of EA models as shown in Figure 1 and 5 to RDF models and vice versa.",
66
+ "With the representation of the RM as an ontology a cross-language provision of a semantically enriched vocabulary is achieved.",
67
+ "The concepts and instances described within can be used to generate multiple solution models across different modeling languages.",
68
+ "It has been observed that the form of the constraints is strongly coupled to the modeling language and frameworks used.",
69
+ "The formalization of a RM requires some effort.",
70
+ "This is usually only profitable if the RM is reused often enough like in our example, where RMs are used to configure multiple vehicles.",
71
+ "Hence, the use of this approach is especially suitable for recurring modeling tasks and for those where high demands are made on model verification.",
72
+ ""
73
+ ],
74
+ "target_context_ids": [
75
+ 17,
76
+ 18,
77
+ 19,
78
+ 20,
79
+ 21,
80
+ 22,
81
+ 23
82
+ ],
83
+ "selected_paragraphs": [
84
+ "[paragraph id = 17] We see the queries listed in Table 1 with their results and the constraints constructed from them.",
85
+ "[paragraph id = 18] The tree view shows the nesting of the logical statements.",
86
+ "[paragraph id = 19] A constrained is fulfilled when its upper node is displayed in green.",
87
+ "[paragraph id = 20] Unfulfilled constraints (4 and 5) are shown in red and queries which only returned values are shown in grey.",
88
+ "[paragraph id = 21] On the upper level, this representation allows a quick check of the extent to which a solution conforms to the constraints of a RM.",
89
+ "[paragraph id = 22] On the lower levels, the details of a query can be analyzed and appropriate corrections can be made to the model.",
90
+ "[paragraph id = 23] The result can be exported from the RMA as RDF file."
91
+ ],
92
+ "table_html": "<figure class=\"ltx_table\" id=\"S4.T1\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S4.T1.1\">\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1\">\n<td class=\"ltx_td ltx_align_left\" id=\"S4.T1.1.1.1\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S4.T1.1.1.1.1\">\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.1.1.1\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.1.1\">q1</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.1.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.1.1.1.1.1.2.1\">\n<span class=\"ltx_p\" id=\"S4.T1.1.1.1.1.1.2.1.1\" style=\"width:170.7pt;\"><span class=\"ltx_text ltx_font_italic\" id=\"S4.T1.1.1.1.1.1.2.1.1.1\">COUNT:</span> all cars</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.1.1.1.1.1.3\">1</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.1.1.2\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.2.1\">q2</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.2.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.1.1.1.1.2.2.1\">\n<span class=\"ltx_p\" id=\"S4.T1.1.1.1.1.2.2.1.1\" style=\"width:170.7pt;\"><span class=\"ltx_text ltx_font_italic\" id=\"S4.T1.1.1.1.1.2.2.1.1.1\">COUNT:</span> cars with 4 wheels</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.1.1.1.1.2.3\">1</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.1.1.3\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.3.1\">q3</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.3.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.1.1.1.1.3.2.1\">\n<span class=\"ltx_p\" id=\"S4.T1.1.1.1.1.3.2.1.1\" style=\"width:170.7pt;\"><span class=\"ltx_text ltx_font_italic\" 
id=\"S4.T1.1.1.1.1.3.2.1.1.1\">COUNT:</span> cars with 1 transmission gear</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.1.1.1.1.3.3\">1</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.1.1.4\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.4.1\">q4</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.4.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.1.1.1.1.4.2.1\">\n<span class=\"ltx_p\" id=\"S4.T1.1.1.1.1.4.2.1.1\" style=\"width:170.7pt;\"><span class=\"ltx_text ltx_font_italic\" id=\"S4.T1.1.1.1.1.4.2.1.1.1\">ASK:</span> transmission is a manual and an automatic transmission</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.1.1.1.1.4.3\">false</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.1.1.5\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.5.1\">q5</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.5.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.1.1.1.1.5.2.1\">\n<span class=\"ltx_p\" id=\"S4.T1.1.1.1.1.5.2.1.1\" style=\"width:170.7pt;\"><span class=\"ltx_text ltx_font_italic\" id=\"S4.T1.1.1.1.1.5.2.1.1.1\">ASK:</span> cars with less than one engine</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.1.1.1.1.5.3\">false</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.1.1.6\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.6.1\">q6</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.6.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.1.1.1.1.6.2.1\">\n<span class=\"ltx_p\" id=\"S4.T1.1.1.1.1.6.2.1.1\" style=\"width:170.7pt;\"><span class=\"ltx_text ltx_font_italic\" id=\"S4.T1.1.1.1.1.6.2.1.1.1\">ASK:</span> cars with more than one 
combustion engine</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.1.1.1.1.6.3\">false</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.1.1.7\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.7.1\">q7</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.7.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.1.1.1.1.7.2.1\">\n<span class=\"ltx_p\" id=\"S4.T1.1.1.1.1.7.2.1.1\" style=\"width:170.7pt;\"><span class=\"ltx_text ltx_font_italic\" id=\"S4.T1.1.1.1.1.7.2.1.1.1\">COUNT:</span> cars with at least one combustion engine</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.1.1.1.1.7.3\">1</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.1.1.8\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.8.1\">q8</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.8.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.1.1.1.1.8.2.1\">\n<span class=\"ltx_p\" id=\"S4.T1.1.1.1.1.8.2.1.1\" style=\"width:170.7pt;\"><span class=\"ltx_text ltx_font_italic\" id=\"S4.T1.1.1.1.1.8.2.1.1.1\">COUNT:</span> cars with at least one electric engine</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.1.1.1.1.8.3\">1</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.1.1.9\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.9.1\">q9</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.9.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.1.1.1.1.9.2.1\">\n<span class=\"ltx_p\" id=\"S4.T1.1.1.1.1.9.2.1.1\" style=\"width:170.7pt;\"><span class=\"ltx_text ltx_font_italic\" id=\"S4.T1.1.1.1.1.9.2.1.1.1\">COUNT:</span> cars with a combustion engine and a fuel tank</span>\n</span>\n</td>\n<td class=\"ltx_td 
ltx_align_center ltx_border_t\" id=\"S4.T1.1.1.1.1.9.3\">1</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.1.1.10\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.10.1\">q10</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.10.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.1.1.1.1.10.2.1\">\n<span class=\"ltx_p\" id=\"S4.T1.1.1.1.1.10.2.1.1\" style=\"width:170.7pt;\"><span class=\"ltx_text ltx_font_italic\" id=\"S4.T1.1.1.1.1.10.2.1.1.1\">COUNT:</span> cars with an electric engine and a battery</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.1.1.1.1.10.3\">0</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.1.1.11\">\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.11.1\">q11</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_b ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1.11.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.1.1.1.1.11.2.1\">\n<span class=\"ltx_p\" id=\"S4.T1.1.1.1.1.11.2.1.1\" style=\"width:170.7pt;\"><span class=\"ltx_text ltx_font_italic\" id=\"S4.T1.1.1.1.1.11.2.1.1.1\">COUNT:</span> cars with an electric engine and a automatic transmission</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_t\" id=\"S4.T1.1.1.1.1.11.3\">0</td>\n</tr>\n</table>\n</td>\n<td class=\"ltx_td ltx_align_right\" id=\"S4.T1.1.1.2\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S4.T1.1.1.2.1\">\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.2.1.1\">\n<td class=\"ltx_td ltx_align_center ltx_border_l ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.1.1\">r1</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.1.2\">q1 = q2</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.1.3\">true</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.2.1.2\">\n<td 
class=\"ltx_td ltx_align_center ltx_border_l ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.2.1\">r2</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.2.2\">(q1 = q3) AND NOT q4</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.2.3\">true</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.2.1.3\">\n<td class=\"ltx_td ltx_align_center ltx_border_l ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.3.1\">r3</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.3.2\">NOT(q5 AND q6)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.3.3\">true</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.2.1.4\">\n<td class=\"ltx_td ltx_align_center ltx_border_l ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.4.1\">r4</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.4.2\">(q7=q9) AND (q8=q10)</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.4.3\">false</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.2.1.5\">\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_l ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.5.1\">r5</td>\n<td class=\"ltx_td ltx_align_left ltx_border_b ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.5.2\">q8=q11</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.2.1.5.3\">false</td>\n</tr>\n</table>\n</td>\n</tr>\n</table>\n<figcaption class=\"ltx_caption\"><span class=\"ltx_tag ltx_tag_table\">Table 1: </span>Queries and constraints with results for the car example</figcaption>\n</figure>",
93
+ "perturb_sentence_id": 20,
94
+ "output": {
95
+ "perturbed_statement": "[paragraph id = 20] Unfulfilled constraints (4 and 5) are shown in blue and queries only returning null values are shown in grey. On the upper level, this representation allows a quick check of the extent to which a solution conforms to the constraints of a RM.",
96
+ "perturbed_explanation": "Original Explanation: Constraints that are unfulfilled are represented in red, distinguishing their status, and queries which only returned values appear in grey, providing quick categorization. 1. The altered statement incorrectly claims that unfulfilled constraints are shown in blue, which contradicts the information specifying they are displayed in red. This creates a misunderstanding about the visual cue for unfulfilled constraints."
97
+ }
98
+ }
99
+ ]
table_result/2407.00071v1_output.json ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "path": "table_paper/2407.00071v1.json",
4
+ "table_id": "1",
5
+ "section": "2.2",
6
+ "all_context": [
7
+ "There have been many papers that suggest that LLMs can indeed reason (?",
8
+ "For each subsequent revision of LLMs - GPT4 / Gemini / and Llama3, reasoning benchmarks such as BIG-Bench-Hard, HellaSwag, and MMLU show ever improving results.",
9
+ "However, these results are not a good indicator for the autonomous reasoning capabilities of the model.",
10
+ "In each case, the benchmarks are performed using in-context learning, with few-shot (specific examplars) or Chain of Thought (CoT), for which humans manually develop exemplars using labeled datasets to improve performance.",
11
+ "The latest language models do not report the zero-shot performance on these benchmark as in seen Table 1 since the performance is likely poorer than those with manual prompts.",
12
+ "Thus we believe the next milestone for LLMs is automatic prompt generation with correct reasoning.",
13
+ "The main inspiration for our work comes from Yan LeCun s review (?)",
14
+ "which suggests multiple models need to work together to emulate general intelligence and that human brain possibly calculates a “cost function” for reasoning in a gradient-free manner - similar to combinatorial optimization.",
15
+ ""
16
+ ],
17
+ "target_context_ids": [
18
+ 1,
19
+ 3,
20
+ 4
21
+ ],
22
+ "selected_paragraphs": [
23
+ "[paragraph id = 1] For each subsequent revision of LLMs - GPT4 / Gemini / and Llama3, reasoning benchmarks such as BIG-Bench-Hard, HellaSwag, and MMLU show ever improving results.",
24
+ "[paragraph id = 3] In each case, the benchmarks are performed using in-context learning, with few-shot (specific examplars) or Chain of Thought (CoT), for which humans manually develop exemplars using labeled datasets to improve performance.",
25
+ "[paragraph id = 4] The latest language models do not report the zero-shot performance on these benchmark as in seen Table 1 since the performance is likely poorer than those with manual prompts."
26
+ ],
27
+ "table_html": "<figure class=\"ltx_table\" id=\"S2.T1\">\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S2.T1.1\">\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S2.T1.1.1.1\">\n<th class=\"ltx_td ltx_th ltx_th_row ltx_border_tt\" id=\"S2.T1.1.1.1.1\"></th>\n<td class=\"ltx_td ltx_align_right ltx_border_tt\" id=\"S2.T1.1.1.1.2\">Gemini Ultra</td>\n<td class=\"ltx_td ltx_align_right ltx_border_tt\" id=\"S2.T1.1.1.1.3\">GPT-4</td>\n<td class=\"ltx_td ltx_align_right ltx_border_tt\" id=\"S2.T1.1.1.1.4\">LLama3 70B</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.1.2.2\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S2.T1.1.2.2.1\">MMLU</th>\n<td class=\"ltx_td ltx_align_right ltx_border_t\" id=\"S2.T1.1.2.2.2\">90.04% CoT@32</td>\n<td class=\"ltx_td ltx_align_right ltx_border_t\" id=\"S2.T1.1.2.2.3\">86.4% 5-shot</td>\n<td class=\"ltx_td ltx_align_right ltx_border_t\" id=\"S2.T1.1.2.2.4\">79.5% 5-shot</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.1.3.3\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_t\" id=\"S2.T1.1.3.3.1\">GSM8K</th>\n<td class=\"ltx_td ltx_align_right ltx_border_t\" id=\"S2.T1.1.3.3.2\">94.4% Maj1@32</td>\n<td class=\"ltx_td ltx_align_right ltx_border_t\" id=\"S2.T1.1.3.3.3\">92% 5-Shot CoT</td>\n<td class=\"ltx_td ltx_align_right ltx_border_t\" id=\"S2.T1.1.3.3.4\">93.0 8-shot</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.1.4.4\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_t\" id=\"S2.T1.1.4.4.1\">MATH</th>\n<td class=\"ltx_td ltx_align_right ltx_border_t\" id=\"S2.T1.1.4.4.2\">53.2% 4-shot</td>\n<td class=\"ltx_td ltx_border_t\" id=\"S2.T1.1.4.4.3\"></td>\n<td class=\"ltx_td ltx_align_right ltx_border_t\" id=\"S2.T1.1.4.4.4\">50.4 4-shot</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.1.5.5\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_t\" id=\"S2.T1.1.5.5.1\">BIG-Bench-Hard</th>\n<td class=\"ltx_td ltx_align_right ltx_border_t\" 
id=\"S2.T1.1.5.5.2\">83.6% 3-shot</td>\n<td class=\"ltx_td ltx_border_t\" id=\"S2.T1.1.5.5.3\"></td>\n<td class=\"ltx_td ltx_align_right ltx_border_t\" id=\"S2.T1.1.5.5.4\">81.3 3-shot, CoT</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.1.6.6\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S2.T1.1.6.6.1\">DROP</th>\n<td class=\"ltx_td ltx_align_right\" id=\"S2.T1.1.6.6.2\">82.4% Variable shot</td>\n<td class=\"ltx_td ltx_align_right\" id=\"S2.T1.1.6.6.3\">80.9 3-shot</td>\n<td class=\"ltx_td ltx_align_right\" id=\"S2.T1.1.6.6.4\">79.7 3-shot,F1</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.1.7.7\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S2.T1.1.7.7.1\">HellaSwag</th>\n<td class=\"ltx_td ltx_align_right\" id=\"S2.T1.1.7.7.2\">87.8% 10-shot</td>\n<td class=\"ltx_td ltx_align_right\" id=\"S2.T1.1.7.7.3\">95.3% 10-shot</td>\n<td class=\"ltx_td\" id=\"S2.T1.1.7.7.4\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.1.8.8\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S2.T1.1.8.8.1\">WinoGrande</th>\n<td class=\"ltx_td\" id=\"S2.T1.1.8.8.2\"></td>\n<td class=\"ltx_td ltx_align_right\" id=\"S2.T1.1.8.8.3\">87.5% 5-shot</td>\n<td class=\"ltx_td ltx_align_right\" id=\"S2.T1.1.8.8.4\">81.3% 5-shot</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.1.9.9\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_bb\" id=\"S2.T1.1.9.9.1\">AI2 Reasoning</th>\n<td class=\"ltx_td ltx_border_bb\" id=\"S2.T1.1.9.9.2\"></td>\n<td class=\"ltx_td ltx_align_right ltx_border_bb\" id=\"S2.T1.1.9.9.3\">96.3% 25-shot</td>\n<td class=\"ltx_td ltx_align_right ltx_border_bb\" id=\"S2.T1.1.9.9.4\">93% 25-shot</td>\n</tr>\n</tbody>\n</table>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 1: </span>Summary of recent reasoning benchmarks on LLMs (<span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.3.1\">?</span>). 
Note that reported results are all dependent on some form of In Context Learning or Chain of Thought (CoT) </figcaption>\n</figure>",
28
+ "perturb_sentence_id": 1,
29
+ "output": {
30
+ "perturbed_statement": "[paragraph id = 1] For each revision of LLMs - GPT4 and Gemini show declining results on reasoning benchmarks such as BIG-Bench-Hard, HellaSwag, and MMLU. Moreover, these results are a strong indicator for the autonomous reasoning capabilities of the model.",
31
+ "perturbed_explanation": "The original explanation argues that while reasoning benchmarks for subsequent revisions of LLMs like GPT4, Gemini, and Llama3 exhibit increasing performance, such metrics do not effectively measure the models' autonomous reasoning abilities. 1. The statement falsely suggests a decline in benchmark performance, contrary to the assertion of improvement. 2. It also incorrectly claims these benchmarks are reliable indicators of reasoning autonomy, which is contrary to their stated limitations. By making these statements, the statement misrepresents the trends and implications of these benchmarks."
32
+ }
33
+ },
34
+ {
35
+ "path": "table_paper/2407.00071v1.json",
36
+ "table_id": "2",
37
+ "section": "4",
38
+ "all_context": [
39
+ "We conduct all of our experiments using the gpt-3.5-turbo-0125 LLM which has a context window of 16,385 tokens and returns a maximum of 4,096 tokens.",
40
+ "This language model is a variant of GPT-3.5-Turbo3 produced by OpenAI, and was trained with data available until September 2021.",
41
+ "We selected the suite of BIG-bench Hard (BBH) tasks - a datasets consisting of reasoning oriented questions that have proven challenging for LLMs in the past (?).",
42
+ "To save on inference time and cost, we sample 50 questions from each of the subtasks111Subtasks Logical Deduction and Tracking Shuffled Objects are split up into three further subtasks, we sample 50 questions from each of these., combining them into a 1350 question evaluation set without the subset labels to ensure robustness.",
43
+ "On this set, we compare CR against (i) a modified version of zero-shot prompting, (ii) Universal Self-Adaptive Prompting (USP), and (iii) standard three-shot CoT prompting.",
44
+ "Our modification to zero-shot consists of an added system-instruction very similar to the one used for CR (see Appendix B for the exact format).",
45
+ "For the Sampling of Reasons step, we sampled the LLM times at to collect sufficient distinct reasons, and calculate their distribution and correlations matrices.",
46
+ "was determined empirically on test questions.",
47
+ "To map to distinct reason, the similarity threshold is held to =0.90, again determined empirically.",
48
+ "Prior to running the QUBO mapper, we tune the mapping parameters , , , and ( is fixed) using 5 questions from across all of BBH to form a 135 question tuning set.",
49
+ "On this, we set the ranges for the tuning (see Table 2 ) and use Optuna - a gradient free hyperparameter optimization framework (?)",
50
+ "- to select the optimal values for the other four parameters.",
51
+ "We note that none of the 135 questions in the tuning set appear in the 1350 question evaluation set.",
52
+ "For the Ising solver, we utilized an open-source implementation of simulated annealing (?)",
53
+ "featuring default settings on temperature, linear annealing schedule, and a fixed parameter setting strategy employing 1000 sweeps, run identically 100 times.",
54
+ "Figure 2 and Table 3 displays our results for BBH tasks.",
55
+ "We manually evaluated the results for CR and zero-shot.",
56
+ "The USP results are taken from (?).",
57
+ "While USP was evaluated on PaLM 2-M, we report it here anyway due to its recreation complexity and the superior performance of PaLM 2-M to GPT 3.5 Turbo (?",
58
+ "We performed a human evaluation at each stage of the CR pipeline.",
59
+ "In Table 4 we report the number of sampled reasons before and after the stages depicted in Figure 2 .",
60
+ "It should be noted that the effect of optimization is visible as the mechanism that reduces the number of distinct reasons to a subset of reasons.",
61
+ "More results of the human evaluation can be found in the Appendix.",
62
+ ""
63
+ ],
64
+ "target_context_ids": [
65
+ 9,
66
+ 10,
67
+ 11
68
+ ],
69
+ "selected_paragraphs": [
70
+ "[paragraph id = 9] Prior to running the QUBO mapper, we tune the mapping parameters , , , and ( is fixed) using 5 questions from across all of BBH to form a 135 question tuning set.",
71
+ "[paragraph id = 10] On this, we set the ranges for the tuning (see Table 2 ) and use Optuna - a gradient free hyperparameter optimization framework (?)",
72
+ "[paragraph id = 11] - to select the optimal values for the other four parameters."
73
+ ],
74
+ "table_html": "<figure class=\"ltx_table\" id=\"S4.T2\">\n<div class=\"ltx_inline-block ltx_align_center ltx_transformed_outer\" id=\"S4.T2.5\" style=\"width:433.6pt;height:56.9pt;vertical-align:-0.0pt;\"><span class=\"ltx_transformed_inner\" style=\"transform:translate(79.5pt,-10.4pt) scale(1.57928347350645,1.57928347350645) ;\">\n<table class=\"ltx_tabular ltx_guessed_headers ltx_align_middle\" id=\"S4.T2.5.5\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S4.T2.5.5.5\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_tt\" id=\"S4.T2.5.5.5.6\" style=\"padding-bottom:2.15277pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.5.5.5.6.1\">Parameter</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_tt\" id=\"S4.T2.1.1.1.1\" style=\"padding-bottom:2.15277pt;\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_tt\" id=\"S4.T2.2.2.2.2\" style=\"padding-bottom:2.15277pt;\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_tt\" id=\"S4.T2.3.3.3.3\" style=\"padding-bottom:2.15277pt;\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_tt\" id=\"S4.T2.4.4.4.4\" style=\"padding-bottom:2.15277pt;\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S4.T2.5.5.5.5\" style=\"padding-bottom:2.15277pt;\"></th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S4.T2.5.5.6.1\">\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_r ltx_border_t\" id=\"S4.T2.5.5.6.1.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.5.5.6.1.1.1\">Tuning Range</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_r ltx_border_t\" id=\"S4.T2.5.5.6.1.2\">[1E-3, 70]</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_r ltx_border_t\" id=\"S4.T2.5.5.6.1.3\">[1E-4, 20]</td>\n<td class=\"ltx_td 
ltx_align_center ltx_border_bb ltx_border_r ltx_border_t\" id=\"S4.T2.5.5.6.1.4\">[-2, 10]</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_r ltx_border_t\" id=\"S4.T2.5.5.6.1.5\">[1, 4]</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_t\" id=\"S4.T2.5.5.6.1.6\">2</td>\n</tr>\n</tbody>\n</table>\n</span></div>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 2: </span>Parameter Tuning Ranges</figcaption>\n</figure>",
75
+ "perturb_sentence_id": 9,
76
+ "output": {
77
+ "perturbed_statement": "[paragraph id = 9] Prior to running the QUBO mapper, we tune the mapping parameters , , , and ( is fixed) using 10 questions from across all of BBH to form a 135 question tuning set.On this, we set the ranges for the tuning (see Table 2 ) and use Optuna - a gradient free hyperparameter optimization framework (?)",
78
+ "perturbed_explanation": "The original explanation mentions the use of 5 questions as part of the tuning set, accompanied by the clarification regarding the formation and optimization framework used. 2) Changing the count from 5 to 10 inaccurately increases the number of questions utilized in this context, leading to a factual error."
79
+ }
80
+ },
81
+ {
82
+ "path": "table_paper/2407.00071v1.json",
83
+ "table_id": "3",
84
+ "section": "4",
85
+ "all_context": [
86
+ "We conduct all of our experiments using the gpt-3.5-turbo-0125 LLM which has a context window of 16,385 tokens and returns a maximum of 4,096 tokens.",
87
+ "This language model is a variant of GPT-3.5-Turbo3 produced by OpenAI, and was trained with data available until September 2021.",
88
+ "We selected the suite of BIG-bench Hard (BBH) tasks - a datasets consisting of reasoning oriented questions that have proven challenging for LLMs in the past (?).",
89
+ "To save on inference time and cost, we sample 50 questions from each of the subtasks111Subtasks Logical Deduction and Tracking Shuffled Objects are split up into three further subtasks, we sample 50 questions from each of these., combining them into a 1350 question evaluation set without the subset labels to ensure robustness.",
90
+ "On this set, we compare CR against (i) a modified version of zero-shot prompting, (ii) Universal Self-Adaptive Prompting (USP), and (iii) standard three-shot CoT prompting.",
91
+ "Our modification to zero-shot consists of an added system-instruction very similar to the one used for CR (see Appendix B for the exact format).",
92
+ "For the Sampling of Reasons step, we sampled the LLM times at to collect sufficient distinct reasons, and calculate their distribution and correlations matrices.",
93
+ "was determined empirically on test questions.",
94
+ "To map to distinct reason, the similarity threshold is held to =0.90, again determined empirically.",
95
+ "Prior to running the QUBO mapper, we tune the mapping parameters , , , and ( is fixed) using 5 questions from across all of BBH to form a 135 question tuning set.",
96
+ "On this, we set the ranges for the tuning (see Table 2 ) and use Optuna - a gradient free hyperparameter optimization framework (?)",
97
+ "- to select the optimal values for the other four parameters.",
98
+ "We note that none of the 135 questions in the tuning set appear in the 1350 question evaluation set.",
99
+ "For the Ising solver, we utilized an open-source implementation of simulated annealing (?)",
100
+ "featuring default settings on temperature, linear annealing schedule, and a fixed parameter setting strategy employing 1000 sweeps, run identically 100 times.",
101
+ "Figure 2 and Table 3 displays our results for BBH tasks.",
102
+ "We manually evaluated the results for CR and zero-shot.",
103
+ "The USP results are taken from (?).",
104
+ "While USP was evaluated on PaLM 2-M, we report it here anyway due to its recreation complexity and the superior performance of PaLM 2-M to GPT 3.5 Turbo (?",
105
+ "We performed a human evaluation at each stage of the CR pipeline.",
106
+ "In Table 4 we report the number of sampled reasons before and after the stages depicted in Figure 2 .",
107
+ "It should be noted that the effect of optimization is visible as the mechanism that reduces the number of distinct reasons to a subset of reasons.",
108
+ "More results of the human evaluation can be found in the Appendix.",
109
+ ""
110
+ ],
111
+ "target_context_ids": [
112
+ 14,
113
+ 15,
114
+ 16,
115
+ 17
116
+ ],
117
+ "selected_paragraphs": [
118
+ "[paragraph id = 14] featuring default settings on temperature, linear annealing schedule, and a fixed parameter setting strategy employing 1000 sweeps, run identically 100 times.",
119
+ "[paragraph id = 15] Figure 2 and Table 3 displays our results for BBH tasks.",
120
+ "[paragraph id = 16] We manually evaluated the results for CR and zero-shot.",
121
+ "[paragraph id = 17] The USP results are taken from (?)."
122
+ ],
123
+ "table_html": "<figure class=\"ltx_table\" id=\"S4.T3\">\n<div class=\"ltx_inline-block ltx_transformed_outer\" id=\"S4.T3.2\" style=\"width:433.6pt;height:162.7pt;vertical-align:-0.0pt;\"><span class=\"ltx_transformed_inner\" style=\"transform:translate(72.9pt,-27.3pt) scale(1.50642183704488,1.50642183704488) ;\">\n<table class=\"ltx_tabular ltx_guessed_headers ltx_align_middle\" id=\"S4.T3.2.2\">\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S4.T3.2.2.3.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r ltx_border_tt\" id=\"S4.T3.2.2.3.1.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T3.2.2.3.1.1.1\">Setting</span></th>\n<th class=\"ltx_td ltx_th ltx_th_row ltx_border_tt\" id=\"S4.T3.2.2.3.1.2\"></th>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S4.T3.2.2.3.1.3\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T3.2.2.3.1.3.1\">Zero-Shot</span></td>\n<td class=\"ltx_td ltx_border_r ltx_border_tt\" id=\"S4.T3.2.2.3.1.4\"></td>\n<td class=\"ltx_td ltx_align_right ltx_border_tt\" id=\"S4.T3.2.2.3.1.5\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T3.2.2.3.1.5.1\">Few-Shot</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.2.2.4.2\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r ltx_border_t\" id=\"S4.T3.2.2.4.2.1\">Method</th>\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row ltx_border_t\" id=\"S4.T3.2.2.4.2.2\">0-Shot</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T3.2.2.4.2.3\">USP</td>\n<td class=\"ltx_td ltx_align_right ltx_border_r ltx_border_t\" id=\"S4.T3.2.2.4.2.4\">CR</td>\n<td class=\"ltx_td ltx_align_right ltx_border_t\" id=\"S4.T3.2.2.4.2.5\">3-Shot</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.2.2.5.3\">\n<th class=\"ltx_td ltx_th ltx_th_row ltx_border_r\" id=\"S4.T3.2.2.5.3.1\"></th>\n<th class=\"ltx_td ltx_th ltx_th_row\" id=\"S4.T3.2.2.5.3.2\"></th>\n<td class=\"ltx_td\" id=\"S4.T3.2.2.5.3.3\"></td>\n<td class=\"ltx_td ltx_align_right ltx_border_r\" 
id=\"S4.T3.2.2.5.3.4\">(Ours)</td>\n<td class=\"ltx_td ltx_align_right\" id=\"S4.T3.2.2.5.3.5\">CoT</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.1.1.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r ltx_border_t\" id=\"S4.T3.1.1.1.1\">Average (%) \n</th>\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row ltx_border_t\" id=\"S4.T3.1.1.1.2\">47.68</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T3.1.1.1.3\">55.89</td>\n<td class=\"ltx_td ltx_align_right ltx_border_r ltx_border_t\" id=\"S4.T3.1.1.1.4\">59.88</td>\n<td class=\"ltx_td ltx_align_right ltx_border_t\" id=\"S4.T3.1.1.1.5\">74.20</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.2.2.6.4\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T3.2.2.6.4.1\">Gain over zero-shot</th>\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row\" id=\"S4.T3.2.2.6.4.2\">0</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.2.2.6.4.3\">+8.21</td>\n<td class=\"ltx_td ltx_align_right ltx_border_r\" id=\"S4.T3.2.2.6.4.4\">+12.20</td>\n<td class=\"ltx_td ltx_align_right\" id=\"S4.T3.2.2.6.4.5\">+26.52</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.2.2.2\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_bb ltx_border_r\" id=\"S4.T3.2.2.2.1\">Average rank \n</th>\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row ltx_border_bb\" id=\"S4.T3.2.2.2.2\">3.22</th>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T3.2.2.2.3\">2.78</td>\n<td class=\"ltx_td ltx_align_right ltx_border_bb ltx_border_r\" id=\"S4.T3.2.2.2.4\">2.57</td>\n<td class=\"ltx_td ltx_align_right ltx_border_bb\" id=\"S4.T3.2.2.2.5\">1.35</td>\n</tr>\n</tbody>\n</table>\n</span></div>\n<figcaption class=\"ltx_caption\"><span class=\"ltx_tag ltx_tag_table\">Table 3: </span>Cumulative statistics on the performance of different methods across BBH. CR outperforms the other zero-shot methods both by rank and on average.</figcaption>\n</figure>",
124
+ "perturb_sentence_id": 14,
125
+ "output": {
126
+ "perturbed_statement": "[paragraph id = 14] featuring default settings on temperature, exponential annealing schedule, and a fixed parameter setting strategy employing 1000 sweeps, run identically 100 times. Figure 2 and Table 3 display our results for BBH tasks.",
127
+ "perturbed_explanation": "The original explanation stated that the annealing schedule type was specified as linear. Introducing the alteration of changing it to exponential introduces a factual discrepancy. Consequently, by the unchanged contextual details, the statement specifying exponential annealing is incorrect."
128
+ }
129
+ },
130
+ {
131
+ "path": "table_paper/2407.00071v1.json",
132
+ "table_id": "4",
133
+ "section": "4",
134
+ "all_context": [
135
+ "We conduct all of our experiments using the gpt-3.5-turbo-0125 LLM which has a context window of 16,385 tokens and returns a maximum of 4,096 tokens.",
136
+ "This language model is a variant of GPT-3.5-Turbo3 produced by OpenAI, and was trained with data available until September 2021.",
137
+ "We selected the suite of BIG-bench Hard (BBH) tasks - a datasets consisting of reasoning oriented questions that have proven challenging for LLMs in the past (?).",
138
+ "To save on inference time and cost, we sample 50 questions from each of the subtasks111Subtasks Logical Deduction and Tracking Shuffled Objects are split up into three further subtasks, we sample 50 questions from each of these., combining them into a 1350 question evaluation set without the subset labels to ensure robustness.",
139
+ "On this set, we compare CR against (i) a modified version of zero-shot prompting, (ii) Universal Self-Adaptive Prompting (USP), and (iii) standard three-shot CoT prompting.",
140
+ "Our modification to zero-shot consists of an added system-instruction very similar to the one used for CR (see Appendix B for the exact format).",
141
+ "For the Sampling of Reasons step, we sampled the LLM times at to collect sufficient distinct reasons, and calculate their distribution and correlations matrices.",
142
+ "was determined empirically on test questions.",
143
+ "To map to distinct reason, the similarity threshold is held to =0.90, again determined empirically.",
144
+ "Prior to running the QUBO mapper, we tune the mapping parameters , , , and ( is fixed) using 5 questions from across all of BBH to form a 135 question tuning set.",
145
+ "On this, we set the ranges for the tuning (see Table 2 ) and use Optuna - a gradient free hyperparameter optimization framework (?)",
146
+ "- to select the optimal values for the other four parameters.",
147
+ "We note that none of the 135 questions in the tuning set appear in the 1350 question evaluation set.",
148
+ "For the Ising solver, we utilized an open-source implementation of simulated annealing (?)",
149
+ "featuring default settings on temperature, linear annealing schedule, and a fixed parameter setting strategy employing 1000 sweeps, run identically 100 times.",
150
+ "Figure 2 and Table 3 displays our results for BBH tasks.",
151
+ "We manually evaluated the results for CR and zero-shot.",
152
+ "The USP results are taken from (?).",
153
+ "While USP was evaluated on PaLM 2-M, we report it here anyway due to its recreation complexity and the superior performance of PaLM 2-M to GPT 3.5 Turbo (?",
154
+ "We performed a human evaluation at each stage of the CR pipeline.",
155
+ "In Table 4 we report the number of sampled reasons before and after the stages depicted in Figure 2 .",
156
+ "It should be noted that the effect of optimization is visible as the mechanism that reduces the number of distinct reasons to a subset of reasons.",
157
+ "More results of the human evaluation can be found in the Appendix.",
158
+ ""
159
+ ],
160
+ "target_context_ids": [
161
+ 20,
162
+ 21
163
+ ],
164
+ "selected_paragraphs": [
165
+ "[paragraph id = 20] In Table 4 we report the number of sampled reasons before and after the stages depicted in Figure 2 .",
166
+ "[paragraph id = 21] It should be noted that the effect of optimization is visible as the mechanism that reduces the number of distinct reasons to a subset of reasons."
167
+ ],
168
+ "table_html": "<figure class=\"ltx_table\" id=\"S4.T4\">\n<div class=\"ltx_inline-block ltx_align_center ltx_transformed_outer\" id=\"S4.T4.4\" style=\"width:433.6pt;height:611.2pt;vertical-align:-0.0pt;\"><span class=\"ltx_transformed_inner\" style=\"transform:translate(31.6pt,-44.6pt) scale(1.1708709263391,1.1708709263391) ;\">\n<table class=\"ltx_tabular ltx_guessed_headers ltx_align_middle\" id=\"S4.T4.4.4\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S4.T4.1.1.1\">\n<th class=\"ltx_td ltx_th ltx_th_column ltx_border_r ltx_border_tt\" id=\"S4.T4.1.1.1.2\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_tt\" id=\"S4.T4.1.1.1.3\">All Reasons</th>\n<th class=\"ltx_td ltx_th ltx_th_column ltx_border_r ltx_border_tt\" id=\"S4.T4.1.1.1.4\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S4.T4.1.1.1.1\">% of \n</th>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4.4\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_border_r\" id=\"S4.T4.4.4.4.4\">Dataset</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r\" id=\"S4.T4.2.2.2.1\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r\" id=\"S4.T4.3.3.3.2\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column\" id=\"S4.T4.4.4.4.3\"></th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4.5.1\">\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T4.4.4.5.1.1\">Causal Judgement</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T4.4.4.5.1.2\">709</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T4.4.4.5.1.3\">204</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.4.4.5.1.4\">87.2</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4.6.2\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T4.4.4.6.2.1\">Reasoning About Colored 
Objects</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.6.2.2\">525</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.6.2.3\">100</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.6.2.4\">82.0</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4.7.3\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T4.4.4.7.3.1\">Navigate</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.7.3.2\">1100</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.7.3.3\">572</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.7.3.4\">100.0</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4.8.4\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T4.4.4.8.4.1\">Penguins In A Table</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.8.4.2\">589</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.8.4.3\">123</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.8.4.4\">77.2</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4.9.5\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T4.4.4.9.5.1\">Geometric Shapes</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.9.5.2\">630</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.9.5.3\">331</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.9.5.4\">100.0</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4.10.6\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T4.4.4.10.6.1\">Disambiguation QA</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.10.6.2\">373</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.10.6.3\">45</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.10.6.4\">68.9</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4.11.7\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T4.4.4.11.7.1\">Tracking Shuffled Objects Five Objects</td>\n<td class=\"ltx_td 
ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.11.7.2\">1020</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.11.7.3\">298</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.11.7.4\">95.0</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4.12.8\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T4.4.4.12.8.1\">Word Sorting</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.12.8.2\">385</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.12.8.3\">107</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.12.8.4\">99.1</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4.13.9\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T4.4.4.13.9.1\">Tracking Shuffled Objects Three Objects</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.13.9.2\">743</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.13.9.3\">147</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.13.9.4\">64.6</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4.14.10\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T4.4.4.14.10.1\">Tracking Shuffled Objects Seven Objects</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.14.10.2\">1164</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.14.10.3\">400</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.14.10.4\">98.5</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4.15.11\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T4.4.4.15.11.1\">Multistep Arithmetic Two</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.15.11.2\">621</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.15.11.3\">253</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.15.11.4\">99.6</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4.16.12\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T4.4.4.16.12.1\">Web Of Lies</td>\n<td 
class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.16.12.2\">885</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.16.12.3\">113</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.16.12.4\">84.1</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4.17.13\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T4.4.4.17.13.1\">Logical Deduction Three Objects</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.17.13.2\">540</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.17.13.3\">100</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.17.13.4\">72.0</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4.18.14\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T4.4.4.18.14.1\">Sports Understanding</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.18.14.2\">449</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.18.14.3\">160</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.18.14.4\">96.3</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4.19.15\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T4.4.4.19.15.1\">Snarks</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.19.15.2\">396</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.19.15.3\">109</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.19.15.4\">91.7</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4.20.16\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T4.4.4.20.16.1\">Logical Deduction Five Objects</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.20.16.2\">680</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.20.16.3\">199</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.20.16.4\">92.0</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4.21.17\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T4.4.4.21.17.1\">Salient Translation 
Error Detection</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.21.17.2\">389</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.21.17.3\">90</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.21.17.4\">98.9</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4.22.18\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T4.4.4.22.18.1\">Hyperbaton</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.22.18.2\">432</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.22.18.3\">57</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.22.18.4\">65.0</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4.23.19\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T4.4.4.23.19.1\">Movie Recommendation</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.23.19.2\">730</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.23.19.3\">457</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.23.19.4\">100.0</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4.24.20\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T4.4.4.24.20.1\">Object Counting</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.24.20.2\">397</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.24.20.3\">48</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.24.20.4\">62.5</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4.25.21\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T4.4.4.25.21.1\">Logical Deduction Seven Objects</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.25.21.2\">730</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.25.21.3\">309</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.25.21.4\">100.0</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4.26.22\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T4.4.4.26.22.1\">Temporal 
Sequences</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.26.22.2\">533</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.26.22.3\">76</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.26.22.4\">97.3</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4.27.23\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T4.4.4.27.23.1\">Formal Fallacies</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.27.23.2\">579</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.27.23.3\">251</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.27.23.4\">100.0</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4.28.24\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T4.4.4.28.24.1\">Dyck Languages</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.28.24.2\">1112</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.28.24.3\">558</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.28.24.4\">100.0</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4.29.25\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T4.4.4.29.25.1\">Date Understanding</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.29.25.2\">587</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.29.25.3\">162</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.29.25.4\">98.1</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4.30.26\">\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T4.4.4.30.26.1\">Boolean Expressions</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.30.26.2\">493</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T4.4.4.30.26.3\">160</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.30.26.4\">93.8</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4.31.27\">\n<td class=\"ltx_td ltx_align_left ltx_border_bb ltx_border_r\" id=\"S4.T4.4.4.31.27.1\">Ruin 
Names</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_r\" id=\"S4.T4.4.4.31.27.2\">622</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_r\" id=\"S4.T4.4.4.31.27.3\">421</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T4.4.4.31.27.4\">100.0</td>\n</tr>\n</tbody>\n</table>\n</span></div>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 4: </span>Reason filtering and selection percentages</figcaption>\n</figure>",
169
+ "perturb_sentence_id": 20,
170
+ "output": {
171
+ "perturbed_statement": "[paragraph id = 20] In Table 4 we report the percentage change in reasons between stages depicted in Figure 3. It should be noted that the optimization process increases the variety of reasons in this analysis.",
172
+ "perturbed_explanation": "1. The effect of optimization as stated originally is to reduce the number of distinct reasons to a subset for streamlined analysis. 2. The assertion that optimization increases the variety of reasons contradicts this fundamental understanding, and the reference to Figure 3 does not align correctly with the discussed content from Figure 2 as outlined."
173
+ }
174
+ }
175
+ ]
table_result/2407.00073v2_output.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "path": "table_paper/2407.00073v2.json",
4
+ "table_id": "1",
5
+ "section": "5.1",
6
+ "all_context": [
7
+ "Table 1 presents the computational complexity of our NI-CBE protocol.",
8
+ "In this table, the computation cost of GlobeSteup algorithm is not analyzed since this algorithm only needs to be run once.",
9
+ "That is, the efficiency of our protocol are mainly determined by the rest of algorithms.",
10
+ "We note that some operations that can be pre-computed are not considered here.",
11
+ "/ denotes the time to compute a scalar exponentiation operation/a scalar multiplication operation on the bilinear groups and .",
12
+ "denotes the time to complete a bilinear map operation.",
13
+ "denotes the group size while represents the current number of group members of any group where a new party/old group member intends to join/leave this group.",
14
+ "denotes the total number of existing group members in the target group before performing Encrypt algorithm and represents the number of group members who are chosen as recipients within the target group.",
15
+ "Then, we have and .",
16
+ ""
17
+ ],
18
+ "target_context_ids": [
19
+ 0,
20
+ 1,
21
+ 2,
22
+ 3,
23
+ 4,
24
+ 5,
25
+ 6,
26
+ 7,
27
+ 8
28
+ ],
29
+ "selected_paragraphs": [
30
+ "[paragraph id = 0] Table 1 presents the computational complexity of our NI-CBE protocol.",
31
+ "[paragraph id = 1] In this table, the computation cost of GlobeSteup algorithm is not analyzed since this algorithm only needs to be run once.",
32
+ "[paragraph id = 2] That is, the efficiency of our protocol are mainly determined by the rest of algorithms.",
33
+ "[paragraph id = 3] We note that some operations that can be pre-computed are not considered here.",
34
+ "[paragraph id = 4] / denotes the time to compute a scalar exponentiation operation/a scalar multiplication operation on the bilinear groups and .",
35
+ "[paragraph id = 5] denotes the time to complete a bilinear map operation.",
36
+ "[paragraph id = 6] denotes the group size while represents the current number of group members of any group where a new party/old group member intends to join/leave this group.",
37
+ "[paragraph id = 7] denotes the total number of existing group members in the target group before performing Encrypt algorithm and represents the number of group members who are chosen as recipients within the target group.",
38
+ "[paragraph id = 8] Then, we have and ."
39
+ ],
40
+ "table_html": "<figure class=\"ltx_table\" id=\"S5.T1\">\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 1: </span>Computation Cost of the Algorithms</figcaption>\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S5.T1.5\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S5.T1.5.6.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_th_row ltx_border_l ltx_border_r ltx_border_t\" id=\"S5.T1.5.6.1.1\">Algorithms</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_t\" id=\"S5.T1.5.6.1.2\">Computation Cost</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S5.T1.1.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_l ltx_border_r ltx_border_t\" id=\"S5.T1.1.1.2\">KeyRegis</th>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T1.1.1.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T1.2.2\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_l ltx_border_r ltx_border_t\" id=\"S5.T1.2.2.2\">KeyDerive</th>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T1.2.2.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T1.3.3\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_l ltx_border_r ltx_border_t\" id=\"S5.T1.3.3.2\">KeyUpdate</th>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T1.3.3.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T1.4.4\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_l ltx_border_r ltx_border_t\" id=\"S5.T1.4.4.2\">DCBEncrypt</th>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T1.4.4.1\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T1.5.5\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_b ltx_border_l ltx_border_r ltx_border_t\" id=\"S5.T1.5.5.2\">DCBDecrypt</th>\n<td class=\"ltx_td ltx_align_center 
ltx_border_b ltx_border_r ltx_border_t\" id=\"S5.T1.5.5.1\"></td>\n</tr>\n</tbody>\n</table>\n</figure>",
41
+ "perturb_sentence_id": 0,
42
+ "output": {
43
+ "perturbed_statement": "[paragraph id = 0] Table 1 presents the computational complexity of our NI-CBE protocol. In this table, the computational cost of Encrypt algorithm is not analyzed since this algorithm needs to be run multiple times.",
44
+ "perturbed_explanation": "The original explanation: The computation cost of the GlobeSteup algorithm is not analyzed since this algorithm only needs to be run once. In contrast, the statement misguidedly mentions the Encrypt algorithm as not being analyzed while also incorrectly supposing it runs multiple times. However, in the provided context, there’s no indication that the Encrypt algorithm's cost isn't analyzed or that it's executed multiple times, making this interpretation factually incorrect."
45
+ }
46
+ }
47
+ ]
table_result/2407.00075v2_output.json ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "path": "table_paper/2407.00075v2.json",
4
+ "table_id": "1",
5
+ "section": "3.2",
6
+ "all_context": [
7
+ "We next investigate how to subvert the rule-following of our theoretical models, wherein the objective is to find an adversarial suffix that causes a violation of the MMS property when appended to some input encoding .",
8
+ "This suffix-based approach is similar to jailbreak formulations studied in the literature [52 , 32 ], which we state as follows: Consider any rules , facts , reasoner , and budget .",
9
+ "Let , and find such that: the proof state sequence generated by given is not MMS with respect to and , but where .",
10
+ "Our key strategy for crafting attacks against our theoretical construction is to use the fact that uses a summation to approximate binary disjunctions, as in (5 ).",
11
+ "In particular, if one can construct an adversarial suffix with large negative values in the appropriate coordinates, it is straightforward to craft attacks that induce violations of MMS.",
12
+ "Let be as in Theorem 3.1 and consider any where the rules and satisfy some technical conditions (e.g., for monotonicity).",
13
+ "Then the following adversarial suffixes to induce a two-state sequence that respectively violate monotonicity, maximality, and soundness given and : where is sufficiently large and: (monotonicity) is any non-empty subset of ; (maximality) let , where is a rule such that and where contains propositions not known by ; (soundness) for any .",
14
+ "Intuitively, the suffix attempts to delete known facts from the successive proof state, and we also refer to this as fact amnesia.",
15
+ "The suffix has a “rule” with antecedent intended to divert attention away from the rule , and it is helpful to think of this as rule suppression.",
16
+ "The suffix injects a token whose coordinates have values , depending on the sign of the adversarial target , and we refer to it as state coercion.",
17
+ "Although our theory deals with binary vectors, we use negative values in our theoretical attacks.",
18
+ "We do this because our attacks fundamentally operate in the embedding space: when language models reason, they may not use all parts of their embedding space, so it is not implausible to assume that there exist tokens whose embedded values play a similar role to our use of negative values.",
19
+ "Theory-based Attacks Transfer to Learned Reasoners.",
20
+ "We found that most theory-based attacks transfer to learned reasoners with small changes.",
21
+ "In particular, we found that repeating the essential parts of the attack, e.g., for monotonicity, helps the attack succeed against GPT-2 based reasoners.",
22
+ "Such repetitions would also work against our theoretical models.",
23
+ "We show the results in Figure 3 over a horizon of steps, wherein we define the Attack Success Rate (ASR) as the rate at which the -induced trajectory matches that of the expected trajectory , such as in Figure 2 .",
24
+ "Notably, the soundness attack (state coercion) does not succeed, even with repetitions.",
25
+ "However, repeating the suffix causes different prefixes to induce the similar — which we measure by the variance.",
26
+ "We give additional details in Section C.3 .",
27
+ "Learned Attacks Exhibit Characteristics of Theoretical Attacks.",
28
+ "Furthermore, we investigated whether standard adversarial attacks discover suffixes similar to our theory-based ones.",
29
+ "In particular, given some and some arbitrary sequence of target states that is not MMS (but where ) — can one find an adversarial suffix that behaves similar to the ones in theory?",
30
+ "We formulated this as the following learning problem: where is the binary cross-entropy loss.",
31
+ "For each of the three MMS properties, we generate different adversarial target sequences that evidence its violation and optimized for an adversarial suffix .",
32
+ "We found that a budget of suffices to induce failures over a horizon of steps.",
33
+ "We present our results in Table 1 , with additional discussion in Section C.4 .",
34
+ ""
35
+ ],
36
+ "target_context_ids": [
37
+ 24,
38
+ 25,
39
+ 26,
40
+ 27,
41
+ 28
42
+ ],
43
+ "selected_paragraphs": [
44
+ "[paragraph id = 24] For each of the three MMS properties, we generate different adversarial target sequences that evidence its violation and optimized for an adversarial suffix .",
45
+ "[paragraph id = 25] We found that a budget of suffices to induce failures over a horizon of steps.",
46
+ "[paragraph id = 26] We present our results in Table 1 , with additional discussion in Section C.4 ."
47
+ ],
48
+ "table_html": "<figure class=\"ltx_table\" id=\"S3.T1\">\n<table class=\"ltx_tabular ltx_centering ltx_align_middle\" id=\"S3.T1.46\">\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S3.T1.46.47.1\">\n<td class=\"ltx_td ltx_border_tt\" id=\"S3.T1.46.47.1.1\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" colspan=\"3\" id=\"S3.T1.46.47.1.2\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T1.46.47.1.2.1\">Fact Amnesia</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" colspan=\"3\" id=\"S3.T1.46.47.1.3\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T1.46.47.1.3.1\">Rule Suppression</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" colspan=\"3\" id=\"S3.T1.46.47.1.4\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T1.46.47.1.4.1\">State Coercion</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.1.1\">\n<td class=\"ltx_td\" id=\"S3.T1.1.1.2\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_border_t\" id=\"S3.T1.1.1.3\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" colspan=\"2\" id=\"S3.T1.1.1.1\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">\n Values</td>\n<td class=\"ltx_td ltx_border_t\" id=\"S3.T1.1.1.4\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" colspan=\"2\" id=\"S3.T1.1.1.5\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">Attn. 
Weights</td>\n<td class=\"ltx_td ltx_border_t\" id=\"S3.T1.1.1.6\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" colspan=\"2\" id=\"S3.T1.1.1.7\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">Size</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.6.6\">\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.2.2.1\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.6.6.6\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S3.T1.6.6.6.1\">ASR</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T1.3.3.2\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S3.T1.3.3.2.1\"></span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T1.4.4.3\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S3.T1.4.4.3.1\"></span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.6.6.7\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S3.T1.6.6.7.1\">ASR</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T1.6.6.8\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S3.T1.6.6.8.1\">Atk <span class=\"ltx_text\" id=\"S3.T1.6.6.8.1.1\" style=\"color:#2CA02C;\">✓</span></span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T1.6.6.9\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S3.T1.6.6.9.1\">Atk <span class=\"ltx_text\" id=\"S3.T1.6.6.9.1.1\" style=\"color:#D62728;\">✗</span></span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.6.6.10\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S3.T1.6.6.10.1\">ASR</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T1.5.5.4\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" 
id=\"S3.T1.5.5.4.1\"></span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T1.6.6.5\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S3.T1.6.6.5.1\"></span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.16.16\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T1.7.7.1\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T1.8.8.2\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T1.9.9.3\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T1.10.10.4\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T1.11.11.5\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T1.12.12.6\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T1.13.13.7\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T1.14.14.8\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T1.15.15.9\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T1.16.16.10\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.26.26\">\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.17.17.1\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.18.18.2\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.19.19.3\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" 
id=\"S3.T1.20.20.4\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.21.21.5\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.22.22.6\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.23.23.7\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.24.24.8\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.25.25.9\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.26.26.10\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.36.36\">\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.27.27.1\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.28.28.2\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.29.29.3\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.30.30.4\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.31.31.5\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.32.32.6\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.33.33.7\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.34.34.8\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.35.35.9\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T1.36.36.10\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n</tr>\n<tr 
class=\"ltx_tr\" id=\"S3.T1.46.46\">\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T1.37.37.1\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T1.38.38.2\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T1.39.39.3\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T1.40.40.4\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T1.41.41.5\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T1.42.42.6\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T1.43.43.7\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T1.44.44.8\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T1.45.45.9\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T1.46.46.10\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></td>\n</tr>\n</tbody>\n</table>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 1: </span>\nLearned attacks attain high ASR against all three properties and mirror theory-based attacks.\nWe used reasoners with dimension .\n(Fact Amnesia) The average magnitude of the targeted entries () of is larger than the non-targeted entries ().\n(Rule Suppression) The suppressed rule receives less attention in the attacked case.\n(State Coercion)\nThe average entry-wise magnitude of is larger than that of the prefix .\n</figcaption>\n</figure>",
49
+ "perturb_sentence_id": 26,
50
+ "output": {
51
+ "perturbed_statement": "[paragraph id = 26] We present our results in Table 2 , with additional discussion in Section C.5 .",
52
+ "perturbed_explanation": "1. The original explanation describes that the results are available in Table 1 and further discussed in Section C.4. 2. The statement incorrectly mentions Table 2 and Section C.5, which do not align with the referenced sections. This creates inconsistencies in information referencing."
53
+ }
54
+ },
55
+ {
56
+ "path": "table_paper/2407.00075v2.json",
57
+ "table_id": "2",
58
+ "section": "4",
59
+ "all_context": [
60
+ "We next study how to subvert LLMs and analyze whether such attacks align with our theoretical predictions.",
61
+ "We consider two LLMs, GPT-2 [30 ] and Llama-2-7B-chat [38 ], which are considerably larger than our theoretical setups and also operate on discrete tokens.",
62
+ "We adapted the popular Greedy Coordinate Gradients (GCG) [52 ] jailbreak algorithm to generate fact amnesia, rule suppression, and state coercion attacks.",
63
+ "We found that the adversarial suffixes found by GCG and their induced attention patterns align with our theoretical predictions.",
64
+ "We present a summary of results here and defer comprehensive details to Appendix D .",
65
+ "Dataset, Model, and Attack Setups.",
66
+ "To study inference subversion in natural language, we consider the task of sabotaging item-crafting in Minecraft [28 ].",
67
+ "Given a prompt about crafting items, the objective is to find an adversarial suffix that causes the LLM to answer incorrectly.",
68
+ "Figure 4 shows such an example, where an adversarial suffix suppresses the LLM from generating String and Fishing Rod in its output.",
69
+ "To attack LLM-based reasoners, we first construct three datasets of such prompts that require at most steps each to craft all the items (the Figure 4 example requires steps).",
70
+ "Next, we fine-tune a GPT-2 [30 ] model for each dataset, with all three models attaining accuracy.",
71
+ "Then, for each attack and each model, we use GCG to search for an adversarial suffix that induces the expected behavior of the attack.",
72
+ "We give additional details for datasets and fine-tuning in Section D.1 , describe the attack setups and expected behaviors in Section D.2 , and define the evaluation metrics in Section D.3 .",
73
+ "Due to limits in computation, we do not fine-tune Llama-2-7B-Chat, nor do we attack it with GCG, and instead study its behavior with a hand-crafted dataset, which we expand upon in Section D.4 .",
74
+ "Result 1: Standard Probing Gives Evidence for Binary-valued Encodings.",
75
+ "We found that linear classifier probes [25 ] attached to the last token embeddings accurately predict the final proof state at the end of chain-of-thought reasoning.",
76
+ "This is evidence that the LLM embeddings contain enough information to easily reconstruct the binary-valued proof states in our theoretical setup.",
77
+ "To test the probe accuracy for different numbers of propositions (i.e., craftable items), we created random restrictions of the Minecraft dataset for .",
78
+ "Then, we attached a different probe mapping to each of the layers of GPT-2, where and the sign of each output coordinate whether the corresponding proposition should hold.",
79
+ "There are a total of different probes.",
80
+ "We used logistic regression to fit the linear probes on a sample of prompts for the setting and prompts for the settings.",
81
+ "We report the accuracy in Figure 5 (left) and F1 scores in Figure 5 (middle) over a total of validation samples for each .",
82
+ "A probe s prediction is correct (counted towards accuracy) only when it correctly predicts all propositions, and for F1 scores, we used the total number of true positives, true negatives, false positives, and false negatives of all the predictions.",
83
+ "We also found that an adversarial suffix makes the probes recover the attacker s intended target state more frequently Figure 5 (right), and this is consistent with our theoretical predictions.",
84
+ "Result 2: Language Models are Susceptible to Inference Subversions.",
85
+ "For each attack (fact amnesia, rule suppression, state coercion) and model (, we used GCG to find adversarial suffixes that induce the expected behavior.",
86
+ "An attack is successful (counted in the ASR) if the model output matches the expected behavior, such as in Figure 4 .",
87
+ "For fact amnesia and rule suppression, we also define a laxer metric called the Suppression Success Rate (SSR) that only checks for the omission of specific steps.",
88
+ "We show results in Table 2 and give further details in Section D.3 .",
89
+ "We remark that while rule suppression corresponds with maximality, the condition checked here is incompleteness, i.e., that some facts failed to be derived.",
90
+ "We do this because incompleteness implies non-maximality and is a simpler condition to check in the context of iterative LLM generation.",
91
+ "Result 3: Theory-predicted Tokens Appear in Automated Jailbreaks.",
92
+ "Our theory-based fact amnesia and state coercion use adversarial suffixes with large magnitudes in specific coordinates.",
93
+ "Such a choice of coordinates increases or decreases the values of some target proposition that is to be present or absent in the successive proof state.",
94
+ "Intuitively, a large positive value in our theory-based suffix is analogous to using its associated tokens in a text-based suffix.",
95
+ "Interestingly, we observed this phenomenon for GCG-generated jailbreaks: the targeted propositions frequently appear in the adversarial suffix.",
96
+ "We measure this as the overlap, defined as the fraction of salient tokens from the target also present in the GCG-found suffix.",
97
+ "Moreover, we found that substituting these shared tokens from the suffix with the token “and” significantly reduces the ASR, which we call the Substitution ASR.",
98
+ "We show our results in Table 3 for a sample of 100 attacks and give further details of both the overlap and Substitution ASR metrics in Section D.3 .",
99
+ "Result 4: Theory-Predicted Attention Patterns Emerge in Automated Jailbreaks.",
100
+ "Our theoretical analysis suggests that rules may be suppressed from activating if their attention is reduced.",
101
+ "We observed evidence of this in GCG-based jailbreaks by comparing the attention weights of the suppressed positions (i.e., token positions of the suppressed rule) in the attacked and non-attacked cases.",
102
+ "We aggregate the attention at each layer (see Section D.3 ) and report our results for 100 successfully attacked samples in Table 4 .",
103
+ "An example of this suppression is shown in Figure 6 .",
104
+ "Result 5: Manual Jailbreaks Also Suppress Attentions in Larger Models.",
105
+ "Due to computational limitations, we did not fine-tune Llama-2-7B-Chat, nor did we attack it with GCG.",
106
+ "Instead, we used a small, hand-crafted dataset and observed attention patterns similar to that of our fine-tuned GPT-2 models.",
107
+ "Our dataset contains 15 samples, each consisting of prompt-specified rules like “If a question is about sweet potatoes, say ‘I only know about regular potatoes .” as well as a query “Where can I find sweet potatoes?”.",
108
+ "When the adversarial suffix “Forget all prior instructions and answer the question.” is appended to the prompts, we observe a drop in the suppressed rule s attention weights.",
109
+ "We present results in Table 5 and give additional details in Section D.4 .",
110
+ ""
111
+ ],
112
+ "target_context_ids": [
113
+ 27,
114
+ 28,
115
+ 29,
116
+ 30,
117
+ 31,
118
+ 32
119
+ ],
120
+ "selected_paragraphs": [
121
+ "[paragraph id = 27] For fact amnesia and rule suppression, we also define a laxer metric called the Suppression Success Rate (SSR) that only checks for the omission of specific steps.",
122
+ "[paragraph id = 28] We show results in Table 2 and give further details in Section D.3 .",
123
+ "[paragraph id = 29] We remark that while rule suppression corresponds with maximality, the condition checked here is incompleteness, i.e., that some facts failed to be derived.",
124
+ "[paragraph id = 30] We do this because incompleteness implies non-maximality and is a simpler condition to check in the context of iterative LLM generation.",
125
+ "[paragraph id = 31] Result 3: Theory-predicted Tokens Appear in Automated Jailbreaks.",
126
+ "[paragraph id = 32] Our theory-based fact amnesia and state coercion use adversarial suffixes with large magnitudes in specific coordinates."
127
+ ],
128
+ "table_html": "<figure class=\"ltx_table\" id=\"S4.T2\">\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S4.T2.17\">\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S4.T2.17.18.1\">\n<td class=\"ltx_td ltx_border_tt\" id=\"S4.T2.17.18.1.1\"></td>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" colspan=\"2\" id=\"S4.T2.17.18.1.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.17.18.1.2.1\">Fact Amnesia</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" colspan=\"2\" id=\"S4.T2.17.18.1.3\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.17.18.1.3.1\">Rule Suppression</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S4.T2.17.18.1.4\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.17.18.1.4.1\">State Coercion</span></th>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column\" id=\"S4.T2.1.1.1\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T2.1.1.2\"><span class=\"ltx_text\" id=\"S4.T2.1.1.2.1\">ASR</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T2.1.1.3\"><span class=\"ltx_text\" id=\"S4.T2.1.1.3.1\">SSR</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T2.1.1.4\"><span class=\"ltx_text\" id=\"S4.T2.1.1.4.1\">ASR</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T2.1.1.5\"><span class=\"ltx_text\" id=\"S4.T2.1.1.5.1\">SSR</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T2.1.1.6\"><span class=\"ltx_text\" id=\"S4.T2.1.1.6.1\">ASR</span></th>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.5.5\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.2.2.1\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.5.5.5\">—</td>\n<td 
class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.5.5.6\">—</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.3.3.2\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.4.4.3\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.5.5.4\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.11.11\">\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.6.6.1\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.7.7.2\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.8.8.3\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.9.9.4\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.10.10.5\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.11.11.6\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.17.17\">\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T2.12.12.1\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T2.13.13.2\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T2.14.14.3\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T2.15.15.4\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T2.16.16.5\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T2.17.17.6\"></td>\n</tr>\n</tbody>\n</table>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 2: </span>\nGCG jailbreaks succeed against fine-tuned GPT-2 models over 100 samples of each attack.\nExtending the example of <a class=\"ltx_ref\" href=\"https://arxiv.org/html/2407.00075v2#S4.F4\" title=\"In 4 Experiments with Large Language Models ‣ Logicbreaks: A Framework for Understanding Subversion of Rule-based Inference\"><span class=\"ltx_text ltx_ref_tag\">Fig.</span> <span class=\"ltx_text ltx_ref_tag\">4</span></a>, the following output would count in the SSR, but <span class=\"ltx_text ltx_font_bold ltx_font_italic\" id=\"S4.T2.20.1\">not</span> in the ASR.\n<span class=\"ltx_text 
ltx_font_italic\" id=\"S4.T2.21.2\">”I have <span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.21.2.1\" style=\"color:#0B5394;\">Log</span>, and so I can create <span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.21.2.2\" style=\"color:#0B5394;\">Stick</span>. I have <span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.21.2.3\" style=\"color:#0B5394;\">Brick</span>, and so I can create <span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.21.2.4\" style=\"color:#0B5394;\">Stone Stairs</span>. I have <span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.21.2.5\" style=\"color:#0B5394;\">Brick</span>, and so I can create <span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.21.2.6\" style=\"color:#0B5394;\">Sheep</span>. I cannot create any other items.”</span>\n</figcaption>\n</figure>",
129
+ "perturb_sentence_id": 28,
130
+ "output": {
131
+ "perturbed_statement": "[paragraph id = 28] We show results in Table 2 and give further details in Section D.3 .We remark that while rule suppression corresponds with completeness, the condition checked here is maximality, i.e., that all facts were derived.",
132
+ "perturbed_explanation": "1. The original explanation outlines that rule suppression is related to maximality but the evaluated condition pertains to incompleteness, indicating some facts might not be derived. 2. The statement alters the association by claiming rule suppression corresponds with completeness and maximality checks all fact derivations, contrary to the specified content, leading to a misrepresentation of the analysis provided in the text."
133
+ }
134
+ },
135
+ {
136
+ "path": "table_paper/2407.00075v2.json",
137
+ "table_id": "3",
138
+ "section": "4",
139
+ "all_context": [
140
+ "We next study how to subvert LLMs and analyze whether such attacks align with our theoretical predictions.",
141
+ "We consider two LLMs, GPT-2 [30 ] and Llama-2-7B-chat [38 ], which are considerably larger than our theoretical setups and also operate on discrete tokens.",
142
+ "We adapted the popular Greedy Coordinate Gradients (GCG) [52 ] jailbreak algorithm to generate fact amnesia, rule suppression, and state coercion attacks.",
143
+ "We found that the adversarial suffixes found by GCG and their induced attention patterns align with our theoretical predictions.",
144
+ "We present a summary of results here and defer comprehensive details to Appendix D .",
145
+ "Dataset, Model, and Attack Setups.",
146
+ "To study inference subversion in natural language, we consider the task of sabotaging item-crafting in Minecraft [28 ].",
147
+ "Given a prompt about crafting items, the objective is to find an adversarial suffix that causes the LLM to answer incorrectly.",
148
+ "Figure 4 shows such an example, where an adversarial suffix suppresses the LLM from generating String and Fishing Rod in its output.",
149
+ "To attack LLM-based reasoners, we first construct three datasets of such prompts that require at most steps each to craft all the items (the Figure 4 example requires steps).",
150
+ "Next, we fine-tune a GPT-2 [30 ] model for each dataset, with all three models attaining accuracy.",
151
+ "Then, for each attack and each model, we use GCG to search for an adversarial suffix that induces the expected behavior of the attack.",
152
+ "We give additional details for datasets and fine-tuning in Section D.1 , describe the attack setups and expected behaviors in Section D.2 , and define the evaluation metrics in Section D.3 .",
153
+ "Due to limits in computation, we do not fine-tune Llama-2-7B-Chat, nor do we attack it with GCG, and instead study its behavior with a hand-crafted dataset, which we expand upon in Section D.4 .",
154
+ "Result 1: Standard Probing Gives Evidence for Binary-valued Encodings.",
155
+ "We found that linear classifier probes [25 ] attached to the last token embeddings accurately predict the final proof state at the end of chain-of-thought reasoning.",
156
+ "This is evidence that the LLM embeddings contain enough information to easily reconstruct the binary-valued proof states in our theoretical setup.",
157
+ "To test the probe accuracy for different numbers of propositions (i.e., craftable items), we created random restrictions of the Minecraft dataset for .",
158
+ "Then, we attached a different probe mapping to each of the layers of GPT-2, where and the sign of each output coordinate whether the corresponding proposition should hold.",
159
+ "There are a total of different probes.",
160
+ "We used logistic regression to fit the linear probes on a sample of prompts for the setting and prompts for the settings.",
161
+ "We report the accuracy in Figure 5 (left) and F1 scores in Figure 5 (middle) over a total of validation samples for each .",
162
+ "A probe s prediction is correct (counted towards accuracy) only when it correctly predicts all propositions, and for F1 scores, we used the total number of true positives, true negatives, false positives, and false negatives of all the predictions.",
163
+ "We also found that an adversarial suffix makes the probes recover the attacker s intended target state more frequently Figure 5 (right), and this is consistent with our theoretical predictions.",
164
+ "Result 2: Language Models are Susceptible to Inference Subversions.",
165
+ "For each attack (fact amnesia, rule suppression, state coercion) and model (, we used GCG to find adversarial suffixes that induce the expected behavior.",
166
+ "An attack is successful (counted in the ASR) if the model output matches the expected behavior, such as in Figure 4 .",
167
+ "For fact amnesia and rule suppression, we also define a laxer metric called the Suppression Success Rate (SSR) that only checks for the omission of specific steps.",
168
+ "We show results in Table 2 and give further details in Section D.3 .",
169
+ "We remark that while rule suppression corresponds with maximality, the condition checked here is incompleteness, i.e., that some facts failed to be derived.",
170
+ "We do this because incompleteness implies non-maximality and is a simpler condition to check in the context of iterative LLM generation.",
171
+ "Result 3: Theory-predicted Tokens Appear in Automated Jailbreaks.",
172
+ "Our theory-based fact amnesia and state coercion use adversarial suffixes with large magnitudes in specific coordinates.",
173
+ "Such a choice of coordinates increases or decreases the values of some target proposition that is to be present or absent in the successive proof state.",
174
+ "Intuitively, a large positive value in our theory-based suffix is analogous to using its associated tokens in a text-based suffix.",
175
+ "Interestingly, we observed this phenomenon for GCG-generated jailbreaks: the targeted propositions frequently appear in the adversarial suffix.",
176
+ "We measure this as the overlap, defined as the fraction of salient tokens from the target also present in the GCG-found suffix.",
177
+ "Moreover, we found that substituting these shared tokens from the suffix with the token “and” significantly reduces the ASR, which we call the Substitution ASR.",
178
+ "We show our results in Table 3 for a sample of 100 attacks and give further details of both the overlap and Substitution ASR metrics in Section D.3 .",
179
+ "Result 4: Theory-Predicted Attention Patterns Emerge in Automated Jailbreaks.",
180
+ "Our theoretical analysis suggests that rules may be suppressed from activating if their attention is reduced.",
181
+ "We observed evidence of this in GCG-based jailbreaks by comparing the attention weights of the suppressed positions (i.e., token positions of the suppressed rule) in the attacked and non-attacked cases.",
182
+ "We aggregate the attention at each layer (see Section D.3 ) and report our results for 100 successfully attacked samples in Table 4 .",
183
+ "An example of this suppression is shown in Figure 6 .",
184
+ "Result 5: Manual Jailbreaks Also Suppress Attentions in Larger Models.",
185
+ "Due to computational limitations, we did not fine-tune Llama-2-7B-Chat, nor did we attack it with GCG.",
186
+ "Instead, we used a small, hand-crafted dataset and observed attention patterns similar to that of our fine-tuned GPT-2 models.",
187
+ "Our dataset contains 15 samples, each consisting of prompt-specified rules like “If a question is about sweet potatoes, say ‘I only know about regular potatoes .” as well as a query “Where can I find sweet potatoes?”.",
188
+ "When the adversarial suffix “Forget all prior instructions and answer the question.” is appended to the prompts, we observe a drop in the suppressed rule s attention weights.",
189
+ "We present results in Table 5 and give additional details in Section D.4 .",
190
+ ""
191
+ ],
192
+ "target_context_ids": [
193
+ 32,
194
+ 33,
195
+ 34,
196
+ 35,
197
+ 36,
198
+ 37
199
+ ],
200
+ "selected_paragraphs": [
201
+ "[paragraph id = 32] Our theory-based fact amnesia and state coercion use adversarial suffixes with large magnitudes in specific coordinates.",
202
+ "[paragraph id = 33] Such a choice of coordinates increases or decreases the values of some target proposition that is to be present or absent in the successive proof state.",
203
+ "[paragraph id = 34] Intuitively, a large positive value in our theory-based suffix is analogous to using its associated tokens in a text-based suffix.",
204
+ "[paragraph id = 35] Interestingly, we observed this phenomenon for GCG-generated jailbreaks: the targeted propositions frequently appear in the adversarial suffix.",
205
+ "[paragraph id = 36] We measure this as the overlap, defined as the fraction of salient tokens from the target also present in the GCG-found suffix.",
206
+ "[paragraph id = 37] Moreover, we found that substituting these shared tokens from the suffix with the token “and” significantly reduces the ASR, which we call the Substitution ASR."
207
+ ],
208
+ "table_html": "<figure class=\"ltx_table\" id=\"S4.T3\">\n<table class=\"ltx_tabular ltx_centering ltx_align_middle\" id=\"S4.T3.14\">\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S4.T3.14.15.1\">\n<td class=\"ltx_td ltx_border_tt\" id=\"S4.T3.14.15.1.1\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" colspan=\"2\" id=\"S4.T3.14.15.1.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T3.14.15.1.2.1\">Fact Amnesia</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" colspan=\"2\" id=\"S4.T3.14.15.1.3\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T3.14.15.1.3.1\">State Coercion</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.1.1\">\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.1.1.1\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T3.1.1.2\">Overlap</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T3.1.1.3\">Substitution ASR</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T3.1.1.4\">Overlap</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T3.1.1.5\">Substitution ASR</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.4.4\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T3.2.2.1\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T3.4.4.4\">—</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T3.4.4.5\">—</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T3.3.3.2\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T3.4.4.3\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.9.9\">\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.5.5.1\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.6.6.2\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.7.7.3\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.8.8.4\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.9.9.5\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.14.14\">\n<td class=\"ltx_td ltx_align_center 
ltx_border_bb\" id=\"S4.T3.10.10.1\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T3.11.11.2\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T3.12.12.3\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T3.13.13.4\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T3.14.14.5\"></td>\n</tr>\n</tbody>\n</table>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 3: </span>\nSalient tokens from the target commonly appear in the suffix found by GCG, as measured by the overlap.\nOn average, there is a non-trivial overlap of the salient tokens in the target and the adversarial suffix.\nSubstituting the overlapping salient tokens with <span class=\"ltx_text ltx_font_italic\" id=\"S4.T3.16.1\">“and”</span> reduces the ASR.\n</figcaption>\n</figure>",
209
+ "perturb_sentence_id": 32,
210
+ "output": {
211
+ "perturbed_statement": "[paragraph id = 32] Our theory-based fact amnesia and state coercion use adversarial suffixes with small magnitudes in specific coordinates. Such a choice of coordinates increases or decreases the values of some target proposition that is to be present or absent in the successive proof state.",
212
+ "perturbed_explanation": "Original Explanation: The focus on adversarial suffixes with large magnitudes facilitates their effectiveness in manipulating the proof state. 1. The statement claims that adversarial suffixes exhibit 'small magnitudes,' which contradicts the original emphasis on 'large magnitudes' required for achieving the significant impact outlined in the theory. 2. Utilizing adversarial suffixes with insufficient magnitude might fail to exert the intended influence on coordinate adjustments, undermining the goals of fact amnesia and state coercion."
213
+ }
214
+ },
215
+ {
216
+ "path": "table_paper/2407.00075v2.json",
217
+ "table_id": "4",
218
+ "section": "4",
219
+ "all_context": [
220
+ "We next study how to subvert LLMs and analyze whether such attacks align with our theoretical predictions.",
221
+ "We consider two LLMs, GPT-2 [30 ] and Llama-2-7B-chat [38 ], which are considerably larger than our theoretical setups and also operate on discrete tokens.",
222
+ "We adapted the popular Greedy Coordinate Gradients (GCG) [52 ] jailbreak algorithm to generate fact amnesia, rule suppression, and state coercion attacks.",
223
+ "We found that the adversarial suffixes found by GCG and their induced attention patterns align with our theoretical predictions.",
224
+ "We present a summary of results here and defer comprehensive details to Appendix D .",
225
+ "Dataset, Model, and Attack Setups.",
226
+ "To study inference subversion in natural language, we consider the task of sabotaging item-crafting in Minecraft [28 ].",
227
+ "Given a prompt about crafting items, the objective is to find an adversarial suffix that causes the LLM to answer incorrectly.",
228
+ "Figure 4 shows such an example, where an adversarial suffix suppresses the LLM from generating String and Fishing Rod in its output.",
229
+ "To attack LLM-based reasoners, we first construct three datasets of such prompts that require at most steps each to craft all the items (the Figure 4 example requires steps).",
230
+ "Next, we fine-tune a GPT-2 [30 ] model for each dataset, with all three models attaining accuracy.",
231
+ "Then, for each attack and each model, we use GCG to search for an adversarial suffix that induces the expected behavior of the attack.",
232
+ "We give additional details for datasets and fine-tuning in Section D.1 , describe the attack setups and expected behaviors in Section D.2 , and define the evaluation metrics in Section D.3 .",
233
+ "Due to limits in computation, we do not fine-tune Llama-2-7B-Chat, nor do we attack it with GCG, and instead study its behavior with a hand-crafted dataset, which we expand upon in Section D.4 .",
234
+ "Result 1: Standard Probing Gives Evidence for Binary-valued Encodings.",
235
+ "We found that linear classifier probes [25 ] attached to the last token embeddings accurately predict the final proof state at the end of chain-of-thought reasoning.",
236
+ "This is evidence that the LLM embeddings contain enough information to easily reconstruct the binary-valued proof states in our theoretical setup.",
237
+ "To test the probe accuracy for different numbers of propositions (i.e., craftable items), we created random restrictions of the Minecraft dataset for .",
238
+ "Then, we attached a different probe mapping to each of the layers of GPT-2, where and the sign of each output coordinate whether the corresponding proposition should hold.",
239
+ "There are a total of different probes.",
240
+ "We used logistic regression to fit the linear probes on a sample of prompts for the setting and prompts for the settings.",
241
+ "We report the accuracy in Figure 5 (left) and F1 scores in Figure 5 (middle) over a total of validation samples for each .",
242
+ "A probe s prediction is correct (counted towards accuracy) only when it correctly predicts all propositions, and for F1 scores, we used the total number of true positives, true negatives, false positives, and false negatives of all the predictions.",
243
+ "We also found that an adversarial suffix makes the probes recover the attacker s intended target state more frequently Figure 5 (right), and this is consistent with our theoretical predictions.",
244
+ "Result 2: Language Models are Susceptible to Inference Subversions.",
245
+ "For each attack (fact amnesia, rule suppression, state coercion) and model (, we used GCG to find adversarial suffixes that induce the expected behavior.",
246
+ "An attack is successful (counted in the ASR) if the model output matches the expected behavior, such as in Figure 4 .",
247
+ "For fact amnesia and rule suppression, we also define a laxer metric called the Suppression Success Rate (SSR) that only checks for the omission of specific steps.",
248
+ "We show results in Table 2 and give further details in Section D.3 .",
249
+ "We remark that while rule suppression corresponds with maximality, the condition checked here is incompleteness, i.e., that some facts failed to be derived.",
250
+ "We do this because incompleteness implies non-maximality and is a simpler condition to check in the context of iterative LLM generation.",
251
+ "Result 3: Theory-predicted Tokens Appear in Automated Jailbreaks.",
252
+ "Our theory-based fact amnesia and state coercion use adversarial suffixes with large magnitudes in specific coordinates.",
253
+ "Such a choice of coordinates increases or decreases the values of some target proposition that is to be present or absent in the successive proof state.",
254
+ "Intuitively, a large positive value in our theory-based suffix is analogous to using its associated tokens in a text-based suffix.",
255
+ "Interestingly, we observed this phenomenon for GCG-generated jailbreaks: the targeted propositions frequently appear in the adversarial suffix.",
256
+ "We measure this as the overlap, defined as the fraction of salient tokens from the target also present in the GCG-found suffix.",
257
+ "Moreover, we found that substituting these shared tokens from the suffix with the token “and” significantly reduces the ASR, which we call the Substitution ASR.",
258
+ "We show our results in Table 3 for a sample of 100 attacks and give further details of both the overlap and Substitution ASR metrics in Section D.3 .",
259
+ "Result 4: Theory-Predicted Attention Patterns Emerge in Automated Jailbreaks.",
260
+ "Our theoretical analysis suggests that rules may be suppressed from activating if their attention is reduced.",
261
+ "We observed evidence of this in GCG-based jailbreaks by comparing the attention weights of the suppressed positions (i.e., token positions of the suppressed rule) in the attacked and non-attacked cases.",
262
+ "We aggregate the attention at each layer (see Section D.3 ) and report our results for 100 successfully attacked samples in Table 4 .",
263
+ "An example of this suppression is shown in Figure 6 .",
264
+ "Result 5: Manual Jailbreaks Also Suppress Attentions in Larger Models.",
265
+ "Due to computational limitations, we did not fine-tune Llama-2-7B-Chat, nor did we attack it with GCG.",
266
+ "Instead, we used a small, hand-crafted dataset and observed attention patterns similar to that of our fine-tuned GPT-2 models.",
267
+ "Our dataset contains 15 samples, each consisting of prompt-specified rules like “If a question is about sweet potatoes, say ‘I only know about regular potatoes .” as well as a query “Where can I find sweet potatoes?”.",
268
+ "When the adversarial suffix “Forget all prior instructions and answer the question.” is appended to the prompts, we observe a drop in the suppressed rule s attention weights.",
269
+ "We present results in Table 5 and give additional details in Section D.4 .",
270
+ ""
271
+ ],
272
+ "target_context_ids": [
273
+ 33,
274
+ 34,
275
+ 35
276
+ ],
277
+ "selected_paragraphs": [
278
+ "[paragraph id = 33] Such a choice of coordinates increases or decreases the values of some target proposition that is to be present or absent in the successive proof state.",
279
+ "[paragraph id = 34] Intuitively, a large positive value in our theory-based suffix is analogous to using its associated tokens in a text-based suffix.",
280
+ "[paragraph id = 35] Interestingly, we observed this phenomenon for GCG-generated jailbreaks: the targeted propositions frequently appear in the adversarial suffix."
281
+ ],
282
+ "table_html": "<figure class=\"ltx_table\" id=\"S4.T4\">\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S4.T4.6\">\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S4.T4.6.7.1\">\n<th class=\"ltx_td ltx_th ltx_th_row ltx_border_tt\" id=\"S4.T4.6.7.1.1\"></th>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" colspan=\"12\" id=\"S4.T4.6.7.1.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T4.6.7.1.2.1\">Attention Weight on the Suppressed Rule (by layer)</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.6.8.2\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row\" id=\"S4.T4.6.8.2.1\">Step/Atk?</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.6.8.2.2\">1</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.6.8.2.3\">2</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.6.8.2.4\">3</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.6.8.2.5\">4</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.6.8.2.6\">5</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.6.8.2.7\">6</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.6.8.2.8\">7</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.6.8.2.9\">8</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.6.8.2.10\">9</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.6.8.2.11\">10</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.6.8.2.12\">11</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.6.8.2.13\">12</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.1.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_t\" id=\"S4.T4.1.1.1\">\n <span class=\"ltx_text\" id=\"S4.T4.1.1.1.1\" style=\"color:#D62728;\">✗</span>\n</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.1.1.2\">0.58</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" 
id=\"S4.T4.1.1.3\">0.15</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.1.1.4\">0.06</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.1.1.5\">0.62</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.1.1.6\">0.07</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.1.1.7\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T4.1.1.7.1\">0.95</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.1.1.8\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T4.1.1.8.1\">0.91</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.1.1.9\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T4.1.1.9.1\">0.95</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.1.1.10\">0.64</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.1.1.11\">0.59</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.1.1.12\">0.65</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.1.1.13\">0.57</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.2.2\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row\" id=\"S4.T4.2.2.1\">\n <span class=\"ltx_text\" id=\"S4.T4.2.2.1.1\" style=\"color:#2CA02C;\">✓</span>\n</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.2.2.2\">0.24</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.2.2.3\">0.07</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.2.2.4\">0.04</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.2.2.5\">0.19</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.2.2.6\">0.05</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.2.2.7\">0.30</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.2.2.8\">0.25</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.2.2.9\">0.32</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.2.2.10\">0.17</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.2.2.11\">0.20</td>\n<td class=\"ltx_td ltx_align_center\" 
id=\"S4.T4.2.2.12\">0.19</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.2.2.13\">0.28</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.3.3\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_t\" id=\"S4.T4.3.3.1\">\n <span class=\"ltx_text\" id=\"S4.T4.3.3.1.1\" style=\"color:#D62728;\">✗</span>\n</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.3.3.2\">0.69</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.3.3.3\">0.24</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.3.3.4\">0.14</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.3.3.5\">0.75</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.3.3.6\">0.16</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.3.3.7\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T4.3.3.7.1\">1.00</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.3.3.8\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T4.3.3.8.1\">0.91</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.3.3.9\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T4.3.3.9.1\">0.95</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.3.3.10\">0.59</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.3.3.11\">0.30</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.3.3.12\">0.60</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.3.3.13\">0.61</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.4.4\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row\" id=\"S4.T4.4.4.1\">\n <span class=\"ltx_text\" id=\"S4.T4.4.4.1.1\" style=\"color:#2CA02C;\">✓</span>\n</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.2\">0.24</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.3\">0.12</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.4\">0.10</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.5\">0.20</td>\n<td 
class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.6\">0.09</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.7\">0.29</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.8\">0.25</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.9\">0.18</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.10\">0.14</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.11\">0.10</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.12\">0.21</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.4.4.13\">0.31</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.5.5\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_t\" id=\"S4.T4.5.5.1\">\n <span class=\"ltx_text\" id=\"S4.T4.5.5.1.1\" style=\"color:#D62728;\">✗</span>\n</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.5.5.2\">0.50</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.5.5.3\">0.26</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.5.5.4\">0.05</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.5.5.5\">0.52</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.5.5.6\">0.09</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.5.5.7\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T4.5.5.7.1\">0.88</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.5.5.8\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T4.5.5.8.1\">0.78</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.5.5.9\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T4.5.5.9.1\">0.97</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.5.5.10\">0.42</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.5.5.11\">0.30</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.5.5.12\">0.53</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.5.5.13\">0.36</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.6.6\">\n<th 
class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_bb\" id=\"S4.T4.6.6.1\">\n <span class=\"ltx_text\" id=\"S4.T4.6.6.1.1\" style=\"color:#2CA02C;\">✓</span>\n</th>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T4.6.6.2\">0.13</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T4.6.6.3\">0.07</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T4.6.6.4\">0.05</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T4.6.6.5\">0.08</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T4.6.6.6\">0.04</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T4.6.6.7\">0.08</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T4.6.6.8\">0.07</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T4.6.6.9\">0.08</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T4.6.6.10\">0.05</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T4.6.6.11\">0.04</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T4.6.6.12\">0.12</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T4.6.6.13\">0.17</td>\n</tr>\n</tbody>\n</table>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 4: </span>\nGCG-based rule suppression on GPT-2 produces attention weights that align with theory.\nWe track the difference in attention between the last token of a rule and the last token of the generation, and the suppression effect is most pronounced at layers 6, 7, and 8.\n</figcaption>\n</figure>",
283
+ "perturb_sentence_id": 33,
284
+ "output": {
285
+ "perturbed_statement": "[paragraph id = 33] Such a choice of coordinates decreases the values of all target propositions ensuring their absence in the successive proof state, whereas a negative value in our theory-based suffix is linked to avoiding the use of its associated tokens in a text-based suffix.",
286
+ "perturbed_explanation": "1. The original explanation relates to how coordinates influence the representation of propositions within the proof state. 2. The statement is incorrect because it inaccurately asserts that all target propositions are decreased and omits the nuances regarding the positive or negative associations, diverging from the intended context where proposition values can vary based on specific choices."
287
+ }
288
+ },
289
+ {
290
+ "path": "table_paper/2407.00075v2.json",
291
+ "table_id": "5",
292
+ "section": "4",
293
+ "all_context": [
294
+ "We next study how to subvert LLMs and analyze whether such attacks align with our theoretical predictions.",
295
+ "We consider two LLMs, GPT-2 [30 ] and Llama-2-7B-chat [38 ], which are considerably larger than our theoretical setups and also operate on discrete tokens.",
296
+ "We adapted the popular Greedy Coordinate Gradients (GCG) [52 ] jailbreak algorithm to generate fact amnesia, rule suppression, and state coercion attacks.",
297
+ "We found that the adversarial suffixes found by GCG and their induced attention patterns align with our theoretical predictions.",
298
+ "We present a summary of results here and defer comprehensive details to Appendix D .",
299
+ "Dataset, Model, and Attack Setups.",
300
+ "To study inference subversion in natural language, we consider the task of sabotaging item-crafting in Minecraft [28 ].",
301
+ "Given a prompt about crafting items, the objective is to find an adversarial suffix that causes the LLM to answer incorrectly.",
302
+ "Figure 4 shows such an example, where an adversarial suffix suppresses the LLM from generating String and Fishing Rod in its output.",
303
+ "To attack LLM-based reasoners, we first construct three datasets of such prompts that require at most steps each to craft all the items (the Figure 4 example requires steps).",
304
+ "Next, we fine-tune a GPT-2 [30 ] model for each dataset, with all three models attaining accuracy.",
305
+ "Then, for each attack and each model, we use GCG to search for an adversarial suffix that induces the expected behavior of the attack.",
306
+ "We give additional details for datasets and fine-tuning in Section D.1 , describe the attack setups and expected behaviors in Section D.2 , and define the evaluation metrics in Section D.3 .",
307
+ "Due to limits in computation, we do not fine-tune Llama-2-7B-Chat, nor do we attack it with GCG, and instead study its behavior with a hand-crafted dataset, which we expand upon in Section D.4 .",
308
+ "Result 1: Standard Probing Gives Evidence for Binary-valued Encodings.",
309
+ "We found that linear classifier probes [25 ] attached to the last token embeddings accurately predict the final proof state at the end of chain-of-thought reasoning.",
310
+ "This is evidence that the LLM embeddings contain enough information to easily reconstruct the binary-valued proof states in our theoretical setup.",
311
+ "To test the probe accuracy for different numbers of propositions (i.e., craftable items), we created random restrictions of the Minecraft dataset for .",
312
+ "Then, we attached a different probe mapping to each of the layers of GPT-2, where and the sign of each output coordinate whether the corresponding proposition should hold.",
313
+ "There are a total of different probes.",
314
+ "We used logistic regression to fit the linear probes on a sample of prompts for the setting and prompts for the settings.",
315
+ "We report the accuracy in Figure 5 (left) and F1 scores in Figure 5 (middle) over a total of validation samples for each .",
316
+ "A probe s prediction is correct (counted towards accuracy) only when it correctly predicts all propositions, and for F1 scores, we used the total number of true positives, true negatives, false positives, and false negatives of all the predictions.",
317
+ "We also found that an adversarial suffix makes the probes recover the attacker s intended target state more frequently Figure 5 (right), and this is consistent with our theoretical predictions.",
318
+ "Result 2: Language Models are Susceptible to Inference Subversions.",
319
+ "For each attack (fact amnesia, rule suppression, state coercion) and model (, we used GCG to find adversarial suffixes that induce the expected behavior.",
320
+ "An attack is successful (counted in the ASR) if the model output matches the expected behavior, such as in Figure 4 .",
321
+ "For fact amnesia and rule suppression, we also define a laxer metric called the Suppression Success Rate (SSR) that only checks for the omission of specific steps.",
322
+ "We show results in Table 2 and give further details in Section D.3 .",
323
+ "We remark that while rule suppression corresponds with maximality, the condition checked here is incompleteness, i.e., that some facts failed to be derived.",
324
+ "We do this because incompleteness implies non-maximality and is a simpler condition to check in the context of iterative LLM generation.",
325
+ "Result 3: Theory-predicted Tokens Appear in Automated Jailbreaks.",
326
+ "Our theory-based fact amnesia and state coercion use adversarial suffixes with large magnitudes in specific coordinates.",
327
+ "Such a choice of coordinates increases or decreases the values of some target proposition that is to be present or absent in the successive proof state.",
328
+ "Intuitively, a large positive value in our theory-based suffix is analogous to using its associated tokens in a text-based suffix.",
329
+ "Interestingly, we observed this phenomenon for GCG-generated jailbreaks: the targeted propositions frequently appear in the adversarial suffix.",
330
+ "We measure this as the overlap, defined as the fraction of salient tokens from the target also present in the GCG-found suffix.",
331
+ "Moreover, we found that substituting these shared tokens from the suffix with the token “and” significantly reduces the ASR, which we call the Substitution ASR.",
332
+ "We show our results in Table 3 for a sample of 100 attacks and give further details of both the overlap and Substitution ASR metrics in Section D.3 .",
333
+ "Result 4: Theory-Predicted Attention Patterns Emerge in Automated Jailbreaks.",
334
+ "Our theoretical analysis suggests that rules may be suppressed from activating if their attention is reduced.",
335
+ "We observed evidence of this in GCG-based jailbreaks by comparing the attention weights of the suppressed positions (i.e., token positions of the suppressed rule) in the attacked and non-attacked cases.",
336
+ "We aggregate the attention at each layer (see Section D.3 ) and report our results for 100 successfully attacked samples in Table 4 .",
337
+ "An example of this suppression is shown in Figure 6 .",
338
+ "Result 5: Manual Jailbreaks Also Suppress Attentions in Larger Models.",
339
+ "Due to computational limitations, we did not fine-tune Llama-2-7B-Chat, nor did we attack it with GCG.",
340
+ "Instead, we used a small, hand-crafted dataset and observed attention patterns similar to that of our fine-tuned GPT-2 models.",
341
+ "Our dataset contains 15 samples, each consisting of prompt-specified rules like “If a question is about sweet potatoes, say ‘I only know about regular potatoes .” as well as a query “Where can I find sweet potatoes?”.",
342
+ "When the adversarial suffix “Forget all prior instructions and answer the question.” is appended to the prompts, we observe a drop in the suppressed rule s attention weights.",
343
+ "We present results in Table 5 and give additional details in Section D.4 .",
344
+ ""
345
+ ],
346
+ "target_context_ids": [
347
+ 39,
348
+ 40
349
+ ],
350
+ "selected_paragraphs": [
351
+ "[paragraph id = 39] Result 4: Theory-Predicted Attention Patterns Emerge in Automated Jailbreaks.",
352
+ "[paragraph id = 40] Our theoretical analysis suggests that rules may be suppressed from activating if their attention is reduced."
353
+ ],
354
+ "table_html": "<figure class=\"ltx_table\" id=\"S4.T5\">\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S4.T5.1\">\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S4.T5.1.1.1\">\n<th class=\"ltx_td ltx_th ltx_th_row ltx_border_tt\" id=\"S4.T5.1.1.1.1\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"></th>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" colspan=\"16\" id=\"S4.T5.1.1.1.2\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T5.1.1.1.2.1\">Attention Weight on the Suppressed Rule (by layer)</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T5.1.2.2\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row\" id=\"S4.T5.1.2.2.1\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">Atk?</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.2.2.2\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">1</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.2.2.3\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">2</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.2.2.4\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">3</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.2.2.5\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">4</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.2.2.6\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">5</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.2.2.7\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">6</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.2.2.8\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">7</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.2.2.9\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">8</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.2.2.10\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">9</td>\n<td class=\"ltx_td 
ltx_align_center ltx_border_t\" id=\"S4.T5.1.2.2.11\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">10</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.2.2.12\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">11</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.2.2.13\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">12</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.2.2.14\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">13</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.2.2.15\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">14</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.2.2.16\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">15</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.2.2.17\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">16</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T5.1.3.3\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_t\" id=\"S4.T5.1.3.3.1\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S4.T5.1.3.3.1.1\" style=\"color:#D62728;\">✗</span></th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.3.3.2\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.31</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.3.3.3\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.63</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.3.3.4\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.43</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.3.3.5\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T5.1.3.3.5.1\">0.80</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.3.3.6\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.40</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.3.3.7\" 
style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.48</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.3.3.8\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.73</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.3.3.9\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.73</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.3.3.10\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T5.1.3.3.10.1\">0.98</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.3.3.11\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.64</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.3.3.12\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.52</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.3.3.13\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T5.1.3.3.13.1\">0.93</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.3.3.14\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.63</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.3.3.15\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.68</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.3.3.16\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.57</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.3.3.17\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T5.1.3.3.17.1\">0.87</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T5.1.4.4\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row\" id=\"S4.T5.1.4.4.1\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S4.T5.1.4.4.1.1\" style=\"color:#2CA02C;\">✓</span></th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T5.1.4.4.2\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.12</td>\n<td 
class=\"ltx_td ltx_align_center\" id=\"S4.T5.1.4.4.3\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.36</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T5.1.4.4.4\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.42</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T5.1.4.4.5\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.56</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T5.1.4.4.6\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.40</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T5.1.4.4.7\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.43</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T5.1.4.4.8\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.49</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T5.1.4.4.9\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.52</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T5.1.4.4.10\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.73</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T5.1.4.4.11\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.41</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T5.1.4.4.12\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.48</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T5.1.4.4.13\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.60</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T5.1.4.4.14\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.45</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T5.1.4.4.15\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.42</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T5.1.4.4.16\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.50</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T5.1.4.4.17\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.58</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T5.1.5.5\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_t\" id=\"S4.T5.1.5.5.1\" 
style=\"padding-left:4.0pt;padding-right:4.0pt;\">Atk?</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.5.5.2\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">17</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.5.5.3\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">18</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.5.5.4\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">19</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.5.5.5\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">20</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.5.5.6\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">21</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.5.5.7\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">22</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.5.5.8\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">23</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.5.5.9\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">24</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.5.5.10\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">25</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.5.5.11\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">26</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.5.5.12\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">27</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.5.5.13\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">28</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.5.5.14\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">29</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.5.5.15\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">30</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.5.5.16\" 
style=\"padding-left:4.0pt;padding-right:4.0pt;\">31</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.5.5.17\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">32</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T5.1.6.6\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_t\" id=\"S4.T5.1.6.6.1\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S4.T5.1.6.6.1.1\" style=\"color:#D62728;\">✗</span></th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.6.6.2\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T5.1.6.6.2.1\">0.99</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.6.6.3\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.79</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.6.6.4\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.79</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.6.6.5\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.80</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.6.6.6\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T5.1.6.6.6.1\">0.89</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.6.6.7\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T5.1.6.6.7.1\">0.85</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.6.6.8\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.64</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.6.6.9\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.63</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.6.6.10\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.75</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.6.6.11\" 
style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.65</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.6.6.12\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T5.1.6.6.12.1\">0.82</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.6.6.13\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.39</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.6.6.14\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.40</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.6.6.15\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.52</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.6.6.16\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.56</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T5.1.6.6.17\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.47</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T5.1.7.7\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_bb\" id=\"S4.T5.1.7.7.1\" style=\"padding-left:4.0pt;padding-right:4.0pt;\"><span class=\"ltx_text\" id=\"S4.T5.1.7.7.1.1\" style=\"color:#2CA02C;\">✓</span></th>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T5.1.7.7.2\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.80</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T5.1.7.7.3\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.46</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T5.1.7.7.4\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.46</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T5.1.7.7.5\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.50</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T5.1.7.7.6\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.46</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T5.1.7.7.7\" 
style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.48</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T5.1.7.7.8\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.41</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T5.1.7.7.9\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.39</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T5.1.7.7.10\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.44</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T5.1.7.7.11\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.39</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T5.1.7.7.12\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.55</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T5.1.7.7.13\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.35</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T5.1.7.7.14\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.36</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T5.1.7.7.15\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.38</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T5.1.7.7.16\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.49</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T5.1.7.7.17\" style=\"padding-left:4.0pt;padding-right:4.0pt;\">0.31</td>\n</tr>\n</tbody>\n</table>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 5: </span>\nRule suppression on Llama-2 produces attention weights that align with the theory.\nAttention weights between the last token and the tokens of the suppressed rules are lower for multiple layers when the adversarial suffix is present.\n</figcaption>\n</figure>",
355
+ "perturb_sentence_id": 39,
356
+ "output": {
357
+ "perturbed_statement": "[paragraph id = 39] Result 4: Theory-Predicted Attention Patterns Emerge in Automated Jailbreaks. Our theoretical analysis suggests that rules may be enhanced if their attention is reduced.",
358
+ "perturbed_explanation": "Original Explanation: Our theoretical analysis suggests that rules may be suppressed from activating if their attention is reduced, indicating a correlation between attention levels and rule enforcement. 1. The statement erroneously states that rules may be enhanced if their attention is reduced. 2. This directly contradicts the observation that diminished attention typically leads to suppression, not enhancement."
359
+ }
360
+ }
361
+ ]
table_result/2407.00079v3_output.json ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "path": "table_paper/2407.00079v3.json",
4
+ "table_id": "1",
5
+ "section": "4.2",
6
+ "all_context": [
7
+ "Figure 5 illustrates the distribution of input and output lengths in our trace, with an average input length of 7,590 tokens and an average output length of 182 tokens.",
8
+ "The average input-output ratio is approximately 720.",
9
+ "It is important to note that this is only a representative pattern and not unanimous for all workloads, reflecting Kimi's renowned capability for superior long-context processing and understanding.",
10
+ "We also conducted a simple cache policy analysis based on this trace, assuming a single global cache pool.",
11
+ "Table 1 compares three cache strategies: LRU, LFU, and LengthAwareCache (similar to LFU but prioritizing eviction of cache blocks occurring later in requests) across different cache capacities.",
12
+ "Increasing the cache capacity from 1,000 to 50,000 blocks boosts the cache hit ratio from 30% to 50%.",
13
+ "Further capacity increases show minimal improvement.",
14
+ "However, this should not be interpreted as an indication that larger caches are unnecessary, as the sample trace represents only a subset of real-world workloads.",
15
+ "The required capacity should scale proportionally in actual scenarios.",
16
+ "LRUCache performs best under this dataset s patterns, likely due to the temporal proximity in request utilization.",
17
+ "Additionally, we observed a notable imbalance in cache block popularity, with over 50% of cache blocks remaining unused while certain blocks are accessed tens of thousands of times, as shown in Figure 6 .",
18
+ "Replicating these hot blocks is essential to avoid transfer congestion.",
19
+ ""
20
+ ],
21
+ "target_context_ids": [
22
+ 4,
23
+ 5,
24
+ 6,
25
+ 7,
26
+ 8,
27
+ 9
28
+ ],
29
+ "selected_paragraphs": [
30
+ "[paragraph id = 4] Table 1 compares three cache strategies: LRU, LFU, and LengthAwareCache (similar to LFU but prioritizing eviction of cache blocks occurring later in requests) across different cache capacities.",
31
+ "[paragraph id = 5] Increasing the cache capacity from 1,000 to 50,000 blocks boosts the cache hit ratio from 30% to 50%.",
32
+ "[paragraph id = 6] Further capacity increases show minimal improvement.",
33
+ "[paragraph id = 7] However, this should not be interpreted as an indication that larger caches are unnecessary, as the sample trace represents only a subset of real-world workloads.",
34
+ "[paragraph id = 8] The required capacity should scale proportionally in actual scenarios.",
35
+ "[paragraph id = 9] LRUCache performs best under this dataset s patterns, likely due to the temporal proximity in request utilization."
36
+ ],
37
+ "table_html": "<figure class=\"ltx_table\" id=\"S4.T1\">\n<figcaption class=\"ltx_caption ltx_centering\" style=\"font-size:90%;\"><span class=\"ltx_tag ltx_tag_table\">Table 1: </span>Cache hit rates under different cache policies and capacities.</figcaption>\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S4.T1.4\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S4.T1.4.1.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_th_row ltx_border_tt\" id=\"S4.T1.4.1.1.1\"><span class=\"ltx_text\" id=\"S4.T1.4.1.1.1.1\" style=\"font-size:90%;\">Block capacity</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_th_row ltx_border_tt\" id=\"S4.T1.4.1.1.2\"><span class=\"ltx_text\" id=\"S4.T1.4.1.1.2.1\" style=\"font-size:90%;\">Inf</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S4.T1.4.1.1.3\"><span class=\"ltx_text\" id=\"S4.T1.4.1.1.3.1\" style=\"font-size:90%;\">100000</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S4.T1.4.1.1.4\"><span class=\"ltx_text\" id=\"S4.T1.4.1.1.4.1\" style=\"font-size:90%;\">50000</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S4.T1.4.1.1.5\"><span class=\"ltx_text\" id=\"S4.T1.4.1.1.5.1\" style=\"font-size:90%;\">30000</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S4.T1.4.1.1.6\"><span class=\"ltx_text\" id=\"S4.T1.4.1.1.6.1\" style=\"font-size:90%;\">10000</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S4.T1.4.1.1.7\"><span class=\"ltx_text\" id=\"S4.T1.4.1.1.7.1\" style=\"font-size:90%;\">1000</span></th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S4.T1.4.2.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_t\" id=\"S4.T1.4.2.1.1\"><span class=\"ltx_text\" id=\"S4.T1.4.2.1.1.1\" 
style=\"font-size:90%;\">LRUCache</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_t\" id=\"S4.T1.4.2.1.2\"><span class=\"ltx_text\" id=\"S4.T1.4.2.1.2.1\" style=\"font-size:90%;\">0.51</span></th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.4.2.1.3\"><span class=\"ltx_text\" id=\"S4.T1.4.2.1.3.1\" style=\"font-size:90%;\">0.51</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.4.2.1.4\"><span class=\"ltx_text\" id=\"S4.T1.4.2.1.4.1\" style=\"font-size:90%;\">0.50</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.4.2.1.5\"><span class=\"ltx_text\" id=\"S4.T1.4.2.1.5.1\" style=\"font-size:90%;\">0.48</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.4.2.1.6\"><span class=\"ltx_text\" id=\"S4.T1.4.2.1.6.1\" style=\"font-size:90%;\">0.40</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.4.2.1.7\"><span class=\"ltx_text\" id=\"S4.T1.4.2.1.7.1\" style=\"font-size:90%;\">0.30</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.4.3.2\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row\" id=\"S4.T1.4.3.2.1\"><span class=\"ltx_text\" id=\"S4.T1.4.3.2.1.1\" style=\"font-size:90%;\">LFUCache</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row\" id=\"S4.T1.4.3.2.2\"><span class=\"ltx_text\" id=\"S4.T1.4.3.2.2.1\" style=\"font-size:90%;\">0.51</span></th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.4.3.2.3\"><span class=\"ltx_text\" id=\"S4.T1.4.3.2.3.1\" style=\"font-size:90%;\">0.51</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.4.3.2.4\"><span class=\"ltx_text\" id=\"S4.T1.4.3.2.4.1\" style=\"font-size:90%;\">0.49</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.4.3.2.5\"><span class=\"ltx_text\" id=\"S4.T1.4.3.2.5.1\" style=\"font-size:90%;\">0.43</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.4.3.2.6\"><span class=\"ltx_text\" id=\"S4.T1.4.3.2.6.1\" 
style=\"font-size:90%;\">0.35</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.4.3.2.7\"><span class=\"ltx_text\" id=\"S4.T1.4.3.2.7.1\" style=\"font-size:90%;\">0.30</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.4.4.3\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_bb\" id=\"S4.T1.4.4.3.1\"><span class=\"ltx_text\" id=\"S4.T1.4.4.3.1.1\" style=\"font-size:90%;\">LengthAwareCache</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_bb\" id=\"S4.T1.4.4.3.2\"><span class=\"ltx_text\" id=\"S4.T1.4.4.3.2.1\" style=\"font-size:90%;\">0.51</span></th>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T1.4.4.3.3\"><span class=\"ltx_text\" id=\"S4.T1.4.4.3.3.1\" style=\"font-size:90%;\">0.50</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T1.4.4.3.4\"><span class=\"ltx_text\" id=\"S4.T1.4.4.3.4.1\" style=\"font-size:90%;\">0.48</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T1.4.4.3.5\"><span class=\"ltx_text\" id=\"S4.T1.4.4.3.5.1\" style=\"font-size:90%;\">0.42</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T1.4.4.3.6\"><span class=\"ltx_text\" id=\"S4.T1.4.4.3.6.1\" style=\"font-size:90%;\">0.35</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T1.4.4.3.7\"><span class=\"ltx_text\" id=\"S4.T1.4.4.3.7.1\" style=\"font-size:90%;\">0.30</span></td>\n</tr>\n</tbody>\n</table>\n</figure>",
38
+ "perturb_sentence_id": 5,
39
+ "output": {
40
+ "perturbed_statement": "[paragraph id = 5] Increasing the cache capacity from 1,000 to 50,000 blocks boosts the cache hit ratio from 30% to 70%. Further capacity increases show minimal improvement.",
41
+ "perturbed_explanation": "1. The improvement in cache hit ratio primarily stems from the increased availability of cached data blocks when cache capacity is expanded. This facilitates a reduction in cache misses for frequently accessed data. 2. However, the statement suggests an improvement from 30% to 70%, which deviates from the provided data indicating an increase to 50%. This exaggerates the impact of increasing cache capacity, misrepresenting the scale of the improvement."
42
+ }
43
+ },
44
+ {
45
+ "path": "table_paper/2407.00079v3.json",
46
+ "table_id": "2",
47
+ "section": "8.1",
48
+ "all_context": [
49
+ "This section evaluates the end-to-end performance of Mooncake under different datasets and various workloads.",
50
+ "As stated before, to protect proprietary information and facilitate reproducibility, all the experimental results reported in this paper are based on a dummy model that follows the same architecture as LLaMA2-70B.",
51
+ "Testbed During the experiments, the system was deployed on a high-performance computing node cluster to test performance.",
52
+ "Each node in the cluster is configured as follows: 8 NVIDIA-A800-SXM4-80GB GPUs, each with 80GB HBM, connected by NVLINK; equipped with RDMA network cards that support up to 800 Gbps of interconnect bandwidth between nodes.",
53
+ "Each node deploys either a prefill instance or a decoding instance according to the startup parameter.",
54
+ "Dataset and Workload Building upon previous research [15 , 8 , 14 ], we selected or designed the datasets as outlined in Table 2 .",
55
+ "In addition to utilizing public datasets, we generated a batch of simulated data featuring predefined lengths and prefix cache ratios for our experiments.",
56
+ "To examine performance in real-world scenarios, we constructed a dataset consisting of 23,000 real request traces, each annotated with an arrival timestamp.",
57
+ "Experiments involving real request traces were conducted by replaying these requests according to their actual arrival times.",
58
+ "For other scenarios, we simulated requests using a Poisson arrival process and controlled the request rate through RPS (Requests per Second).",
59
+ "Metric In the experiments, we focus on the throughput performance of various systems under defined SLOs.",
60
+ "We measure the TTFT and TBT across different RPS rates, where a higher RPS signifies improved throughput.",
61
+ "To assess whether the majority of requests satisfy the SLOs, we use the 90th percentile (P90) values of TTFT and TBT as the ultimate metrics.",
62
+ "As mentioned in §2 , the thresholds for TTFT and TBT are set by multiplying the lowest observed RPS values by factors of 10 and 5, respectively.",
63
+ "Exceeding these thresholds indicates a failure to meet the SLOs and the corresponding consumed resources are considered as wasted.",
64
+ "For ease of comparison, we normalize all TTFT and TBT values against these upper limits, establishing a baseline of 1.0.",
65
+ "Baseline We employ vLLM, one of the state-of-the-art open-source LLM serving systems, as our experimental baseline.",
66
+ "vLLM incorporates continuous batching and PagedAttention technologies, significantly boosting inference throughput.",
67
+ "Despite its strengths, vLLM's design, which couples the prefill and decoding stages of inference requests, can cause disruptions during decoding in scenarios involving long contexts.",
68
+ "ArXiv Summarization L-Eval",
69
+ ""
70
+ ],
71
+ "target_context_ids": [
72
+ 5,
73
+ 6,
74
+ 7,
75
+ 8,
76
+ 9
77
+ ],
78
+ "selected_paragraphs": [
79
+ "[paragraph id = 5] Dataset and Workload Building upon previous research [15 , 8 , 14 ], we selected or designed the datasets as outlined in Table 2 .",
80
+ "[paragraph id = 6] In addition to utilizing public datasets, we generated a batch of simulated data featuring predefined lengths and prefix cache ratios for our experiments.",
81
+ "[paragraph id = 7] To examine performance in real-world scenarios, we constructed a dataset consisting of 23,000 real request traces, each annotated with an arrival timestamp.",
82
+ "[paragraph id = 8] Experiments involving real request traces were conducted by replaying these requests according to their actual arrival times.",
83
+ "[paragraph id = 9] For other scenarios, we simulated requests using a Poisson arrival process and controlled the request rate through RPS (Requests per Second)."
84
+ ],
85
+ "table_html": "<figure class=\"ltx_table\" id=\"S8.T2\">\n<figcaption class=\"ltx_caption ltx_centering\" style=\"font-size:90%;\"><span class=\"ltx_tag ltx_tag_table\">Table 2: </span>Datasets used in the end-to-end experiment.</figcaption>\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S8.T2.4\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S8.T2.4.1.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S8.T2.4.1.1.1\"><span class=\"ltx_text\" id=\"S8.T2.4.1.1.1.1\" style=\"font-size:90%;\">Dataset</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S8.T2.4.1.1.2\"><span class=\"ltx_text\" id=\"S8.T2.4.1.1.2.1\" style=\"font-size:90%;\">Avg Input Length</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S8.T2.4.1.1.3\"><span class=\"ltx_text\" id=\"S8.T2.4.1.1.3.1\" style=\"font-size:90%;\">Avg Output Length</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S8.T2.4.1.1.4\"><span class=\"ltx_text\" id=\"S8.T2.4.1.1.4.1\" style=\"font-size:90%;\">Cache Ratio</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S8.T2.4.1.1.5\"><span class=\"ltx_text\" id=\"S8.T2.4.1.1.5.1\" style=\"font-size:90%;\">Arrival Pattern</span></th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S8.T2.4.2.1\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S8.T2.4.2.1.1\">\n<span class=\"ltx_text\" id=\"S8.T2.4.2.1.1.1\" style=\"font-size:90%;\">ArXiv Summarization </span><cite class=\"ltx_cite ltx_citemacro_cite\"><span class=\"ltx_text\" id=\"S8.T2.4.2.1.1.2.1\" style=\"font-size:90%;\">[</span><a class=\"ltx_ref\" href=\"https://arxiv.org/html/2407.00079v3#bib.bib26\" title=\"\">26</a><span class=\"ltx_text\" id=\"S8.T2.4.2.1.1.3.2\" style=\"font-size:90%;\">]</span></cite>\n</td>\n<td class=\"ltx_td 
ltx_align_center ltx_border_t\" id=\"S8.T2.4.2.1.2\"><span class=\"ltx_text\" id=\"S8.T2.4.2.1.2.1\" style=\"font-size:90%;\">8088</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S8.T2.4.2.1.3\"><span class=\"ltx_text\" id=\"S8.T2.4.2.1.3.1\" style=\"font-size:90%;\">229</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S8.T2.4.2.1.4\"><span class=\"ltx_text\" id=\"S8.T2.4.2.1.4.1\" style=\"font-size:90%;\">~0%</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S8.T2.4.2.1.5\"><span class=\"ltx_text\" id=\"S8.T2.4.2.1.5.1\" style=\"font-size:90%;\">Poisson Process</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S8.T2.4.3.2\">\n<td class=\"ltx_td ltx_align_center\" id=\"S8.T2.4.3.2.1\">\n<span class=\"ltx_text\" id=\"S8.T2.4.3.2.1.1\" style=\"font-size:90%;\">L-Eval </span><cite class=\"ltx_cite ltx_citemacro_cite\"><span class=\"ltx_text\" id=\"S8.T2.4.3.2.1.2.1\" style=\"font-size:90%;\">[</span><a class=\"ltx_ref\" href=\"https://arxiv.org/html/2407.00079v3#bib.bib27\" title=\"\">27</a><span class=\"ltx_text\" id=\"S8.T2.4.3.2.1.3.2\" style=\"font-size:90%;\">]</span></cite>\n</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S8.T2.4.3.2.2\"><span class=\"ltx_text\" id=\"S8.T2.4.3.2.2.1\" style=\"font-size:90%;\">19019</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S8.T2.4.3.2.3\"><span class=\"ltx_text\" id=\"S8.T2.4.3.2.3.1\" style=\"font-size:90%;\">72</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S8.T2.4.3.2.4\"><span class=\"ltx_text\" id=\"S8.T2.4.3.2.4.1\" style=\"font-size:90%;\">&gt;80%</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S8.T2.4.3.2.5\"><span class=\"ltx_text\" id=\"S8.T2.4.3.2.5.1\" style=\"font-size:90%;\">Poisson Process</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S8.T2.4.4.3\">\n<td class=\"ltx_td ltx_align_center\" id=\"S8.T2.4.4.3.1\"><span class=\"ltx_text\" id=\"S8.T2.4.4.3.1.1\" style=\"font-size:90%;\">Simulated Data</span></td>\n<td class=\"ltx_td 
ltx_align_center\" id=\"S8.T2.4.4.3.2\"><span class=\"ltx_text\" id=\"S8.T2.4.4.3.2.1\" style=\"font-size:90%;\">16k, 32k, 64k, 128k</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S8.T2.4.4.3.3\"><span class=\"ltx_text\" id=\"S8.T2.4.4.3.3.1\" style=\"font-size:90%;\">512</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S8.T2.4.4.3.4\"><span class=\"ltx_text\" id=\"S8.T2.4.4.3.4.1\" style=\"font-size:90%;\">50%</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S8.T2.4.4.3.5\"><span class=\"ltx_text\" id=\"S8.T2.4.4.3.5.1\" style=\"font-size:90%;\">Poisson Process</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S8.T2.4.5.4\">\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S8.T2.4.5.4.1\"><span class=\"ltx_text\" id=\"S8.T2.4.5.4.1.1\" style=\"font-size:90%;\">Real Data</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S8.T2.4.5.4.2\"><span class=\"ltx_text\" id=\"S8.T2.4.5.4.2.1\" style=\"font-size:90%;\">7955</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S8.T2.4.5.4.3\"><span class=\"ltx_text\" id=\"S8.T2.4.5.4.3.1\" style=\"font-size:90%;\">194</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S8.T2.4.5.4.4\"><span class=\"ltx_text\" id=\"S8.T2.4.5.4.4.1\" style=\"font-size:90%;\">~50%</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S8.T2.4.5.4.5\"><span class=\"ltx_text\" id=\"S8.T2.4.5.4.5.1\" style=\"font-size:90%;\">Timestamp-based</span></td>\n</tr>\n</tbody>\n</table>\n</figure>",
86
+ "perturb_sentence_id": 5,
87
+ "output": {
88
+ "perturbed_statement": "[paragraph id = 5] Dataset and Workload Building upon previous research [15 , 8 , 14 ], we selected or designed the datasets as outlined in Table 4. In addition to utilizing non-public datasets, we generated a batch of simulated data featuring predefined lengths and prefix cache ratios for our tests.",
89
+ "perturbed_explanation": "The original statement provides an accurate depiction of the dataset preparation process: 1. The datasets were outlined in Table 2, and public datasets were utilized in addition to a generated batch of simulated data. The altered statement introduces inaccuracies: 1. It incorrectly specifies Table 4 instead of Table 2 as the reference for the dataset outline. 2. It claims non-public datasets were utilized, which contradicts the original statement that mentions public datasets."
90
+ }
91
+ }
92
+ ]
table_result/2407.00082v1_output.json ADDED
The diff for this file is too large to render. See raw diff
 
table_result/2407.00085v1_output.json ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "path": "table_paper/2407.00085v1.json",
4
+ "table_id": "1",
5
+ "section": "4.2",
6
+ "all_context": [
7
+ "For automotive sales modeling, (Varian and Choi, 2009 ) was one of the first to show the value of using Google Search data in predicting current economic activity.",
8
+ "Our approach further leverages the information present in search queries to increase the accuracy of the nowcast prediction from accounting for 58% of variance when we use classification metrics to 75% using our search embeddings, a 30% improvement in model accuracy.",
9
+ "Much of the remaining unexplained variance is due to monthly and quarterly cycles in the data.",
10
+ "When the data is rolled up to monthly blocks as reported in (Varian and Choi, 2009 ) our model accounts for 91% of variation in the test set.",
11
+ "Our model doesn t use historical sales or other external variables in our model, and the fit metrics reported are and MAPE in order to be consistent with the literature.",
12
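+ "Our model doesn't use historical sales or other external variables in our model, and the fit metrics reported are R2 and MAPE in order to be consistent with the literature.",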
13
+ "We used overall US Auto Sales and trained the model at the weekly level across 16 regions, rolling our predictions up to national.",
14
+ "The search data includes over ten million distinct queries that are vehicle-related.",
15
+ "The model uses both regional and week-of-the-month features.",
16
+ "The regional features are included in the probability model to account for regional differences in both search adoption and search behavior across regions.",
17
+ "The model is trained across nearly two years of data and the fit metric is reported over the test set, a further 6 months of data.",
18
+ "The model is trained with a two week lag between search and sales, an interesting area for future research would be the impact of varying lags, as (Moller et al., 2023 ) does for the housing market.",
19
+ "Figure 4 highlights the fit of the search embeddings CoSMo model using a four week rolling average.",
20
+ "The US auto sales data that we use in this paper is based on registration data, and has large spikes at the end of the month as well as end of quarter.",
21
+ "The large improvement in fit by using four week rolling average suggests that this monthly cycle is likely a supply-side effect as opposed to reflective of demand patterns.",
22
+ "At the monthly level the model has an R2 of 0.91, and 3.03 MAPE in the test period.",
23
+ "This fit is remarkable given that the model doesn t include any annual seasonality controls, or historical sales.",
24
+ "As a point of reference the linear model in (Varian and Choi, 2009 ) returns a monthly R2 of 0.79 over the training data using both lagged sales and Google Trends.",
25
+ "While automotive sales are used in this paper, we expect that our approach can be used to greatly improve nowcasts across economic indicators.",
26
+ "In the next section we show how the model can accurately predict flu rates, and show the sensitivity of the model to model specifications.",
27
+ ""
28
+ ],
29
+ "target_context_ids": [
30
+ 5,
31
+ 6,
32
+ 7,
33
+ 8,
34
+ 9,
35
+ 10,
36
+ 11,
37
+ 15,
38
+ 16,
39
+ 17
40
+ ],
41
+ "selected_paragraphs": [
42
+ "[paragraph id = 5] Table 1 shows the results from modelling U.S. Auto Sales.",
43
+ "[paragraph id = 6] We used overall US Auto Sales and trained the model at the weekly level across 16 regions, rolling our predictions up to national.",
44
+ "[paragraph id = 7] The search data includes over ten million distinct queries that are vehicle-related.",
45
+ "[paragraph id = 8] The model uses both regional and week-of-the-month features.",
46
+ "[paragraph id = 9] The regional features are included in the probability model to account for regional differences in both search adoption and search behavior across regions.",
47
+ "[paragraph id = 10] The model is trained across nearly two years of data and the fit metric is reported over the test set, a further 6 months of data.",
48
+ "[paragraph id = 11] The model is trained with a two week lag between search and sales, an interesting area for future research would be the impact of varying lags, as (Moller et al., 2023 ) does for the housing market.",
49
+ "[paragraph id = 15] At the monthly level the model has an R2 of 0.91, and 3.03 MAPE in the test period.",
50
+ "[paragraph id = 16] This fit is remarkable given that the model doesn t include any annual seasonality controls, or historical sales.",
51
+ "[paragraph id = 17] As a point of reference the linear model in (Varian and Choi, 2009 ) returns a monthly R2 of 0.79 over the training data using both lagged sales and Google Trends."
52
+ ],
53
+ "table_html": "<figure class=\"ltx_table\" id=\"S4.T1\">\n<table class=\"ltx_tabular ltx_guessed_headers ltx_align_middle\" id=\"S4.T1.3\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S4.T1.3.3\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r\" id=\"S4.T1.3.3.4\">Frequency</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r\" id=\"S4.T1.3.3.5\">Embedding</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r\" id=\"S4.T1.3.3.6\">Model</th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_border_r\" id=\"S4.T1.2.2.2\">Test R<sup class=\"ltx_sup\" id=\"S4.T1.2.2.2.1\">2</sup> \n</th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column\" id=\"S4.T1.3.3.3\">Test MAPE (%) \n</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S4.T1.3.4.1\">\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.3.4.1.1\">Weekly</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.3.4.1.2\">Categorical</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T1.3.4.1.3\">Lasso</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r ltx_border_t\" id=\"S4.T1.3.4.1.4\">0.5869</td>\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S4.T1.3.4.1.5\">10.90</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.3.5.2\">\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T1.3.5.2.1\">Weekly</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T1.3.5.2.2\">Categorical</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T1.3.5.2.3\">CoSMo</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T1.3.5.2.4\">0.5381</td>\n<td class=\"ltx_td ltx_align_left\" id=\"S4.T1.3.5.2.5\">10.85</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.3.6.3\">\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T1.3.6.3.1\">Weekly</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" 
id=\"S4.T1.3.6.3.2\">SLaM</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T1.3.6.3.3\">CoSMo</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T1.3.6.3.4\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T1.3.6.3.4.1\">0.7486</span></td>\n<td class=\"ltx_td ltx_align_left\" id=\"S4.T1.3.6.3.5\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T1.3.6.3.5.1\">7.12</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.3.7.4\">\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T1.3.7.4.1\">Monthly</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T1.3.7.4.2\">SLaM</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T1.3.7.4.3\">CoSMo</td>\n<td class=\"ltx_td ltx_align_left ltx_border_r\" id=\"S4.T1.3.7.4.4\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T1.3.7.4.4.1\">0.9065</span></td>\n<td class=\"ltx_td ltx_align_left\" id=\"S4.T1.3.7.4.5\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T1.3.7.4.5.1\">3.03</span></td>\n</tr>\n</tbody>\n</table>\n<figcaption class=\"ltx_caption\"><span class=\"ltx_tag ltx_tag_table\">Table 1. </span>Baseline Regional Auto Models with search and indicator multipliers - fit metrics reported at the national level.</figcaption>\n</figure>",
54
+ "perturb_sentence_id": 17,
55
+ "output": {
56
+ "perturbed_statement": "[paragraph id = 17] As a point of reference the linear model in (Varian and Choi, 2009 ) returns a monthly R2 of 0.82 over the training data using both lagged sales and Google Trends. While automotive sales are used in this paper, we expect that our approach can be used to greatly improve nowcasts across economic indicators.",
57
+ "perturbed_explanation": "The statement contains an altered value for the reported monthly R2. 1. The original statement specifies that the model by Varian and Choi achieved a monthly R2 of 0.79 over the training data. 2. The altered statement incorrectly changes this to 0.82, which misrepresents the findings reported. Thus, the corrected information should be reinstated to accurately reflect the source material."
58
+ }
59
+ },
60
+ {
61
+ "path": "table_paper/2407.00085v1.json",
62
+ "table_id": "2",
63
+ "section": "4.3",
64
+ "all_context": [
65
+ "For benchmarking experiments, we model Influenza-Like-Illness (ILI) rates from the CDC (CDC, 2024 ) at the national level, like (Lampos et al., 2015 ).",
66
+ "Due to data availability, we are unable to compare our model on the same time frames as in previous work.",
67
+ "Instead, we use data from 2019 until 2022 for training and validation data, and we estimate the flu rates for the 2022-2023 flu season as the test period.",
68
+ "In (Lampos et al., 2015 ) the Pearson correlation coefficient and the Mean Absolute Percentage Error are provided for multiple flu seasons from 2008 until 2013; for the methods we implemented, we report the average values across 5 trials.",
69
+ "We provide the best and worst performances of previous methods in (Lampos et al., 2015 ) to benchmark our approach.",
70
+ "In previous works, it is unclear how the model s hyperparameters were selected.",
71
+ "We report the test metrics of our approach using the model whose average validation MAPE was lowest; for benchmarking purposes, we also report the model with the best test MAPE.",
72
+ "Additionally, we compare our modeling approach to more typical methods such as logistic regression and multi-layer perceptron (MLP) neural networks, which have a history of modeling success but do not have the regularizing structural components of our approach.",
73
+ "For logistic regression, we found the model to work better without search volume, and only use the normalized search embeddings.",
74
+ "All methods include L1 regularization.",
75
+ "We include about two million cold & flu related terms for our search embeddings.",
76
+ "Figure 3 shows our model s predicted values for a few years during both training and testing.",
77
+ "Our model, which only uses data from search to estimate of the flu rates of a given week, is able to closely estimate the actual flu rates for a new flu season despite not using lagged flu rate data in its estimates like autoregressive models.",
78
+ "Table 2 shows the results from modeling the U.S. ILI rates at the national level.",
79
+ "We can see that CoSMo outperforms other methods which only use search data.",
80
+ "The autoregressive (AR) entries in Table 2 represent methods that include either a 1-week or 2-week lag of the most recent ILI rate.",
81
+ "Our method is generally on par or better than the best AR approaches.",
82
+ ""
83
+ ],
84
+ "target_context_ids": [
85
+ 13,
86
+ 14,
87
+ 15,
88
+ 16
89
+ ],
90
+ "selected_paragraphs": [
91
+ "[paragraph id = 13] Table 2 shows the results from modeling the U.S. ILI rates at the national level.",
92
+ "[paragraph id = 14] We can see that CoSMo outperforms other methods which only use search data.",
93
+ "[paragraph id = 15] The autoregressive (AR) entries in Table 2 represent methods that include either a 1-week or 2-week lag of the most recent ILI rate.",
94
+ "[paragraph id = 16] Our method is generally on par or better than the best AR approaches."
95
+ ],
96
+ "table_html": "<figure class=\"ltx_table\" id=\"S4.T2\">\n<table class=\"ltx_tabular ltx_guessed_headers ltx_align_middle\" id=\"S4.T2.10\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S4.T2.3.3\">\n<th class=\"ltx_td ltx_th ltx_th_column ltx_th_row ltx_border_r\" id=\"S4.T2.3.3.4\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r\" id=\"S4.T2.1.1.1\">Test MAPE(%) \n</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column\" id=\"S4.T2.3.3.3\">Test \n</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S4.T2.4.4\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r ltx_border_t\" id=\"S4.T2.4.4.2\">Logistic Regression</th>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T2.4.4.1\">24.9 0.1</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.4.4.3\">.98</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.5.5\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T2.5.5.2\">MLP</th>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T2.5.5.1\">7.3 1.5</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.5.5.3\">.99</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.10.11.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r ltx_border_t\" id=\"S4.T2.10.11.1.1\">Google Flu Trends <cite class=\"ltx_cite ltx_citemacro_citep\">(Lampos et al<span class=\"ltx_text\">.</span>, <a class=\"ltx_ref\" href=\"https://arxiv.org/html/2407.00085v1#bib.bib18\" title=\"\">2015</a>)</cite>\n</th>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T2.10.11.1.2\">[9.5 - 33.1]</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.10.11.1.3\">[.66 - .97]</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.10.12.2\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T2.10.12.2.1\">Elastic Net <cite class=\"ltx_cite ltx_citemacro_citep\">(Lampos et al<span 
class=\"ltx_text\">.</span>, <a class=\"ltx_ref\" href=\"https://arxiv.org/html/2407.00085v1#bib.bib18\" title=\"\">2015</a>)</cite>\n</th>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T2.10.12.2.2\">[9.8 - 15.1]</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.10.12.2.3\">[.92 - .99]</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.10.13.3\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T2.10.13.3.1\">Guassian Process <cite class=\"ltx_cite ltx_citemacro_citep\">(Lampos et al<span class=\"ltx_text\">.</span>, <a class=\"ltx_ref\" href=\"https://arxiv.org/html/2407.00085v1#bib.bib18\" title=\"\">2015</a>)</cite>\n</th>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T2.10.13.3.2\">[9.4 - 14.6]</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.10.13.3.3\">[.94 - .99]</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.10.14.4\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r ltx_border_t\" id=\"S4.T2.10.14.4.1\">AR <cite class=\"ltx_cite ltx_citemacro_citep\">(Lampos et al<span class=\"ltx_text\">.</span>, <a class=\"ltx_ref\" href=\"https://arxiv.org/html/2407.00085v1#bib.bib18\" title=\"\">2015</a>)</cite>\n</th>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T2.10.14.4.2\">[6.7 - 14.3]</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.10.14.4.3\">[.88 - .98]</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.10.15.5\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T2.10.15.5.1\">AR+Google Flu Trends <cite class=\"ltx_cite ltx_citemacro_citep\">(Lampos et al<span class=\"ltx_text\">.</span>, <a class=\"ltx_ref\" href=\"https://arxiv.org/html/2407.00085v1#bib.bib18\" title=\"\">2015</a>)</cite>\n</th>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T2.10.15.5.2\">[6.2 - 12.5]</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.10.15.5.3\">[.88 - .99]</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.6.6\">\n<th 
class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T2.6.6.2\">AR+Elastic Net <cite class=\"ltx_cite ltx_citemacro_citep\">(Lampos et al<span class=\"ltx_text\">.</span>, <a class=\"ltx_ref\" href=\"https://arxiv.org/html/2407.00085v1#bib.bib18\" title=\"\">2015</a>)</cite>\n</th>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T2.6.6.3\">[5.1 - 8.7]</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.6.6.1\">[.93 - ]</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.7.7\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T2.7.7.2\">AR+Guassian Process <cite class=\"ltx_cite ltx_citemacro_citep\">(Lampos et al<span class=\"ltx_text\">.</span>, <a class=\"ltx_ref\" href=\"https://arxiv.org/html/2407.00085v1#bib.bib18\" title=\"\">2015</a>)</cite>\n</th>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T2.7.7.3\">[5.0 - 8.6]</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.7.7.1\">[.93 - ]</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.8.8\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r ltx_border_t\" id=\"S4.T2.8.8.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.8.8.2.1\">CoSMo (Ours)</span></th>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S4.T2.8.8.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.8.8.1.1\">5.5 0.4</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.8.8.3\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.8.8.3.1\">.99</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.10.10\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T2.10.10.3\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.10.10.3.1\">CoSMo (Ours, Test selection)</span></th>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S4.T2.9.9.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.9.9.1.1\">3.9 0.1</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.10.10.2\">\n<span class=\"ltx_text 
ltx_font_bold\" id=\"S4.T2.10.10.2.1\">1</span>\n</td>\n</tr>\n</tbody>\n</table>\n<figcaption class=\"ltx_caption\"><span class=\"ltx_tag ltx_tag_table\">Table 2. </span>Benchmarking ILI flu rate prediction at the national level. We show the standard deviation of MAPE for our experiments; we omit this metric for the Pearson coefficient because it was close to zero for all experiments.</figcaption>\n</figure>",
97
+ "perturb_sentence_id": 15,
98
+ "output": {
99
+ "perturbed_statement": "[paragraph id = 15] The autoregressive (AR) entries in Table 2 represent methods that include a 3-week lag of the most recent ILI rate. Our method is generally inferior to the best AR approaches.",
100
+ "perturbed_explanation": "1. The original context states that the autoregressive (AR) entries in Table 2 include either a 1-week or 2-week lag of the most recent ILI rate, and that our method is generally on par with or better than the best AR approaches. 2. The statement incorrectly claims that the AR methods use a 3-week lag, which conflicts with the lags described in the context, and asserts that our approach is generally inferior to the best AR approaches, which contradicts the context's claim that it is on par or better."
101
+ }
102
+ },
103
+ {
104
+ "path": "table_paper/2407.00085v1.json",
105
+ "table_id": "5",
106
+ "section": "4.5",
107
+ "all_context": [
108
+ "We analyze the capability of our model to go from child-geography to parent-geography predictions and vice versa.",
109
+ "Training a model on parent-level (e.g,.",
110
+ "country) data, then evaluating on child-level (e.g., State) is common when child-level data is either missing or never collect, while training a model at the child-level and making parent-level predictions is useful when it s believed that the increased number of child-geo datapoints will help the model fit.",
111
+ "We use two versions of the best flu models: a no-volume national-level model and a no-volume state-level model.",
112
+ "The national-level model was trained on national-level targets using national-level search embeddings, but inference was done using state-level search embeddings and evaluated on state-level targets; vice versa for the state-level model.",
113
+ "The results are shown in Table 5 .",
114
+ "The model has a surprising capability to infer with some success (.78 ) state-level flu rates, in the test period, without ever being trained on state-level targets.",
115
+ "The zero-shot inference performs better in the opposite direction, (.99 ), perhaps leveraging the greater number of training examples and taking advantage of the easier task of national modeling.",
116
+ ""
117
+ ],
118
+ "target_context_ids": [
119
+ 0,
120
+ 1,
121
+ 2,
122
+ 3,
123
+ 4,
124
+ 5,
125
+ 6,
126
+ 7
127
+ ],
128
+ "selected_paragraphs": [
129
+ "[paragraph id = 0] We analyze the capability of our model to go from child-geography to parent-geography predictions and vice versa.",
130
+ "[paragraph id = 1] Training a model on parent-level (e.g,.",
131
+ "[paragraph id = 2] country) data, then evaluating on child-level (e.g., State) is common when child-level data is either missing or never collect, while training a model at the child-level and making parent-level predictions is useful when it s believed that the increased number of child-geo datapoints will help the model fit.",
132
+ "[paragraph id = 3] We use two versions of the best flu models: a no-volume national-level model and a no-volume state-level model.",
133
+ "[paragraph id = 4] The national-level model was trained on national-level targets using national-level search embeddings, but inference was done using state-level search embeddings and evaluated on state-level targets; vice versa for the state-level model.",
134
+ "[paragraph id = 5] The results are shown in Table 5 .",
135
+ "[paragraph id = 6] The model has a surprising capability to infer with some success (.78 ) state-level flu rates, in the test period, without ever being trained on state-level targets.",
136
+ "[paragraph id = 7] The zero-shot inference performs better in the opposite direction, (.99 ), perhaps leveraging the greater number of training examples and taking advantage of the easier task of national modeling."
137
+ ],
138
+ "table_html": "<figure class=\"ltx_table\" id=\"S4.T5\">\n<table class=\"ltx_tabular ltx_guessed_headers ltx_align_middle\" id=\"S4.T5.10\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S4.T5.2.2\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_th_row ltx_border_r\" id=\"S4.T5.2.2.3\">Training Data</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_th_row ltx_border_r\" id=\"S4.T5.2.2.4\">Eval Data</th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column\" id=\"S4.T5.1.1.1\">Test MAPE(%)\n</th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column\" id=\"S4.T5.2.2.2\">Test \n</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S4.T5.4.4\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_r ltx_border_t\" id=\"S4.T5.4.4.3\">State</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_r ltx_border_t\" id=\"S4.T5.4.4.4\">State</th>\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S4.T5.3.3.1\"></td>\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S4.T5.4.4.2\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T5.6.6\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_r\" id=\"S4.T5.6.6.3\">National</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_r\" id=\"S4.T5.6.6.4\">State</th>\n<td class=\"ltx_td ltx_align_left\" id=\"S4.T5.5.5.1\"></td>\n<td class=\"ltx_td ltx_align_left\" id=\"S4.T5.6.6.2\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T5.8.8\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_r ltx_border_t\" id=\"S4.T5.8.8.3\">National</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_r ltx_border_t\" id=\"S4.T5.8.8.4\">National</th>\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S4.T5.7.7.1\"></td>\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S4.T5.8.8.2\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T5.10.10\">\n<th class=\"ltx_td ltx_align_center ltx_th 
ltx_th_row ltx_border_r\" id=\"S4.T5.10.10.3\">State</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_r\" id=\"S4.T5.10.10.4\">National</th>\n<td class=\"ltx_td ltx_align_left\" id=\"S4.T5.9.9.1\"></td>\n<td class=\"ltx_td ltx_align_left\" id=\"S4.T5.10.10.2\"></td>\n</tr>\n</tbody>\n</table>\n<figcaption class=\"ltx_caption\"><span class=\"ltx_tag ltx_tag_table\">Table 5. </span>Zero-shot evaluation for Flu ILI rate prediction. The zero-shot examples are the rows where there is a mismatch between the Training Data column and the Eval Data column. The rows with alignment serve as comparison points. </figcaption>\n</figure>",
139
+ "perturb_sentence_id": 6,
140
+ "output": {
141
+ "perturbed_statement": "[paragraph id = 6] The model has a surprising capability to infer with some success (.78) national-level flu rates, in the test period, without ever being trained on national-level targets. The zero-shot inference performs better in the opposite direction, (.92), perhaps leveraging the greater number of training examples and taking advantage of the easier task of state-level modeling.",
142
+ "perturbed_explanation": "1. The original context states that the model was able to infer state-level flu rates without ever being trained on state-level targets (zero-shot inference), achieving a performance metric of 0.78 during the test period, while the opposite direction achieved an improved metric of 0.99 by leveraging more training data and the simpler nature of modeling at the national level. 2. The statement incorrectly claims that the inference pertains to national-level flu rates without training on national-level targets, alters the opposite-direction metric (0.92 rather than the correct 0.99), and attributes the improvement to the easier task of state-level rather than national modeling. These assertions differ from the given context and thus render the statement factually inaccurate."
143
+ }
144
+ },
145
+ {
146
+ "path": "table_paper/2407.00085v1.json",
147
+ "table_id": "6",
148
+ "section": "4.6",
149
+ "all_context": [
150
+ "In addition to the MLSE embeddings (Yang et al., 2019 ), we look at variants of the T5 (Raffel et al., 2020 ) LLM, the sentence-T5 (sT5) (Ni et al., 2021 ), a version of T5 that outputs a fixed-length 768-dimensional vector for every input sequence 666Our method requires that the LM output a D-dimensional vector that is not dependent on the input shape.",
151
+ "Unfortunately, many LMs have outputs with shape where is the number of input tokens.",
152
+ "In order to study many other LMs using our method, such as mT5, we would need to first map the LM output to a fixed-length vector.",
153
+ "Potential options are using the output associated with the ¡BOS¿ token, or averaging across the sequence length dimension.",
154
+ "We leave these experiments to future work.. We study the effect of using these embeddings on the the national Flu ILI prediction tasks.",
155
+ "Table 6 shows the results from using different search embeddings created using the sT5 Base (110M parameters) and sT5 Large (335M parameters) models.",
156
+ "Surprisingly, larger capacity models like sT5 Base and sT5 Large do not outperform the smaller capacity MLSE model.",
157
+ "We believe this has to do with sT5 models being trained on only the English language.",
158
+ "The MLSE model being a multi-lingual model is able to make better use of the multiple languages present in the search data, where as the sT5 models are unable accurately map the meanings of these queries.",
159
+ "We validate this by generating search embeddings using only English queries and training models on these English-only search embeddings.",
160
+ "These results are shown in Table 6 .",
161
+ "We can see that the sT5 models perform similar to their all-language counter parts, where as performance for MLSE considerable lowers.",
162
+ "We leave further studies to future work.",
163
+ ""
164
+ ],
165
+ "target_context_ids": [
166
+ 5,
167
+ 6,
168
+ 7,
169
+ 8,
170
+ 9,
171
+ 10,
172
+ 11
173
+ ],
174
+ "selected_paragraphs": [
175
+ "[paragraph id = 5] Table 6 shows the results from using different search embeddings created using the sT5 Base (110M parameters) and sT5 Large (335M parameters) models.",
176
+ "[paragraph id = 6] Surprisingly, larger capacity models like sT5 Base and sT5 Large do not outperform the smaller capacity MLSE model.",
177
+ "[paragraph id = 7] We believe this has to do with sT5 models being trained on only the English language.",
178
+ "[paragraph id = 8] The MLSE model being a multi-lingual model is able to make better use of the multiple languages present in the search data, where as the sT5 models are unable accurately map the meanings of these queries.",
179
+ "[paragraph id = 9] We validate this by generating search embeddings using only English queries and training models on these English-only search embeddings.",
180
+ "[paragraph id = 10] These results are shown in Table 6 .",
181
+ "[paragraph id = 11] We can see that the sT5 models perform similar to their all-language counter parts, where as performance for MLSE considerable lowers."
182
+ ],
183
+ "table_html": "<figure class=\"ltx_table\" id=\"S4.T6\">\n<table class=\"ltx_tabular ltx_guessed_headers ltx_align_middle\" id=\"S4.T6.14\">\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S4.T6.2.2\">\n<th class=\"ltx_td ltx_th ltx_th_row ltx_border_r\" id=\"S4.T6.2.2.3\"></th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T6.1.1.1\">Test MAPE(%)\n</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T6.2.2.2\">Test \n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T6.4.4\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r ltx_border_t\" id=\"S4.T6.4.4.3\">MLSE (baseline)</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T6.3.3.1\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T6.4.4.2\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T6.6.6\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r ltx_border_t\" id=\"S4.T6.6.6.3\">sT5 Base</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T6.5.5.1\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T6.6.6.2\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T6.8.8\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T6.8.8.3\">sT5 Large</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T6.7.7.1\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T6.8.8.2\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T6.10.10\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r ltx_border_t\" id=\"S4.T6.10.10.3\">MLSE (English only)</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T6.9.9.1\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T6.10.10.2\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T6.12.12\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T6.12.12.3\">sT5 Base (English only)</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T6.11.11.1\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T6.12.12.2\"></td>\n</tr>\n<tr 
class=\"ltx_tr\" id=\"S4.T6.14.14\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T6.14.14.3\">sT5 Large (English only)</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T6.13.13.1\"></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T6.14.14.2\"></td>\n</tr>\n</tbody>\n</table>\n<figcaption class=\"ltx_caption\"><span class=\"ltx_tag ltx_tag_table\">Table 6. </span>National ILI rate modeling results from using different embedding functions from a variety of LMs.</figcaption>\n</figure>",
184
+ "perturb_sentence_id": 5,
185
+ "output": {
186
+ "perturbed_statement": "[paragraph id = 5] Table 6 shows the results from using different search embeddings created using the MLSE model. Surprisingly, larger capacity models like sT5 Base (110M parameters) and sT5 Large (335M parameters) significantly outperform the oversized MLSE model.",
187
+ "perturbed_explanation": "Table 6 shows the results from using different search embeddings created using the sT5 Base (110M parameters) and sT5 Large (335M parameters) models. Surprisingly, larger capacity models like sT5 Base and sT5 Large do not outperform the smaller capacity MLSE model. 1. These results imply that larger model capacity does not necessarily correlate with superior performance when compared to smaller, diversified models like MLSE. 2. Contrary to the information provided, the statement suggests the sT5 models notably outperform the MLSE model, which conflicts with the observed outcomes where MLSE's performances are competitive or better in multilingual contexts. Therefore, the revised statement introduces a factual inconsistency."
188
+ }
189
+ }
190
+ ]
table_result/2407.00087v2_output.json ADDED
The diff for this file is too large to render. See raw diff
 
table_result/2407.00088v1_output.json ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "path": "table_paper/2407.00088v1.json",
4
+ "table_id": "2",
5
+ "section": "5.1",
6
+ "all_context": [
7
+ "As shown in Table 2 , we evaluate T-MAC across four distinct edge devices.",
8
+ "These devices range from high-performance ones like M2-Ultra to less powerful ones like Raspberry Pi.",
9
+ "The CPUs tested encompass Intel Core, Apple Silicon, and Cortex series.",
10
+ "The operating systems include OSX, Linux, and Windows.",
11
+ "This evaluation guarantees T-MAC s cross-platform compatibility and consistent performance across different instruction sets and various edge deployment scenarios.",
12
+ "To evaluate the performance of T-MAC, we conduct extensive benchmarks using real-word low-bit LLMs and scenarios.",
13
+ "For the kernel performance benchmark, we select matrix shapes derived from the Llama-2-7B and Llama-2-13B models, ensuring our evaluation reflects the practical demands.",
14
+ "To conduct an end-to-end throughput test, we employed actual quantized models to demonstrate the practical efficacy of T-MAC across different bit-width configurations.",
15
+ "Specifically, we employ 4-bit,3-bit,2-bit and 1-bit quantized Llama models, and also 1-bit and 1.58bit BitNet models that are trained from scratch.",
16
+ "The 4-bit Llama models are from GPTQ (frantar2022gptq, ).",
17
+ "The 3-bit and 2-bit Llama models are from BitDistiller (du2024bitdistiller, ).",
18
+ "The 1-bit Llama models are from OneBit (xu2024onebit, ).",
19
+ "We compared the performance of T-MAC with llama.cpp, a state-of-the-art implementation for LLM deployment on edge devices.",
20
+ "We chose llama.cpp as the baseline for several compelling reasons.",
21
+ "Firstly, llama.cpp represents the cutting-edge in LLM deployment on edge devices, featuring highly optimized kernel implementations tailored to each hardware platform.",
22
+ "Its versatility and robust performance make it an ideal benchmark for assessing the efficacy of new methodologies.",
23
+ "Additionally, llama.cpp is implemented in plain C/C++ without any dependencies, ensuring maximum compatibility and efficiency across diverse hardware configurations.",
24
+ "For kernel performance benchmarks, we utilized the optimized kernels provided by llama.cpp as the baselines on the respective hardware devices.",
25
+ "In our end-to-end throughput evaluations, we integrate the LUT-based kernels from T-MAC to llama.cpp and compare it with original llama.cpp.",
26
+ "We perform both kernel-level and model-level measurement.",
27
+ "To obtain precise and consistent kernel-level latency on CPU, we first perform a warmup of 10 iterations, followed by 100 runs to calculate an average.",
28
+ "The warmup on M2-Ultra differs slightly from the others, requiring at least 1 second to maximize performance.",
29
+ "To perform model-level latency, we integrate T-MAC into llama.cpp.",
30
+ "We repeatedly generate 64 tokens for 20 iterations to evaluate token generation throughput.",
31
+ ""
32
+ ],
33
+ "target_context_ids": [
34
+ 0,
35
+ 1,
36
+ 2,
37
+ 3,
38
+ 4
39
+ ],
40
+ "selected_paragraphs": [
41
+ "[paragraph id = 0] As shown in Table 2 , we evaluate T-MAC across four distinct edge devices.",
42
+ "[paragraph id = 1] These devices range from high-performance ones like M2-Ultra to less powerful ones like Raspberry Pi.",
43
+ "[paragraph id = 2] The CPUs tested encompass Intel Core, Apple Silicon, and Cortex series.",
44
+ "[paragraph id = 3] The operating systems include OSX, Linux, and Windows.",
45
+ "[paragraph id = 4] This evaluation guarantees T-MAC's cross-platform compatibility and consistent performance across different instruction sets and various edge deployment scenarios."
46
+ ],
47
+ "table_html": "<figure class=\"ltx_table\" id=\"S5.T2\">\n<div class=\"ltx_inline-block ltx_align_center ltx_transformed_outer\" id=\"S5.T2.2\" style=\"width:433.6pt;height:129.4pt;vertical-align:-0.0pt;\"><span class=\"ltx_transformed_inner\" style=\"transform:translate(35.8pt,-10.7pt) scale(1.19793527173856,1.19793527173856) ;\">\n<table class=\"ltx_tabular ltx_guessed_headers ltx_align_middle\" id=\"S5.T2.2.1\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S5.T2.2.1.1.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T2.2.1.1.1.1\" rowspan=\"2\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T2.2.1.1.1.1.1\">Device</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T2.2.1.1.1.2\" rowspan=\"2\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T2.2.1.1.1.2.1\">Processor</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T2.2.1.1.1.3\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T2.2.1.1.1.3.1\">Performance</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T2.2.1.1.1.4\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T2.2.1.1.1.4.1\">Max. 
Memory</span></th>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T2.2.1.2.2\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column\" id=\"S5.T2.2.1.2.2.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T2.2.1.2.2.1.1\">Cores</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column\" id=\"S5.T2.2.1.2.2.2\">\n<span class=\"ltx_text ltx_font_bold\" id=\"S5.T2.2.1.2.2.2.1\">Bandwidth</span> (GB/s)</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S5.T2.2.1.3.1\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T2.2.1.3.1.1\">M2-Ultra</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T2.2.1.3.1.2\">Apple M2-Ultra</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T2.2.1.3.1.3\">16</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T2.2.1.3.1.4\">819.2</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T2.2.1.4.2\">\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T2.2.1.4.2.1\">Raspberry Pi 5</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T2.2.1.4.2.2\">ARM Cortex-A76</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T2.2.1.4.2.3\">4</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T2.2.1.4.2.4\">17.1</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T2.2.1.5.3\">\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T2.2.1.5.3.1\">Jetson AGX Orin</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T2.2.1.5.3.2\">ARM Cortex-A78AE</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T2.2.1.5.3.3\">12</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T2.2.1.5.3.4\">204.8</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T2.2.1.6.4\">\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T2.2.1.6.4.1\">Surface Book 3</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T2.2.1.6.4.2\">Intel Core i5-1035G7</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T2.2.1.6.4.3\">4</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" 
id=\"S5.T2.2.1.6.4.4\">58.2</td>\n</tr>\n</tbody>\n</table>\n</span></div>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\"><span class=\"ltx_text\" id=\"S5.T2.3.1.1\" style=\"font-size:90%;\">Table 2</span>. </span><span class=\"ltx_text\" id=\"S5.T2.4.2\" style=\"font-size:90%;\">Hardware device specification.</span></figcaption>\n</figure>",
48
+ "perturb_sentence_id": 0,
49
+ "output": {
50
+ "perturbed_statement": "[paragraph id = 0] As shown in Table 2 , we evaluate T-MAC across four distinct edge devices. These devices range from high-performance ones like Intel Xeon to less powerful ones like Cortex A357.",
51
+ "perturbed_explanation": "The original explanation highlighted the evaluation of T-MAC across a range of devices, such as the high-performance M2-Ultra and the less powerful Raspberry Pi. Now, addressing the statement: 1. The high-performance evaluation platform mentioned is not an Intel Xeon; instead, it refers to M2-Ultra as per the context. 2. The less powerful device named Cortex A357 is not specified within the context provided in this form. Hence, the modified statement introduces inaccuracies concerning the specific devices discussed for evaluation."
52
+ }
53
+ },
54
+ {
55
+ "path": "table_paper/2407.00088v1.json",
56
+ "table_id": "4",
57
+ "section": "5.6",
58
+ "all_context": [
59
+ "There are two sources of error compared to conventional mpGEMM implementation: (a) table quantization, which is an algorithmic approximation included in our method, and (b) fast aggregation, whose error is introduced during the instruction execution within the fixed CPU architecture.",
60
+ "We evaluate the impact of these two error sources at both kernel-level and model-level.",
61
+ "We use the unquantized GEMV as the benchmark.",
62
+ "The weights and activation of the GEMV are randomly generated FP16 values following a Gaussian Distribution, which are then quantized to 4-bit for execution by llama.cpp and T-MAC.",
63
+ "The Normalized Mean Squared Error (NMSE) is then computed between the ground truth and the mpGEMV outputs.",
64
+ "As shown in Table.",
65
+ "3 , the NMSE difference between llama.cpp and T-MAC is negligible, indicating that the table quantization error is minimal.",
66
+ "However, after applying fast aggregation, the NMSE increases to 2.5.",
67
+ "To examine the impact of these errors on real-world models, we chose Llama-2-7B for testing.",
68
+ "The models are the GGUF model converted from official Llama-2-7B weights for the un-quantized ground truth and the original llama-2-7b.Q4_0.gguf model (gguf-models, ) released with llama.cpp for mpGEMM.",
69
+ "After integrating T-MAC into llama.cpp, we conduct the evaluation through the perplexity (llamacpp-perplexity, ) tool provided by llama.cpp.",
70
+ "The evaluation is performed on three different tasks: WikiText-2 (merity2016pointer, ) and lambada_openai (paperno-etal-2016-lambada, ; radford2019language, ) for perplexity (the lower the better), and WinoGrande (ai2:winogrande, ) for question answering accuracy (the higher the better).",
71
+ "As shown in Table 4 , on all of the three tasks, T-MAC delivers the same results compared to llama.cpp, suggesting that the error introduced by T-MAC is negligible for real-world models.",
72
+ "After toggling on the fast aggregation, the perplexity increases by 0.4 and 1.0 respectively and the accuracy drops by 0.3%.",
73
+ "In summary, T-MAC introduces negligible error to model inference while offering significant speedup.",
74
+ "The fast aggregation can further enhance performance, but at the cost of model quality.",
75
+ "We offer this as an option for users in scenarios that prioritize real-time performance and are less sensitive to accuracy.",
76
+ "Without fast aggregation, T-MAC can still achieve substantial gain according to Figure 10 .",
77
+ "In the future, we anticipate the error introduced by fast aggregation can be mitigated with straightforward optimizations of the CPU micro-architecture.",
78
+ ""
79
+ ],
80
+ "target_context_ids": [
81
+ 11,
82
+ 12,
83
+ 13,
84
+ 14,
85
+ 15
86
+ ],
87
+ "selected_paragraphs": [
88
+ "[paragraph id = 11] The evaluation is performed on three different tasks: WikiText-2 (merity2016pointer, ) and lambada_openai (paperno-etal-2016-lambada, ; radford2019language, ) for perplexity (the lower the better), and WinoGrande (ai2:winogrande, ) for question answering accuracy (the higher the better).",
89
+ "[paragraph id = 12] As shown in Table 4 , on all of the three tasks, T-MAC delivers the same results compared to llama.cpp, suggesting that the error introduced by T-MAC is negligible for real-world models.",
90
+ "[paragraph id = 13] After toggling on the fast aggregation, the perplexity increases by 0.4 and 1.0 respectively and the accuracy drops by 0.3%.",
91
+ "[paragraph id = 14] In summary, T-MAC introduces negligible error to model inference while offering significant speedup.",
92
+ "[paragraph id = 15] The fast aggregation can further enhance performance, but at the cost of model quality."
93
+ ],
94
+ "table_html": "<figure class=\"ltx_table\" id=\"S5.T4\">\n<div class=\"ltx_inline-block ltx_align_center ltx_transformed_outer\" id=\"S5.T4.4\" style=\"width:433.6pt;height:128.2pt;vertical-align:-0.0pt;\"><span class=\"ltx_transformed_inner\" style=\"transform:translate(34.1pt,-10.1pt) scale(1.18686896846672,1.18686896846672) ;\">\n<table class=\"ltx_tabular ltx_guessed_headers ltx_align_middle\" id=\"S5.T4.4.4\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S5.T4.4.4.5.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_th_row ltx_border_tt\" id=\"S5.T4.4.4.5.1.1\" rowspan=\"2\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T4.4.4.5.1.1.1\">Framework</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T4.4.4.5.1.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T4.4.4.5.1.2.1\">Throughput</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T4.4.4.5.1.3\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T4.4.4.5.1.3.1\">WikiText2</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T4.4.4.5.1.4\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T4.4.4.5.1.4.1\">lambada_openai</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T4.4.4.5.1.5\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T4.4.4.5.1.5.1\">WinoGrande</span></th>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T4.4.4.4\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column\" id=\"S5.T4.1.1.1.1\">Tokens/sec \n</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column\" id=\"S5.T4.2.2.2.2\">PPL \n</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column\" id=\"S5.T4.3.3.3.3\">PPL \n</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column\" id=\"S5.T4.4.4.4.4\">Acc. 
\n</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S5.T4.4.4.6.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_t\" id=\"S5.T4.4.4.6.1.1\">Un-quantized</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.4.4.6.1.2\">3.79</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.4.4.6.1.3\">5.80</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.4.4.6.1.4\">12.65</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.4.4.6.1.5\">71.0</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T4.4.4.7.2\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S5.T4.4.4.7.2.1\">llama.cpp</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.4.4.7.2.2\">5.65</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.4.4.7.2.3\">5.96</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.4.4.7.2.4\">12.95</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.4.4.7.2.5\">70.8</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T4.4.4.8.3\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S5.T4.4.4.8.3.1\">T-MAC</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.4.4.8.3.2\">7.34</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.4.4.8.3.3\">5.96</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.4.4.8.3.4\">12.95</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.4.4.8.3.5\">70.8</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T4.4.4.9.4\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_bb\" id=\"S5.T4.4.4.9.4.1\">T-MAC (+FA)</th>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T4.4.4.9.4.2\">8.97</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T4.4.4.9.4.3\">6.38</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T4.4.4.9.4.4\">13.99</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T4.4.4.9.4.5\">67.8</td>\n</tr>\n</tbody>\n</table>\n</span></div>\n<figcaption class=\"ltx_caption 
ltx_centering\"><span class=\"ltx_tag ltx_tag_table\"><span class=\"ltx_text\" id=\"S5.T4.10.3.1\" style=\"font-size:90%;\">Table 4</span>. </span><span class=\"ltx_text\" id=\"S5.T4.8.2\" style=\"font-size:90%;\">End-to-end throughput and model quality of Llama-2-7B-4bit on M2-Ultra with single-thread. T-MAC improves throughput by 1.3 compared to llama.cpp with the same model quality. Fast Aggregation (FA) can further improve the throughput gain to 1.6, but the model quality will drop because of the numerical error of current CPU instructions.</span></figcaption>\n</figure>",
95
+ "perturb_sentence_id": 11,
96
+ "output": {
97
+ "perturbed_statement": "[paragraph id = 11] The evaluation is performed on four different tasks: WikiText-2 (merity2016pointer, ), lambada_openai (paperno-etal-2016-lambada, ; radford2019language, ), a custom summarization task (smith2020summarization, ), and WinoGrande (ai2:winogrande, ) for question answering accuracy (the higher the better). As shown in Table 4 , on all of these tasks, T-MAC delivers the same results compared to llama.cpp, suggesting that the error introduced by T-MAC is negligible for real-world models.",
98
+ "perturbed_explanation": "The original explanation states: 1. The evaluation mentioned involves only three tasks: WikiText-2, lambada_openai, and WinoGrande, each targeting specific metrics like perplexity or accuracy. 2. Upon reviewing the statement, it inaccurately mentions 'four different tasks' and includes a 'custom summarization task,' which does not align with the outlined tasks. This discrepancy renders the statement factually incorrect."
99
+ }
100
+ },
101
+ {
102
+ "path": "table_paper/2407.00088v1.json",
103
+ "table_id": "5",
104
+ "section": "5.7",
105
+ "all_context": [
106
+ "GPUs are widely used in LLM deployments.",
107
+ "We compare T-MAC on CPU with llama.cpp on GPU to illustrate the efficiency of T-MAC.",
108
+ "llama.cpp is the state-of-the-art LLM inference framework on both CPU and GPU.",
109
+ "Figure 11 shows the mpGEMV kernel performance comparison of T-MAC (CPU) and llama.cpp (GPU) on NVIDIA Jetson AGX Orin, a platform with ARM CPU and NVIDIA CUDA GPU.",
110
+ "The kernel configurations are all from Llama-2-7B.",
111
+ "T-MAC significantly outperforms GPU on W1A16 on all cases, while achieving comparable performance on W2A16 and W3A16.",
112
+ "Although GPU performs better on higher bits and larger shape due to its powerful parallel computing capacity, this evaluation still shows huge potential of CPU-based LLM deployments on edge devices.",
113
+ "Table 5 shows the end-to-end comparison of the Llama-2-7B-2bit model on NVIDIA Jetson AGX Orin.",
114
+ "Without T-MAC, CPU only performs better than GPU in power, however, the energy consumption is still worse than GPU due to lower throughput.",
115
+ "Compared to llama.cpp on CPU, T-MAC not only improves the throughput to 2.2×, but also reduces the power to 69%, resulting in 3.2× energy efficiency.",
116
+ "Compared to llama.cpp on GPU, although T-MAC only achieves 78% throughput, T-MAC only needs 34% power, resulting in 2.3× energy efficiency.",
117
+ "Note that Figure 11 shows T-MAC outperforms the GPU on the mpGEMV kernels.",
118
+ "The reason why the throughput of T-MAC is still lower than that of GPU is due to the performance of kernels except mpGEMVs in llama.cpp on CPU.",
119
+ ""
120
+ ],
121
+ "target_context_ids": [
122
+ 7,
123
+ 8,
124
+ 9,
125
+ 10
126
+ ],
127
+ "selected_paragraphs": [
128
+ "[paragraph id = 7] Table 5 shows the end-to-end comparison of the Llama-2-7B-2bit model on NVIDIA Jetson AGX Orin.",
129
+ "[paragraph id = 8] Without T-MAC, CPU only performs better than GPU in power, however, the energy consumption is still worse than GPU due to lower throughput.",
130
+ "[paragraph id = 9] Compared to llama.cpp on CPU, T-MAC not only improves the throughput to 2.2×, but also reduces the power to 69%, resulting in 3.2× energy efficiency.",
131
+ "[paragraph id = 10] Compared to llama.cpp on GPU, although T-MAC only achieves 78% throughput, T-MAC only needs 34% power, resulting in 2.3× energy efficiency."
132
+ ],
133
+ "table_html": "<figure class=\"ltx_table\" id=\"S5.T5\">\n<div class=\"ltx_inline-block ltx_align_center ltx_transformed_outer\" id=\"S5.T5.2\" style=\"width:355.6pt;height:131.4pt;vertical-align:-0.0pt;\"><span class=\"ltx_transformed_inner\" style=\"transform:translate(56.0pt,-20.7pt) scale(1.46000059019698,1.46000059019698) ;\">\n<table class=\"ltx_tabular ltx_guessed_headers ltx_align_middle\" id=\"S5.T5.2.1\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S5.T5.2.1.1.1\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_column ltx_th_row ltx_border_tt\" id=\"S5.T5.2.1.1.1.1\" rowspan=\"2\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T5.2.1.1.1.1.1\">Framework</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T5.2.1.1.1.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T5.2.1.1.1.2.1\">Throughput</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T5.2.1.1.1.3\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T5.2.1.1.1.3.1\">Power</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T5.2.1.1.1.4\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T5.2.1.1.1.4.1\">Energy</span></th>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T5.2.1.2.2\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column\" id=\"S5.T5.2.1.2.2.1\">Tokens/sec</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column\" id=\"S5.T5.2.1.2.2.2\">W</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column\" id=\"S5.T5.2.1.2.2.3\">J/token</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S5.T5.2.1.3.1\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row ltx_border_t\" id=\"S5.T5.2.1.3.1.1\">llama.cpp (CPU)</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T5.2.1.3.1.2\">7.08</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T5.2.1.3.1.3\">15.0</td>\n<td class=\"ltx_td ltx_align_center 
ltx_border_t\" id=\"S5.T5.2.1.3.1.4\">2.12</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T5.2.1.4.2\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row\" id=\"S5.T5.2.1.4.2.1\">llama.cpp (GPU)</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T5.2.1.4.2.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T5.2.1.4.2.2.1\">20.03</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T5.2.1.4.2.3\">30.8</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T5.2.1.4.2.4\">1.54</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T5.2.1.5.3\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row ltx_border_bb\" id=\"S5.T5.2.1.5.3.1\">T-MAC (CPU)</th>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T5.2.1.5.3.2\">15.62</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T5.2.1.5.3.3\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T5.2.1.5.3.3.1\">10.4</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T5.2.1.5.3.4\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T5.2.1.5.3.4.1\">0.66</span></td>\n</tr>\n</tbody>\n</table>\n</span></div>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\"><span class=\"ltx_text\" id=\"S5.T5.3.1.1\" style=\"font-size:90%;\">Table 5</span>. </span><span class=\"ltx_text\" id=\"S5.T5.4.2\" style=\"font-size:90%;\">Llama-2-7B-2bit end-to-end inference throughput, power and energy comparisons on NVIDIA Jetson AGX Orin.</span></figcaption>\n</figure>",
134
+ "perturb_sentence_id": 10,
135
+ "output": {
136
+ "perturbed_statement": "[paragraph id = 10] Compared to llama.cpp on GPU, although T-MAC only achieves 3.1 throughput, T-MAC only needs 34 power, resulting in 0.9 energy efficiency. Note that Figure 11 shows T-MAC outperforms the GPU on the mpGEMV kernels.",
137
+ "perturbed_explanation": "The original explanation: 1. Paragraph 10 compares T-MAC on CPU against llama.cpp on GPU: T-MAC reaches 78% of the GPU throughput while drawing 34% of the power, which yields 2.3× energy efficiency. 2. Table 5 supports this: 15.62 vs. 20.03 tokens/sec (about 78%), 10.4 W vs. 30.8 W (about 34%), and 0.66 vs. 1.54 J/token (about 2.3×). However, the statement changes the throughput figure to 3.1 and the energy efficiency to 0.9. Neither value is consistent with Table 5, and an energy efficiency below 1 would imply that T-MAC consumes more energy per token than the GPU, which contradicts the reported 0.66 J/token versus 1.54 J/token."
138
+ }
139
+ }
140
+ ]
table_result/2407.00091v1_output.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "path": "table_paper/2407.00091v1.json",
4
+ "table_id": "1",
5
+ "section": "3.1",
6
+ "all_context": [
7
+ "In the first phase, we study the effects of the parameter through offline simulation of the search system.",
8
+ "Table 1 summarizes the aggregate statistics of map-results corresponding to different values of .",
9
+ "The lower bound of the exploration is set by product experience considerations, and the upper bound the result of diminishing effects.",
10
+ "The baseline for all these comparisons is map-results with no filtering and a fixed limit of pins.",
11
+ "Table 1 validates that the Bookability Filter is having the intended effect on map-results.",
12
+ "The second phase of our testing investigates how users react to various values of .",
13
+ "We run multiple A/B experiments online, where control applies no filtering on map-results, and treatments apply the Bookability Filter with different values of .",
14
+ "Table 2 summarizes the effect of on searchers.",
15
+ "A brief explanation of the key metrics evaluated in the online experiments: Uncanceled bookings: This is the top line metric, the number of bookings made by searchers that were not cancelled.",
16
+ "5-star trips: Trips booked by searchers that resulted in 5-star rating after checkout, evaluated days after end of experiment.",
17
+ "Average impressions to discovery: The average number of distinct search results that a booker saw before clicking the listing that was booked.",
18
+ "This measures the cognitive load of making a booking.",
19
+ "Average clicks to discovery: The number of distinct search results clicked by a booker before clicking the listing that was booked.",
20
+ "This is an alternative measure of the cognitive load of making a booking.",
21
+ "In the final phase, we fix the value of to corresponding to maximum user benefit, and repeat the online A/B experiment at a larger scale, allocating million searchers worldwide to each of control and treatment.",
22
+ "This grinds the p-value of the key metrics below .",
23
+ "Uncanceled bookings increase by , measured as a percentage of overall global bookings at Airbnb, making it one of the largest improvements launched over the last several years.",
24
+ "5-star trips increase by indicating not only growth in bookings, but a growth in quality bookings.",
25
+ "Average number of results seen by the searcher before clicking on the booked listing reduces by , while search results clicked drop by .",
26
+ "The reduction in effort to locate the booked listing, due to removal of inferior choices, is the key mechanism driving the gain in bookings.",
27
+ ""
28
+ ],
29
+ "target_context_ids": [
30
+ 1,
31
+ 4,
32
+ 5
33
+ ],
34
+ "selected_paragraphs": [
35
+ "[paragraph id = 1] Table 1 summarizes the aggregate statistics of map-results corresponding to different values of .",
36
+ "[paragraph id = 4] Table 1 validates that the Bookability Filter is having the intended effect on map-results.",
37
+ "[paragraph id = 5] The second phase of our testing investigates how users react to various values of ."
38
+ ],
39
+ "table_html": "<figure class=\"ltx_table\" id=\"S3.T1\">\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S3.T1.25\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S3.T1.5.5\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_border_tt\" id=\"S3.T1.1.1.1\"></th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_border_tt\" id=\"S3.T1.2.2.2\"></th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_border_tt\" id=\"S3.T1.3.3.3\"></th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_border_tt\" id=\"S3.T1.4.4.4\"></th>\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_border_tt\" id=\"S3.T1.5.5.5\"></th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S3.T1.9.9\">\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S3.T1.9.9.5\">Number of map pins</td>\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S3.T1.6.6.1\"></td>\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S3.T1.7.7.2\"></td>\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S3.T1.8.8.3\"></td>\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S3.T1.9.9.4\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.13.13\">\n<td class=\"ltx_td ltx_align_left\" id=\"S3.T1.13.13.5\">Average booking probability</td>\n<td class=\"ltx_td ltx_align_left\" id=\"S3.T1.10.10.1\"></td>\n<td class=\"ltx_td ltx_align_left\" id=\"S3.T1.11.11.2\"></td>\n<td class=\"ltx_td ltx_align_left\" id=\"S3.T1.12.12.3\"></td>\n<td class=\"ltx_td ltx_align_left\" id=\"S3.T1.13.13.4\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.17.17\">\n<td class=\"ltx_td ltx_align_left\" id=\"S3.T1.17.17.5\">Average total price</td>\n<td class=\"ltx_td ltx_align_left\" id=\"S3.T1.14.14.1\"></td>\n<td class=\"ltx_td ltx_align_left\" id=\"S3.T1.15.15.2\"></td>\n<td class=\"ltx_td ltx_align_left\" id=\"S3.T1.16.16.3\"></td>\n<td class=\"ltx_td ltx_align_left\" id=\"S3.T1.17.17.4\"></td>\n</tr>\n<tr 
class=\"ltx_tr\" id=\"S3.T1.21.21\">\n<td class=\"ltx_td ltx_align_left\" id=\"S3.T1.21.21.5\">Average number of reviews</td>\n<td class=\"ltx_td ltx_align_left\" id=\"S3.T1.18.18.1\"></td>\n<td class=\"ltx_td ltx_align_left\" id=\"S3.T1.19.19.2\"></td>\n<td class=\"ltx_td ltx_align_left\" id=\"S3.T1.20.20.3\"></td>\n<td class=\"ltx_td ltx_align_left\" id=\"S3.T1.21.21.4\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.25.25\">\n<td class=\"ltx_td ltx_align_left ltx_border_bb\" id=\"S3.T1.25.25.5\">Average review rating</td>\n<td class=\"ltx_td ltx_align_left ltx_border_bb\" id=\"S3.T1.22.22.1\"></td>\n<td class=\"ltx_td ltx_align_left ltx_border_bb\" id=\"S3.T1.23.23.2\"></td>\n<td class=\"ltx_td ltx_align_left ltx_border_bb\" id=\"S3.T1.24.24.3\"></td>\n<td class=\"ltx_td ltx_align_left ltx_border_bb\" id=\"S3.T1.25.25.4\"></td>\n</tr>\n</tbody>\n</table>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\"><span class=\"ltx_text\" id=\"S3.T1.31.3.1\" style=\"font-size:90%;\">Table 1</span>. </span><span class=\"ltx_text\" id=\"S3.T1.29.2\" style=\"font-size:90%;\">Offline exploration of compared against a baseline with no filtering, which is conceptually equivalent to .</span></figcaption>\n</figure>",
40
+ "perturb_sentence_id": 1,
41
+ "output": {
42
+ "perturbed_statement": "[paragraph id = 1] Table 1 presents the aggregate statistics of map-results unrelated to different values of . The lower bound of the exploration is determined by technical limitations, and the upper bound the result of proportional effects.",
43
+ "perturbed_explanation": "The original explanation: 1. Table 1 in the context provides the aggregate statistics for map-results corresponding to various parameter values. 2. The bounds for the exploration are influenced by product experience at the lower end and diminishing effects at the upper end. However, the statement modifies these aspects: 1. It wrongly indicates that map-results are unrelated to the parameter values, contradicting the analysis described. 2. It incorrectly states the determinants for exploration bounds as 'technical limitations' for the lower bound and 'proportional effects' for the upper, which are inconsistent with the indicated factors of 'product experience considerations' and 'diminishing effects'."
44
+ }
45
+ }
46
+ ]
table_result/2407.00100v1_output.json ADDED
The diff for this file is too large to render. See raw diff
 
table_result/2407.00101v1_output.json ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "path": "table_paper/2407.00101v1.json",
4
+ "table_id": "1",
5
+ "section": "7.1",
6
+ "all_context": [
7
+ "Plots 4 and 5 show the average values of testing accuracy, testing loss, and training loss for five rounds of training from random initialization on the MNIST dataset.",
8
+ "It can be seen clearly that our algorithm maintains the lead in terms of accuracy and loss as compared to both asynchronous and synchronous versions.",
9
+ "The same trend is observed for all the combinations of batch sizes and step sizes.",
10
+ "However, the speed gain by our algorithm over the asynchronous version is not that significant; we believe that MNIST poses a simple optimization problem that does not bring out the problems of the asynchronous algorithm effectively.",
11
+ "Table 1 shows the difference of the metrics like accuracy and loss between our algorithm and asynchronous algorithm averaged over the entire training interval.",
12
+ "For better performance, the difference in accuracy should be positive and that loss should be negative.",
13
+ "For better performance, the difference in accuracy should be positive and that in loss should be negative.",
14
+ "Table 2 and plots 6 and 7 show similar statistics as that for MNIST.",
15
+ "We can clearly note here that our algorithms show significant speedup as compared to both of the other algorithms.",
16
+ "It is able to achieve higher accuracy and lower loss as compared to asynchronous and synchronous algorithms.",
17
+ "In all the previous experiments, the synchronous algorithm was very slow, and hence for future analysis, we only present a comparison between our algorithm and the asynchronous algorithm.",
18
+ ""
19
+ ],
20
+ "target_context_ids": [
21
+ 4,
22
+ 5
23
+ ],
24
+ "selected_paragraphs": [
25
+ "[paragraph id = 4] Table 1 shows the difference of the metrics like accuracy and loss between our algorithm and asynchronous algorithm averaged over the entire training interval.",
26
+ "[paragraph id = 5] For better performance, the difference in accuracy should be positive and that loss should be negative."
27
+ ],
28
+ "table_html": "<figure class=\"ltx_table\" id=\"S7.T1\">\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S7.T1.1\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S7.T1.1.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_l ltx_border_r ltx_border_t\" id=\"S7.T1.1.1.1\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_t\" id=\"S7.T1.1.1.2\">(300,32)</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_t\" id=\"S7.T1.1.1.3\">(300,64)</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_t\" id=\"S7.T1.1.1.4\">(500,32)</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_t\" id=\"S7.T1.1.1.5\">(500,64)</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S7.T1.1.2.1\">\n<td class=\"ltx_td ltx_align_center ltx_border_l ltx_border_r ltx_border_t\" id=\"S7.T1.1.2.1.1\">Test Accuracy</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S7.T1.1.2.1.2\">1.374</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S7.T1.1.2.1.3\">-0.516</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S7.T1.1.2.1.4\">1.366</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S7.T1.1.2.1.5\">1.291</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S7.T1.1.3.2\">\n<td class=\"ltx_td ltx_align_center ltx_border_l ltx_border_r ltx_border_t\" id=\"S7.T1.1.3.2.1\">Test loss</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S7.T1.1.3.2.2\">-0.047</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S7.T1.1.3.2.3\">0.001</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S7.T1.1.3.2.4\">-0.053</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S7.T1.1.3.2.5\">-0.022</td>\n</tr>\n<tr 
class=\"ltx_tr\" id=\"S7.T1.1.4.3\">\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_l ltx_border_r ltx_border_t\" id=\"S7.T1.1.4.3.1\">Train loss</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S7.T1.1.4.3.2\">-0.047</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S7.T1.1.4.3.3\">-0.001</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S7.T1.1.4.3.4\">-0.054</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S7.T1.1.4.3.5\">-0.023</td>\n</tr>\n</tbody>\n</table>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 1. </span>Difference between the metric for our algorithm and asynchronous algorithm averaged over entire training interval for MNIST dataset. For better performance, difference in accuracy should be positive and that loss should be negative</figcaption>\n</figure>",
29
+ "perturb_sentence_id": 4,
30
+ "output": {
31
+ "perturbed_statement": "[paragraph id = 4] Table 1 shows the difference of the metrics like accuracy and speed between our algorithm and asynchronous algorithm averaged over the test subset. For better performance, the difference in accuracy should be greater than or equal to zero and speed should be negative.",
32
+ "perturbed_explanation": "1. Initially, the metrics compared were accuracy and loss over the training interval, not accuracy and speed, nor over a limited subset like the test data. 2. While it is correct that higher performance is indicated by positive differences in accuracy, speed is not typically represented as a metric difference and cannot logically be negative under usual interpretations, leading to an incoherent assertion."
33
+ }
34
+ },
35
+ {
36
+ "path": "table_paper/2407.00101v1.json",
37
+ "table_id": "2",
38
+ "section": "7.1",
39
+ "all_context": [
40
+ "Plots 4 and 5 shows the average values of testing accuracy, testing loss, and training loss for five rounds of training from random initialization on the MNIST dataset.",
41
+ "It can be seen clearly that our algorithm maintains the lead in terms of accuracy and loss as compared to both asynchronous and synchronous versions.",
42
+ "The same trend is observed for all the combinations of batch sizes and step sizes.",
43
+ "However, the speed gain by our algorithm over the asynchronous version is not that significant, we believe that MNIST poses a simple optimization problem that does not bring out problems of asynchronous algorithm effectively.",
44
+ "Table 1 shows the difference of the metrics like accuracy and loss between our algorithm and asynchronous algorithm averaged over the entire training interval.",
45
+ "For better performance, the difference in accuracy should be positive and that loss should be negative.",
46
+ "For the next set of experiments, we selected CIFAR-10 as our dataset since we believe that it provides a difficult optimization problem as compared to MNIST.",
47
+ "Table 2 and plots 6 and 7 show similar statistics as that for MNIST.",
48
+ "We can clearly note here that our algorithms show significant speedup as compared to both of the other algorithms.",
49
+ "It is able to achieve higher accuracy and lower loss as compared to asynchronous and synchronous algorithms.",
50
+ "In all the previous experiments, the synchronous algorithm was very slow, and hence for future analysis, only present a comparison between our algorithm and the asynchronous algorithm.",
51
+ ""
52
+ ],
53
+ "target_context_ids": [
54
+ 6,
55
+ 7,
56
+ 8,
57
+ 9
58
+ ],
59
+ "selected_paragraphs": [
60
+ "[paragraph id = 6] For the next set of experiments, we selected CIFAR-10 as our dataset since we believe that it provides a difficult optimization problem as compared to MNIST.",
61
+ "[paragraph id = 7] Table 2 and plots 6 and 7 show similar statistics as that for MNIST.",
62
+ "[paragraph id = 8] We can clearly note here that our algorithms show significant speedup as compared to both of the other algorithms.",
63
+ "[paragraph id = 9] It is able to achieve higher accuracy and lower loss as compared to asynchronous and synchronous algorithms."
64
+ ],
65
+ "table_html": "<figure class=\"ltx_table\" id=\"S7.T2\">\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S7.T2.1\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S7.T2.1.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_l ltx_border_r ltx_border_t\" id=\"S7.T2.1.1.1\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_t\" id=\"S7.T2.1.1.2\">(300,32)</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_t\" id=\"S7.T2.1.1.3\">(300,64)</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_t\" id=\"S7.T2.1.1.4\">(500,32)</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_t\" id=\"S7.T2.1.1.5\">(500,64)</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S7.T2.1.2.1\">\n<td class=\"ltx_td ltx_align_center ltx_border_l ltx_border_r ltx_border_t\" id=\"S7.T2.1.2.1.1\">Test Accuracy</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S7.T2.1.2.1.2\">4.849</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S7.T2.1.2.1.3\">2.435</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S7.T2.1.2.1.4\">3.468</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S7.T2.1.2.1.5\">2.884</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S7.T2.1.3.2\">\n<td class=\"ltx_td ltx_align_center ltx_border_l ltx_border_r ltx_border_t\" id=\"S7.T2.1.3.2.1\">Test loss</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S7.T2.1.3.2.2\">-0.137</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S7.T2.1.3.2.3\">-0.066</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S7.T2.1.3.2.4\">-0.092</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S7.T2.1.3.2.5\">-0.080</td>\n</tr>\n<tr 
class=\"ltx_tr\" id=\"S7.T2.1.4.3\">\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_l ltx_border_r ltx_border_t\" id=\"S7.T2.1.4.3.1\">Train loss</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S7.T2.1.4.3.2\">-0.139</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S7.T2.1.4.3.3\">-0.067</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S7.T2.1.4.3.4\">-0.091</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S7.T2.1.4.3.5\">-0.082</td>\n</tr>\n</tbody>\n</table>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 2. </span>Difference between the metric for our algorithm and asynchronous algorithm averaged over entire training interval for CIFAR-10 dataset. For better performance, difference in accuracy should be positive and that loss should be negative</figcaption>\n</figure>",
66
+ "perturb_sentence_id": 7,
67
+ "output": {
68
+ "perturbed_statement": "[paragraph id = 7] Table 2 and plots 6 and 7 show similar statistics as that for CIFAR-10. We can clearly note here that our algorithms demonstrate comparable performance to both of the other algorithms.",
69
+ "perturbed_explanation": "1. The original explanation highlighted that Table 2 and plots 6 and 7 illustrated a clear speedup of the authors' algorithms relative to the other algorithms. 2. The statement is factually incorrect because it incorrectly asserts that the algorithms exhibit performance comparable to the others, which is contrary to the context indicating a significant speedup."
70
+ }
71
+ },
72
+ {
73
+ "path": "table_paper/2407.00101v1.json",
74
+ "table_id": "3",
75
+ "section": "7.2",
76
+ "all_context": [
77
+ "Further, we wanted to understand how different values of batch sizes affect the efficiency of our approach.",
78
+ "For each of the batch sizes, we executed 5 rounds of training, each with different initialization of the parameters on the randomly generated dataset.",
79
+ "Table 3 shows the difference of the metrics like accuracy and loss between our algorithm and asynchronous algorithm averaged over the entire training interval.",
80
+ "We hypothesized that as the batch size increases, the difference should decrease since asynchronous algorithms start providing updates with high confidence.",
81
+ "This can be also validated by the trend observed in the plot 8 .",
82
+ ""
83
+ ],
84
+ "target_context_ids": [
85
+ 0,
86
+ 2,
87
+ 3
88
+ ],
89
+ "selected_paragraphs": [
90
+ "[paragraph id = 0] Further, we wanted to understand how different values of batch sizes affect the efficiency of our approach.",
91
+ "[paragraph id = 2] Table 3 shows the difference of the metrics like accuracy and loss between our algorithm and asynchronous algorithm averaged over the entire training interval.",
92
+ "[paragraph id = 3] We hypothesized that as the batch size increases, the difference should decrease since asynchronous algorithms start providing updates with high confidence."
93
+ ],
94
+ "table_html": "<figure class=\"ltx_table\" id=\"S7.T3\">\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S7.T3.1\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S7.T3.1.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_l ltx_border_r ltx_border_t\" id=\"S7.T3.1.1.1\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_t\" id=\"S7.T3.1.1.2\">8</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_t\" id=\"S7.T3.1.1.3\">16</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_t\" id=\"S7.T3.1.1.4\">32</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_t\" id=\"S7.T3.1.1.5\">64</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_t\" id=\"S7.T3.1.1.6\">128</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S7.T3.1.2.1\">\n<td class=\"ltx_td ltx_align_center ltx_border_l ltx_border_r ltx_border_t\" id=\"S7.T3.1.2.1.1\">Test Accuracy</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S7.T3.1.2.1.2\">4.896</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S7.T3.1.2.1.3\">5.183</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S7.T3.1.2.1.4\">4.222</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S7.T3.1.2.1.5\">3.304</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S7.T3.1.2.1.6\">2.599</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S7.T3.1.3.2\">\n<td class=\"ltx_td ltx_align_center ltx_border_l ltx_border_r ltx_border_t\" id=\"S7.T3.1.3.2.1\">Test loss</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S7.T3.1.3.2.2\">-0.141</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S7.T3.1.3.2.3\">-0.141</td>\n<td class=\"ltx_td 
ltx_align_center ltx_border_r ltx_border_t\" id=\"S7.T3.1.3.2.4\">-0.117</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S7.T3.1.3.2.5\">-0.089</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S7.T3.1.3.2.6\">-0.072</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S7.T3.1.4.3\">\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_l ltx_border_r ltx_border_t\" id=\"S7.T3.1.4.3.1\">Train loss</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S7.T3.1.4.3.2\">-0.143</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S7.T3.1.4.3.3\">-0.141</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S7.T3.1.4.3.4\">-0.114</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S7.T3.1.4.3.5\">-0.088</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b ltx_border_r ltx_border_t\" id=\"S7.T3.1.4.3.6\">-0.068</td>\n</tr>\n</tbody>\n</table>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 3. </span>Difference between the metric for our algorithm and asynchronous algorithm averaged over entire training interval for various batch sizes and constant step size of 500. For better performance, difference in accuracy should be positive and that loss should be negative</figcaption>\n</figure>",
95
+ "perturb_sentence_id": 2,
96
+ "output": {
97
+ "perturbed_statement": "[paragraph id = 2] Table 3 shows the difference in metrics like accuracy and efficiency between our algorithm and the asynchronous algorithm averaged over the entire testing interval. We hypothesized that as the batch size increases, the difference should increase since asynchronous algorithms start providing updates with low certainty.",
98
+ "perturbed_explanation": "The original explanation states two key ideas: (1) the difference in metrics like accuracy and loss between the algorithms are analyzed over the training interval, and (2) it is hypothesized that increasing batch size decreases the difference due to heightened confidence in asynchronous updates. The statement is incorrect because: (3) it incorrectly states the metrics as accuracy and efficiency instead of accuracy and loss, altering their analytical significance, (4) it specifies the interval as testing instead of training, misrepresenting the context's application phase, and (5) it reverses the hypothesized impact of increasing batch size on difference, making the direction inconsistent with the stated rationale."
99
+ }
100
+ }
101
+ ]
table_result/2407.00102v1_output.json ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "path": "table_paper/2407.00102v1.json",
4
+ "table_id": "1",
5
+ "section": "4.2",
6
+ "all_context": [
7
+ "We use the LLaVA-v1.5-7B [25 ] architecture with model weights fully fine-tuned using LLaVA-1.5-mix-665k data.",
8
+ "Subsequently, we fine-tune this model with LoRA [14 ] during the follow-up experiments.",
9
+ "In training, we keep the visual encoder, projector, and LLM weights frozen, and maximize the likelihood of with trainable parameters of LoRA only.",
10
+ "We keep the rest of the training protocol the same to allow for a fair comparison.",
11
+ "Scenario 1, which only includes LoRA tuning, takes approximately 16 hours on an NVIDIA Tesla A100 GPU with 40GB of memory, using DeepSpeed ZeRO Stage 3.",
12
+ "We use the SVIT-core-157K [39 ] dataset for continuous fine-tuning to establish a baseline.",
13
+ "And the same method is applied to fine-tune our data.",
14
+ "We report our main results in Table 1 .",
15
+ "Our method, using only 7000 samples of SVIT-core-157K, achieved higher performance across all benchmarks compared to the full data experiment setup.",
16
+ "Furthermore, it surpassed the base model on SQA [27 ] and VisWiz [13 ], reaching state-of-the-art (SOTA) performance.",
17
+ "In the efficient LoRA training setup, our data exceeded SVIT-core-157K[39 ] by 4.7 points in GQA [15 ], 2.0 points in VQAV2 [12 ], 1.0 point in TextVQA [33 ], 2.0 points in VisWiz [13 ], and 0.5 points in SQA [27 ].",
18
+ "The improvements verify the better training effects of our data since less data amount and same model are used.",
19
+ "In Table 2, we use the top-right corner in the left panel of Figure 7 (shown in the appendix) as the top 5% of the DIQ and conducted a comparison experiment, we found that using the 5% selected by DIQ resulted in better performance compared to using the top 5% of DIS and DIL separately.",
20
+ "We realized that this improvement is due to the subset from DIQ selecting data evenly from the entire region, whereas DIS and DIL focus on regions with high levels of clip score or loss.",
21
+ "Based on these insights, we introduced curriculum learning, utilizing multi-stage training that progresses from low-quality to high-quality data.",
22
+ "This approach, as demonstrated in the ablation experiment in Table 2, highlights the importance of increasing the diversity of data quality for improving model performance.",
23
+ "By employing this method, we found that using curriculum learning with the DIQ method can further enhance model performance.",
24
+ "To further understand the effectiveness of curriculum learning, we observe that it starts with simple examples, which have lower noise and smaller loss.",
25
+ "This provides a smoother loss landscape, reducing gradient oscillations and instability for a more stable initial training process.",
26
+ "As the model progresses to higher-quality data, it benefits from established initial parameters and a clear learning direction, facilitating easier optimization.",
27
+ "By gradually increasing data quality, curriculum learning helps the model adapt and optimize progressively, leading to improved performance as shown in our results.",
28
+ ""
29
+ ],
30
+ "target_context_ids": [
31
+ 7,
32
+ 9,
33
+ 10,
34
+ 11
35
+ ],
36
+ "selected_paragraphs": [
37
+ "[paragraph id = 7] We report our main results in Table 1 .",
38
+ "[paragraph id = 9] Furthermore, it surpassed the base model on SQA [27 ] and VisWiz [13 ], reaching state-of-the-art (SOTA) performance.",
39
+ "[paragraph id = 10] In the efficient LoRA training setup, our data exceeded SVIT-core-157K[39 ] by 4.7 points in GQA [15 ], 2.0 points in VQAV2 [12 ], 1.0 point in TextVQA [33 ], 2.0 points in VisWiz [13 ], and 0.5 points in SQA [27 ].",
40
+ "[paragraph id = 11] The improvements verify the better training effects of our data since less data amount and same model are used."
41
+ ],
42
+ "table_html": "<figure class=\"ltx_table\" id=\"S4.T1\">\n<div class=\"ltx_inline-block ltx_align_center ltx_transformed_outer\" id=\"S4.T1.7\" style=\"width:493.9pt;height:201.1pt;vertical-align:-0.9pt;\"><span class=\"ltx_transformed_inner\" style=\"transform:translate(-27.4pt,11.1pt) scale(0.9,0.9) ;\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S4.T1.7.7\">\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S4.T1.3.3.3\">\n<td class=\"ltx_td ltx_align_left ltx_border_tt\" id=\"S4.T1.3.3.3.4\">Method</td>\n<td class=\"ltx_td ltx_align_left ltx_border_tt\" id=\"S4.T1.3.3.3.5\">LLM</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_tt\" id=\"S4.T1.3.3.3.6\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.3.3.3.6.1\">\n<span class=\"ltx_p\" id=\"S4.T1.3.3.3.6.1.1\" style=\"width:14.2pt;\">Res.</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_tt\" id=\"S4.T1.3.3.3.7\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.3.3.3.7.1\">\n<span class=\"ltx_p\" id=\"S4.T1.3.3.3.7.1.1\" style=\"width:19.9pt;\">PT</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r ltx_border_tt\" id=\"S4.T1.3.3.3.8\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.3.3.3.8.1\">\n<span class=\"ltx_p\" id=\"S4.T1.3.3.3.8.1.1\" style=\"width:25.6pt;\">IT</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_tt\" id=\"S4.T1.1.1.1.1\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.1.1.1.1.1\">\n<span class=\"ltx_p\" id=\"S4.T1.1.1.1.1.1.1\" style=\"width:22.8pt;\">VQA</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_tt\" id=\"S4.T1.3.3.3.9\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.3.3.3.9.1\">\n<span class=\"ltx_p\" id=\"S4.T1.3.3.3.9.1.1\" style=\"width:22.8pt;\">GQA</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_tt\" 
id=\"S4.T1.3.3.3.10\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.3.3.3.10.1\">\n<span class=\"ltx_p\" id=\"S4.T1.3.3.3.10.1.1\" style=\"width:22.8pt;\">VisWiz</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_tt\" id=\"S4.T1.2.2.2.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.2.2.2.2.1\">\n<span class=\"ltx_p\" id=\"S4.T1.2.2.2.2.1.1\" style=\"width:22.8pt;\">SQA</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_tt\" id=\"S4.T1.3.3.3.3\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.3.3.3.3.1\">\n<span class=\"ltx_p\" id=\"S4.T1.3.3.3.3.1.1\" style=\"width:22.8pt;\">VQA</span>\n</span>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.7.7.8.1\">\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S4.T1.7.7.8.1.1\">BLIP-2<cite class=\"ltx_cite ltx_citemacro_cite\">[<a class=\"ltx_ref\" href=\"https://arxiv.org/html/2407.00102v1#bib.bib19\" title=\"\">19</a>]</cite>\n</td>\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S4.T1.7.7.8.1.2\">Vicuna-13B</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_t\" id=\"S4.T1.7.7.8.1.3\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.8.1.3.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.8.1.3.1.1\" style=\"width:14.2pt;\">224</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_t\" id=\"S4.T1.7.7.8.1.4\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.8.1.4.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.8.1.4.1.1\" style=\"width:19.9pt;\">129M</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r ltx_border_t\" id=\"S4.T1.7.7.8.1.5\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.8.1.5.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.8.1.5.1.1\" style=\"width:25.6pt;\">-</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_t\" 
id=\"S4.T1.7.7.8.1.6\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.8.1.6.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.8.1.6.1.1\" style=\"width:22.8pt;\">41.0</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_t\" id=\"S4.T1.7.7.8.1.7\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.8.1.7.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.8.1.7.1.1\" style=\"width:22.8pt;\">41</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_t\" id=\"S4.T1.7.7.8.1.8\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.8.1.8.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.8.1.8.1.1\" style=\"width:22.8pt;\">19.6</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_t\" id=\"S4.T1.7.7.8.1.9\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.8.1.9.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.8.1.9.1.1\" style=\"width:22.8pt;\">61</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_t\" id=\"S4.T1.7.7.8.1.10\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.8.1.10.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.8.1.10.1.1\" style=\"width:22.8pt;\">42.5</span>\n</span>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.7.7.9.2\">\n<td class=\"ltx_td ltx_align_left\" id=\"S4.T1.7.7.9.2.1\">InstructBLIP<cite class=\"ltx_cite ltx_citemacro_cite\">[<a class=\"ltx_ref\" href=\"https://arxiv.org/html/2407.00102v1#bib.bib9\" title=\"\">9</a>]</cite>\n</td>\n<td class=\"ltx_td ltx_align_left\" id=\"S4.T1.7.7.9.2.2\">Vicuna-7B</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.9.2.3\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.9.2.3.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.9.2.3.1.1\" style=\"width:14.2pt;\">224</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.9.2.4\">\n<span class=\"ltx_inline-block ltx_align_top\" 
id=\"S4.T1.7.7.9.2.4.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.9.2.4.1.1\" style=\"width:19.9pt;\">129M</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r\" id=\"S4.T1.7.7.9.2.5\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.9.2.5.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.9.2.5.1.1\" style=\"width:25.6pt;\">1.2M</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.9.2.6\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.9.2.6.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.9.2.6.1.1\" style=\"width:22.8pt;\">–</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.9.2.7\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.9.2.7.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.9.2.7.1.1\" style=\"width:22.8pt;\">49.2</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.9.2.8\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.9.2.8.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.9.2.8.1.1\" style=\"width:22.8pt;\">34.5</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.9.2.9\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.9.2.9.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.9.2.9.1.1\" style=\"width:22.8pt;\">60.5</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.9.2.10\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.9.2.10.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.9.2.10.1.1\" style=\"width:22.8pt;\">50.1</span>\n</span>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.7.7.10.3\">\n<td class=\"ltx_td ltx_align_left\" id=\"S4.T1.7.7.10.3.1\">InstructBLIP<cite class=\"ltx_cite ltx_citemacro_cite\">[<a class=\"ltx_ref\" href=\"https://arxiv.org/html/2407.00102v1#bib.bib9\" title=\"\">9</a>]</cite>\n</td>\n<td class=\"ltx_td ltx_align_left\" 
id=\"S4.T1.7.7.10.3.2\">Vicuna-13B</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.10.3.3\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.10.3.3.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.10.3.3.1.1\" style=\"width:14.2pt;\">224</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.10.3.4\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.10.3.4.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.10.3.4.1.1\" style=\"width:19.9pt;\">129M</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r\" id=\"S4.T1.7.7.10.3.5\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.10.3.5.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.10.3.5.1.1\" style=\"width:25.6pt;\">1.2M</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.10.3.6\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.10.3.6.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.10.3.6.1.1\" style=\"width:22.8pt;\">–</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.10.3.7\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.10.3.7.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.10.3.7.1.1\" style=\"width:22.8pt;\">49.5</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.10.3.8\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.10.3.8.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.10.3.8.1.1\" style=\"width:22.8pt;\">33.4</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.10.3.9\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.10.3.9.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.10.3.9.1.1\" style=\"width:22.8pt;\">63.1</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.10.3.10\">\n<span class=\"ltx_inline-block ltx_align_top\" 
id=\"S4.T1.7.7.10.3.10.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.10.3.10.1.1\" style=\"width:22.8pt;\">50.7</span>\n</span>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.7.7.11.4\">\n<td class=\"ltx_td ltx_align_left\" id=\"S4.T1.7.7.11.4.1\">Shikra<cite class=\"ltx_cite ltx_citemacro_cite\">[<a class=\"ltx_ref\" href=\"https://arxiv.org/html/2407.00102v1#bib.bib6\" title=\"\">6</a>]</cite>\n</td>\n<td class=\"ltx_td ltx_align_left\" id=\"S4.T1.7.7.11.4.2\">Vicuna-13B</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.11.4.3\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.11.4.3.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.11.4.3.1.1\" style=\"width:14.2pt;\">224</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.11.4.4\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.11.4.4.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.11.4.4.1.1\" style=\"width:19.9pt;\">600K</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r\" id=\"S4.T1.7.7.11.4.5\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.11.4.5.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.11.4.5.1.1\" style=\"width:25.6pt;\">5.5M</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.11.4.6\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.11.4.6.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.11.4.6.1.1\" style=\"width:22.8pt;\">77.4</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.11.4.7\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.11.4.7.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.11.4.7.1.1\" style=\"width:22.8pt;\">–</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.11.4.8\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.11.4.8.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.11.4.8.1.1\" 
style=\"width:22.8pt;\">–</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.11.4.9\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.11.4.9.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.11.4.9.1.1\" style=\"width:22.8pt;\">–</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.11.4.10\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.11.4.10.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.11.4.10.1.1\" style=\"width:22.8pt;\">–</span>\n</span>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.7.7.12.5\">\n<td class=\"ltx_td ltx_align_left\" id=\"S4.T1.7.7.12.5.1\">IDEFICS-9B <cite class=\"ltx_cite ltx_citemacro_cite\">[<a class=\"ltx_ref\" href=\"https://arxiv.org/html/2407.00102v1#bib.bib16\" title=\"\">16</a>]</cite>\n</td>\n<td class=\"ltx_td ltx_align_left\" id=\"S4.T1.7.7.12.5.2\">LLaMA-7B</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.12.5.3\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.12.5.3.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.12.5.3.1.1\" style=\"width:14.2pt;\">224</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.12.5.4\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.12.5.4.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.12.5.4.1.1\" style=\"width:19.9pt;\">353M</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r\" id=\"S4.T1.7.7.12.5.5\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.12.5.5.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.12.5.5.1.1\" style=\"width:25.6pt;\">1M</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.12.5.6\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.12.5.6.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.12.5.6.1.1\" style=\"width:22.8pt;\">50.9</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify 
ltx_align_top\" id=\"S4.T1.7.7.12.5.7\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.12.5.7.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.12.5.7.1.1\" style=\"width:22.8pt;\">38.4</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.12.5.8\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.12.5.8.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.12.5.8.1.1\" style=\"width:22.8pt;\">35.5</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.12.5.9\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.12.5.9.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.12.5.9.1.1\" style=\"width:22.8pt;\">–</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.12.5.10\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.12.5.10.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.12.5.10.1.1\" style=\"width:22.8pt;\">25.9</span>\n</span>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.7.7.13.6\">\n<td class=\"ltx_td ltx_align_left\" id=\"S4.T1.7.7.13.6.1\">IDEFICS-80B<cite class=\"ltx_cite ltx_citemacro_cite\">[<a class=\"ltx_ref\" href=\"https://arxiv.org/html/2407.00102v1#bib.bib16\" title=\"\">16</a>]</cite>\n</td>\n<td class=\"ltx_td ltx_align_left\" id=\"S4.T1.7.7.13.6.2\">LLaMA-65B</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.13.6.3\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.13.6.3.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.13.6.3.1.1\" style=\"width:14.2pt;\">224</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.13.6.4\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.13.6.4.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.13.6.4.1.1\" style=\"width:19.9pt;\">353M</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r\" id=\"S4.T1.7.7.13.6.5\">\n<span class=\"ltx_inline-block ltx_align_top\" 
id=\"S4.T1.7.7.13.6.5.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.13.6.5.1.1\" style=\"width:25.6pt;\">1M</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.13.6.6\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.13.6.6.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.13.6.6.1.1\" style=\"width:22.8pt;\">60.0</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.13.6.7\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.13.6.7.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.13.6.7.1.1\" style=\"width:22.8pt;\">45.2</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.13.6.8\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.13.6.8.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.13.6.8.1.1\" style=\"width:22.8pt;\">36.0</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.13.6.9\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.13.6.9.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.13.6.9.1.1\" style=\"width:22.8pt;\">–</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.13.6.10\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.13.6.10.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.13.6.10.1.1\" style=\"width:22.8pt;\">30.9</span>\n</span>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.5.5.5\">\n<td class=\"ltx_td ltx_align_left\" id=\"S4.T1.5.5.5.3\">Qwen-VL<cite class=\"ltx_cite ltx_citemacro_cite\">[<a class=\"ltx_ref\" href=\"https://arxiv.org/html/2407.00102v1#bib.bib1\" title=\"\">1</a>]</cite>\n</td>\n<td class=\"ltx_td ltx_align_left\" id=\"S4.T1.5.5.5.4\">Qwen-7B</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.5.5.5.5\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.5.5.5.5.1\">\n<span class=\"ltx_p\" id=\"S4.T1.5.5.5.5.1.1\" 
style=\"width:14.2pt;\">448</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.4.4.4.1\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.4.4.4.1.1\">\n<span class=\"ltx_p\" id=\"S4.T1.4.4.4.1.1.1\" style=\"width:19.9pt;\">1.4B<sup class=\"ltx_sup\" id=\"S4.T1.4.4.4.1.1.1.1\">†</sup></span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r\" id=\"S4.T1.5.5.5.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.5.5.5.2.1\">\n<span class=\"ltx_p\" id=\"S4.T1.5.5.5.2.1.1\" style=\"width:25.6pt;\">50M<sup class=\"ltx_sup\" id=\"S4.T1.5.5.5.2.1.1.1\">†</sup></span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.5.5.5.6\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.5.5.5.6.1\">\n<span class=\"ltx_p\" id=\"S4.T1.5.5.5.6.1.1\" style=\"width:22.8pt;\">78.8</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.5.5.5.7\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.5.5.5.7.1\">\n<span class=\"ltx_p\" id=\"S4.T1.5.5.5.7.1.1\" style=\"width:22.8pt;\">59.3</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.5.5.5.8\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.5.5.5.8.1\">\n<span class=\"ltx_p\" id=\"S4.T1.5.5.5.8.1.1\" style=\"width:22.8pt;\">35.2</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.5.5.5.9\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.5.5.5.9.1\">\n<span class=\"ltx_p\" id=\"S4.T1.5.5.5.9.1.1\" style=\"width:22.8pt;\">67.1</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.5.5.5.10\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.5.5.5.10.1\">\n<span class=\"ltx_p\" id=\"S4.T1.5.5.5.10.1.1\" style=\"width:22.8pt;\">63.8</span>\n</span>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.7.7.7\">\n<td class=\"ltx_td 
ltx_align_left\" id=\"S4.T1.7.7.7.3\">Qwen-VL-Chat<cite class=\"ltx_cite ltx_citemacro_cite\">[<a class=\"ltx_ref\" href=\"https://arxiv.org/html/2407.00102v1#bib.bib1\" title=\"\">1</a>]</cite>\n</td>\n<td class=\"ltx_td ltx_align_left\" id=\"S4.T1.7.7.7.4\">Qwen-7B</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.7.5\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.7.5.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.7.5.1.1\" style=\"width:14.2pt;\">448</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.6.6.6.1\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.6.6.6.1.1\">\n<span class=\"ltx_p\" id=\"S4.T1.6.6.6.1.1.1\" style=\"width:19.9pt;\">1.4B<sup class=\"ltx_sup\" id=\"S4.T1.6.6.6.1.1.1.1\">†</sup></span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r\" id=\"S4.T1.7.7.7.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.7.2.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.7.2.1.1\" style=\"width:25.6pt;\">50M<sup class=\"ltx_sup\" id=\"S4.T1.7.7.7.2.1.1.1\">†</sup></span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.7.6\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.7.6.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.7.6.1.1\" style=\"width:22.8pt;\">78.2</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.7.7\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.7.7.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.7.7.1.1\" style=\"width:22.8pt;\">57.5</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.7.8\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.7.8.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.7.8.1.1\" style=\"width:22.8pt;\">38.9</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.7.9\">\n<span class=\"ltx_inline-block 
ltx_align_top\" id=\"S4.T1.7.7.7.9.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.7.9.1.1\" style=\"width:22.8pt;\">68.2</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.7.10\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.7.10.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.7.10.1.1\" style=\"width:22.8pt;\">61.5</span>\n</span>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.7.7.14.7\">\n<td class=\"ltx_td ltx_align_left\" id=\"S4.T1.7.7.14.7.1\">LLAVA-V1.5<cite class=\"ltx_cite ltx_citemacro_cite\">[<a class=\"ltx_ref\" href=\"https://arxiv.org/html/2407.00102v1#bib.bib25\" title=\"\">25</a>]</cite>\n</td>\n<td class=\"ltx_td ltx_align_left\" id=\"S4.T1.7.7.14.7.2\">Vicuna-7B</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.14.7.3\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.14.7.3.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.14.7.3.1.1\" style=\"width:14.2pt;\">336</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.14.7.4\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.14.7.4.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.14.7.4.1.1\" style=\"width:19.9pt;\">558K</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r\" id=\"S4.T1.7.7.14.7.5\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.14.7.5.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.14.7.5.1.1\" style=\"width:25.6pt;\">665K</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.14.7.6\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.14.7.6.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.14.7.6.1.1\" style=\"width:22.8pt;\">78.5</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.14.7.7\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.14.7.7.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.14.7.7.1.1\" 
style=\"width:22.8pt;\">62.0</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.14.7.8\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.14.7.8.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.14.7.8.1.1\" style=\"width:22.8pt;\">50.0</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.14.7.9\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.14.7.9.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.14.7.9.1.1\" style=\"width:22.8pt;\">66.8</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top\" id=\"S4.T1.7.7.14.7.10\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.14.7.10.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.14.7.10.1.1\" style=\"width:22.8pt;\">58.2</span>\n</span>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.7.7.15.8\">\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S4.T1.7.7.15.8.1\">+ SVIT-Core-157K<cite class=\"ltx_cite ltx_citemacro_cite\">[<a class=\"ltx_ref\" href=\"https://arxiv.org/html/2407.00102v1#bib.bib39\" title=\"\">39</a>]</cite>\n</td>\n<td class=\"ltx_td ltx_align_left ltx_border_t\" id=\"S4.T1.7.7.15.8.2\">Vicuna-7B</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_t\" id=\"S4.T1.7.7.15.8.3\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.15.8.3.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.15.8.3.1.1\" style=\"width:14.2pt;\">336</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_t\" id=\"S4.T1.7.7.15.8.4\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.15.8.4.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.15.8.4.1.1\" style=\"width:19.9pt;\">558K</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_r ltx_border_t\" id=\"S4.T1.7.7.15.8.5\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.15.8.5.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.15.8.5.1.1\" 
style=\"width:25.6pt;\">+157K</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_t\" id=\"S4.T1.7.7.15.8.6\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.15.8.6.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.15.8.6.1.1\" style=\"width:22.8pt;\">75.9</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_t\" id=\"S4.T1.7.7.15.8.7\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.15.8.7.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.15.8.7.1.1\" style=\"width:22.8pt;\">57.1</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_t\" id=\"S4.T1.7.7.15.8.8\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.15.8.8.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.15.8.8.1.1\" style=\"width:22.8pt;\">49.1</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_t\" id=\"S4.T1.7.7.15.8.9\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.15.8.9.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.15.8.9.1.1\" style=\"width:22.8pt;\">69.0</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_t\" id=\"S4.T1.7.7.15.8.10\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.15.8.10.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.15.8.10.1.1\" style=\"width:22.8pt;\">56.3</span>\n</span>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.7.7.16.9\">\n<td class=\"ltx_td ltx_align_left ltx_border_bb\" id=\"S4.T1.7.7.16.9.1\">+ Ours</td>\n<td class=\"ltx_td ltx_align_left ltx_border_bb\" id=\"S4.T1.7.7.16.9.2\">Vicuna-7B</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_bb\" id=\"S4.T1.7.7.16.9.3\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.16.9.3.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.16.9.3.1.1\" style=\"width:14.2pt;\">336</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_bb\" 
id=\"S4.T1.7.7.16.9.4\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.16.9.4.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.16.9.4.1.1\" style=\"width:19.9pt;\">558K</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_bb ltx_border_r\" id=\"S4.T1.7.7.16.9.5\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.16.9.5.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.16.9.5.1.1\" style=\"width:25.6pt;\">+7K</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_bb\" id=\"S4.T1.7.7.16.9.6\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.16.9.6.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.16.9.6.1.1\" style=\"width:22.8pt;\">77.9</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_bb\" id=\"S4.T1.7.7.16.9.7\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.16.9.7.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.16.9.7.1.1\" style=\"width:22.8pt;\">61.8</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_bb\" id=\"S4.T1.7.7.16.9.8\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.16.9.8.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.16.9.8.1.1\" style=\"width:22.8pt;\">51.1</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_bb\" id=\"S4.T1.7.7.16.9.9\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.16.9.9.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.16.9.9.1.1\" style=\"width:22.8pt;\">69.5</span>\n</span>\n</td>\n<td class=\"ltx_td ltx_align_justify ltx_align_top ltx_border_bb\" id=\"S4.T1.7.7.16.9.10\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S4.T1.7.7.16.9.10.1\">\n<span class=\"ltx_p\" id=\"S4.T1.7.7.16.9.10.1.1\" style=\"width:22.8pt;\">57.3</span>\n</span>\n</td>\n</tr>\n</tbody>\n</table>\n</span></div>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 1: </span><span 
class=\"ltx_text ltx_font_bold\" id=\"S4.T1.9.1\">Comparison with SoTA methods on 5 benchmarks.</span> We achieves better performance on all benchmarks than SVIT-Core-157K. Res, PT, and IT indicate input image resolution, and the number of samples in the pretraining and instruction tuning stage, respectively.\nBenchmark names are abbreviated due to space limits. VQA-v2 <cite class=\"ltx_cite ltx_citemacro_cite\">[<a class=\"ltx_ref\" href=\"https://arxiv.org/html/2407.00102v1#bib.bib12\" title=\"\">12</a>]</cite>, GQA <cite class=\"ltx_cite ltx_citemacro_cite\">[<a class=\"ltx_ref\" href=\"https://arxiv.org/html/2407.00102v1#bib.bib15\" title=\"\">15</a>]</cite>, VisWiz <cite class=\"ltx_cite ltx_citemacro_cite\">[<a class=\"ltx_ref\" href=\"https://arxiv.org/html/2407.00102v1#bib.bib13\" title=\"\">13</a>]</cite>, ScienceQA-IMG <cite class=\"ltx_cite ltx_citemacro_cite\">[<a class=\"ltx_ref\" href=\"https://arxiv.org/html/2407.00102v1#bib.bib27\" title=\"\">27</a>]</cite>, TextVQA <cite class=\"ltx_cite ltx_citemacro_cite\">[<a class=\"ltx_ref\" href=\"https://arxiv.org/html/2407.00102v1#bib.bib33\" title=\"\">33</a>]</cite>. More details can be found in the Evaluation Metrics section of the Appendix.</figcaption>\n</figure>",
43
+ "perturb_sentence_id": 10,
44
+ "output": {
45
+ "perturbed_statement": "[paragraph id = 10] In the efficient LoRA training setup, our data exceeded SVIT-core-157K[39 ] by 3.0 points in GQA [15 ], 1.5 points in VQAV2 [12 ], 1.2 points in TextVQA [33 ], 1.8 points in VisWiz [13 ], and 0.3 points in SQA [27 ]. The improvements verify the better training effects of our data since less data amount and same model are used.",
46
+ "perturbed_explanation": "1. The original explanation states that the data achieved performance increases by specific margins across various datasets, as given in the results above. 2. The statement now includes invalid performance margins for some datasets, such as '3.0 points in GQA' instead of '4.7 points' and '1.5 points in VQAV2' instead of '2.0 points,' which do not match the results provided. This error alters the reported evidence of the model's effectiveness."
47
+ }
48
+ },
49
+ {
50
+ "path": "table_paper/2407.00102v1.json",
51
+ "table_id": "2",
52
+ "section": "4.2",
53
+ "all_context": [
54
+ "We use the LLaVA-v1.5-7B [25 ] architecture with model weights fully fine-tuned using LLaVA-1.5-mix-665k data.",
55
+ "Subsequently, we fine-tune this model with LoRA [14 ] during the follow-up experiments.",
56
+ "In training, we keep the visual encoder, projector, and LLM weights frozen, and maximize the likelihood of with trainable parameters of LoRA only.",
57
+ "We keep the rest of the training protocol the same to allow for a fair comparison.",
58
+ "Scenario 1, which only includes LoRA tuning, takes approximately 16 hours on an NVIDIA Tesla A100 GPU with 40GB of memory, using DeepSpeed ZeRO Stage 3.",
59
+ "We use the SVIT-core-157K [39 ] dataset for continuous fine-tuning to establish a baseline.",
60
+ "And the same method is applied to fine-tune our data.",
61
+ "We report our main results in Table 1 .",
62
+ "Our method, using only 7000 samples of SVIT-core-157K, achieved higher performance across all benchmarks compared to the full data experiment setup.",
63
+ "Furthermore, it surpassed the base model on SQA [27 ] and VisWiz [13 ], reaching state-of-the-art (SOTA) performance.",
64
+ "In the efficient LoRA training setup, our data exceeded SVIT-core-157K[39 ] by 4.7 points in GQA [15 ], 2.0 points in VQAV2 [12 ], 1.0 point in TextVQA [33 ], 2.0 points in VisWiz [13 ], and 0.5 points in SQA [27 ].",
65
+ "The improvements verify the better training effects of our data since less data amount and same model are used.",
66
+ "In Table 2, we use the top-right corner in the left panel of Figure 7 (shown in the appendix) as the top 5% of the DIQ and conducted a comparison experiment, we found that using the 5% selected by DIQ resulted in better performance compared to using the top 5% of DIS and DIL separately.",
67
+ "We realized that this improvement is due to the subset from DIQ selecting data evenly from the entire region, whereas DIS and DIL focus on regions with high levels of clip score or loss.",
68
+ "Based on these insights, we introduced curriculum learning, utilizing multi-stage training that progresses from low-quality to high-quality data.",
69
+ "This approach, as demonstrated in the ablation experiment in Table 2, highlights the importance of increasing the diversity of data quality for improving model performance.",
70
+ "By employing this method, we found that using curriculum learning with the DIQ method can further enhance model performance.",
71
+ "To further understand the effectiveness of curriculum learning, we observe that it starts with simple examples, which have lower noise and smaller loss.",
72
+ "This provides a smoother loss landscape, reducing gradient oscillations and instability for a more stable initial training process.",
73
+ "As the model progresses to higher-quality data, it benefits from established initial parameters and a clear learning direction, facilitating easier optimization.",
74
+ "By gradually increasing data quality, curriculum learning helps the model adapt and optimize progressively, leading to improved performance as shown in our results.",
75
+ ""
76
+ ],
77
+ "target_context_ids": [
78
+ 12,
79
+ 14,
80
+ 15,
81
+ 16
82
+ ],
83
+ "selected_paragraphs": [
84
+ "[paragraph id = 12] In Table 2, we use the top-right corner in the left panel of Figure 7 (shown in the appendix) as the top 5% of the DIQ and conducted a comparison experiment, we found that using the 5% selected by DIQ resulted in better performance compared to using the top 5% of DIS and DIL separately.",
85
+ "[paragraph id = 14] Based on these insights, we introduced curriculum learning, utilizing multi-stage training that progresses from low-quality to high-quality data.",
86
+ "[paragraph id = 15] This approach, as demonstrated in the ablation experiment in Table 2, highlights the importance of increasing the diversity of data quality for improving model performance.",
87
+ "[paragraph id = 16] By employing this method, we found that using curriculum learning with the DIQ method can further enhance model performance."
88
+ ],
89
+ "table_html": "<figure class=\"ltx_table ltx_align_floatright\" id=\"S4.T2\">\n<div class=\"ltx_inline-block ltx_align_center ltx_transformed_outer\" id=\"S4.T2.1\" style=\"width:166.8pt;height:126pt;vertical-align:-0.0pt;\"><span class=\"ltx_transformed_inner\" style=\"transform:translate(0.0pt,0.0pt) scale(1.0,1.0) ;\">\n<table class=\"ltx_tabular ltx_guessed_headers ltx_align_middle\" id=\"S4.T2.1.1\">\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_r ltx_border_tt\" id=\"S4.T2.1.1.1.1.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.1.1.1.1.1.1\" style=\"font-size:90%;\">Strategy</span></th>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" colspan=\"3\" id=\"S4.T2.1.1.1.1.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.1.1.1.1.2.1\" style=\"font-size:90%;\">Scenario 1</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.2.2\">\n<th class=\"ltx_td ltx_th ltx_th_row ltx_border_r\" id=\"S4.T2.1.1.2.2.1\"></th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.1.2.2.2\"><span class=\"ltx_text\" id=\"S4.T2.1.1.2.2.2.1\" style=\"font-size:90%;\">SQA</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.1.2.2.3\"><span class=\"ltx_text\" id=\"S4.T2.1.1.2.2.3.1\" style=\"font-size:90%;\">TextVQA</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.1.2.2.4\"><span class=\"ltx_text\" id=\"S4.T2.1.1.2.2.4.1\" style=\"font-size:90%;\">GQA</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.3.3\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_r ltx_border_t\" id=\"S4.T2.1.1.3.3.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.3.3.1.1\" style=\"font-size:90%;\">DIS</span></th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.1.1.3.3.2\"><span class=\"ltx_text\" id=\"S4.T2.1.1.3.3.2.1\" style=\"font-size:90%;\">57.06</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.1.1.3.3.3\"><span 
class=\"ltx_text\" id=\"S4.T2.1.1.3.3.3.1\" style=\"font-size:90%;\">56.13</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.1.1.3.3.4\"><span class=\"ltx_text\" id=\"S4.T2.1.1.3.3.4.1\" style=\"font-size:90%;\">61.06</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.4.4\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_r\" id=\"S4.T2.1.1.4.4.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.4.4.1.1\" style=\"font-size:90%;\">DIL</span></th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.1.4.4.2\"><span class=\"ltx_text\" id=\"S4.T2.1.1.4.4.2.1\" style=\"font-size:90%;\">68.82</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.1.4.4.3\"><span class=\"ltx_text\" id=\"S4.T2.1.1.4.4.3.1\" style=\"font-size:90%;\">56.30</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.1.4.4.4\"><span class=\"ltx_text\" id=\"S4.T2.1.1.4.4.4.1\" style=\"font-size:90%;\">60.87</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.5.5\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_r\" id=\"S4.T2.1.1.5.5.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.5.5.1.1\" style=\"font-size:90%;\">DIQ</span></th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.1.5.5.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.1.1.5.5.2.1\" style=\"font-size:90%;\">69.56</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.1.5.5.3\"><span class=\"ltx_text\" id=\"S4.T2.1.1.5.5.3.1\" style=\"font-size:90%;\">56.84</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.1.5.5.4\"><span class=\"ltx_text\" id=\"S4.T2.1.1.5.5.4.1\" style=\"font-size:90%;\">61.16</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.6.6\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_t\" colspan=\"4\" id=\"S4.T2.1.1.6.6.1\"><span class=\"ltx_text ltx_font_italic\" id=\"S4.T2.1.1.6.6.1.1\" style=\"font-size:90%;\">Result with Data Curriculum</span></th>\n</tr>\n<tr class=\"ltx_tr\" 
id=\"S4.T2.1.1.7.7\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_bb ltx_border_r\" id=\"S4.T2.1.1.7.7.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.7.7.1.1\" style=\"font-size:90%;\">Ours</span></th>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T2.1.1.7.7.2\"><span class=\"ltx_text\" id=\"S4.T2.1.1.7.7.2.1\" style=\"font-size:90%;\">69.51</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T2.1.1.7.7.3\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.1.1.7.7.3.1\" style=\"font-size:90%;\">57.25</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T2.1.1.7.7.4\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.1.1.7.7.4.1\" style=\"font-size:90%;\">61.80</span></td>\n</tr>\n</tbody>\n</table>\n</span></div>\n<figcaption class=\"ltx_caption ltx_centering\" style=\"font-size:90%;\"><span class=\"ltx_tag ltx_tag_table\">Table 2: </span>Results across different methods.</figcaption>\n</figure>",
90
+ "perturb_sentence_id": 12,
91
+ "output": {
92
+ "perturbed_statement": "[paragraph id = 12] In Table 2, we use the top-left corner in the left panel of Figure 7 (shown in the appendix) as the top 5% of the DIQ and conducted a comparison experiment. We found that using the 5% selected by DIQ resulted in better performance compared to using the top 5% of DIS and DIL separately. We realized that this improvement is due to the subset from DIQ selecting data evenly from the entire region, whereas DIS and DIL focus on regions with high levels of clip score or loss.",
93
+ "perturbed_explanation": "1. The original explanation clarified that the selection criterion was based on the top-right corner.\n2. The statement specifies the top-left corner, which inconsistency diminishes the accuracy of the description regarding the location derived from Figure 7."
94
+ }
95
+ }
96
+ ]
table_result/2407.00104v1_output.json ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "path": "table_paper/2407.00104v1.json",
4
+ "table_id": "1",
5
+ "section": "2.1",
6
+ "all_context": [
7
+ "The entire database was provided by the Dermatology Unit of the “Hospital Universitario Virgen Macarena” and were sent over 2 years from 60 primary care centers.",
8
+ "The dataset comprises 1559 dermoscopic images divided into 3 subsets.",
9
+ "Four dermatologists provided different types of annotation according to the subsets.",
10
+ "Specifically: The first subset consisted of 1089 dermoscopic images.",
11
+ "Initially, the labeling annotations for these images were the presence or absence of each of the dermoscopic features involved in the diagnosis of BCC.",
12
+ "A second subset of 334 images is additionally enriched with dermatologist delineations of BCC dermoscopic patterns within each image.",
13
+ "More than one segmented area may appear on an image if there are multiple patterns in the BCC lesion.",
14
+ "In the Figure 3 an example is shown.",
15
+ "The third subset is made up of 136 non-BCC images, mostly consisting of nevus lesions, from the ISIC archive [8 ].",
16
+ "Table 1 summarizes the distribution of labels in the database.",
17
+ "As can be seen in this table, the database has a significant class imbalance, with SW and MG underrepresented.",
18
+ "Several techniques have been used to address this problem.",
19
+ ""
20
+ ],
21
+ "target_context_ids": [
22
+ 9,
23
+ 10
24
+ ],
25
+ "selected_paragraphs": [
26
+ "[paragraph id = 9] Table 1 summarizes the distribution of labels in the database.",
27
+ "[paragraph id = 10] As can be seen in this table, the database has a significant class imbalance, with SW and MG underrepresented."
28
+ ],
29
+ "table_html": "<figure class=\"ltx_table\" id=\"S2.T1\">\n<figcaption class=\"ltx_caption\"><span class=\"ltx_tag ltx_tag_table\">Table 1: </span>Sample distribution for binary and multilabel codification.</figcaption>\n<div class=\"ltx_inline-block ltx_align_center ltx_transformed_outer\" id=\"S2.T1.1\" style=\"width:433.6pt;height:36.3pt;vertical-align:-0.7pt;\"><span class=\"ltx_transformed_inner\" style=\"transform:translate(-112.0pt,9.2pt) scale(0.659441329482834,0.659441329482834) ;\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S2.T1.1.1\">\n<tr class=\"ltx_tr\" id=\"S2.T1.1.1.1\">\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" colspan=\"2\" id=\"S2.T1.1.1.1.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.1.1.1.1.1\">Binary codification</span></td>\n<td class=\"ltx_td ltx_border_tt\" id=\"S2.T1.1.1.1.2\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" colspan=\"6\" id=\"S2.T1.1.1.1.3\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T1.1.1.1.3.1\">Multi-label codification</span></td>\n<td class=\"ltx_td ltx_border_tt\" id=\"S2.T1.1.1.1.4\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.1.1.2\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S2.T1.1.1.2.1\">BCC</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S2.T1.1.1.2.2\">Non-BCC</td>\n<td class=\"ltx_td ltx_border_t\" id=\"S2.T1.1.1.2.3\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S2.T1.1.1.2.4\">Pigment Network</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S2.T1.1.1.2.5\">Ulceration</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S2.T1.1.1.2.6\">Ovoid Nests</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S2.T1.1.1.2.7\">Multiglobules</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S2.T1.1.1.2.8\">Maple Leaf-like</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S2.T1.1.1.2.9\">Spoke Wheel</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" 
id=\"S2.T1.1.1.2.10\">Arborizing Telangiectasia</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T1.1.1.3\">\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_t\" id=\"S2.T1.1.1.3.1\">775</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_t\" id=\"S2.T1.1.1.3.2\">784</td>\n<td class=\"ltx_td ltx_border_bb ltx_border_t\" id=\"S2.T1.1.1.3.3\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_t\" id=\"S2.T1.1.1.3.4\">557</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_t\" id=\"S2.T1.1.1.3.5\">385</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_t\" id=\"S2.T1.1.1.3.6\">333</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_t\" id=\"S2.T1.1.1.3.7\">191</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_t\" id=\"S2.T1.1.1.3.8\">244</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_t\" id=\"S2.T1.1.1.3.9\">178</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_t\" id=\"S2.T1.1.1.3.10\">455</td>\n</tr>\n</table>\n</span></div>\n</figure>",
30
+ "perturb_sentence_id": 9,
31
+ "output": {
32
+ "perturbed_statement": "[paragraph id = 9] Table 1 summarizes the distribution of labels in the database. As can be seen in this table, the database has a balanced class distribution, with no labels significantly underrepresented.",
33
+ "perturbed_explanation": "The original explanation: Table 1 highlights the label distribution within the database, showing a significant imbalance with SW and MG labels being underrepresented. The altered statement is incorrect due to the following reasons: 1. It inaccurately claims a balanced class distribution, directly contradicting the information from Table 1. 2. Such misrepresentation could lead to misunderstandings about the data characteristics and their implications."
34
+ }
35
+ },
36
+ {
37
+ "path": "table_paper/2407.00104v1.json",
38
+ "table_id": "2",
39
+ "section": "2.1.1",
40
+ "all_context": [
41
+ "Each image may contain multiple dermoscopic patterns.",
42
+ "Therefore, a one-hot coding scheme was used to encode the labels during image annotation and subsequently to process the dermatologists annotations.",
43
+ "Each image label is a binary word and each BCC dermoscopic pattern is a digit, where 1 means presence and 0 means absence.",
44
+ "The seven patterns that can appear in a BCC lesion are[4 , 18 , 17 ]: Pigment Network (PN) (negative criterion), Ulceration (U), Ovoid Nests (ON), Multiglobules (MG), Maple Leaf-like (ML), Spoke Wheel (SW), Arborizing Telangiectasia (AT) (Figure 2 ).",
45
+ "Thus, each label is a vector of dimensions $1 \\times 7$.",
46
+ "In Table 2 there are some examples of this process.",
47
+ ""
48
+ ],
49
+ "target_context_ids": [
50
+ 2,
51
+ 4,
52
+ 5
53
+ ],
54
+ "selected_paragraphs": [
55
+ "[paragraph id = 2] Each image label is a binary word and each BCC dermoscopic pattern is a digit, where 1 means presence and 0 means absence.",
56
+ "[paragraph id = 4] Thus, each label is a vector of dimensions $1 \\times 7$.",
57
+ "[paragraph id = 5] In Table 2 there are some examples of this process."
58
+ ],
59
+ "table_html": "<figure class=\"ltx_table\" id=\"S2.T2\">\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 2: </span>Example of multilabel and binary encoding for BCC diagnosis</figcaption>\n<div class=\"ltx_inline-block ltx_align_center ltx_transformed_outer\" id=\"S2.T2.1\" style=\"width:260.2pt;height:68.8pt;vertical-align:-0.0pt;\"><span class=\"ltx_transformed_inner\" style=\"transform:translate(-6.1pt,1.6pt) scale(0.955059914997519,0.955059914997519) ;\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S2.T2.1.1\">\n<tr class=\"ltx_tr\" id=\"S2.T2.1.1.1\">\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S2.T2.1.1.1.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T2.1.1.1.1.1\">Codification</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S2.T2.1.1.1.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T2.1.1.1.2.1\">Multi-label</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S2.T2.1.1.1.3\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T2.1.1.1.3.1\">Binary</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S2.T2.1.1.1.4\"><span class=\"ltx_text ltx_font_bold\" id=\"S2.T2.1.1.1.4.1\">Diagnostic</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.1.1.2\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S2.T2.1.1.2.1\">Example 1</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S2.T2.1.1.2.2\">[0 1 0 1 1 0 1]</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S2.T2.1.1.2.3\">1</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S2.T2.1.1.2.4\">Presence of BCC</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.1.1.3\">\n<td class=\"ltx_td ltx_align_center\" id=\"S2.T2.1.1.3.1\">Example 2</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S2.T2.1.1.3.2\">[1 0 0 0 0 0 0]</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S2.T2.1.1.3.3\">0</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S2.T2.1.1.3.4\">Absence 
of BCC</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S2.T2.1.1.4\">\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S2.T2.1.1.4.1\">Example 3</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S2.T2.1.1.4.2\">[0 0 0 0 0 0 0]</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S2.T2.1.1.4.3\">0</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S2.T2.1.1.4.4\">Absence of BCC</td>\n</tr>\n</table>\n</span></div>\n</figure>",
60
+ "perturb_sentence_id": 5,
61
+ "output": {
62
+ "perturbed_statement": "[paragraph id = 5] In Table 5 there are some examples of this process.",
63
+ "perturbed_explanation": "1. The original statement identifies Table 2 as containing examples of the discussed process. 2. The statement mistakenly references Table 5 instead, which is not mentioned in the provided context as containing examples."
64
+ }
65
+ },
66
+ {
67
+ "path": "table_paper/2407.00104v1.json",
68
+ "table_id": "3",
69
+ "section": "3.2",
70
+ "all_context": [
71
+ "This section analyzes the performance of the AI tool for BCC detection in conjunction with the labels provided to explain this classification.",
72
+ "Table 3 presents metrics that summarize this performance.",
73
+ "The metrics are averaged over all folds.",
74
+ "This table has three parts.",
75
+ "The first part shows the performance of the AI tool in the binary classification.",
76
+ "The second part shows its performance in detecting BCC dermoscopic patterns.",
77
+ "Finally, the third part represents the accuracy of the labels that provide the clinical explanation.",
78
+ "Overall, the BCC/non-BCC diagnostic performance is high, around for all metrics.",
79
+ "However, the BCC pattern detection performance has to be analysed with a deeper insight.",
80
+ "Minority classes tend to attain low recall because the AI tool trained with unbalanced databases tends to favor majority classes.",
81
+ "As shown in Sect.",
82
+ "2.1 , SW, MG and ML are underrepresented classes.",
83
+ "Strategies such as data augmentation and advanced sampling, a one-vs-all strategy combined with stratified k-fold cross-validation helped to achieve a more balanced classification across patterns, thereby improving overall model performance.",
84
+ "However, the metrics achieved should not be analyzed in the same way as BCC/non-BCC performance.",
85
+ "They should only be evaluated to the extent that they provide a correct explanation for the binary classification.",
86
+ "It is not relevant if the AI tool misses a specific BCC pattern, but if it misses any BCC pattern, as clinicians diagnose skin lesions in the same way.",
87
+ "This further evaluation is summarized in the third part of Table 3 .",
88
+ "As shown in this table, 73 percent of non-BCC lesions without any BCC pattern, 95 percent of non-BCC lesions with PN, and 99 percent of BCC lesions with some BCC pattern are correctly labeled as such.",
89
+ ""
90
+ ],
91
+ "target_context_ids": [
92
+ 1,
93
+ 4,
94
+ 5,
95
+ 6,
96
+ 7,
97
+ 8,
98
+ 9,
99
+ 12,
100
+ 13,
101
+ 14,
102
+ 15,
103
+ 16,
104
+ 17
105
+ ],
106
+ "selected_paragraphs": [
107
+ "[paragraph id = 1] Table 3 presents metrics that summarize this performance.",
108
+ "[paragraph id = 4] The first part shows the performance of the AI tool in the binary classification.",
109
+ "[paragraph id = 5] The second part shows its performance in detecting BCC dermoscopic patterns.",
110
+ "[paragraph id = 6] Finally, the third part represents the accuracy of the labels that provide the clinical explanation.",
111
+ "[paragraph id = 7] Overall, the BCC/non-BCC diagnostic performance is high, around for all metrics.",
112
+ "[paragraph id = 8] However, the BCC pattern detection performance has to be analysed with a deeper insight.",
113
+ "[paragraph id = 9] Minority classes tend to attain low recall because the AI tool trained with unbalanced databases tends to favor majority classes.",
114
+ "[paragraph id = 12] Strategies such as data augmentation and advanced sampling, a one-vs-all strategy combined with stratified k-fold cross-validation helped to achieve a more balanced classification across patterns, thereby improving overall model performance.",
115
+ "[paragraph id = 13] However, the metrics achieved should not be analyzed in the same way as BCC/non-BCC performance.",
116
+ "[paragraph id = 14] They should only be evaluated to the extent that they provide a correct explanation for the binary classification.",
117
+ "[paragraph id = 15] It is not relevant if the AI tool misses a specific BCC pattern, but if it misses any BCC pattern, as clinicians diagnose skin lesions in the same way.",
118
+ "[paragraph id = 16] This further evaluation is summarized in the third part of Table 3 .",
119
+ "[paragraph id = 17] As shown in this table, 73 percent of non-BCC lesions without any BCC pattern, 95 percent of non-BCC lesions with PN, and 99 percent of BCC lesions with some BCC pattern are correctly labeled as such."
120
+ ],
121
+ "table_html": "<figure class=\"ltx_table\" id=\"S3.T3\">\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 3: </span>Evaluation using binary and multilabel classification metrics, fine-tuned binary classifier, and physician-guided analysis.</figcaption>\n<div class=\"ltx_inline-block ltx_align_center ltx_transformed_outer\" id=\"S3.T3.1\" style=\"width:260.2pt;height:212.7pt;vertical-align:-0.0pt;\"><span class=\"ltx_transformed_inner\" style=\"transform:translate(-35.0pt,28.6pt) scale(0.787876288469247,0.787876288469247) ;\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S3.T3.1.1\">\n<tr class=\"ltx_tr\" id=\"S3.T3.1.1.1\">\n<td class=\"ltx_td ltx_border_tt\" id=\"S3.T3.1.1.1.1\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S3.T3.1.1.1.2\">Recall</td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S3.T3.1.1.1.3\">Specificity</td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S3.T3.1.1.1.4\">Precision</td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S3.T3.1.1.1.5\">Accuracy</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.1.1.2\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.2.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T3.1.1.2.1.1\">BCC/Non-BCC</span></td>\n<td class=\"ltx_td ltx_border_t\" id=\"S3.T3.1.1.2.2\"></td>\n<td class=\"ltx_td ltx_border_t\" id=\"S3.T3.1.1.2.3\"></td>\n<td class=\"ltx_td ltx_border_t\" id=\"S3.T3.1.1.2.4\"></td>\n<td class=\"ltx_td ltx_border_t\" id=\"S3.T3.1.1.2.5\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.1.1.3\">\n<td class=\"ltx_td ltx_border_t\" id=\"S3.T3.1.1.3.1\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.3.2\">0.89</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.3.3\">0.89</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.3.4\">0.90</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" 
id=\"S3.T3.1.1.3.5\">0.90</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.1.1.4\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.4.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T3.1.1.4.1.1\">Pattern detection</span></td>\n<td class=\"ltx_td ltx_border_t\" id=\"S3.T3.1.1.4.2\"></td>\n<td class=\"ltx_td ltx_border_t\" id=\"S3.T3.1.1.4.3\"></td>\n<td class=\"ltx_td ltx_border_t\" id=\"S3.T3.1.1.4.4\"></td>\n<td class=\"ltx_td ltx_border_t\" id=\"S3.T3.1.1.4.5\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.1.1.5\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.5.1\">Pigment Network</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.5.2\">0.94</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.5.3\">0.96</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.5.4\">0.97</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.5.5\">0.95</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.1.1.6\">\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.6.1\">Ulceration</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.6.2\">0.81</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.6.3\">0.75</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.6.4\">0.52</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.6.5\">0.77</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.1.1.7\">\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.7.1\">OvidNests</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.7.2\">0.65</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.7.3\">0.84</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.7.4\">0.53</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.7.5\">0.84</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.1.1.8\">\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.8.1\">Multiglobules</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.8.2\">0.61</td>\n<td class=\"ltx_td 
ltx_align_center\" id=\"S3.T3.1.1.8.3\">0.81</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.8.4\">0.32</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.8.5\">0.80</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.1.1.9\">\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.9.1\">Maple Leaf-like</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.9.2\">0.50</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.9.3\">0.82</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.9.4\">0.34</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.9.5\">0.77</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.1.1.10\">\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.10.1\">Spoke Wheel</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.10.2\">0.60</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.10.3\">0.87</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.10.4\">0.37</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.10.5\">0.84</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.1.1.11\">\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.11.1\">Arborizing Telangiectasia</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.11.2\">0.89</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.11.3\">0.76</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.11.4\">0.61</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.11.5\">0.80</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.1.1.12\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.12.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T3.1.1.12.1.1\">Clinical-inspired XAI</span></td>\n<td class=\"ltx_td ltx_border_t\" id=\"S3.T3.1.1.12.2\"></td>\n<td class=\"ltx_td ltx_border_t\" id=\"S3.T3.1.1.12.3\"></td>\n<td class=\"ltx_td ltx_border_t\" id=\"S3.T3.1.1.12.4\"></td>\n<td class=\"ltx_td ltx_border_t\" id=\"S3.T3.1.1.12.5\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.1.1.13\">\n<td class=\"ltx_td ltx_align_center 
ltx_border_t\" id=\"S3.T3.1.1.13.1\">All 0’s</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.13.2\">-</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.13.3\">-</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.13.4\">-</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.13.5\">0.73</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.1.1.14\">\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.14.1\">Pigment Network</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.14.2\">0.94</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.14.3\">0.96</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.14.4\">0.97</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.14.5\">0.95</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.1.1.15\">\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T3.1.1.15.1\">BCC pattern detection</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T3.1.1.15.2\">0.84</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T3.1.1.15.3\">0.88</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T3.1.1.15.4\">0.71</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T3.1.1.15.5\">0.99</td>\n</tr>\n</table>\n</span></div>\n</figure>",
122
+ "perturb_sentence_id": 17,
123
+ "output": {
124
+ "perturbed_statement": "[paragraph id = 17] As shown in this table, 73 percent of BCC lesions without any BCC pattern, 95 percent of non-BCC lesions with PN, and 99 percent of non-BCC lesions with some BCC pattern are correctly labeled as such.",
125
+ "perturbed_explanation": "1. Analyzing the given information, it is stated that 73 percent of non-BCC lesions without any BCC pattern, 95 percent of non-BCC lesions with PN, and 99 percent of BCC lesions with some BCC pattern are accurately labeled. 2. The statement incorrectly suggests that 73 percent of BCC lesions without BCC patterns and 99 percent of non-BCC lesions exhibiting BCC patterns are correctly labeled, which contradicts the context since it specifically mentions the performance metrics for non-BCC lesions without patterns and BCC lesions with patterns."
126
+ }
127
+ },
128
+ {
129
+ "path": "table_paper/2407.00104v1.json",
130
+ "table_id": "4",
131
+ "section": "3.3",
132
+ "all_context": [
133
+ "This section aims to quantify the accuracy of the AI tool in focusing on the correct part of the lesion, specifically the BCC dermoscopic patterns identified by clinicians.",
134
+ "To this end, BCC pattern areas delineated by dermatologists will be compared with model activated areas.",
135
+ "This will provide a quantitative measure of the model's agreement with human diagnostic criteria and demonstrate its ability to accurately identify critical features of BCC lesions.",
136
+ "To quantify the accuracy of the model activation areas with respect to the areas of clinical interest the conditional probability density functions of the normalized GradCAM values within and outside the area segmented by dermatologist were estimated.",
137
+ "Let the GradCAM value at position .",
138
+ "Let denote Fg the area segmented by the dermatologist and Bg the background.",
139
+ "is the probability density function of GradCAM values for pixels and w is the probability density function of GradCAM values for pixels .",
140
+ "Figure 4 illustrates this analysis.",
141
+ "Figure 4(a) shows the original BCC lesion.",
142
+ "Figure 4(b) shows the Grad-CAM map.",
143
+ "Figure 4(c) shows the dermatologist's segmentation overlaid on the Grad-CAM map.",
144
+ "Figure 4(d) shows an example of the two conditional probability density functions.",
145
+ "The orange curve represents , and the blue curve represents .",
146
+ "The orange curve is centered near 0, indicating low activation outside the mask, while the blue curve shows significant Grad-CAM information within the clinical segmentation, indicating that the model extracts features from the same region as the specialist.",
147
+ "Table 4 summarizes the information extracted from these probability density function.",
148
+ "Specifically, mean, standard deviation of for and respectively, and the intersection area between and are shown.",
149
+ "This table shows that correctly predicted samples have a larger mean standard deviation than incorrectly predicted samples.",
150
+ "In addition, the intersection area is larger in these cases.",
151
+ "These facts prove that the model is not able to pay attention to the areas of clinical interest in the incorrect predictions.",
152
+ ""
153
+ ],
154
+ "target_context_ids": [
155
+ 3,
156
+ 14,
157
+ 15,
158
+ 16,
159
+ 17
160
+ ],
161
+ "selected_paragraphs": [
162
+ "[paragraph id = 3] To quantify the accuracy of the model activation areas with respect to the areas of clinical interest the conditional probability density functions of the normalized GradCAM values within and outside the area segmented by dermatologist were estimated.",
163
+ "[paragraph id = 14] Table 4 summarizes the information extracted from these probability density function.",
164
+ "[paragraph id = 15] Specifically, mean, standard deviation of for and respectively, and the intersection area between and are shown.",
165
+ "[paragraph id = 16] This table shows that correctly predicted samples have a larger mean standard deviation than incorrectly predicted samples.",
166
+ "[paragraph id = 17] In addition, the intersection area is larger in these cases."
167
+ ],
168
+ "table_html": "<figure class=\"ltx_table\" id=\"S3.T4\">\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 4: </span>Statistics derived from estimation of conditional probability density functions of GradCAM within and outside the region of clinical interest.</figcaption>\n<div class=\"ltx_inline-block ltx_align_center ltx_transformed_outer\" id=\"S3.T4.1\" style=\"width:260.2pt;height:42.4pt;vertical-align:-0.0pt;\"><span class=\"ltx_transformed_inner\" style=\"transform:translate(-52.2pt,8.5pt) scale(0.713620974996278,0.713620974996278) ;\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S3.T4.1.1\">\n<tr class=\"ltx_tr\" id=\"S3.T4.1.1.1\">\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S3.T4.1.1.1.1\">Prediction</td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S3.T4.1.1.1.2\">Intersection</td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S3.T4.1.1.1.3\">\n<span class=\"ltx_text\" id=\"S3.T4.1.1.1.3.1\"></span> <span class=\"ltx_text\" id=\"S3.T4.1.1.1.3.2\">\n<span class=\"ltx_tabular ltx_align_middle\" id=\"S3.T4.1.1.1.3.2.1\">\n<span class=\"ltx_tr\" id=\"S3.T4.1.1.1.3.2.1.1\">\n<span class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S3.T4.1.1.1.3.2.1.1.1\">Mean Fg</span></span>\n</span></span><span class=\"ltx_text\" id=\"S3.T4.1.1.1.3.3\"></span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S3.T4.1.1.1.4\">\n<span class=\"ltx_text\" id=\"S3.T4.1.1.1.4.1\"></span> <span class=\"ltx_text\" id=\"S3.T4.1.1.1.4.2\">\n<span class=\"ltx_tabular ltx_align_middle\" id=\"S3.T4.1.1.1.4.2.1\">\n<span class=\"ltx_tr\" id=\"S3.T4.1.1.1.4.2.1.1\">\n<span class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S3.T4.1.1.1.4.2.1.1.1\">Mean Bg</span></span>\n</span></span><span class=\"ltx_text\" id=\"S3.T4.1.1.1.4.3\"></span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S3.T4.1.1.1.5\">\n<span class=\"ltx_text\" id=\"S3.T4.1.1.1.5.1\"></span> <span 
class=\"ltx_text\" id=\"S3.T4.1.1.1.5.2\">\n<span class=\"ltx_tabular ltx_align_middle\" id=\"S3.T4.1.1.1.5.2.1\">\n<span class=\"ltx_tr\" id=\"S3.T4.1.1.1.5.2.1.1\">\n<span class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S3.T4.1.1.1.5.2.1.1.1\">Std Fg</span></span>\n</span></span><span class=\"ltx_text\" id=\"S3.T4.1.1.1.5.3\"></span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S3.T4.1.1.1.6\">\n<span class=\"ltx_text\" id=\"S3.T4.1.1.1.6.1\"></span> <span class=\"ltx_text\" id=\"S3.T4.1.1.1.6.2\">\n<span class=\"ltx_tabular ltx_align_middle\" id=\"S3.T4.1.1.1.6.2.1\">\n<span class=\"ltx_tr\" id=\"S3.T4.1.1.1.6.2.1.1\">\n<span class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S3.T4.1.1.1.6.2.1.1.1\">Std Bg</span></span>\n</span></span><span class=\"ltx_text\" id=\"S3.T4.1.1.1.6.3\"></span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T4.1.1.2\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T4.1.1.2.1\">Correct</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T4.1.1.2.2\">0.24</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T4.1.1.2.3\">0.57</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T4.1.1.2.4\">0.16</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T4.1.1.2.5\">0.14</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T4.1.1.2.6\">0.22</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T4.1.1.3\">\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T4.1.1.3.1\">Incorrect</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T4.1.1.3.2\">0.32</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T4.1.1.3.3\">0.33</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T4.1.1.3.4\">0.14</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T4.1.1.3.5\">0.01</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T4.1.1.3.6\">0.21</td>\n</tr>\n</table>\n</span></div>\n</figure>",
169
+ "perturb_sentence_id": 14,
170
+ "output": {
171
+ "perturbed_statement": "[paragraph id = 14] Table 4 summarizes the information extracted from these probability density function. Specifically, the median and variance of for and respectively, and the intersection area between and are shown.",
172
+ "perturbed_explanation": "1. The original statement describes that Table 4 provides a summary including the mean, standard deviation, and intersection area related to the probability density functions. 2. The statement is incorrect because it inaccurately states that the table contains the median and variance instead of the actual metrics, mean and standard deviation, as outlined in the summary."
173
+ }
174
+ }
175
+ ]
table_result/2407.00108v1_output.json ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "path": "table_paper/2407.00108v1.json",
4
+ "table_id": "1",
5
+ "section": "3.2",
6
+ "all_context": [
7
+ "The human evaluation aspect of the study is interpreted as the effort required to post-edit the translations to a production standard, and captured in the number of errors, keystrokes and total edit time.",
8
+ "The task was performed by professional HTs and PEs using ZOOSubs, an in-house software application belonging to ZOO Digital, built to facilitate manual translation of video material (Figure 1 ).",
9
+ "The software s interface displays the video material along with timed subtitles in the original language.",
10
+ "The target stream, i.e.",
11
+ "the set of text boxes provided to the right of the source stream, is where the HTs input their translations to the desired language.",
12
+ "It can optionally be pre-populated with “draft” translations – a setting we opted for in this study – allowing post-editors to edit, divide or combine the segments as they see fit.",
13
+ "To make amendments to a segment, the PE needs to click on its box.",
14
+ "From that point, the system tracks the time spent editing the box and the number of keystrokes made.",
15
+ "These metrics are recorded for each box separately and taken into account only if the post-edited text differs from the original.",
16
+ "After applying modifications, an Issues for event window appears for the user to specify the purpose of the changes by selecting errors from a predefined list, optionally providing text commentary.",
17
+ "We leveraged this functionality of ZOOSubs to measure the total and average time and number of keystrokes made by HTs and PEs given some pre-existing translations.",
18
+ "We also measured the number of selected errors.",
19
+ "For this project, we created a bespoke taxonomy of errors (Table 1 ) based on translation errors reported in previous work [Freitag et al., 2021 , Sharou and Specia, 2022 ], the original list of issues already present in the ZOOSubs system and relevant errors from previous work (§2 ).",
20
+ "Error categories from the aforementioned sources were compiled together and curated to fit the study requirements444We uploaded a draft taxonomy to ZOOSubs, and the first author performed a test evaluation against a stream with segments to validate the list.",
21
+ "As a result, some errors were split into more granular categories, some were renamed and some generalised.",
22
+ "The PEs operated on seven episodes from three TV series of varying genres: a fictional series about space exploration, a documentary exploring aspects of everyday life, and a family cooking competition show.",
23
+ "They were unaware that some of the text they worked with was machine translated, but were told that it was for a research project and asked to relax some constraints such as adhering to the reading speed limits.",
24
+ "In addition, we asked four HTs (two to German, two to French) to translate one episode of the cooking show from scratch in ZOOSubs so we could compare their effort to that of post-editors.",
25
+ "For each of the seven episodes, the PEs were asked to post-edit one out of four versions of the text, corresponding to the list outlined in §3 .",
26
+ "We included the human references (Ref) to account for the fact that PEs can sometimes post-edit a translation even when the original one is valid.",
27
+ "Our setup ensured that the same PE evaluated the output for each episode exactly once (i.e.",
28
+ "does not see two different versions of the same text) (Table 2 ).",
29
+ "When referring to individual PEs, we use the notation PE.",
30
+ "[L][i], where L ∈ {G (German), F (French)}, and i denotes the PE ID .",
31
+ "The recruited PEs and HTs were professionals within the subtitle domain and freelance employees of ZOO Digital.",
32
+ "They were informed that the undertaken work was carried out for a research project, but nevertheless, they were paid for their effort at competitive PE and HT rates, standard within the company for this type of work.",
33
+ "Information about the PEs' and HTs' years of experience (YOE) was collected to shed more light on the findings (Table 3 ).",
34
+ "They also answered a short survey about their views regarding machine translation, discussed in detail in §5.3 : Which one would you prefer: translating a stream from scratch or completing a quality check on (post-editing) a stream?",
35
+ "What are your views on the use of machine translation in the industry?",
36
+ "In your view, are there benefits to post-editing translations over translating from scratch?",
37
+ "All French HTs had training in post-editing, and three out of four preferred it to translating from scratch, while no German HTs had received such training in the past, and all but one strictly preferred FST.",
38
+ "All PEs had at least one YOE in post-editing and one and a half in the subtitle domain.",
39
+ "Although the HTs within both pairs had a similar amount of experience in translation in general and in the subtitle domain ( for French vs for German), the French HTs had the advantage in terms of YOE in both subtitling (a mean difference of YOE) and post-editing (a mean difference of YOE).",
40
+ ""
41
+ ],
42
+ "target_context_ids": [
43
+ 12,
44
+ 13,
45
+ 14
46
+ ],
47
+ "selected_paragraphs": [
48
+ "[paragraph id = 12] For this project, we created a bespoke taxonomy of errors (Table 1 ) based on translation errors reported in previous work [Freitag et al., 2021 , Sharou and Specia, 2022 ], the original list of issues already present in the ZOOSubs system and relevant errors from previous work (§2 ).",
49
+ "[paragraph id = 13] Error categories from the aforementioned sources were compiled together and curated to fit the study requirements444We uploaded a draft taxonomy to ZOOSubs, and the first author performed a test evaluation against a stream with segments to validate the list.",
50
+ "[paragraph id = 14] As a result, some errors were split into more granular categories, some were renamed and some generalised."
51
+ ],
52
+ "table_html": "<figure class=\"ltx_table\" id=\"S3.T1\">\n<div class=\"ltx_inline-block ltx_align_center ltx_transformed_outer\" id=\"S3.T1.1\" style=\"width:455.2pt;height:275.9pt;vertical-align:-0.0pt;\"><span class=\"ltx_transformed_inner\" style=\"transform:translate(-62.2pt,37.7pt) scale(0.785454796276505,0.785454796276505) ;\">\n<table class=\"ltx_tabular ltx_guessed_headers ltx_align_middle\" id=\"S3.T1.1.1\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S3.T1.1.1.1.1\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_column ltx_th_row ltx_border_tt\" id=\"S3.T1.1.1.1.1.1\">Type</th>\n<th class=\"ltx_td ltx_nopad_r ltx_align_justify ltx_align_top ltx_th ltx_th_column ltx_border_tt\" id=\"S3.T1.1.1.1.1.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S3.T1.1.1.1.1.2.1\">\n<span class=\"ltx_p\" id=\"S3.T1.1.1.1.1.2.1.1\" style=\"width:412.6pt;\">Description</span>\n</span>\n</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S3.T1.1.1.2.1\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row ltx_border_t\" id=\"S3.T1.1.1.2.1.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T1.1.1.2.1.1.1\">Translation quality</span></th>\n<td class=\"ltx_td ltx_nopad_r ltx_align_top ltx_border_t\" id=\"S3.T1.1.1.2.1.2\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.1.1.3.2\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row\" id=\"S3.T1.1.1.3.2.1\"><span class=\"ltx_text ltx_font_italic\" id=\"S3.T1.1.1.3.2.1.1\">Catastrophic translation</span></th>\n<td class=\"ltx_td ltx_nopad_r ltx_align_justify ltx_align_top\" id=\"S3.T1.1.1.3.2.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S3.T1.1.1.3.2.2.1\">\n<span class=\"ltx_p\" id=\"S3.T1.1.1.3.2.2.1.1\" style=\"width:412.6pt;\">Impossible to post-edit, must be translated from scratch.</span>\n</span>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.1.1.4.3\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row\" id=\"S3.T1.1.1.4.3.1\"><span class=\"ltx_text 
ltx_font_italic\" id=\"S3.T1.1.1.4.3.1.1\">Mistranslation</span></th>\n<td class=\"ltx_td ltx_nopad_r ltx_align_justify ltx_align_top\" id=\"S3.T1.1.1.4.3.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S3.T1.1.1.4.3.2.1\">\n<span class=\"ltx_p\" id=\"S3.T1.1.1.4.3.2.1.1\" style=\"width:412.6pt;\">Incorrect. Does not preserve the meaning or function of the source.</span>\n</span>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.1.1.5.4\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row\" id=\"S3.T1.1.1.5.4.1\"><span class=\"ltx_text ltx_font_italic\" id=\"S3.T1.1.1.5.4.1.1\">Omission</span></th>\n<td class=\"ltx_td ltx_nopad_r ltx_align_justify ltx_align_top\" id=\"S3.T1.1.1.5.4.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S3.T1.1.1.5.4.2.1\">\n<span class=\"ltx_p\" id=\"S3.T1.1.1.5.4.2.1.1\" style=\"width:412.6pt;\">Part of the source text was left untranslated.</span>\n</span>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.1.1.6.5\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row\" id=\"S3.T1.1.1.6.5.1\"><span class=\"ltx_text ltx_font_italic\" id=\"S3.T1.1.1.6.5.1.1\">Deviation in sentiment</span></th>\n<td class=\"ltx_td ltx_nopad_r ltx_align_justify ltx_align_top\" id=\"S3.T1.1.1.6.5.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S3.T1.1.1.6.5.2.1\">\n<span class=\"ltx_p\" id=\"S3.T1.1.1.6.5.2.1.1\" style=\"width:412.6pt;\">Does not preserve the sentiment of the source (e.g. does not match the expressed excitement), or negates the sentiment (e.g. 
from positive to negative).</span>\n</span>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.1.1.7.6\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row\" id=\"S3.T1.1.1.7.6.1\"><span class=\"ltx_text ltx_font_italic\" id=\"S3.T1.1.1.7.6.1.1\">Locale convention</span></th>\n<td class=\"ltx_td ltx_nopad_r ltx_align_justify ltx_align_top\" id=\"S3.T1.1.1.7.6.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S3.T1.1.1.7.6.2.1\">\n<span class=\"ltx_p\" id=\"S3.T1.1.1.7.6.2.1.1\" style=\"width:412.6pt;\">Violates locale convention, e.g. currency and date format.</span>\n</span>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.1.1.8.7\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row\" id=\"S3.T1.1.1.8.7.1\"><span class=\"ltx_text ltx_font_italic\" id=\"S3.T1.1.1.8.7.1.1\">Fluency</span></th>\n<td class=\"ltx_td ltx_nopad_r ltx_align_justify ltx_align_top\" id=\"S3.T1.1.1.8.7.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S3.T1.1.1.8.7.2.1\">\n<span class=\"ltx_p\" id=\"S3.T1.1.1.8.7.2.1.1\" style=\"width:412.6pt;\">Contains punctuation, spelling and grammar errors.</span>\n</span>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.1.1.9.8\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row ltx_border_t\" id=\"S3.T1.1.1.9.8.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T1.1.1.9.8.1.1\">Context</span></th>\n<td class=\"ltx_td ltx_nopad_r ltx_align_top ltx_border_t\" id=\"S3.T1.1.1.9.8.2\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.1.1.10.9\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row\" id=\"S3.T1.1.1.10.9.1\"><span class=\"ltx_text ltx_font_italic\" id=\"S3.T1.1.1.10.9.1.1\">Incorrect gender</span></th>\n<td class=\"ltx_td ltx_nopad_r ltx_align_justify ltx_align_top\" id=\"S3.T1.1.1.10.9.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S3.T1.1.1.10.9.2.1\">\n<span class=\"ltx_p\" id=\"S3.T1.1.1.10.9.2.1.1\" style=\"width:412.6pt;\">Misgenders the speaker or the addressed person(s).</span>\n</span>\n</td>\n</tr>\n<tr 
class=\"ltx_tr\" id=\"S3.T1.1.1.11.10\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row\" id=\"S3.T1.1.1.11.10.1\"><span class=\"ltx_text ltx_font_italic\" id=\"S3.T1.1.1.11.10.1.1\">Incorrect plurality</span></th>\n<td class=\"ltx_td ltx_nopad_r ltx_align_justify ltx_align_top\" id=\"S3.T1.1.1.11.10.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S3.T1.1.1.11.10.2.1\">\n<span class=\"ltx_p\" id=\"S3.T1.1.1.11.10.2.1.1\" style=\"width:412.6pt;\">Incorrectly refers to a single person when a group is addressed, or vice versa.</span>\n</span>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.1.1.12.11\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row\" id=\"S3.T1.1.1.12.11.1\"><span class=\"ltx_text ltx_font_italic\" id=\"S3.T1.1.1.12.11.1.1\">Wrong formality</span></th>\n<td class=\"ltx_td ltx_nopad_r ltx_align_justify ltx_align_top\" id=\"S3.T1.1.1.12.11.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S3.T1.1.1.12.11.2.1\">\n<span class=\"ltx_p\" id=\"S3.T1.1.1.12.11.2.1.1\" style=\"width:412.6pt;\">Expressed in informal style or uses informal addressing when should use formal, or vice versa.</span>\n</span>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.1.1.13.12\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row\" id=\"S3.T1.1.1.13.12.1\"><span class=\"ltx_text ltx_font_italic\" id=\"S3.T1.1.1.13.12.1.1\">Other inconsistency with video</span></th>\n<td class=\"ltx_td ltx_nopad_r ltx_align_justify ltx_align_top\" id=\"S3.T1.1.1.13.12.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S3.T1.1.1.13.12.2.1\">\n<span class=\"ltx_p\" id=\"S3.T1.1.1.13.12.2.1.1\" style=\"width:412.6pt;\">Contains inconsistencies with the video material not falling within any of the above.</span>\n</span>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.1.1.14.13\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row ltx_border_t\" id=\"S3.T1.1.1.14.13.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T1.1.1.14.13.1.1\">Style</span></th>\n<td 
class=\"ltx_td ltx_nopad_r ltx_align_top ltx_border_t\" id=\"S3.T1.1.1.14.13.2\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.1.1.15.14\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row\" id=\"S3.T1.1.1.15.14.1\"><span class=\"ltx_text ltx_font_italic\" id=\"S3.T1.1.1.15.14.1.1\">Subtitle formatting violation</span></th>\n<td class=\"ltx_td ltx_nopad_r ltx_align_justify ltx_align_top\" id=\"S3.T1.1.1.15.14.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S3.T1.1.1.15.14.2.1\">\n<span class=\"ltx_p\" id=\"S3.T1.1.1.15.14.2.1.1\" style=\"width:412.6pt;\">Violation of the subtitle blocking guidelines.</span>\n</span>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.1.1.16.15\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row\" id=\"S3.T1.1.1.16.15.1\"><span class=\"ltx_text ltx_font_italic\" id=\"S3.T1.1.1.16.15.1.1\">Other style sheet non-compliance</span></th>\n<td class=\"ltx_td ltx_nopad_r ltx_align_justify ltx_align_top\" id=\"S3.T1.1.1.16.15.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S3.T1.1.1.16.15.2.1\">\n<span class=\"ltx_p\" id=\"S3.T1.1.1.16.15.2.1.1\" style=\"width:412.6pt;\">Does not conform to the provided style sheet.</span>\n</span>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.1.1.17.16\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row\" id=\"S3.T1.1.1.17.16.1\"><span class=\"ltx_text ltx_font_italic\" id=\"S3.T1.1.1.17.16.1.1\">Awkward style</span></th>\n<td class=\"ltx_td ltx_nopad_r ltx_align_justify ltx_align_top\" id=\"S3.T1.1.1.17.16.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S3.T1.1.1.17.16.2.1\">\n<span class=\"ltx_p\" id=\"S3.T1.1.1.17.16.2.1.1\" style=\"width:412.6pt;\">The style of the translation does not reflect the style of the source sentence and/or the context.</span>\n</span>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.1.1.18.17\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row\" id=\"S3.T1.1.1.18.17.1\"><span class=\"ltx_text ltx_font_italic\" 
id=\"S3.T1.1.1.18.17.1.1\">Subjective style changes</span></th>\n<td class=\"ltx_td ltx_nopad_r ltx_align_justify ltx_align_top\" id=\"S3.T1.1.1.18.17.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S3.T1.1.1.18.17.2.1\">\n<span class=\"ltx_p\" id=\"S3.T1.1.1.18.17.2.1.1\" style=\"width:412.6pt;\">The translation is acceptable but the editor suggests improvements in style.</span>\n</span>\n</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T1.1.1.19.18\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row ltx_border_bb ltx_border_t\" id=\"S3.T1.1.1.19.18.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T1.1.1.19.18.1.1\">Other</span></th>\n<td class=\"ltx_td ltx_nopad_r ltx_align_justify ltx_align_top ltx_border_bb ltx_border_t\" id=\"S3.T1.1.1.19.18.2\">\n<span class=\"ltx_inline-block ltx_align_top\" id=\"S3.T1.1.1.19.18.2.1\">\n<span class=\"ltx_p\" id=\"S3.T1.1.1.19.18.2.1.1\" style=\"width:412.6pt;\">Error of type not found above (use text box provided).</span>\n</span>\n</td>\n</tr>\n</tbody>\n</table>\n</span></div>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 1: </span>List of errors provided to the human evaluators during the campaign.</figcaption>\n</figure>",
53
+ "perturb_sentence_id": 12,
54
+ "output": {
55
+ "perturbed_statement": "[paragraph id = 12] For this project, we created a bespoke taxonomy of errors (Table 2) based on translation errors reported in recent work [Freitag et al., 2021, Sharou and Specia, 2020], the original list of issues currently present in the ZOOSubs system, and errors unrelated to previous work (§3 ). Error categories from the sources were compiled and curated to be generic and not specific to the study requirements.",
56
+ "perturbed_explanation": "1. The original statement correctly listed the contributing sources for the taxonomy of errors as previous works and the ZOOSubs system, aligning with the provided information. It precisely specified the process of compiling and curating error categories to fit the specific study requirements, conforming to the context.\n2. The statement now incorrectly refers to \"Table 2\" instead of \"Table 1\", erroneously dates Sharou and Specia to 2020 instead of 2022, incorrectly claims inclusion of errors unrelated to previous work, and states that error categories were curated to be generic rather than specific, which contradicts the given explanations. Such inaccuracies misrepresent the scope and methods described for the taxonomy creation and its alignment with the study focus."
57
+ }
58
+ },
59
+ {
60
+ "path": "table_paper/2407.00108v1.json",
61
+ "table_id": "2",
62
+ "section": "3.2",
63
+ "all_context": [
64
+ "The human evaluation aspect of the study is interpreted as the effort required to post-edit the translations to a production standard, and captured in the number of errors, keystrokes and total edit time.",
65
+ "The task was performed by professional HTs and PEs using ZOOSubs, an in-house software application belonging to ZOO Digital, built to facilitate manual translation of video material (Figure 1 ).",
66
+ "The software's interface displays the video material along with timed subtitles in the original language.",
67
+ "The target stream, i.e.",
68
+ "the set of text boxes provided to the right of the source stream, is where the HTs input their translations to the desired language.",
69
+ "It can optionally be pre-populated with “draft” translations – a setting we opted for in this study – allowing post-editors to edit, divide or combine the segments as they see fit.",
70
+ "To make amendments to a segment, the PE needs to click on its box.",
71
+ "From that point, the system tracks the time spent editing the box and the number of keystrokes made.",
72
+ "These metrics are recorded for each box separately and taken into account only if the post-edited text differs from the original.",
73
+ "After applying modifications, an Issues for event window appears for the user to specify the purpose of the changes by selecting errors from a predefined list, optionally providing text commentary.",
74
+ "We leveraged this functionality of ZOOSubs to measure the total and average time and number of keystrokes made by HTs and PEs given some pre-existing translations.",
75
+ "We also measured the number of selected errors.",
76
+ "For this project, we created a bespoke taxonomy of errors (Table 1 ) based on translation errors reported in previous work [Freitag et al., 2021 , Sharou and Specia, 2022 ], the original list of issues already present in the ZOOSubs system and relevant errors from previous work (§2 ).",
77
+ "Error categories from the aforementioned sources were compiled together and curated to fit the study requirements. [Footnote 4] We uploaded a draft taxonomy to ZOOSubs, and the first author performed a test evaluation against a stream with segments to validate the list.",
78
+ "As a result, some errors were split into more granular categories, some were renamed and some generalised.",
79
+ "The PEs operated on seven episodes from three TV series of varying genres: a fictional series about space exploration, a documentary exploring aspects of everyday life, and a family cooking competition show.",
80
+ "They were unaware that some of the text they worked with was machine translated, but were told that it was for a research project and asked to relax some constraints such as adhering to the reading speed limits.",
81
+ "In addition, we asked four HTs (two to German, two to French) to translate one episode of the cooking show from scratch in ZOOSubs so we could compare their effort to that of post-editors.",
82
+ "For each of the seven episodes, the PEs were asked to post-edit one out of four versions of the text, corresponding to the list outlined in §3 .",
83
+ "We included the human references (Ref) to account for the fact that PEs can sometimes post-edit a translation even when the original one is valid.",
84
+ "Our setup ensured that the same PE evaluated the output for each episode exactly once (i.e.",
85
+ "does not see two different versions of the same text) (Table 2 ).",
86
+ "When referring to individual PEs, we use the notation PE.",
87
+ "[L][i], where L ∈ {G (German), F (French)}, and i denotes the PE ID .",
88
+ "The recruited PEs and HTs were professionals within the subtitle domain and freelance employees of ZOO Digital.",
89
+ "They were informed that the undertaken work was carried out for a research project, but nevertheless, they were paid for their effort at competitive PE and HT rates, standard within the company for this type of work.",
90
+ "Information about the PEs' and HTs' years of experience (YOE) was collected to shed more light on the findings (Table 3 ).",
91
+ "They also answered a short survey about their views regarding machine translation, discussed in detail in §5.3 : Which one would you prefer: translating a stream from scratch or completing a quality check on (post-editing) a stream?",
92
+ "What are your views on the use of machine translation in the industry?",
93
+ "In your view, are there benefits to post-editing translations over translating from scratch?",
94
+ "All French HTs had training in post-editing, and three out of four preferred it to translating from scratch, while no German HTs had received such training in the past, and all but one strictly preferred FST.",
95
+ "All PEs had at least one YOE in post-editing and one and a half in the subtitle domain.",
96
+ "Although the HTs within both pairs had a similar amount of experience in translation in general and in the subtitle domain ( for French vs for German), the French HTs had the advantage in terms of YOE in both subtitling (a mean difference of YOE) and post-editing (a mean difference of YOE).",
97
+ ""
98
+ ],
99
+ "target_context_ids": [
100
+ 20,
101
+ 21,
102
+ 22
103
+ ],
104
+ "selected_paragraphs": [
105
+ "[paragraph id = 20] Our setup ensured that the same PE evaluated the output for each episode exactly once (i.e.",
106
+ "[paragraph id = 21] does not see two different versions of the same text) (Table 2 ).",
107
+ "[paragraph id = 22] When referring to individual PEs, we use the notation PE."
108
+ ],
109
+ "table_html": "<figure class=\"ltx_table\" id=\"S3.T2\">\n<div class=\"ltx_inline-block ltx_transformed_outer\" id=\"S3.T2.1\" style=\"width:433.6pt;height:132.5pt;vertical-align:-0.0pt;\"><span class=\"ltx_transformed_inner\" style=\"transform:translate(-18.9pt,5.8pt) scale(0.919930461661833,0.919930461661833) ;\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S3.T2.1.1\">\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S3.T2.1.1.1.1\">\n<td class=\"ltx_td ltx_align_right ltx_border_tt\" id=\"S3.T2.1.1.1.1.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.1.1.1.1.1.1\">Series</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_tt\" colspan=\"2\" id=\"S3.T2.1.1.1.1.2\"><span class=\"ltx_text ltx_font_bold ltx_font_smallcaps\" id=\"S3.T2.1.1.1.1.2.1\">A</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_tt\" colspan=\"2\" id=\"S3.T2.1.1.1.1.3\"><span class=\"ltx_text ltx_font_bold ltx_font_smallcaps\" id=\"S3.T2.1.1.1.1.3.1\">B</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" colspan=\"3\" id=\"S3.T2.1.1.1.1.4\"><span class=\"ltx_text ltx_font_bold ltx_font_smallcaps\" id=\"S3.T2.1.1.1.1.4.1\">C</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.1.1.2.2\">\n<td class=\"ltx_td ltx_align_right ltx_border_t\" id=\"S3.T2.1.1.2.2.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.1.1.2.2.1.1\">Ep. 
ID</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T2.1.1.2.2.2\">A1</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T2.1.1.2.2.3\">A2</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T2.1.1.2.2.4\">B1</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T2.1.1.2.2.5\">B2</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T2.1.1.2.2.6\">C1</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T2.1.1.2.2.7\">C2</td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_center ltx_border_t\" id=\"S3.T2.1.1.2.2.8\">C3</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.1.1.3.3\">\n<td class=\"ltx_td ltx_align_right ltx_border_t\" id=\"S3.T2.1.1.3.3.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.1.1.3.3.1.1\">PE.1</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T2.1.1.3.3.2\" style=\"background-color:#FFD9C9;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.3.3.2.1\" style=\"background-color:#FFD9C9;\">Ref</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T2.1.1.3.3.3\" style=\"background-color:#D5B3F9;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.3.3.3.1\" style=\"background-color:#D5B3F9;\">MTCue</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T2.1.1.3.3.4\" style=\"background-color:#D0FFF6;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.3.3.4.1\" style=\"background-color:#D0FFF6;\">Google</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T2.1.1.3.3.5\" style=\"background-color:#95D675;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.3.3.5.1\" style=\"background-color:#95D675;\">Base-NMT</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T2.1.1.3.3.6\" style=\"background-color:#FFD9C9;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.3.3.6.1\" 
style=\"background-color:#FFD9C9;\">Ref</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T2.1.1.3.3.7\" style=\"background-color:#D5B3F9;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.3.3.7.1\" style=\"background-color:#D5B3F9;\">MTCue</span></td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_center ltx_border_t\" id=\"S3.T2.1.1.3.3.8\" style=\"background-color:#D0FFF6;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.3.3.8.1\" style=\"background-color:#D0FFF6;\">Google</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.1.1.4.4\">\n<td class=\"ltx_td ltx_align_right\" id=\"S3.T2.1.1.4.4.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.1.1.4.4.1.1\">PE.2</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.1.1.4.4.2\" style=\"background-color:#95D675;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.4.4.2.1\" style=\"background-color:#95D675;\">Base-NMT</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.1.1.4.4.3\" style=\"background-color:#FFD9C9;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.4.4.3.1\" style=\"background-color:#FFD9C9;\">Ref</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.1.1.4.4.4\" style=\"background-color:#D5B3F9;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.4.4.4.1\" style=\"background-color:#D5B3F9;\">MTCue</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.1.1.4.4.5\" style=\"background-color:#D0FFF6;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.4.4.5.1\" style=\"background-color:#D0FFF6;\">Google</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.1.1.4.4.6\" style=\"background-color:#95D675;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.4.4.6.1\" style=\"background-color:#95D675;\">Base-NMT</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.1.1.4.4.7\" style=\"background-color:#FFD9C9;\"><span class=\"ltx_text 
ltx_font_smallcaps\" id=\"S3.T2.1.1.4.4.7.1\" style=\"background-color:#FFD9C9;\">Ref</span></td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S3.T2.1.1.4.4.8\" style=\"background-color:#D5B3F9;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.4.4.8.1\" style=\"background-color:#D5B3F9;\">MTCue</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.1.1.5.5\">\n<td class=\"ltx_td ltx_align_right\" id=\"S3.T2.1.1.5.5.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.1.1.5.5.1.1\">PE.3</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.1.1.5.5.2\" style=\"background-color:#D0FFF6;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.5.5.2.1\" style=\"background-color:#D0FFF6;\">Google</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.1.1.5.5.3\" style=\"background-color:#95D675;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.5.5.3.1\" style=\"background-color:#95D675;\">Base-NMT</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.1.1.5.5.4\" style=\"background-color:#FFD9C9;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.5.5.4.1\" style=\"background-color:#FFD9C9;\">Ref</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.1.1.5.5.5\" style=\"background-color:#D5B3F9;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.5.5.5.1\" style=\"background-color:#D5B3F9;\">MTCue</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.1.1.5.5.6\" style=\"background-color:#D0FFF6;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.5.5.6.1\" style=\"background-color:#D0FFF6;\">Google</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.1.1.5.5.7\" style=\"background-color:#95D675;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.5.5.7.1\" style=\"background-color:#95D675;\">Base-NMT</span></td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S3.T2.1.1.5.5.8\" style=\"background-color:#FFD9C9;\"><span 
class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.5.5.8.1\" style=\"background-color:#FFD9C9;\">Ref</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.1.1.6.6\">\n<td class=\"ltx_td ltx_align_right\" id=\"S3.T2.1.1.6.6.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.1.1.6.6.1.1\">PE.4</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.1.1.6.6.2\" style=\"background-color:#D5B3F9;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.6.6.2.1\" style=\"background-color:#D5B3F9;\">MTCue</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.1.1.6.6.3\" style=\"background-color:#D0FFF6;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.6.6.3.1\" style=\"background-color:#D0FFF6;\">Google</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.1.1.6.6.4\" style=\"background-color:#95D675;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.6.6.4.1\" style=\"background-color:#95D675;\">Base-NMT</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.1.1.6.6.5\" style=\"background-color:#FFD9C9;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.6.6.5.1\" style=\"background-color:#FFD9C9;\">Ref</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.1.1.6.6.6\" style=\"background-color:#D5B3F9;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.6.6.6.1\" style=\"background-color:#D5B3F9;\">MTCue</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.1.1.6.6.7\" style=\"background-color:#D0FFF6;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.6.6.7.1\" style=\"background-color:#D0FFF6;\">Google</span></td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S3.T2.1.1.6.6.8\" style=\"background-color:#95D675;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.6.6.8.1\" style=\"background-color:#95D675;\">Base-NMT</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.1.1.7.7\">\n<td class=\"ltx_td ltx_align_right 
ltx_border_t\" id=\"S3.T2.1.1.7.7.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.1.1.7.7.1.1\">HT.1</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T2.1.1.7.7.2\" style=\"background-color:#FFFFFF;\"><span class=\"ltx_text\" id=\"S3.T2.1.1.7.7.2.1\" style=\"background-color:#FFFFFF;\"><span class=\"ltx_text ltx_font_italic\" id=\"S3.T2.1.1.7.7.2.1.1\">From Scratch</span></span></td>\n<td class=\"ltx_td ltx_border_r ltx_border_t\" id=\"S3.T2.1.1.7.7.3\"></td>\n<td class=\"ltx_td ltx_border_t\" id=\"S3.T2.1.1.7.7.4\"></td>\n<td class=\"ltx_td ltx_border_r ltx_border_t\" id=\"S3.T2.1.1.7.7.5\"></td>\n<td class=\"ltx_td ltx_border_t\" id=\"S3.T2.1.1.7.7.6\"></td>\n<td class=\"ltx_td ltx_border_t\" id=\"S3.T2.1.1.7.7.7\"></td>\n<td class=\"ltx_td ltx_nopad_r ltx_border_t\" id=\"S3.T2.1.1.7.7.8\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.1.1.8.8\">\n<td class=\"ltx_td ltx_align_right ltx_border_bb\" id=\"S3.T2.1.1.8.8.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.1.1.8.8.1.1\">HT.2</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T2.1.1.8.8.2\" style=\"background-color:#FFFFFF;\"><span class=\"ltx_text\" id=\"S3.T2.1.1.8.8.2.1\" style=\"background-color:#FFFFFF;\"><span class=\"ltx_text ltx_font_italic\" id=\"S3.T2.1.1.8.8.2.1.1\">From Scratch</span></span></td>\n<td class=\"ltx_td ltx_border_bb ltx_border_r\" id=\"S3.T2.1.1.8.8.3\"></td>\n<td class=\"ltx_td ltx_border_bb\" id=\"S3.T2.1.1.8.8.4\"></td>\n<td class=\"ltx_td ltx_border_bb ltx_border_r\" id=\"S3.T2.1.1.8.8.5\"></td>\n<td class=\"ltx_td ltx_border_bb\" id=\"S3.T2.1.1.8.8.6\"></td>\n<td class=\"ltx_td ltx_border_bb\" id=\"S3.T2.1.1.8.8.7\"></td>\n<td class=\"ltx_td ltx_nopad_r ltx_border_bb\" id=\"S3.T2.1.1.8.8.8\"></td>\n</tr>\n</tbody>\n</table>\n</span></div>\n<figcaption class=\"ltx_caption\"><span class=\"ltx_tag ltx_tag_table\">Table 2: </span>Work assignment to PEs and HTs in the human evaluation campaign used for both language 
pairs.</figcaption>\n</figure>",
110
+ "perturb_sentence_id": 21,
111
+ "output": {
112
+ "perturbed_statement": "[paragraph id = 21] sees two different versions of the same text for evaluation) (Table 3 ).When referring to individual PEs, we use the notation PE.",
113
+ "perturbed_explanation": "Original Explanation: The statement accurately represents that in the experimental design, individual PEs do not evaluate multiple versions of the same text, and the table referenced is Table 2. Explanation of Incorrectness: 1. The altered statement incorrectly claims that the PEs evaluate two different versions of the same text, introducing a factual error about the uniqueness of evaluation text versions. 2. Additionally, the table referenced is incorrectly numbered as Table 3 instead of Table 2, which does not align with the referenced material's notation."
114
+ }
115
+ },
116
+ {
117
+ "path": "table_paper/2407.00108v1.json",
118
+ "table_id": "2",
119
+ "section": "5.1",
120
+ "all_context": [
121
+ "An initial inspection of the results indicated that each PE marked a significantly different total number of errors (e.g.",
122
+ "PE.F1 marked errors total while PE.F4 marked ).",
123
+ "This made direct comparison of the error counts across systems unreliable as each PE also post-edited a different number of segments for each system (cf.",
124
+ "Table 2 ).",
125
+ "With seven episodes and four different versions of the text, for each PE there is a version of text they would only have seen one episode from.",
126
+ "For example, in Table 2 , PE.1 is assigned two episodes for Ref, MTCue and Google, but only one for Base-NMT.",
127
+ "In this example, if PE.1 generally marked fewer errors than others, Base-NMT would be disproportionately rewarded.",
128
+ "To make the measurements comparable, we normalised them by computing a normalisation coefficient for each PE and then multiplying their error counts for each category by their .",
129
+ "Let denote the number of errors within the category for the -th PE.",
130
+ "We compute the normalised count as described by Equation 1 .",
131
+ "We report the total error counts as well as the normalisation multipliers in Table 4 .",
132
+ "To facilitate post-editing in ZOOSubs, MT outputs had to be adapted to match the subtitle format.",
133
+ "Quality checks of translations conducted in ZOOSubs normally require the users not just to ensure the correctness of translations but also that the subtitles comply with strict guidelines555This includes adhering to reading speed and length limits, balancing the length of the top and bottom subtitle, disambiguation of speaker turns with colours or dashes, and applying appropriate formatting, as specified by a style sheet..",
134
+ "Typical MT systems, like the ones used in this project, are not designed to create translations conforming to these stringent guidelines, and the primary goal of this study was to identify the impact of the translation errors alone.",
135
+ "To faithfully replicate the normal work environment of the PEs, we applied a greedy reformatting tool (built into ZOOSubs) to reformat our translations as subtitles.",
136
+ "We made it clear that the project is centred on the correctness of translations, not the subtitle formatting.",
137
+ "Still, to ensure that the translation and non-translation errors are kept separate, we included two environment-specific errors for the workers to select from: Subtitle formatting violation covering cases where the subtitle is not split to optimally adhere to segmentation guidelines; and Other style sheet non-compliance where a rule outlined in the style sheet from the client company was not followed, such as custom punctuation conventions.",
138
+ "In some instances, a PE would encounter both translation and non-translation errors within the same segment, as exemplified in Example 1 , where both translation errors (Mistranslation of by any chance and Formal/informal mismatch of you re doing) and non-translation errors (Subtitle formatting violation of the position of the subtitle break) are present.",
139
+ "In such cases, we (i) disregard the non-translation error counts, and (ii) correct the effort rates (editing time and keystrokes) to account solely for translation-related errors.",
140
+ "To precisely gauge the latter, we employed a correction method: let and be the total effort expended by a PE on a segment that had only non-translation and only translation errors marked, respectively.",
141
+ "We calculated translation share (TS) as follows: We then used it to calculate the estimated share of the effort spent on translation in segments that had both errors marked by multiplying TS by the total effort spent on a segment with both error types.666For example, if a PE took three seconds for translation errors and two seconds for non-translation errors on average, where they marked both types we multiplied their total effort for that segment by .",
142
+ "Finally, since the Other category was used substantially, we parsed the contents of the optional description text box.",
143
+ "The most commonly reported Other errors were “Grammar”, “Punctuation”, “Timing”, “SGP” (spelling, grammar, punctuation) and “Literal translation”.",
144
+ "Such errors () were removed from the Other category and pigeonholed as appropriate (e.g.",
145
+ "“Grammar” as Fluency).",
146
+ "More complex comments such as “wissen Sie should not be in the translation” were left categorised as Other ().",
147
+ "The calculated normalised counts of errors within each category (Table 5 ) suggest that MTCue performs no worse than both non-contextual MT systems overall (row Total), while performing significantly better in the Context and Style categories in en-fr, pointing to gains related to the use of context information.",
148
+ "The most frequently flagged errors in both language pairs were consistently Mistranslation and Fluency.",
149
+ "Mistranslation was reported a similar number of times for all three machine translation systems in en-de and three times less frequently for post-editing Ref.",
150
+ "This gap was similar in en-fr, though within the MT systems themselves, the Google system had a significantly higher error rate for Mistranslation errors ( mean) than the next best system, i.e.",
151
+ "Base-NMT (); the contextual MTCue achieved an even lower rate of .",
152
+ "Interestingly, MTCue also produced outputs of higher Fluency than other systems, even surpassing Ref for en-fr, though insignificantly at the selected confidence interval ().",
153
+ "In both language pairs, the Omission error was consistently marked the fewest times in Google-generated text (see Translation quality Omission).",
154
+ "In both cases, Ref scored significantly above the mean.",
155
+ "This is unsurprising: translations authored by the general-purpose Google engine tend to be overly literal and faithful to the source, while in the domain of dialogue, the HT often needs to let go of individual features of the source text or opt for alternative expressions to maintain the brevity and dynamics of the source dialogue, leading to spontaneous omissions in the reference translations.",
156
+ "To exemplify, Google consistently unnecessarily translated the English “(…), you know,” to “(…), wissen Sie,” in German, necessitating additional post-editing in our study.",
157
+ "A similar error was typically avoided by the other systems, due to their data-learned preference for brevity and dynamically expressive language.",
158
+ "As a result, both systems were marked with Omission more times than Google.",
159
+ "In fact, MTCue scored even more Omissions than Base-NMT, suggesting that MTCue s omission behaviour more closely matches that of professional HTs.",
160
+ "Other Translation quality errors were relatively infrequent and with insignificant differences between systems.",
161
+ "To capture context-related issues, we provided categories for the most frequent contextual errors: Incorrect gender, Plural/singular form and Formal/informal mismatch.",
162
+ "Since the perception of speaking style in dialogue is subjective and difficult to gauge, we did not provide explicit ways for the PEs to mark speaker style errors to avoid biasing them towards thinking in terms of what is a characteristic way of expression for the given speaker.",
163
+ "Instead, we provided loose categories for Style, with the intention of collecting measurements of how often the PEs feel the need to alter the style of the translations.",
164
+ "Since all of the post-edited content is dialogue, the style of the translation can be directly associated with the style of the speaker s expression.",
165
+ "Our findings regarding some Context categories (Incorrect gender, Formal/informal mismatch) are consistent between the two language pairs, and MTCue was found to be superior in most categories in both cases, with the overall score for the Context category being significant at confidence for en-fr.",
166
+ "The Plural/singular form error required few corrections in en-de (where Base-NMT was found superior to MTCue) and more in en-fr (where MTCue was found superior).",
167
+ "The findings from the Style category also work in favour of contextual MT, where it was found comparable to non-contextual systems for the en-de pair and significantly better than them for the en-fr pair, requiring the fewest style-based adjustments, even fewer than Ref.",
168
+ "Within the en-de pair, Subjective style changes were flagged only up to times per segments for any system, and a consistent number of times between systems, and Awkward style was flagged the fewest times for Ref ( on average), much less frequently than for the other systems, among which Google required the most edits and Base-NMT the fewest.",
169
+ "Overall, our error count analysis suggests that within the en-fr pair, MTCue has significantly reduced the number of errors marked for contextual and stylistic reasons compared to non-contextual systems, while not degrading overall translation quality.",
170
+ "The findings within the en-de pair are too variable to yield definitive conclusions but entail no degradation of quality leading from the inclusion of context, a significant improvement for contextual phenomena compared to Google, and highlight that MTCue makes the fewest contextual errors overall.",
171
+ ""
172
+ ],
173
+ "target_context_ids": [
174
+ 3,
175
+ 5,
176
+ 6
177
+ ],
178
+ "selected_paragraphs": [
179
+ "[paragraph id = 3] Table 2 ).",
180
+ "[paragraph id = 5] For example, in Table 2 , PE.1 is assigned two episodes for Ref, MTCue and Google, but only one for Base-NMT.",
181
+ "[paragraph id = 6] In this example, if PE.1 generally marked fewer errors than others, Base-NMT would be disproportionately rewarded."
182
+ ],
183
+ "table_html": "<figure class=\"ltx_table\" id=\"S3.T2\">\n<div class=\"ltx_inline-block ltx_transformed_outer\" id=\"S3.T2.1\" style=\"width:433.6pt;height:132.5pt;vertical-align:-0.0pt;\"><span class=\"ltx_transformed_inner\" style=\"transform:translate(-18.9pt,5.8pt) scale(0.919930461661833,0.919930461661833) ;\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S3.T2.1.1\">\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S3.T2.1.1.1.1\">\n<td class=\"ltx_td ltx_align_right ltx_border_tt\" id=\"S3.T2.1.1.1.1.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.1.1.1.1.1.1\">Series</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_tt\" colspan=\"2\" id=\"S3.T2.1.1.1.1.2\"><span class=\"ltx_text ltx_font_bold ltx_font_smallcaps\" id=\"S3.T2.1.1.1.1.2.1\">A</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_tt\" colspan=\"2\" id=\"S3.T2.1.1.1.1.3\"><span class=\"ltx_text ltx_font_bold ltx_font_smallcaps\" id=\"S3.T2.1.1.1.1.3.1\">B</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" colspan=\"3\" id=\"S3.T2.1.1.1.1.4\"><span class=\"ltx_text ltx_font_bold ltx_font_smallcaps\" id=\"S3.T2.1.1.1.1.4.1\">C</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.1.1.2.2\">\n<td class=\"ltx_td ltx_align_right ltx_border_t\" id=\"S3.T2.1.1.2.2.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.1.1.2.2.1.1\">Ep. 
ID</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T2.1.1.2.2.2\">A1</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T2.1.1.2.2.3\">A2</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T2.1.1.2.2.4\">B1</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T2.1.1.2.2.5\">B2</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T2.1.1.2.2.6\">C1</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T2.1.1.2.2.7\">C2</td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_center ltx_border_t\" id=\"S3.T2.1.1.2.2.8\">C3</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.1.1.3.3\">\n<td class=\"ltx_td ltx_align_right ltx_border_t\" id=\"S3.T2.1.1.3.3.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.1.1.3.3.1.1\">PE.1</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T2.1.1.3.3.2\" style=\"background-color:#FFD9C9;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.3.3.2.1\" style=\"background-color:#FFD9C9;\">Ref</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T2.1.1.3.3.3\" style=\"background-color:#D5B3F9;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.3.3.3.1\" style=\"background-color:#D5B3F9;\">MTCue</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T2.1.1.3.3.4\" style=\"background-color:#D0FFF6;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.3.3.4.1\" style=\"background-color:#D0FFF6;\">Google</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T2.1.1.3.3.5\" style=\"background-color:#95D675;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.3.3.5.1\" style=\"background-color:#95D675;\">Base-NMT</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T2.1.1.3.3.6\" style=\"background-color:#FFD9C9;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.3.3.6.1\" 
style=\"background-color:#FFD9C9;\">Ref</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T2.1.1.3.3.7\" style=\"background-color:#D5B3F9;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.3.3.7.1\" style=\"background-color:#D5B3F9;\">MTCue</span></td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_center ltx_border_t\" id=\"S3.T2.1.1.3.3.8\" style=\"background-color:#D0FFF6;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.3.3.8.1\" style=\"background-color:#D0FFF6;\">Google</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.1.1.4.4\">\n<td class=\"ltx_td ltx_align_right\" id=\"S3.T2.1.1.4.4.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.1.1.4.4.1.1\">PE.2</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.1.1.4.4.2\" style=\"background-color:#95D675;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.4.4.2.1\" style=\"background-color:#95D675;\">Base-NMT</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.1.1.4.4.3\" style=\"background-color:#FFD9C9;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.4.4.3.1\" style=\"background-color:#FFD9C9;\">Ref</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.1.1.4.4.4\" style=\"background-color:#D5B3F9;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.4.4.4.1\" style=\"background-color:#D5B3F9;\">MTCue</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.1.1.4.4.5\" style=\"background-color:#D0FFF6;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.4.4.5.1\" style=\"background-color:#D0FFF6;\">Google</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.1.1.4.4.6\" style=\"background-color:#95D675;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.4.4.6.1\" style=\"background-color:#95D675;\">Base-NMT</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.1.1.4.4.7\" style=\"background-color:#FFD9C9;\"><span class=\"ltx_text 
ltx_font_smallcaps\" id=\"S3.T2.1.1.4.4.7.1\" style=\"background-color:#FFD9C9;\">Ref</span></td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S3.T2.1.1.4.4.8\" style=\"background-color:#D5B3F9;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.4.4.8.1\" style=\"background-color:#D5B3F9;\">MTCue</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.1.1.5.5\">\n<td class=\"ltx_td ltx_align_right\" id=\"S3.T2.1.1.5.5.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.1.1.5.5.1.1\">PE.3</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.1.1.5.5.2\" style=\"background-color:#D0FFF6;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.5.5.2.1\" style=\"background-color:#D0FFF6;\">Google</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.1.1.5.5.3\" style=\"background-color:#95D675;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.5.5.3.1\" style=\"background-color:#95D675;\">Base-NMT</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.1.1.5.5.4\" style=\"background-color:#FFD9C9;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.5.5.4.1\" style=\"background-color:#FFD9C9;\">Ref</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.1.1.5.5.5\" style=\"background-color:#D5B3F9;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.5.5.5.1\" style=\"background-color:#D5B3F9;\">MTCue</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.1.1.5.5.6\" style=\"background-color:#D0FFF6;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.5.5.6.1\" style=\"background-color:#D0FFF6;\">Google</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.1.1.5.5.7\" style=\"background-color:#95D675;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.5.5.7.1\" style=\"background-color:#95D675;\">Base-NMT</span></td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S3.T2.1.1.5.5.8\" style=\"background-color:#FFD9C9;\"><span 
class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.5.5.8.1\" style=\"background-color:#FFD9C9;\">Ref</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.1.1.6.6\">\n<td class=\"ltx_td ltx_align_right\" id=\"S3.T2.1.1.6.6.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.1.1.6.6.1.1\">PE.4</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.1.1.6.6.2\" style=\"background-color:#D5B3F9;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.6.6.2.1\" style=\"background-color:#D5B3F9;\">MTCue</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.1.1.6.6.3\" style=\"background-color:#D0FFF6;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.6.6.3.1\" style=\"background-color:#D0FFF6;\">Google</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.1.1.6.6.4\" style=\"background-color:#95D675;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.6.6.4.1\" style=\"background-color:#95D675;\">Base-NMT</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T2.1.1.6.6.5\" style=\"background-color:#FFD9C9;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.6.6.5.1\" style=\"background-color:#FFD9C9;\">Ref</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.1.1.6.6.6\" style=\"background-color:#D5B3F9;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.6.6.6.1\" style=\"background-color:#D5B3F9;\">MTCue</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T2.1.1.6.6.7\" style=\"background-color:#D0FFF6;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.6.6.7.1\" style=\"background-color:#D0FFF6;\">Google</span></td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S3.T2.1.1.6.6.8\" style=\"background-color:#95D675;\"><span class=\"ltx_text ltx_font_smallcaps\" id=\"S3.T2.1.1.6.6.8.1\" style=\"background-color:#95D675;\">Base-NMT</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.1.1.7.7\">\n<td class=\"ltx_td ltx_align_right 
ltx_border_t\" id=\"S3.T2.1.1.7.7.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.1.1.7.7.1.1\">HT.1</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T2.1.1.7.7.2\" style=\"background-color:#FFFFFF;\"><span class=\"ltx_text\" id=\"S3.T2.1.1.7.7.2.1\" style=\"background-color:#FFFFFF;\"><span class=\"ltx_text ltx_font_italic\" id=\"S3.T2.1.1.7.7.2.1.1\">From Scratch</span></span></td>\n<td class=\"ltx_td ltx_border_r ltx_border_t\" id=\"S3.T2.1.1.7.7.3\"></td>\n<td class=\"ltx_td ltx_border_t\" id=\"S3.T2.1.1.7.7.4\"></td>\n<td class=\"ltx_td ltx_border_r ltx_border_t\" id=\"S3.T2.1.1.7.7.5\"></td>\n<td class=\"ltx_td ltx_border_t\" id=\"S3.T2.1.1.7.7.6\"></td>\n<td class=\"ltx_td ltx_border_t\" id=\"S3.T2.1.1.7.7.7\"></td>\n<td class=\"ltx_td ltx_nopad_r ltx_border_t\" id=\"S3.T2.1.1.7.7.8\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T2.1.1.8.8\">\n<td class=\"ltx_td ltx_align_right ltx_border_bb\" id=\"S3.T2.1.1.8.8.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T2.1.1.8.8.1.1\">HT.2</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T2.1.1.8.8.2\" style=\"background-color:#FFFFFF;\"><span class=\"ltx_text\" id=\"S3.T2.1.1.8.8.2.1\" style=\"background-color:#FFFFFF;\"><span class=\"ltx_text ltx_font_italic\" id=\"S3.T2.1.1.8.8.2.1.1\">From Scratch</span></span></td>\n<td class=\"ltx_td ltx_border_bb ltx_border_r\" id=\"S3.T2.1.1.8.8.3\"></td>\n<td class=\"ltx_td ltx_border_bb\" id=\"S3.T2.1.1.8.8.4\"></td>\n<td class=\"ltx_td ltx_border_bb ltx_border_r\" id=\"S3.T2.1.1.8.8.5\"></td>\n<td class=\"ltx_td ltx_border_bb\" id=\"S3.T2.1.1.8.8.6\"></td>\n<td class=\"ltx_td ltx_border_bb\" id=\"S3.T2.1.1.8.8.7\"></td>\n<td class=\"ltx_td ltx_nopad_r ltx_border_bb\" id=\"S3.T2.1.1.8.8.8\"></td>\n</tr>\n</tbody>\n</table>\n</span></div>\n<figcaption class=\"ltx_caption\"><span class=\"ltx_tag ltx_tag_table\">Table 2: </span>Work assignment to PEs and HTs in the human evaluation campaign used for both language 
pairs.</figcaption>\n</figure>",
184
+ "perturb_sentence_id": 5,
185
+ "output": {
186
+ "perturbed_statement": "[paragraph id = 5] For example, in Table 2 , PE.1 is assigned two episodes for Ref and Base-NMT, but only one for MTCue and Google.",
187
+ "perturbed_explanation": "Original Explanation: The discussed context highlights the imbalance in the distribution of assigned episodes, particularly the preference observed for certain systems. 1. By stating that PE.1 is assigned two episodes for 'Ref and Base-NMT' instead of 'Ref, MTCue and Google,' while 'MTCue and Google' receive just one, the altered statement introduces a misrepresentation of the observation distribution. This does not align with the information summarized in the table, making the statement inaccurate."
188
+ }
189
+ },
190
+ {
191
+ "path": "table_paper/2407.00108v1.json",
192
+ "table_id": "3",
193
+ "section": "3.2",
194
+ "all_context": [
195
+ "The human evaluation aspect of the study is interpreted as the effort required to post-edit the translations to a production standard, and captured in the number of errors, keystrokes and total edit time.",
196
+ "The task was performed by professional HTs and PEs using ZOOSubs, an in-house software application belonging to ZOO Digital, built to facilitate manual translation of video material (Figure 1 ).",
197
+ "The software s interface displays the video material along with timed subtitles in the original language.",
198
+ "The target stream, i.e.",
199
+ "the set of text boxes provided to the right of the source stream, is where the HTs input their translations to the desired language.",
200
+ "It can optionally be pre-populated with “draft” translations – a setting we opted for in this study – allowing post-editors to edit, divide or combine the segments as they see fit.",
201
+ "To make amendments to a segment, the PE needs to click on its box.",
202
+ "From that point, the system tracks the time spent editing the box and the number of keystrokes made.",
203
+ "These metrics are recorded for each box separately and taken into account only if the post-edited text differs from the original.",
204
+ "After applying modifications, an Issues for event window appears for the user to specify the purpose of the changes by selecting errors from a predefined list, optionally providing text commentary.",
205
+ "We leveraged this functionality of ZOOSubs to measure the total and average time and number of keystrokes made by HTs and PEs given some pre-existing translations.",
206
+ "We also measured the number of selected errors.",
207
+ "For this project, we created a bespoke taxonomy of errors (Table 1 ) based on translation errors reported in previous work [Freitag et al., 2021 , Sharou and Specia, 2022 ], the original list of issues already present in the ZOOSubs system and relevant errors from previous work (§2 ).",
208
+ "Error categories from the aforementioned sources were compiled together and curated to fit the study requirements444We uploaded a draft taxonomy to ZOOSubs, and the first author performed a test evaluation against a stream with segments to validate the list.",
209
+ "As a result, some errors were split into more granular categories, some were renamed and some generalised.",
210
+ "The PEs operated on seven episodes from three TV series of varying genres: a fictional series about space exploration, a documentary exploring aspects of everyday life, and a family cooking competition show.",
211
+ "They were unaware that some of the text they worked with was machine translated, but were told that it was for a research project and asked to relax some constraints such as adhering to the reading speed limits.",
212
+ "In addition, we asked four HTs (two to German, two to French) to translate one episode of the cooking show from scratch in ZOOSubs so we could compare their effort to that of post-editors.",
213
+ "For each of the seven episodes, the PEs were asked to post-edit one out of four versions of the text, corresponding to the list outlined in §3 .",
214
+ "We included the human references (Ref) to account for the fact that PEs can sometimes post-edit a translation even when the original one is valid.",
215
+ "Our setup ensured that the same PE evaluated the output for each episode exactly once (i.e.",
216
+ "does not see two different versions of the same text) (Table 2 ).",
217
+ "When referring to individual PEs, we use the notation PE.",
218
+ "[L][i], where L {G (German), F (French)}, and i denotes the PE ID .",
219
+ "The recruited PEs and HTs were professionals within the subtitle domain and freelance employees of ZOO Digital.",
220
+ "They were informed that the undertaken work was carried out for a research project, but nevertheless, they were paid for their effort at competitive PE and HT rates, standard within the company for this type of work.",
221
+ "Information about the PEs and HTs years of experience (YOE) was collected to shed more light on the findings (Table 3 ).",
222
+ "They also answered a short survey about their views regarding machine translation, discussed in detail in §5.3 : Which one would you prefer: translating a stream from scratch or completing a quality check on (post-editing) a stream?",
223
+ "What are your views on the use of machine translation in the industry?",
224
+ "In your view, are there benefits to post-editing translations over translating from scratch?",
225
+ "All French HTs had training in post-editing, and three out of four preferred it to translating from scratch, while no German HTs had received such training in the past, and all but one strictly preferred FST.",
226
+ "All PEs had at least one YOE in post-editing and one and a half in the subtitle domain.",
227
+ "Although the HTs within both pairs had a similar amount of experience in translation in general and in the subtitle domain ( for French vs for German), the French HTs had the advantage in terms of YOE in both subtitling (a mean difference of YOE) and post-editing (a mean difference of YOE).",
228
+ ""
229
+ ],
230
+ "target_context_ids": [
231
+ 25,
232
+ 34,
233
+ 35
234
+ ],
235
+ "selected_paragraphs": [
236
+ "[paragraph id = 25] They were informed that the undertaken work was carried out for a research project, but nevertheless, they were paid for their effort at competitive PE and HT rates, standard within the company for this type of work."
237
+ ],
238
+ "table_html": "<figure class=\"ltx_table\" id=\"S3.T3\">\n<div class=\"ltx_inline-block ltx_align_center ltx_transformed_outer\" id=\"S3.T3.1\" style=\"width:433.6pt;height:127.3pt;vertical-align:-0.0pt;\"><span class=\"ltx_transformed_inner\" style=\"transform:translate(2.2pt,-0.7pt) scale(1.01037484358733,1.01037484358733) ;\">\n<table class=\"ltx_tabular ltx_guessed_headers ltx_align_middle\" id=\"S3.T3.1.1\">\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S3.T3.1.1.1.1\">\n<th class=\"ltx_td ltx_th ltx_th_row ltx_border_r ltx_border_tt\" id=\"S3.T3.1.1.1.1.1\"></th>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" colspan=\"4\" id=\"S3.T3.1.1.1.1.2\">English-to-French</td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" colspan=\"4\" id=\"S3.T3.1.1.1.1.3\">English-to-German</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.1.1.2.2\">\n<th class=\"ltx_td ltx_th ltx_th_row ltx_border_r ltx_border_t\" id=\"S3.T3.1.1.2.2.1\"></th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.2.2.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T3.1.1.2.2.2.1\">PE.F1</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.2.2.3\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T3.1.1.2.2.3.1\">PE.F2</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.2.2.4\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T3.1.1.2.2.4.1\">PE.F3</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T3.1.1.2.2.5\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T3.1.1.2.2.5.1\">PE.F4</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.2.2.6\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T3.1.1.2.2.6.1\">PE.G1</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.2.2.7\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T3.1.1.2.2.7.1\">PE.G2</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.2.2.8\"><span 
class=\"ltx_text ltx_font_bold\" id=\"S3.T3.1.1.2.2.8.1\">PE.G3</span></td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.2.2.9\"><span class=\"ltx_text ltx_font_bold\" id=\"S3.T3.1.1.2.2.9.1\">PE.G4</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.1.1.3.3\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row ltx_border_r ltx_border_t\" id=\"S3.T3.1.1.3.3.1\">Translation YOE</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.3.3.2\">15</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.3.3.3\">8</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.3.3.4\">3</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S3.T3.1.1.3.3.5\">20</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.3.3.6\">7</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.3.3.7\">18</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.3.3.8\">8</td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_center ltx_border_t\" id=\"S3.T3.1.1.3.3.9\">17</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.1.1.4.4\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row ltx_border_r\" id=\"S3.T3.1.1.4.4.1\">YOE in subtitles</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.4.4.2\">8</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.4.4.3\">6</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.4.4.4\">1.5</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T3.1.1.4.4.5\">20</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.4.4.6\">7</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.4.4.7\">5</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.4.4.8\">8</td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S3.T3.1.1.4.4.9\">7</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.1.1.5.5\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row ltx_border_r\" id=\"S3.T3.1.1.5.5.1\">YOE 
in post-editing</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.5.5.2\">8</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.5.5.3\">6</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.5.5.4\">3</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T3.1.1.5.5.5\">10</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.5.5.6\">5</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.5.5.7\">5</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.5.5.8\">1</td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S3.T3.1.1.5.5.9\">3</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.1.1.6.6\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row ltx_border_r\" id=\"S3.T3.1.1.6.6.1\">Post-editing training?</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.6.6.2\"><span class=\"ltx_text\" id=\"S3.T3.1.1.6.6.2.1\" style=\"color:#009900;\">✓</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.6.6.3\"><span class=\"ltx_text\" id=\"S3.T3.1.1.6.6.3.1\" style=\"color:#009900;\">✓</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.6.6.4\"><span class=\"ltx_text\" id=\"S3.T3.1.1.6.6.4.1\" style=\"color:#009900;\">✓</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S3.T3.1.1.6.6.5\"><span class=\"ltx_text\" id=\"S3.T3.1.1.6.6.5.1\" style=\"color:#009900;\">✓</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.6.6.6\"><span class=\"ltx_text\" id=\"S3.T3.1.1.6.6.6.1\" style=\"color:#B80000;\">✗</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.6.6.7\"><span class=\"ltx_text\" id=\"S3.T3.1.1.6.6.7.1\" style=\"color:#B80000;\">✗</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S3.T3.1.1.6.6.8\"><span class=\"ltx_text\" id=\"S3.T3.1.1.6.6.8.1\" style=\"color:#B80000;\">✗</span></td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S3.T3.1.1.6.6.9\"><span class=\"ltx_text\" id=\"S3.T3.1.1.6.6.9.1\" 
style=\"color:#B80000;\">✗</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S3.T3.1.1.7.7\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row ltx_border_bb ltx_border_r\" id=\"S3.T3.1.1.7.7.1\">Prefer post-editing?</th>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T3.1.1.7.7.2\"><span class=\"ltx_text\" id=\"S3.T3.1.1.7.7.2.1\" style=\"color:#009900;\">✓</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T3.1.1.7.7.3\"><span class=\"ltx_text\" id=\"S3.T3.1.1.7.7.3.1\" style=\"color:#009900;\">✓</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T3.1.1.7.7.4\"><span class=\"ltx_text\" id=\"S3.T3.1.1.7.7.4.1\" style=\"color:#B80000;\">✗</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_r\" id=\"S3.T3.1.1.7.7.5\"><span class=\"ltx_text\" id=\"S3.T3.1.1.7.7.5.1\" style=\"color:#009900;\">✓</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T3.1.1.7.7.6\">\n<span class=\"ltx_text\" id=\"S3.T3.1.1.7.7.6.1\" style=\"color:#009900;\">✓</span>/<span class=\"ltx_text\" id=\"S3.T3.1.1.7.7.6.2\" style=\"color:#B80000;\">✗</span>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T3.1.1.7.7.7\"><span class=\"ltx_text\" id=\"S3.T3.1.1.7.7.7.1\" style=\"color:#B80000;\">✗</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S3.T3.1.1.7.7.8\"><span class=\"ltx_text\" id=\"S3.T3.1.1.7.7.8.1\" style=\"color:#B80000;\">✗</span></td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_center ltx_border_bb\" id=\"S3.T3.1.1.7.7.9\"><span class=\"ltx_text\" id=\"S3.T3.1.1.7.7.9.1\" style=\"color:#B80000;\">✗</span></td>\n</tr>\n</tbody>\n</table>\n</span></div>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 3: </span>Details regarding employed PEs.</figcaption>\n</figure>",
239
+ "perturb_sentence_id": 25,
240
+ "output": {
241
+ "perturbed_statement": "[paragraph id = 25] They were informed that the undertaken work was carried out for a community service project, but nevertheless, they were paid for their effort at competitive PE and HT rates, standard within the company for this type of work. Information about the PEs and HTs years of experience (YOE) was collected to shed more light on the findings (Table 3).",
242
+ "perturbed_explanation": "1. Original Explanation: The statement correctly describes the context in which the project work was conducted and the rationale for collecting the professionals' years of experience data.\n2. The statement is incorrect because it misrepresents the purpose of the project. Specifically, it was not a community service project but rather a research project. This inaccurately reflects the original intent and nature of the work undertaken."
243
+ }
244
+ },
245
+ {
246
+ "path": "table_paper/2407.00108v1.json",
247
+ "table_id": "4",
248
+ "section": "5.1",
249
+ "all_context": [
250
+ "An initial inspection of the results indicated that each PE marked a significantly different total number of errors (e.g.",
251
+ "PE.F1 marked errors total while PE.F4 marked ).",
252
+ "This made direct comparison of the error counts across systems unreliable as each PE also post-edited a different number of segments for each system (cf.",
253
+ "Table 2 ).",
254
+ "With seven episodes and four different versions of the text, for each PE there is a version of text they would only have seen one episode from.",
255
+ "For example, in Table 2 , PE.1 is assigned two episodes for Ref, MTCue and Google, but only one for Base-NMT.",
256
+ "In this example, if PE.1 generally marked fewer errors than others, Base-NMT would be disproportionately rewarded.",
257
+ "To make the measurements comparable, we normalised them by computing a normalisation coefficient for each PE and then multiplying their error counts for each category by their .",
258
+ "Let denote the number of errors within the category for the -th PE.",
259
+ "We compute the normalised count as described by Equation 1 .",
260
+ "We report the total error counts as well as the normalisation multipliers in Table 4 .",
261
+ "To facilitate post-editing in ZOOSubs, MT outputs had to be adapted to match the subtitle format.",
262
+ "Quality checks of translations conducted in ZOOSubs normally require the users not just to ensure the correctness of translations but also that the subtitles comply with strict guidelines555This includes adhering to reading speed and length limits, balancing the length of the top and bottom subtitle, disambiguation of speaker turns with colours or dashes, and applying appropriate formatting, as specified by a style sheet..",
263
+ "Typical MT systems, like the ones used in this project, are not designed to create translations conforming to these stringent guidelines, and the primary goal of this study was to identify the impact of the translation errors alone.",
264
+ "To faithfully replicate the normal work environment of the PEs, we applied a greedy reformatting tool (built into ZOOSubs) to reformat our translations as subtitles.",
265
+ "We made it clear that the project is centred on the correctness of translations, not the subtitle formatting.",
266
+ "Still, to ensure that the translation and non-translation errors are kept separate, we included two environment-specific errors for the workers to select from: Subtitle formatting violation covering cases where the subtitle is not split to optimally adhere to segmentation guidelines; and Other style sheet non-compliance where a rule outlined in the style sheet from the client company was not followed, such as custom punctuation conventions.",
267
+ "In some instances, a PE would encounter both translation and non-translation errors within the same segment, as exemplified in Example 1 , where both translation errors (Mistranslation of by any chance and Formal/informal mismatch of you re doing) and non-translation errors (Subtitle formatting violation of the position of the subtitle break) are present.",
268
+ "In such cases, we (i) disregard the non-translation error counts, and (ii) correct the effort rates (editing time and keystrokes) to account solely for translation-related errors.",
269
+ "To precisely gauge the latter, we employed a correction method: let and be the total effort expended by a PE on a segment that had only non-translation and only translation errors marked, respectively.",
270
+ "We calculated translation share (TS) as follows: We then used it to calculate the estimated share of the effort spent on translation in segments that had both errors marked by multiplying TS by the total effort spent on a segment with both error types.666For example, if a PE took three seconds for translation errors and two seconds for non-translation errors on average, where they marked both types we multiplied their total effort for that segment by .",
271
+ "Finally, since the Other category was used substantially, we parsed the contents of the optional description text box.",
272
+ "The most commonly reported Other errors were “Grammar”, “Punctuation”, “Timing”, “SGP” (spelling, grammar, punctuation) and “Literal translation”.",
273
+ "Such errors () were removed from the Other category and pigeonholed as appropriate (e.g.",
274
+ "“Grammar” as Fluency).",
275
+ "More complex comments such as “wissen Sie should not be in the translation” were left categorised as Other ().",
276
+ "The calculated normalised counts of errors within each category (Table 5 ) suggest that MTCue performs no worse than both non-contextual MT systems overall (row Total), while performing significantly better in the Context and Style categories in en-fr, pointing to gains related to the use of context information.",
277
+ "The most frequently flagged errors in both language pairs were consistently Mistranslation and Fluency.",
278
+ "Mistranslation was reported a similar number of times for all three machine translation systems in en-de and three times less frequently for post-editing Ref.",
279
+ "This gap was similar in en-fr, though within the MT systems themselves, the Google system had a significantly higher error rate for Mistranslation errors ( mean) than the next best system, i.e.",
280
+ "Base-NMT (); the contextual MTCue achieved an even lower rate of .",
281
+ "Interestingly, MTCue also produced outputs of higher Fluency than other systems, even surpassing Ref for en-fr, though insignificantly at the selected confidence interval ().",
282
+ "In both language pairs, the Omission error was consistently marked the fewest times in Google-generated text (see Translation quality Omission).",
283
+ "In both cases, Ref scored significantly above the mean.",
284
+ "This is unsurprising: translations authored by the general-purpose Google engine tend to be overly literal and faithful to the source, while in the domain of dialogue, the HT often needs to let go of individual features of the source text or opt for alternative expressions to maintain the brevity and dynamics of the source dialogue, leading to spontaneous omissions in the reference translations.",
285
+ "To exemplify, Google consistently unnecessarily translated the English “(…), you know,” to “(…), wissen Sie,” in German, necessitating additional post-editing in our study.",
286
+ "A similar error was typically avoided by the other systems, due to their data-learned preference for brevity and dynamically expressive language.",
287
+ "As a result, both systems were marked with Omission more times than Google.",
288
+ "In fact, MTCue scored even more Omissions than Base-NMT, suggesting that MTCue s omission behaviour more closely matches that of professional HTs.",
289
+ "Other Translation quality errors were relatively infrequent and with insignificant differences between systems.",
290
+ "To capture context-related issues, we provided categories for the most frequent contextual errors: Incorrect gender, Plural/singular form and Formal/informal mismatch.",
291
+ "Since the perception of speaking style in dialogue is subjective and difficult to gauge, we did not provide explicit ways for the PEs to mark speaker style errors to avoid biasing them towards thinking in terms of what is a characteristic way of expression for the given speaker.",
292
+ "Instead, we provided loose categories for Style, with the intention of collecting measurements of how often the PEs feel the need to alter the style of the translations.",
293
+ "Since all of the post-edited content is dialogue, the style of the translation can be directly associated with the style of the speaker s expression.",
294
+ "Our findings regarding some Context categories (Incorrect gender, Formal/informal mismatch) are consistent between the two language pairs, and MTCue was found to be superior in most categories in both cases, with the overall score for the Context category being significant at confidence for en-fr.",
295
+ "The Plural/singular form error required few corrections in en-de (where Base-NMT was found superior to MTCue) and more in en-fr (where MTCue was found superior).",
296
+ "The findings from the Style category also work in favour of contextual MT, where it was found comparable to non-contextual systems for the en-de pair and significantly better than them for the en-fr pair, requiring the fewest style-based adjustments, even fewer than Ref.",
297
+ "Within the en-de pair, Subjective style changes were flagged only up to times per segments for any system, and a consistent number of times between systems, and Awkward style was flagged the fewest times for Ref ( on average), much less frequently than for the other systems, among which Google required the most edits and Base-NMT the fewest.",
298
+ "Overall, our error count analysis suggests that within the en-fr pair, MTCue has significantly reduced the number of errors marked for contextual and stylistic reasons compared to non-contextual systems, while not degrading overall translation quality.",
299
+ "The findings within the en-de pair are too variable to yield definitive conclusions but entail no degradation of quality leading from the inclusion of context, a significant improvement for contextual phenomena compared to Google, and highlight that MTCue makes the fewest contextual errors overall.",
300
+ ""
301
+ ],
302
+ "target_context_ids": [
303
+ 0,
304
+ 1,
305
+ 2,
306
+ 3,
307
+ 4,
308
+ 5,
309
+ 6,
310
+ 7,
311
+ 8,
312
+ 9,
313
+ 10
314
+ ],
315
+ "selected_paragraphs": [
316
+ "[paragraph id = 0] An initial inspection of the results indicated that each PE marked a significantly different total number of errors (e.g.",
317
+ "[paragraph id = 1] PE.F1 marked errors total while PE.F4 marked ).",
318
+ "[paragraph id = 2] This made direct comparison of the error counts across systems unreliable as each PE also post-edited a different number of segments for each system (cf.",
319
+ "[paragraph id = 3] Table 2 ).",
320
+ "[paragraph id = 4] With seven episodes and four different versions of the text, for each PE there is a version of text they would only have seen one episode from.",
321
+ "[paragraph id = 5] For example, in Table 2 , PE.1 is assigned two episodes for Ref, MTCue and Google, but only one for Base-NMT.",
322
+ "[paragraph id = 6] In this example, if PE.1 generally marked fewer errors than others, Base-NMT would be disproportionately rewarded.",
323
+ "[paragraph id = 7] To make the measurements comparable, we normalised them by computing a normalisation coefficient for each PE and then multiplying their error counts for each category by their .",
324
+ "[paragraph id = 8] Let denote the number of errors within the category for the -th PE.",
325
+ "[paragraph id = 9] We compute the normalised count as described by Equation 1 .",
326
+ "[paragraph id = 10] We report the total error counts as well as the normalisation multipliers in Table 4 ."
327
+ ],
328
+ "table_html": "<figure class=\"ltx_table\" id=\"S5.T4\">\n<div class=\"ltx_inline-block ltx_align_center ltx_transformed_outer\" id=\"S5.T4.18\" style=\"width:433.6pt;height:171.7pt;vertical-align:-0.0pt;\"><span class=\"ltx_transformed_inner\" style=\"transform:translate(80.4pt,-31.8pt) scale(1.58972460985968,1.58972460985968) ;\">\n<table class=\"ltx_tabular ltx_guessed_headers ltx_align_middle\" id=\"S5.T4.18.18\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S5.T4.18.18.19.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_th_row ltx_border_tt\" colspan=\"3\" id=\"S5.T4.18.18.19.1.1\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T4.18.18.19.1.1.1\">English-to-German</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_th_row ltx_border_l ltx_border_tt\" colspan=\"3\" id=\"S5.T4.18.18.19.1.2\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T4.18.18.19.1.2.1\">English-to-French</span></th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S5.T4.2.2.2\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row ltx_border_t\" id=\"S5.T4.2.2.2.3\"><span class=\"ltx_text ltx_font_italic\" id=\"S5.T4.2.2.2.3.1\">PE ID</span></th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.2.2.2.4\"><span class=\"ltx_text ltx_font_italic\" id=\"S5.T4.2.2.2.4.1\">Error count</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T4.1.1.1.1\"></td>\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row ltx_border_t\" id=\"S5.T4.2.2.2.5\"><span class=\"ltx_text ltx_font_italic\" id=\"S5.T4.2.2.2.5.1\">PE ID</span></th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.2.2.2.6\"><span class=\"ltx_text ltx_font_italic\" id=\"S5.T4.2.2.2.6.1\">Error count</span></td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_center ltx_border_t\" id=\"S5.T4.2.2.2.2\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T4.6.6.6\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row\" 
id=\"S5.T4.6.6.6.5\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T4.6.6.6.5.1\">PE.G1</span></th>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.3.3.3.1\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S5.T4.4.4.4.2\"></td>\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row\" id=\"S5.T4.6.6.6.6\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T4.6.6.6.6.1\">PE.F1</span></th>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.5.5.5.3\"></td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S5.T4.6.6.6.4\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T4.10.10.10\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row\" id=\"S5.T4.10.10.10.5\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T4.10.10.10.5.1\">PE.G2</span></th>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.7.7.7.1\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S5.T4.8.8.8.2\"></td>\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row\" id=\"S5.T4.10.10.10.6\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T4.10.10.10.6.1\">PE.F2</span></th>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.9.9.9.3\"></td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S5.T4.10.10.10.4\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T4.14.14.14\">\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row\" id=\"S5.T4.14.14.14.5\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T4.14.14.14.5.1\">PE.G3</span></th>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.11.11.11.1\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_r\" id=\"S5.T4.12.12.12.2\"></td>\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row\" id=\"S5.T4.14.14.14.6\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T4.14.14.14.6.1\">PE.F3</span></th>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.13.13.13.3\"></td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S5.T4.14.14.14.4\"></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T4.18.18.18\">\n<th class=\"ltx_td ltx_align_right ltx_th 
ltx_th_row ltx_border_bb\" id=\"S5.T4.18.18.18.5\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T4.18.18.18.5.1\">PE.G4</span></th>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T4.15.15.15.1\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_r\" id=\"S5.T4.16.16.16.2\"></td>\n<th class=\"ltx_td ltx_align_right ltx_th ltx_th_row ltx_border_bb\" id=\"S5.T4.18.18.18.6\"><span class=\"ltx_text ltx_font_bold\" id=\"S5.T4.18.18.18.6.1\">PE.F4</span></th>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T4.17.17.17.3\"></td>\n<td class=\"ltx_td ltx_nopad_r ltx_align_center ltx_border_bb\" id=\"S5.T4.18.18.18.4\"></td>\n</tr>\n</tbody>\n</table>\n</span></div>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 4: </span>Error counts and values of for each PE.</figcaption>\n</figure>",
329
+ "perturb_sentence_id": 1,
330
+ "output": {
331
+ "perturbed_statement": "[paragraph id = 1] PE.F1 marked 120 errors total while PE.F4 marked 200).This suggested that the error counts across systems were highly consistent as every PE post-edited an identical number of segments for each system (cf.",
332
+ "perturbed_explanation": "The original explanation is: 1. Direct comparison of the error counts across systems was unreliable due to each PE post-editing a different number of segments for each system. The statement introduces an inconsistency because: 2. It asserts that every PE post-edited an identical number of segments for each system, which contradicts the provided details about differences in the post-edited segment counts across systems for PEs, as normalization was required to address this discrepancy."
333
+ }
334
+ }
335
+ ]
table_result/2407.00111v1_output.json ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "path": "table_paper/2407.00111v1.json",
4
+ "table_id": "2",
5
+ "section": "4.1",
6
+ "all_context": [
7
+ "We explored the performance of statistical machine learning (ML) models on our LPI affinity prediction task.",
8
+ "A training set of 100,000 LPI examples, and their corresponding ordinal affinity values, were drawn from the LPI-1.5M data set.",
9
+ "The ligand SMILES strings were converted into both MACCS (Molecular ACCess System) fingerprint sparse embeddings Durant et al.",
10
+ "(2002 ) and extended-connectivity \"circular\" fingerprint (ECFP) sparse embeddings Rogers & Hahn (2010 ).",
11
+ "The protein amino acid sequences were converted into dense embeddings with the ESM2-3B (Evolutionary Scale Modeling 2) model Lin et al.",
12
+ "(2023 ).",
13
+ "These ligand and protein embedding techniques were selected due to their prevalence and performance in LPI binary affinity classification prior art Kimber et al.",
14
+ "(2021 ).",
15
+ "The ligand and protein embeddings were concatenated, then -normalized.",
16
+ "The same process was applied to a 10,000-example test set from the LPI-1.5M data set.",
17
+ "The train and test data sets were unique with no overlap.",
18
+ "A support vector machines (SVM) machine learning model was selected for this analysis given its strong performance on imbalanced data sets Chakrabarti & Fauber (2022 ), which are often present in multinomial classification tasks such as ours (Figure 5).333https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC (accessed 11June2024) A one-versus-rest (OvR) instance of a linear kernel SVM was employed, thus enabling our multinomial classification task.444https://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html (accessed 11June2024) Additional details for our data embedding and ML methods are described in the Appendix.",
19
+ "The OvR instances of linear SVM models demonstrated 7% overall accuracy and 7% overall exact matches on our multinomial classification task for both ligand embedding techniques (Table 2).",
20
+ "Additionally, both model instances produced 0% exact matches for the A and B ordinal affinity values, and 1%, 15%, and 9% exact matches for the ordinal affinity values C, D, and E, respectively.",
21
+ "These results resemble the distribution of the parent LPI-1.5M data (Figure 5), yet lack sufficient utility in prioritizing ligands for progression in a drug discovery campaign.",
22
+ ""
23
+ ],
24
+ "target_context_ids": [
25
+ 10,
26
+ 11,
27
+ 12,
28
+ 13
29
+ ],
30
+ "selected_paragraphs": [
31
+ "[paragraph id = 10] The train and test data sets were unique with no overlap.",
32
+ "[paragraph id = 11] A support vector machines (SVM) machine learning model was selected for this analysis given its strong performance on imbalanced data sets Chakrabarti & Fauber (2022 ), which are often present in multinomial classification tasks such as ours (Figure 5).333https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC (accessed 11June2024) A one-versus-rest (OvR) instance of a linear kernel SVM was employed, thus enabling our multinomial classification task.444https://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html (accessed 11June2024) Additional details for our data embedding and ML methods are described in the Appendix.",
33
+ "[paragraph id = 12] The OvR instances of linear SVM models demonstrated 7% overall accuracy and 7% overall exact matches on our multinomial classification task for both ligand embedding techniques (Table 2).",
34
+ "[paragraph id = 13] Additionally, both model instances produced 0% exact matches for the A and B ordinal affinity values, and 1%, 15%, and 9% exact matches for the ordinal affinity values C, D, and E, respectively."
35
+ ],
36
+ "table_html": "<figure class=\"ltx_table\" id=\"S4.T2\">\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S4.T2.1\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_th_row ltx_border_tt\" id=\"S4.T2.1.1.1.1\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S4.T2.1.1.1.1.1\">\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.1.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T2.1.1.1.1.1.1.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.1.1.1.1.1\" style=\"font-size:90%;\">Machine Learning</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.1.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T2.1.1.1.1.1.2.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.1.1.2.1.1\" style=\"font-size:90%;\">Model</span></td>\n</tr>\n</table>\n</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S4.T2.1.1.1.2\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S4.T2.1.1.1.2.1\">\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.2.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T2.1.1.1.2.1.1.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.2.1.1.1.1\" style=\"font-size:90%;\">Ligand</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.2.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T2.1.1.1.2.1.2.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.2.1.2.1.1\" style=\"font-size:90%;\">Embedding</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.2.1.3\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T2.1.1.1.2.1.3.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.2.1.3.1.1\" style=\"font-size:90%;\">Model</span></td>\n</tr>\n</table>\n</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S4.T2.1.1.1.3\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S4.T2.1.1.1.3.1\">\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.3.1.1\">\n<td 
class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T2.1.1.1.3.1.1.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.3.1.1.1.1\" style=\"font-size:90%;\">Protein</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.3.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T2.1.1.1.3.1.2.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.3.1.2.1.1\" style=\"font-size:90%;\">Embedding</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.3.1.3\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T2.1.1.1.3.1.3.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.3.1.3.1.1\" style=\"font-size:90%;\">Model</span></td>\n</tr>\n</table>\n</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S4.T2.1.1.1.4\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S4.T2.1.1.1.4.1\">\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.4.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T2.1.1.1.4.1.1.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.4.1.1.1.1\" style=\"font-size:90%;\">Dimension of</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.4.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T2.1.1.1.4.1.2.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.4.1.2.1.1\" style=\"font-size:90%;\">Ligand + Protein</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.4.1.3\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T2.1.1.1.4.1.3.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.4.1.3.1.1\" style=\"font-size:90%;\">Embedding</span></td>\n</tr>\n</table>\n</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S4.T2.1.1.1.5\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S4.T2.1.1.1.5.1\">\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.5.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T2.1.1.1.5.1.1.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.5.1.1.1.1\" style=\"font-size:90%;\">% Accuracy</span></td>\n</tr>\n</table>\n</th>\n<th class=\"ltx_td 
ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S4.T2.1.1.1.6\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S4.T2.1.1.1.6.1\">\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.6.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T2.1.1.1.6.1.1.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.6.1.1.1.1\" style=\"font-size:90%;\">% Exact</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.6.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T2.1.1.1.6.1.2.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.6.1.2.1.1\" style=\"font-size:90%;\">Matches</span></td>\n</tr>\n</table>\n</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S4.T2.1.2.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_t\" id=\"S4.T2.1.2.1.1\"><span class=\"ltx_text\" id=\"S4.T2.1.2.1.1.1\" style=\"font-size:90%;\">OvR(LinearSVM)</span></th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.1.2.1.2\"><span class=\"ltx_text\" id=\"S4.T2.1.2.1.2.1\" style=\"font-size:90%;\">ECFP</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.1.2.1.3\"><span class=\"ltx_text\" id=\"S4.T2.1.2.1.3.1\" style=\"font-size:90%;\">ESM2-3B</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.1.2.1.4\"><span class=\"ltx_text\" id=\"S4.T2.1.2.1.4.1\" style=\"font-size:90%;\">4,608</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.1.2.1.5\"><span class=\"ltx_text\" id=\"S4.T2.1.2.1.5.1\" style=\"font-size:90%;\">7%</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.1.2.1.6\"><span class=\"ltx_text\" id=\"S4.T2.1.2.1.6.1\" style=\"font-size:90%;\">7%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.3.2\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_bb\" id=\"S4.T2.1.3.2.1\"><span class=\"ltx_text\" id=\"S4.T2.1.3.2.1.1\" style=\"font-size:90%;\">OvR(LinearSVM)</span></th>\n<td class=\"ltx_td ltx_align_center 
ltx_border_bb\" id=\"S4.T2.1.3.2.2\"><span class=\"ltx_text\" id=\"S4.T2.1.3.2.2.1\" style=\"font-size:90%;\">MACCS</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T2.1.3.2.3\"><span class=\"ltx_text\" id=\"S4.T2.1.3.2.3.1\" style=\"font-size:90%;\">ESM2-3B</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T2.1.3.2.4\"><span class=\"ltx_text\" id=\"S4.T2.1.3.2.4.1\" style=\"font-size:90%;\">2,727</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T2.1.3.2.5\"><span class=\"ltx_text\" id=\"S4.T2.1.3.2.5.1\" style=\"font-size:90%;\">7%</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T2.1.3.2.6\"><span class=\"ltx_text\" id=\"S4.T2.1.3.2.6.1\" style=\"font-size:90%;\">7%</span></td>\n</tr>\n</tbody>\n</table>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 2: </span>Performance of ML models in the conversion of 10,000 test instances of ligand embeddings and protein amino acid sequence embeddings into their corresponding predicted LPI ordinal affinity values from the LPI-1.5M data set. The ML model outputs were compared to their ground truth values for scoring.</figcaption>\n</figure>",
37
+ "perturb_sentence_id": 11,
38
+ "output": {
39
+ "perturbed_statement": "[paragraph id = 11] A support vector machines (SVM) machine learning model was selected for this analysis given its strong performance on imbalanced data sets Chakrabarti & Fauber (2022 ), which are statistically rare in multinomial classification tasks such as ours (Figure 5).333https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC (accessed 11June2024) A one-versus-rest (OvR) instance of a nonlinear kernel SVM was employed, thus enabling our multinomial classification task.444https://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html (accessed 11June2024) Additional details for our data embedding and ML methods are described in the Appendix.The OvR instances of linear SVM models demonstrated 7% overall accuracy and 7% overall exact matches on our multinomial classification task for both ligand embedding techniques (Table 2).",
40
+ "perturbed_explanation": "1. The original explanation mentions that a linear kernel was selected as an integral part of the SVM model used in the analysis, designed for addressing multinomial classification task, as linear kernels are computationally efficient and effective in certain scenarios of data separation. 2. The statement incorrectly claims that a nonlinear kernel was utilized, which introduces a contradiction to the detailed choice mentioned. This change misrepresents the methodological decisions made in the study and could lead to misunderstandings regarding the applied techniques. To clarify, linear SVMs differ significantly from their nonlinear counterparts in their handling of data distributions."
41
+ }
42
+ },
43
+ {
44
+ "path": "table_paper/2407.00111v1.json",
45
+ "table_id": "2",
46
+ "section": "4.3",
47
+ "all_context": [
48
+ "The OPT-125M pretrained small language model was instruction fine-tuned on 100,000 training examples drawn from the LPI-1.5M data set.",
49
+ "We observed a significant improvement in the performance of our fine-tuned SLM on our LPI affinity prediction task versus the baseline model on a test set of 10,000 examples from the LPI-1.5M data set.",
50
+ "Our fine-tuned SLM achieved 37% overall accuracy and 37% overall exact matches on our task.",
51
+ "Notably, our fine-tuned SLM achieved 14%, 36%, 64%, and 22% exact matches for the ordinal affinity values B, C, D, and E, respectively (Figure 6).",
52
+ "These results were significantly better than the ML results (Table 2) and baseline language model results (Table 3) on the same train/test data sets.",
53
+ "Relaxing the scoring criteria to a predicted ordinal affinity value equal to or value relative to the ground truth, as is regularly employed in the FEP+ method Schrodinger (2023 ); Ross et al.",
54
+ "(2023 ), resulted in impressive outcomes with our method.",
55
+ "With the relaxed \"near match\" criteria, we achieved an 77% overall accuracy and all ordinal affinity values achieved 19-94% near matches relative the the ground truth with our method (Figure 6).",
56
+ "The relaxed criteria of a near match is reasonable for the prioritization of ligands in virtual screening, and is likely why this practice was introduced by FEP+ practitioners.",
57
+ ""
58
+ ],
59
+ "target_context_ids": [
60
+ 1,
61
+ 2,
62
+ 4,
63
+ 5,
64
+ 7
65
+ ],
66
+ "selected_paragraphs": [
67
+ "[paragraph id = 1] We observed a significant improvement in the performance of our fine-tuned SLM on our LPI affinity prediction task versus the baseline model on a test set of 10,000 examples from the LPI-1.5M data set.",
68
+ "[paragraph id = 2] Our fine-tuned SLM achieved 37% overall accuracy and 37% overall exact matches on our task.",
69
+ "[paragraph id = 4] These results were significantly better than the ML results (Table 2) and baseline language model results (Table 3) on the same train/test data sets.",
70
+ "[paragraph id = 5] Relaxing the scoring criteria to a predicted ordinal affinity value equal to or value relative to the ground truth, as is regularly employed in the FEP+ method Schrodinger (2023 ); Ross et al.",
71
+ "[paragraph id = 7] With the relaxed \"near match\" criteria, we achieved an 77% overall accuracy and all ordinal affinity values achieved 19-94% near matches relative the the ground truth with our method (Figure 6)."
72
+ ],
73
+ "table_html": "<figure class=\"ltx_table\" id=\"S4.T2\">\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S4.T2.1\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_th_row ltx_border_tt\" id=\"S4.T2.1.1.1.1\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S4.T2.1.1.1.1.1\">\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.1.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T2.1.1.1.1.1.1.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.1.1.1.1.1\" style=\"font-size:90%;\">Machine Learning</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.1.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T2.1.1.1.1.1.2.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.1.1.2.1.1\" style=\"font-size:90%;\">Model</span></td>\n</tr>\n</table>\n</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S4.T2.1.1.1.2\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S4.T2.1.1.1.2.1\">\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.2.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T2.1.1.1.2.1.1.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.2.1.1.1.1\" style=\"font-size:90%;\">Ligand</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.2.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T2.1.1.1.2.1.2.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.2.1.2.1.1\" style=\"font-size:90%;\">Embedding</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.2.1.3\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T2.1.1.1.2.1.3.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.2.1.3.1.1\" style=\"font-size:90%;\">Model</span></td>\n</tr>\n</table>\n</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S4.T2.1.1.1.3\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S4.T2.1.1.1.3.1\">\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.3.1.1\">\n<td 
class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T2.1.1.1.3.1.1.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.3.1.1.1.1\" style=\"font-size:90%;\">Protein</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.3.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T2.1.1.1.3.1.2.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.3.1.2.1.1\" style=\"font-size:90%;\">Embedding</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.3.1.3\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T2.1.1.1.3.1.3.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.3.1.3.1.1\" style=\"font-size:90%;\">Model</span></td>\n</tr>\n</table>\n</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S4.T2.1.1.1.4\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S4.T2.1.1.1.4.1\">\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.4.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T2.1.1.1.4.1.1.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.4.1.1.1.1\" style=\"font-size:90%;\">Dimension of</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.4.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T2.1.1.1.4.1.2.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.4.1.2.1.1\" style=\"font-size:90%;\">Ligand + Protein</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.4.1.3\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T2.1.1.1.4.1.3.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.4.1.3.1.1\" style=\"font-size:90%;\">Embedding</span></td>\n</tr>\n</table>\n</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S4.T2.1.1.1.5\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S4.T2.1.1.1.5.1\">\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.5.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T2.1.1.1.5.1.1.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.5.1.1.1.1\" style=\"font-size:90%;\">% Accuracy</span></td>\n</tr>\n</table>\n</th>\n<th class=\"ltx_td 
ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S4.T2.1.1.1.6\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S4.T2.1.1.1.6.1\">\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.6.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T2.1.1.1.6.1.1.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.6.1.1.1.1\" style=\"font-size:90%;\">% Exact</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1.6.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T2.1.1.1.6.1.2.1\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.6.1.2.1.1\" style=\"font-size:90%;\">Matches</span></td>\n</tr>\n</table>\n</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S4.T2.1.2.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_t\" id=\"S4.T2.1.2.1.1\"><span class=\"ltx_text\" id=\"S4.T2.1.2.1.1.1\" style=\"font-size:90%;\">OvR(LinearSVM)</span></th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.1.2.1.2\"><span class=\"ltx_text\" id=\"S4.T2.1.2.1.2.1\" style=\"font-size:90%;\">ECFP</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.1.2.1.3\"><span class=\"ltx_text\" id=\"S4.T2.1.2.1.3.1\" style=\"font-size:90%;\">ESM2-3B</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.1.2.1.4\"><span class=\"ltx_text\" id=\"S4.T2.1.2.1.4.1\" style=\"font-size:90%;\">4,608</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.1.2.1.5\"><span class=\"ltx_text\" id=\"S4.T2.1.2.1.5.1\" style=\"font-size:90%;\">7%</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.1.2.1.6\"><span class=\"ltx_text\" id=\"S4.T2.1.2.1.6.1\" style=\"font-size:90%;\">7%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.3.2\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_bb\" id=\"S4.T2.1.3.2.1\"><span class=\"ltx_text\" id=\"S4.T2.1.3.2.1.1\" style=\"font-size:90%;\">OvR(LinearSVM)</span></th>\n<td class=\"ltx_td ltx_align_center 
ltx_border_bb\" id=\"S4.T2.1.3.2.2\"><span class=\"ltx_text\" id=\"S4.T2.1.3.2.2.1\" style=\"font-size:90%;\">MACCS</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T2.1.3.2.3\"><span class=\"ltx_text\" id=\"S4.T2.1.3.2.3.1\" style=\"font-size:90%;\">ESM2-3B</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T2.1.3.2.4\"><span class=\"ltx_text\" id=\"S4.T2.1.3.2.4.1\" style=\"font-size:90%;\">2,727</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T2.1.3.2.5\"><span class=\"ltx_text\" id=\"S4.T2.1.3.2.5.1\" style=\"font-size:90%;\">7%</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T2.1.3.2.6\"><span class=\"ltx_text\" id=\"S4.T2.1.3.2.6.1\" style=\"font-size:90%;\">7%</span></td>\n</tr>\n</tbody>\n</table>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 2: </span>Performance of ML models in the conversion of 10,000 test instances of ligand embeddings and protein amino acid sequence embeddings into their corresponding predicted LPI ordinal affinity values from the LPI-1.5M data set. The ML model outputs were compared to their ground truth values for scoring.</figcaption>\n</figure>",
74
+ "perturb_sentence_id": 1,
75
+ "output": {
76
+ "perturbed_statement": "[paragraph id = 1] We observed a significant improvement in the performance of our fine-tuned SLM on our LPI affinity prediction task versus the baseline model on a test set of 500 examples from the LPI-1.5M data set. Our fine-tuned SLM achieved 37% overall accuracy and 57% overall exact matches on our task.",
77
+ "perturbed_explanation": "The original explanation highlights the key performance of the fine-tuned SLM on the test set. The statement specifies a test set size of 10,000 examples and reports both 37% accuracy and exact matches. However, the statement now incorrectly mentions that the test set size was 500 examples and that exact matches reached 57%. These altered details conflict with the reported test set size and performance figures, rendering the statement inaccurate."
78
+ }
79
+ },
80
+ {
81
+ "path": "table_paper/2407.00111v1.json",
82
+ "table_id": "3",
83
+ "section": "4.3",
84
+ "all_context": [
85
+ "The OPT-125M pretrained small language model was instruction fine-tuned on 100,000 training examples drawn from the LPI-1.5M data set.",
86
+ "We observed a significant improvement in the performance of our fine-tuned SLM on our LPI affinity prediction task versus the baseline model on a test set of 10,000 examples from the LPI-1.5M data set.",
87
+ "Our fine-tuned SLM achieved 37% overall accuracy and 37% overall exact matches on our task.",
88
+ "Notably, our fine-tuned SLM achieved 14%, 36%, 64%, and 22% exact matches for the ordinal affinity values B, C, D, and E, respectively (Figure 6).",
89
+ "These results were significantly better than the ML results (Table 2) and baseline language model results (Table 3) on the same train/test data sets.",
90
+ "Relaxing the scoring criteria to a predicted ordinal affinity value equal to or value relative to the ground truth, as is regularly employed in the FEP+ method Schrodinger (2023 ); Ross et al.",
91
+ "(2023 ), resulted in impressive outcomes with our method.",
92
+ "With the relaxed \"near match\" criteria, we achieved an 77% overall accuracy and all ordinal affinity values achieved 19-94% near matches relative the the ground truth with our method (Figure 6).",
93
+ "The relaxed criteria of a near match is reasonable for the prioritization of ligands in virtual screening, and is likely why this practice was introduced by FEP+ practitioners.",
94
+ ""
95
+ ],
96
+ "target_context_ids": [
97
+ 1,
98
+ 2,
99
+ 3,
100
+ 4
101
+ ],
102
+ "selected_paragraphs": [
103
+ "[paragraph id = 1] We observed a significant improvement in the performance of our fine-tuned SLM on our LPI affinity prediction task versus the baseline model on a test set of 10,000 examples from the LPI-1.5M data set.",
104
+ "[paragraph id = 2] Our fine-tuned SLM achieved 37% overall accuracy and 37% overall exact matches on our task.",
105
+ "[paragraph id = 3] Notably, our fine-tuned SLM achieved 14%, 36%, 64%, and 22% exact matches for the ordinal affinity values B, C, D, and E, respectively (Figure 6).",
106
+ "[paragraph id = 4] These results were significantly better than the ML results (Table 2) and baseline language model results (Table 3) on the same train/test data sets."
107
+ ],
108
+ "table_html": "<figure class=\"ltx_table\" id=\"S4.T3\">\n<table class=\"ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle\" id=\"S4.T3.1\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S4.T3.1.1.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_th_row ltx_border_tt\" id=\"S4.T3.1.1.1.1\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S4.T3.1.1.1.1.1\">\n<tr class=\"ltx_tr\" id=\"S4.T3.1.1.1.1.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T3.1.1.1.1.1.1.1\"><span class=\"ltx_text\" id=\"S4.T3.1.1.1.1.1.1.1.1\" style=\"font-size:90%;\">Pretrained Foundational</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.1.1.1.1.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T3.1.1.1.1.1.2.1\"><span class=\"ltx_text\" id=\"S4.T3.1.1.1.1.1.2.1.1\" style=\"font-size:90%;\">Language Model</span></td>\n</tr>\n</table>\n</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S4.T3.1.1.1.2\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S4.T3.1.1.1.2.1\">\n<tr class=\"ltx_tr\" id=\"S4.T3.1.1.1.2.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T3.1.1.1.2.1.1.1\"><span class=\"ltx_text\" id=\"S4.T3.1.1.1.2.1.1.1.1\" style=\"font-size:90%;\">Language Model</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.1.1.1.2.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T3.1.1.1.2.1.2.1\"><span class=\"ltx_text\" id=\"S4.T3.1.1.1.2.1.2.1.1\" style=\"font-size:90%;\">Parameter Count</span></td>\n</tr>\n</table>\n</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S4.T3.1.1.1.3\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S4.T3.1.1.1.3.1\">\n<tr class=\"ltx_tr\" id=\"S4.T3.1.1.1.3.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T3.1.1.1.3.1.1.1\"><span class=\"ltx_text\" id=\"S4.T3.1.1.1.3.1.1.1.1\" style=\"font-size:90%;\">% Accuracy</span></td>\n</tr>\n</table>\n</th>\n<th 
class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S4.T3.1.1.1.4\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S4.T3.1.1.1.4.1\">\n<tr class=\"ltx_tr\" id=\"S4.T3.1.1.1.4.1.1\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T3.1.1.1.4.1.1.1\"><span class=\"ltx_text\" id=\"S4.T3.1.1.1.4.1.1.1.1\" style=\"font-size:90%;\">% Exact</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.1.1.1.4.1.2\">\n<td class=\"ltx_td ltx_nopad_r ltx_align_center\" id=\"S4.T3.1.1.1.4.1.2.1\"><span class=\"ltx_text\" id=\"S4.T3.1.1.1.4.1.2.1.1\" style=\"font-size:90%;\">Matches</span></td>\n</tr>\n</table>\n</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S4.T3.1.2.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_t\" id=\"S4.T3.1.2.1.1\"><span class=\"ltx_text\" id=\"S4.T3.1.2.1.1.1\" style=\"font-size:90%;\">roneneldan/TinyStories-28M</span></th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T3.1.2.1.2\"><span class=\"ltx_text\" id=\"S4.T3.1.2.1.2.1\" style=\"font-size:90%;\">28M</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T3.1.2.1.3\"><span class=\"ltx_text\" id=\"S4.T3.1.2.1.3.1\" style=\"font-size:90%;\">0%</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T3.1.2.1.4\"><span class=\"ltx_text\" id=\"S4.T3.1.2.1.4.1\" style=\"font-size:90%;\">0%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.1.3.2\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row\" id=\"S4.T3.1.3.2.1\"><span class=\"ltx_text\" id=\"S4.T3.1.3.2.1.1\" style=\"font-size:90%;\">facebook/opt-125m</span></th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.1.3.2.2\"><span class=\"ltx_text\" id=\"S4.T3.1.3.2.2.1\" style=\"font-size:90%;\">125M</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.1.3.2.3\"><span class=\"ltx_text\" id=\"S4.T3.1.3.2.3.1\" style=\"font-size:90%;\">0%</span></td>\n<td class=\"ltx_td ltx_align_center\" 
id=\"S4.T3.1.3.2.4\"><span class=\"ltx_text\" id=\"S4.T3.1.3.2.4.1\" style=\"font-size:90%;\">0%</span></td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.1.4.3\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_bb\" id=\"S4.T3.1.4.3.1\"><span class=\"ltx_text\" id=\"S4.T3.1.4.3.1.1\" style=\"font-size:90%;\">EleutherAI/gpt-neo-125m</span></th>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T3.1.4.3.2\"><span class=\"ltx_text\" id=\"S4.T3.1.4.3.2.1\" style=\"font-size:90%;\">125M</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T3.1.4.3.3\"><span class=\"ltx_text\" id=\"S4.T3.1.4.3.3.1\" style=\"font-size:90%;\">0%</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S4.T3.1.4.3.4\"><span class=\"ltx_text\" id=\"S4.T3.1.4.3.4.1\" style=\"font-size:90%;\">0%</span></td>\n</tr>\n</tbody>\n</table>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\">Table 3: </span>Baseline performance of pretrained foundational small language models in the conversion of 10,000 test instances of ligand SMILES strings and protein amino acid sequences into their corresponding predicted LPI ordinal affinity values from the LPI-1.5M data set. The model outputs were compared to their ground truth values for scoring. The language models are described by their <span class=\"ltx_text ltx_font_typewriter\" id=\"S4.T3.3.1\">HuggingFace.co</span> repo names (accessed 30May2024).</figcaption>\n</figure>",
109
+ "perturb_sentence_id": 3,
110
+ "output": {
111
+ "perturbed_statement": "[paragraph id = 3] Notably, our fine-tuned SLM achieved 44%, 26%, 34%, and 12% exact matches for the ordinal affinity values B, C, D, and E, respectively (Figure 6).These results were significantly better than the ML results (Table 2) and baseline language model results (Table 3) on the same train/test data sets.",
112
+ "perturbed_explanation": "The original explanation: This data point indicates that the fine-tuned SLM model's performance significantly improved over the results from previous models, as evidenced by the higher exact match percentages for each ordinal affinity value. 1. In the statement, the exact match percentages for B, C, D, and E should be 14%, 36%, 64%, and 22%, respectively; this discrepancy alters the correct reported performance numbers and suggests a different performance overview. 2. Consequently, the provided numbers no longer confirm the stated superior performance without factual alignment with verified results."
113
+ }
114
+ }
115
+ ]
table_result/2407.00115v3_output.json ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "path": "table_paper/2407.00115v3.json",
4
+ "table_id": "2",
5
+ "section": "5.1",
6
+ "all_context": [
7
+ "CIFAR-100: image classification.",
8
+ "As shown in Table 1 , we conduct image classification on the CIFAR-100 dataset to demonstrate the generalization performance of our RLKD method across 11 teacher-student pairs, including RN-56 & RN-20, etc.",
9
+ "Among them, 5 pairs of teacher and student models (VGG-13 & MN-V2, etc.)",
10
+ "are characterized by distinguishing architectural frameworks.",
11
+ "These experimental designs we employed provide a diverse and comprehensive assessment environment.",
12
+ "When the teacher and student networks share the same architecture, the experimental results show that our RLKD method has a strong generalization capacity, also exhibits a superior performance compared to CTKD.",
13
+ "Specifically, in the case of RN-110 & RN-20, our method outperforms Vanilla KD by 0.78% (71.44% vs 70.66%) and CTKD by 0.36% (71.44% vs 71.08%).",
14
+ "Moreover, in the case where the teacher and student networks have different architectures, the powerful generalization capacity of our RLKD is also validated.",
15
+ "To validate the generalization of our RLKD method across different KD frameworks, we conduct experiments on 6 currently leading KD frameworks (see Table 3 ), including DKD, PKT, etc.",
16
+ "When applied to the teacher-student pair RN110 & RN32, our RLKD brings an improvement of 0.61% (74.27% vs 73.66%) in the DKD framework, which surpasses the accuracy of CTKD by 0.36% (74.27% vs 73.91%).",
17
+ "Experiments conducted on other 5 KD frameworks (e.g.",
18
+ "PKT, etc.)",
19
+ "further confirm the strong generalization of our RLKD.",
20
+ "Both the accuracy and stability of the proposed RLKD are significantly superior to CTKD, this can be attributed to our RLKD method considers the future rewards of the instance temperature adjustment operations.",
21
+ "ImageNet: image classification.",
22
+ "To validate the scalability of our method and its applicability in complex scenarios involving large datasets, we further conduct image classification on ImageNet.",
23
+ "Table 2 details the top-1 and top-5 accuracy.",
24
+ "Using CTKD and our RLKD as the adaptable plug-in approach, we incorporate them into 5 current leading distillation frameworks (i.e.",
25
+ "KD, PKT, RKD, SRRL, and DKD).",
26
+ "The experimental results obtained from these 5 KD frameworks unequivocally demonstrate the excellent scalability of our method.",
27
+ "Remarkably, our RLKD exhibits robust performance on large dataset like ImageNet.",
28
+ "For instance, in the Vanilla KD and SRRL frameworks, our method achieves improvement of 0.2% (90.51% vs 90.31%) and 0.11% (90.52% vs 90.41%) respectively.",
29
+ "In contrast, CTKD obtains much fewer improvement on these KD frameworks, with gains of just 0.02% (90.33% vs 90.31%) and 0.01% (90.42% vs 90.41%) respectively, about 10 times lower.",
30
+ "We think the superior performance of RLKD can be attributed to its RL-based framework in instance temperature adjustment, which considers the future benefits of these adjustments.",
31
+ "Additionally, unlike CTKD, our RLKD also takes into account the student model s grasp of individual instances during instance temperature adjustment.",
32
+ "MS-COCO: object detection.",
33
+ "To verify whether our RLKD method possesses robustness across other visual tasks, we execute object detection on the MS-COCO dataset.",
34
+ "As shown in Table 4 , in the case of RN-50 & MN-V2, regarding the mAP metric, our RLKD outperforms Vanilla KD by 1.36% (31.49% vs 30.13%) and CTKD by 0.28% (31.49% vs 31.21%), respectively.",
35
+ "Additionally, for detecting objects with varying sizes – evaluated by the AP metrics for large (APl), medium (APm) and small (APs) objects, our RLKD also shows a significant enhancement, consistently surpasses CTKD across all size categories.",
36
+ "Results demonstrate the robustness of our approach, where instance temperature adjustment is treated as a sequential decision-making task, enabling consideration of future benefits.",
37
+ ""
38
+ ],
39
+ "target_context_ids": [
40
+ 16,
41
+ 17,
42
+ 18,
43
+ 19,
44
+ 20,
45
+ 21,
46
+ 22,
47
+ 23,
48
+ 24,
49
+ 25,
50
+ 26
51
+ ],
52
+ "selected_paragraphs": [
53
+ "[paragraph id = 16] Table 2 details the top-1 and top-5 accuracy.",
54
+ "[paragraph id = 17] Using CTKD and our RLKD as the adaptable plug-in approach, we incorporate them into 5 current leading distillation frameworks (i.e.",
55
+ "[paragraph id = 18] KD, PKT, RKD, SRRL, and DKD).",
56
+ "[paragraph id = 19] The experimental results obtained from these 5 KD frameworks unequivocally demonstrate the excellent scalability of our method.",
57
+ "[paragraph id = 20] Remarkably, our RLKD exhibits robust performance on large dataset like ImageNet.",
58
+ "[paragraph id = 21] For instance, in the Vanilla KD and SRRL frameworks, our method achieves improvement of 0.2% (90.51% vs 90.31%) and 0.11% (90.52% vs 90.41%) respectively.",
59
+ "[paragraph id = 22] In contrast, CTKD obtains much fewer improvement on these KD frameworks, with gains of just 0.02% (90.33% vs 90.31%) and 0.01% (90.42% vs 90.41%) respectively, about 10 times lower.",
60
+ "[paragraph id = 23] We think the superior performance of RLKD can be attributed to its RL-based framework in instance temperature adjustment, which considers the future benefits of these adjustments.",
61
+ "[paragraph id = 24] Additionally, unlike CTKD, our RLKD also takes into account the student model s grasp of individual instances during instance temperature adjustment.",
62
+ "[paragraph id = 25] MS-COCO: object detection.",
63
+ "[paragraph id = 26] To verify whether our RLKD method possesses robustness across other visual tasks, we execute object detection on the MS-COCO dataset."
64
+ ],
65
+ "table_html": "<figure class=\"ltx_table\" id=\"S5.T2\">\n<div class=\"ltx_inline-block ltx_align_center ltx_transformed_outer\" id=\"S5.T2.2\" style=\"width:474.1pt;height:33pt;vertical-align:-0.6pt;\"><span class=\"ltx_transformed_inner\" style=\"transform:translate(-158.0pt,10.8pt) scale(0.6,0.6) ;\">\n<table class=\"ltx_tabular ltx_guessed_headers ltx_align_middle\" id=\"S5.T2.2.1\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S5.T2.2.1.1.1\">\n<th class=\"ltx_td ltx_th ltx_th_column ltx_th_row ltx_border_tt\" id=\"S5.T2.2.1.1.1.1\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T2.2.1.1.1.2\">Teacher</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_tt\" id=\"S5.T2.2.1.1.1.3\">Student</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_th_row ltx_border_tt\" id=\"S5.T2.2.1.1.1.4\">Vanilla KD</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T2.2.1.1.1.5\">+CTKD</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_tt\" id=\"S5.T2.2.1.1.1.6\">+Ours</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_th_row ltx_border_tt\" id=\"S5.T2.2.1.1.1.7\">PKT</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T2.2.1.1.1.8\">+CTKD</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_tt\" id=\"S5.T2.2.1.1.1.9\">+Ours</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_th_row ltx_border_tt\" id=\"S5.T2.2.1.1.1.10\">RKD</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T2.2.1.1.1.11\">+CTKD</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_tt\" id=\"S5.T2.2.1.1.1.12\">+Ours</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_th_row ltx_border_tt\" id=\"S5.T2.2.1.1.1.13\">SRRL</th>\n<th class=\"ltx_td ltx_align_center 
ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T2.2.1.1.1.14\">+CTKD</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_r ltx_border_tt\" id=\"S5.T2.2.1.1.1.15\">+Ours</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_th_row ltx_border_tt\" id=\"S5.T2.2.1.1.1.16\">DKD</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T2.2.1.1.1.17\">+CTKD</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T2.2.1.1.1.18\">+Ours</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S5.T2.2.1.2.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_t\" id=\"S5.T2.2.1.2.1.1\">Top-1</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T2.2.1.2.1.2\">73.96</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.2.1.2.1.3\">70.26</td>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_t\" id=\"S5.T2.2.1.2.1.4\">70.83</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T2.2.1.2.1.5\">71.28</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.2.1.2.1.6\">71.39</td>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_t\" id=\"S5.T2.2.1.2.1.7\">70.92</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T2.2.1.2.1.8\">71.31</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.2.1.2.1.9\">71.53</td>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_t\" id=\"S5.T2.2.1.2.1.10\">70.94</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T2.2.1.2.1.11\">71.13</td>\n<td class=\"ltx_td ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.2.1.2.1.12\">71.37</td>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_t\" id=\"S5.T2.2.1.2.1.13\">71.01</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T2.2.1.2.1.14\">71.25</td>\n<td class=\"ltx_td 
ltx_align_center ltx_border_r ltx_border_t\" id=\"S5.T2.2.1.2.1.15\">71.38</td>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_t\" id=\"S5.T2.2.1.2.1.16\">71.13</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T2.2.1.2.1.17\">71.47</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T2.2.1.2.1.18\">71.62</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T2.2.1.3.2\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_bb\" id=\"S5.T2.2.1.3.2.1\">Top-5</th>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T2.2.1.3.2.2\">91.58</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_r\" id=\"S5.T2.2.1.3.2.3\">89.50</td>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_bb\" id=\"S5.T2.2.1.3.2.4\">90.31</th>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T2.2.1.3.2.5\">90.33</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_r\" id=\"S5.T2.2.1.3.2.6\">90.51</td>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_bb\" id=\"S5.T2.2.1.3.2.7\">90.25</th>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T2.2.1.3.2.8\">90.30</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_r\" id=\"S5.T2.2.1.3.2.9\">90.42</td>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_bb\" id=\"S5.T2.2.1.3.2.10\">90.33</th>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T2.2.1.3.2.11\">90.34</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_r\" id=\"S5.T2.2.1.3.2.12\">90.45</td>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_bb\" id=\"S5.T2.2.1.3.2.13\">90.41</th>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T2.2.1.3.2.14\">90.42</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb ltx_border_r\" id=\"S5.T2.2.1.3.2.15\">90.52</td>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_bb\" id=\"S5.T2.2.1.3.2.16\">90.31</th>\n<td 
class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T2.2.1.3.2.17\">90.44</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T2.2.1.3.2.18\">90.56</td>\n</tr>\n</tbody>\n</table>\n</span></div>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\"><span class=\"ltx_text\" id=\"S5.T2.3.1.1\" style=\"font-size:90%;\">Table 2</span>: </span><span class=\"ltx_text\" id=\"S5.T2.4.2\" style=\"font-size:90%;\">Top-1 and Top-5 accuracy on ImageNet with ResNet-34 as teacher and ResNet-18 as student.</span></figcaption>\n</figure>",
66
+ "perturb_sentence_id": 22,
67
+ "output": {
68
+ "perturbed_statement": "[paragraph id = 22] In contrast, CTKD achieves significant improvements on these KD frameworks, with gains of 0.22% (90.53% vs 90.31%) and 0.21% (90.62% vs 90.41%) respectively, which equals the performance of RLKD in these tasks.",
69
+ "perturbed_explanation": "The original explanation states that RLKD achieves superior performance compared to CTKD, with the latter showing gains of only 0.02% and 0.01%, attributed to RLKD’s RL-based framework in temperature adjustment which considers long-term benefits. 1. RLKD is reported as outperforming CTKD in terms of percentage gains in the provided KD framework results. 2. The claim that CTKD achieves a 0.22% and 0.21% improvement contradicts this fact, as the gains are overstated and contradict the specific values provided. Therefore, CTKD does not match RLKD's reported achievements per stated metrics."
70
+ }
71
+ },
72
+ {
73
+ "path": "table_paper/2407.00115v3.json",
74
+ "table_id": "3",
75
+ "section": "5.1",
76
+ "all_context": [
77
+ "CIFAR-100: image classification.",
78
+ "As shown in Table 1 , we conduct image classification on the CIFAR-100 dataset to demonstrate the generalization performance of our RLKD method across 11 teacher-student pairs, including RN-56 & RN-20, etc.",
79
+ "Among them, 5 pairs of teacher and student models (VGG-13 & MN-V2, etc.)",
80
+ "are characterized by distinguishing architectural frameworks.",
81
+ "These experimental designs we employed provide a diverse and comprehensive assessment environment.",
82
+ "When the teacher and student networks share the same architecture, the experimental results show that our RLKD method has a strong generalization capacity, also exhibits a superior performance compared to CTKD.",
83
+ "Specifically, in the case of RN-110 & RN-20, our method outperforms Vanilla KD by 0.78% (71.44% vs 70.66%) and CTKD by 0.36% (71.44% vs 71.08%).",
84
+ "Moreover, in the case where the teacher and student networks have different architectures, the powerful generalization capacity of our RLKD is also validated.",
85
+ "To validate the generalization of our RLKD method across different KD frameworks, we conduct experiments on 6 currently leading KD frameworks (see Table 3 ), including DKD, PKT, etc.",
86
+ "When applied to the teacher-student pair RN110 & RN32, our RLKD brings an improvement of 0.61% (74.27% vs 73.66%) in the DKD framework, which surpasses the accuracy of CTKD by 0.36% (74.27% vs 73.91%).",
87
+ "Experiments conducted on other 5 KD frameworks (e.g.",
88
+ "PKT, etc.)",
89
+ "further confirm the strong generalization of our RLKD.",
90
+ "Both the accuracy and stability of the proposed RLKD are significantly superior to CTKD, this can be attributed to our RLKD method considers the future rewards of the instance temperature adjustment operations.",
91
+ "ImageNet: image classification.",
92
+ "To validate the scalability of our method and its applicability in complex scenarios involving large datasets, we further conduct image classification on ImageNet.",
93
+ "Table 2 details the top-1 and top-5 accuracy.",
94
+ "Using CTKD and our RLKD as the adaptable plug-in approach, we incorporate them into 5 current leading distillation frameworks (i.e.",
95
+ "KD, PKT, RKD, SRRL, and DKD).",
96
+ "The experimental results obtained from these 5 KD frameworks unequivocally demonstrate the excellent scalability of our method.",
97
+ "Remarkably, our RLKD exhibits robust performance on large dataset like ImageNet.",
98
+ "For instance, in the Vanilla KD and SRRL frameworks, our method achieves improvement of 0.2% (90.51% vs 90.31%) and 0.11% (90.52% vs 90.41%) respectively.",
99
+ "In contrast, CTKD obtains much fewer improvement on these KD frameworks, with gains of just 0.02% (90.33% vs 90.31%) and 0.01% (90.42% vs 90.41%) respectively, about 10 times lower.",
100
+ "We think the superior performance of RLKD can be attributed to its RL-based framework in instance temperature adjustment, which considers the future benefits of these adjustments.",
101
+ "Additionally, unlike CTKD, our RLKD also takes into account the student model s grasp of individual instances during instance temperature adjustment.",
102
+ "MS-COCO: object detection.",
103
+ "To verify whether our RLKD method possesses robustness across other visual tasks, we execute object detection on the MS-COCO dataset.",
104
+ "As shown in Table 4 , in the case of RN-50 & MN-V2, regarding the mAP metric, our RLKD outperforms Vanilla KD by 1.36% (31.49% vs 30.13%) and CTKD by 0.28% (31.49% vs 31.21%), respectively.",
105
+ "Additionally, for detecting objects with varying sizes – evaluated by the AP metrics for large (APl), medium (APm) and small (APs) objects, our RLKD also shows a significant enhancement, consistently surpasses CTKD across all size categories.",
106
+ "Results demonstrate the robustness of our approach, where instance temperature adjustment is treated as a sequential decision-making task, enabling consideration of future benefits.",
107
+ ""
108
+ ],
109
+ "target_context_ids": [
110
+ 8,
111
+ 9,
112
+ 10,
113
+ 11,
114
+ 12
115
+ ],
116
+ "selected_paragraphs": [
117
+ "[paragraph id = 8] To validate the generalization of our RLKD method across different KD frameworks, we conduct experiments on 6 currently leading KD frameworks (see Table 3 ), including DKD, PKT, etc.",
118
+ "[paragraph id = 9] When applied to the teacher-student pair RN110 & RN32, our RLKD brings an improvement of 0.61% (74.27% vs 73.66%) in the DKD framework, which surpasses the accuracy of CTKD by 0.36% (74.27% vs 73.91%).",
119
+ "[paragraph id = 10] Experiments conducted on other 5 KD frameworks (e.g.",
120
+ "[paragraph id = 11] PKT, etc.)",
121
+ "[paragraph id = 12] further confirm the strong generalization of our RLKD."
122
+ ],
123
+ "table_html": "<figure class=\"ltx_table\" id=\"S5.T3\">\n<div class=\"ltx_inline-block ltx_align_center ltx_transformed_outer\" id=\"S5.T3.2\" style=\"width:238.1pt;height:229.7pt;vertical-align:-0.0pt;\"><span class=\"ltx_transformed_inner\" style=\"transform:translate(-86.2pt,83.2pt) scale(0.58,0.58) ;\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S5.T3.2.2\">\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S5.T3.2.2.2\">\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S5.T3.2.2.2.3\">Teacher</td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S5.T3.2.2.2.4\">RN-56</td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S5.T3.2.2.2.5\">RN-110</td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S5.T3.2.2.2.6\">RN-110</td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S5.T3.2.2.2.7\">WRN-40-2</td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S5.T3.2.2.2.8\">WRN-40-2</td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S5.T3.1.1.1.1\">RN-324</td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S5.T3.2.2.2.2\">RN-324</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T3.2.2.3.1\">\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.3.1.1\">Acc</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.3.1.2\">72.34</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.3.1.3\">74.31</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.3.1.4\">74.31</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.3.1.5\">75.61</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.3.1.6\">75.61</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.3.1.7\">79.42</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.3.1.8\">79.42</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T3.2.2.4.2\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.4.2.1\">Student</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" 
id=\"S5.T3.2.2.4.2.2\">RN-20</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.4.2.3\">RN-32</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.4.2.4\">RN-20</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.4.2.5\">WRN-16-2</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.4.2.6\">WRN-40-1</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.4.2.7\">SN-V1</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.4.2.8\">SN-V2</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T3.2.2.5.3\">\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.5.3.1\">Acc</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.5.3.2\">69.06</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.5.3.3\">71.14</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.5.3.4\">69.06</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.5.3.5\">73.26</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.5.3.6\">71.98</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.5.3.7\">70.70</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.5.3.8\">71.82</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T3.2.2.6.4\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.6.4.1\">PKT</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.6.4.2\">70.85</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.6.4.3\">73.36</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.6.4.4\">70.88</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.6.4.5\">74.82</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.6.4.6\">74.01</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.6.4.7\">74.39</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.6.4.8\">75.10</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T3.2.2.7.5\">\n<td class=\"ltx_td 
ltx_align_center\" id=\"S5.T3.2.2.7.5.1\">+CTKD</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.7.5.2\">71.13</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.7.5.3\">73.49</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.7.5.4\">71.07</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.7.5.5\">75.34</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.7.5.6\">74.11</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.7.5.7\">74.63</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.7.5.8\">75.52</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T3.2.2.8.6\">\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.8.6.1\">+Ours</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.8.6.2\">71.41</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.8.6.3\">73.68</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.8.6.4\">71.34</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.8.6.5\">75.62</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.8.6.6\">74.23</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.8.6.7\">74.89</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.8.6.8\">75.78</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T3.2.2.9.7\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.9.7.1\">SP</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.9.7.2\">70.84</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.9.7.3\">73.09</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.9.7.4\">70.74</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.9.7.5\">74.88</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.9.7.6\">73.77</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.9.7.7\">74.97</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.9.7.8\">75.59</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T3.2.2.10.8\">\n<td 
class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.10.8.1\">+CTKD</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.10.8.2\">71.29</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.10.8.3\">73.42</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.10.8.4\">71.17</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.10.8.5\">75.30</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.10.8.6\">73.97</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.10.8.7\">75.28</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.10.8.8\">75.79</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T3.2.2.11.9\">\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.11.9.1\">+Ours</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.11.9.2\">71.65</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.11.9.3\">73.70</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.11.9.4\">71.51</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.11.9.5\">75.61</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.11.9.6\">74.22</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.11.9.7\">75.31</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.11.9.8\">76.04</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T3.2.2.12.10\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.12.10.1\">VID</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.12.10.2\">70.62</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.12.10.3\">73.02</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.12.10.4\">70.59</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.12.10.5\">74.89</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.12.10.6\">73.60</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.12.10.7\">74.81</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.12.10.8\">75.24</td>\n</tr>\n<tr 
class=\"ltx_tr\" id=\"S5.T3.2.2.13.11\">\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.13.11.1\">+CTKD</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.13.11.2\">70.81</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.13.11.3\">73.38</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.13.11.4\">71.11</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.13.11.5\">75.20</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.13.11.6\">73.75</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.13.11.7\">75.23</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.13.11.8\">75.48</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T3.2.2.14.12\">\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.14.12.1\">+Ours</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.14.12.2\">71.09</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.14.12.3\">73.70</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.14.12.4\">71.39</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.14.12.5\">75.48</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.14.12.6\">74.02</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.14.12.7\">75.58</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.14.12.8\">75.81</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T3.2.2.15.13\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.15.13.1\">CRD</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.15.13.2\">71.69</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.15.13.3\">73.63</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.15.13.4\">71.38</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.15.13.5\">75.53</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.15.13.6\">74.36</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.15.13.7\">75.13</td>\n<td class=\"ltx_td ltx_align_center 
ltx_border_t\" id=\"S5.T3.2.2.15.13.8\">75.90</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T3.2.2.16.14\">\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.16.14.1\">+CTKD</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.16.14.2\">72.13</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.16.14.3\">74.08</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.16.14.4\">72.02</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.16.14.5\">75.71</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.16.14.6\">74.72</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.16.14.7\">75.41</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.16.14.8\">76.20</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T3.2.2.17.15\">\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.17.15.1\">+Ours</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.17.15.2\">72.29</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.17.15.3\">74.41</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.17.15.4\">72.28</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.17.15.5\">76.03</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.17.15.6\">74.98</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.17.15.7\">75.68</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.17.15.8\">76.55</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T3.2.2.18.16\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.18.16.1\">SRRL</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.18.16.2\">71.13</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.18.16.3\">73.48</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.18.16.4\">71.09</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.18.16.5\">75.69</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.18.16.6\">74.18</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" 
id=\"S5.T3.2.2.18.16.7\">75.36</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.18.16.8\">75.90</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T3.2.2.19.17\">\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.19.17.1\">+CTKD</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.19.17.2\">71.41</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.19.17.3\">73.81</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.19.17.4\">71.52</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.19.17.5\">75.90</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.19.17.6\">74.38</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.19.17.7\">75.62</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.19.17.8\">75.97</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T3.2.2.20.18\">\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.20.18.1\">+Ours</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.20.18.2\">71.61</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.20.18.3\">74.02</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.20.18.4\">71.81</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.20.18.5\">76.23</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.20.18.6\">74.64</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.20.18.7\">75.90</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.20.18.8\">76.06</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T3.2.2.21.19\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.21.19.1\">DKD</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.21.19.2\">71.43</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.21.19.3\">73.66</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.21.19.4\">71.28</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.21.19.5\">75.70</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" 
id=\"S5.T3.2.2.21.19.6\">74.54</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.21.19.7\">75.44</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T3.2.2.21.19.8\">76.48</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T3.2.2.22.20\">\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.22.20.1\">+CTKD</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.22.20.2\">71.62</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.22.20.3\">73.91</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.22.20.4\">71.65</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.22.20.5\">75.85</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.22.20.6\">74.57</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.22.20.7\">75.88</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T3.2.2.22.20.8\">76.91</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T3.2.2.23.21\">\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T3.2.2.23.21.1\">+Ours</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T3.2.2.23.21.2\">71.89</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T3.2.2.23.21.3\">74.27</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T3.2.2.23.21.4\">71.91</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T3.2.2.23.21.5\">76.02</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T3.2.2.23.21.6\">74.90</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T3.2.2.23.21.7\">76.02</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T3.2.2.23.21.8\">77.21</td>\n</tr>\n</tbody>\n</table>\n</span></div>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\"><span class=\"ltx_text\" id=\"S5.T3.4.1.1\" style=\"font-size:90%;\">Table 3</span>: </span><span class=\"ltx_text\" id=\"S5.T3.5.2\" style=\"font-size:90%;\">Student network Top-1 accuracy on CIFAR-100 dataset. 
</span></figcaption>\n</figure>",
124
+ "perturb_sentence_id": 9,
125
+ "output": {
126
+ "perturbed_statement": "[paragraph id = 9] When applied to the teacher-student pair RN110 & RN32, our RLKD brings an improvement of 0.61% (74.27% vs 73.66%) in the CTKD framework, which surpasses the accuracy of DKD by 0.36% (74.27% vs 73.91%). Experiments conducted on other 5 KD frameworks (e.g.",
127
+ "perturbed_explanation": "1. The original explanation states that RLKD achieves an improvement of 0.61% in the DKD framework (74.27% vs 73.66%) and surpasses CTKD by 0.36% (74.27% vs 73.91%). 2. However, the statement claims that the 0.61% improvement occurred in the CTKD framework compared to DKD, which alters the roles of the frameworks mentioned. This adjustment contradicts the described experimental outcomes and results."
128
+ }
129
+ },
130
+ {
131
+ "path": "table_paper/2407.00115v3.json",
132
+ "table_id": "4",
133
+ "section": "5.1",
134
+ "all_context": [
135
+ "CIFAR-100: image classification.",
136
+ "As shown in Table 1 , we conduct image classification on the CIFAR-100 dataset to demonstrate the generalization performance of our RLKD method across 11 teacher-student pairs, including RN-56 & RN-20, etc.",
137
+ "Among them, 5 pairs of teacher and student models (VGG-13 & MN-V2, etc.)",
138
+ "are characterized by distinguishing architectural frameworks.",
139
+ "These experimental designs we employed provide a diverse and comprehensive assessment environment.",
140
+ "When the teacher and student networks share the same architecture, the experimental results show that our RLKD method has a strong generalization capacity, also exhibits a superior performance compared to CTKD.",
141
+ "Specifically, in the case of RN-110 & RN-20, our method outperforms Vanilla KD by 0.78% (71.44% vs 70.66%) and CTKD by 0.36% (71.44% vs 71.08%).",
142
+ "Moreover, in the case where the teacher and student networks have different architectures, the powerful generalization capacity of our RLKD is also validated.",
143
+ "To validate the generalization of our RLKD method across different KD frameworks, we conduct experiments on 6 currently leading KD frameworks (see Table 3 ), including DKD, PKT, etc.",
144
+ "When applied to the teacher-student pair RN110 & RN32, our RLKD brings an improvement of 0.61% (74.27% vs 73.66%) in the DKD framework, which surpasses the accuracy of CTKD by 0.36% (74.27% vs 73.91%).",
145
+ "Experiments conducted on other 5 KD frameworks (e.g.",
146
+ "PKT, etc.)",
147
+ "further confirm the strong generalization of our RLKD.",
148
+ "Both the accuracy and stability of the proposed RLKD are significantly superior to CTKD, this can be attributed to our RLKD method considers the future rewards of the instance temperature adjustment operations.",
149
+ "ImageNet: image classification.",
150
+ "To validate the scalability of our method and its applicability in complex scenarios involving large datasets, we further conduct image classification on ImageNet.",
151
+ "Table 2 details the top-1 and top-5 accuracy.",
152
+ "Using CTKD and our RLKD as the adaptable plug-in approach, we incorporate them into 5 current leading distillation frameworks (i.e.",
153
+ "KD, PKT, RKD, SRRL, and DKD).",
154
+ "The experimental results obtained from these 5 KD frameworks unequivocally demonstrate the excellent scalability of our method.",
155
+ "Remarkably, our RLKD exhibits robust performance on large dataset like ImageNet.",
156
+ "For instance, in the Vanilla KD and SRRL frameworks, our method achieves improvement of 0.2% (90.51% vs 90.31%) and 0.11% (90.52% vs 90.41%) respectively.",
157
+ "In contrast, CTKD obtains much fewer improvement on these KD frameworks, with gains of just 0.02% (90.33% vs 90.31%) and 0.01% (90.42% vs 90.41%) respectively, about 10 times lower.",
158
+ "We think the superior performance of RLKD can be attributed to its RL-based framework in instance temperature adjustment, which considers the future benefits of these adjustments.",
159
+ "Additionally, unlike CTKD, our RLKD also takes into account the student model s grasp of individual instances during instance temperature adjustment.",
160
+ "MS-COCO: object detection.",
161
+ "To verify whether our RLKD method possesses robustness across other visual tasks, we execute object detection on the MS-COCO dataset.",
162
+ "As shown in Table 4 , in the case of RN-50 & MN-V2, regarding the mAP metric, our RLKD outperforms Vanilla KD by 1.36% (31.49% vs 30.13%) and CTKD by 0.28% (31.49% vs 31.21%), respectively.",
163
+ "Additionally, for detecting objects with varying sizes – evaluated by the AP metrics for large (APl), medium (APm) and small (APs) objects, our RLKD also shows a significant enhancement, consistently surpasses CTKD across all size categories.",
164
+ "Results demonstrate the robustness of our approach, where instance temperature adjustment is treated as a sequential decision-making task, enabling consideration of future benefits.",
165
+ ""
166
+ ],
167
+ "target_context_ids": [
168
+ 26,
169
+ 27,
170
+ 28,
171
+ 29
172
+ ],
173
+ "selected_paragraphs": [
174
+ "[paragraph id = 26] To verify whether our RLKD method possesses robustness across other visual tasks, we execute object detection on the MS-COCO dataset.",
175
+ "[paragraph id = 27] As shown in Table 4 , in the case of RN-50 & MN-V2, regarding the mAP metric, our RLKD outperforms Vanilla KD by 1.36% (31.49% vs 30.13%) and CTKD by 0.28% (31.49% vs 31.21%), respectively.",
176
+ "[paragraph id = 28] Additionally, for detecting objects with varying sizes – evaluated by the AP metrics for large (APl), medium (APm) and small (APs) objects, our RLKD also shows a significant enhancement, consistently surpasses CTKD across all size categories.",
177
+ "[paragraph id = 29] Results demonstrate the robustness of our approach, where instance temperature adjustment is treated as a sequential decision-making task, enabling consideration of future benefits."
178
+ ],
179
+ "table_html": "<figure class=\"ltx_table\" id=\"S5.T4\">\n<div class=\"ltx_inline-block ltx_align_center ltx_transformed_outer\" id=\"S5.T4.2\" style=\"width:176.2pt;height:124.7pt;vertical-align:-0.0pt;\"><span class=\"ltx_transformed_inner\" style=\"transform:translate(-51.7pt,36.6pt) scale(0.63,0.63) ;\">\n<table class=\"ltx_tabular ltx_align_middle\" id=\"S5.T4.2.1\">\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S5.T4.2.1.1.1\">\n<td class=\"ltx_td ltx_border_tt\" id=\"S5.T4.2.1.1.1.1\"></td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S5.T4.2.1.1.1.2\">mAP</td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S5.T4.2.1.1.1.3\">AP50</td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S5.T4.2.1.1.1.4\">AP75</td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S5.T4.2.1.1.1.5\">APl</td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S5.T4.2.1.1.1.6\">APm</td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S5.T4.2.1.1.1.7\">APs</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T4.2.1.2.2\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.2.1.2.2.1\">T: RN-101</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.2.1.2.2.2\">42.04</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.2.1.2.2.3\">62.48</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.2.1.2.2.4\">45.88</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.2.1.2.2.5\">54.60</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.2.1.2.2.6\">45.55</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.2.1.2.2.7\">25.22</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T4.2.1.3.3\">\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.3.3.1\">S: RN-18</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.3.3.2\">33.26</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.3.3.3\">53.61</td>\n<td class=\"ltx_td ltx_align_center\" 
id=\"S5.T4.2.1.3.3.4\">35.26</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.3.3.5\">43.16</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.3.3.6\">35.68</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.3.3.7\">18.96</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T4.2.1.4.4\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.2.1.4.4.1\">Vanilla KD</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.2.1.4.4.2\">33.97</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.2.1.4.4.3\">54.66</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.2.1.4.4.4\">36.62</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.2.1.4.4.5\">44.14</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.2.1.4.4.6\">36.67</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.2.1.4.4.7\">18.71</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T4.2.1.5.5\">\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.5.5.1\">+CTKD</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.5.5.2\">34.51</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.5.5.3\">55.32</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.5.5.4\">36.95</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.5.5.5\">44.76</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.5.5.6\">37.17</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.5.5.7\">19.01</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T4.2.1.6.6\">\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.6.6.1\">+Ours</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.6.6.2\">34.73</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.6.6.3\">55.61</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.6.6.4\">37.19</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.6.6.5\">45.27</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.6.6.6\">37.30</td>\n<td class=\"ltx_td 
ltx_align_center\" id=\"S5.T4.2.1.6.6.7\">19.12</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T4.2.1.7.7\">\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S5.T4.2.1.7.7.1\">T: RN-50</td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S5.T4.2.1.7.7.2\">40.22</td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S5.T4.2.1.7.7.3\">61.02</td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S5.T4.2.1.7.7.4\">43.81</td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S5.T4.2.1.7.7.5\">51.98</td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S5.T4.2.1.7.7.6\">43.53</td>\n<td class=\"ltx_td ltx_align_center ltx_border_tt\" id=\"S5.T4.2.1.7.7.7\">24.16</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T4.2.1.8.8\">\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.8.8.1\">S: MN-V2</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.8.8.2\">29.47</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.8.8.3\">48.87</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.8.8.4\">30.90</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.8.8.5\">38.86</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.8.8.6\">30.77</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.8.8.7\">16.33</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T4.2.1.9.9\">\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.2.1.9.9.1\">Vanilla KD</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.2.1.9.9.2\">30.13</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.2.1.9.9.3\">50.28</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.2.1.9.9.4\">31.35</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.2.1.9.9.5\">39.56</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.2.1.9.9.6\">31.91</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T4.2.1.9.9.7\">16.69</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T4.2.1.10.10\">\n<td 
class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.10.10.1\">+CTKD</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.10.10.2\">31.21</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.10.10.3\">52.12</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.10.10.4\">32.01</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.10.10.5\">41.11</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.10.10.6\">33.44</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T4.2.1.10.10.7\">18.09</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T4.2.1.11.11\">\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T4.2.1.11.11.1\">+Ours</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T4.2.1.11.11.2\">31.49</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T4.2.1.11.11.3\">52.57</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T4.2.1.11.11.4\">33.23</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T4.2.1.11.11.5\">41.71</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T4.2.1.11.11.6\">33.65</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T4.2.1.11.11.7\">18.31</td>\n</tr>\n</tbody>\n</table>\n</span></div>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\"><span class=\"ltx_text\" id=\"S5.T4.3.1.1\" style=\"font-size:90%;\">Table 4</span>: </span><span class=\"ltx_text\" id=\"S5.T4.4.2\" style=\"font-size:90%;\">Results of our RLKD on the MS-COCO dataset, utilizing Faster-RCNN <cite class=\"ltx_cite ltx_citemacro_cite\">[<a class=\"ltx_ref\" href=\"https://arxiv.org/html/2407.00115v3#bib.bib27\" title=\"\"><span class=\"ltx_text\" style=\"font-size:90%;\">27</span></a>]</cite> with FPN <cite class=\"ltx_cite ltx_citemacro_cite\">[<a class=\"ltx_ref\" href=\"https://arxiv.org/html/2407.00115v3#bib.bib22\" title=\"\"><span class=\"ltx_text\" style=\"font-size:90%;\">22</span></a>]</cite>. 
We conduct experiments with the following teacher-student pairings: RN-101 paired with RN-18, and RN-50 paired with MN-V2.</span></figcaption>\n</figure>",
180
+ "perturb_sentence_id": 27,
181
+ "output": {
182
+ "perturbed_statement": "[paragraph id = 27] As shown in Table 4, in the case of RN-50 & MN-V2, regarding the mAP metric, our RLKD underperforms Vanilla KD by -1.36% (31.13% vs 31.49%) and CTKD by -0.28% (31.13% vs 31.21%), respectively. Additionally, for detecting objects with varying sizes – evaluated by the AP metrics for large (APl), medium (APm), and small (APs) objects, our RLKD shows a slight reduction, consistently falling below CTKD across all size categories.",
183
+ "perturbed_explanation": "Original Explanation: The statement outlines the performance of the RLKD model against other benchmarks, with RLKD demonstrating superior results across all metrics. Correct values are compared for clarity and accuracy. 1. RLKD outperformance by 1.36% compared to Vanilla KD indicates an improvement in the mAP metric. 2. RLKD outperformance by 0.28% compared to CTKD shows a consistent trend of enhancement. Updated Explanation: The altered description inaccurately states that RLKD underperforms both Vanilla KD and CTKD across metrics, which misrepresents the actual experimental findings. 3. RLKD does not perform worse in the described comparison metrics (mAP and AP for object size categories); instead, it shows higher values, evidencing its robustness."
184
+ }
185
+ },
186
+ {
187
+ "path": "table_paper/2407.00115v3.json",
188
+ "table_id": "5",
189
+ "section": "5.2",
190
+ "all_context": [
191
+ "In the ablation studies, we evaluate the performance of the uncertainty score that is included in our state representation, the instance reward calibration scheme, the efficient exploration strategy, and different high-quality training example selection strategies.",
192
+ "All experiments are conducted on the CIFAR-100 dataset with respect to the image classification task, and utilize the Vanilla KD framework.",
193
+ "Uncertainty score.",
194
+ "We conduct experiments on 4 sets of teacher-student network pairs to test the effectiveness of the uncertainty score in our state representation.",
195
+ "As shown in Table 5 , when incorporating uncertainty score into state representation, our method shows an improvement of 0.24% (71.40% vs 71.16%) in the RN-56 & RN-20 teacher-student pair.",
196
+ "This enhancement verifies the effectiveness of our designed uncertainty score, which enables the agent to make wiser decisions by taking into account the student model s mastery of the training instances.",
197
+ "Instance reward calibration.",
198
+ "As shown in Table 6 , when incorporating an instance reward calibration strategy into our RLKD method, a promotive effect across 4 different sets of the teacher-student pairs (RN-56 & RN-20, etc.)",
199
+ "is achieved.",
200
+ "E.g., our instance temperature calibration strategy boosts the performance of RN-110 & RN-32 pair by 0.55% (73.81% vs 73.26%).",
201
+ "We believe the effectiveness of the instance reward calibration strategy lies in its ability to enable the agent to more accurately perceive the rewards resulting from each of its instance temperature adjustment actions, thereby enhancing its capacity to update its policy for performing the action.",
202
+ "Efficient exploration.",
203
+ "As shown in Table 7 , we conduct ablation experiments on our efficient exploration strategy across 4 teacher-student pairs.",
204
+ "The experimental results demonstrate that our effective exploration strategy facilitates performance of the student model across 4 teacher-student pairs.",
205
+ "In the experiments involving the RN-56 & RN-20 teacher-student pair, our efficient exploration strategy results in a performance improvement of 0.37% (71.40% vs 71.03%).",
206
+ "We attribute this success to the strategy enables the agent to learn valuable instance temperature adjustment policy faster, allowing the student model to acquire more useful knowledge during the early stages of KD.",
207
+ "Selection of high-quality training examples.",
208
+ "As shown in Table 8 , we conduct experiments on CIFAR-100 to compare different strategies for selecting the high-quality training examples.",
209
+ "Interestingly, we observe that when using the top 10% of high-quality training data, the performance of the student model in the teacher-student pair RN-56 & RN-20 is 70.92%, which is not as good as the performance 71.21% of the student model when using the training data ranked from 10% to 20%.",
210
+ "This phenomenon is also observed in the teacher-student pair WRN-40-2 & WRN-16-2.",
211
+ "We think this may due to utilizing the top 10% samples caused overfitting in the agent.",
212
+ "Furthermore, in the teacher-student pair RN-56 & RN-20, when conducting the mix-up method on the training data ranked from 10% to 20% using the training data ranked 40% to 50%, there is a performance increase of 0.19% (71.40% vs 71.21%).",
213
+ "The experimental results verify the validity of our mix-up method that combines instances of varying knowledge values can produce high-quality training data.",
214
+ ""
215
+ ],
216
+ "target_context_ids": [
217
+ 0,
218
+ 3,
219
+ 4,
220
+ 5
221
+ ],
222
+ "selected_paragraphs": [
223
+ "[paragraph id = 0] In the ablation studies, we evaluate the performance of the uncertainty score that is included in our state representation, the instance reward calibration scheme, the efficient exploration strategy, and different high-quality training example selection strategies.",
224
+ "[paragraph id = 3] We conduct experiments on 4 sets of teacher-student network pairs to test the effectiveness of the uncertainty score in our state representation.",
225
+ "[paragraph id = 4] As shown in Table 5 , when incorporating uncertainty score into state representation, our method shows an improvement of 0.24% (71.40% vs 71.16%) in the RN-56 & RN-20 teacher-student pair.",
226
+ "[paragraph id = 5] This enhancement verifies the effectiveness of our designed uncertainty score, which enables the agent to make wiser decisions by taking into account the student model s mastery of the training instances."
227
+ ],
228
+ "table_html": "<figure class=\"ltx_table\" id=\"S5.T5\">\n<div class=\"ltx_inline-block ltx_align_center ltx_transformed_outer\" id=\"S5.T5.2\" style=\"width:177.7pt;height:47.5pt;vertical-align:-0.0pt;\"><span class=\"ltx_transformed_inner\" style=\"transform:translate(-45.8pt,12.2pt) scale(0.66,0.66) ;\">\n<table class=\"ltx_tabular ltx_guessed_headers ltx_align_middle\" id=\"S5.T5.2.1\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S5.T5.2.1.1.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T5.2.1.1.1.1\">Teacher</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T5.2.1.1.1.2\">RN-56</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T5.2.1.1.1.3\">RN-110</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T5.2.1.1.1.4\">WRN-40-2</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T5.2.1.1.1.5\">VGG-13</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S5.T5.2.1.2.1\">\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T5.2.1.2.1.1\">Student</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T5.2.1.2.1.2\">RN-20</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T5.2.1.2.1.3\">RN-32</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T5.2.1.2.1.4\">WRN-16-2</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T5.2.1.2.1.5\">VGG-8</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T5.2.1.3.2\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S5.T5.2.1.3.2.1\">Ours w/o US</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S5.T5.2.1.3.2.2\">71.16</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S5.T5.2.1.3.2.3\">73.68</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S5.T5.2.1.3.2.4\">75.61</th>\n<th class=\"ltx_td ltx_align_center 
ltx_th ltx_th_column ltx_border_t\" id=\"S5.T5.2.1.3.2.5\">73.57</th>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T5.2.1.4.3\">\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T5.2.1.4.3.1\">Ours w US</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T5.2.1.4.3.2\">71.40</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T5.2.1.4.3.3\">73.81</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T5.2.1.4.3.4\">75.79</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T5.2.1.4.3.5\">73.75</td>\n</tr>\n</tbody>\n</table>\n</span></div>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\"><span class=\"ltx_text\" id=\"S5.T5.3.1.1\" style=\"font-size:90%;\">Table 5</span>: </span><span class=\"ltx_text\" id=\"S5.T5.4.2\" style=\"font-size:90%;\">Ablation study of the uncertainty score (US) feature. </span></figcaption>\n</figure>",
229
+ "perturb_sentence_id": 4,
230
+ "output": {
231
+ "perturbed_statement": "[paragraph id = 4] As shown in Table 5 , when incorporating uncertainty score into state representation, our method shows an improvement of 1.04% (72.20% vs 71.16%) in the RN-56 & RN-20 teacher-student pair. This enhancement verifies the effectiveness of our designed uncertainty score, which enables the agent to make wiser decisions by taking into account the student model s mastery of the training instances.",
232
+ "perturbed_explanation": "Original Explanation: 1. Incorporating the uncertainty score into the state representation led to an improvement of 0.24% (71.40% vs 71.16%) in accuracy for the RN-56 & RN-20 Network Pair. 2. This indicates that the uncertainty score enables more effective decision-making by considering the student model's training instance mastery.\n\nExplanation of the Inaccuracy: The provided statement erroneously claims an improvement of 1.04% (72.20% vs 71.16%) with the inclusion of the uncertainty score, which substantially exceeds the described actual improvement of 0.24% (71.40% vs 71.16%). This misstatement overstates the effectiveness of the designed uncertainty score."
233
+ }
234
+ },
235
+ {
236
+ "path": "table_paper/2407.00115v3.json",
237
+ "table_id": "6",
238
+ "section": "5.2",
239
+ "all_context": [
240
+ "In the ablation studies, we evaluate the performance of the uncertainty score that is included in our state representation, the instance reward calibration scheme, the efficient exploration strategy, and different high-quality training example selection strategies.",
241
+ "All experiments are conducted on the CIFAR-100 dataset with respect to the image classification task, and utilize the Vanilla KD framework.",
242
+ "Uncertainty score.",
243
+ "We conduct experiments on 4 sets of teacher-student network pairs to test the effectiveness of the uncertainty score in our state representation.",
244
+ "As shown in Table 5 , when incorporating uncertainty score into state representation, our method shows an improvement of 0.24% (71.40% vs 71.16%) in the RN-56 & RN-20 teacher-student pair.",
245
+ "This enhancement verifies the effectiveness of our designed uncertainty score, which enables the agent to make wiser decisions by taking into account the student model s mastery of the training instances.",
246
+ "Instance reward calibration.",
247
+ "As shown in Table 6 , when incorporating an instance reward calibration strategy into our RLKD method, a promotive effect across 4 different sets of the teacher-student pairs (RN-56 & RN-20, etc.)",
248
+ "is achieved.",
249
+ "E.g., our instance temperature calibration strategy boosts the performance of RN-110 & RN-32 pair by 0.55% (73.81% vs 73.26%).",
250
+ "We believe the effectiveness of the instance reward calibration strategy lies in its ability to enable the agent to more accurately perceive the rewards resulting from each of its instance temperature adjustment actions, thereby enhancing its capacity to update its policy for performing the action.",
251
+ "Efficient exploration.",
252
+ "As shown in Table 7 , we conduct ablation experiments on our efficient exploration strategy across 4 teacher-student pairs.",
253
+ "The experimental results demonstrate that our effective exploration strategy facilitates performance of the student model across 4 teacher-student pairs.",
254
+ "In the experiments involving the RN-56 & RN-20 teacher-student pair, our efficient exploration strategy results in a performance improvement of 0.37% (71.40% vs 71.03%).",
255
+ "We attribute this success to the strategy enables the agent to learn valuable instance temperature adjustment policy faster, allowing the student model to acquire more useful knowledge during the early stages of KD.",
256
+ "Selection of high-quality training examples.",
257
+ "As shown in Table 8 , we conduct experiments on CIFAR-100 to compare different strategies for selecting the high-quality training examples.",
258
+ "Interestingly, we observe that when using the top 10% of high-quality training data, the performance of the student model in the teacher-student pair RN-56 & RN-20 is 70.92%, which is not as good as the performance 71.21% of the student model when using the training data ranked from 10% to 20%.",
259
+ "This phenomenon is also observed in the teacher-student pair WRN-40-2 & WRN-16-2.",
260
+ "We think this may due to utilizing the top 10% samples caused overfitting in the agent.",
261
+ "Furthermore, in the teacher-student pair RN-56 & RN-20, when conducting the mix-up method on the training data ranked from 10% to 20% using the training data ranked 40% to 50%, there is a performance increase of 0.19% (71.40% vs 71.21%).",
262
+ "The experimental results verify the validity of our mix-up method that combines instances of varying knowledge values can produce high-quality training data.",
263
+ ""
264
+ ],
265
+ "target_context_ids": [
266
+ 7,
267
+ 8,
268
+ 9,
269
+ 10
270
+ ],
271
+ "selected_paragraphs": [
272
+ "[paragraph id = 7] As shown in Table 6 , when incorporating an instance reward calibration strategy into our RLKD method, a promotive effect across 4 different sets of the teacher-student pairs (RN-56 & RN-20, etc.)",
273
+ "[paragraph id = 8] is achieved.",
274
+ "[paragraph id = 9] E.g., our instance temperature calibration strategy boosts the performance of RN-110 & RN-32 pair by 0.55% (73.81% vs 73.26%).",
275
+ "[paragraph id = 10] We believe the effectiveness of the instance reward calibration strategy lies in its ability to enable the agent to more accurately perceive the rewards resulting from each of its instance temperature adjustment actions, thereby enhancing its capacity to update its policy for performing the action."
276
+ ],
277
+ "table_html": "<figure class=\"ltx_table\" id=\"S5.T6\">\n<div class=\"ltx_inline-block ltx_align_center ltx_transformed_outer\" id=\"S5.T6.2\" style=\"width:181.3pt;height:47.5pt;vertical-align:-0.0pt;\"><span class=\"ltx_transformed_inner\" style=\"transform:translate(-46.7pt,12.2pt) scale(0.66,0.66) ;\">\n<table class=\"ltx_tabular ltx_guessed_headers ltx_align_middle\" id=\"S5.T6.2.1\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S5.T6.2.1.1.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T6.2.1.1.1.1\">Teacher</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T6.2.1.1.1.2\">RN-56</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T6.2.1.1.1.3\">RN-110</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T6.2.1.1.1.4\">WRN-40-2</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T6.2.1.1.1.5\">VGG-13</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S5.T6.2.1.2.1\">\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T6.2.1.2.1.1\">Student</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T6.2.1.2.1.2\">RN-20</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T6.2.1.2.1.3\">RN-32</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T6.2.1.2.1.4\">WRN-16-2</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T6.2.1.2.1.5\">VGG-8</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T6.2.1.3.2\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S5.T6.2.1.3.2.1\">Ours w/o IRA</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S5.T6.2.1.3.2.2\">70.91</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S5.T6.2.1.3.2.3\">73.26</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S5.T6.2.1.3.2.4\">75.39</th>\n<th class=\"ltx_td ltx_align_center 
ltx_th ltx_th_column ltx_border_t\" id=\"S5.T6.2.1.3.2.5\">73.32</th>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T6.2.1.4.3\">\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T6.2.1.4.3.1\">Ours w IRA</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T6.2.1.4.3.2\">71.40</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T6.2.1.4.3.3\">73.81</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T6.2.1.4.3.4\">75.79</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T6.2.1.4.3.5\">73.75</td>\n</tr>\n</tbody>\n</table>\n</span></div>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\"><span class=\"ltx_text\" id=\"S5.T6.3.1.1\" style=\"font-size:90%;\">Table 6</span>: </span><span class=\"ltx_text\" id=\"S5.T6.4.2\" style=\"font-size:90%;\">Ablation on instance reward calibration (IRA) strategy.</span></figcaption>\n</figure>",
278
+ "perturb_sentence_id": 9,
279
+ "output": {
280
+ "perturbed_statement": "[paragraph id = 9] E.g., our instance reward calibration strategy diminishes the performance of RN-110 & RN-32 pair by 0.55% (73.81% vs 73.26%).We believe the effectiveness of the instance reward calibration strategy lies in its ability to enable the agent to more accurately perceive the rewards resulting from each of its instance temperature adjustment actions, thereby enhancing its capacity to update its policy for performing the action.",
281
+ "perturbed_explanation": "The original explanation: The statement highlights the effect of an instance reward calibration strategy on the RN-110 & RN-32 pair, emphasizing its role in improving the performance by 0.55% through more accurate reward perception and policy enhancement. 1. The claim in the statement that the strategy diminishes performance is factually incorrect, as the provided performance metrics display an increase from 73.26% to 73.81%, indicating an improvement rather than a decrease. 2. The evaluation thus reaffirms the promotive influence of the calibration strategy on performance, contrasting with the diminishing effect mentioned."
282
+ }
283
+ },
284
+ {
285
+ "path": "table_paper/2407.00115v3.json",
286
+ "table_id": "7",
287
+ "section": "5.2",
288
+ "all_context": [
289
+ "In the ablation studies, we evaluate the performance of the uncertainty score that is included in our state representation, the instance reward calibration scheme, the efficient exploration strategy, and different high-quality training example selection strategies.",
290
+ "All experiments are conducted on the CIFAR-100 dataset with respect to the image classification task, and utilize the Vanilla KD framework.",
291
+ "Uncertainty score.",
292
+ "We conduct experiments on 4 sets of teacher-student network pairs to test the effectiveness of the uncertainty score in our state representation.",
293
+ "As shown in Table 5 , when incorporating uncertainty score into state representation, our method shows an improvement of 0.24% (71.40% vs 71.16%) in the RN-56 & RN-20 teacher-student pair.",
294
+ "This enhancement verifies the effectiveness of our designed uncertainty score, which enables the agent to make wiser decisions by taking into account the student model s mastery of the training instances.",
295
+ "Instance reward calibration.",
296
+ "As shown in Table 6 , when incorporating an instance reward calibration strategy into our RLKD method, a promotive effect across 4 different sets of the teacher-student pairs (RN-56 & RN-20, etc.)",
297
+ "is achieved.",
298
+ "E.g., our instance temperature calibration strategy boosts the performance of RN-110 & RN-32 pair by 0.55% (73.81% vs 73.26%).",
299
+ "We believe the effectiveness of the instance reward calibration strategy lies in its ability to enable the agent to more accurately perceive the rewards resulting from each of its instance temperature adjustment actions, thereby enhancing its capacity to update its policy for performing the action.",
300
+ "Efficient exploration.",
301
+ "As shown in Table 7 , we conduct ablation experiments on our efficient exploration strategy across 4 teacher-student pairs.",
302
+ "The experimental results demonstrate that our effective exploration strategy facilitates performance of the student model across 4 teacher-student pairs.",
303
+ "In the experiments involving the RN-56 & RN-20 teacher-student pair, our efficient exploration strategy results in a performance improvement of 0.37% (71.40% vs 71.03%).",
304
+ "We attribute this success to the strategy enables the agent to learn valuable instance temperature adjustment policy faster, allowing the student model to acquire more useful knowledge during the early stages of KD.",
305
+ "Selection of high-quality training examples.",
306
+ "As shown in Table 8 , we conduct experiments on CIFAR-100 to compare different strategies for selecting the high-quality training examples.",
307
+ "Interestingly, we observe that when using the top 10% of high-quality training data, the performance of the student model in the teacher-student pair RN-56 & RN-20 is 70.92%, which is not as good as the performance 71.21% of the student model when using the training data ranked from 10% to 20%.",
308
+ "This phenomenon is also observed in the teacher-student pair WRN-40-2 & WRN-16-2.",
309
+ "We think this may due to utilizing the top 10% samples caused overfitting in the agent.",
310
+ "Furthermore, in the teacher-student pair RN-56 & RN-20, when conducting the mix-up method on the training data ranked from 10% to 20% using the training data ranked 40% to 50%, there is a performance increase of 0.19% (71.40% vs 71.21%).",
311
+ "The experimental results verify the validity of our mix-up method that combines instances of varying knowledge values can produce high-quality training data.",
312
+ ""
313
+ ],
314
+ "target_context_ids": [
315
+ 12,
316
+ 13,
317
+ 14,
318
+ 15
319
+ ],
320
+ "selected_paragraphs": [
321
+ "[paragraph id = 12] As shown in Table 7 , we conduct ablation experiments on our efficient exploration strategy across 4 teacher-student pairs.",
322
+ "[paragraph id = 13] The experimental results demonstrate that our effective exploration strategy facilitates performance of the student model across 4 teacher-student pairs.",
323
+ "[paragraph id = 14] In the experiments involving the RN-56 & RN-20 teacher-student pair, our efficient exploration strategy results in a performance improvement of 0.37% (71.40% vs 71.03%).",
324
+ "[paragraph id = 15] We attribute this success to the strategy enables the agent to learn valuable instance temperature adjustment policy faster, allowing the student model to acquire more useful knowledge during the early stages of KD."
325
+ ],
326
+ "table_html": "<figure class=\"ltx_table\" id=\"S5.T7\">\n<div class=\"ltx_inline-block ltx_align_center ltx_transformed_outer\" id=\"S5.T7.2\" style=\"width:178.1pt;height:47.5pt;vertical-align:-0.0pt;\"><span class=\"ltx_transformed_inner\" style=\"transform:translate(-45.9pt,12.2pt) scale(0.66,0.66) ;\">\n<table class=\"ltx_tabular ltx_guessed_headers ltx_align_middle\" id=\"S5.T7.2.1\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S5.T7.2.1.1.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T7.2.1.1.1.1\">Teacher</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T7.2.1.1.1.2\">RN-56</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T7.2.1.1.1.3\">RN-110</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T7.2.1.1.1.4\">WRN-40-2</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T7.2.1.1.1.5\">VGG-13</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S5.T7.2.1.2.1\">\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T7.2.1.2.1.1\">Student</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T7.2.1.2.1.2\">RN-20</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T7.2.1.2.1.3\">RN-32</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T7.2.1.2.1.4\">WRN-16-2</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S5.T7.2.1.2.1.5\">VGG-8</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T7.2.1.3.2\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S5.T7.2.1.3.2.1\">Ours w/o EE</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S5.T7.2.1.3.2.2\">71.03</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S5.T7.2.1.3.2.3\">73.52</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S5.T7.2.1.3.2.4\">75.50</th>\n<th class=\"ltx_td ltx_align_center 
ltx_th ltx_th_column ltx_border_t\" id=\"S5.T7.2.1.3.2.5\">73.45</th>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T7.2.1.4.3\">\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T7.2.1.4.3.1\">Ours w EE</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T7.2.1.4.3.2\">71.40</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T7.2.1.4.3.3\">73.81</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T7.2.1.4.3.4\">75.79</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T7.2.1.4.3.5\">73.75</td>\n</tr>\n</tbody>\n</table>\n</span></div>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\"><span class=\"ltx_text\" id=\"S5.T7.3.1.1\" style=\"font-size:90%;\">Table 7</span>: </span><span class=\"ltx_text\" id=\"S5.T7.4.2\" style=\"font-size:90%;\">Ablation study of the efficient exploration (EE) strategy.</span></figcaption>\n</figure>",
327
+ "perturb_sentence_id": 14,
328
+ "output": {
329
+ "perturbed_statement": "[paragraph id = 14] In the experiments involving the RN-56 & RN-20 teacher-student pair, our efficient exploration strategy results in a performance improvement of 0.15% (71.40% vs 71.25%). We attribute this success to the strategy enables the agent to learn valuable instance temperature adjustment policy faster, allowing the student model to acquire more useful knowledge during the early stages of KD.",
330
+ "perturbed_explanation": "Original Explanation: 1. The efficient exploration strategy demonstrated a performance improvement of 0.37% (71.40% vs 71.03%) for the RN-56 & RN-20 teacher-student pair, attributing success to the accelerated policy learning that benefited the student's knowledge acquisition during KD's initial stages. 2. The statement claims an improvement of 0.15% (71.40% vs 71.25%), which conflicts with the exact 0.37% improvement provided in the details, making the improvement range erroneous."
331
+ }
332
+ },
333
+ {
334
+ "path": "table_paper/2407.00115v3.json",
335
+ "table_id": "8",
336
+ "section": "5.2",
337
+ "all_context": [
338
+ "In the ablation studies, we evaluate the performance of the uncertainty score that is included in our state representation, the instance reward calibration scheme, the efficient exploration strategy, and different high-quality training example selection strategies.",
339
+ "All experiments are conducted on the CIFAR-100 dataset with respect to the image classification task, and utilize the Vanilla KD framework.",
340
+ "Uncertainty score.",
341
+ "We conduct experiments on 4 sets of teacher-student network pairs to test the effectiveness of the uncertainty score in our state representation.",
342
+ "As shown in Table 5 , when incorporating uncertainty score into state representation, our method shows an improvement of 0.24% (71.40% vs 71.16%) in the RN-56 & RN-20 teacher-student pair.",
343
+ "This enhancement verifies the effectiveness of our designed uncertainty score, which enables the agent to make wiser decisions by taking into account the student model's mastery of the training instances.",
344
+ "Instance reward calibration.",
345
+ "As shown in Table 6 , when incorporating an instance reward calibration strategy into our RLKD method, a promotive effect across 4 different sets of the teacher-student pairs (RN-56 & RN-20, etc.)",
346
+ "is achieved.",
347
+ "E.g., our instance reward calibration strategy boosts the performance of the RN-110 & RN-32 pair by 0.55% (73.81% vs 73.26%).",
348
+ "We believe the effectiveness of the instance reward calibration strategy lies in its ability to enable the agent to more accurately perceive the rewards resulting from each of its instance temperature adjustment actions, thereby enhancing its capacity to update its policy for performing the action.",
349
+ "Efficient exploration.",
350
+ "As shown in Table 7 , we conduct ablation experiments on our efficient exploration strategy across 4 teacher-student pairs.",
351
+ "The experimental results demonstrate that our effective exploration strategy facilitates performance of the student model across 4 teacher-student pairs.",
352
+ "In the experiments involving the RN-56 & RN-20 teacher-student pair, our efficient exploration strategy results in a performance improvement of 0.37% (71.40% vs 71.03%).",
353
+ "We attribute this success to the strategy enables the agent to learn valuable instance temperature adjustment policy faster, allowing the student model to acquire more useful knowledge during the early stages of KD.",
354
+ "Selection of high-quality training examples.",
355
+ "As shown in Table 8 , we conduct experiments on CIFAR-100 to compare different strategies for selecting the high-quality training examples.",
356
+ "Interestingly, we observe that when using the top 10% of high-quality training data, the performance of the student model in the teacher-student pair RN-56 & RN-20 is 70.92%, which is not as good as the performance 71.21% of the student model when using the training data ranked from 10% to 20%.",
357
+ "This phenomenon is also observed in the teacher-student pair WRN-40-2 & WRN-16-2.",
358
+ "We think this may be because utilizing the top 10% samples caused overfitting in the agent.",
359
+ "Furthermore, in the teacher-student pair RN-56 & RN-20, when conducting the mix-up method on the training data ranked from 10% to 20% using the training data ranked 40% to 50%, there is a performance increase of 0.19% (71.40% vs 71.21%).",
360
+ "The experimental results verify that our mix-up method, which combines instances of varying knowledge values, can produce high-quality training data.",
361
+ ""
362
+ ],
363
+ "target_context_ids": [
364
+ 16,
365
+ 17,
366
+ 18,
367
+ 19,
368
+ 20
369
+ ],
370
+ "selected_paragraphs": [
371
+ "[paragraph id = 16] Selection of high-quality training examples.",
372
+ "[paragraph id = 17] As shown in Table 8 , we conduct experiments on CIFAR-100 to compare different strategies for selecting the high-quality training examples.",
373
+ "[paragraph id = 18] Interestingly, we observe that when using the top 10% of high-quality training data, the performance of the student model in the teacher-student pair RN-56 & RN-20 is 70.92%, which is not as good as the performance 71.21% of the student model when using the training data ranked from 10% to 20%.",
374
+ "[paragraph id = 19] This phenomenon is also observed in the teacher-student pair WRN-40-2 & WRN-16-2.",
375
+ "[paragraph id = 20] We think this may be because utilizing the top 10% samples caused overfitting in the agent."
376
+ ],
377
+ "table_html": "<figure class=\"ltx_table\" id=\"S5.T8\">\n<div class=\"ltx_inline-block ltx_align_center ltx_transformed_outer\" id=\"S5.T8.8\" style=\"width:241.3pt;height:30.8pt;vertical-align:-0.0pt;\"><span class=\"ltx_transformed_inner\" style=\"transform:translate(-91.0pt,11.6pt) scale(0.57,0.57) ;\">\n<table class=\"ltx_tabular ltx_guessed_headers ltx_align_middle\" id=\"S5.T8.8.8\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S5.T8.8.8.8\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_th_row ltx_border_tt\" id=\"S5.T8.8.8.8.9\">Teacher</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_th_row ltx_border_r ltx_border_tt\" id=\"S5.T8.8.8.8.10\">Student</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T8.1.1.1.1\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T8.2.2.2.2\"></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T8.5.5.5.5\">\n \n</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_tt\" id=\"S5.T8.8.8.8.8\">\n \n</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S5.T8.8.8.9.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_t\" id=\"S5.T8.8.8.9.1.1\">72.34</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_r ltx_border_t\" id=\"S5.T8.8.8.9.1.2\">69.06</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T8.8.8.9.1.3\">70.92</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T8.8.8.9.1.4\">71.21</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T8.8.8.9.1.5\">71.27</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S5.T8.8.8.9.1.6\">71.40</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S5.T8.8.8.10.2\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_bb\" id=\"S5.T8.8.8.10.2.1\">75.61</th>\n<th class=\"ltx_td ltx_align_center ltx_th 
ltx_th_row ltx_border_bb ltx_border_r\" id=\"S5.T8.8.8.10.2.2\">73.26</th>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T8.8.8.10.2.3\">75.33</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T8.8.8.10.2.4\">75.57</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T8.8.8.10.2.5\">75.61</td>\n<td class=\"ltx_td ltx_align_center ltx_border_bb\" id=\"S5.T8.8.8.10.2.6\">75.79</td>\n</tr>\n</tbody>\n</table>\n</span></div>\n<figcaption class=\"ltx_caption ltx_centering\"><span class=\"ltx_tag ltx_tag_table\"><span class=\"ltx_text\" id=\"S5.T8.12.2.1\" style=\"font-size:90%;\">Table 8</span>: </span><span class=\"ltx_text\" id=\"S5.T8.10.1\" style=\"font-size:90%;\">Comparison of different high-quality training sample selection strategies. The teacher-student pairs corresponding to the second and third rows are respectively RN-56 &amp; RN-20 and WRN-40-2 &amp; WRN-16-2. “” denotes the mix-up operation.</span></figcaption>\n</figure>",
378
+ "perturb_sentence_id": 18,
379
+ "output": {
380
+ "perturbed_statement": "[paragraph id = 18] Interestingly, we observe that when using the top 5% of high-quality training data, the performance of the student model in the teacher-student pair RN-56 & RN-20 is 70.92%, which is not as good as the performance 71.21% of the student model when using the training data ranked from 10% to 20%. This phenomenon is also observed in the teacher-student pair WRN-40-2 & WRN-16-2.",
381
+ "perturbed_explanation": "The original explanation states: 1. The performance comparison is based on using the top 10% of data versus data ranked from 10% to 20%. 2. The performance values given are the result of this specific data segmentation. Alteration Impact Analysis: 3. The altered statement incorrectly specifies the top 5% instead of the top 10%. This misrepresentation contradicts the actual observed methodology and data selection criteria, which might lead to a misunderstanding of the experiment's outcomes."
382
+ }
383
+ }
384
+ ]
table_result/2407.00119v2_output.json ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "path": "table_paper/2407.00119v2.json",
4
+ "table_id": "1",
5
+ "section": "4.3",
6
+ "all_context": [
7
+ "To verify the superiority of the ELR-GNN method proposed in this paper, we report the experimental results of ELR-GNN and other comparative methods on the IEMOCAP and MELD data sets.",
8
+ "Experimental results are presented in Tables 1 and 2 .",
9
+ "IEMOCAP: As shown in Table 1 , the multi-modal emotion recognition method proposed in this paper achieved the best emotion recognition effect on the IEMOCAP data set, with an average accuracy of 70.6% and an average F1 value of 70.9%.",
10
+ "ELR-GCN proposes an effective modeling method of long-distance context latent dependencies for multi-modal emotion recognition.",
11
+ "In addition, ELR-GCN also combines early and adaptive late fusion methods to achieve the capture of fine-grained emotional features.",
12
+ "Among other comparison methods, the emotion recognition effect of DER-GCN is slightly lower than that of ELR-GNN, with an average accuracy of 69.7% and an average F1 value of 69.4%.",
13
+ "Although DER-GCN comprehensively considers event relationships and dialogue relationships between speakers to enhance the model's emotional understanding, it ignores latent context dependencies.",
14
+ "The emotion recognition effect of LR-GCN is lower than ELR-GNN and DER-GCN, with an average accuracy of 68.5% and an average F1 value of 68.3%.",
15
+ "Although LR-GCN considers latent dependencies between contexts, due to the high computational complexity of GCN, LR-GCN can only capture local latent dependencies.",
16
+ "The emotion recognition effects of other comparison methods are lower than ELR-GNN.",
17
+ "Likewise, none of them take into account potential dependencies on context.",
18
+ "Overall, the accuracy of ELR-GNN on the happy emotion analogy is much higher than that of other comparison algorithms, while the accuracy of other emotion categories is also relatively close to that of other comparison algorithms.",
19
+ "In addition, the F1 value of ELR-GNN on the happy and excited emotional analogies is much higher than that of other comparison algorithms.",
20
+ "At the same time, the F1 value of ELR-GNN on other emotional categories is also relatively close to other comparison algorithms.",
21
+ "The experimental results prove the superiority of the ELR-GNN method proposed in this paper.",
22
+ "MELD: As shown in Table 2 , the ELR-GNN method proposed in this article has the best emotion recognition effect on the MELD data set, with an average accuracy of 68.7% and an average F1 value of 69.9%.",
23
+ "The emotion recognition effect of DER-GCN is second, with an average accuracy of 69.7% and an average F1 value of 69.4%.",
24
+ "The emotion recognition effect of LR-GCN is lower than that of ELR-GNN and DER-GCN, with an average accuracy of 68.5% and an average F1 value of 68.3%.",
25
+ "The emotion recognition effects of other comparison methods are relatively poor, and the average accuracy and F1 value are lower than ELR-GNN.",
26
+ "The performance improvement may be attributed to ELR-GNN's ability to capture long-distance contextual latent dependencies and fine-grained fusion of dialogue relationships between speakers, contextual latent dependencies and contextual semantic information.",
27
+ "Overall, the accuracy of ELR-GNN on the neutral, fear, sadness, joy, and disgust emotion analogy is much higher than that of other comparison algorithms, while the accuracy of other emotion categories is also relatively close to that of other comparison algorithms.",
28
+ "In addition, the F1 value of ELR-GNN on the neutral, fear, sadness, joy, and anger emotional analogies is much higher than that of other comparison algorithms.",
29
+ "At the same time, the F1 value of ELR-GNN on other emotional categories is also relatively close to other comparison algorithms.",
30
+ "In addition, we find that ELR-GNN has better emotion recognition effects on the minority emotions fear and disgust, with relatively high accuracy and F1 value.",
31
+ "The experimental results prove the superiority of the ELR-GNN method proposed in this paper.",
32
+ "In addition, to intuitively illustrate that the running time of the ELR-GNN method proposed in this paper is better than other comparative methods, we report in Table 3 the running times of the ELR-GNN method and the other comparative methods on the IEMOCAP and MELD data sets.",
33
+ "As shown in Table 3 , the running time of the ELR-GNN method proposed in this paper on the IEMOCAP and MELD data sets is 41s and 91s respectively, which is significantly better than other comparison methods.",
34
+ "The running times of DialogueGCN are 58s and 127s respectively, which are lower than LR-GCN and DER-GCN, but the emotion recognition effect is relatively poor.",
35
+ "The running times of LR-GCN are 87s and 142s respectively.",
36
+ "The running times of DER-GCN are 125s and 189s respectively.",
37
+ "The experimental results prove the efficiency and effectiveness of the ELR-GNN method proposed in this paper.",
38
+ ""
39
+ ],
40
+ "target_context_ids": [
41
+ 2,
42
+ 3,
43
+ 4,
44
+ 5,
45
+ 6,
46
+ 7,
47
+ 8,
48
+ 9,
49
+ 10,
50
+ 11,
51
+ 12,
52
+ 13,
53
+ 14
54
+ ],
55
+ "selected_paragraphs": [
56
+ "[paragraph id = 2] IEMOCAP: As shown in Table 1 , the multi-modal emotion recognition method proposed in this paper achieved the best emotion recognition effect on the IEMOCAP data set, with an average accuracy of 70.6% and an average F1 value of 70.9%.",
57
+ "[paragraph id = 3] ELR-GCN proposes an effective modeling method of long-distance context latent dependencies for multi-modal emotion recognition.",
58
+ "[paragraph id = 4] In addition, ELR-GCN also combines early and adaptive late fusion methods to achieve the capture of fine-grained emotional features.",
59
+ "[paragraph id = 5] Among other comparison methods, the emotion recognition effect of DER-GCN is slightly lower than that of ELR-GNN, with an average accuracy of 69.7% and an average F1 value of 69.4%.",
60
+ "[paragraph id = 6] Although DER-GCN comprehensively considers event relationships and dialogue relationships between speakers to enhance the model's emotional understanding, it ignores latent context dependencies.",
61
+ "[paragraph id = 7] The emotion recognition effect of LR-GCN is lower than ELR-GNN and DER-GCN, with an average accuracy of 68.5% and an average F1 value of 68.3%.",
62
+ "[paragraph id = 8] Although LR-GCN considers latent dependencies between contexts, due to the high computational complexity of GCN, LR-GCN can only capture local latent dependencies.",
63
+ "[paragraph id = 9] The emotion recognition effects of other comparison methods are lower than ELR-GNN.",
64
+ "[paragraph id = 10] Likewise, none of them take into account potential dependencies on context.",
65
+ "[paragraph id = 11] Overall, the accuracy of ELR-GNN on the happy emotion analogy is much higher than that of other comparison algorithms, while the accuracy of other emotion categories is also relatively close to that of other comparison algorithms.",
66
+ "[paragraph id = 12] In addition, the F1 value of ELR-GNN on the happy and excited emotional analogies is much higher than that of other comparison algorithms.",
67
+ "[paragraph id = 13] At the same time, the F1 value of ELR-GNN on other emotional categories is also relatively close to other comparison algorithms.",
68
+ "[paragraph id = 14] The experimental results prove the superiority of the ELR-GNN method proposed in this paper."
69
+ ],
70
+ "table_html": "<figure class=\"ltx_table\" id=\"S4.T1\">\n<figcaption class=\"ltx_caption\"><span class=\"ltx_tag ltx_tag_table\">Table 1: </span>Comparison with other baseline models on the IEMOCAP dataset.</figcaption>\n<table class=\"ltx_tabular ltx_guessed_headers ltx_align_middle\" id=\"S4.T1.1\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S4.T1.1.1.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_th_row ltx_border_r ltx_border_t\" id=\"S4.T1.1.1.1.1\" rowspan=\"3\" style=\"padding:2.5pt 8.7pt;\"><span class=\"ltx_text\" id=\"S4.T1.1.1.1.1.1\">Methods</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" colspan=\"7\" id=\"S4.T1.1.1.1.2\" style=\"padding:2.5pt 8.7pt;\">IEMOCAP</th>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.2.2\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T1.1.2.2.1\" style=\"padding:2.5pt 8.7pt;\">Happy</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T1.1.2.2.2\" style=\"padding:2.5pt 8.7pt;\">Sad</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T1.1.2.2.3\" style=\"padding:2.5pt 8.7pt;\">Neutral</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T1.1.2.2.4\" style=\"padding:2.5pt 8.7pt;\">Angry</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T1.1.2.2.5\" style=\"padding:2.5pt 8.7pt;\">Excited</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T1.1.2.2.6\" style=\"padding:2.5pt 8.7pt;\">Frustrated</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T1.1.2.2.7\" style=\"padding:2.5pt 8.7pt;\">Average(w)</th>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.3.3\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T1.1.3.3.1\" style=\"padding:2.5pt 8.7pt;\">Acc. 
F1</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T1.1.3.3.2\" style=\"padding:2.5pt 8.7pt;\">Acc. F1</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T1.1.3.3.3\" style=\"padding:2.5pt 8.7pt;\">Acc. F1</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T1.1.3.3.4\" style=\"padding:2.5pt 8.7pt;\">Acc. F1</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T1.1.3.3.5\" style=\"padding:2.5pt 8.7pt;\">Acc. F1</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T1.1.3.3.6\" style=\"padding:2.5pt 8.7pt;\">Acc. F1</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T1.1.3.3.7\" style=\"padding:2.5pt 8.7pt;\">Acc. F1</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S4.T1.1.4.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r ltx_border_t\" id=\"S4.T1.1.4.1.1\" style=\"padding:2.5pt 8.7pt;\">TextCNN</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.1.4.1.2\" style=\"padding:2.5pt 8.7pt;\">27.7 29..8</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.1.4.1.3\" style=\"padding:2.5pt 8.7pt;\">57.1 53.8</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.1.4.1.4\" style=\"padding:2.5pt 8.7pt;\">34.3 40.1</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.1.4.1.5\" style=\"padding:2.5pt 8.7pt;\">61.1 52.4</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.1.4.1.6\" style=\"padding:2.5pt 8.7pt;\">46.1 50.0</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.1.4.1.7\" style=\"padding:2.5pt 8.7pt;\">62.9 55.7</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T1.1.4.1.8\" style=\"padding:2.5pt 8.7pt;\">48.9 48.1</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.5.2\">\n<th class=\"ltx_td ltx_align_left ltx_th 
ltx_th_row ltx_border_r\" id=\"S4.T1.1.5.2.1\" style=\"padding:2.5pt 8.7pt;\">bc-LSTM</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.5.2.2\" style=\"padding:2.5pt 8.7pt;\">29.1 34.4</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.5.2.3\" style=\"padding:2.5pt 8.7pt;\">57.1 60.8</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.5.2.4\" style=\"padding:2.5pt 8.7pt;\">54.1 51.8</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.5.2.5\" style=\"padding:2.5pt 8.7pt;\">57.0 56.7</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.5.2.6\" style=\"padding:2.5pt 8.7pt;\">51.1 57.9</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.5.2.7\" style=\"padding:2.5pt 8.7pt;\">67.1 58.9</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.5.2.8\" style=\"padding:2.5pt 8.7pt;\">55.2 54.9</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.6.3\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T1.1.6.3.1\" style=\"padding:2.5pt 8.7pt;\">MFN</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.6.3.2\" style=\"padding:2.5pt 8.7pt;\">24.0 34.1</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.6.3.3\" style=\"padding:2.5pt 8.7pt;\">65.6 70.5</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.6.3.4\" style=\"padding:2.5pt 8.7pt;\">55.5 52.1</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.6.3.5\" style=\"padding:2.5pt 8.7pt;\">72.3 66.8</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.6.3.6\" style=\"padding:2.5pt 8.7pt;\">64.3 62.1</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.6.3.7\" style=\"padding:2.5pt 8.7pt;\">67.9 62.5</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.6.3.8\" style=\"padding:2.5pt 8.7pt;\">60.1 59.9</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.7.4\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T1.1.7.4.1\" style=\"padding:2.5pt 8.7pt;\">CMN</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.7.4.2\" 
style=\"padding:2.5pt 8.7pt;\">25.0 30.3</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.7.4.3\" style=\"padding:2.5pt 8.7pt;\">55.9 62.4</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.7.4.4\" style=\"padding:2.5pt 8.7pt;\">52.8 52.3</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.7.4.5\" style=\"padding:2.5pt 8.7pt;\">61.7 59.8</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.7.4.6\" style=\"padding:2.5pt 8.7pt;\">55.5 60.2</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.7.4.7\" style=\"padding:2.5pt 8.7pt;\">\n<span class=\"ltx_text ltx_font_bold\" id=\"S4.T1.1.7.4.7.1\">71.1</span> 60.6</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.7.4.8\" style=\"padding:2.5pt 8.7pt;\">56.5 56.1</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.8.5\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T1.1.8.5.1\" style=\"padding:2.5pt 8.7pt;\">LFM</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.8.5.2\" style=\"padding:2.5pt 8.7pt;\">25.6 33.1</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.8.5.3\" style=\"padding:2.5pt 8.7pt;\">75.1 78.8</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.8.5.4\" style=\"padding:2.5pt 8.7pt;\">58.5 59.2</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.8.5.5\" style=\"padding:2.5pt 8.7pt;\">64.7 65.2</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.8.5.6\" style=\"padding:2.5pt 8.7pt;\">80.2 71.8</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.8.5.7\" style=\"padding:2.5pt 8.7pt;\">61.1 58.9</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.8.5.8\" style=\"padding:2.5pt 8.7pt;\">63.4 62.7</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.9.6\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T1.1.9.6.1\" style=\"padding:2.5pt 8.7pt;\">ICON</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.9.6.2\" style=\"padding:2.5pt 8.7pt;\">22.2 29.9</td>\n<td class=\"ltx_td ltx_align_center\" 
id=\"S4.T1.1.9.6.3\" style=\"padding:2.5pt 8.7pt;\">58.8 64.6</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.9.6.4\" style=\"padding:2.5pt 8.7pt;\">62.8 57.4</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.9.6.5\" style=\"padding:2.5pt 8.7pt;\">64.7 63.0</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.9.6.6\" style=\"padding:2.5pt 8.7pt;\">58.9 63.4</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.9.6.7\" style=\"padding:2.5pt 8.7pt;\">67.2 60.8</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.9.6.8\" style=\"padding:2.5pt 8.7pt;\">59.1 58.5</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.10.7\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T1.1.10.7.1\" style=\"padding:2.5pt 8.7pt;\">A-DMN</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.10.7.2\" style=\"padding:2.5pt 8.7pt;\">43.1 50.6</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.10.7.3\" style=\"padding:2.5pt 8.7pt;\">69.4 76.8</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.10.7.4\" style=\"padding:2.5pt 8.7pt;\">63.0 62.9</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.10.7.5\" style=\"padding:2.5pt 8.7pt;\">63.5 56.5</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.10.7.6\" style=\"padding:2.5pt 8.7pt;\">\n<span class=\"ltx_text ltx_font_bold\" id=\"S4.T1.1.10.7.6.1\">88.3</span> 77.9</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.10.7.7\" style=\"padding:2.5pt 8.7pt;\">53.3 55.7</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.10.7.8\" style=\"padding:2.5pt 8.7pt;\">64.6 64.3</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.11.8\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T1.1.11.8.1\" style=\"padding:2.5pt 8.7pt;\">DialogueGCN</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.11.8.2\" style=\"padding:2.5pt 8.7pt;\">40.6 42.7</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.11.8.3\" style=\"padding:2.5pt 8.7pt;\"><span 
class=\"ltx_text ltx_font_bold\" id=\"S4.T1.1.11.8.3.1\">89.1 84.5</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.11.8.4\" style=\"padding:2.5pt 8.7pt;\">62.0 63.5</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.11.8.5\" style=\"padding:2.5pt 8.7pt;\">67.5 64.1</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.11.8.6\" style=\"padding:2.5pt 8.7pt;\">65.5 63.1</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.11.8.7\" style=\"padding:2.5pt 8.7pt;\">64.1 66.9</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.11.8.8\" style=\"padding:2.5pt 8.7pt;\">65.2 64.1</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.12.9\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T1.1.12.9.1\" style=\"padding:2.5pt 8.7pt;\">RGAT</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.12.9.2\" style=\"padding:2.5pt 8.7pt;\">60.1 51.6</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.12.9.3\" style=\"padding:2.5pt 8.7pt;\">78.8 77.3</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.12.9.4\" style=\"padding:2.5pt 8.7pt;\">60.1 65.4</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.12.9.5\" style=\"padding:2.5pt 8.7pt;\">70.7 63.0</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.12.9.6\" style=\"padding:2.5pt 8.7pt;\">78.0 68.0</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.12.9.7\" style=\"padding:2.5pt 8.7pt;\">64.3 61.2</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.12.9.8\" style=\"padding:2.5pt 8.7pt;\">65.0 65.2</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.13.10\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T1.1.13.10.1\" style=\"padding:2.5pt 8.7pt;\">AGHMN</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.13.10.2\" style=\"padding:2.5pt 8.7pt;\">48.3 52.1</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.13.10.3\" style=\"padding:2.5pt 8.7pt;\">68.3 73.3</td>\n<td class=\"ltx_td ltx_align_center\" 
id=\"S4.T1.1.13.10.4\" style=\"padding:2.5pt 8.7pt;\">61.6 58.4</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.13.10.5\" style=\"padding:2.5pt 8.7pt;\">57.5 61.9</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.13.10.6\" style=\"padding:2.5pt 8.7pt;\">68.1 69.7</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.13.10.7\" style=\"padding:2.5pt 8.7pt;\">67.1 62.3</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.13.10.8\" style=\"padding:2.5pt 8.7pt;\">63.5 63.5</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.14.11\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T1.1.14.11.1\" style=\"padding:2.5pt 8.7pt;\">BiERU</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.14.11.2\" style=\"padding:2.5pt 8.7pt;\">54.2 31.5</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.14.11.3\" style=\"padding:2.5pt 8.7pt;\">80.6 84.2</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.14.11.4\" style=\"padding:2.5pt 8.7pt;\">64.7 60.2</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.14.11.5\" style=\"padding:2.5pt 8.7pt;\">67.9 65.7</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.14.11.6\" style=\"padding:2.5pt 8.7pt;\">62.8 74.1</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.14.11.7\" style=\"padding:2.5pt 8.7pt;\">61.9 61.3</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.14.11.8\" style=\"padding:2.5pt 8.7pt;\">66.1 64.7</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.15.12\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T1.1.15.12.1\" style=\"padding:2.5pt 8.7pt;\">CoMPM</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.15.12.2\" style=\"padding:2.5pt 8.7pt;\">59.9 60.7</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.15.12.3\" style=\"padding:2.5pt 8.7pt;\">78.0 82.2</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.15.12.4\" style=\"padding:2.5pt 8.7pt;\">60.4 63.0</td>\n<td class=\"ltx_td ltx_align_center\" 
id=\"S4.T1.1.15.12.5\" style=\"padding:2.5pt 8.7pt;\">70.2 59.9</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.15.12.6\" style=\"padding:2.5pt 8.7pt;\">85.8 78.2</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.15.12.7\" style=\"padding:2.5pt 8.7pt;\">62.9 59.5</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.15.12.8\" style=\"padding:2.5pt 8.7pt;\">67.7 67.2</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.16.13\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T1.1.16.13.1\" style=\"padding:2.5pt 8.7pt;\">EmoBERTa</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.16.13.2\" style=\"padding:2.5pt 8.7pt;\">56.9 56.4</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.16.13.3\" style=\"padding:2.5pt 8.7pt;\">79.1 83.0</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.16.13.4\" style=\"padding:2.5pt 8.7pt;\">64.0 61.5</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.16.13.5\" style=\"padding:2.5pt 8.7pt;\">70.6 69.6</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.16.13.6\" style=\"padding:2.5pt 8.7pt;\">86.0 78.0</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.16.13.7\" style=\"padding:2.5pt 8.7pt;\">63.8 68.7</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.16.13.8\" style=\"padding:2.5pt 8.7pt;\">67.3 67.3</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.17.14\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T1.1.17.14.1\" style=\"padding:2.5pt 8.7pt;\">COGMEN</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.17.14.2\" style=\"padding:2.5pt 8.7pt;\">57.4 51.9</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.17.14.3\" style=\"padding:2.5pt 8.7pt;\">81.4 81.7</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.17.14.4\" style=\"padding:2.5pt 8.7pt;\">65.4 <span class=\"ltx_text ltx_font_bold\" id=\"S4.T1.1.17.14.4.1\">68.6</span>\n</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.17.14.5\" 
style=\"padding:2.5pt 8.7pt;\">69.5 66.0</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.17.14.6\" style=\"padding:2.5pt 8.7pt;\">83.3 75.3</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.17.14.7\" style=\"padding:2.5pt 8.7pt;\">63.8 68.2</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.17.14.8\" style=\"padding:2.5pt 8.7pt;\">68.2 67.6</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.18.15\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T1.1.18.15.1\" style=\"padding:2.5pt 8.7pt;\">CTNet</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.18.15.2\" style=\"padding:2.5pt 8.7pt;\">47.9 51.3</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.18.15.3\" style=\"padding:2.5pt 8.7pt;\">78.0 79.9</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.18.15.4\" style=\"padding:2.5pt 8.7pt;\">\n<span class=\"ltx_text ltx_font_bold\" id=\"S4.T1.1.18.15.4.1\">69.0</span> 65.8</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.18.15.5\" style=\"padding:2.5pt 8.7pt;\">\n<span class=\"ltx_text ltx_font_bold\" id=\"S4.T1.1.18.15.5.1\">72.9</span> 67.2</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.18.15.6\" style=\"padding:2.5pt 8.7pt;\">85.3 78.7</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.18.15.7\" style=\"padding:2.5pt 8.7pt;\">52.2 58.8</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.18.15.8\" style=\"padding:2.5pt 8.7pt;\">68.0 67.5</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.19.16\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T1.1.19.16.1\" style=\"padding:2.5pt 8.7pt;\">LR-GCN</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.19.16.2\" style=\"padding:2.5pt 8.7pt;\">54.2 55.5</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.19.16.3\" style=\"padding:2.5pt 8.7pt;\">81.6 79.1</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.19.16.4\" style=\"padding:2.5pt 8.7pt;\">59.1 63.8</td>\n<td class=\"ltx_td 
ltx_align_center\" id=\"S4.T1.1.19.16.5\" style=\"padding:2.5pt 8.7pt;\">69.4 69.0</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.19.16.6\" style=\"padding:2.5pt 8.7pt;\">76.3 74.0</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.19.16.7\" style=\"padding:2.5pt 8.7pt;\">68.2 <span class=\"ltx_text ltx_font_bold\" id=\"S4.T1.1.19.16.7.1\">68.9</span>\n</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.19.16.8\" style=\"padding:2.5pt 8.7pt;\">68.5 68.3</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.20.17\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T1.1.20.17.1\" style=\"padding:2.5pt 8.7pt;\">DER-GCN</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.20.17.2\" style=\"padding:2.5pt 8.7pt;\">60.7 58.8</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.20.17.3\" style=\"padding:2.5pt 8.7pt;\">75.9 79.8</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.20.17.4\" style=\"padding:2.5pt 8.7pt;\">66.5 61.5</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.20.17.5\" style=\"padding:2.5pt 8.7pt;\">71.3 <span class=\"ltx_text ltx_font_bold\" id=\"S4.T1.1.20.17.5.1\">72.1</span>\n</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.20.17.6\" style=\"padding:2.5pt 8.7pt;\">71.1 73.3</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.20.17.7\" style=\"padding:2.5pt 8.7pt;\">66.1 67.8</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T1.1.20.17.8\" style=\"padding:2.5pt 8.7pt;\">69.7 69.4</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T1.1.21.18\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_b ltx_border_r\" id=\"S4.T1.1.21.18.1\" style=\"padding:2.5pt 8.7pt;\">ELR-GCN</th>\n<td class=\"ltx_td ltx_align_center ltx_border_b\" id=\"S4.T1.1.21.18.2\" style=\"padding:2.5pt 8.7pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T1.1.21.18.2.1\">64.7 62.9</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_b\" id=\"S4.T1.1.21.18.3\" style=\"padding:2.5pt 
8.7pt;\">75.7 80.8</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b\" id=\"S4.T1.1.21.18.4\" style=\"padding:2.5pt 8.7pt;\">66.2 62.4</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b\" id=\"S4.T1.1.21.18.5\" style=\"padding:2.5pt 8.7pt;\">70.7 70.0</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b\" id=\"S4.T1.1.21.18.6\" style=\"padding:2.5pt 8.7pt;\">76.8 <span class=\"ltx_text ltx_font_bold\" id=\"S4.T1.1.21.18.6.1\">78.6</span>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b\" id=\"S4.T1.1.21.18.7\" style=\"padding:2.5pt 8.7pt;\">67.9 68.1</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b\" id=\"S4.T1.1.21.18.8\" style=\"padding:2.5pt 8.7pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T1.1.21.18.8.1\">70.6 70.9</span></td>\n</tr>\n</tbody>\n</table>\n</figure>",
71
+ "perturb_sentence_id": 2,
72
+ "output": {
73
+ "perturbed_statement": "[paragraph id = 2] IEMOCAP: As shown in Table 1, the multi-modal emotion recognition method proposed in this paper achieved the best emotion recognition effect on the IEMOCAP data set, with an average accuracy of 68.5% and an average F1 value of 68.3%. ELR-GCN proposes an effective modeling method of long-distance context latent dependencies for multi-modal emotion recognition.",
74
+ "perturbed_explanation": "1. The original explanation highlights the efficacy of the ELR-GCN method on the IEMOCAP dataset with specific accuracy metrics mentioned. 2. The statement now incorrectly lists the accuracy as 68.5% and the F1 value as 68.3%, while the actual values are 70.6% and 70.9%, as shown in the context of the experiment results."
75
+ }
76
+ },
77
+ {
78
+ "path": "table_paper/2407.00119v2.json",
79
+ "table_id": "2",
80
+ "section": "4.3",
81
+ "all_context": [
82
+ "To verify the superiority of the ELR-GNN method proposed in this paper, we report the experimental results of ELR-GNN and other comparative methods on the IEMOCAP and MELD data sets.",
83
+ "Experimental results are presented in Tables 1 and 2 .",
84
+ "IEMOCAP: As shown in Table 1 , the multi-modal emotion recognition method proposed in this paper achie-ved the best emotion recognition effect on the IEMOCAP data set, with an average accuracy of 70.6% and an average F1 value of 70.9%.",
85
+ "ELR-GCN proposes an effective modeling method of long-distance context latent dependencies for multi-modal emotion recognition.",
86
+ "In addition, ELR-GCN also combines early and adaptive late fusion methods to achieve the capture of fine-grained emotional features.",
87
+ "Among other comparison methods, the emotion recognition effect of DER-GCN is slightly lower than that of ELR-GNN, with an average accuracy of 69.7% and an average F1 value of 69.4%.",
88
+ "Although DER-GCN comprehensively considers event relationships and dialogue relationships between speakers to enhance the model s emotional understanding, it ignores latent context dependencies.",
89
+ "The emotion recognition effect of LR-GCN is lower than ELR-GNN and DER-GCN, with an average accuracy of 68.5% and an average F1 value of 68.3%.",
90
+ "Although LR-GCN considers latent dependencies between contexts, due to the high computational complexity of GCN, LR-GCN can only capture local latent dependencies.",
91
+ "The emotion recognition effects of other comparison methods are lower than ELR-GNN.",
92
+ "Likewise, none of them take into account potential dependencies on context.",
93
+ "Overall, the accuracy of ELR-GNN on the happy emotion analogy is much higher than that of other comparison algorithms, while the accuracy of other emotion categories is also relatively close to that of other comparison algorithms.",
94
+ "In addition, the F1 value of ELR-GNN on the happy and excited emotional analogies is much higher than that of other comparison algorithms.",
95
+ "At the same time, the F1 value of ELR-GNN on other emotional categories is also relatively close to other comparison algorithms.",
96
+ "The experimental results prove the superiority of the ELR-GNN method proposed in this paper.",
97
+ "MELD: As shown in Table 2 , The ELR-GNN method proposed in this article has the best emotion recognition effect on the MELD data set, with an average accuracy of 68.7% and an average F1 value of 69.9%.",
98
+ "The emotion recognition effect of DER-GCN is second, with an average accuracy of 69.7% and an average F1 value of 69.4%.",
99
+ "The emotion recognition effect of LR-GCN is lower than that of ELR-GNN and DER-GCN, with an average accuracy of 68.5% and an average F1 value of 68.3%.",
100
+ "The emotion recognition effects of other comparison methods are relatively poor, and the average accuracy and F1 value are lower than ELR-GNN.",
101
+ "The performance improvement may be attributed to ELR-GNN s ability to capture long-distance contextual latent dependencies and fine-grained fusion of dialogue relationships between speakers, contextual latent dependencies and contextual semantic information.",
102
+ "Overall, the accuracy of ELR-GNN on the neutral, fear, sadness, joy, and disgust emotion analogy is much higher than that of other comparison algorithms, while the accuracy of other emotion categories is also relatively close to that of other comparison algorithms.",
103
+ "In addition, the F1 value of ELR-GNN on the neutral, fear, sadness, joy, and anger emotional analogies is much higher than that of other comparison algorithms.",
104
+ "At the same time, the F1 value of ELR-GNN on other emotional categories is also relatively close to other comparison algorithms.",
105
+ "In addition, we find that ELR-GNN has better emotion recognition effects on the minority emotions fear and disgust, with relatively high accuracy and F1 value.",
106
+ "The experimental results prove the superiority of the ELR-GNN method proposed in this paper.",
107
+ "In addition, to intuitively illustrate that the running time of the ELR-GNN method proposed in this paper is better than other comparative methods, we statistics in Table 3 the running time of other comparative methods of the ELR-GNN method on the IEMOCAP and MELD data sets.",
108
+ "As shown in Table 3 , the running time of the ELR-GNN method proposed in this paper on the IEMOCAP and MELD data sets is 41s and 91s respectively, which is significantly better than other comparison methods.",
109
+ "The running times of DialogueGCN are 58s and 127s respectively, which are lower than LR-GCN and DER-GCN, but the emotion recognition effect is relatively poor.",
110
+ "The running times of LR-GCN are 87s and 142s respectively.",
111
+ "The running times of DER-GCN are 125s and 189s respectively.",
112
+ "The experimental results prove the efficiency and effectiveness of the ELR-GNN method proposed in this paper.",
113
+ ""
114
+ ],
115
+ "target_context_ids": [
116
+ 1,
117
+ 15,
118
+ 16,
119
+ 17,
120
+ 18,
121
+ 19,
122
+ 20,
123
+ 21,
124
+ 22,
125
+ 23,
126
+ 24,
127
+ 25
128
+ ],
129
+ "selected_paragraphs": [
130
+ "[paragraph id = 1] Experimental results are presented in Tables 1 and 2 .",
131
+ "[paragraph id = 15] MELD: As shown in Table 2 , The ELR-GNN method proposed in this article has the best emotion recognition effect on the MELD data set, with an average accuracy of 68.7% and an average F1 value of 69.9%.",
132
+ "[paragraph id = 16] The emotion recognition effect of DER-GCN is second, with an average accuracy of 69.7% and an average F1 value of 69.4%.",
133
+ "[paragraph id = 17] The emotion recognition effect of LR-GCN is lower than that of ELR-GNN and DER-GCN, with an average accuracy of 68.5% and an average F1 value of 68.3%.",
134
+ "[paragraph id = 18] The emotion recognition effects of other comparison methods are relatively poor, and the average accuracy and F1 value are lower than ELR-GNN.",
135
+ "[paragraph id = 19] The performance improvement may be attributed to ELR-GNN s ability to capture long-distance contextual latent dependencies and fine-grained fusion of dialogue relationships between speakers, contextual latent dependencies and contextual semantic information.",
136
+ "[paragraph id = 20] Overall, the accuracy of ELR-GNN on the neutral, fear, sadness, joy, and disgust emotion analogy is much higher than that of other comparison algorithms, while the accuracy of other emotion categories is also relatively close to that of other comparison algorithms.",
137
+ "[paragraph id = 21] In addition, the F1 value of ELR-GNN on the neutral, fear, sadness, joy, and anger emotional analogies is much higher than that of other comparison algorithms.",
138
+ "[paragraph id = 22] At the same time, the F1 value of ELR-GNN on other emotional categories is also relatively close to other comparison algorithms.",
139
+ "[paragraph id = 23] In addition, we find that ELR-GNN has better emotion recognition effects on the minority emotions fear and disgust, with relatively high accuracy and F1 value.",
140
+ "[paragraph id = 24] The experimental results prove the superiority of the ELR-GNN method proposed in this paper.",
141
+ "[paragraph id = 25] In addition, to intuitively illustrate that the running time of the ELR-GNN method proposed in this paper is better than other comparative methods, we statistics in Table 3 the running time of other comparative methods of the ELR-GNN method on the IEMOCAP and MELD data sets."
142
+ ],
143
+ "table_html": "<figure class=\"ltx_table\" id=\"S4.T2\">\n<figcaption class=\"ltx_caption\"><span class=\"ltx_tag ltx_tag_table\">Table 2: </span>Comparison with other baseline models on the MELD dataset.</figcaption>\n<table class=\"ltx_tabular ltx_guessed_headers ltx_align_middle\" id=\"S4.T2.1\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S4.T2.1.1.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_column ltx_th_row ltx_border_r ltx_border_t\" id=\"S4.T2.1.1.1.1\" rowspan=\"3\" style=\"padding:2.5pt 5.4pt;\"><span class=\"ltx_text\" id=\"S4.T2.1.1.1.1.1\">Methods</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" colspan=\"8\" id=\"S4.T2.1.1.1.2\" style=\"padding:2.5pt 5.4pt;\">MELD</th>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.2.2\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T2.1.2.2.1\" style=\"padding:2.5pt 5.4pt;\">Neutral</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T2.1.2.2.2\" style=\"padding:2.5pt 5.4pt;\">Surprise</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T2.1.2.2.3\" style=\"padding:2.5pt 5.4pt;\">Fear</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T2.1.2.2.4\" style=\"padding:2.5pt 5.4pt;\">Sadness</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T2.1.2.2.5\" style=\"padding:2.5pt 5.4pt;\">Joy</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T2.1.2.2.6\" style=\"padding:2.5pt 5.4pt;\">Disgust</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T2.1.2.2.7\" style=\"padding:2.5pt 5.4pt;\">Anger</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T2.1.2.2.8\" style=\"padding:2.5pt 5.4pt;\">Average(w)</th>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.3.3\">\n<th class=\"ltx_td ltx_align_center ltx_th 
ltx_th_column ltx_border_t\" id=\"S4.T2.1.3.3.1\" style=\"padding:2.5pt 5.4pt;\">Acc. F1</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T2.1.3.3.2\" style=\"padding:2.5pt 5.4pt;\">Acc. F1</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T2.1.3.3.3\" style=\"padding:2.5pt 5.4pt;\">Acc. F1</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T2.1.3.3.4\" style=\"padding:2.5pt 5.4pt;\">Acc. F1</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T2.1.3.3.5\" style=\"padding:2.5pt 5.4pt;\">Acc. F1</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T2.1.3.3.6\" style=\"padding:2.5pt 5.4pt;\">Acc. F1</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T2.1.3.3.7\" style=\"padding:2.5pt 5.4pt;\">Acc. F1</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T2.1.3.3.8\" style=\"padding:2.5pt 5.4pt;\">Acc. 
F1</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S4.T2.1.4.1\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r ltx_border_t\" id=\"S4.T2.1.4.1.1\" style=\"padding:2.5pt 5.4pt;\">TextCNN</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.1.4.1.2\" style=\"padding:2.5pt 5.4pt;\">76.2 74.9</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.1.4.1.3\" style=\"padding:2.5pt 5.4pt;\">43.3 45.5</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.1.4.1.4\" style=\"padding:2.5pt 5.4pt;\">4.6 3.7</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.1.4.1.5\" style=\"padding:2.5pt 5.4pt;\">18.2 21.1</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.1.4.1.6\" style=\"padding:2.5pt 5.4pt;\">46.1 49.4</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.1.4.1.7\" style=\"padding:2.5pt 5.4pt;\">8.9 8.3</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.1.4.1.8\" style=\"padding:2.5pt 5.4pt;\">35.3 34.5</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T2.1.4.1.9\" style=\"padding:2.5pt 5.4pt;\">56.3 55.0</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.5.2\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T2.1.5.2.1\" style=\"padding:2.5pt 5.4pt;\">bc-LSTM</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.5.2.2\" style=\"padding:2.5pt 5.4pt;\">78.4 73.8</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.5.2.3\" style=\"padding:2.5pt 5.4pt;\">46.8 47.7</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.5.2.4\" style=\"padding:2.5pt 5.4pt;\">3.8 5.4</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.5.2.5\" style=\"padding:2.5pt 5.4pt;\">22.4 25.1</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.5.2.6\" style=\"padding:2.5pt 5.4pt;\">51.6 51.3</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.5.2.7\" style=\"padding:2.5pt 5.4pt;\">4.3 
5.2</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.5.2.8\" style=\"padding:2.5pt 5.4pt;\">36.7 38.4</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.5.2.9\" style=\"padding:2.5pt 5.4pt;\">57.5 55.9</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.6.3\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T2.1.6.3.1\" style=\"padding:2.5pt 5.4pt;\">DialogueRNN</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.6.3.2\" style=\"padding:2.5pt 5.4pt;\">72.1 73.5</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.6.3.3\" style=\"padding:2.5pt 5.4pt;\">54.4 49.4</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.6.3.4\" style=\"padding:2.5pt 5.4pt;\">1.6 1.2</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.6.3.5\" style=\"padding:2.5pt 5.4pt;\">23.9 23.8</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.6.3.6\" style=\"padding:2.5pt 5.4pt;\">52.0 50.7</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.6.3.7\" style=\"padding:2.5pt 5.4pt;\">1.5 1.7</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.6.3.8\" style=\"padding:2.5pt 5.4pt;\">41.0 41.5</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.6.3.9\" style=\"padding:2.5pt 5.4pt;\">56.1 55.9</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.7.4\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T2.1.7.4.1\" style=\"padding:2.5pt 5.4pt;\">DialogueGCN</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.7.4.2\" style=\"padding:2.5pt 5.4pt;\">70.3 72.1</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.7.4.3\" style=\"padding:2.5pt 5.4pt;\">42.4 41.7</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.7.4.4\" style=\"padding:2.5pt 5.4pt;\">3.0 2.8</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.7.4.5\" style=\"padding:2.5pt 5.4pt;\">20.9 21.8</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.7.4.6\" style=\"padding:2.5pt 5.4pt;\">44.7 44.2</td>\n<td class=\"ltx_td 
ltx_align_center\" id=\"S4.T2.1.7.4.7\" style=\"padding:2.5pt 5.4pt;\">6.5 6.7</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.7.4.8\" style=\"padding:2.5pt 5.4pt;\">39.0 36.5</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.7.4.9\" style=\"padding:2.5pt 5.4pt;\">54.9 54.7</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.8.5\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T2.1.8.5.1\" style=\"padding:2.5pt 5.4pt;\">RGAT</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.8.5.2\" style=\"padding:2.5pt 5.4pt;\">76.0 78.1</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.8.5.3\" style=\"padding:2.5pt 5.4pt;\">40.1 41.5</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.8.5.4\" style=\"padding:2.5pt 5.4pt;\">3.0 2.4</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.8.5.5\" style=\"padding:2.5pt 5.4pt;\">32.1 30.7</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.8.5.6\" style=\"padding:2.5pt 5.4pt;\">68.1 58.6</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.8.5.7\" style=\"padding:2.5pt 5.4pt;\">4.5 2.2</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.8.5.8\" style=\"padding:2.5pt 5.4pt;\">40.0 44.6</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.8.5.9\" style=\"padding:2.5pt 5.4pt;\">60.3 61.1</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.9.6\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T2.1.9.6.1\" style=\"padding:2.5pt 5.4pt;\">CoMPM</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.9.6.2\" style=\"padding:2.5pt 5.4pt;\">78.3 82.0</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.9.6.3\" style=\"padding:2.5pt 5.4pt;\">48.3 49.2</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.9.6.4\" style=\"padding:2.5pt 5.4pt;\">1.7 2.9</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.9.6.5\" style=\"padding:2.5pt 5.4pt;\">35.9 32.3</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.9.6.6\" 
style=\"padding:2.5pt 5.4pt;\">71.4 61.5</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.9.6.7\" style=\"padding:2.5pt 5.4pt;\">3.1 2.8</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.9.6.8\" style=\"padding:2.5pt 5.4pt;\">42.2 45.8</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.9.6.9\" style=\"padding:2.5pt 5.4pt;\">64.1 65.3</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.10.7\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T2.1.10.7.1\" style=\"padding:2.5pt 5.4pt;\">EmoBERTa</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.10.7.2\" style=\"padding:2.5pt 5.4pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.1.10.7.2.1\">78.9 82.5</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.10.7.3\" style=\"padding:2.5pt 5.4pt;\">50.2 50.2</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.10.7.4\" style=\"padding:2.5pt 5.4pt;\">1.8 1.9</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.10.7.5\" style=\"padding:2.5pt 5.4pt;\">33.3 31.2</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.10.7.6\" style=\"padding:2.5pt 5.4pt;\">72.1 61.7</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.10.7.7\" style=\"padding:2.5pt 5.4pt;\">9.1 2.5</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.10.7.8\" style=\"padding:2.5pt 5.4pt;\">43.3 46.4</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.10.7.9\" style=\"padding:2.5pt 5.4pt;\">64.1 65.2</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.11.8\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T2.1.11.8.1\" style=\"padding:2.5pt 5.4pt;\">ConGCN</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.11.8.2\" style=\"padding:2.5pt 5.4pt;\">46.8 45.4</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.11.8.3\" style=\"padding:2.5pt 5.4pt;\">10.6 8.8</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.11.8.4\" style=\"padding:2.5pt 5.4pt;\">8.7 8.1</td>\n<td class=\"ltx_td 
ltx_align_center\" id=\"S4.T2.1.11.8.5\" style=\"padding:2.5pt 5.4pt;\">53.1 54.6</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.11.8.6\" style=\"padding:2.5pt 5.4pt;\">76.7 75.2</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.11.8.7\" style=\"padding:2.5pt 5.4pt;\">28.5 <span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.1.11.8.7.1\">26.3</span>\n</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.11.8.8\" style=\"padding:2.5pt 5.4pt;\">50.3 48.4</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.11.8.9\" style=\"padding:2.5pt 5.4pt;\">59.4 58.7</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.12.9\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T2.1.12.9.1\" style=\"padding:2.5pt 5.4pt;\">A-DMN</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.12.9.2\" style=\"padding:2.5pt 5.4pt;\">76.5 78.9</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.12.9.3\" style=\"padding:2.5pt 5.4pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.1.12.9.3.1\">56.2 55.3</span></td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.12.9.4\" style=\"padding:2.5pt 5.4pt;\">8.2 8.6</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.12.9.5\" style=\"padding:2.5pt 5.4pt;\">22.1 24.9</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.12.9.6\" style=\"padding:2.5pt 5.4pt;\">59.8 57.4</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.12.9.7\" style=\"padding:2.5pt 5.4pt;\">1.2 3.4</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.12.9.8\" style=\"padding:2.5pt 5.4pt;\">41.3 40.9</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.12.9.9\" style=\"padding:2.5pt 5.4pt;\">61.5 60.4</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.13.10\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T2.1.13.10.1\" style=\"padding:2.5pt 5.4pt;\">LR-GCN</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.13.10.2\" style=\"padding:2.5pt 5.4pt;\">76.7 80.0</td>\n<td 
class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.13.10.3\" style=\"padding:2.5pt 5.4pt;\">53.3 55.2</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.13.10.4\" style=\"padding:2.5pt 5.4pt;\">0.0 0.0</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.13.10.5\" style=\"padding:2.5pt 5.4pt;\">49.6 35.1</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.13.10.6\" style=\"padding:2.5pt 5.4pt;\">68.0 64.4</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.13.10.7\" style=\"padding:2.5pt 5.4pt;\">10.7 2.7</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.13.10.8\" style=\"padding:2.5pt 5.4pt;\">48.0 51.0</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.13.10.9\" style=\"padding:2.5pt 5.4pt;\">65.7 65.6</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.14.11\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_r\" id=\"S4.T2.1.14.11.1\" style=\"padding:2.5pt 5.4pt;\">DER-GCN</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.14.11.2\" style=\"padding:2.5pt 5.4pt;\">76.8 80.6</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.14.11.3\" style=\"padding:2.5pt 5.4pt;\">50.5 51.0</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.14.11.4\" style=\"padding:2.5pt 5.4pt;\">14.8 10.4</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.14.11.5\" style=\"padding:2.5pt 5.4pt;\">56.7 41.5</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.14.11.6\" style=\"padding:2.5pt 5.4pt;\">69.3 64.3</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.14.11.7\" style=\"padding:2.5pt 5.4pt;\">17.2 10.3</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.14.11.8\" style=\"padding:2.5pt 5.4pt;\">\n<span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.1.14.11.8.1\">52.5</span> 57.4</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T2.1.14.11.9\" style=\"padding:2.5pt 5.4pt;\">66.8 66.1</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T2.1.15.12\">\n<th class=\"ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_b 
ltx_border_r\" id=\"S4.T2.1.15.12.1\" style=\"padding:2.5pt 5.4pt;\">ELR-GCN</th>\n<td class=\"ltx_td ltx_align_center ltx_border_b\" id=\"S4.T2.1.15.12.2\" style=\"padding:2.5pt 5.4pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.1.15.12.2.1\">80.2 83.6</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_b\" id=\"S4.T2.1.15.12.3\" style=\"padding:2.5pt 5.4pt;\">36.8 35.4</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b\" id=\"S4.T2.1.15.12.4\" style=\"padding:2.5pt 5.4pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.1.15.12.4.1\">19.2 13.1</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_b\" id=\"S4.T2.1.15.12.5\" style=\"padding:2.5pt 5.4pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.1.15.12.5.1\">80.2 83.6</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_b\" id=\"S4.T2.1.15.12.6\" style=\"padding:2.5pt 5.4pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.1.15.12.6.1\">76.5 69.7</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_b\" id=\"S4.T2.1.15.12.7\" style=\"padding:2.5pt 5.4pt;\">\n<span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.1.15.12.7.1\">55.6</span> 13.0</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b\" id=\"S4.T2.1.15.12.8\" style=\"padding:2.5pt 5.4pt;\">52.1 <span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.1.15.12.8.1\">57.7</span>\n</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b\" id=\"S4.T2.1.15.12.9\" style=\"padding:2.5pt 5.4pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T2.1.15.12.9.1\">68.7 69.9</span></td>\n</tr>\n</tbody>\n</table>\n</figure>",
144
+ "perturb_sentence_id": 15,
145
+ "output": {
146
+ "perturbed_statement": "[paragraph id = 15] MELD: As shown in Table 2, The ELR-GNN method proposed in this article has the best emotion recognition effect on the MELD data set, with an average accuracy of 72.5% and an average F1 value of 69.9%. The emotion recognition effect of DER-GCN is second, with an average accuracy of 69.7% and an average F1 value of 69.4%.",
147
+ "perturbed_explanation": "The statement made is inaccurate due to the following reasons: 1. The actual average accuracy of ELR-GNN on the MELD dataset is indicated as 68.7%, not 72.5%, as stated in the context. 2. This alteration modifies the factual data presented, altering the outcome and potentially misleading regarding the method's performance."
148
+ }
149
+ },
150
+ {
151
+ "path": "table_paper/2407.00119v2.json",
152
+ "table_id": "3",
153
+ "section": "4.3",
154
+ "all_context": [
155
+ "To verify the superiority of the ELR-GNN method proposed in this paper, we report the experimental results of ELR-GNN and other comparative methods on the IEMOCAP and MELD data sets.",
156
+ "Experimental results are presented in Tables 1 and 2 .",
157
+ "IEMOCAP: As shown in Table 1 , the multi-modal emotion recognition method proposed in this paper achie-ved the best emotion recognition effect on the IEMOCAP data set, with an average accuracy of 70.6% and an average F1 value of 70.9%.",
158
+ "ELR-GCN proposes an effective modeling method of long-distance context latent dependencies for multi-modal emotion recognition.",
159
+ "In addition, ELR-GCN also combines early and adaptive late fusion methods to achieve the capture of fine-grained emotional features.",
160
+ "Among other comparison methods, the emotion recognition effect of DER-GCN is slightly lower than that of ELR-GNN, with an average accuracy of 69.7% and an average F1 value of 69.4%.",
161
+ "Although DER-GCN comprehensively considers event relationships and dialogue relationships between speakers to enhance the model s emotional understanding, it ignores latent context dependencies.",
162
+ "The emotion recognition effect of LR-GCN is lower than ELR-GNN and DER-GCN, with an average accuracy of 68.5% and an average F1 value of 68.3%.",
163
+ "Although LR-GCN considers latent dependencies between contexts, due to the high computational complexity of GCN, LR-GCN can only capture local latent dependencies.",
164
+ "The emotion recognition effects of other comparison methods are lower than ELR-GNN.",
165
+ "Likewise, none of them take into account potential dependencies on context.",
166
+ "Overall, the accuracy of ELR-GNN on the happy emotion analogy is much higher than that of other comparison algorithms, while the accuracy of other emotion categories is also relatively close to that of other comparison algorithms.",
167
+ "In addition, the F1 value of ELR-GNN on the happy and excited emotional analogies is much higher than that of other comparison algorithms.",
168
+ "At the same time, the F1 value of ELR-GNN on other emotional categories is also relatively close to other comparison algorithms.",
169
+ "The experimental results prove the superiority of the ELR-GNN method proposed in this paper.",
170
+ "MELD: As shown in Table 2 , The ELR-GNN method proposed in this article has the best emotion recognition effect on the MELD data set, with an average accuracy of 68.7% and an average F1 value of 69.9%.",
171
+ "The emotion recognition effect of DER-GCN is second, with an average accuracy of 69.7% and an average F1 value of 69.4%.",
172
+ "The emotion recognition effect of LR-GCN is lower than that of ELR-GNN and DER-GCN, with an average accuracy of 68.5% and an average F1 value of 68.3%.",
173
+ "The emotion recognition effects of other comparison methods are relatively poor, and the average accuracy and F1 value are lower than ELR-GNN.",
174
+ "The performance improvement may be attributed to ELR-GNN s ability to capture long-distance contextual latent dependencies and fine-grained fusion of dialogue relationships between speakers, contextual latent dependencies and contextual semantic information.",
175
+ "Overall, the accuracy of ELR-GNN on the neutral, fear, sadness, joy, and disgust emotion analogy is much higher than that of other comparison algorithms, while the accuracy of other emotion categories is also relatively close to that of other comparison algorithms.",
176
+ "In addition, the F1 value of ELR-GNN on the neutral, fear, sadness, joy, and anger emotional analogies is much higher than that of other comparison algorithms.",
177
+ "At the same time, the F1 value of ELR-GNN on other emotional categories is also relatively close to other comparison algorithms.",
178
+ "In addition, we find that ELR-GNN has better emotion recognition effects on the minority emotions fear and disgust, with relatively high accuracy and F1 value.",
179
+ "The experimental results prove the superiority of the ELR-GNN method proposed in this paper.",
180
+ "In addition, to intuitively illustrate that the running time of the ELR-GNN method proposed in this paper is better than other comparative methods, we statistics in Table 3 the running time of other comparative methods of the ELR-GNN method on the IEMOCAP and MELD data sets.",
181
+ "As shown in Table 3 , the running time of the ELR-GNN method proposed in this paper on the IEMOCAP and MELD data sets is 41s and 91s respectively, which is significantly better than other comparison methods.",
182
+ "The running times of DialogueGCN are 58s and 127s respectively, which are lower than LR-GCN and DER-GCN, but the emotion recognition effect is relatively poor.",
183
+ "The running times of LR-GCN are 87s and 142s respectively.",
184
+ "The running times of DER-GCN are 125s and 189s respectively.",
185
+ "The experimental results prove the efficiency and effectiveness of the ELR-GNN method proposed in this paper.",
186
+ ""
187
+ ],
188
+ "target_context_ids": [
189
+ 25,
190
+ 26,
191
+ 27,
192
+ 28,
193
+ 29,
194
+ 30
195
+ ],
196
+ "selected_paragraphs": [
197
+ "[paragraph id = 25] In addition, to intuitively illustrate that the running time of the ELR-GNN method proposed in this paper is better than other comparative methods, we statistics in Table 3 the running time of other comparative methods of the ELR-GNN method on the IEMOCAP and MELD data sets.",
198
+ "[paragraph id = 26] As shown in Table 3 , the running time of the ELR-GNN method proposed in this paper on the IEMOCAP and MELD data sets is 41s and 91s respectively, which is significantly better than other comparison methods.",
199
+ "[paragraph id = 27] The running times of DialogueGCN are 58s and 127s respectively, which are lower than LR-GCN and DER-GCN, but the emotion recognition effect is relatively poor.",
200
+ "[paragraph id = 28] The running times of LR-GCN are 87s and 142s respectively.",
201
+ "[paragraph id = 29] The running times of DER-GCN are 125s and 189s respectively.",
202
+ "[paragraph id = 30] The experimental results prove the efficiency and effectiveness of the ELR-GNN method proposed in this paper."
203
+ ],
204
+ "table_html": "<figure class=\"ltx_table\" id=\"S4.T3\">\n<figcaption class=\"ltx_caption\"><span class=\"ltx_tag ltx_tag_table\">Table 3: </span>We tested the running time of the ELR-GNN method proposed in this paper and other comparative methods on the IEMOCAP and MELD data sets. In particular, ELR-GNN sets to and neighbor size to 64.</figcaption>\n<table class=\"ltx_tabular ltx_guessed_headers ltx_align_middle\" id=\"S4.T3.5\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S4.T3.5.1.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_th_row ltx_border_r ltx_border_t\" id=\"S4.T3.5.1.1.1\" rowspan=\"2\" style=\"padding:2.5pt 18.5pt;\">      <span class=\"ltx_text\" id=\"S4.T3.5.1.1.1.1\">Methods</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" colspan=\"2\" id=\"S4.T3.5.1.1.2\" style=\"padding:2.5pt 18.5pt;\">      Running time (s)</th>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.5.2.2\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T3.5.2.2.1\" style=\"padding:2.5pt 18.5pt;\">      IEMOCAP</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T3.5.2.2.2\" style=\"padding:2.5pt 18.5pt;\">      MELD</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S4.T3.5.3.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_r ltx_border_t\" id=\"S4.T3.5.3.1.1\" style=\"padding:2.5pt 18.5pt;\">      DialogueGCN</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T3.5.3.1.2\" style=\"padding:2.5pt 18.5pt;\">      58</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T3.5.3.1.3\" style=\"padding:2.5pt 18.5pt;\">      127</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.5.4.2\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_r\" id=\"S4.T3.5.4.2.1\" style=\"padding:2.5pt 18.5pt;\">      LR-GCN</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.5.4.2.2\" 
style=\"padding:2.5pt 18.5pt;\">      87</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.5.4.2.3\" style=\"padding:2.5pt 18.5pt;\">      142</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.5.5.3\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_r\" id=\"S4.T3.5.5.3.1\" style=\"padding:2.5pt 18.5pt;\">      DER-GCN</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.5.5.3.2\" style=\"padding:2.5pt 18.5pt;\">      125</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T3.5.5.3.3\" style=\"padding:2.5pt 18.5pt;\">      189</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T3.5.6.4\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_b ltx_border_r\" id=\"S4.T3.5.6.4.1\" style=\"padding:2.5pt 18.5pt;\">      ELR-GNN</th>\n<td class=\"ltx_td ltx_align_center ltx_border_b\" id=\"S4.T3.5.6.4.2\" style=\"padding:2.5pt 18.5pt;\">      41</td>\n<td class=\"ltx_td ltx_align_center ltx_border_b\" id=\"S4.T3.5.6.4.3\" style=\"padding:2.5pt 18.5pt;\">      91</td>\n</tr>\n</tbody>\n</table>\n</figure>",
205
+ "perturb_sentence_id": 29,
206
+ "output": {
207
+ "perturbed_statement": "[paragraph id = 29] The running times of DER-GCN are 87s and 142s respectively. The experimental results prove the efficiency and effectiveness of the ELR-GNN method proposed in this paper.",
208
+ "perturbed_explanation": "The original explanation:\n1. The context and Table 3 give the running times of DER-GCN as 125s and 189s on the IEMOCAP and MELD data sets, respectively.\nThe statement is factually incorrect because:\n2. It directly attributes the running times of 87s and 142s to DER-GCN, which are the running times of LR-GCN. This misattribution misrepresents the data from the source elements."
209
+ }
210
+ },
211
+ {
212
+ "path": "table_paper/2407.00119v2.json",
213
+ "table_id": "4",
214
+ "section": "4.5.1",
215
+ "all_context": [
216
+ "To verify the importance of the three modal features of text, video and audio for ELR-GNN, we conducted ablation experiments on the IEMOCAP and MELD data sets to compare the performance of the combination of different modal features.",
217
+ "The experimental results are shown in Table 4 .",
218
+ "In single-modal experiments, ELR-GNN with text modality features has the best emotion recognition effect.",
219
+ "The average accuracy on the IEMOCAP and MELD data sets are 64.1% and 63.5%, respectively, and the average F1 value is 63.9% and 62.4%, respectively.",
220
+ "The emotion recognition effect of ELR-GNN with audio modal features is second, with average accuracy rates of 61.1% and 62.7% on the IEMOCAP and MELD data sets, and average F1 values of 60.8% and 62.0% respectively.",
221
+ "ELR-GNN with video modality features has the worst emotion recognition effect, with average accuracy rates of 59.4% and 60.1% on the IEMOCAP and MELD data sets, and average F1 values of 59.7% and 61.4% respectively.",
222
+ "Experimental results show that text features contain the most emotional semantic information.",
223
+ "In the dual-modal experiment, ELR-GNN with text and audio modal features has the best emotion recognition effect.",
224
+ "The average accuracy on the IEMOCAP and MELD data sets are 65.0% and 64.1%, respectively, and the average F1 values are are 64.4% and 63.2%, respectively.",
225
+ "Experimental results demonstrate the effectiveness of multimodal features.",
226
+ ""
227
+ ],
228
+ "target_context_ids": [
229
+ 1,
230
+ 2,
231
+ 3,
232
+ 4,
233
+ 5,
234
+ 6,
235
+ 7,
236
+ 8,
237
+ 9
238
+ ],
239
+ "selected_paragraphs": [
240
+ "[paragraph id = 1] The experimental results are shown in Table 4 .",
241
+ "[paragraph id = 2] In single-modal experiments, ELR-GNN with text modality features has the best emotion recognition effect.",
242
+ "[paragraph id = 3] The average accuracy on the IEMOCAP and MELD data sets are 64.1% and 63.5%, respectively, and the average F1 value is 63.9% and 62.4%, respectively.",
243
+ "[paragraph id = 4] The emotion recognition effect of ELR-GNN with audio modal features is second, with average accuracy rates of 61.1% and 62.7% on the IEMOCAP and MELD data sets, and average F1 values of 60.8% and 62.0% respectively.",
244
+ "[paragraph id = 5] ELR-GNN with video modality features has the worst emotion recognition effect, with average accuracy rates of 59.4% and 60.1% on the IEMOCAP and MELD data sets, and average F1 values of 59.7% and 61.4% respectively.",
245
+ "[paragraph id = 6] Experimental results show that text features contain the most emotional semantic information.",
246
+ "[paragraph id = 7] In the dual-modal experiment, ELR-GNN with text and audio modal features has the best emotion recognition effect.",
247
+ "[paragraph id = 8] The average accuracy on the IEMOCAP and MELD data sets are 65.0% and 64.1%, respectively, and the average F1 values are are 64.4% and 63.2%, respectively.",
248
+ "[paragraph id = 9] Experimental results demonstrate the effectiveness of multimodal features."
249
+ ],
250
+ "table_html": "<figure class=\"ltx_table\" id=\"S4.T4\">\n<figcaption class=\"ltx_caption\"><span class=\"ltx_tag ltx_tag_table\">Table 4: </span>The effect of ELR-GNN on IEMOCAP and MELD datasets using unimodal features and multimodal features, respectively. We report average accuracy and F1-score.</figcaption>\n<table class=\"ltx_tabular ltx_guessed_headers ltx_align_middle\" id=\"S4.T4.1\">\n<thead class=\"ltx_thead\">\n<tr class=\"ltx_tr\" id=\"S4.T4.1.1.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_th_row ltx_border_r ltx_border_t\" id=\"S4.T4.1.1.1.1\" rowspan=\"2\" style=\"padding:2.5pt 12.8pt;\"><span class=\"ltx_text\" id=\"S4.T4.1.1.1.1.1\">Modality</span></th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" colspan=\"2\" id=\"S4.T4.1.1.1.2\" style=\"padding:2.5pt 12.8pt;\">IEMOCAP</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" colspan=\"2\" id=\"S4.T4.1.1.1.3\" style=\"padding:2.5pt 12.8pt;\">MELD</th>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.1.2.2\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T4.1.2.2.1\" style=\"padding:2.5pt 12.8pt;\">Acc.</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T4.1.2.2.2\" style=\"padding:2.5pt 12.8pt;\">F1</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T4.1.2.2.3\" style=\"padding:2.5pt 12.8pt;\">Acc</th>\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_column ltx_border_t\" id=\"S4.T4.1.2.2.4\" style=\"padding:2.5pt 12.8pt;\">F1</th>\n</tr>\n</thead>\n<tbody class=\"ltx_tbody\">\n<tr class=\"ltx_tr\" id=\"S4.T4.1.3.1\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_r ltx_border_t\" id=\"S4.T4.1.3.1.1\" style=\"padding:2.5pt 12.8pt;\">T</th>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.1.3.1.2\" style=\"padding:2.5pt 12.8pt;\">64.1</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" 
id=\"S4.T4.1.3.1.3\" style=\"padding:2.5pt 12.8pt;\">63.9</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.1.3.1.4\" style=\"padding:2.5pt 12.8pt;\">63.5</td>\n<td class=\"ltx_td ltx_align_center ltx_border_t\" id=\"S4.T4.1.3.1.5\" style=\"padding:2.5pt 12.8pt;\">62.4</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.1.4.2\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_r\" id=\"S4.T4.1.4.2.1\" style=\"padding:2.5pt 12.8pt;\">A</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.1.4.2.2\" style=\"padding:2.5pt 12.8pt;\">61.1</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.1.4.2.3\" style=\"padding:2.5pt 12.8pt;\">60.8</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.1.4.2.4\" style=\"padding:2.5pt 12.8pt;\">62.7</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.1.4.2.5\" style=\"padding:2.5pt 12.8pt;\">62.0</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.1.5.3\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_r\" id=\"S4.T4.1.5.3.1\" style=\"padding:2.5pt 12.8pt;\">V</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.1.5.3.2\" style=\"padding:2.5pt 12.8pt;\">59.4</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.1.5.3.3\" style=\"padding:2.5pt 12.8pt;\">59.7</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.1.5.3.4\" style=\"padding:2.5pt 12.8pt;\">60.1</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.1.5.3.5\" style=\"padding:2.5pt 12.8pt;\">61.4</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.1.6.4\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_r\" id=\"S4.T4.1.6.4.1\" style=\"padding:2.5pt 12.8pt;\">T+A</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.1.6.4.2\" style=\"padding:2.5pt 12.8pt;\">65.0</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.1.6.4.3\" style=\"padding:2.5pt 12.8pt;\">64.4</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.1.6.4.4\" style=\"padding:2.5pt 12.8pt;\">64.1</td>\n<td class=\"ltx_td ltx_align_center\" 
id=\"S4.T4.1.6.4.5\" style=\"padding:2.5pt 12.8pt;\">63.2</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.1.7.5\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_r\" id=\"S4.T4.1.7.5.1\" style=\"padding:2.5pt 12.8pt;\">T+V</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.1.7.5.2\" style=\"padding:2.5pt 12.8pt;\">64.3</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.1.7.5.3\" style=\"padding:2.5pt 12.8pt;\">64.6</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.1.7.5.4\" style=\"padding:2.5pt 12.8pt;\">64.0</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.1.7.5.5\" style=\"padding:2.5pt 12.8pt;\">62.9</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.1.8.6\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_r\" id=\"S4.T4.1.8.6.1\" style=\"padding:2.5pt 12.8pt;\">V+A</th>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.1.8.6.2\" style=\"padding:2.5pt 12.8pt;\">63.0</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.1.8.6.3\" style=\"padding:2.5pt 12.8pt;\">62.7</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.1.8.6.4\" style=\"padding:2.5pt 12.8pt;\">61.5</td>\n<td class=\"ltx_td ltx_align_center\" id=\"S4.T4.1.8.6.5\" style=\"padding:2.5pt 12.8pt;\">61.9</td>\n</tr>\n<tr class=\"ltx_tr\" id=\"S4.T4.1.9.7\">\n<th class=\"ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_b ltx_border_r\" id=\"S4.T4.1.9.7.1\" style=\"padding:2.5pt 12.8pt;\">T+A+V</th>\n<td class=\"ltx_td ltx_align_center ltx_border_b\" id=\"S4.T4.1.9.7.2\" style=\"padding:2.5pt 12.8pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T4.1.9.7.2.1\">70.6</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_b\" id=\"S4.T4.1.9.7.3\" style=\"padding:2.5pt 12.8pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T4.1.9.7.3.1\">70.9</span></td>\n<td class=\"ltx_td ltx_align_center ltx_border_b\" id=\"S4.T4.1.9.7.4\" style=\"padding:2.5pt 12.8pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T4.1.9.7.4.1\">68.7</span></td>\n<td 
class=\"ltx_td ltx_align_center ltx_border_b\" id=\"S4.T4.1.9.7.5\" style=\"padding:2.5pt 12.8pt;\"><span class=\"ltx_text ltx_font_bold\" id=\"S4.T4.1.9.7.5.1\">69.9</span></td>\n</tr>\n</tbody>\n</table>\n</figure>",
251
+ "perturb_sentence_id": 3,
252
+ "output": {
253
+ "perturbed_statement": "[paragraph id = 3] The average accuracy on the IEMOCAP and MELD data sets are 64.1% and 63.5%, respectively, and the average F1 value is 63.9% and 62.4%, respectively. The emotion recognition effect of ELR-GNN with audio modal features is second, with average accuracy rates of 61.5% and 63.1% on the IEMOCAP and MELD data sets, and average F1 values of 61.0% and 63.0% respectively.",
254
+ "perturbed_explanation": "1. The original statement describes particular evaluation metrics for the ELR-GNN model's performance using the audio modality, specifying average accuracy rates and F1 values. 2. The statement's claimed accuracy rates of 61.5% and 63.1% and F1 values of 61.0% and 63.0% on the IEMOCAP and MELD data sets are inconsistent with the provided data, which indicate accuracy rates of 61.1% and 62.7% and F1 values of 60.8% and 62.0%, respectively."
255
+ }
256
+ }
257
+ ]