orionweller committed on
Commit
bf3cbf2
1 Parent(s): b742bcd

init commit

Browse files
Files changed (4) hide show
  1. app.py +277 -0
  2. diffs.jsonl +0 -0
  3. packages.txt +1 -0
  4. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import pathlib
4
+ import pandas as pd
5
+ from collections import defaultdict
6
+ import json
7
+ import ast
8
+ import copy
9
+ import re
10
+ import tqdm
11
+
12
+
13
+ import pandas as pd
14
+ from collections import Counter
15
+ import string
16
+ import os
17
+ import streamlit as st
18
+ import difflib
19
+ from html import escape
20
+
21
+
22
def generate_diff_html_word_level(text1, text2):
    """
    Render a word-level diff of ``text1`` against ``text2`` as an HTML string.

    Both inputs are split on whitespace, so runs of whitespace are collapsed
    to single spaces in the output. Removed words are wrapped in a pink
    ``<del>`` tag, added words in a green ``<ins>`` tag, and every word is
    HTML-escaped. The result is wrapped in a ``<pre>`` element that still
    wraps long lines.
    """
    old_words = text1.split()
    new_words = text2.split()

    def removed(lo, hi):
        # Words present only in text1.
        return '<del style="background-color: #fbb6ce;">' + escape(' '.join(old_words[lo:hi])) + '</del>'

    def added(lo, hi):
        # Words present only in text2.
        return '<ins style="background-color: #b7e4c7;">' + escape(' '.join(new_words[lo:hi])) + '</ins>'

    pieces = []
    opcodes = difflib.SequenceMatcher(None, old_words, new_words).get_opcodes()
    for tag, i1, i2, j1, j2 in opcodes:
        if tag == 'equal':
            pieces.append(escape(' '.join(old_words[i1:i2])))
            continue
        # A 'replace' is emitted as a deletion immediately followed by an insertion.
        if tag in ('replace', 'delete'):
            pieces.append(removed(i1, i2))
        if tag in ('replace', 'insert'):
            pieces.append(added(j1, j2))

    # Join a deletion to the insertion that follows it with a non-breaking space.
    body = ' '.join(pieces).replace('</del> <ins', '</del>&nbsp;<ins')
    return f'<pre style="white-space: pre-wrap;">{body}</pre>'
48
+
49
+
50
+
51
# NOTE(review): this variable is set after `import streamlit`; if protobuf has
# already been imported by then the setting may not take effect - confirm.
os.environ.update({"PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION": "python"})

# Use the full browser width for the two-column layout built further down.
st.set_page_config(layout="wide")

query_input = None
55
+
56
+ @st.cache_data
57
def convert_df(df):
    """Serialise ``df`` to CSV (no index column) and return it as UTF-8 bytes."""
    # IMPORTANT: the result is cached by the decorator so the conversion is
    # not recomputed on every rerun.
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')
60
+
61
+
62
def get_current_data():
    """
    Build the CSV download payload from the ticked "Non-Relevant" checkboxes.

    Reads the module-level ``current_checkboxes`` list of ``(combined_id,
    checked)`` pairs. Each ticked entry has its ``"qid-----pid"`` identifier
    split back into its two parts and is recorded with ``is_relevant`` set to
    0. Returns the CSV bytes produced by ``convert_df``.
    """
    rows = []
    for combined_key, is_checked in current_checkboxes:
        if not is_checked:
            continue
        query_id, passage_id = combined_key.split("-----")
        rows.append(dict(qid=query_id, pid=passage_id, is_relevant=0))
    frame = pd.DataFrame(rows)
    return convert_df(frame)
73
+
74
+ @st.cache_data
75
def escape_markdown(text):
    """
    Backslash-escape Markdown special characters in ``text``.

    LaTeX-style opening quotes (two backticks) are first turned into a plain
    double quote. Every character in the special set, including the backslash
    itself and ``$``, is then prefixed with exactly one backslash.

    ``$`` is handled only by the per-character pass. Escaping it beforehand
    would make that pass escape the inserted backslash as well, which renders
    a stray backslash in the output.
    """
    text = text.replace("``", "\"")
    special_chars = frozenset('\\`*_{}[]()#+-.!|$')
    return "".join(f"\\{char}" if char in special_chars else char for char in text)
86
+
87
+
88
+
89
# First run of the session: start with no instance selected (-1).
if "cur_instance_num" not in st.session_state:
    st.session_state["cur_instance_num"] = -1

st.sidebar.title("Options")
96
+
97
+ @st.cache_data
98
def load_data():
    """
    Load ``diffs.jsonl`` from the working directory into a DataFrame.

    The file is read as UTF-8 with one JSON object per line. Blank lines are
    skipped so that a trailing newline or an empty separator line does not
    make ``json.loads`` fail.

    Returns:
        pandas.DataFrame with one row per JSON record.

    Raises:
        FileNotFoundError: if ``diffs.jsonl`` does not exist.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # The encoding is given explicitly so decoding does not depend on the
    # platform default.
    with open("diffs.jsonl", "r", encoding="utf-8") as f:
        records = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame(records)
105
+
106
+
107
def _qid_as_text(row):
    # The string form of the row's query id, used as the grouping key.
    return str(row["qid"])


def _column_to_list(values):
    # Collapse one group's column into a plain Python list.
    return values.tolist()


df = load_data()
df["id_combined"] = df.apply(_qid_as_text, axis=1)
# Aggregate so that each query id is a single row and every other column
# holds the list of values from that query's instances.
grouped_by_query = df.groupby("id_combined")
df = grouped_by_query.agg(_column_to_list).reset_index()
111
+
112
+ # print details about the diff type
113
+ # diff_types = df["diff_type"].value_counts()
114
+ # st.write(diff_types)
115
+
116
+ # # print how many titles differ by looping through each instances
117
+ # diff_titles = 0
118
+ # for index, row in df.iterrows():
119
+ # if row["old"] is not None and row["new"] is not None and row["old"][0]["title"] != row["new"][0]["title"]:
120
+ # diff_titles += 1
121
+ # st.write(f" Number of titles that differ: {diff_titles}")
122
+
123
+
124
# Look up an aggregated row (as a dict) by its combined id.
original_map = {}
for record in df.to_dict(orient="records"):
    original_map[record["id_combined"]] = record

# Narrow navigation column on the left, wide editor column on the right.
col1, col2 = st.columns([1, 3], gap="large")

st.sidebar.success("All files uploaded")
131
+
132
with col1:
    ids = df["id_combined"].tolist()
    set_of_cols = set(ids)
    # Created before the title so the navigation widgets render above it.
    container_for_nav = st.container()
    name_of_columns = sorted(set_of_cols)
    instances_to_use = name_of_columns
    st.title("Instances")

    def sync_from_drop():
        """Copy the selectbox choice into the index widget and the current-instance state."""
        chosen = st.session_state.selectbox_instance
        # "Overview" maps to -1; any other entry maps to its position in the sorted ids.
        position = -1 if chosen == "Overview" else name_of_columns.index(chosen)
        st.session_state.number_of_col = position
        st.session_state.cur_instance_num = position

    def sync_from_number():
        """Copy the numeric index into the current-instance state and the selectbox."""
        position = st.session_state.number_of_col
        st.session_state.cur_instance_num = position
        st.session_state.selectbox_instance = (
            "Overview" if position == -1 else name_of_columns[position]
        )

    last_index = len(instances_to_use) - 1
    number_of_col = container_for_nav.number_input(
        min_value=-1,
        step=1,
        max_value=last_index,
        on_change=sync_from_number,
        label=f"Select instance by index (up to **{last_index}**)",
        key="number_of_col",
    )
    selectbox_instance = container_for_nav.selectbox(
        "Select instance by ID",
        ["Overview"] + name_of_columns,
        on_change=sync_from_drop,
        key="selectbox_instance",
    )
    st.divider()
163
+
164
+
165
with col2:
    # Index chosen in the navigation column; -1 means "show the overview".
    inst_index = number_of_col

    if inst_index >= 0:
        inst_num = instances_to_use[inst_index]

        st.markdown("<h1 style='text-align: center; color: black;text-decoration: underline;'>Editor</h1>", unsafe_allow_html=True)

        # (combined_id, checked) pairs, read by get_current_data() when the
        # CSV download is built.
        current_checkboxes = []

        instances = original_map[inst_num]
        diff_types = instances['diff_type']
        num_new = Counter(diff_types)["new"]

        container = st.container()

        container.subheader(f"Combined ID: {inst_num} with {len(instances['pid'])} docs ({num_new} are new)")
        container.divider()

        # Every row of the group is read from the first entry's query info.
        query_info = instances['query_info'][0]
        container.subheader("Query Info")
        container.markdown(f"Query ID: {query_info['_id']}")
        container.markdown(f"**Query**:{query_info['text']}")

        processed_diff = generate_diff_html_word_level(query_info['instruction_og'], query_info['instruction_changed'])
        with container.container():
            st.markdown("**Instruction**: " + processed_diff, unsafe_allow_html=True)

        container.divider()

        for i, pid in enumerate(instances["pid"]):
            diff_type = diff_types[i]
            old_doc = instances['old'][i]
            new_doc = instances['new'][i]

            container.markdown(f"Doc {pid}: **{diff_type if diff_type == 'new' else 'changed'}**")

            # The previous qrel score was either relevant (>0) or non-relevant.
            if instances["qrel_score"][i] > 0:
                container.markdown("<h3>Previous: <span style='color: blue;'>Relevant</span></h3>", unsafe_allow_html=True)
            else:
                container.markdown("<h3>Previous: <span style='color: orange;'>Not Relevant</span></h3>", unsafe_allow_html=True)

            # Built with an f-string so a non-string pid does not raise TypeError.
            combined_id = f"{instances['qid'][i]}-----{pid}"

            container.subheader("Title")
            # Prefer the old title, fall back to the new one, else show nothing.
            if old_doc is not None:
                title = old_doc[0]['title']
            elif new_doc is not None:
                title = new_doc[0]['title']
            else:
                title = ""
            container.markdown(f"{title}")

            if old_doc is not None and new_doc is not None:
                container.subheader("Title Diff")
                processed_diff = generate_diff_html_word_level(old_doc[0]['title'], new_doc[0]['title'])
                with container.container():
                    st.markdown(processed_diff, unsafe_allow_html=True)

            if container.toggle('Show original text', key="toggle" + combined_id):
                container.subheader("Original Text")
                container.markdown(old_doc[0]['text'] if old_doc is not None else "")

            # NOTE(review): the new text is shown regardless of the toggle
            # above; the nesting was inferred, so confirm that is intended.
            container.subheader("New Text")
            container.markdown(instances['full_doc'][i])

            current_checkboxes.append((combined_id, container.checkbox(f'{combined_id} is Non-Relevant', key=combined_id)))

            container.divider()

        # Offer the ticked (qid, pid) pairs as a CSV download.
        if st.checkbox("Download data as CSV"):
            st.download_button(
                label="Download data as CSV",
                data=get_current_data(),
                file_name=f'annotation_query_{inst_num}_double_check.csv',
                mime='text/csv',
            )

    # No instance selected.
    else:
        st.title("Overview")
diffs.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ default-jre
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ pandas==2.0.3
2
+ streamlit==1.33.0
3
+ plotly==5.15.0
4
+ protobuf==3.20.0
5
+ beautifulsoup4==4.12.2
6
+ nltk==3.7
7
+ tqdm==4.66.2