orionweller committed on
Commit
eb995fd
1 Parent(s): 355d006
Files changed (4)
  1. .gitignore +2 -1
  2. app.py +72 -244
  3. dataset_loading.py +6 -1
  4. example_output.json +0 -0
.gitignore CHANGED
@@ -2,4 +2,5 @@ datasets/
 __pycache__/
 env/
 .ipynb_checkpoints/
-*.ipynb
+*.ipynb
+*.pyc
app.py CHANGED
@@ -8,8 +8,6 @@ import copy
 import re
 import tqdm
 import numpy as np
-import plotly.express as px
-from sentence_transformers import SentenceTransformer

 import pandas as pd
 from nltk.corpus import stopwords
@@ -28,7 +26,7 @@ nltk.download('punkt')
 nltk.download('stopwords')


-from dataset_loading import load_local_corpus, load_local_queries, load_local_triples
+from dataset_loading import load_local_corpus, load_local_queries, load_local_triples, load_json


 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
@@ -38,69 +36,6 @@ current_checkboxes = []
 query_input = None


-
-@st.cache_data
-def create_sim_chart(triples_map, triples1_name, queries1, corpus, instances_to_use, triples2_map, triples2_pos_pids, triples2_name, queries2):
-    print(f"Calculating cosine similarity for {len(triples_map)} instances")
-    e5_model = SentenceTransformer('intfloat/e5-large-v2', trust_remote_code=True)
-    all_triples = [triples_map[str(inst_num)][0] for inst_num in instances_to_use]
-    # breakpoint()
-    corpus_texts = [corpus[str(pos_pid)] for _, pos_pid, _ in all_triples]
-    neg_corpus_texts = [corpus[str(neg_pid)] for _, _, neg_pid in all_triples]
-    queries_for_embed = [queries[str(qid)] for qid, _, _ in all_triples]
-    corpus_embeddings = e5_model.encode(corpus_texts)
-    neg_corpus_embeddings = e5_model.encode(neg_corpus_texts)
-    query_embeddings = e5_model.encode(queries1)
-    cos_sim_pos = []
-    cos_sim_neg = []
-    for query_emb, pos_emb, neg_emb in zip(query_embeddings, corpus_embeddings, neg_corpus_embeddings):
-        cos_sim_pos.append(np.dot(query_emb, pos_emb) / (np.linalg.norm(query_emb) * np.linalg.norm(pos_emb)))
-        cos_sim_neg.append(np.dot(query_emb, neg_emb) / (np.linalg.norm(query_emb) * np.linalg.norm(neg_emb)))
-    df_sim = pd.DataFrame({"triples": triples1_name, "cosine_similarity_pos": cos_sim_pos, "cosine_similarity_neg": cos_sim_neg})
-
-    if triples2 is not None:
-        print(f"Calculating cosine similarity for {len(triples2_pos_pids)} instances for 2")
-        all_triples2 = [triples2_map[str(inst_num)][0] for inst_num in instances_to_use]
-        corpus_texts2 = [corpus[str(pos_pid)] for _, pos_pid, _ in all_triples2]
-        neg_corpus_texts2 = [corpus[str(neg_pid)] for _, _, neg_pid in all_triples2]
-        queries_for_embed2 = [queries[str(qid)] for qid, _, _ in all_triples2]
-        corpus_embeddings2 = e5_model.encode(corpus_texts2)
-        neg_corpus_embeddings2 = e5_model.encode(neg_corpus_texts2)
-        query_embeddings = e5_model.encode(queries2)
-        cos_sim_pos = []
-        cos_sim_neg = []
-        for query_emb, pos_emb, neg_emb in zip(query_embeddings, corpus_embeddings, neg_corpus_embeddings):
-            cos_sim_pos.append(np.dot(query_emb, pos_emb) / (np.linalg.norm(query_emb) * np.linalg.norm(pos_emb)))
-            cos_sim_neg.append(np.dot(query_emb, neg_emb) / (np.linalg.norm(query_emb) * np.linalg.norm(neg_emb)))
-        df_sim2 = pd.DataFrame({"triples": triples2_name, "cosine_similarity_pos": cos_sim_pos, "cosine_similarity_neg": cos_sim_neg})
-
-        df_sim = pd.concat([df_sim, df_sim2])
-        queries_to_return = list(zip(queries1, queries2))
-    else:
-        queries_to_return = queries1
-
-    return df_len, df_sim, queries_to_return
-
-
-
-
-@st.cache_data
-def convert_df(df):
-    # IMPORTANT: Cache the conversion to prevent computation on every rerun
-    return df.to_json(orient='records', lines=True, path_or_buf=None).encode('utf-8')
-
-
-def create_histogram_relevant_docs(relevant_df):
-    # turn results into a dataframe and then plot
-    fig = px.histogram(relevant_df, x="relevant_docs")
-    # make it fit in one column
-    fig.update_layout(
-        height=400,
-        width=250
-    )
-    return fig
-
-
 def get_current_data():
     cur_query_data = []
     cur_query = query_input.replace("\n", "\\n")
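
Note: the removed create_sim_chart boiled down to one idea, embedding queries and their positive/negative passages with the same SentenceTransformer and comparing them by cosine similarity (its second branch even zipped corpus_embeddings rather than corpus_embeddings2). A minimal sketch of that core computation, with illustrative names rather than the commit's own:

import numpy as np
from sentence_transformers import SentenceTransformer

def query_doc_similarities(queries, pos_texts, neg_texts):
    # Same encoder the removed helper used.
    model = SentenceTransformer("intfloat/e5-large-v2")
    q, p, n = (model.encode(t) for t in (queries, pos_texts, neg_texts))
    # Row-wise cosine similarity: dot product over the product of norms.
    def cos(a, b):
        return np.sum(a * b, axis=1) / (np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1))
    return cos(q, p), cos(q, n)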
@@ -135,63 +70,57 @@ def validate(config_option, file_loaded):

 with st.sidebar:
     st.title("Options")
-    # add a text input for naming triples 1
-    st.header("Name of triples 1")
-    triples1_name = st.text_input("Name of triples 1", key="triples1_name")
-    # add a text input for naming triples 2
-    st.header("Name of triples 2")
-    triples2_name = st.text_input("Name of triples 2", key="triples2_name")
-    st.header("Upload corpus")
-    corpus_file = st.file_uploader("Choose a file", key="corpus")
-    corpus = load_local_corpus(corpus_file)
-    st.header("Upload queries")
-    queries_file = st.file_uploader("Choose a file", key="queries")
-    queries = load_local_queries(queries_file)
-    st.header("Upload triples")
-    triples_file = st.file_uploader("Choose a file", key="triples")
-    triples = load_local_triples(triples_file)
-    if triples:
-        triples_pos_pids = [item[1] for item in triples]
-        triples_map = {item[1]: [] for item in triples}
-        query1_ids = [item[0] for item in triples]
-        for item in triples:
-            triples_map[item[1]].append(item)
-
-    triples2_file = st.file_uploader("Choose a second triples file for comparison", key="triples2")
-    triples2 = load_local_triples(triples2_file)
-    if triples2:
-        triples2_pos_pids = [item[1] for item in triples2]
-        triples2_map = {item[1]: [] for item in triples2}
-        for item in triples2:
-            triples2_map[item[1]].append(item)
-
-        # filter them by their intersection
-        triples_pos_pids = list(set(triples_pos_pids).intersection(triples2_pos_pids))
-        triples2_pos_pids = list(set(triples2_pos_pids).intersection(triples_pos_pids))
-        triples_map = {item[1]: triples_map[item[1]] for item in triples if item[1] in triples_pos_pids}
-        triples2_map = {item[1]: triples2_map[item[1]] for item in triples2 if item[1] in triples2_pos_pids}
-        query2_ids = [item[0] for item in triples2]
-        print(f"There are {len(triples_pos_pids)} triples in common between the two files")
-
-    ## make sure all qids in triples are in queries and write out a warning if not
-    if queries is not None and triples is not None:
-        missing_qids = set(query1_ids) - set(queries.keys())
-        if len(missing_qids) > 0:
-            st.warning(f"The following qids in triples are not in queries: {missing_qids}. \nPlease fix")
-
-        if triples2 is not None:
-            missing_qids = set(query2_ids) - set(queries.keys())
-            if len(missing_qids) > 0:
-                st.warning(f"The following qids in triples2 are not in queries: {missing_qids}. \nPlease fix")
+    use_default = st.checkbox("Use default data", value=False)
+    if use_default:
+        st.write("Using default data")
+        with open("example_output.json", "r") as fin:
+            data = json.load(fin)
+        ids = data["ids"]
+        prompts = data["input"]
+        # they can be longer in partial cases
+        outputs = data["parsed"]
+        ids = data["ids"][:len(outputs)]
+        prompts = prompts[:len(outputs)]
+        mapping = {id: (prompt, output) for id, prompt, output in zip(ids, prompts, outputs)}
+        data2 = None
+    else:
+        st.header("Input File")
+        input_file = st.file_uploader("Choose a file", key="data")
+        data = load_json(input_file)
+        if data is not None:
+            ids = data["ids"]
+            prompts = data["input"]
+            # they can be longer in partial cases
+            outputs = data["parsed"]
+            ids = data["ids"][:len(outputs)]
+            prompts = prompts[:len(outputs)]
+            mapping = {id: (prompt, output) for id, prompt, output in zip(ids, prompts, outputs)}
+
+        input_file2 = st.file_uploader("Choose a second file", key="data2")
+        data2 = load_json(input_file2)
+        if data2 is not None:
+            ids2 = data2["ids"]
+            prompts2 = data2["input"]
+            # they can be longer in partial cases
+            outputs2 = data2["parsed"]
+            ids2 = data2["ids"][:len(outputs2)]
+            prompts2 = prompts2[:len(outputs2)]
+            mapping2 = {id: (prompt, output) for id, prompt, output in zip(ids2, prompts2, outputs2)}

+
+
+
 col1, col2 = st.columns([1, 3], gap="large")

-if corpus is not None and queries is not None and triples is not None:
+if data is not None:
+    joint_ids = ids if data2 is None else list(set(ids2).intersection(ids))
+    # print(f"Not using ids {set(ids) - set(joint_ids)} and {set(ids2) - set(joint_ids)}")
+
     with st.sidebar:
         st.success("All files uploaded")

     with col1:
-        set_of_cols = set(triples_pos_pids) if triples2 is None else set(triples2_pos_pids).intersection(triples_pos_pids)
+        set_of_cols = joint_ids
         container_for_nav = st.container()
         name_of_columns = sorted([item for item in set_of_cols])
         instances_to_use = name_of_columns
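
Note: the rewritten sidebar expects each file to be a JSON object with parallel "ids", "input", and "parsed" lists, truncating "ids" and "input" to len("parsed") so partially completed runs still load. A plausible miniature of such a file, inferred from the keys the code reads (all values invented for illustration):

import json

example = {
    "ids": ["q1", "q2", "q3"],
    "input": ["prompt for q1", "prompt for q2", "prompt for q3"],
    # One fewer parsed output than prompts: a partial run.
    "parsed": [{"answer": "first"}, {"answer": "second"}],
}
with open("example_output.json", "w") as fout:
    json.dump(example, fout)
# The app would keep only q1/q2 and their prompts, dropping q3.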
@@ -220,144 +149,43 @@ if corpus is not None and queries is not None and triples is not None:
         selectbox_instance = container_for_nav.selectbox("Select instance by ID", ["Overview"] + name_of_columns, on_change=sync_from_drop, key="selectbox_instance")
         st.divider()

-        # get average words lengths for generated queries
-        queries1 = [queries[str(triples_map[str(inst_num)][0][0])] for inst_num in instances_to_use]
-        triples1_query_len = [len(query.split()) for query in queries1]
-        df_len = pd.DataFrame({"triples": triples1_name, "query_len": triples1_query_len})
-
-        if triples2 is not None:
-            queries2 = [queries[str(triples2_map[str(inst_num)][0][0])] for inst_num in instances_to_use]
-            triples2_query_len = [len(query.split()) for query in queries2]
-            df_len = pd.concat([df_len, pd.DataFrame({"triples": triples2_name, "query_len": triples2_query_len})])
-        else:
-            queries2 = None
-            triples2_pos_pids = None
-            triples2_map = None
-            triples2_pos_pids = None
-
-        # now load E5-small-v2 and calculate the cosine similarity of query and docs
-        df_len, df_sim_og, queries_to_return = create_sim_chart(triples_map, triples1_name, queries1, corpus, instances_to_use, triples2_map, triples2_pos_pids, triples2_name, queries2)
-        df_sim = df_sim_og.copy()

-        # make a plotly chart with the lengths (with both triples if available)
-        st.subheader("Query Lengths")
-        fig = px.histogram(df_len, x="query_len", color="triples", marginal="box", title="Query Lengths", barmode="overlay", color_discrete_map={triples1_name: "blue", triples2_name: "red"})
-        fig.update_layout(
-            height=400,
-            width=250
-        )
-        st.plotly_chart(fig)
-
-        # make a plotly plot of the cosine similarities like the above
-        st.subheader("Cosine Similarity")
-        fig = px.histogram(df_sim, x="cosine_similarity_pos", color="triples", marginal="box", title="Cosine Similarity", barmode="overlay", color_discrete_map={triples1_name: "blue", triples2_name: "red"})
-        fig.update_layout(
-            height=400,
-            width=250
-        )
-        st.plotly_chart(fig)
-
-        # make a plotly plot of the cosine similarities like the above
-        df_sim["Diff"] = df_sim["cosine_similarity_pos"] - df_sim["cosine_similarity_neg"]
-        st.subheader("Cosine Similarity")
-        fig = px.histogram(df_sim, x="Diff", color="triples", marginal="box", title="Cosine Similarity Diff (Pos - Neg)", barmode="overlay", color_discrete_map={triples1_name: "blue", triples2_name: "red"})
-        fig.update_layout(
-            height=400,
-            width=250
-        )
-        st.plotly_chart(fig)
-
-        # make a checkbox that if checked will download a CSV of the queries to return
-        if st.checkbox("Download data as JSONL"):
-            df_of_queries_to_return = pd.DataFrame({
-                triples1_name: [item[0] for item in queries_to_return],
-                triples2_name: [item[1] for item in queries_to_return]
-            })
-            st.download_button(
-                label="Download data as JSONL",
-                data=convert_df(df_of_queries_to_return),
-                file_name='queries_to_return.jsonl',
-                mime='text/json',
-            )
-
-
     with col2:
         # get instance number
         inst_index = number_of_col

         if inst_index >= 0:
-            inst_num = instances_to_use[inst_index]
-
-            st.markdown("<h1 style='text-align: center; color: black;text-decoration: underline;'>Editor</h1>", unsafe_allow_html=True)
-
-
-
-            if triples2 is not None:
-                triples_1_col, triples_2_col = st.columns([1, 1], gap="small")
-
-                with triples_1_col:
-                    container = st.container()
-                    container.divider()
-                    container.subheader(f"Triples 1: {triples1_name}")
-                    container.divider()
-                    container.subheader(f"Query")
-                    all_triples = triples_map[str(inst_num)][0]
-                    query_id = all_triples[0]
-                    query_text = queries[str(query_id)].strip()
-                    query_input = container.markdown(f"QID: {inst_num}\n\n{query_text}")
-                    container.divider()
-                    doc_texts = [(pos, neg, corpus[pos], corpus[neg]) for qid, pos, neg in [all_triples]]
-
-                    for i, (pos, neg, pos_text, neg_text) in enumerate(doc_texts):
-                        container.markdown(f"## Positive")
-                        container.markdown(f"\n{extract_doc_text(pos_text)}", True)
-                        container.markdown(f"## Negative")
-                        container.markdown(f"\n{extract_doc_text(neg_text)}", True)
-
-                with triples_2_col:
-                    container = st.container()
-                    container.divider()
-                    container.subheader(f"Triples 2: {triples2_name}")
-                    container.divider()
-                    container.subheader(f"Query")
-
-                    # all_triples = triples_map[str(inst_num)][0]
-                    # index_of_inst_num = triples_pos_pids.index(inst_num)
-                    all_triples2 = triples2_map[str(inst_num)][0]
-
-                    query_text = queries[str(all_triples2[0])].strip()
-                    query_input = container.markdown(f"QID: {all_triples2[0]}\n\n{query_text}")
-                    container.divider()
-                    doc_texts = [(pos, neg, corpus[pos], corpus[neg]) for qid, pos, neg in [all_triples2]]
-
-                    for i, (pos, neg, pos_text, neg_text) in enumerate(doc_texts):
-                        container.markdown(f"## Positive")
-                        container.markdown(f"\n{extract_doc_text(pos_text)}", True)
-                        container.markdown(f"## Negative")
-                        container.markdown(f"\n{extract_doc_text(neg_text)}", True)
+            prompt, output = mapping[joint_ids[inst_index]]
+            if data2 is not None:
+                prompt2, output2 = mapping2[joint_ids[inst_index]]
+                col1_out, col2_out = st.columns([1, 1], gap="small")
+
+                with col1_out:
+                    st.markdown("<h2 style='text-align: center; color: black;'>Prompt</h2>", unsafe_allow_html=True)
+                    st.write(prompt)
+
+                    st.markdown("<h2 style='text-align: center; color: black;'>Output</h2>", unsafe_allow_html=True)
+                    st.json(output)
+
+                with col2_out:
+                    st.markdown("<h2 style='text-align: center; color: black;'>Prompt</h2>", unsafe_allow_html=True)
+                    st.write(prompt2)
+
+                    st.markdown("<h2 style='text-align: center; color: black;'>Output</h2>", unsafe_allow_html=True)
+                    st.json(output2)
+


         else:
-            container = st.container()
-            container.divider()
-            st.subheader(f"Triples: {triples1_name}")
-
-            container.subheader(f"Query")
-            print(inst_num)
-            all_triples = triples_map[str(inst_num)][0]
-            print(all_triples)
-            query_text = queries[str(all_triples[0])].strip()
-            query_input = container.markdown(f"QID: {all_triples[0]}\n\n{query_text}")
-            container.divider()
-            doc_texts = [(pos, neg, corpus[pos], corpus[neg]) for qid, pos, neg in [all_triples]]
-
-            for i, (pos, neg, pos_text, neg_text) in enumerate(doc_texts):
-                container.markdown(f"## Positive")
-                container.markdown(f"\n{extract_doc_text(pos_text)}", True)
-                container.markdown(f"## Negative")
-                container.markdown(f"\n{extract_doc_text(neg_text)}", True)
+            st.markdown("<h1 style='text-align: center; color: black;text-decoration: underline;'>Editor</h1>", unsafe_allow_html=True)
+
+            st.markdown("<h2 style='text-align: center; color: black;'>Prompt</h2>", unsafe_allow_html=True)
+            st.write(prompt)

+            st.markdown("<h2 style='text-align: center; color: black;'>Output</h2>", unsafe_allow_html=True)
+            st.json(output)

-
+


@@ -377,4 +205,4 @@ if corpus is not None and queries is not None and triples is not None:


 else:
-    st.warning("Please choose a dataset and upload a run file. If you chose \"custom\" be sure that you uploaded all files (queries, corpus, triples)")
+    st.warning("Please choose an output file from prompting and upload it")
 
dataset_loading.py CHANGED
@@ -5,7 +5,6 @@ import pandas as pd
 from collections import defaultdict
 import json
 import copy
-import plotly.express as px



@@ -130,6 +129,12 @@ def load_jsonl(f):
     return did2text, sub_did2text


+@st.cache_data
+def load_json(f):
+    if f is None:
+        return None
+    return json.load(f)
+

 @st.cache_data(persist="disk")
 def get_dataset(dataset_name: str, input_fields_doc, input_fields_query):
 
5
  from collections import defaultdict
6
  import json
7
  import copy
 
8
 
9
 
10
 
 
129
  return did2text, sub_did2text
130
 
131
 
132
+ @st.cache_data
133
+ def load_json(f):
134
+ if f is None:
135
+ return None
136
+ return json.load(f)
137
+
138
 
139
  @st.cache_data(persist="disk")
140
  def get_dataset(dataset_name: str, input_fields_doc, input_fields_query):
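
Note: load_json follows the pattern of the module's other loaders: st.file_uploader returns None until the user picks a file, so the guard avoids calling json.load(None), and st.cache_data memoizes the parse across Streamlit's script reruns. A minimal usage sketch (UI strings are illustrative):

import json
import streamlit as st

@st.cache_data
def load_json(f):
    if f is None:  # no file chosen yet
        return None
    return json.load(f)

uploaded = st.file_uploader("Choose a JSON file", key="data")
data = load_json(uploaded)
if data is None:
    st.warning("Upload a file to continue")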
example_output.json ADDED
The diff for this file is too large to render. See raw diff