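# NOTE(review): the next line preserves non-Python text found at the top of this
# file (apparently page-scrape residue): "Spaces: Runtime error / Runtime error".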
import ast
import copy
import difflib
import json
import os
import pathlib
import re
import string
from collections import Counter
from collections import defaultdict
from html import escape

import pandas as pd
import streamlit as st
import tqdm
def generate_diff_html_word_level(text1, text2):
    """Render a word-level diff between two texts as an HTML snippet.

    Both texts are split on whitespace, so runs of spaces/newlines are
    collapsed: the output always joins words with a single space.

    Args:
        text1: The "old" text. ``None`` is treated as an empty string.
        text2: The "new" text. ``None`` is treated as an empty string.

    Returns:
        A ``<pre>`` element (as ``str``) in which words only present in
        ``text1`` are wrapped in a pink ``<del>``, words only present in
        ``text2`` are wrapped in a green ``<ins>``, and unchanged words are
        emitted as-is. All words are HTML-escaped.
    """
    del_open = '<del style="background-color: #fbb6ce;">'
    ins_open = '<ins style="background-color: #b7e4c7;">'

    # Tolerate missing texts instead of failing on ``None.split()``.
    words1 = (text1 or "").split()
    words2 = (text2 or "").split()

    pieces = []
    matcher = difflib.SequenceMatcher(None, words1, words2)
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'equal':
            pieces.append(escape(' '.join(words1[i1:i2])))
            continue
        # 'replace' emits the deletion first, then the insertion.
        if tag in ('replace', 'delete'):
            pieces.append(del_open + escape(' '.join(words1[i1:i2])) + '</del>')
        if tag in ('replace', 'insert'):
            pieces.append(ins_open + escape(' '.join(words2[j1:j2])) + '</ins>')

    final_html = ' '.join(pieces)
    return f'<pre style="white-space: pre-wrap;">{final_html}</pre>'
# Set before the rest of the app runs.
# NOTE(review): presumably selects the pure-Python protobuf backend - confirm
# this is still required by the deployed dependencies.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

# Use the full browser width for the two-column layout built further down.
st.set_page_config(layout="wide")

# Not read anywhere in the visible part of this file; kept for compatibility.
query_input = None
def convert_df(df):
    """Serialize a DataFrame to UTF-8 encoded CSV bytes (no index column).

    Note: this function is not cached; the CSV is rebuilt on every call.

    Args:
        df: The ``pandas.DataFrame`` to serialize.

    Returns:
        ``bytes`` holding the CSV text, with ``"`` as the quote character.
    """
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')
def get_current_data():
    """Build the CSV download payload from the currently ticked checkboxes.

    Reads the module-level ``current_checkboxes`` list of
    ``(combined_id, checked)`` pairs, where ``combined_id`` has the form
    ``"<qid>-----<pid>"``. Every ticked pair becomes one row with
    ``is_relevant`` set to 0.

    Returns:
        CSV ``bytes`` (via ``convert_df``) with columns ``qid``, ``pid`` and
        ``is_relevant``. The header row is emitted even when nothing is ticked.
    """
    rows = []
    for id_num, checked in current_checkboxes:
        if not checked:
            continue
        # Split on the first separator only so an id containing "-----"
        # again does not break the two-way unpacking.
        qid, pid = id_num.split("-----", 1)
        rows.append({"qid": qid, "pid": pid, "is_relevant": 0})

    # Explicit columns keep the CSV schema stable for an empty selection.
    frame = pd.DataFrame(rows, columns=["qid", "pid", "is_relevant"])
    return convert_df(frame)
# Characters that get a backslash prefix in escape_markdown (the backslash
# itself is included so literal backslashes survive rendering).
_MARKDOWN_SPECIAL_CHARS = '\\`*_{}[]()#+-.!|$'
_MARKDOWN_ESCAPE_TABLE = str.maketrans(
    {char: '\\' + char for char in _MARKDOWN_SPECIAL_CHARS}
)


def escape_markdown(text):
    """Backslash-escape Markdown special characters in ``text``.

    A double backtick (two consecutive backticks) is first replaced by a
    plain double quote; every remaining special character is then prefixed
    with a single backslash. ``$`` is escaped exactly once (``\\$``) rather
    than being pre-escaped and then escaped again.

    Args:
        text: The string to escape.

    Returns:
        The escaped string.
    """
    text = text.replace("``", "\"")
    return text.translate(_MARKDOWN_ESCAPE_TABLE)
# First run of a session: start with no instance selected (-1).
if 'cur_instance_num' not in st.session_state:
    st.session_state.cur_instance_num = -1

with st.sidebar:
    st.title("Options")
def load_data():
    """Load ``diffs.jsonl`` from the working directory into a DataFrame.

    The file is read as UTF-8 JSON Lines: one JSON object per line. Blank
    lines are skipped.

    Returns:
        A ``pandas.DataFrame`` with one row per JSON object.

    Raises:
        FileNotFoundError: If ``diffs.jsonl`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open("diffs.jsonl", "r", encoding="utf-8") as f:
        records = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame(records)
df = load_data()

# Grouping key: the query id as a string (column-wise cast instead of a
# row-by-row apply).
df["id_combined"] = df["qid"].astype(str)

# Aggregate so that each query id is one row and every other column holds the
# list of per-document values for that query.
df = df.groupby("id_combined").agg(lambda x: x.tolist()).reset_index()

# print details about the diff type
# diff_types = df["diff_type"].value_counts()
# st.write(diff_types)
# # print how many titles differ by looping through each instances
# diff_titles = 0
# for index, row in df.iterrows():
#     if row["old"] is not None and row["new"] is not None and row["old"][0]["title"] != row["new"][0]["title"]:
#         diff_titles += 1
# st.write(f" Number of titles that differ: {diff_titles}")

# Lookup from query id to its aggregated record (a dict of lists).
original_map = {item["id_combined"]: item for item in df.to_dict(orient="records")}

# Narrow navigation column on the left, wide editor column on the right.
col1, col2 = st.columns([1, 3], gap="large")

with st.sidebar:
    st.success("All files uploaded")
with col1:
    ids = df["id_combined"].tolist()
    set_of_cols = set(ids)
    container_for_nav = st.container()
    # Sorted ids define the index <-> id mapping used by both widgets below.
    name_of_columns = sorted(set_of_cols)
    instances_to_use = name_of_columns
    st.title("Instances")

    def sync_from_drop():
        """Mirror the id chosen in the select box into the numeric index."""
        if st.session_state.selectbox_instance == "Overview":
            st.session_state.number_of_col = -1
            st.session_state.cur_instance_num = -1
        else:
            index_of_obj = name_of_columns.index(st.session_state.selectbox_instance)
            st.session_state.number_of_col = index_of_obj
            st.session_state.cur_instance_num = index_of_obj

    def sync_from_number():
        """Mirror the numeric index into the select box (-1 means Overview)."""
        st.session_state.cur_instance_num = st.session_state.number_of_col
        if st.session_state.number_of_col == -1:
            st.session_state.selectbox_instance = "Overview"
        else:
            st.session_state.selectbox_instance = name_of_columns[st.session_state.number_of_col]

    # Both widgets write to session state under their keys; the callbacks
    # above keep the two in step.
    number_of_col = container_for_nav.number_input(
        min_value=-1,
        step=1,
        max_value=len(instances_to_use) - 1,
        on_change=sync_from_number,
        label=f"Select instance by index (up to **{len(instances_to_use) - 1}**)",
        key="number_of_col",
    )
    selectbox_instance = container_for_nav.selectbox(
        "Select instance by ID",
        ["Overview"] + name_of_columns,
        on_change=sync_from_drop,
        key="selectbox_instance",
    )
    st.divider()
with col2:
    # Index picked in the navigation column; a negative value means "Overview".
    inst_index = number_of_col

    if inst_index >= 0:
        inst_num = instances_to_use[inst_index]

        st.markdown("<h1 style='text-align: center; color: black;text-decoration: underline;'>Editor</h1>", unsafe_allow_html=True)

        # (combined_id, checked) pairs; read by get_current_data() for the CSV.
        current_checkboxes = []

        instances = original_map[inst_num]
        diff_types = instances['diff_type']
        num_new = Counter(diff_types)["new"]

        container = st.container()
        container.subheader(f"Combined ID: {inst_num} with {len(instances['pid'])} docs ({num_new} are new)")
        container.divider()

        # Query details are taken from the first grouped row of this query id.
        query_info = instances['query_info'][0]
        container.subheader("Query Info")
        container.markdown(f"Query ID: {query_info['_id']}")
        container.markdown(f"**Query**:{query_info['text']}")

        processed_diff = generate_diff_html_word_level(query_info['instruction_og'], query_info['instruction_changed'])
        with container.container():
            st.markdown("**Instruction**: " + processed_diff, unsafe_allow_html=True)
        container.divider()

        for i, pid in enumerate(instances["pid"]):
            old_doc = instances['old'][i]
            new_doc = instances['new'][i]

            container.markdown(f"Doc {pid}: **{diff_types[i] if diff_types[i] == 'new' else 'changed'}**")

            # Previous qrel score: relevant (> 0) is shown in blue, otherwise orange.
            qrel_score = instances["qrel_score"][i]
            if qrel_score > 0:
                container.markdown("<h3>Previous: <span style='color: blue;'>Relevant</span></h3>", unsafe_allow_html=True)
            else:
                container.markdown("<h3>Previous: <span style='color: orange;'>Not Relevant</span></h3>", unsafe_allow_html=True)

            # str() on both parts so a non-string pid cannot raise TypeError.
            combined_id = str(instances["qid"][i]) + "-----" + str(pid)

            # Title: prefer the old document's title, fall back to the new
            # one, and render an empty line when neither document exists.
            container.subheader("Title")
            if old_doc is not None:
                container.markdown(f"{old_doc[0]['title']}")
            elif new_doc is not None:
                container.markdown(f"{new_doc[0]['title']}")
            else:
                container.markdown("")

            # NOTE(review): the original indentation was lost, so it is unclear
            # whether this section was meant to appear only when the titles
            # differ; it is shown whenever both documents exist.
            if old_doc is not None and new_doc is not None:
                container.subheader("Title Diff")
                processed_diff = generate_diff_html_word_level(old_doc[0]['title'], new_doc[0]['title'])
                with container.container():
                    st.markdown(processed_diff, unsafe_allow_html=True)

            on = container.toggle('Show original text', key="toggle" + combined_id)
            if on:
                container.subheader("Original Text")
                if old_doc is not None:
                    original_input = container.markdown(old_doc[0]['text'])
                else:
                    original_input = None
                    container.markdown("")

            container.subheader("New Text")
            # generated_input = container.markdown(instances['new'][i][0]['text'])
            # # Diff
            # if original_input is not None and generated_input is not None:
            #     container.subheader("Diff")
            #     processed_diff = generate_diff_html_word_level(instances['old'][i][0]['text'], instances['new'][i][0]['text'])
            #     with container.container():
            #         st.markdown(processed_diff, unsafe_allow_html=True)
            # container.subheader("Full Doc")
            container.markdown(instances['full_doc'][i])

            current_checkboxes.append((combined_id, container.checkbox(f'{combined_id} is Non-Relevant', key=combined_id)))
            container.divider()

        # The download button is only rendered after the user opts in.
        if st.checkbox("Download data as CSV"):
            st.download_button(
                label="Download data as CSV",
                data=get_current_data(),
                file_name=f'annotation_query_{inst_num}_double_check.csv',
                mime='text/csv',
            )
    else:
        # Nothing selected.
        st.title("Overview")