Orion Weller committed on
Commit
53fb3e2
1 Parent(s): 39ee16f
Files changed (5) hide show
  1. app.py +242 -0
  2. chunked_data.jsonl +0 -0
  3. generated_data.json +0 -0
  4. packages.txt +1 -0
  5. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import pathlib
4
+ import pandas as pd
5
+ from collections import defaultdict
6
+ import json
7
+ import ast
8
+ import copy
9
+ import re
10
+ import tqdm
11
+
12
+
13
+ import pandas as pd
14
+ from collections import Counter
15
+ import string
16
+ import os
17
+ import streamlit as st
18
+ import difflib
19
+ from html import escape
20
+
21
+
22
def generate_diff_html_word_level(text1, text2):
    """
    Build an HTML word-level diff of text1 against text2.

    Both inputs are split on whitespace, so runs of spaces and newlines are
    collapsed to single spaces in the output. Words only in text1 are wrapped
    in a pink <del>, words only in text2 in a green <ins>; all words are
    HTML-escaped. The result is wrapped in a <pre> with pre-wrap styling.
    """
    old_words = text1.split()
    new_words = text2.split()

    def _removed(lo, hi):
        # Markup for a run of words that exists only in text1.
        return '<del style="background-color: #fbb6ce;">' + escape(' '.join(old_words[lo:hi])) + '</del>'

    def _added(lo, hi):
        # Markup for a run of words that exists only in text2.
        return '<ins style="background-color: #b7e4c7;">' + escape(' '.join(new_words[lo:hi])) + '</ins>'

    pieces = []
    opcodes = difflib.SequenceMatcher(None, old_words, new_words).get_opcodes()
    for tag, i1, i2, j1, j2 in opcodes:
        if tag == 'equal':
            pieces.append(escape(' '.join(old_words[i1:i2])))
            continue
        # 'replace' emits both halves, deletion first.
        if tag in ('replace', 'delete'):
            pieces.append(_removed(i1, i2))
        if tag in ('replace', 'insert'):
            pieces.append(_added(j1, j2))

    # Keep a replaced pair visually attached with a non-breaking space.
    final_html = ' '.join(pieces)
    final_html = final_html.replace('</del> <ins', '</del>&nbsp;<ins')
    return f'<pre style="white-space: pre-wrap;">{final_html}</pre>'
48
+
49
+
50
+
51
# Set the protobuf implementation environment variable for this process.
# NOTE(review): presumably selects the pure-Python protobuf backend; confirm it is still required.
+ os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
52
# Request Streamlit's "wide" page layout.
+ st.set_page_config(layout="wide")
53
+
54
# Module-level placeholders; neither name is read elsewhere in the visible file.
+ current_checkboxes = []
55
+ query_input = None
56
+
57
+ @st.cache_data
58
def convert_df(df):
    """Serialize a DataFrame to CSV (no index column) and return it as UTF-8 bytes."""
    # path_or_buf=None makes to_csv return the CSV as a string instead of writing a file.
    csv_text = df.to_csv(path_or_buf=None, index=False, quotechar='"')
    return csv_text.encode('utf-8')
61
+
62
+
63
+
64
+ @st.cache_data
65
def escape_markdown(text):
    """Return text with Markdown-special characters backslash-escaped.

    Doubled backticks are first rewritten to a double quote. After that,
    every character in the special set (which includes the backslash itself
    and the dollar sign) is prefixed with exactly one backslash.

    Args:
        text: The raw string to escape.

    Returns:
        The escaped string. An empty string is returned unchanged.
    """
    # Collapse doubled backticks into a double quote before single backticks
    # get escaped below.
    text = text.replace("``", "\"")

    # "$" is escaped once by the table. Replacing it with "\$" up front (as
    # this function previously did) caused the inserted backslash to be
    # escaped again, so a dollar sign rendered as a literal "\$".
    special_chars = '\\`*_{}[]()#+-.!|$'
    escape_table = str.maketrans({char: "\\" + char for char in special_chars})

    # Single pass over the text; characters not in the table are kept as-is.
    return text.translate(escape_table)
76
+
77
+
78
+
79
+
80
# Seed the selected-instance index only when it is not already in session state.
# -1 is the value the navigation callbacks store for the "Overview" entry.
+ if 'cur_instance_num' not in st.session_state:
81
+ st.session_state.cur_instance_num = -1
82
+
83
+
84
def validate(config_option, file_loaded):
    """Show an error and stop when an option is selected but its file is missing.

    Nothing happens when config_option is the string "None" or when
    file_loaded is not None. Otherwise st.error is called with a message
    naming the option, followed by st.stop().
    """
    option_disabled = config_option == "None"
    if option_disabled or file_loaded is not None:
        return
    st.error("Please upload a file for " + config_option)
    st.stop()
88
+
89
+
90
+
91
+
92
+
93
# Sidebar heading.
+ with st.sidebar:
94
+ st.title("Options")
95
+
96
+
97
+ @st.cache_data
98
def load_chunked_data():
    """Load chunked_data.jsonl from the working directory into a DataFrame.

    Each non-blank line of the file is parsed as one JSON value and becomes
    one row. A "prompt" column, when present, is renamed to "text".

    Returns:
        pandas.DataFrame with one row per parsed line.

    Raises:
        FileNotFoundError: If chunked_data.jsonl does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open("chunked_data.jsonl", "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. an extra empty line at the end of the file)
        # instead of letting json.loads fail on them.
        records = [json.loads(line) for line in f if line.strip()]

    df = pd.DataFrame(records)
    # rename prompt column to text
    return df.rename(columns={"prompt": "text"})
107
+
108
+
109
def load_generated_data():
    """Load generated_data.json and flatten its "outputs" mapping into a DataFrame.

    The file must contain a JSON object with an "outputs" key that maps a
    venue name to either a dict of fields or a string holding a Python
    literal for such a dict. Each venue becomes one row with a "venue"
    column plus one column per field.

    Returns:
        pandas.DataFrame with one row per entry in "outputs".

    Raises:
        FileNotFoundError: If generated_data.json does not exist.
        KeyError: If the top-level object has no "outputs" key.
        ValueError: If an entry is not a dict after optional literal parsing.
    """
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open("generated_data.json", "r", encoding="utf-8") as fin:
        data = json.load(fin)["outputs"]

    new_insts = []
    for key, value in data.items():
        if isinstance(value, str):
            # Some entries are stored as the text of a Python literal;
            # literal_eval parses literals only and does not execute code.
            value = ast.literal_eval(value)

        if not isinstance(value, dict):
            raise ValueError(f"Invalid type {type(value)}: {value}")

        # "venue" comes first; a "venue" field inside value would override it,
        # exactly as the previous key-by-key copy did.
        new_insts.append({"venue": key, **value})

    return pd.DataFrame(new_insts)
129
+
130
# Load the chunked source records and the generated outputs from disk.
+ original_df = load_chunked_data()
131
+ generated_data = load_generated_data()
132
+
133
+
134
def combine_text(item):
    """Flatten one generated record into a single display string.

    Every field other than "venue" whose value is a non-empty sequence of
    strings contributes one line: its entries joined with ", " and ended
    with a newline. Fields whose value is None, a float (such as NaN), the
    literal string "[]", or empty are skipped.

    Each contributed line is prefixed with a comma and only the very first
    comma is removed, so later lines keep a leading ",".
    NOTE(review): that separator is preserved from the earlier behavior;
    confirm it is intended.

    Args:
        item: Mapping of field name to value for one venue.

    Returns:
        The concatenated lines, or "" when no field contributes.
    """
    chunks = []
    for key, value in item.items():
        # isinstance also covers float subclasses, which have no len().
        if key == "venue" or value is None or isinstance(value, float):
            continue
        if value == "[]" or len(value) == 0:
            continue
        chunks.append(f",{', '.join(value)}\n")

    combined = "".join(chunks)
    # Indexing combined[0] raised IndexError when every field was skipped;
    # startswith is safe on the empty string.
    return combined[1:] if combined.startswith(",") else combined
143
+
144
+
145
# Lookup tables keyed by venue: the original text, and the generated record
# flattened to a string by combine_text.
+ original_map = {item["venue"]: item["text"] for item in original_df.to_dict(orient="records")}
146
+ generated_map = {item["venue"]: combine_text(item) for item in generated_data.to_dict(orient="records")}
147
+
148
+
149
# Two page columns with relative widths 1 and 3.
+ col1, col2 = st.columns([1, 3], gap="large")
150
+
151
# Status message shown in the sidebar.
+ with st.sidebar:
152
+ st.success("All files uploaded")
153
+
154
with col1:
    # Sorted, de-duplicated venue names feed both navigation widgets.
    ids = original_df["venue"].tolist()
    set_of_cols = set(ids)
    # Created before the title so the navigation widgets are placed in it.
    container_for_nav = st.container()
    name_of_columns = sorted(set_of_cols)
    instances_to_use = name_of_columns
    st.title("Instances")

    def sync_from_drop():
        """Copy the select box choice into the index widget and cur_instance_num."""
        chosen = st.session_state.selectbox_instance
        # "Overview" maps to -1; any other entry maps to its sorted position.
        position = -1 if chosen == "Overview" else name_of_columns.index(chosen)
        st.session_state.number_of_col = position
        st.session_state.cur_instance_num = position

    def sync_from_number():
        """Copy the index widget value into cur_instance_num and the select box."""
        position = st.session_state.number_of_col
        st.session_state.cur_instance_num = position
        st.session_state.selectbox_instance = (
            "Overview" if position == -1 else name_of_columns[position]
        )

    number_of_col = container_for_nav.number_input(
        label=f"Select instance by index (up to **{len(instances_to_use) - 1}**)",
        min_value=-1,
        max_value=len(instances_to_use) - 1,
        step=1,
        key="number_of_col",
        on_change=sync_from_number,
    )
    selectbox_instance = container_for_nav.selectbox(
        "Select instance by ID",
        ["Overview"] + name_of_columns,
        key="selectbox_instance",
        on_change=sync_from_drop,
    )
184
+ st.divider()
185
+
186
+
187
with col2:
    # The index widget's value selects the instance; negative means overview.
    inst_index = number_of_col

    if inst_index >= 0:
        inst_num = instances_to_use[inst_index]

        st.markdown("<h1 style='text-align: center; color: black;text-decoration: underline;'>Editor</h1>", unsafe_allow_html=True)

        container = st.container()
        container.subheader(f"Venue: {inst_num}")
        container.divider()

        # Look each text up once and reuse it for display, diff and editing.
        original_text = original_map[inst_num]
        generated_text = generated_map[inst_num]

        container.subheader("Original OCR Text")
        original_input = container.markdown(original_text)
        container.divider()

        container.subheader("Generated Text")
        generated_input = container.markdown(generated_text)
        container.divider()

        # Diff: guard on the two texts themselves. The previous check tested
        # generated_input, the value returned by container.markdown, rather
        # than generated_text.
        if original_text is not None and generated_text is not None:
            container.subheader("Diff")
            processed_diff = generate_diff_html_word_level(original_text, generated_text)
            # NOTE(review): the border argument appears to need a newer
            # Streamlit than the 1.24.1 pinned in requirements.txt; confirm.
            with container.container(border=True):
                st.markdown(processed_diff, unsafe_allow_html=True)

        # editable text, starting from the generated text
        editable_text = container.text_area("Edit the generated text", value=generated_text, height=300)

        container.divider()
        # download the editable text and venue name
        st.download_button(
            f"Download {inst_num} as CSV",
            convert_df(pd.DataFrame([{"venue": inst_num, "text": editable_text}])),
            f"{inst_num}.csv",
            "text/csv",
            key=f"download_{inst_num}"
        )

    # none checked
    elif inst_index < 0:
        st.title("Overview")
chunked_data.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
generated_data.json ADDED
The diff for this file is too large to render. See raw diff
 
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ default-jre
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ pandas==2.0.3
2
+ streamlit==1.24.1
3
+ plotly==5.15.0
4
+ protobuf==3.20.0
5
+ beautifulsoup4==4.12.2
6
+ nltk==3.7
7
+ tqdm==4.66.2