orionweller committed
Commit 0bc0c39 · 1 parent: 7e588ed
current updates

Files changed:
- app.py +64 -6
- find_splitting_words.py +11 -3
- requirements.txt +2 -1
app.py
CHANGED
@@ -5,6 +5,8 @@ import pandas as pd
 from collections import defaultdict
 import json
 import copy
+import re
+import tqdm
 import plotly.express as px
 from find_splitting_words import find_dividing_words
 
@@ -49,7 +51,39 @@ def get_current_data():
     # return the data as a CSV pandas
     return convert_df(pd.DataFrame(cur_query_data))
 
+@st.cache_data
+def escape_markdown(text):
+    # List of characters to escape
+    # Adding backslash to the list of special characters to escape itself as well
+    text = text.replace("``", "\"")
+    special_chars = ['\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '#', '+', '-', '.', '!', '|', "$"]
+
+    # Escaping each special character
+    escaped_text = "".join(f"\\{char}" if char in special_chars else char for char in text)
+
+    return escaped_text
 
+@st.cache_data
+def highlight_text(text, splitting_words):
+    # remove anything that will mess up markdown
+    text = escape_markdown(text)
+    changed = False
+    if not len(splitting_words):
+        return text, changed
+
+    def replace_function(match):
+        return f'<span style="background-color: #FFFF00">{match.group(0)}</span>'
+
+    # Compile a single regular expression pattern for all splitting words
+    pattern = '|'.join([re.escape(word) for word in splitting_words])
+
+    # Perform case-insensitive replacement
+    new_text, num_subs = re.subn(pattern, replace_function, text, flags=re.IGNORECASE)
+
+    if num_subs > 0:
+        changed = True
+
+    return new_text, changed
 
 
 if 'cur_instance_num' not in st.session_state:
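Note (not part of the commit): a minimal standalone sketch of the highlighting approach added above, using plain re and skipping the escape_markdown pass and Streamlit caching. The sample strings are made up.

import re

def highlight(text, words):
    # Mirror the replace_function / re.subn pattern from highlight_text:
    # wrap every case-insensitive occurrence of any selected word in a yellow span.
    if not words:
        return text, False
    pattern = "|".join(re.escape(w) for w in words)
    new_text, num_subs = re.subn(
        pattern,
        lambda m: f'<span style="background-color: #FFFF00">{m.group(0)}</span>',
        text,
        flags=re.IGNORECASE,
    )
    return new_text, num_subs > 0

print(highlight("Caffeine and sleep quality", ["sleep"]))
# ('Caffeine and <span style="background-color: #FFFF00">sleep</span> quality', True)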
@@ -94,7 +128,7 @@ with st.sidebar:
 
     z = st.header("Analysis Options")
     # sliderbar of how many Top N to choose
-    n_relevant_docs = st.slider("Number of relevant docs", 1, 999,
+    n_relevant_docs = st.slider("Number of relevant docs", 1, 999, 300)
 
 
 col1, col2 = st.columns([1, 3], gap="large")
@@ -169,15 +203,39 @@ if corpus is not None and queries is not None and qrels is not None:
     # relevant
     relevant_docs = list(qrels[str(inst_num)].keys())[:n_relevant_docs]
     doc_texts = [(doc_id, corpus[doc_id]["title"] if "title" in corpus[doc_id] else "", corpus[doc_id]["text"]) for doc_id in relevant_docs]
-    splitting_words = find_dividing_words(
+    splitting_words = find_dividing_words([item[1] + " " + item[2] for item in doc_texts])
 
-
+    # make a selectbox of these splitting words (allow multiple)
+    container.subheader("Splitting Words")
+    container.text("Select words that are relevant to the query")
+    splitting_word_select = container.multiselect("Splitting Words", splitting_words, key="splitting_words")
+    container.divider()
 
-    container.subheader(f"Relevant Documents ({len(list(qrels[str(inst_num)].keys()))})")
     current_checkboxes = []
-
+    total_changed = 0
+    highlighted_texts = []
+    highlighted_titles = []
+    for (docid, title, text) in tqdm.tqdm(doc_texts):
+        if not len(splitting_word_select):
+            highlighted_texts.append(text)
+            highlighted_titles.append(title)
+            continue
+        highlighted_text, changed_text = highlight_text(text, splitting_word_select)
+        highlighted_title, changed_title = highlight_text(title, splitting_word_select)
+        highlighted_titles.append(highlighted_title)
+        highlighted_texts.append(highlighted_text)
+        total_changed += int(int(changed_text) or int(changed_title))
+
+    container.subheader(f"Relevant Documents ({len(list(qrels[str(inst_num)].keys()))})")
+    container.subheader(f"Total have these words: {total_changed}")
+
+    container.divider()
+
+    for i, (docid, title, text) in enumerate(doc_texts):
+        container.markdown(f"## {docid}")
+        container.markdown(f"#### {highlighted_titles[i]}", True)
+        container.markdown(f"\n{highlighted_texts[i]}", True)
         current_checkboxes.append((docid, container.checkbox(f'{docid} is Non-Relevant', key=docid)))
-        container.text_area(f"{docid}:", text)
 
 
     container.divider()
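Note (not part of the commit): after the loop above, total_changed is the number of documents whose title or text matched at least one selected splitting word. A rough equivalent using plain substring matching instead of the regex path, with invented sample documents:

doc_texts = [
    ("d1", "Sleep studies", "Effects of caffeine on sleep onset"),
    ("d2", "Diet", "Protein intake and training"),
]
splitting_word_select = ["caffeine"]

total_changed = 0
for docid, title, text in doc_texts:
    # A document counts once if any selected word occurs in its title or text.
    matched = any(w.lower() in (title + " " + text).lower() for w in splitting_word_select)
    total_changed += int(matched)

print(total_changed)  # 1: only d1 mentions "caffeine"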
find_splitting_words.py
CHANGED
@@ -6,11 +6,14 @@ from nltk.tokenize import word_tokenize
 from collections import Counter
 import string
 import os
+import streamlit as st
 
 # Ensure you've downloaded the set of stop words the first time you run this
 import nltk
-
-nltk.
+# only download if they don't exist
+if not os.path.exists(os.path.join(nltk.data.find('corpora'), 'stopwords')):
+    nltk.download('punkt')
+    nltk.download('stopwords')
 
 def preprocess_document(doc):
     """
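Note (not part of the commit): the guard above relies on nltk.data.find('corpora'), which itself raises LookupError if no corpora directory is on the NLTK search path yet (for example on a fresh machine). A common alternative, shown here only as a hedged sketch and not what the commit does, is to probe each resource and download on LookupError:

import nltk

# Probe each resource; nltk.data.find raises LookupError when it is missing.
for resource, path in [("punkt", "tokenizers/punkt"), ("stopwords", "corpora/stopwords")]:
    try:
        nltk.data.find(path)
    except LookupError:
        nltk.download(resource)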
@@ -30,6 +33,7 @@ def preprocess_document(doc):
     stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
     return stemmed_tokens
 
+@st.cache_data
 def find_dividing_words(documents):
     """
     Identifies candidate words that might split the set of documents into two groups.
@@ -37,10 +41,13 @@ def find_dividing_words(documents):
     all_words = []
     per_doc_word_counts = []
 
+    i = 0
     for doc in documents:
+        print(i)
         preprocessed_doc = preprocess_document(doc)
         all_words.extend(preprocessed_doc)
         per_doc_word_counts.append(Counter(preprocessed_doc))
+        i += 1
 
     # Overall word frequency
     overall_word_counts = Counter(all_words)
@@ -50,8 +57,9 @@ def find_dividing_words(documents):
     candidate_words = []
     for word, count in overall_word_counts.items():
         doc_frequency = sum(1 for doc_count in per_doc_word_counts if doc_count[word] > 0)
-        if 0.
+        if 0.35 * num_docs <= doc_frequency <= 0.75 * num_docs:
             candidate_words.append(word)
+    print("Done with dividing words")
 
     return candidate_words
 
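Note (not part of the commit): the new condition keeps words whose document frequency lies between 35% and 75% of the documents (num_docs is assumed to be set to len(documents) elsewhere in the function). A toy illustration with made-up, already-stemmed token lists:

from collections import Counter

docs_tokens = [
    ["sleep", "caffein", "studi"],
    ["sleep", "diet", "protein"],
    ["caffein", "exercis", "studi"],
    ["diet", "exercis", "sleep"],
]
num_docs = len(docs_tokens)
per_doc_word_counts = [Counter(toks) for toks in docs_tokens]
overall_word_counts = Counter(tok for toks in docs_tokens for tok in toks)

# Keep words that occur in between 0.35 * num_docs and 0.75 * num_docs documents.
candidate_words = [
    word
    for word in overall_word_counts
    if 0.35 * num_docs
    <= sum(1 for doc_count in per_doc_word_counts if doc_count[word] > 0)
    <= 0.75 * num_docs
]
print(candidate_words)  # "protein" is dropped: it appears in only 1 of the 4 documents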
requirements.txt
CHANGED
@@ -3,4 +3,5 @@ streamlit==1.24.1
 plotly==5.15.0
 protobuf==3.20.0
 beautifulsoup4==4.12.2
-nltk==3.7
+nltk==3.7
+tqdm