jian-mo committed on
Commit
a3d290e
·
1 Parent(s): 9397200

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +196 -0
app.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import streamlit as st
3
+ import numpy as np
4
+ from pandas import DataFrame
5
+ from keybert import KeyBERT
6
+ # For Flair (Keybert)
7
+ from flair.embeddings import TransformerDocumentEmbeddings
8
+ import seaborn as sns
9
+ # For download buttons
10
+ from functionforDownloadButtons import download_button
11
+ import os
12
+ import json
13
+ from transformers import pipeline
14
+
15
+
16
# Page-level settings handed to Streamlit: the page title and its icon.
# NOTE(review): the "?" icon may be a placeholder for a lost emoji — confirm.
_page_settings = {
    "page_title": "E2E QA MINING",
    "page_icon": "?",
}
st.set_page_config(**_page_settings)
20
+
21
+
22
def _max_width_(max_width_px=1400):
    """Inject a CSS rule that caps the width of the main content container.

    The rule is emitted through ``st.markdown`` with
    ``unsafe_allow_html=True`` and targets the selector
    ``.reportview-container .main .block-container``.

    Args:
        max_width_px: Maximum container width in pixels. Defaults to 1400,
            the value that used to be hard-coded, so existing no-argument
            calls behave as before.
    """
    # NOTE(review): the selector relies on Streamlit's generated class names;
    # confirm it still matches on the Streamlit version being deployed.
    css_rule = f"max-width: {max_width_px}px;"
    st.markdown(
        f"""
    <style>
    .reportview-container .main .block-container{{
        {css_rule}
    }}
    </style>
    """,
        unsafe_allow_html=True,
    )
34
+
35
+
36
# Apply the widened layout CSS before drawing the page content.
_max_width_()

# Three-column header row; only the first column is used (for the title).
c30, c31, c32 = st.columns([2.5, 1, 3])

with c30:
    st.title("🔑 E2E QA MINING")
    st.header("")

with st.expander("ℹ️ - About this app", expanded=True):
    # The app name is wrapped in a matching pair of asterisks so it renders
    # as emphasis (the closing delimiter used to be a stray "$").
    st.write(
        """
- The *E2E QA MINING* app helps you mine question-answer pairs from a given context.
"""
    )

    st.markdown("")

st.markdown("")
# No space before the closing "**": a closing delimiter preceded by
# whitespace does not end bold text in Markdown.
st.markdown("## **📌 Paste document**")
56
import re

# Upper bound on how many words of the pasted text are kept for the model.
MAX_WORDS = 500

# Pick whichever resource-caching decorator this Streamlit version provides,
# so the model is built once instead of on every script rerun.
_cache_model = getattr(st, "cache_resource", None)
if _cache_model is None:
    _cache_model = st.cache(allow_output_mutation=True)


@_cache_model
def _load_qa_pipeline():
    """Build the text2text-generation pipeline for 'mojians/E2E-QA-mining'."""
    return pipeline('text2text-generation', model='mojians/E2E-QA-mining')


def _truncate_to_words(text, max_words):
    """Keep at most ``max_words`` words of ``text``.

    Words are counted as regex word-character runs, the same rule used for
    the word count shown to the user. The text is cut right after the last
    kept word, so punctuation and spacing before that point are preserved.

    Args:
        text: The raw text to limit.
        max_words: Maximum number of words to keep.

    Returns:
        A ``(kept_text, total_words)`` tuple, where ``total_words`` is the
        word count of the original, untruncated text.
    """
    word_ends = [match.end() for match in re.finditer(r"\w+", text)]
    total_words = len(word_ends)
    if total_words <= max_words:
        return text, total_words
    if max_words <= 0:
        return "", total_words
    return text[: word_ends[max_words - 1]], total_words


kw_model = _load_qa_pipeline()

with st.form(key="my_form"):
    # Narrow spacer columns surround the settings (c1) and text (c2) columns.
    _, c1, _, c2, c3 = st.columns([0.07, 1, 0.07, 5, 0.07])

    with c1:
        # NOTE(review): the controls in this column (result count, n-gram
        # range, stop words, MMR, diversity) are collected but are not passed
        # to the generation call visible in this file — confirm whether they
        # should be wired in or removed.
        top_N = st.slider(
            "# of results",
            min_value=1,
            max_value=30,
            value=10,
            help="You can choose the number of keywords/keyphrases to display. Between 1 and 30, default number is 10.",
        )
        min_Ngrams = st.number_input(
            "Minimum Ngram",
            min_value=1,
            max_value=4,
            help="""The minimum value for the ngram range.
*Keyphrase_ngram_range* sets the length of the resulting keywords/keyphrases.
To extract keyphrases, simply set *keyphrase_ngram_range* to (1, 2) or higher depending on the number of words you would like in the resulting keyphrases.""",
        )

        max_Ngrams = st.number_input(
            "Maximum Ngram",
            value=2,
            min_value=1,
            max_value=4,
            help="""The maximum value for the keyphrase_ngram_range.
*Keyphrase_ngram_range* sets the length of the resulting keywords/keyphrases.
To extract keyphrases, simply set *keyphrase_ngram_range* to (1, 2) or higher depending on the number of words you would like in the resulting keyphrases.""",
        )

        StopWordsCheckbox = st.checkbox(
            "Remove stop words",
            help="Tick this box to remove stop words from the document (currently English only)",
        )

        use_MMR = st.checkbox(
            "Use MMR",
            value=True,
            help="You can use Maximal Margin Relevance (MMR) to diversify the results. It creates keywords/keyphrases based on cosine similarity. Try high/low 'Diversity' settings below for interesting variations.",
        )

        Diversity = st.slider(
            "Keyword diversity (MMR only)",
            value=0.5,
            min_value=0.0,
            max_value=1.0,
            step=0.1,
            help="""The higher the setting, the more diverse the keywords.

Note that the *Keyword diversity* slider only works if the *MMR* checkbox is ticked.
""",
        )

    with c2:
        doc = st.text_area(
            f"Paste your text below (max {MAX_WORDS} words)",
            height=510,
        )

        # Limit by words, not characters: slicing the string directly with
        # MAX_WORDS would keep only the first 500 characters.
        doc, res = _truncate_to_words(doc, MAX_WORDS)
        if res > MAX_WORDS:
            st.warning(
                f"⚠️ Your text contains {res} words."
                f" Only the first {MAX_WORDS} words will be reviewed. Stay tuned as increased allowance is coming! 😊"
            )

        submit_button = st.form_submit_button(label="✨ Get me the data!")

    # Plain-Python mirrors of the checkbox states.
    mmr = bool(use_MMR)
    StopWords = "english" if StopWordsCheckbox else None
145
+
146
# Nothing below runs until the form has been submitted.
if not submit_button:
    st.stop()

if min_Ngrams > max_Ngrams:
    st.warning("min_Ngrams can't be greater than max_Ngrams")
    st.stop()

# Do not run the model on an empty or whitespace-only document.
if not doc.strip():
    st.warning("Please paste some text before submitting.")
    st.stop()

# NOTE(review): the document and the instruction are joined without a
# separator, exactly as before — confirm this matches the prompt format the
# model expects.
keywords = kw_model(
    "context:" + doc + "generate questions and answers:",
    do_sample=True,
    min_length=50,
    max_length=300,
)

# No space before the closing "**": a closing delimiter preceded by
# whitespace does not end bold text in Markdown.
st.markdown("## **🎈 Check & download results**")

st.header("")

# Outer columns are spacers; the three middle ones hold the download buttons.
_, c1, c2, c3, _ = st.columns([2, 1.5, 1.5, 1.5, 2])

# NOTE(review): download_button is defined in another module; the raw
# pipeline output is passed for every file type, as before — confirm that the
# helper serialises it appropriately for .csv and .txt.
download_targets = (
    (c1, "Data.csv", "📥 Download (.csv)"),
    (c2, "Data.txt", "📥 Download (.txt)"),
    (c3, "Data.json", "📥 Download (.json)"),
)
for column, filename, label in download_targets:
    with column:
        download_button(keywords, filename, label)

st.header("")

# The text2text-generation pipeline yields one dict per generated sequence,
# with the output stored under "generated_text". Build the table from that
# key; the former "Keyword/Keyphrase"/"Relevancy" columns do not exist in
# this output, so the generated text was never displayed.
generated_texts = [item["generated_text"] for item in keywords]
df = DataFrame({"Generated questions and answers": generated_texts})

# Number the rows from 1 for display.
df.index += 1

c1, c2, c3 = st.columns([1, 3, 1])

with c2:
    st.table(df)