Update src/streamlit_app.py
src/streamlit_app.py CHANGED (+30, -38)
@@ -1,6 +1,3 @@
-import os
-os.environ['HF_HOME'] = '/tmp'
-
 import os
 import time
 import streamlit as st
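This hunk drops a duplicated "import os" and an early HF_HOME assignment; the assignment survives at line 19 of the new file. That still works because HF_HOME only has to be set before the first Hugging Face download is triggered, when huggingface_hub resolves its cache directory, not before the imports at the top of the file. A minimal sketch of the constraint, with an assumed model id that is not taken from this diff:

import os
os.environ['HF_HOME'] = '/tmp'  # Spaces containers are typically only writable under /tmp

from gliner import GLiNER  # the first HF-backed download happens after HF_HOME is set

model = GLiNER.from_pretrained("urchade/gliner_multi-v2.1")  # assumed model id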
@@ -18,6 +15,9 @@ import hashlib
 # Set up environment variables
 os.environ['HF_HOME'] = '/tmp'
 
+# --- Page Configuration and UI Elements ---
+st.set_page_config(layout="wide", page_title="Named Entity Recognition App")
+
 st.markdown(
     """
     <style>
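Hoisting st.set_page_config above the CSS block matters: Streamlit requires it to be the first Streamlit command executed in the script, and calling it after another element such as st.markdown raises a StreamlitAPIException. A minimal sketch of the required ordering:

import streamlit as st

# set_page_config must be the first Streamlit command in the script
st.set_page_config(layout="wide", page_title="Named Entity Recognition App")

# any other element may follow, including injected CSS
st.markdown("<style>/* custom CSS */</style>", unsafe_allow_html=True)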
@@ -69,10 +69,9 @@ st.markdown(
     unsafe_allow_html=True
 )
 
-# --- Page Configuration and UI Elements ---
-st.set_page_config(layout="wide", page_title="Named Entity Recognition App")
 st.subheader("HR.ai", divider="green")
 st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
+
 expander = st.expander("**Important notes**")
 expander.write("""**Named Entities:** This HR.ai predicts thirty-five (35) labels: "Email", "Phone_number", "Street_address", "City", "Country", "Date_of_birth", "Marital_status", "Person", "Full_time", "Part_time", "Contract", "Terminated", "Retired", "Date", "Organization", "Role", "Performance_score", "Leave_of_absence", "Retirement_plan", "Bonus", "Stock_options", "Health_insurance", "Pay_rate", "Annual_salary", "Tax", "Deductions", "Interview_type", "Applicant", "Referral", "Job_board", "Recruiter", "Offer_letter", "Agreement", "Certification", "Skill"
 Results are presented in easy-to-read tables, visualized in an interactive tree map, pie chart and bar chart, and are available for download along with a Glossary of tags.
@@ -98,7 +97,6 @@ COMET_API_KEY = os.environ.get("COMET_API_KEY")
 COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
 COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
 comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
-
 if not comet_initialized:
     st.warning("Comet ML not initialized. Check environment variables.")
 
@@ -122,7 +120,7 @@ category_mapping = {
 }
 
 # --- Model Loading ---
-@st.
+@st.cache_resource
 def load_ner_model():
     """Loads the GLiNER model and caches it."""
     try:
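This hunk completes the truncated decorator. With st.cache_resource the model loads once per server process and is shared across reruns and sessions instead of being re-downloaded on every widget interaction. A sketch of the cached-loader pattern, with an assumed GLiNER model id that does not appear in the diff:

import streamlit as st
from gliner import GLiNER

@st.cache_resource  # runs once per process; later reruns reuse the same object
def load_ner_model():
    """Loads the GLiNER model and caches it."""
    return GLiNER.from_pretrained("urchade/gliner_multi-v2.1")  # assumed model id

model = load_ner_model()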
@@ -170,34 +168,34 @@ if st.button("Results"):
             )
             experiment.log_parameter("input_text", text)
             experiment.log_table("predicted_entities", df_ner)
-
-        st.subheader("Grouped Entities by Category", divider="green")
-        category_names = sorted(list(category_mapping.keys()))
-        category_tabs = st.tabs(category_names)
-        for i, category_name in enumerate(category_names):
-            with category_tabs[i]:
-                df_category_filtered = df_ner[df_ner['category'] == category_name]
-                if not df_category_filtered.empty:
-                    st.dataframe(df_category_filtered.drop(columns=['category']), use_container_width=True)
-                else:
-                    st.info(f"No entities found for the '{category_name}' category.")
-
-        with st.expander("See Glossary of tags"):
-            st.write('''
-            - **text**: ['entity extracted from your text data']
-            - **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
-            - **label**: ['label (tag) assigned to a given extracted entity']
-            - **category**: ['the high-level category for the label']
-            - **start**: ['index of the start of the corresponding entity']
-            - **end**: ['index of the end of the corresponding entity']
-            ''')
     else:
         st.warning("No entities were found in the provided text.")
         if 'df_ner' in st.session_state:
             del st.session_state.df_ner
 
-# ---
+# --- Display Sections based on Session State ---
 if 'df_ner' in st.session_state and not st.session_state.df_ner.empty:
+    st.subheader("Grouped Entities by Category", divider="green")
+    category_names = sorted(list(category_mapping.keys()))
+    category_tabs = st.tabs(category_names)
+    for i, category_name in enumerate(category_names):
+        with category_tabs[i]:
+            df_category_filtered = st.session_state.df_ner[st.session_state.df_ner['category'] == category_name]
+            if not df_category_filtered.empty:
+                st.dataframe(df_category_filtered.drop(columns=['category']), use_container_width=True)
+            else:
+                st.info(f"No entities found for the '{category_name}' category.")
+
+    with st.expander("See Glossary of tags"):
+        st.write('''
+        - **text**: ['entity extracted from your text data']
+        - **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
+        - **label**: ['label (tag) assigned to a given extracted entity']
+        - **category**: ['the high-level category for the label']
+        - **start**: ['index of the start of the corresponding entity']
+        - **end**: ['index of the end of the corresponding entity']
+        ''')
+
     st.divider()
     st.subheader("Candidate Card", divider="green")
     fig_treemap = px.treemap(st.session_state.df_ner, path=[px.Constant("all"), 'category', 'label', 'text'], values='score', color='category')
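This hunk moves the grouped-entities display out of the Results button handler into a section gated on st.session_state, and re-reads df_ner from session state rather than a local variable. That is the standard fix for Streamlit's execution model: the script reruns top to bottom on every widget interaction, so anything rendered only inside the button branch disappears on the next rerun. A condensed sketch of the pattern, with a stand-in row instead of real GLiNER output:

import pandas as pd
import streamlit as st

if st.button("Results"):
    # hypothetical stand-in for the GLiNER prediction step
    st.session_state.df_ner = pd.DataFrame(
        [{"text": "Jane Doe", "label": "Person", "category": "Personal", "score": 0.97}]
    )

# rendered on every rerun, as long as results exist in session state
if 'df_ner' in st.session_state and not st.session_state.df_ner.empty:
    st.dataframe(st.session_state.df_ner, use_container_width=True)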
@@ -215,6 +213,7 @@ def load_gliner_model():
     st.stop()
 
 qa_model = load_gliner_model()
+
 st.subheader("Question-Answering", divider="green")
 
 if 'user_labels' not in st.session_state:
@@ -235,6 +234,7 @@ if st.button("Add Question"):
 
 st.markdown("---")
 st.subheader("Record of Questions", divider="green")
+
 if st.session_state.user_labels:
     for i, label in enumerate(st.session_state.user_labels):
         col_list, col_delete = st.columns([0.9, 0.1])
@@ -271,19 +271,16 @@ if st.button("Extract Answers"):
         end_time = time.time()
         elapsed_time = end_time - start_time
         st.info(f"Processing took **{elapsed_time:.2f} seconds**.")
-
         if entities:
             df_qa = pd.DataFrame(entities)
             df_qa = df_qa[['label', 'text', 'score']].rename(columns={'label': 'question', 'text': 'answer'})
             st.session_state.df_qa = df_qa  # Store QA results in session state
-
             st.subheader("Extracted Answers", divider="green")
             st.dataframe(df_qa, use_container_width=True)
         else:
             st.warning("No answers were found for the provided questions.")
             if 'df_qa' in st.session_state:
                 del st.session_state.df_qa
-
     except Exception as e:
         st.error(f"An error occurred during answer extraction: {e}")
         if 'df_qa' in st.session_state:
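The rename of 'label' to 'question' and 'text' to 'answer' reflects how the QA step apparently works: the user's questions are passed to GLiNER as entity labels, so each predicted span comes back tagged with the question it answers. A hedged, self-contained sketch of that trick; the model id and inputs are assumptions, while predict_entities is the standard GLiNER inference call:

from gliner import GLiNER
import pandas as pd

qa_model = GLiNER.from_pretrained("urchade/gliner_multi-v2.1")  # assumed model id
text = "Jane Doe can be reached at jane@example.com."
questions = ["What is the applicant's email?"]

# each question acts as an entity label, so 'label' holds the question
entities = qa_model.predict_entities(text, questions, threshold=0.3)

df_qa = (
    pd.DataFrame(entities)[['label', 'text', 'score']]
    .rename(columns={'label': 'question', 'text': 'answer'})
)
print(df_qa)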
@@ -292,7 +289,6 @@ if st.button("Extract Answers"):
 # --- Download Button Section ---
 def create_zip_file_and_get_bytes():
     """Generates a zip file in memory with all available dataframes."""
-
     # Define the glossary DataFrame here to ensure it's always available
     dfa = pd.DataFrame(
         data={
@@ -307,10 +303,8 @@ def create_zip_file_and_get_bytes():
         ]
         }
     )
-
     if 'df_ner' not in st.session_state and 'df_qa' not in st.session_state:
         return None, None
-
     buf = io.BytesIO()
     with zipfile.ZipFile(buf, "w") as myzip:
         if 'df_ner' in st.session_state and not st.session_state.df_ner.empty:
@@ -318,7 +312,6 @@ def create_zip_file_and_get_bytes():
         if 'df_qa' in st.session_state and not st.session_state.df_qa.empty:
             myzip.writestr("Extracted_Answers.csv", st.session_state.df_qa.to_csv(index=False))
         myzip.writestr("Glossary_of_tags.csv", dfa.to_csv(index=False))
-
     return buf.getvalue(), "nlpblogs_results.zip"
 
 st.divider()
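create_zip_file_and_get_bytes builds the archive entirely in memory, which suits a Space whose filesystem is read-only outside /tmp. A trimmed sketch of the io.BytesIO plus zipfile.writestr pattern feeding st.download_button, reduced to a single dataframe:

import io
import zipfile

import pandas as pd
import streamlit as st

df = pd.DataFrame({"label": ["Email"], "text": ["jane@example.com"]})

buf = io.BytesIO()
with zipfile.ZipFile(buf, "w") as myzip:
    # writestr stores the CSV text directly, so nothing touches disk
    myzip.writestr("Predicted_Entities.csv", df.to_csv(index=False))

st.download_button(
    label="Download zip",
    data=buf.getvalue(),
    file_name="nlpblogs_results.zip",
    mime="application/zip",
)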
@@ -336,5 +329,4 @@ if ('df_ner' in st.session_state and not st.session_state.df_ner.empty) or \
         data=zip_data,
         file_name=file_name,
         mime="application/zip",
-
     )