Spaces:
Runtime error
Runtime error
File size: 5,835 Bytes
f771463 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
"""
Dashboard for showcasing extraction of text metrics with textdescriptives.
"""
from io import StringIO
import numpy as np
import streamlit as st
import textdescriptives as td
from data_viewer import DataViewer
from options import (
all_model_size_options_pretty_to_short,
available_model_size_options,
language_options,
metrics_options,
)
################
# Introduction #
################

# Page header: a wide column for the title next to a narrow one for the icon.
col1, col2 = st.columns([9, 2])
with col1:
    st.title("Extract Text Statistics")
with col2:
    st.image(
        "https://github.com/HLasse/TextDescriptives/raw/main/docs/_static/icon.png"
    )

# Short description of the app; the version of the imported package is
# interpolated into the text.
st.write(
    "Calculate a large variety of statistics from text via the "
    "[**TextDescriptives**](https://github.com/HLasse/TextDescriptives) python package "
    f"(v/{td.__version__}) and download the results as a .csv file. "
    "Includes descriptive statistics and metrics related to readability, "
    "information theory, text coherence and text quality."
)

# Citation for the package.
st.caption(
    "Hansen, L., Olsen, L. R., & Enevoldsen, K. (2023). TextDescriptives: A Python package for "
    "calculating a large variety of statistics from text. "
    "[arXiv preprint arXiv:2301.02057](https://arxiv.org/abs/2301.02057)"
)
############
# Settings #
############


def _decode_upload(raw_bytes):
    """Decode the bytes of an uploaded text file into newline-normalised text.

    Args:
        raw_bytes: Content of the uploaded file as bytes.

    Returns:
        The decoded text, with a leading UTF-8 byte order mark removed and
        carriage-return based line endings converted to plain line feeds,
        or None when the bytes are not valid UTF-8.
    """
    try:
        # "utf-8-sig" also decodes plain UTF-8 and drops a leading BOM
        text = raw_bytes.decode("utf-8-sig")
    except UnicodeDecodeError:
        return None
    # Files written on Windows use CRLF; without this, splitting on "\n"
    # later on would leave a stray carriage return on every line
    return text.replace("\r\n", "\n").replace("\r", "\n")


# The input mode is selected outside of the form; the widget shown inside the
# form depends on it.
input_choice = st.radio(
    label="Input", options=["Enter text", "Upload file"], index=0, horizontal=True
)

with st.form(key="settings_form"):
    split_by_line = st.checkbox(label="Split by newline", value=True)

    # Stays None when no usable text has been provided
    string_data = None

    if input_choice == "Upload file":
        uploaded_file = st.file_uploader(
            label="Choose a .txt file", type=["txt"], accept_multiple_files=False
        )

        if uploaded_file is not None:
            string_data = _decode_upload(uploaded_file.getvalue())
            if string_data is None:
                # Report the problem instead of crashing on a decoding error
                st.error(
                    "The uploaded file could not be read as UTF-8 text. "
                    "Please upload a UTF-8 encoded .txt file."
                )

    else:
        default_text = """Little interest or pleasure in doing things?
Feeling down, depressed, or hopeless?
Trouble falling or staying asleep, or sleeping too much?
Feeling tired or having little energy?
Poor appetite or overeating?
Feeling bad about yourself - or that you are a failure or have let yourself or your family down?"""

        string_data = st.text_area(
            label="Enter text", value=default_text, height=170, max_chars=None
        )

    # Row of selectors
    col1, col2 = st.columns([1, 1])

    with col1:
        # Selection of language
        # NOTE(review): index=5 depends on the order of language_options();
        # confirm that it still points at the intended default language
        language_pretty = st.selectbox(
            label="Language",
            options=list(language_options().keys()),
            index=5,
            key="language_selector",
        )
        language_short = language_options()[language_pretty]

    with col2:
        # Selection of model size
        # All sizes are offered here; whether the chosen size exists for the
        # chosen language is only known once both values are available
        model_size_pretty = st.selectbox(
            label="Model Size",
            options=available_model_size_options(lang="all"),
            index=0,
            key="size_selector",
        )
        model_size_short = all_model_size_options_pretty_to_short()[model_size_pretty]

    # Multiselection of metrics (all options are preselected)
    metrics = st.multiselect(
        label="Metrics", options=metrics_options(), default=metrics_options()
    )

    st.write(
        "See the [**documentation**](https://hlasse.github.io/TextDescriptives/) for "
        "information on the available metrics."
    )

    # An empty selection (every metric deselected) is replaced by None
    # NOTE(review): None presumably makes textdescriptives fall back to its
    # default set of metrics - confirm against the library documentation
    if isinstance(metrics, list) and not metrics:
        metrics = None

    apply_settings_button = st.form_submit_button(label="Apply")
#############
# Apply NLP #
#############


def _split_texts(raw_text, split_lines):
    """Turn the raw input into the list of texts to analyse.

    Args:
        raw_text: The text that was entered or uploaded.
        split_lines: Whether every line is treated as a text of its own.

    Returns:
        The non-empty texts. The list is empty when the input consists of
        whitespace only.
    """
    # Remove whitespace from both ends before (optionally) splitting
    cleaned = raw_text.strip()
    candidates = cleaned.split("\n") if split_lines else [cleaned]
    # Remove empty strings, e.g. due to consecutive newlines
    return [candidate for candidate in candidates if candidate]


# Only run once "Apply" has been pressed and some text is available
# (a missing upload leaves string_data as None, which is falsy as well)
if apply_settings_button and string_data:
    if model_size_pretty not in available_model_size_options(lang=language_short):
        st.write(
            "**Sorry!** The chosen *model size* is not available in this language. Please try another."
        )

    else:
        # Clean and (optionally) split the text
        string_data = _split_texts(string_data, split_by_line)

        if not string_data:
            # Whitespace-only input passes the check above but leaves nothing
            # to analyse; stop here instead of extracting from an empty batch
            st.warning("The input only contains whitespace. Please provide some text.")

        else:
            # Will automatically download the relevant model and extract all metrics
            # TODO: Download beforehand to speed up inference
            df = td.extract_metrics(
                text=string_data,
                lang=language_short,
                spacy_model_size=model_size_short,
                metrics=metrics,
            )

            ###################
            # Present Results #
            ###################

            # Create 2 columns with 1) the output header
            # and 2) a download button
            # The download receives the data frame before it is transposed
            DataViewer()._header_and_download(
                header="The calculated metrics", data=df, file_name="text_metrics.csv"
            )

            st.write("**Note**: This data frame has been transposed for readability.")

            # After transposing, each row is one metric; reset_index moves the
            # metric names into the first column, which is then called "Metric".
            # The remaining column labels are converted to strings.
            df = df.transpose().reset_index()
            df.columns = ["Metric"] + [str(c) for c in list(df.columns)[1:]]
            st.dataframe(data=df, use_container_width=True)
############################
# Code For Reproducibility #
############################

# Static example showing how the same steps can be run with the python package.
# The snippet starts at column 0 inside the literal so that it is displayed
# without extra leading whitespace; the bodies of the if/else branches and the
# call arguments are indented so that the displayed code is valid Python.
with st.expander("See python code"):
    st.code(
        """
import textdescriptives as td
# Given a string of text and the settings
text = "..."
model_name = "..."
split_by_newline = True
# Remove whitespace from both ends of the string
text = text.strip()
# When asked, split by newlines
if split_by_newline:
    lines = text.split("\\n")
else:
    lines = [text]
# Remove empty lines
# E.g. due to consecutive newlines
lines = [l for l in lines if l]
# Extract metrics for each line
extracted_metrics = td.extract_metrics(
    text=lines,
    spacy_model=model_name
)
""",
        language="python",
        line_numbers=True,
    )
|