File size: 5,835 Bytes
f771463
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
"""
Dashboard for showcasing extraction of text metrics with textdescriptives.

"""

from io import StringIO

import numpy as np
import streamlit as st
import textdescriptives as td

from data_viewer import DataViewer
from options import (
    all_model_size_options_pretty_to_short,
    available_model_size_options,
    language_options,
    metrics_options,
)

################
# Introduction #
################


# Page header: the title takes most of the width, the package icon sits in a
# narrow column to its right (9:2 width ratio).
title_col, icon_col = st.columns([9, 2])
with title_col:
    st.title("Extract Text Statistics")
with icon_col:
    st.image(
        "https://github.com/HLasse/TextDescriptives/raw/main/docs/_static/icon.png"
    )

# Short description of the app, including the installed package version.
st.write(
    "Calculate a large variety of statistics from text via the "
    "[**TextDescriptives**](https://github.com/HLasse/TextDescriptives) python package "
    f"(v/{td.__version__}) and download the results as a .csv file. "
    "Includes descriptive statistics and metrics related to readability, "
    "information theory, text coherence and text quality."
)

# Citation for the package.
st.caption(
    "Hansen, L., Olsen, L. R., & Enevoldsen, K. (2023). TextDescriptives: A Python package for "
    "calculating a large variety of statistics from text. "
    "[arXiv preprint arXiv:2301.02057](https://arxiv.org/abs/2301.02057)"
)


############
# Settings #
############


# The input mode selector is placed outside the form: widgets inside a
# Streamlit form only report their value on submit, whereas this radio button
# reruns the script right away so the matching input widget is shown below.
input_choice = st.radio(
    label="Input", options=["Enter text", "Upload file"], index=0, horizontal=True
)

with st.form(key="settings_form"):
    split_by_line = st.checkbox(label="Split by newline", value=True)

    # Stays None when "Upload file" is selected but no usable file was given.
    string_data = None

    if input_choice == "Upload file":
        uploaded_file = st.file_uploader(
            label="Choose a .txt file", type=["txt"], accept_multiple_files=False
        )

        if uploaded_file is not None:
            try:
                # "utf-8-sig" decodes regular UTF-8 and additionally drops a
                # leading byte-order mark, which would otherwise end up as an
                # invisible character at the start of the first text.
                string_data = uploaded_file.getvalue().decode("utf-8-sig")
            except UnicodeDecodeError:
                st.error(
                    "The uploaded file could not be decoded as UTF-8. "
                    "Please upload a UTF-8 encoded .txt file."
                )
            else:
                # Normalise Windows (\r\n) and lone \r line endings so that
                # the later split on "\n" does not leave stray "\r" characters.
                string_data = string_data.replace("\r\n", "\n").replace("\r", "\n")

    else:
        default_text = """Little interest or pleasure in doing things?
Feeling down, depressed, or hopeless?
Trouble falling or staying asleep, or sleeping too much?
Feeling tired or having little energy?
Poor appetite or overeating?
Feeling bad about yourself - or that you are a failure or have let yourself or your family down?"""

        string_data = st.text_area(
            label="Enter text", value=default_text, height=170, max_chars=None
        )

    # Row of selectors
    col1, col2 = st.columns([1, 1])

    with col1:
        # Selection of language
        # Mapping from the displayed language name to its short code.
        languages = language_options()

        # NOTE(review): index=5 depends on the ordering of language_options();
        # presumably it preselects English -- confirm when the options change.
        language_pretty = st.selectbox(
            label="Language",
            options=list(languages.keys()),
            index=5,
            key="language_selector",
        )

        language_short = languages[language_pretty]

    with col2:
        # Selection of model size
        # All sizes are offered regardless of language; whether the chosen
        # size exists for the chosen language is checked after submission.
        model_size_pretty = st.selectbox(
            label="Model Size",
            options=available_model_size_options(lang="all"),
            index=0,
            key="size_selector",
        )

        model_size_short = all_model_size_options_pretty_to_short()[model_size_pretty]

    # Multiselection of metrics (everything is selected by default)
    all_metrics = metrics_options()
    metrics = st.multiselect(label="Metrics", options=all_metrics, default=all_metrics)

    st.write(
        "See the [**documentation**](https://hlasse.github.io/TextDescriptives/) for "
        "information on the available metrics."
    )
    # The user can deselect every metric; an empty selection is passed on as
    # None (presumably treated as "all metrics" by td.extract_metrics --
    # confirm against the TextDescriptives documentation).
    if isinstance(metrics, list) and not metrics:
        metrics = None

    apply_settings_button = st.form_submit_button(label="Apply")


#############
# Apply NLP #
#############


# Only run after the form was submitted with some input. `string_data` is
# None (no file uploaded) or a string, so a plain truthiness check covers both
# the missing and the empty case.
if apply_settings_button and string_data:
    # Clean and (optionally) split the text
    cleaned_text = string_data.strip()
    texts = cleaned_text.split("\n") if split_by_line else [cleaned_text]

    # Remove empty strings
    # E.g. due to consecutive newlines
    texts = [text for text in texts if text]

    if model_size_pretty not in available_model_size_options(lang=language_short):
        st.write(
            "**Sorry!** The chosen *model size* is not available in this language. Please try another."
        )
    elif not texts:
        # Whitespace-only input is truthy before stripping but leaves nothing
        # to analyse; report it instead of handing an empty list to the
        # extraction call.
        st.warning("The input only contains whitespace. Please enter some text.")
    else:
        # Will automatically download the relevant model and extract all metrics
        # TODO: Download beforehand to speed up inference
        df = td.extract_metrics(
            text=texts,
            lang=language_short,
            spacy_model_size=model_size_short,
            metrics=metrics,
        )

        ###################
        # Present Results #
        ###################

        # Create 2 columns with 1) the output header
        # and 2) a download button
        # The untransposed frame is the one offered for download.
        DataViewer()._header_and_download(
            header="The calculated metrics", data=df, file_name="text_metrics.csv"
        )

        st.write("**Note**: This data frame has been transposed for readability.")
        # After transposing, each row is a metric and each remaining column is
        # one input text; reset_index() moves the metric names into the first
        # column, which is labelled "Metric". The other labels are cast to str.
        df = df.transpose().reset_index()
        df.columns = ["Metric"] + [str(label) for label in df.columns[1:]]
        st.dataframe(data=df, use_container_width=True)


############################
# Code For Reproducibility #
############################


# Static example shown to the user. It is a fixed string with "..."
# placeholders and is not generated from the settings chosen in the form.
# The doubled backslash keeps a literal "\n" escape in the displayed code.
reproducibility_snippet = """
import textdescriptives as td

# Given a string of text and the settings
text = "..."
model_name = "..."
split_by_newline = True

# Remove whitespace from both ends of the string
text = text.strip()

# When asked, split by newlines
if split_by_newline:
    lines = text.split("\\n")
else:
    lines = [text]

# Remove empty lines
# E.g. due to consecutive newlines
lines = [l for l in lines if l]

# Extract metrics for each line
extracted_metrics = td.extract_metrics(
    text=lines,
    spacy_model=model_name
)

"""

# Collapsible section rendering the snippet as Python with line numbers.
with st.expander("See python code"):
    st.code(reproducibility_snippet, language="python", line_numbers=True)