maxspad committed
Commit 7e4e512
1 Parent(s): a002819

blurb added, q3 now depends on q2, if q2 no suggestion, q3 auto no link

Files changed (2):
  1. app.py +11 -5
  2. overview.py +44 -3
app.py CHANGED

```diff
@@ -3,8 +3,6 @@ import transformers as tf
 import pandas as pd
 
 from overview import NQDOverview
-from fullreport import NQDFullReport
-
 
 # Function to load and cache models
 @st.experimental_singleton(show_spinner=False)
@@ -33,9 +31,13 @@ def run_models(model_names, models, c):
 
 st.title('Assess the *QuAL*ity of your feedback')
 st.caption(
-    """Medical education *requires* high-quality feedback, but evaluating feedback
-    is difficult and time-consuming. This tool uses NLP/ML to predict a validated
-    feedback quality metric known as the QuAL Score. *Try it for yourself!*
+    """Medical education requires high-quality *written* feedback,
+    but evaluating these *supervisor narrative comments* is time-consuming.
+    The QuAL score has validity evidence for measuring the quality of short
+    comments in this context. We developed an NLP/ML-powered tool to
+    assess written comment quality via the QuAL score with high accuracy.
+
+    *Try it for yourself!*
     """)
 
 ### Load models
@@ -83,6 +85,10 @@ with st.form('comment_form'):
         st.experimental_rerun()
 
 results = run_models(models_to_load, models, st.session_state['comment'])
+# Modify results to sum the QuAL score and to ignore Q3 if Q2 found no suggestion
+if results['q2i']['label'] == 1:
+    results['q3i']['label'] = 1  # can't have a connection if there is no suggestion
+results['qual']['label'] = results['q1']['label'] + (not results['q2i']['label']) + (not results['q3i']['label'])
 
 overview = NQDOverview(st, results)
 overview.draw()
```
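To make the new gating explicit, here is a minimal standalone sketch of the logic this commit adds to app.py, assuming `results` maps each model name to a dict with a `label` key as in the diff above. The sample label values are hypothetical, and note that `q2i`/`q3i` carry inverted labels where 1 means "no":

```python
# Hypothetical model output shaped like run_models()'s results dict
results = {
    'q1':  {'label': 2},  # Evidence subscore, 0-3 (2 = "somewhat")
    'q2i': {'label': 1},  # inverted: 1 = no suggestion given
    'q3i': {'label': 0},  # inverted: 0 = linked (forced to "no" below)
    'qual': {},
}

# A connection (Q3) is impossible without a suggestion (Q2),
# so a "no suggestion" prediction forces "no link".
if results['q2i']['label'] == 1:
    results['q3i']['label'] = 1

# QuAL (0-5) = evidence (0-3) + suggestion present (0/1) + connection present (0/1).
# `not` flips each inverted 0/1 label, and Python sums bools as ints.
results['qual']['label'] = (results['q1']['label']
                            + (not results['q2i']['label'])
                            + (not results['q3i']['label']))

print(results['qual']['label'])  # -> 2: some evidence, no suggestion, no link
```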
overview.py CHANGED

```diff
@@ -1,6 +1,44 @@
 from matplotlib.cm import get_cmap
 import plotly.graph_objects as go
 
+about_blurb = '''
+### About the QuAL Score
+
+The Quality of Assessment for Learning score (QuAL score)
+was created to evaluate short qualitative comments that are related to specific
+scores entered into a workplace-based assessment,
+common within the competency-based medical education (CBME) context.
+
+It is rated on a scale of 0-5, with 0 signifying very low quality and 5 very high quality.
+It consists of three subscores which are summed to calculate the overall QuAL score:
+
+1. Evidence - Does the rater provide sufficient evidence about resident performance? (0-no comment at all, 1-no, but comment present, 2-somewhat, 3-yes/full description)
+2. Suggestion - Does the rater provide a suggestion for improvement? (0-no/1-yes)
+3. Connection - Is the rater's suggestion linked to the behavior described? (0-no/1-yes)
+
+The QuAL score has validity evidence for accurately measuring the quality of evaluation comments in CBME.
+
+For more information, see the paper [here](https://doi.org/10.1080/10401334.2019.1708365).
+
+### About this Tool
+
+The QuAL score accurately rates the quality of narrative comments in CBME, but
+it still requires time-consuming manual rating. With large volumes of text generated in a
+typical CBME program, large-scale assessment of comment quality is impractical.
+This tool uses machine learning (ML) and natural language processing (NLP) to automate
+the rating of the QuAL score on narrative comments.
+
+We trained a machine learning model to predict each of the three subscores described above.
+The resulting models are accurate:
+1. Evidence - Balanced accuracy of 61.5% for a 0-3 result, within-one accuracy of 96.4%
+2. Suggestion - Accuracy of 85%, sensitivity for lack of suggestion 86.2%
+3. Connection - Accuracy of 82%, sensitivity for lack of connection 90%
+
+The models are highly accurate, but not perfect! You may experience times where
+the results are not consistent with your interpretation of the text. If you do, please
+leave us [feedback](https://forms.gle/PfXxcGmvLYvd9jWz5). This tool is intended as a demonstration only
+and should not be used for high-stakes assessment (yet!).
+'''
 class NQDOverview(object):
     def __init__(self, parent, results,
                  dial_cmap='RdYlGn'):
@@ -32,6 +70,9 @@ class NQDOverview(object):
     def draw(self):
         st = self.p
 
+        with st.expander('About the QuAL Score and this Tool', expanded=False):
+            st.markdown(about_blurb)
+
         fig = self._build_figure()
 
         cols = st.columns([7, 3])
@@ -48,7 +89,7 @@ class NQDOverview(object):
         elif q1lab == 3:
             md_str = '😁 High'
         cols[1].metric('Level of Detail', md_str,
-            help='How specific was the evaluator in describing the behavior?')
+            help='Q1 - Evidence - Does the rater provide sufficient evidence about resident performance? (0-no comment at all, 1-no, but comment present, 2-somewhat, 3-yes/full description)')
 
         q2lab = self.results['q2i']['label']
         if q2lab == 0:
@@ -56,7 +97,7 @@ class NQDOverview(object):
         else:
             md_str = '❌ No'
         cols[1].metric('Suggestion Given', (md_str),
-            help='Did the evaluator give a suggestion for improvement?')
+            help='Q2 - Suggestion - Does the rater provide a suggestion for improvement? (0-no/1-yes)')
 
         q3lab = self.results['q3i']['label']
         if q3lab == 0:
@@ -64,4 +105,4 @@ class NQDOverview(object):
         else:
             md_str = '❌ No'
         cols[1].metric('Suggestion Linked', md_str,
-            help='Is the suggestion for improvement linked to the described behavior?')
+            help='Q3 - Connection - Is the rater’s suggestion linked to the behavior described? (0-no/1-yes)')
```
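For readers unfamiliar with the Streamlit calls the overview.py changes lean on, here is a minimal sketch of the expander-plus-metric display pattern, using the `st.expander`, `st.columns`, and `metric` APIs. The blurb text and metric values below are placeholders, not the app's real output:

```python
import streamlit as st

# Placeholder stand-in for the about_blurb string added in overview.py
about_blurb = '### About the QuAL Score\n\nShort demo blurb goes here.'

# Collapsible "about" section, collapsed by default (expanded=False)
with st.expander('About the QuAL Score and this Tool', expanded=False):
    st.markdown(about_blurb)

# Two-column layout: the dial figure goes in the wide left column,
# the three subscore metrics stack in the narrow right column
cols = st.columns([7, 3])
cols[1].metric('Level of Detail', '😁 High',
               help='Q1 - Evidence - rubric text appears as a hover tooltip')
cols[1].metric('Suggestion Given', '✅ Yes',
               help='Q2 - Suggestion - rubric text appears as a hover tooltip')
cols[1].metric('Suggestion Linked', '✅ Yes',
               help='Q3 - Connection - rubric text appears as a hover tooltip')
```

Passing the rubric through `help=` keeps the metric labels short while still making the full QuAL definitions available on hover, which is why the commit moves the rubric text there rather than into the labels themselves.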