Spaces:

Labbeti
/

conette

Sleeping

App Files Files Community

Labbeti committed on Mar 25

Commit

4ff8b3b

•

1 Parent(s): b7a5794

Mod: Rework UI, remove tmp files and clear cache after 10min.

Browse files

Files changed (1) hide show

app.py +65 -19

app.py CHANGED Viewed

@@ -1,6 +1,8 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 from tempfile import NamedTemporaryFile, _TemporaryFileWrapper
 from typing import Any, Optional, Union
@@ -16,14 +18,18 @@ from conette.utils.collections import dict_list_to_list_dict
 ALLOW_REP_MODES = ("stopwords", "all", "none")
 MAX_BEAM_SIZE = 20
 MAX_PRED_SIZE = 30
-MAX_BATCH_SIZE = 32
 RECORD_AUDIO_FNAME = "microphone_conette_record.wav"
 DEFAULT_THRESHOLD = 0.3
 THRESHOLD_PRECISION = 100
 MIN_AUDIO_DURATION_SEC = 0.3
 MAX_AUDIO_DURATION_SEC = 60
 @st.cache_resource
@@ -46,7 +52,7 @@ def format_tags(tags: Optional[list[str]]) -> str:
 def get_result_hash(audio_fname: str, generate_kwds: dict[str, Any]) -> str:
-    return f"{audio_fname}-{generate_kwds}"
 def get_results(
@@ -64,7 +70,7 @@ def get_results(
     # Save audio to be processed
     tmp_files: dict[str, _TemporaryFileWrapper] = {}
     for result_hash, (audio_fname, audio) in audio_to_predict.items():
-        tmp_file = NamedTemporaryFile(delete=False)
         tmp_file.write(audio)
         tmp_file.close()
@@ -109,6 +115,9 @@ def get_results(
         output_i = st.session_state[result_hash]
         outputs[audio_fname] = output_i
     return outputs
@@ -145,20 +154,39 @@ def show_results(outputs: dict[str, Union[dict[str, Any], str]]) -> None:
         else:
             header = f'##### Result for "{audio_fname}"'
-        content = [
             header,
-            f'- **Description:** "{cand}" ({prob*100:.1f}%)',
-            f"- **Tags:** {tags}",
         ]
         if len(mult_cands) > 0:
             msg = f"- **Other descriptions:**"
-            content.append(msg)
         for cand_i, prob_i in zip(mult_cands, mult_probs):
             msg = f'  - "{cand_i}" ({prob_i*100:.1f}%)'
-            content.append(msg)
-        st.success("\n".join(content))
         st.divider()
@@ -167,19 +195,28 @@ def main() -> None:
     st.header("Describe audio content with CoNeTTE")
     st.markdown(
-        "This interface allows you to generate a short description of the sound events of any recording. You can try it from your microphone or upload a file below."
     )
-    record_data = st_audiorec()
-    audio_files: Optional[list[UploadedFile]] = st.file_uploader(
-        "**Or upload audio files here:**",
-        type=["wav", "flac", "mp3", "ogg", "avi"],
-        accept_multiple_files=True,
-        help="Recommanded audio: lasting from **1 to 30s**, sampled at **32 kHz** minimum.",
     )
-    with st.expander("Model hyperparameters"):
-        task = st.selectbox("Task embedding input", model.tasks, 0)
         allow_rep_mode = st.selectbox("Allow repetition of words", ALLOW_REP_MODES, 0)
         beam_size: int = st.select_slider(  # type: ignore
             "Beam size",
@@ -231,6 +268,15 @@ def main() -> None:
         st.header("Results:")
         show_results(outputs)
 if __name__ == "__main__":
     main()

 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
+import os
+import time
 from tempfile import NamedTemporaryFile, _TemporaryFileWrapper
 from typing import Any, Optional, Union
 ALLOW_REP_MODES = ("stopwords", "all", "none")
+DEFAULT_TASK = "audiocaps"
 MAX_BEAM_SIZE = 20
 MAX_PRED_SIZE = 30
+MAX_BATCH_SIZE = 16
 RECORD_AUDIO_FNAME = "microphone_conette_record.wav"
 DEFAULT_THRESHOLD = 0.3
 THRESHOLD_PRECISION = 100
 MIN_AUDIO_DURATION_SEC = 0.3
 MAX_AUDIO_DURATION_SEC = 60
+HASH_PREFIX = "hash_"
+TMP_FILE_PREFIX = "audio_tmp_file_"
+SECOND_BEFORE_CLEAR_CACHE = 10 * 60
 @st.cache_resource
 def get_result_hash(audio_fname: str, generate_kwds: dict[str, Any]) -> str:
+    return f"{HASH_PREFIX}{audio_fname}-{generate_kwds}"
 def get_results(
     # Save audio to be processed
     tmp_files: dict[str, _TemporaryFileWrapper] = {}
     for result_hash, (audio_fname, audio) in audio_to_predict.items():
+        tmp_file = NamedTemporaryFile(delete=False, prefix=TMP_FILE_PREFIX)
         tmp_file.write(audio)
         tmp_file.close()
         output_i = st.session_state[result_hash]
         outputs[audio_fname] = output_i
+    for tmp_file in tmp_files.values():
+        os.remove(tmp_file.name)
     return outputs
         else:
             header = f'##### Result for "{audio_fname}"'
+        lines = [
             header,
+            f'<center><p class="space"><p class="big-font">"{cand}"</p></p></center>',
+        ]
+        st.markdown("""
+        <style>
+        .big-font {
+            font-size:22px !important;
+            background-color: rgba(0, 255, 0, 0.1);
+            padding: 10px;
+        }
+        </style>
+        """, unsafe_allow_html=True)
+        content = "<br>".join(lines)
+        st.markdown(content, unsafe_allow_html=True)
+        lines = [
+            f"- **Probability**: {prob*100:.1f}%",
         ]
         if len(mult_cands) > 0:
             msg = f"- **Other descriptions:**"
+            lines.append(msg)
         for cand_i, prob_i in zip(mult_cands, mult_probs):
             msg = f'  - "{cand_i}" ({prob_i*100:.1f}%)'
+            lines.append(msg)
+        msg = f"- **Tags:** {tags}"
+        lines.append(msg)
+        content = "\n".join(lines)
+        st.markdown(content, unsafe_allow_html=False)
         st.divider()
     st.header("Describe audio content with CoNeTTE")
     st.markdown(
+        "This interface allows you to generate a short description of the sound events of any recording using an Audio Captioning system. You can try it from your microphone or upload a file below."
     )
+    st.markdown(
+        "Use '**Start Recording**' and '**Stop**' to record an audio from your microphone."
     )
+    record_data = st_audiorec()
+    with st.expander("Or upload audio files here:"):
+        audio_files: Optional[list[UploadedFile]] = st.file_uploader(
+            f"Audio files are automatically resampled to 32 kHz.\nTheir duration must be in range [{MIN_AUDIO_DURATION_SEC}, {MAX_AUDIO_DURATION_SEC}] seconds.",
+            type=["wav", "flac", "mp3", "ogg", "avi"],
+            accept_multiple_files=True,
+            help="Recommanded audio: lasting from **1 to 30s**, sampled at **32 kHz** minimum.",
+        )
+    with st.expander("Model options"):
+        if DEFAULT_TASK in model.tasks:
+            default_task_idx = list(model.tasks).index(DEFAULT_TASK)
+        else:
+            default_task_idx = 0
+        task = st.selectbox("Task embedding input", model.tasks, default_task_idx)
         allow_rep_mode = st.selectbox("Allow repetition of words", ALLOW_REP_MODES, 0)
         beam_size: int = st.select_slider(  # type: ignore
             "Beam size",
         st.header("Results:")
         show_results(outputs)
+        current = time.perf_counter()
+        last_generation = st.session_state.get("last_generation", current)
+        if current > last_generation + SECOND_BEFORE_CLEAR_CACHE:
+            print(f"Removing result cache...")
+            for key in st.session_state.keys():
+                if isinstance(key, str) and key.startswith(HASH_PREFIX):
+                    del st.session_state[key]
+        st.session_state["last_generation"] = current
 if __name__ == "__main__":
     main()