Fix memory exhaustion on HF Spaces free tier by reducing n_ctx and implementing chunked summarization for long transcripts
src/summarization.py CHANGED (+39 -5)
@@ -14,12 +14,27 @@ def get_model(gguf_repo_id, gguf_filename):
         repo_id=gguf_repo_id,
         filename=gguf_filename,
         verbose=False,
-        n_ctx=32768,
+        n_ctx=4096, # Reduced from 32768 to prevent memory exhaustion on HF Spaces free tier
         n_threads=num_vcpus,
         repeat_penalty=1.2,
     )
 
-def summarize_transcript(transcript, selected_gguf_model, prompt_input):
+def _summarize_text(text, prompt, selected_gguf_model):
+    """Non-streaming summary for internal use."""
+    repo_id, filename = available_gguf_llms[selected_gguf_model]
+    llm = get_model(repo_id, filename)
+    response = llm.create_chat_completion(
+        messages=[
+            {"role": "system", "content": "You are an expert in transcript summarization."},
+            {"role": "user", "content": f'{prompt} \n{text}'}
+        ],
+        stream=False,
+    )
+    summary = response['choices'][0]['message']['content']
+    return s2tw_converter.convert(summary)
+
+def _stream_summary(text, prompt, selected_gguf_model):
+    """Streaming summary generator."""
     repo_id, filename = available_gguf_llms[selected_gguf_model]
     t0 = time.time()
     llm = get_model(repo_id, filename)
@@ -31,7 +46,7 @@ def summarize_transcript(transcript, selected_gguf_model, prompt_input):
     stream = llm.create_chat_completion(
         messages=[
             {"role": "system", "content": "You are an expert in transcript summarization."},
-            {"role": "user", "content": f'{prompt_input} \n{transcript}'}
+            {"role": "user", "content": f'{prompt} \n{text}'}
         ],
         stream=True,
     )
@@ -44,5 +59,24 @@ def summarize_transcript(transcript, selected_gguf_model, prompt_input):
             is_1st_token = False
         token = delta['content']
         full_summary.append(str(token))
-        yield s2tw_converter.convert("".join(full_summary))
-    yield s2tw_converter.convert("".join(full_summary))
+        yield s2tw_converter.convert("".join(full_summary))
+    yield s2tw_converter.convert("".join(full_summary))
+
+def summarize_transcript(transcript, selected_gguf_model, prompt_input):
+    # Handle long transcripts with chunked summarization
+    max_chars = 12000 # Conservative limit per chunk
+    if len(transcript) <= max_chars:
+        # Direct summarization
+        yield from _stream_summary(transcript, prompt_input, selected_gguf_model)
+    else:
+        # Chunked summarization
+        chunk_size = 8000 # Smaller chunks to fit within n_ctx
+        chunks = [transcript[i:i+chunk_size] for i in range(0, len(transcript), chunk_size)]
+        partial_summaries = []
+        for chunk in chunks:
+            partial = _summarize_text(chunk, "Summarize this excerpt from the transcript.", selected_gguf_model)
+            partial_summaries.append(partial)
+        combined = "\n\n".join(partial_summaries)
+        print(f"Combined partial summaries length: {len(combined)} chars")
+        # Stream the final summary of combined partials
+        yield from _stream_summary(combined, prompt_input, selected_gguf_model)
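
For reference, a minimal standalone sketch of the chunk-then-combine flow this commit introduces. The model calls are stubbed out: _stub_summarize and _stub_stream are hypothetical placeholders standing in for _summarize_text and _stream_summary, which in src/summarization.py call the GGUF model through llama-cpp-python and convert output with s2tw_converter. Only the character-based slicing and the streaming shape are meant to match the diff above.

    # Standalone sketch of the chunked summarization flow (assumed shapes, stubbed model calls).
    MAX_CHARS = 12000   # direct summarization below this length (mirrors max_chars in the diff)
    CHUNK_SIZE = 8000   # per-chunk character budget for long transcripts (mirrors chunk_size)

    def _stub_summarize(text, prompt):
        """Hypothetical placeholder for the non-streaming model call (_summarize_text)."""
        return f"[summary of {len(text)} chars for prompt: {prompt!r}]"

    def _stub_stream(text, prompt):
        """Hypothetical placeholder for the streaming model call (_stream_summary)."""
        words = _stub_summarize(text, prompt).split()
        for i in range(1, len(words) + 1):
            yield " ".join(words[:i])  # growing partial output, like the real generator

    def summarize(transcript, prompt):
        if len(transcript) <= MAX_CHARS:
            yield from _stub_stream(transcript, prompt)  # direct path
        else:
            # Map step: summarize each fixed-size slice independently.
            chunks = [transcript[i:i + CHUNK_SIZE] for i in range(0, len(transcript), CHUNK_SIZE)]
            partials = [_stub_summarize(c, "Summarize this excerpt from the transcript.") for c in chunks]
            # Reduce step: stream a final summary over the combined partials.
            yield from _stub_stream("\n\n".join(partials), prompt)

    if __name__ == "__main__":
        long_transcript = "word " * 5000  # ~25,000 chars, forces the chunked path
        latest = ""
        for partial in summarize(long_transcript, "Summarize the meeting."):
            latest = partial  # a UI handler (e.g. Gradio) would re-render this on each yield
        print(latest)

Each yield emits the full partial summary accumulated so far, not a single token, so a caller only needs to keep and display the latest value.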