Fix memory exhaustion on HF Spaces free tier by reducing n_ctx and implementing chunked summarization for long transcripts
src/summarization.py CHANGED (+39 -5)
@@ -14,12 +14,27 @@ def get_model(gguf_repo_id, gguf_filename):
         repo_id=gguf_repo_id,
         filename=gguf_filename,
         verbose=False,
-        n_ctx=32768,
+        n_ctx=4096, # Reduced from 32768 to prevent memory exhaustion on HF Spaces free tier
         n_threads=num_vcpus,
         repeat_penalty=1.2,
     )
 
-def summarize_transcript(transcript, selected_gguf_model, prompt_input):
+def _summarize_text(text, prompt, selected_gguf_model):
+    """Non-streaming summary for internal use."""
+    repo_id, filename = available_gguf_llms[selected_gguf_model]
+    llm = get_model(repo_id, filename)
+    response = llm.create_chat_completion(
+        messages=[
+            {"role": "system", "content": "You are an expert in transcript summarization."},
+            {"role": "user", "content": f'{prompt} \n{text}'}
+        ],
+        stream=False,
+    )
+    summary = response['choices'][0]['message']['content']
+    return s2tw_converter.convert(summary)
+
+def _stream_summary(text, prompt, selected_gguf_model):
+    """Streaming summary generator."""
     repo_id, filename = available_gguf_llms[selected_gguf_model]
     t0 = time.time()
     llm = get_model(repo_id, filename)
@@ -31,7 +46,7 @@ def summarize_transcript(transcript, selected_gguf_model, prompt_input):
     stream = llm.create_chat_completion(
         messages=[
             {"role": "system", "content": "You are an expert in transcript summarization."},
-            {"role": "user", "content": f'{prompt_input} \n{transcript}'}
+            {"role": "user", "content": f'{prompt} \n{text}'}
         ],
         stream=True,
     )
@@ -44,5 +59,24 @@ def summarize_transcript(transcript, selected_gguf_model, prompt_input):
             is_1st_token = False
         token = delta['content']
         full_summary.append(str(token))
-        yield s2tw_converter.convert("".join(full_summary))
-    yield s2tw_converter.convert("".join(full_summary))
+        yield s2tw_converter.convert("".join(full_summary))
+    yield s2tw_converter.convert("".join(full_summary))
+
+def summarize_transcript(transcript, selected_gguf_model, prompt_input):
+    # Handle long transcripts with chunked summarization
+    max_chars = 12000 # Conservative limit per chunk
+    if len(transcript) <= max_chars:
+        # Direct summarization
+        yield from _stream_summary(transcript, prompt_input, selected_gguf_model)
+    else:
+        # Chunked summarization
+        chunk_size = 8000 # Smaller chunks to fit within n_ctx
+        chunks = [transcript[i:i+chunk_size] for i in range(0, len(transcript), chunk_size)]
+        partial_summaries = []
+        for chunk in chunks:
+            partial = _summarize_text(chunk, "Summarize this excerpt from the transcript.", selected_gguf_model)
+            partial_summaries.append(partial)
+        combined = "\n\n".join(partial_summaries)
+        print(f"Combined partial summaries length: {len(combined)} chars")
+        # Stream the final summary of combined partials
+        yield from _stream_summary(combined, prompt_input, selected_gguf_model)
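
For reference, a minimal standalone sketch of the chunk-then-combine flow this commit introduces. The model calls are stubbed out: _stub_summarize and _stub_stream are hypothetical placeholders standing in for _summarize_text and _stream_summary, which in src/summarization.py call the GGUF model through llama-cpp-python and convert output with s2tw_converter. Only the character-based slicing and the streaming shape are meant to match the diff above.

    # Standalone sketch of the chunked summarization flow (assumed shapes, stubbed model calls).
    MAX_CHARS = 12000   # direct summarization below this length (mirrors max_chars in the diff)
    CHUNK_SIZE = 8000   # per-chunk character budget for long transcripts (mirrors chunk_size)

    def _stub_summarize(text, prompt):
        """Hypothetical placeholder for the non-streaming model call (_summarize_text)."""
        return f"[summary of {len(text)} chars for prompt: {prompt!r}]"

    def _stub_stream(text, prompt):
        """Hypothetical placeholder for the streaming model call (_stream_summary)."""
        words = _stub_summarize(text, prompt).split()
        for i in range(1, len(words) + 1):
            yield " ".join(words[:i])  # growing partial output, like the real generator

    def summarize(transcript, prompt):
        if len(transcript) <= MAX_CHARS:
            yield from _stub_stream(transcript, prompt)  # direct path
        else:
            # Map step: summarize each fixed-size slice independently.
            chunks = [transcript[i:i + CHUNK_SIZE] for i in range(0, len(transcript), CHUNK_SIZE)]
            partials = [_stub_summarize(c, "Summarize this excerpt from the transcript.") for c in chunks]
            # Reduce step: stream a final summary over the combined partials.
            yield from _stub_stream("\n\n".join(partials), prompt)

    if __name__ == "__main__":
        long_transcript = "word " * 5000  # ~25,000 chars, forces the chunked path
        latest = ""
        for partial in summarize(long_transcript, "Summarize the meeting."):
            latest = partial  # a UI handler (e.g. Gradio) would re-render this on each yield
        print(latest)

Each yield emits the full partial summary accumulated so far, not a single token, so a caller only needs to keep and display the latest value.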