Luigi committed on
Commit 09695c9 · 1 Parent(s): 2a2dcfd

Fix memory exhaustion on HF Spaces free tier by reducing n_ctx and implementing chunked summarization for long transcripts
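The chunking constants in this commit line up with the reduced context window. As a rough sanity check (assuming the common ~4 characters-per-token heuristic; the real count depends on the model's tokenizer), an 8000-character chunk is about 2000 tokens, leaving roughly half of the 4096-token n_ctx for the prompt and the generated summary:

# Back-of-the-envelope token budget for the chunking constants in this commit.
# The ~4 chars-per-token ratio is an assumption, not a measured value.
CHARS_PER_TOKEN = 4
n_ctx = 4096        # reduced context window set in get_model()
chunk_size = 8000   # chars per chunk used by summarize_transcript()

chunk_tokens = chunk_size // CHARS_PER_TOKEN  # ~2000 tokens of transcript
headroom = n_ctx - chunk_tokens               # ~2096 tokens for prompt + output
print(chunk_tokens, headroom)                 # 2000 2096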

Files changed (1)
  1. src/summarization.py +39 -5
src/summarization.py CHANGED
@@ -14,12 +14,27 @@ def get_model(gguf_repo_id, gguf_filename):
         repo_id=gguf_repo_id,
         filename=gguf_filename,
         verbose=False,
-        n_ctx=32768,
+        n_ctx=4096,  # Reduced from 32768 to prevent memory exhaustion on HF Spaces free tier
         n_threads=num_vcpus,
         repeat_penalty=1.2,
     )
 
-def summarize_transcript(transcript, selected_gguf_model, prompt_input):
+def _summarize_text(text, prompt, selected_gguf_model):
+    """Non-streaming summary for internal use."""
+    repo_id, filename = available_gguf_llms[selected_gguf_model]
+    llm = get_model(repo_id, filename)
+    response = llm.create_chat_completion(
+        messages=[
+            {"role": "system", "content": "You are an expert in transcript summarization."},
+            {"role": "user", "content": f'{prompt} \n{text}'}
+        ],
+        stream=False,
+    )
+    summary = response['choices'][0]['message']['content']
+    return s2tw_converter.convert(summary)
+
+def _stream_summary(text, prompt, selected_gguf_model):
+    """Streaming summary generator."""
     repo_id, filename = available_gguf_llms[selected_gguf_model]
     t0 = time.time()
     llm = get_model(repo_id, filename)
@@ -31,7 +46,7 @@ def summarize_transcript(transcript, selected_gguf_model, prompt_input):
     stream = llm.create_chat_completion(
         messages=[
             {"role": "system", "content": "You are an expert in transcript summarization."},
-            {"role": "user", "content": f'{prompt_input} \n{transcript}'}
+            {"role": "user", "content": f'{prompt} \n{text}'}
         ],
         stream=True,
     )
@@ -44,5 +59,24 @@ def summarize_transcript(transcript, selected_gguf_model, prompt_input):
             is_1st_token = False
         token = delta['content']
         full_summary.append(str(token))
-        yield s2tw_converter.convert("".join(full_summary))  #, "Summarizing"
-    yield s2tw_converter.convert("".join(full_summary))  #, "Summary complete"
+        yield s2tw_converter.convert("".join(full_summary))
+    yield s2tw_converter.convert("".join(full_summary))
+
+def summarize_transcript(transcript, selected_gguf_model, prompt_input):
+    # Handle long transcripts with chunked summarization
+    max_chars = 12000  # Conservative limit per chunk
+    if len(transcript) <= max_chars:
+        # Direct summarization
+        yield from _stream_summary(transcript, prompt_input, selected_gguf_model)
+    else:
+        # Chunked summarization
+        chunk_size = 8000  # Smaller chunks to fit within n_ctx
+        chunks = [transcript[i:i+chunk_size] for i in range(0, len(transcript), chunk_size)]
+        partial_summaries = []
+        for chunk in chunks:
+            partial = _summarize_text(chunk, "Summarize this excerpt from the transcript.", selected_gguf_model)
+            partial_summaries.append(partial)
+        combined = "\n\n".join(partial_summaries)
+        print(f"Combined partial summaries length: {len(combined)} chars")
+        # Stream the final summary of combined partials
+        yield from _stream_summary(combined, prompt_input, selected_gguf_model)
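Taken together, this is a simple map-reduce pattern: each chunk is summarized independently with a fixed prompt, and the concatenated partial summaries are streamed through one final pass using the caller's original prompt. A minimal usage sketch of the generator, assuming a caller outside this module; "some-model" and the transcript path are placeholders, not names from this repository:

# Hypothetical consumer of the streaming generator.
from src.summarization import summarize_transcript

with open("transcript.txt", encoding="utf-8") as f:
    transcript = f.read()

for partial in summarize_transcript(transcript, "some-model", "Summarize this transcript."):
    print(partial)  # each yield is the cumulative summary so far, after s2tw conversion

Note that each yield emits the full summary-so-far (the generator joins all tokens received to date), so a UI can simply overwrite its display with the latest value rather than appending.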