Marroco93 committed on
Commit c1fff5f
1 Parent(s): 1aafe2e

no message

Files changed (1)
  1. main.py +14 -30
main.py CHANGED
@@ -81,45 +81,29 @@ async def generate_text(item: Item):
     # Stream response back to the client
     return StreamingResponse(generate_stream(item), media_type="application/x-ndjson")
 
-
-
 def split_text_by_tokens(text, max_tokens=1024):
-    # Tokenize the text
     print("Tokenizing text...")
-
     tokens = tokenizer.tokenize(text)
-    # Split into chunks of max_tokens
+
+    chunks = []
+    token_counts = []
+
     for i in range(0, len(tokens), max_tokens):
-        # Ensure not to exceed the token limit
-        yield tokenizer.convert_tokens_to_string(tokens[i:i+max_tokens])
-
-def summarize_large_text(text):
-    # Use the updated split_text_by_tokens function
-    chunks = list(split_text_by_tokens(text, max_tokens=1024 - 10))  # Slight buffer to avoid edge cases
-    summaries = []
-    print("Tokenization complete, summarizing chunks...")
-
-    for chunk in chunks:
-        print("loop chunks...")
-
-        # Check if chunk is within the token limit just to be sure
-        chunk_tokens = tokenizer.encode(chunk)
-        if len(chunk_tokens) > 1024:
-            continue  # Skip chunks that are still too large
-        # Perform summarization on the chunk
-        summary = summarizer(chunk, max_length=500, min_length=100, do_sample=False)
-        if summary:
-            summaries.append(summary[0]['summary_text'])
-    combined_summary = ' '.join(summaries)
-    return combined_summary
 
 @app.post("/summarize")
 async def summarize_text(request: SummarizeRequest):
     try:
-        summarized_text = summarize_large_text(request.text)
-        return JSONResponse(content={"summary": summarized_text})
+        chunks, token_counts = split_text_by_tokens(request.text, max_tokens=1024 - 10)  # Slight buffer to avoid edge cases
+        chunk_data = [{'chunk': chunk, 'tokens': count} for chunk, count in zip(chunks, token_counts)]
+        return JSONResponse(content={"chunks": chunk_data})
     except Exception as e:
-        print(f"Error during summarization: {e}")
+        print(f"Error during tokenization: {e}")
         raise HTTPException(status_code=500, detail=str(e))
+        chunk = tokenizer.convert_tokens_to_string(tokens[i:i+max_tokens])
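After this commit, the /summarize endpoint no longer returns a single summary; it returns the token-limited chunks together with a token count for each one. A minimal client sketch follows, assuming the app is served locally by uvicorn on port 8000 and that SummarizeRequest exposes a "text" field (neither detail is shown in this diff):

import requests  # any HTTP client works; requests is used here only for illustration

resp = requests.post(
    "http://localhost:8000/summarize",  # assumed local dev address
    json={"text": "A long document to be split into token-limited chunks..."},
)
resp.raise_for_status()
for item in resp.json()["chunks"]:
    # Each entry mirrors chunk_data above: the chunk text and its token count.
    print(item["tokens"], item["chunk"][:60])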