no message
Browse files
@@ -81,45 +81,29 @@ async def generate_text(item: Item):
81 |
# Stream response back to the client
82 |
return StreamingResponse(generate_stream(item), media_type="application/x-ndjson")
83 |
84 |
85 |
86 |
def split_text_by_tokens(text, max_tokens=1024):
87 |
# Tokenize the text
88 |
print("Tokenizing text...")
89 |
90 |
tokens = tokenizer.tokenize(text)
91 |
92 |
for i in range(0, len(tokens), max_tokens):
93 |
94 |
95 |
96 |
97 |
98 |
99 |
summaries = []
100 |
print("Tokenization complete, summarizing chunks...")
101 |
102 |
for chunk in chunks:
103 |
print("loop chunks...")
104 |
105 |
# Check if chunk is within the token limit just to be sure
106 |
chunk_tokens = tokenizer.encode(chunk)
107 |
if len(chunk_tokens) > 1024:
108 |
continue # Skip chunks that are still too large
109 |
# Perform summarization on the chunk
110 |
summary = summarizer(chunk, max_length=500, min_length=100, do_sample=False)
111 |
if summary:
112 |
113 |
combined_summary = ' '.join(summaries)
114 |
return combined_summary
115 |
116 | @app.post("/summarize")
117 |
async def summarize_text(request: SummarizeRequest):
118 |
119 |
120 |
121 |
except Exception as e:
122 |
print(f"Error during
123 |
raise HTTPException(status_code=500, detail=str(e))
124 |
125 |
81 |
# Stream response back to the client
82 |
return StreamingResponse(generate_stream(item), media_type="application/x-ndjson")
83 |
84 |
def split_text_by_tokens(text, max_tokens=1024):
    """Split *text* into consecutive chunks of at most ``max_tokens`` tokens.

    The text is tokenized once with the module-level ``tokenizer``. The token
    list is cut into back-to-back windows of ``max_tokens`` tokens, and each
    window is converted back into a string.

    Args:
        text: The input string to split.
        max_tokens: Maximum number of tokens per window; must be positive.

    Returns:
        A ``(chunks, token_counts)`` tuple of two equal-length lists: the
        chunk strings, and for each chunk ``len(tokenizer.encode(chunk))``.
        Both lists are empty when the text produces no tokens.

    Raises:
        ValueError: If ``max_tokens`` is not positive.
    """
    # A zero step makes range() raise and a negative step silently yields
    # nothing, so reject both up front with a clear message.
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")

    print("Tokenizing text...")
    tokens = tokenizer.tokenize(text)

    chunks = []
    token_counts = []

    for start in range(0, len(tokens), max_tokens):
        chunk = tokenizer.convert_tokens_to_string(tokens[start:start + max_tokens])
        # Keep the chunk itself; without this the caller gets counts but no text.
        chunks.append(chunk)
        # Count tokens of the current chunk by re-encoding it.
        # NOTE(review): encode() may add special tokens or re-tokenize slightly
        # differently, so this count can differ from the window size - confirm.
        token_counts.append(len(tokenizer.encode(chunk)))

    print("Tokenization complete.")
    return chunks, token_counts
98 |
99 | @app.post("/summarize")
100 |
async def summarize_text(request: SummarizeRequest):
    """Split the request text into token-limited chunks and return them as JSON.

    Despite its name, this handler performs no summarization: it only calls
    ``split_text_by_tokens`` on ``request.text`` and reports each chunk with
    its token count.

    Returns:
        A ``JSONResponse`` whose content is
        ``{"chunks": [{"chunk": <str>, "tokens": <int>}, ...]}``.

    Raises:
        HTTPException: With status 500 and the error text as detail when
            anything in the splitting step fails.
    """
    try:
        # Slight buffer below 1024 to avoid edge cases.
        chunks, token_counts = split_text_by_tokens(request.text, max_tokens=1024 - 10)
        chunk_data = [
            {'chunk': chunk, 'tokens': count}
            for chunk, count in zip(chunks, token_counts)
        ]
        return JSONResponse(content={"chunks": chunk_data})
    except Exception as e:
        print(f"Error during tokenization: {e}")
        # Chain the cause so the original traceback is preserved in logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
108 |
109 |