main.py
CHANGED
@@ -81,45 +81,29 @@ async def generate_text(item: Item):
     # Stream response back to the client
     return StreamingResponse(generate_stream(item), media_type="application/x-ndjson")

-
-
 def split_text_by_tokens(text, max_tokens=1024):
-    # Tokenize the text
     print("Tokenizing text...")
-
     tokens = tokenizer.tokenize(text)
-
+
+    chunks = []
+    token_counts = []
+
     for i in range(0, len(tokens), max_tokens):
-
-
-
-
-
-        chunks
-    summaries = []
-    print("Tokenization complete, summarizing chunks...")
-
-    for chunk in chunks:
-        print("loop chunks...")
-
-        # Check if chunk is within the token limit just to be sure
-        chunk_tokens = tokenizer.encode(chunk)
-        if len(chunk_tokens) > 1024:
-            continue  # Skip chunks that are still too large
-        # Perform summarization on the chunk
-        summary = summarizer(chunk, max_length=500, min_length=100, do_sample=False)
-        if summary:
-            summaries.append(summary[0]['summary_text'])
-    combined_summary = ' '.join(summaries)
-    return combined_summary
+        chunk = tokenizer.convert_tokens_to_string(tokens[i:i+max_tokens])
+        chunks.append(chunk)
+        token_counts.append(len(tokenizer.encode(chunk)))  # Count tokens of the current chunk
+
+    print("Tokenization complete.")
+    return chunks, token_counts

 @app.post("/summarize")
 async def summarize_text(request: SummarizeRequest):
     try:
-
-
+        chunks, token_counts = split_text_by_tokens(request.text, max_tokens=1024 - 10)  # Slight buffer to avoid edge cases
+        chunk_data = [{'chunk': chunk, 'tokens': count} for chunk, count in zip(chunks, token_counts)]
+        return JSONResponse(content={"chunks": chunk_data})
     except Exception as e:
-        print(f"Error during
+        print(f"Error during tokenization: {e}")
         raise HTTPException(status_code=500, detail=str(e))


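For reference, a self-contained sketch of the chunking helper this commit introduces. The tokenizer checkpoint below is a placeholder for illustration only (main.py defines the Space's real tokenizer and summarizer elsewhere); any Hugging Face tokenizer with tokenize, convert_tokens_to_string and encode behaves the same way. Because encode() normally adds special tokens, a chunk's reported count can slightly exceed the raw window size, which is presumably why the endpoint calls the helper with max_tokens=1024 - 10 as a buffer.

from transformers import AutoTokenizer  # assumption: transformers is installed

# Placeholder checkpoint, not taken from this repo.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

def split_text_by_tokens(text, max_tokens=1024):
    # Same logic as the committed helper: slice the token stream into
    # fixed-size windows and report the encoded length of each chunk.
    tokens = tokenizer.tokenize(text)
    chunks, token_counts = [], []
    for i in range(0, len(tokens), max_tokens):
        chunk = tokenizer.convert_tokens_to_string(tokens[i:i + max_tokens])
        chunks.append(chunk)
        token_counts.append(len(tokenizer.encode(chunk)))
    return chunks, token_counts

chunks, counts = split_text_by_tokens("some long text " * 2000, max_tokens=1024 - 10)
print(len(chunks), counts)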
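A minimal sketch of exercising the updated /summarize endpoint, assuming the Space is reachable at a hypothetical local URL and that SummarizeRequest exposes a single text field (as request.text in the handler suggests). After this change the endpoint returns the chunks and their token counts rather than a summary.

import requests  # assumption: the requests library is available on the client

BASE_URL = "http://localhost:7860"  # hypothetical address; substitute the deployed Space URL

resp = requests.post(f"{BASE_URL}/summarize", json={"text": "A long document to split into token-bounded chunks..."})
resp.raise_for_status()

# Expected shape: {"chunks": [{"chunk": "...", "tokens": 123}, ...]}
for entry in resp.json()["chunks"]:
    print(entry["tokens"], entry["chunk"][:60])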