Spaces:
Sleeping
Sleeping
no message
Browse files
main.py
CHANGED
@@ -14,6 +14,7 @@ import sentencepiece
|
|
14 |
from transformers import pipeline, AutoTokenizer,AutoModelForSequenceClassification,AutoModel
|
15 |
import spacy
|
16 |
import numpy as np
|
|
|
17 |
|
18 |
|
19 |
nltk.data.path.append(os.getenv('NLTK_DATA'))
|
@@ -80,25 +81,39 @@ async def generate_text(item: Item):
|
|
80 |
# Stream response back to the client
|
81 |
return StreamingResponse(generate_stream(item), media_type="application/x-ndjson")
|
82 |
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
# Define request model
|
87 |
class TextRequest(BaseModel):
|
88 |
-
text:
|
89 |
|
90 |
# Load Longformer model and tokenizer
|
91 |
tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
|
92 |
-
model =
|
93 |
|
94 |
-
# Endpoint to process the document and return embeddings
|
95 |
@app.post("/process_document")
|
96 |
async def process_document(request: TextRequest):
|
97 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
98 |
embeddings_list = []
|
99 |
-
for
|
100 |
-
#
|
101 |
-
inputs = tokenizer(text_segment, return_tensors="pt", padding=True, truncation=True, max_length=4096)
|
102 |
outputs = model(**inputs)
|
103 |
embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy()
|
104 |
embeddings_list.append(embeddings.tolist()) # Store embeddings for each segment
|
|
|
14 |
from transformers import pipeline, AutoTokenizer,AutoModelForSequenceClassification,AutoModel
|
15 |
import spacy
|
16 |
import numpy as np
|
17 |
+
import torch
|
18 |
|
19 |
|
20 |
nltk.data.path.append(os.getenv('NLTK_DATA'))
|
|
|
81 |
# Stream response back to the client
|
82 |
return StreamingResponse(generate_stream(item), media_type="application/x-ndjson")
|
83 |
|
|
|
|
|
|
|
84 |
# Define request model
|
85 |
class TextRequest(BaseModel):
    """Request payload carrying the document text to embed."""

    # The entire document, supplied as one (possibly very long) string.
    text: str
|
87 |
|
88 |
# Longformer tokenizer and encoder are loaded once at import time from the
# same checkpoint; the id is written once so the two cannot drift apart.
_LONGFORMER_CHECKPOINT = "allenai/longformer-base-4096"
tokenizer = AutoTokenizer.from_pretrained(_LONGFORMER_CHECKPOINT)
model = AutoModel.from_pretrained(_LONGFORMER_CHECKPOINT)
|
91 |
|
92 |
+
# Endpoint to process the document and return embeddings
|
93 |
@app.post("/process_document")
|
94 |
async def process_document(request: TextRequest):
|
95 |
try:
|
96 |
+
# Split the text into segments that fit within the model's max input size
|
97 |
+
max_length = 4096 # Maximum token length for Longformer
|
98 |
+
words = request.text.split()
|
99 |
+
tokens = tokenizer.encode(request.text, add_special_tokens=True)
|
100 |
+
input_ids = []
|
101 |
+
current_chunk = []
|
102 |
+
|
103 |
+
for token in tokens:
|
104 |
+
if len(current_chunk) + len(tokenizer.convert_ids_to_tokens([token])) < max_length:
|
105 |
+
current_chunk.append(token)
|
106 |
+
else:
|
107 |
+
input_ids.append(current_chunk)
|
108 |
+
current_chunk = [token]
|
109 |
+
|
110 |
+
if current_chunk:
|
111 |
+
input_ids.append(current_chunk) # Add the last chunk if any
|
112 |
+
|
113 |
+
# Generate embeddings for each segment
|
114 |
embeddings_list = []
|
115 |
+
for ids in input_ids:
|
116 |
+
inputs = {'input_ids': torch.tensor(ids).unsqueeze(0)} # Batch size 1
|
|
|
117 |
outputs = model(**inputs)
|
118 |
embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy()
|
119 |
embeddings_list.append(embeddings.tolist()) # Store embeddings for each segment
|