Marroco93 committed
Commit
1b8c3e7
1 Parent(s): bcee5ff

no message

Files changed (1)
main.py +24 -9
main.py CHANGED
@@ -14,6 +14,7 @@ import sentencepiece
 from transformers import pipeline, AutoTokenizer,AutoModelForSequenceClassification,AutoModel
 import spacy
 import numpy as np
+import torch
 
 
 nltk.data.path.append(os.getenv('NLTK_DATA'))
@@ -80,25 +81,38 @@ async def generate_text(item: Item):
     # Stream response back to the client
     return StreamingResponse(generate_stream(item), media_type="application/x-ndjson")
 
-
-
-
 # Define request model
 class TextRequest(BaseModel):
-    text: List[str]  # Expect a list of text segments
+    text: str  # Single string of long text
 
 # Load Longformer model and tokenizer
 tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
-model = AutoModelForSequenceClassification.from_pretrained("allenai/longformer-base-4096")
+model = AutoModel.from_pretrained("allenai/longformer-base-4096")
 
-# Endpoint to process the document and return embeddings for each segment
+# Endpoint to process the document and return embeddings
 @app.post("/process_document")
 async def process_document(request: TextRequest):
     try:
+        # Split the token ids into chunks that fit within the model's max input size
+        max_length = 4096  # Maximum token length for Longformer
+        tokens = tokenizer.encode(request.text, add_special_tokens=True)
+        input_ids = []
+        current_chunk = []
+
+        for token in tokens:
+            if len(current_chunk) < max_length:  # current chunk still has room
+                current_chunk.append(token)
+            else:
+                input_ids.append(current_chunk)
+                current_chunk = [token]
+
+        if current_chunk:
+            input_ids.append(current_chunk)  # Add the last chunk if any
+
+        # Generate embeddings for each segment
         embeddings_list = []
-        for text_segment in request.text:
-            # Process each segment individually
-            inputs = tokenizer(text_segment, return_tensors="pt", padding=True, truncation=True, max_length=4096)
+        for ids in input_ids:
+            inputs = {'input_ids': torch.tensor(ids).unsqueeze(0)}  # Batch size 1
             outputs = model(**inputs)
             embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy()
             embeddings_list.append(embeddings.tolist())  # Store embeddings for each segment
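
For comparison, the manual chunking loop added in this hunk could also use the tokenizer's built-in overflow handling. A minimal sketch, assuming the tokenizer and model objects loaded above and the fast tokenizer that AutoTokenizer returns by default; embed_long_text is a hypothetical helper, not part of this commit:

import torch

def embed_long_text(text: str, max_length: int = 4096) -> list:
    # return_overflowing_tokens yields one encoding per max_length chunk;
    # padding keeps the chunks stackable as a single batch tensor.
    enc = tokenizer(
        text,
        max_length=max_length,
        truncation=True,
        return_overflowing_tokens=True,
        padding=True,
        return_tensors="pt",
    )
    with torch.no_grad():
        out = model(input_ids=enc["input_ids"], attention_mask=enc["attention_mask"])
    # Mean-pool each chunk's token embeddings (padding positions included,
    # matching the simple mean the handler above takes per chunk).
    return out.last_hidden_state.mean(dim=1).tolist()

This removes the manual loop and gives each chunk an attention mask, at the cost of running every chunk as one batch.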
 
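Finally, a quick way to exercise the endpoint once the app is running. A sketch only: the host, port, and sample text are assumptions, and the diff cuts off before the handler's return statement, so the shape of the response body is not shown here:

import requests

payload = {"text": "A long document to embed. " * 2000}  # single string, per TextRequest
resp = requests.post("http://localhost:8000/process_document", json=payload)
resp.raise_for_status()
print(resp.json())  # handler's return value; not visible in this diff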