Marroco93 committed
Commit 8737454
1 Parent(s): 44cdc71

no message

Files changed (2)
  1. main.py +22 -12
  2. requirements.txt +1 -0
main.py CHANGED
@@ -12,6 +12,7 @@ import os
 import google.protobuf # This line should execute without errors if protobuf is installed correctly
 import sentencepiece
 from transformers import pipeline, AutoTokenizer,AutoModelForSeq2SeqLM
+import spacy
 
 
 nltk.data.path.append(os.getenv('NLTK_DATA'))
@@ -102,31 +103,40 @@ tokenizer = AutoTokenizer.from_pretrained("nsi319/legal-pegasus")
 model = AutoModelForSeq2SeqLM.from_pretrained("nsi319/legal-pegasus")
 
 
+# Load spaCy model
+nlp = spacy.load("en_core_web_sm")
+
 class TextRequest(BaseModel):
     text: str
 
-
 def preprocess_text(text: str) -> str:
-    # Normalize whitespace
+    # Normalize whitespace and strip punctuation
     text = re.sub(r'\s+', ' ', text.strip())
-
-    # Optional: Add additional preprocessing steps
-    # E.g., handling or stripping special characters, lowercasing, etc.
-    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation for simplicity
-
+    text = re.sub(r'[^\w\s]', '', text)
     return text
 
+def reduce_tokens(text: str) -> str:
+    # Process the text with spaCy
+    doc = nlp(text)
+    # Select sentences that might be more important - this is a simple heuristic
+    important_sentences = []
+    for sent in doc.sents:
+        if any(tok.dep_ == 'ROOT' for tok in sent):
+            important_sentences.append(sent.text)
+    # Join selected sentences to form the reduced text
+    reduced_text = ' '.join(important_sentences)
+    return reduced_text
+
 @app.post("/summarize")
 async def summarize(request: TextRequest):
     try:
         processed_text = preprocess_text(request.text)
-
-        return {"summary": processed_text}
-
+        reduced_text = reduce_tokens(processed_text)
+        return {"reduced_text": reduced_text}
+
     except Exception as e:
-        print(f"Error during summarization: {e}")
+        print(f"Error during token reduction: {e}")
         raise HTTPException(status_code=500, detail=str(e))
 
-
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=8000)
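For orientation, here is a minimal sketch of exercising the updated endpoint once the app is running; the local URL, the sample text, and the `requests` client are illustrative assumptions, not part of this commit. Worth noting when trying it: preprocess_text strips sentence punctuation before reduce_tokens runs, and every sentence produced by spaCy's parser contains a ROOT token, so the heuristic tends to return the input largely intact.

import requests  # assumed client-side dependency, not in requirements.txt

# POST to the updated endpoint; the response now carries the
# heuristically reduced text under the "reduced_text" key.
resp = requests.post(
    "http://localhost:8000/summarize",
    json={"text": "The court held that the agreement was unenforceable. "
                  "Damages were awarded to the plaintiff."},
)
resp.raise_for_status()
print(resp.json())  # e.g. {"reduced_text": "The court held that ..."}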
requirements.txt CHANGED
@@ -7,3 +7,4 @@ nltk
 transformers
 sentencepiece
 protobuf
+spacy
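A caveat on the new dependency: installing spacy from requirements.txt does not install the en_core_web_sm weights that spacy.load("en_core_web_sm") expects, so the model has to be fetched separately (typically `python -m spacy download en_core_web_sm` at build time). Below is a sketch of a startup-time fallback, assuming the runtime environment permits downloads; this is illustrative, not part of this commit.

import spacy
from spacy.cli import download  # programmatic equivalent of `python -m spacy download`

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:  # spaCy raises OSError when the model package is missing
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")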