no message

- main.py +22 -12
- requirements.txt +1 -0
main.py
CHANGED
@@ -12,6 +12,7 @@ import os
 import google.protobuf # This line should execute without errors if protobuf is installed correctly
 import sentencepiece
 from transformers import pipeline, AutoTokenizer,AutoModelForSeq2SeqLM
+import spacy


 nltk.data.path.append(os.getenv('NLTK_DATA'))
@@ -102,31 +103,40 @@ tokenizer = AutoTokenizer.from_pretrained("nsi319/legal-pegasus")
 model = AutoModelForSeq2SeqLM.from_pretrained("nsi319/legal-pegasus")


+# Load spaCy model
+nlp = spacy.load("en_core_web_sm")
+
 class TextRequest(BaseModel):
     text: str

-
 def preprocess_text(text: str) -> str:
-    # Normalize whitespace
+    # Normalize whitespace and strip punctuation
     text = re.sub(r'\s+', ' ', text.strip())
-
-    # Optional: Add additional preprocessing steps
-    # E.g., handling or stripping special characters, lowercasing, etc.
-    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation for simplicity
-
+    text = re.sub(r'[^\w\s]', '', text)
     return text

+def reduce_tokens(text: str) -> str:
+    # Process the text with spaCy
+    doc = nlp(text)
+    # Select sentences that might be more important - this is a simple heuristic
+    important_sentences = []
+    for sent in doc.sents:
+        if any(tok.dep_ == 'ROOT' for tok in sent):
+            important_sentences.append(sent.text)
+    # Join selected sentences to form the reduced text
+    reduced_text = ' '.join(important_sentences)
+    return reduced_text
+
 @app.post("/summarize")
 async def summarize(request: TextRequest):
     try:
         processed_text = preprocess_text(request.text)
-
-        return {"
-
+        reduced_text = reduce_tokens(processed_text)
+        return {"reduced_text": reduced_text}
+
     except Exception as e:
-        print(f"Error during
+        print(f"Error during token reduction: {e}")
         raise HTTPException(status_code=500, detail=str(e))

-
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=8000)
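For reference, a minimal client call against the updated /summarize endpoint could look like the sketch below. It assumes the app is running locally on port 8000 (as in the __main__ block) and that the requests package is available; the sample text is purely illustrative and not part of the commit.

# Sketch: exercising the updated /summarize endpoint locally
# (assumes the FastAPI app from main.py is already running on port 8000).
import requests

payload = {
    "text": (
        "The Tenant shall pay rent on the first day of each month. "
        "Failure to pay may result in termination of the lease."
    ),
}

resp = requests.post("http://localhost:8000/summarize", json=payload)
resp.raise_for_status()
print(resp.json())  # expected shape: {"reduced_text": "..."}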
requirements.txt
CHANGED
@@ -7,3 +7,4 @@ nltk
 transformers
 sentencepiece
 protobuf
+spacy
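One note on the new dependency: requirements.txt now installs the spacy package, but spacy.load("en_core_web_sm") in main.py also needs the small English model, which ships separately from the package. A possible startup fallback (a sketch and an assumption, not part of this commit) would be:

# Sketch: load en_core_web_sm, downloading it first if it is missing.
import spacy

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # The model is distributed separately from the spacy package itself.
    from spacy.cli import download
    download("en_core_web_sm")  # same effect as: python -m spacy download en_core_web_sm
    nlp = spacy.load("en_core_web_sm")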