sumesh4C committed on
Commit ca7ae72 · verified · 1 Parent(s): c36c35f

Update tasks/text.py

Files changed (1)
  1. tasks/text.py +39 -1
tasks/text.py CHANGED
@@ -15,6 +15,40 @@ import pickle
 import torch
 import os
 
+import nltk
+from nltk.corpus import stopwords
+import spacy
+
+nltk.download('stopwords')
+# Get the list of English stop words from NLTK
+nltk_stop_words = stopwords.words('english')
+# Load the spaCy model for English
+nlp = spacy.load("en_core_web_sm")
+
+
+def process_text(text):
+    """
+    Process text by:
+    1. Lowercasing
+    2. Removing punctuation and non-alphanumeric characters
+    3. Removing stop words
+    4. Lemmatization
+    """
+    # Step 1: Tokenization & Processing with spaCy
+    doc = nlp(text.lower())  # Process text with spaCy
+
+    # Step 2: Filter out stop words, non-alphanumeric characters, punctuation, and apply lemmatization
+    processed_tokens = [
+        re.sub(r'[^a-zA-Z0-9]', '', token.lemma_)  # Remove non-alphanumeric characters
+        for token in doc
+        if token.text not in nltk_stop_words and token.text not in string.punctuation
+    ]
+
+    # Optional: Filter out empty strings resulting from the regex replacement
+    processed_tokens = " ".join([word for word in processed_tokens if word])
+
+    return processed_tokens
+
 router = APIRouter()
 
 DESCRIPTION = "TF-IDF + RF"
@@ -71,8 +105,12 @@ async def evaluate_text(request: TextEvaluationRequest):
     current_file_path = os.path.abspath(__file__)
     current_dir = os.path.dirname(current_file_path)
 
+    with open(os.path.join(current_dir, "tf-idf_vectorizer.pkl"), "rb") as tfidf_file:
+        tfidf_vectorizer = pickle.load(tfidf_file)
+
+
     # Make predictions using the loaded model
-    predictions = predict(test_dataset, os.path.join(current_dir, "tf-idf_vectorizer.pkl"), os.path.join(current_dir, "random_forest_model.pkl"))
+    predictions = predict(test_dataset, tfidf_vectorizer, os.path.join(current_dir, "random_forest_model.pkl"))
     predictions = [LABEL_MAPPING[label] for label in predictions]
 
     #--------------------------------------------------------------------------------------------
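
The process_text helper added in the first hunk can be sanity-checked on its own. Below is a minimal, self-contained sketch (not part of the commit) that reproduces the same preprocessing steps; it assumes the en_core_web_sm model has been downloaded (python -m spacy download en_core_web_sm) and that re and string, which the hunk uses without importing, are imported elsewhere near the top of tasks/text.py.

# Standalone sketch of the preprocessing added in this commit: lowercase,
# lemmatize with spaCy, drop NLTK stop words and punctuation, and strip
# non-alphanumeric characters from each lemma.
import re
import string

import nltk
import spacy
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk_stop_words = stopwords.words('english')
nlp = spacy.load("en_core_web_sm")  # assumes the model has been installed


def process_text(text):
    doc = nlp(text.lower())
    tokens = [
        re.sub(r'[^a-zA-Z0-9]', '', token.lemma_)
        for token in doc
        if token.text not in nltk_stop_words and token.text not in string.punctuation
    ]
    # Drop tokens that the regex reduced to empty strings
    return " ".join(word for word in tokens if word)


print(process_text("The researchers were studying rapidly melting glaciers in 2023!"))
# Expected shape of the output (exact lemmas depend on the spaCy model):
# researcher study rapidly melt glacier 2023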
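
The second hunk changes the call site so that predict receives the already-unpickled TF-IDF vectorizer object instead of a path to its pickle file. predict itself is defined elsewhere in tasks/text.py and is not shown in this diff; the sketch below is only a hypothetical illustration of a signature compatible with the new call, and both the "quote" column name and the use of a pickled scikit-learn random forest are assumptions, not details confirmed by the diff.

import pickle


def predict(test_dataset, tfidf_vectorizer, model_path):
    # Hypothetical: preprocess the raw texts with the process_text helper
    # defined above in the same module; the "quote" column name is an
    # assumption about the dataset schema.
    texts = [process_text(text) for text in test_dataset["quote"]]

    # Transform with the already-fitted vectorizer passed in by the caller,
    # rather than re-loading it from disk inside this function.
    features = tfidf_vectorizer.transform(texts)

    # Load the pickled classifier and return its labels, which the caller
    # then maps through LABEL_MAPPING.
    with open(model_path, "rb") as model_file:
        model = pickle.load(model_file)
    return model.predict(features)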