Connexus committed
Commit 9998352 · verified · 1 parent: 549488a

Upload 2 files

Files changed (1)
  1. services/grammar_service.py +68 -27
services/grammar_service.py CHANGED
@@ -2,69 +2,110 @@ import os
 import nltk
 import torch
 from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
-from huggingface_hub import snapshot_download  # We will use this for a more robust download
+from huggingface_hub import snapshot_download
 
 class GrammarService:
+    """
+    Final, definitive service class.
+    - Models are downloaded from a private Hugging Face Hub.
+    - NLTK data is expected to be pre-installed by the Docker build process.
+    """
     _models = {}
-    _hf_repo_name = "Connexus/grammar-genie-models"
+
+    # --- CONFIGURATION ---
+    _hf_repo_name = "Connexus/grammar-genie-models"  # Your specific repo name
     _hf_token = os.environ.get("HUGGING_FACE_TOKEN")
 
     @classmethod
     def load_models(cls):
+        """
+        Loads all available models from the private Hugging Face repository into memory.
+        """
         print("="*50)
-        print(f"BULLETPROOF STARTUP: Loading models from '{cls._hf_repo_name}'...")
+        print(f"Final Version Startup: Loading models from '{cls._hf_repo_name}'...")
 
-        # --- NLTK Setup (already correct) ---
-        local_nltk_data_path = os.path.join(os.getcwd(), "nltk_data")
-        if not os.path.exists(local_nltk_data_path):
-            os.makedirs(local_nltk_data_path)
-        nltk.data.path.append(local_nltk_data_path)
+        # --- FINAL NLTK SETUP ---
+        # The Dockerfile is now responsible for the download.
+        # This code just verifies that the data is present where NLTK can find it.
         try:
            nltk.data.find('tokenizers/punkt')
-            print(f" > NLTK 'punkt' tokenizer found.")
+            print(" > NLTK 'punkt' tokenizer found successfully.")
         except LookupError:
-            print(f" > NLTK 'punkt' not found. Downloading to: {local_nltk_data_path}")
-            nltk.download('punkt', download_dir=local_nltk_data_path)
+            print(" > [FATAL ERROR] NLTK 'punkt' not found. The Docker build may have failed to download it.")
+            # Stop the application if NLTK data is missing, as it cannot function.
+            return
+        # --- END OF NLTK SETUP ---
 
         supported_languages = ["english", "french"]
 
         if not cls._hf_token:
-            print(" > [FATAL ERROR] HUGGING_FACE_TOKEN not set.")
+            print(" > [FATAL ERROR] HUGGING_FACE_TOKEN environment variable not set.")
             return
 
         for lang in supported_languages:
             model_subfolder = lang
             print(f" > Processing model for '{lang}'...")
             try:
-                # --- NEW, MORE ROBUST DOWNLOAD STEP ---
-                print(f" > Step 1: Downloading all files from subfolder '{model_subfolder}'...")
-                # snapshot_download is the most reliable way to download a whole folder
-                # It will use the token and save files to a local cache directory
+                print(f" - Step 1: Downloading files from subfolder '{model_subfolder}'...")
                 local_model_dir = snapshot_download(
                     repo_id=cls._hf_repo_name,
-                    allow_patterns=f"{model_subfolder}/*",  # Only download files from this subfolder
+                    allow_patterns=f"{model_subfolder}/*",
                     use_auth_token=cls._hf_token,
                     repo_type="model"
                 )
-                print(f" > Download complete. Files are cached locally.")
+                print(f" - Download complete.")
 
-                # --- The pipeline now loads from the local cache, not the internet ---
-                print(f" > Step 2: Loading pipeline from local cache...")
-                device_num = 0 if torch.cuda.is_available() else -1
-                # We point the pipeline to the specific subfolder inside the cache
+                print(f" - Step 2: Loading pipeline from local cache...")
                 final_model_path = os.path.join(local_model_dir, model_subfolder)
                 cls._models[lang] = pipeline(
                     "text2text-generation",
-                    model=final_model_path,  # Load from the specific local directory
-                    device=device_num
+                    model=final_model_path,
+                    device=-1  # Force CPU
                 )
                 print(f" > Model for '{lang}' loaded successfully into memory.")
 
             except Exception as e:
-                print(f" > [FATAL ERROR] during processing for '{lang}'.")
-                print(f" > Details: {e}")
+                print(f" > [FATAL ERROR] during processing for '{lang}'. Details: {e}")
+                return
 
         print("Model loading complete.")
         print("="*50)
 
-        # ... (correct_paragraph method is unchanged) ...
+    @classmethod
+    def correct_paragraph(cls, paragraph: str, language: str) -> str:
+        """
+        Corrects the grammar of a paragraph for a specified language.
+        """
+        if language not in cls._models:
+            return f"Error: Language '{language}' is not supported or its model failed to load."
+
+        corrector = cls._models[language]
+        sentences = nltk.sent_tokenize(paragraph)
+
+        if language == 'english':
+            prefix = "fix grammatical errors in the following text: "
+        elif language == 'french':
+            prefix = ""
+        else:
+            prefix = "correct grammar: "
+
+        corrected_sentences = []
+        for sentence in sentences:
+            input_text = f"{prefix}{sentence}"
+            try:
+                results = corrector(input_text, max_length=256, num_beams=5)
+                raw_output = results[0]['generated_text']
+
+                if prefix and raw_output.startswith(prefix):
+                    clean_sentence = raw_output.replace(prefix, "", 1).strip()
+                else:
+                    clean_sentence = raw_output.strip()
+
+                corrected_sentences.append(clean_sentence)
+            except Exception as e:
+                print(f" > [WARNING] Failed to process a sentence. Using original. Error: {e}")
+                corrected_sentences.append(sentence)
+
+        return " ".join(corrected_sentences)
+
+### Next Steps
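Note on the NLTK change in this commit: `load_models()` no longer downloads `punkt` itself and only verifies that it is present, so the image build must place the data on NLTK's default search path (typically via a `RUN python -m nltk.downloader ...` step). The Dockerfile itself is not part of this commit, so the following build-time bootstrap is only a sketch; the target directory is an assumption chosen because it is on NLTK's default search path.

```python
# Illustrative build-time bootstrap; the actual Dockerfile is not in this commit.
# /usr/local/share/nltk_data is one of NLTK's default search locations, so the
# runtime check nltk.data.find('tokenizers/punkt') in load_models() will succeed.
import nltk

nltk.download("punkt", download_dir="/usr/local/share/nltk_data")
```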
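For reference, a minimal sketch of the intended call sequence; the application entry point is not shown in this commit, so the import path and sample text below are assumptions based on the file location `services/grammar_service.py`.

```python
# Usage sketch; assumes HUGGING_FACE_TOKEN is set in the environment and that
# the private repo "Connexus/grammar-genie-models" has 'english' and 'french'
# subfolders, as load_models() expects.
from services.grammar_service import GrammarService

GrammarService.load_models()  # one-time startup: snapshot_download + pipeline load

sample = "She go to school every days."
print(GrammarService.correct_paragraph(sample, "english"))
```

One caveat: newer huggingface_hub releases deprecate the `use_auth_token` argument of `snapshot_download` in favor of `token`, so the pinned dependency version matters for the code above.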