Spaces:

abdull4h
/

C4AI-Arabic-Playground

Sleeping

App Files Files Community

abdull4h committed on Mar 1

Commit

ffd2a10

verified ·

1 Parent(s): bc39f18

Update app.py

Browse files

Files changed (1) hide show

app.py +73 -54

app.py CHANGED Viewed

@@ -1,8 +1,37 @@
 import os
 import gradio as gr
 from huggingface_hub import login
 import spaces
 # Model ID
 model_id = "CohereForAI/c4ai-command-r7b-arabic-02-2025"
@@ -15,7 +44,6 @@ else:
     print("No HF_TOKEN found. Please set the HF_TOKEN environment variable.")
 # Import libraries at the module level
-import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 # Pre-load tokenizer at module level
@@ -27,50 +55,34 @@ except Exception as e:
     print(f"Failed to load tokenizer: {str(e)}")
     tokenizer = None
-# To track if model was loaded
-model_loaded = False
-print(f"Initial model_loaded state: {model_loaded}")
 # Single combined function that handles both loading and generation
 @spaces.GPU
-def load_and_generate(prompt, max_length=100, temperature=0.3, force_reload=False):
-    global model_loaded
-    # First make sure model is loaded
-    if not model_loaded or force_reload:
-        print(f"Loading model (current state: {model_loaded}, force_reload: {force_reload})...")
-        try:
-            # Load model with GPU acceleration
-            model = AutoModelForCausalLM.from_pretrained(
-                model_id,
-                token=hf_token,
-                torch_dtype=torch.float16,
-                device_map="auto"
-            )
-            model_loaded = True
-            print("Model loaded successfully within the function!")
-        except Exception as e:
-            import traceback
-            error_details = traceback.format_exc()
-            print(f"Error loading model: {str(e)}\n{error_details}")
-            return f"Failed to load model: {str(e)}"
-    else:
-        print("Model was already loaded")
-        # We still need to load the model within this function call due to ZeroGPU isolation
-        try:
-            model = AutoModelForCausalLM.from_pretrained(
-                model_id,
-                token=hf_token,
-                torch_dtype=torch.float16,
-                device_map="auto"
-            )
-            print("Model reloaded for this function call")
-        except Exception as e:
-            print(f"Error reloading model: {str(e)}")
-            return f"Error reloading model: {str(e)}"
-    # Now generate text with the loaded model
     if not prompt.strip():
         return "Please enter a prompt."
@@ -89,14 +101,21 @@ def load_and_generate(prompt, max_length=100, temperature=0.3, force_reload=Fals
         # Move to model device
         input_ids = input_ids.to(model.device)
-        # Generate
-        gen_tokens = model.generate(
-            input_ids,
-            max_new_tokens=int(max_length),
-            do_sample=True if temperature > 0 else False,
-            temperature=float(temperature) if temperature > 0 else None,
-            top_p=0.95 if temperature > 0 else None
-        )
         # Decode and return
         gen_text = tokenizer.decode(gen_tokens[0], skip_special_tokens=True)
@@ -141,10 +160,11 @@ with gr.Blocks(title="Cohere Arabic Model Demo") as demo:
                 with gr.Row():
                     for example in example_prompts[i:i+2]:
                         if example:  # Make sure example exists
-                            def create_click_handler(ex):
                                 return lambda: ex
                             gr.Button(example).click(
-                                fn=create_click_handler(example),
                                 inputs=[],
                                 outputs=[prompt]
                             )
@@ -153,7 +173,6 @@ with gr.Blocks(title="Cohere Arabic Model Demo") as demo:
             with gr.Accordion("Parameters", open=False):
                 max_tokens = gr.Slider(minimum=10, maximum=500, value=100, step=10, label="Max New Tokens")
                 temp = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.1, label="Temperature")
-                force_reload = gr.Checkbox(label="Force reload model (use only if needed)", value=False)
             # Action buttons
             with gr.Row():
@@ -166,8 +185,8 @@ with gr.Blocks(title="Cohere Arabic Model Demo") as demo:
     # Set up event handlers
     submit_btn.click(
-        fn=load_and_generate,
-        inputs=[prompt, max_tokens, temp, force_reload],
         outputs=[output]
     )
     clear_btn.click(fn=lambda: "", inputs=[], outputs=[prompt, output])

 import os
+import sys
 import gradio as gr
 from huggingface_hub import login
 import spaces
+# CRITICAL: Disable PyTorch compiler BEFORE importing torch
+os.environ["PYTORCH_NO_CUDA_MEMORY_CACHING"] = "1"
+os.environ["TORCH_COMPILE_DISABLE"] = "1"
+os.environ["TORCH_INDUCTOR_DISABLE"] = "1"
+os.environ["TORCHINDUCTOR_DISABLE_CUDAGRAPHS"] = "1"
+os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+os.environ["TORCH_USE_CUDA_DSA"] = "0"
+# Now import torch and disable its compiler features
+import torch
+if hasattr(torch, "_dynamo"):
+    if hasattr(torch._dynamo, "config"):
+        torch._dynamo.config.suppress_errors = True
+    if hasattr(torch._dynamo, "disable"):
+        torch._dynamo.disable()
+        print("Disabled torch._dynamo")
+# Disable JIT functionality safely
+if hasattr(torch, "_C") and hasattr(torch._C, "_jit_set_profiling_executor"):
+    torch._C._jit_set_profiling_executor(False)
+    print("Disabled JIT profiling executor")
+if hasattr(torch, "_C") and hasattr(torch._C, "_jit_set_profiling_mode"):
+    torch._C._jit_set_profiling_mode(False)
+    print("Disabled JIT profiling mode")
+if hasattr(torch, "_C") and hasattr(torch._C, "_set_graph_executor_optimize"):
+    torch._C._set_graph_executor_optimize(False)
+    print("Disabled graph executor optimization")
 # Model ID
 model_id = "CohereForAI/c4ai-command-r7b-arabic-02-2025"
     print("No HF_TOKEN found. Please set the HF_TOKEN environment variable.")
 # Import libraries at the module level
 from transformers import AutoTokenizer, AutoModelForCausalLM
 # Pre-load tokenizer at module level
     print(f"Failed to load tokenizer: {str(e)}")
     tokenizer = None
 # Single combined function that handles both loading and generation
 @spaces.GPU
+def generate_text(prompt, max_length=100, temperature=0.3):
+    # Load model with compiler disabled
+    try:
+        # Configure the model loading to avoid compiler
+        print("Loading model with compiler disabled...")
+        # Load model with no optimizations
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            token=hf_token,
+            torch_dtype=torch.float16,
+            device_map="auto",
+            # Disable features that might trigger compiler
+            use_cache=True,
+            use_flash_attention_2=False,
+            _attn_implementation="eager"
+        )
+        print(f"Model loaded successfully on {next(model.parameters()).device}")
+    except Exception as e:
+        import traceback
+        error_details = traceback.format_exc()
+        print(f"Error loading model: {str(e)}\n{error_details}")
+        return f"Failed to load model: {str(e)}"
+    # Generate text with the loaded model
     if not prompt.strip():
         return "Please enter a prompt."
         # Move to model device
         input_ids = input_ids.to(model.device)
+        # Generate with compiler completely disabled
+        with torch.inference_mode():
+            # Force eager execution
+            torch._C._jit_override_can_fuse_on_cpu(False)
+            torch._C._jit_override_can_fuse_on_gpu(False)
+            # Safe generation
+            gen_tokens = model.generate(
+                input_ids,
+                max_new_tokens=int(max_length),
+                do_sample=True if temperature > 0 else False,
+                temperature=float(temperature) if temperature > 0 else None,
+                top_p=0.95 if temperature > 0 else None,
+                use_cache=True
+            )
         # Decode and return
         gen_text = tokenizer.decode(gen_tokens[0], skip_special_tokens=True)
                 with gr.Row():
                     for example in example_prompts[i:i+2]:
                         if example:  # Make sure example exists
+                            # This is a workaround for closure binding in loops
+                            def make_click_handler(ex):
                                 return lambda: ex
                             gr.Button(example).click(
+                                fn=make_click_handler(example),
                                 inputs=[],
                                 outputs=[prompt]
                             )
             with gr.Accordion("Parameters", open=False):
                 max_tokens = gr.Slider(minimum=10, maximum=500, value=100, step=10, label="Max New Tokens")
                 temp = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.1, label="Temperature")
             # Action buttons
             with gr.Row():
     # Set up event handlers
     submit_btn.click(
+        fn=generate_text,
+        inputs=[prompt, max_tokens, temp],
         outputs=[output]
     )
     clear_btn.click(fn=lambda: "", inputs=[], outputs=[prompt, output])