Update app.py
app.py
CHANGED
@@ -513,6 +513,13 @@ def load_model():
     if tokenizer is not None and model is not None:
         return tokenizer, model, device
 
+    # Get HF token from environment
+    hf_token = os.getenv("HF_TOKEN")
+    if hf_token:
+        print("🔑 HF_TOKEN found - using authenticated access")
+    else:
+        print("⚠️ No HF_TOKEN found - using public access only")
+
     try:
         # This appears to be a LoRA adapter
         adapter_path = "rootxhacker/llama-3B-diffusion-exp-fixed"
@@ -520,19 +527,24 @@ def load_model():
 
         print(f"Loading AR-Diffusion model on {device}...")
 
-        # Load tokenizer from adapter
-        tokenizer = AutoTokenizer.from_pretrained(adapter_path, trust_remote_code=True)
+        # Load tokenizer from adapter with token
+        tokenizer = AutoTokenizer.from_pretrained(
+            adapter_path,
+            trust_remote_code=True,
+            token=hf_token
+        )
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
 
-        # Load the adapter model
+        # Load the adapter model with token
        print("Loading adapter model...")
         model = AutoModelForCausalLM.from_pretrained(
             adapter_path,
             torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
             device_map="auto" if device.type == "cuda" else None,
             trust_remote_code=True,
-            low_cpu_mem_usage=True
+            low_cpu_mem_usage=True,
+            token=hf_token
         )
 
         print("✅ AR-Diffusion model loaded successfully!")
@@ -541,24 +553,56 @@ def load_model():
     except Exception as e:
         print(f"❌ Error loading {adapter_path}: {e}")
 
-        #
-        print("🔄
+        # Try alternative working models for AR-Diffusion demo
+        print("🔄 Trying alternative models...")
         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        fallback_model = "gpt2-medium"
 
-        tokenizer = AutoTokenizer.from_pretrained(fallback_model)
+        # Try different models in order of preference
+        alternative_models = [
+            "microsoft/DialoGPT-medium",
+            "gpt2-large",
+            "gpt2-medium",
+            "distilgpt2"
+        ]
+
+        for alt_model in alternative_models:
+            try:
+                print(f"Trying {alt_model}...")
+                tokenizer = AutoTokenizer.from_pretrained(alt_model, token=hf_token)
+                if tokenizer.pad_token is None:
+                    tokenizer.pad_token = tokenizer.eos_token
+
+                model = AutoModelForCausalLM.from_pretrained(
+                    alt_model,
+                    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
+                    device_map="auto" if device.type == "cuda" else None,
+                    low_cpu_mem_usage=True,
+                    token=hf_token
+                )
+
+                print(f"✅ Alternative model {alt_model} loaded successfully!")
+                print("⚠️ Note: Using alternative model - AR-Diffusion features adapted for demo")
+                return tokenizer, model, device
+
+            except Exception as alt_e:
+                print(f"❌ {alt_model} failed: {alt_e}")
+                continue
+
+        # Final fallback
+        print("🔄 Using final fallback model...")
+        tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
 
         model = AutoModelForCausalLM.from_pretrained(
-            fallback_model,
+            "distilgpt2",
             torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
             device_map="auto" if device.type == "cuda" else None,
             low_cpu_mem_usage=True
         )
 
-        print(
-        print("⚠️ Note: Using
+        print("✅ Final fallback model loaded successfully!")
+        print("⚠️ Note: Using basic model - AR-Diffusion features adapted for demo")
         return tokenizer, model, device
 
 def cleanup_memory():
@@ -604,8 +648,9 @@ def chat_function(message, history, mode, progress=gr.Progress()):
 - **Words/Second:** {stats['words_per_second']:.1f}
 - **Steps:** {stats['steps']}"""
 
-        # Update history
-        history.append(
+        # Update history with proper message format
+        history.append({"role": "user", "content": message})
+        history.append({"role": "assistant", "content": response})
 
         # Cleanup memory for Zero GPU efficiency
         cleanup_memory()
@@ -614,7 +659,8 @@ def chat_function(message, history, mode, progress=gr.Progress()):
 
     except Exception as e:
         error_msg = f"Error: {str(e)}"
-        history.append(
+        history.append({"role": "user", "content": message})
+        history.append({"role": "assistant", "content": error_msg})
         cleanup_memory()
         return history, "", f"**❌ Error occurred during generation**"
 
@@ -646,6 +692,7 @@ def create_interface():
         <p>This is an experimental AR-Diffusion model. Results may vary and the model is still under development.</p>
         <p><em>🔥 Powered by Zero GPU with @spaces.GPU</em></p>
         <p><small>Model: rootxhacker/llama-3B-diffusion-exp-fixed (LoRA Adapter)</small></p>
+        <p><small>🔑 Requires HF_TOKEN for gated model access</small></p>
     </div>
     """)
 
@@ -654,9 +701,9 @@ def create_interface():
         chatbot = gr.Chatbot(
             [],
             elem_id="chatbot",
-            bubble_full_width=False,
             height=500,
-            show_label=False
+            show_label=False,
+            type="messages"
         )
 
         with gr.Row():
@@ -698,7 +745,8 @@ def create_interface():
         <p>This experimental model uses autoregressive diffusion for text generation, creating responses by iteratively denoising masked tokens.</p>
         <br>
         <p><strong>Model:</strong> LoRA adapter trained for AR-Diffusion</p>
-        <p><strong>
+        <p><strong>Authentication:</strong> Requires HF_TOKEN for gated Llama model access</p>
+        <p><strong>Note:</strong> This model is experimental and may produce unexpected results. If the specific model fails to load, alternative models will be used for demonstration.</p>
     </div>
     """)
 
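For reference (not part of this commit): the repo is described above as a LoRA adapter, and the same weights can also be attached explicitly through PEFT rather than relying on transformers' built-in adapter detection. A minimal sketch, assuming `rootxhacker/llama-3B-diffusion-exp-fixed` is a standard PEFT adapter whose `adapter_config.json` points at a (possibly gated) Llama base model reachable with `HF_TOKEN`:

```python
# Sketch only: explicit PEFT loading of the LoRA adapter used in app.py.
# Assumes the repo is a standard PEFT adapter; the base model it references
# is resolved from adapter_config.json and may require an HF_TOKEN.
import os
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

hf_token = os.getenv("HF_TOKEN")
adapter_path = "rootxhacker/llama-3B-diffusion-exp-fixed"

tokenizer = AutoTokenizer.from_pretrained(adapter_path, token=hf_token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Downloads the base model named in adapter_config.json and attaches the
# LoRA weights in one call.
model = AutoPeftModelForCausalLM.from_pretrained(
    adapter_path,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto" if torch.cuda.is_available() else None,
    token=hf_token,
)
```

Whichever route is used, the `token=hf_token` argument is what lets the Hub serve the gated Llama base weights, which is the main point of this commit.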