Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -7,11 +7,14 @@ import os
 import torch
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import TorchAoConfig # not for Zero GPU
+from torchao.quantization import Int8DynamicActivationInt8WeightConfig, Float8DynamicActivationFloat8WeightConfig # not for Zero GPU
 import spaces # 1. Import the spaces library
 
 IS_CUDA = torch.cuda.is_available()
 IS_ZEROGPU = True if os.getenv("SPACES_ZERO_GPU", None) else False
 if IS_ZEROGPU: torch.set_float32_matmul_precision("high")
+IS_QUANT = True
 
 # ── Configuration ────────────────────────────────────────────────────────────
 MODEL_ID = "Reubencf/gemma3-konkani"
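Review note: the new torchao imports are tagged `# not for Zero GPU`, likely because quantization runs at load time in the host process, while ZeroGPU only attaches a CUDA device inside functions decorated with `@spaces.GPU`. A minimal sketch of that decorator pattern, assuming the `spaces` package available on a ZeroGPU Space (the `duration` value and function body are illustrative, not from this app):

```python
# Sketch only: how the `spaces` import added above is typically used on ZeroGPU.
# SPACES_ZERO_GPU is set in the Space's environment, matching the IS_ZEROGPU check.
import os
import spaces

IS_ZEROGPU = bool(os.getenv("SPACES_ZERO_GPU"))

@spaces.GPU(duration=60)  # a GPU is attached only while this function runs
def generate(prompt: str) -> str:
    # in the real app, model/tokenizer would come from load_model()
    return prompt
```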
@@ -28,8 +31,14 @@ print(f"[Init] Loading model pipeline for the first time: {MODEL_ID}...")
 def load_model():
     try:
         tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
-        model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16 if IS_CUDA else torch.float32,
-                                                     device_map="auto", token=HF_TOKEN)
+        if IS_QUANT: # not for Zero GPU
+            quant_config = Float8DynamicActivationFloat8WeightConfig() if IS_CUDA else Int8DynamicActivationInt8WeightConfig()
+            quantization_config = TorchAoConfig(quant_type=quant_config)
+            model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16 if IS_CUDA else torch.float32,
+                                                         device_map="auto", quantization_config=quantization_config, token=HF_TOKEN)
+        else:
+            model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16 if IS_CUDA else torch.float32,
+                                                         device_map="auto", token=HF_TOKEN)
         print("[Init] Model loaded successfully.")
         return model, tokenizer
     except Exception as e:
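Review note: the quantized branch follows the transformers TorchAO integration, picking float8 dynamic activation/weight quantization on CUDA and int8 on CPU. A standalone sketch of the same load path, assuming recent `transformers` and `torchao` releases (the model id below is a placeholder for illustration, not the app's):

```python
# Sketch: TorchAO-quantized load, mirroring the IS_QUANT branch above.
import torch
from transformers import AutoModelForCausalLM, TorchAoConfig
from torchao.quantization import Int8DynamicActivationInt8WeightConfig

# int8 dynamic-activation / int8-weight config, as used on the CPU path above
quantization_config = TorchAoConfig(quant_type=Int8DynamicActivationInt8WeightConfig())
model = AutoModelForCausalLM.from_pretrained(
    "sshleifer/tiny-gpt2",              # placeholder model id
    torch_dtype=torch.float32,
    device_map="auto",
    quantization_config=quantization_config,
)
```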
@@ -79,7 +88,7 @@ def generate_response(message, history=[], system_message="", max_tokens=DEF_TOK
         # Generate the response
         gen_kwargs = dict(
             input_ids=inputs["input_ids"],
-            attention_mask=inputs["attention_mask"],
+            #attention_mask=inputs["attention_mask"],
             max_new_tokens=max_tokens,
             do_sample=True,
             temperature=temperature,
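Review note: with `attention_mask` commented out, `generate` infers the mask itself; when the tokenizer's pad token equals its eos token this triggers a warning and can mask batched inputs incorrectly. Unpacking the tokenizer output keeps the mask without naming it, as in this sketch (`prompt`, `max_tokens`, and `temperature` are assumed from the surrounding `generate_response`):

```python
# Sketch: equivalent call with the mask kept, by unpacking the tokenizer output.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output_ids = model.generate(
    **inputs,                  # passes input_ids and attention_mask together
    max_new_tokens=max_tokens,
    do_sample=True,
    temperature=temperature,
)
text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
```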
@@ -129,4 +138,4 @@ demo = gr.ChatInterface(
 # ── Launch ──────────────────────────────────────────────────────────────────
 if __name__ == "__main__":
     print("🚀 Starting Gradio app for ZeroGPU...")
-    demo.queue().launch()
+    demo.queue().launch()
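Review note: the launch line reads identically on both sides of this hunk, so the change is presumably whitespace-only. `queue()` is what lets the Space serialize requests into `@spaces.GPU` calls; a sketch with explicit settings (the argument values are illustrative, not from this app):

```python
# Sketch: queued launch for a ZeroGPU Space; arguments shown are illustrative.
if __name__ == "__main__":
    demo.queue(max_size=20).launch(server_name="0.0.0.0", server_port=7860)
```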