Spaces:

Kr08
/

ASR

Build error

Kr08 committed on Dec 9, 2024

Commit

8669b40

verified ·

1 Parent(s): 2fe6a19

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -23,24 +23,31 @@ logger = logging.getLogger(__name__)
 def load_qa_model():
     """Load question-answering model with long context support."""
     try:
-        from transformers import AutoTokenizer, AutoModelForCausalLM
         model_id = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
         # Load tokenizer
         tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=os.getenv("HF_TOKEN"))
-        tokenizer.model_max_length = 8192  # Configure tokenizer for long inputs
         # Load the model with simplified rope_scaling configuration
         model = AutoModelForCausalLM.from_pretrained(
             model_id,
             torch_dtype=torch.bfloat16,
             device_map="auto",
             rope_scaling={
                 "type": "dynamic",  # Simplified type as expected by the model
                 "factor": 8.0       # Scaling factor to support longer contexts
             },
-            use_auth_token=os.getenv("HF_TOKEN")
         )
         # Initialize the pipeline

 def load_qa_model():
     """Load question-answering model with long context support."""
     try:
+        from transformers import AutoModelForCausalLM, AwqConfig
         model_id = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
         # Load tokenizer
         tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=os.getenv("HF_TOKEN"))
+        quantization_config = AwqConfig(
+            bits=4,
+            fuse_max_seq_len=8192,   # Configure tokenizer for long inputs
+            do_fuse=True,
+        )
         # Load the model with simplified rope_scaling configuration
         model = AutoModelForCausalLM.from_pretrained(
             model_id,
             torch_dtype=torch.bfloat16,
+            low_cpu_mem_usage=True,
             device_map="auto",
             rope_scaling={
                 "type": "dynamic",  # Simplified type as expected by the model
                 "factor": 8.0       # Scaling factor to support longer contexts
             },
+            use_auth_token=os.getenv("HF_TOKEN"),
+            quantization_config=quantization_config
         )
         # Initialize the pipeline