Julian-Hans committed
Commit 714b9dd • Parent: 52d92f0

changed inference provider for phi3 to llama_cpp
Files changed:
- .gitignore: +4 -1
- phi3_mini_4k_instruct.py: +15 -4
- requirements.txt: +1 -0
.gitignore
CHANGED
@@ -1,2 +1,5 @@
 /__pycache__
-*.wav
+*.wav
+.pytest_cache
+/audio_data
+.cache
phi3_mini_4k_instruct.py
CHANGED
@@ -1,9 +1,10 @@
 # external imports
 from transformers import pipeline
 from huggingface_hub import InferenceClient
-
+import torch
 # local imports
 import config
+from llama_cpp import Llama
 
 
 class Phi3_Mini_4k_Instruct:
@@ -12,12 +13,22 @@ class Phi3_Mini_4k_Instruct:
 
     def generate_text(self, messages, use_local_llm):
         if use_local_llm:
-            return self.generate_text_local_pipeline(messages)
+            return self.generate_text_llama_cpp(messages)
         else:
             return self.generate_text_api(messages)
-
+
+    def generate_text_llama_cpp(self, messages):
+        model = Llama.from_pretrained(
+            repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
+            filename="Phi-3-mini-4k-instruct-q4.gguf"
+        )
+        response = model.create_chat_completion(messages)
+        generated_message = response['choices'][0]['message']['content']
+
+        return generated_message
+
     def generate_text_local_pipeline(self, messages):
-        self.local_pipeline = pipeline("text-generation", model=config.LLM_MODEL, trust_remote_code=True)
+        self.local_pipeline = pipeline("text-generation", model=config.LLM_MODEL, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="auto")
         self.local_pipeline.model.config.max_length = config.LLM_MAX_LENGTH
         self.local_pipeline.model.config.max_new_tokens = config.LLM_MAX_NEW_TOKENS
         self.local_pipeline.model.config.temperature = config.LLM_TEMPERATURE
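A minimal usage sketch of the new llama_cpp path (an illustration, not part of the commit: it assumes Phi3_Mini_4k_Instruct can be constructed without arguments, which the diff does not show, and that config.py provides the values the module imports). The messages list uses the OpenAI-style chat schema that create_chat_completion accepts; the first call downloads the GGUF file from the Hugging Face Hub via Llama.from_pretrained.

# usage sketch (not in the commit); assumes a no-argument constructor
from phi3_mini_4k_instruct import Phi3_Mini_4k_Instruct

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Explain llama.cpp in one sentence."},
]

llm = Phi3_Mini_4k_Instruct()
# use_local_llm=True now routes to generate_text_llama_cpp instead of the
# transformers pipeline path
print(llm.generate_text(messages, use_local_llm=True))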
requirements.txt
CHANGED
@@ -23,6 +23,7 @@ importlib_resources==6.4.5
 iniconfig==2.0.0
 Jinja2==3.1.4
 kiwisolver==1.4.7
+llama_cpp_python==0.3.1
 markdown-it-py==3.0.0
 MarkupSafe==2.1.5
 matplotlib==3.9.2
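The new pin backs the "from llama_cpp import Llama" import added above. A quick sanity-check sketch for the pinned build (illustrative only: repo_id and filename are copied from the commit, while n_ctx, verbose and max_tokens are assumptions chosen to keep the check cheap):

# standalone check that llama_cpp_python==0.3.1 can load the same GGUF model
from llama_cpp import Llama

llm = Llama.from_pretrained(
    repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
    filename="Phi-3-mini-4k-instruct-q4.gguf",
    n_ctx=4096,      # Phi-3 mini 4k context window (assumed setting)
    verbose=False,
)
response = llm.create_chat_completion(
    [{"role": "user", "content": "Say hello."}],
    max_tokens=32,   # keep the smoke test short
)
print(response["choices"][0]["message"]["content"])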