Daniel Nichols committed on
Commit dc10826 · 1 Parent(s): 0cfd67a

add hf inference api and update streaming

Files changed (1)
  1. src/models.py +41 -8
src/models.py CHANGED

@@ -8,6 +8,7 @@ import glob
 import openai
 import google.generativeai as genai
 from llama_cpp import Llama
+from huggingface_hub import InferenceClient
 
 class ChatModel(ABC):
     def __init__(self, name):
@@ -86,12 +87,12 @@ class LocalModel(ChatModel):
         super().__init__(model)
         self.llm = Llama(
             model_path=model_path,
-            n_ctx=8000,
+            n_ctx=4096,
         )
 
     def get_response(self, prompt) -> Generator[str, None, None]:
 
-        output = self.llm.create_chat_completion(
+        outputs = self.llm.create_chat_completion(
             messages = [
                 {"role": "system", "content": "You are PerfGuru, a helpful assistant for assisting developers in identifying performance bottlenecks in their code and optimizing them."},
                 {
@@ -100,22 +101,47 @@ class LocalModel(ChatModel):
                 }
             ],
             max_tokens=4000,
+            stream=True,
         )
 
-        result = output["choices"][0]["message"]["content"]
-        for idx in range(len(result)):
-            yield result[:idx+1]
+        response = ""
+        for chunk in outputs:
+            response += chunk['choices'][0]['delta'].get('content', '')
+            yield response
 
 
+class InferenceHubModel(ChatModel):
+
+    def __init__(self, model: str, client: InferenceClient, supports_system_messages: bool = True):
+        super().__init__(model)
+        self.model = model
+        self.client = client
+        self.supports_system_messages = supports_system_messages
+
+    def get_response(self, prompt: str) -> Generator[str, None, None]:
+        messages = []
+        if self.supports_system_messages:
+            messages.append({"role": "system", "content": "You are PerfGuru, a helpful assistant for assisting developers in identifying performance bottlenecks in their code and optimizing them."})
+        messages.append({"role": "user", "content": prompt})
+
+        stream = self.client.chat.completions.create(
+            model=self.model,
+            messages=messages,
+            stream=True,
+            max_tokens=1024,
+        )
+        response = ""
+        for chunk in stream:
+            response += chunk.choices[0].delta.content or ""
+            yield response
+
+
 HF_HOME = os.environ.get("HF_HOME", "/home/user/.cache/huggingface")
 GGUF_WILDCARD = os.path.join(HF_HOME, "hub", "models-*", "**", "*.gguf")
 GGUF_PATHS = [(os.path.basename(p), p) for p in glob.glob(GGUF_WILDCARD, recursive=True)]
 LOCAL_MODEL_PATHS = [(os.path.basename(p), p) for p in glob.glob(os.path.join("local_models", "*.gguf"))]
 ALL_LOCAL_MODELS = GGUF_PATHS + LOCAL_MODEL_PATHS
 
-print(HF_HOME)
-print(ALL_LOCAL_MODELS)
-
 AVAILABLE_MODELS = [
     LocalModel(model_name, model_path)
     for model_name, model_path in ALL_LOCAL_MODELS
@@ -133,6 +159,13 @@ if os.environ.get("GOOGLE_API_KEY"):
     AVAILABLE_MODELS.append( GeminiModel("gemini-1.5-flash") )
     AVAILABLE_MODELS.append( GeminiModel("gemini-1.5-pro") )
 
+if os.environ.get("HF_API_KEY"):
+    hf_inference_client = InferenceClient(api_key=os.environ.get("HF_API_KEY"))
+    #AVAILABLE_MODELS.append( InferenceHubModel("google/gemma-2-2b-it", hf_inference_client, supports_system_messages=False) )
+    #AVAILABLE_MODELS.append( InferenceHubModel("Qwen/Qwen2.5-7B-Instruct", hf_inference_client) )
+    AVAILABLE_MODELS.append( InferenceHubModel("microsoft/Phi-3-mini-4k-instruct", hf_inference_client) )
+    #AVAILABLE_MODELS.append( InferenceHubModel("meta-llama/Meta-Llama-3.1-8B-Instruct", hf_inference_client) )
+
 if not AVAILABLE_MODELS:
     raise ValueError("No models available. Please set OPENAI_API_KEY or GOOGLE_API_KEY environment variables.")
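
Both get_response implementations now stream: instead of materializing the full completion and re-yielding ever-longer prefixes of it, they pass stream=True and yield the accumulated text after each chunk. The consumer side is not part of this diff; the sketch below is a minimal, hypothetical driver (the import path, the prompt string, and the choice of AVAILABLE_MODELS[0] are illustrative only) showing how a caller can print just the newly streamed suffix:

from src.models import AVAILABLE_MODELS

model = AVAILABLE_MODELS[0]  # any ChatModel; requires at least one configured backend
printed = ""
for partial in model.get_response("Why is this loop memory-bound?"):
    # Each yielded value is the whole response so far, so print only the new tail.
    print(partial[len(printed):], end="", flush=True)
    printed = partial
print()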
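The new InferenceHubModel wraps the OpenAI-compatible chat interface exposed by huggingface_hub.InferenceClient (client.chat.completions.create with stream=True) and is registered only when HF_API_KEY is set. A standalone sketch of the same call pattern, assuming a valid HF_API_KEY and that the chosen model is actually served by the Hugging Face Inference API:

import os
from huggingface_hub import InferenceClient

client = InferenceClient(api_key=os.environ["HF_API_KEY"])  # assumes HF_API_KEY is exported
stream = client.chat.completions.create(
    model="microsoft/Phi-3-mini-4k-instruct",  # the model enabled in this commit
    messages=[{"role": "user", "content": "Name one common cause of false sharing."}],
    stream=True,
    max_tokens=256,
)
text = ""
for chunk in stream:
    # Each streamed chunk carries a delta with the next piece of the reply.
    text += chunk.choices[0].delta.content or ""
print(text)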
171