Daniel Nichols committed on
Commit dc10826 · 1 Parent(s): 0cfd67a

add hf inference api and update streaming

Files changed (1)
  1. src/models.py +41 -8
src/models.py CHANGED

@@ -8,6 +8,7 @@ import glob
 import openai
 import google.generativeai as genai
 from llama_cpp import Llama
+from huggingface_hub import InferenceClient
 
 class ChatModel(ABC):
     def __init__(self, name):
@@ -86,12 +87,12 @@ class LocalModel(ChatModel):
         super().__init__(model)
         self.llm = Llama(
             model_path=model_path,
-            n_ctx=8000,
+            n_ctx=4096,
         )
 
     def get_response(self, prompt) -> Generator[str, None, None]:
 
-        output = self.llm.create_chat_completion(
+        outputs = self.llm.create_chat_completion(
             messages = [
                 {"role": "system", "content": "You are PerfGuru, a helpful assistant for assisting developers in identifying performance bottlenecks in their code and optimizing them."},
                 {
@@ -100,22 +101,47 @@ class LocalModel(ChatModel):
                 }
             ],
             max_tokens=4000,
+            stream=True,
         )
 
-        result = output["choices"][0]["message"]["content"]
-        for idx in range(len(result)):
-            yield result[:idx+1]
+        response = ""
+        for chunk in outputs:
+            response += chunk['choices'][0]['delta'].get('content', '')
+            yield response
 
 
+class InferenceHubModel(ChatModel):
+
+    def __init__(self, model: str, client: InferenceClient, supports_system_messages: bool = True):
+        super().__init__(model)
+        self.model = model
+        self.client = client
+        self.supports_system_messages = supports_system_messages
+
+    def get_response(self, prompt: str) -> Generator[str, None, None]:
+        messages = []
+        if self.supports_system_messages:
+            messages.append({"role": "system", "content": "You are PerfGuru, a helpful assistant for assisting developers in identifying performance bottlenecks in their code and optimizing them."})
+        messages.append({"role": "user", "content": prompt})
+
+        stream = self.client.chat.completions.create(
+            model=self.model,
+            messages=messages,
+            stream=True,
+            max_tokens=1024,
+        )
+        response = ""
+        for chunk in stream:
+            response += chunk.choices[0].delta.content or ""
+            yield response
+
+
 HF_HOME = os.environ.get("HF_HOME", "/home/user/.cache/huggingface")
 GGUF_WILDCARD = os.path.join(HF_HOME, "hub", "models-*", "**", "*.gguf")
 GGUF_PATHS = [(os.path.basename(p), p) for p in glob.glob(GGUF_WILDCARD, recursive=True)]
 LOCAL_MODEL_PATHS = [(os.path.basename(p), p) for p in glob.glob(os.path.join("local_models", "*.gguf"))]
 ALL_LOCAL_MODELS = GGUF_PATHS + LOCAL_MODEL_PATHS
 
-print(HF_HOME)
-print(ALL_LOCAL_MODELS)
-
 AVAILABLE_MODELS = [
     LocalModel(model_name, model_path)
     for model_name, model_path in ALL_LOCAL_MODELS
@@ -133,6 +159,13 @@ if os.environ.get("GOOGLE_API_KEY"):
     AVAILABLE_MODELS.append( GeminiModel("gemini-1.5-flash") )
     AVAILABLE_MODELS.append( GeminiModel("gemini-1.5-pro") )
 
+if os.environ.get("HF_API_KEY"):
+    hf_inference_client = InferenceClient(api_key=os.environ.get("HF_API_KEY"))
+    #AVAILABLE_MODELS.append( InferenceHubModel("google/gemma-2-2b-it", hf_inference_client, supports_system_messages=False) )
+    #AVAILABLE_MODELS.append( InferenceHubModel("Qwen/Qwen2.5-7B-Instruct", hf_inference_client) )
+    AVAILABLE_MODELS.append( InferenceHubModel("microsoft/Phi-3-mini-4k-instruct", hf_inference_client) )
+    #AVAILABLE_MODELS.append( InferenceHubModel("meta-llama/Meta-Llama-3.1-8B-Instruct", hf_inference_client) )
+
 if not AVAILABLE_MODELS:
     raise ValueError("No models available. Please set OPENAI_API_KEY or GOOGLE_API_KEY environment variables.")
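
Both get_response implementations now stream: instead of materializing the full completion and re-yielding ever-longer prefixes of it, they pass stream=True and yield the accumulated text after each chunk. The consumer side is not part of this diff; the sketch below is a minimal, hypothetical driver (the import path, the prompt string, and the choice of AVAILABLE_MODELS[0] are illustrative only) showing how a caller can print just the newly streamed suffix:

from src.models import AVAILABLE_MODELS

model = AVAILABLE_MODELS[0]  # any ChatModel; requires at least one configured backend
printed = ""
for partial in model.get_response("Why is this loop memory-bound?"):
    # Each yielded value is the whole response so far, so print only the new tail.
    print(partial[len(printed):], end="", flush=True)
    printed = partial
print()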
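The new InferenceHubModel wraps the OpenAI-compatible chat interface exposed by huggingface_hub.InferenceClient (client.chat.completions.create with stream=True) and is registered only when HF_API_KEY is set. A standalone sketch of the same call pattern, assuming a valid HF_API_KEY and that the chosen model is actually served by the Hugging Face Inference API:

import os
from huggingface_hub import InferenceClient

client = InferenceClient(api_key=os.environ["HF_API_KEY"])  # assumes HF_API_KEY is exported
stream = client.chat.completions.create(
    model="microsoft/Phi-3-mini-4k-instruct",  # the model enabled in this commit
    messages=[{"role": "user", "content": "Name one common cause of false sharing."}],
    stream=True,
    max_tokens=256,
)
text = ""
for chunk in stream:
    # Each streamed chunk carries a delta with the next piece of the reply.
    text += chunk.choices[0].delta.content or ""
print(text)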
171