avimittal30 committed on
Commit
c1fe264
·
verified ·
1 Parent(s): 2529e2a

Update helper.py to use correct huggingface inferencing URL

Browse files
Files changed (1) hide show
  1. helper.py +18 -11
helper.py CHANGED
@@ -1,4 +1,4 @@
1
- from sentence_transformers import SentenceTransformer
2
  from langchain.text_splitter import RecursiveCharacterTextSplitter
3
  from pypdf import PdfReader
4
  import requests
@@ -35,9 +35,10 @@ def generate_hypothetical_answer(query):
35
  import os
36
  import time
37
 
38
- # Hugging Face API endpoint
39
- api_url = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
40
 
 
41
  # Get API token from environment variable
42
  api_token = os.getenv("HUGGINGFACE_API_TOKEN")
43
  if not api_token:
@@ -58,20 +59,21 @@ def generate_hypothetical_answer(query):
58
  Hypothetical answer:
59
  """
60
 
61
- # Prepare the request payload
62
  payload = {
63
  "inputs": prompt,
64
  "parameters": {
65
  "max_new_tokens": 256,
66
  "temperature": 0.7,
67
  "top_p": 0.95,
68
- "do_sample": True
 
69
  }
70
  }
71
 
72
  try:
73
  # Make the API request to Hugging Face
74
- print("Sending request to Hugging Face API for hypothetical answer...")
75
  print(f"API URL: {api_url}")
76
  print(f"Headers: {headers}")
77
  print(f"Payload: {json.dumps(payload, indent=2)}")
@@ -164,8 +166,8 @@ def query_llm_with_context(query, context, top_n=3):
164
  Query: {query}
165
  """
166
 
167
- # Hugging Face API endpoint
168
- api_url = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
169
 
170
  # Get API token from environment variable
171
  api_token = os.getenv("HUGGINGFACE_API_TOKEN")
@@ -179,20 +181,21 @@ def query_llm_with_context(query, context, top_n=3):
179
  "Content-Type": "application/json"
180
  }
181
 
182
- # Prepare the request payload
183
  payload = {
184
  "inputs": prompt,
185
  "parameters": {
186
  "max_new_tokens": 512,
187
  "temperature": 0.7,
188
  "top_p": 0.95,
189
- "do_sample": True
 
190
  }
191
  }
192
 
193
  try:
194
  # Make the API request to Hugging Face
195
- print("Sending request to Hugging Face API...")
196
  print(f"API URL: {api_url}")
197
  print(f"Headers: {headers}")
198
  print(f"Payload: {json.dumps(payload, indent=2)}")
@@ -239,6 +242,10 @@ def query_llm_with_context(query, context, top_n=3):
239
  print(f"HTTP error occurred: {e}")
240
  print(f"Response status code: {e.response.status_code}")
241
  print(f"Response headers: {e.response.headers}")
 
 
 
 
242
 
243
  if e.response.status_code == 401:
244
  return "Authentication error. Please check your Hugging Face API token."
 
1
+ from sentence_transformers import SentenceTransformer
2
  from langchain.text_splitter import RecursiveCharacterTextSplitter
3
  from pypdf import PdfReader
4
  import requests
 
35
  import os
36
  import time
37
 
38
+ # Hugging Face API endpoint with vLLM
39
+ api_url = "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-2-7b-chat-hf/v1/chat/completions"
40
 
41
+
42
  # Get API token from environment variable
43
  api_token = os.getenv("HUGGINGFACE_API_TOKEN")
44
  if not api_token:
 
59
  Hypothetical answer:
60
  """
61
 
62
+ # Prepare the request payload for vLLM
63
  payload = {
64
  "inputs": prompt,
65
  "parameters": {
66
  "max_new_tokens": 256,
67
  "temperature": 0.7,
68
  "top_p": 0.95,
69
+ "do_sample": True,
70
+ "use_vllm": True # Enable vLLM for faster inference
71
  }
72
  }
73
 
74
  try:
75
  # Make the API request to Hugging Face
76
+ print("Sending request to Hugging Face API with vLLM for hypothetical answer...")
77
  print(f"API URL: {api_url}")
78
  print(f"Headers: {headers}")
79
  print(f"Payload: {json.dumps(payload, indent=2)}")
 
166
  Query: {query}
167
  """
168
 
169
+ # Hugging Face API endpoint with vLLM
170
+ api_url = "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-2-7b-chat-hf/v1/chat/completions"
171
 
172
  # Get API token from environment variable
173
  api_token = os.getenv("HUGGINGFACE_API_TOKEN")
 
181
  "Content-Type": "application/json"
182
  }
183
 
184
+ # Prepare the request payload for vLLM
185
  payload = {
186
  "inputs": prompt,
187
  "parameters": {
188
  "max_new_tokens": 512,
189
  "temperature": 0.7,
190
  "top_p": 0.95,
191
+ "do_sample": True,
192
+ "use_vllm": True # Enable vLLM for faster inference
193
  }
194
  }
195
 
196
  try:
197
  # Make the API request to Hugging Face
198
+ print("Sending request to Hugging Face API with vLLM...")
199
  print(f"API URL: {api_url}")
200
  print(f"Headers: {headers}")
201
  print(f"Payload: {json.dumps(payload, indent=2)}")
 
242
  print(f"HTTP error occurred: {e}")
243
  print(f"Response status code: {e.response.status_code}")
244
  print(f"Response headers: {e.response.headers}")
245
+ try:
246
+ print(f"Response content: {e.response.text}")
247
+ except:
248
+ print("Could not print response content")
249
 
250
  if e.response.status_code == 401:
251
  return "Authentication error. Please check your Hugging Face API token."