avimittal30 committed on
Commit
c1fe264
·
verified ·
1 Parent(s): 2529e2a

Update helper.py to use correct huggingface inferencing URL

Browse files
Files changed (1) hide show
  1. helper.py +18 -11
helper.py CHANGED
@@ -1,4 +1,4 @@
1
- from sentence_transformers import SentenceTransformer
2
  from langchain.text_splitter import RecursiveCharacterTextSplitter
3
  from pypdf import PdfReader
4
  import requests
@@ -35,9 +35,10 @@ def generate_hypothetical_answer(query):
35
  import os
36
  import time
37
 
38
- # Hugging Face API endpoint
39
- api_url = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
40
 
 
41
  # Get API token from environment variable
42
  api_token = os.getenv("HUGGINGFACE_API_TOKEN")
43
  if not api_token:
@@ -58,20 +59,21 @@ def generate_hypothetical_answer(query):
58
  Hypothetical answer:
59
  """
60
 
61
- # Prepare the request payload
62
  payload = {
63
  "inputs": prompt,
64
  "parameters": {
65
  "max_new_tokens": 256,
66
  "temperature": 0.7,
67
  "top_p": 0.95,
68
- "do_sample": True
 
69
  }
70
  }
71
 
72
  try:
73
  # Make the API request to Hugging Face
74
- print("Sending request to Hugging Face API for hypothetical answer...")
75
  print(f"API URL: {api_url}")
76
  print(f"Headers: {headers}")
77
  print(f"Payload: {json.dumps(payload, indent=2)}")
@@ -164,8 +166,8 @@ def query_llm_with_context(query, context, top_n=3):
164
  Query: {query}
165
  """
166
 
167
- # Hugging Face API endpoint
168
- api_url = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
169
 
170
  # Get API token from environment variable
171
  api_token = os.getenv("HUGGINGFACE_API_TOKEN")
@@ -179,20 +181,21 @@ def query_llm_with_context(query, context, top_n=3):
179
  "Content-Type": "application/json"
180
  }
181
 
182
- # Prepare the request payload
183
  payload = {
184
  "inputs": prompt,
185
  "parameters": {
186
  "max_new_tokens": 512,
187
  "temperature": 0.7,
188
  "top_p": 0.95,
189
- "do_sample": True
 
190
  }
191
  }
192
 
193
  try:
194
  # Make the API request to Hugging Face
195
- print("Sending request to Hugging Face API...")
196
  print(f"API URL: {api_url}")
197
  print(f"Headers: {headers}")
198
  print(f"Payload: {json.dumps(payload, indent=2)}")
@@ -239,6 +242,10 @@ def query_llm_with_context(query, context, top_n=3):
239
  print(f"HTTP error occurred: {e}")
240
  print(f"Response status code: {e.response.status_code}")
241
  print(f"Response headers: {e.response.headers}")
 
 
 
 
242
 
243
  if e.response.status_code == 401:
244
  return "Authentication error. Please check your Hugging Face API token."
 
1
+ from sentence_transformers import SentenceTransformer
2
  from langchain.text_splitter import RecursiveCharacterTextSplitter
3
  from pypdf import PdfReader
4
  import requests
 
35
  import os
36
  import time
37
 
38
+ # Hugging Face API endpoint with vLLM
39
+ api_url = "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-2-7b-chat-hf/v1/chat/completions"
40
 
41
+
42
  # Get API token from environment variable
43
  api_token = os.getenv("HUGGINGFACE_API_TOKEN")
44
  if not api_token:
 
59
  Hypothetical answer:
60
  """
61
 
62
+ # Prepare the request payload for vLLM
63
  payload = {
64
  "inputs": prompt,
65
  "parameters": {
66
  "max_new_tokens": 256,
67
  "temperature": 0.7,
68
  "top_p": 0.95,
69
+ "do_sample": True,
70
+ "use_vllm": True # Enable vLLM for faster inference
71
  }
72
  }
73
 
74
  try:
75
  # Make the API request to Hugging Face
76
+ print("Sending request to Hugging Face API with vLLM for hypothetical answer...")
77
  print(f"API URL: {api_url}")
78
  print(f"Headers: {headers}")
79
  print(f"Payload: {json.dumps(payload, indent=2)}")
 
166
  Query: {query}
167
  """
168
 
169
+ # Hugging Face API endpoint with vLLM
170
+ api_url = "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-2-7b-chat-hf/v1/chat/completions"
171
 
172
  # Get API token from environment variable
173
  api_token = os.getenv("HUGGINGFACE_API_TOKEN")
 
181
  "Content-Type": "application/json"
182
  }
183
 
184
+ # Prepare the request payload for vLLM
185
  payload = {
186
  "inputs": prompt,
187
  "parameters": {
188
  "max_new_tokens": 512,
189
  "temperature": 0.7,
190
  "top_p": 0.95,
191
+ "do_sample": True,
192
+ "use_vllm": True # Enable vLLM for faster inference
193
  }
194
  }
195
 
196
  try:
197
  # Make the API request to Hugging Face
198
+ print("Sending request to Hugging Face API with vLLM...")
199
  print(f"API URL: {api_url}")
200
  print(f"Headers: {headers}")
201
  print(f"Payload: {json.dumps(payload, indent=2)}")
 
242
  print(f"HTTP error occurred: {e}")
243
  print(f"Response status code: {e.response.status_code}")
244
  print(f"Response headers: {e.response.headers}")
245
+ try:
246
+ print(f"Response content: {e.response.text}")
247
+ except:
248
+ print("Could not print response content")
249
 
250
  if e.response.status_code == 401:
251
  return "Authentication error. Please check your Hugging Face API token."