from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pypdf import PdfReader
import requests
import json
import os
import time
import traceback

def extract_text_from_pdf(pdf_path):
    """Extract plain text from every page of a PDF."""
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for image-only pages, so fall back to ""
        text += (page.extract_text() or "") + "\n"
    return text.strip()

def chunk_text(text, chunk_size=500, chunk_overlap=100):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,  # Overlap to preserve context
        separators=["\n\n", "\n", " ", ""],  # Prioritize logical breaks
    )
    return splitter.split_text(text)

embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


def embedding_function(texts):
    return embedding_model.encode(texts, convert_to_numpy=True).tolist()
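
# Quick sanity check (an illustrative sketch, not part of the original app
# flow): all-MiniLM-L6-v2 produces one 384-dimensional vector per input text,
# so every downstream similarity comparison happens in a 384-dim space.
def embedding_sanity_check():
    vectors = embedding_function(["hello world", "a second sentence"])
    assert len(vectors) == 2
    assert len(vectors[0]) == 384, "unexpected embedding dimension"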

def generate_hypothetical_answer(query):
    """Generate a hypothetical answer to the query (the HyDE retrieval trick)."""
    # OpenAI-compatible chat completions endpoint on the Hugging Face inference router
    api_url = "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-2-7b-chat-hf/v1/chat/completions"
    # Get API token from environment variable
    api_token = os.getenv("HUGGINGFACE_API_TOKEN")
    if not api_token:
        print("Error: HUGGINGFACE_API_TOKEN environment variable not set")
        return "Error: HUGGINGFACE_API_TOKEN environment variable not set"
    # Headers for the API request
    headers = {
        "Authorization": f"Bearer {api_token}",
        "Content-Type": "application/json",
    }
    # Create a prompt for generating a hypothetical answer
    prompt = f"""
Given the following query, generate a hypothetical answer that might be found in a document:
Query: {query}
Hypothetical answer:
"""
    # Request payload in the chat completions format this endpoint expects
    payload = {
        "model": "meta-llama/Llama-2-7b-chat-hf",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 256,
        "temperature": 0.7,
        "top_p": 0.95,
    }
    try:
        # Make the API request to Hugging Face
        print("Sending request to Hugging Face API for hypothetical answer...")
        print(f"API URL: {api_url}")
        # Note: `headers` is deliberately not printed; it contains the bearer token
        print(f"Payload: {json.dumps(payload, indent=2)}")
        start_time = time.time()
        # Set a longer timeout (5 minutes) to allow a cold model to respond
        response = requests.post(api_url, headers=headers, json=payload, timeout=300)
        end_time = time.time()
        print(f"Received hypothetical answer from Hugging Face API in {end_time - start_time:.2f} seconds")
        print(f"Response status code: {response.status_code}")
        print(f"Response content: {response.text[:1000]}...")  # First 1000 chars for debugging
        response.raise_for_status()  # Raise an exception for HTTP errors
        # Parse the chat completions response and extract the generated text
        result = response.json()
        choices = result.get("choices", [])
        generated_text = choices[0]["message"]["content"] if choices else ""
        return generated_text.strip()
    except requests.exceptions.Timeout:
        print("Request to Hugging Face API timed out after 5 minutes")
        return "The request timed out. The model is taking too long to respond. Please try again with a simpler query."
    except requests.exceptions.ConnectionError:
        print("Could not connect to Hugging Face API")
        return "Could not connect to the Hugging Face API. Please check your internet connection."
    except requests.exceptions.HTTPError as e:
        print(f"HTTP error occurred: {e}")
        print(f"Response status code: {e.response.status_code}")
        print(f"Response content: {e.response.text}")
        if e.response.status_code == 401:
            return "Authentication error. Please check your Hugging Face API token."
        elif e.response.status_code == 429:
            return "Rate limit exceeded. Please try again later."
        return f"HTTP error occurred: {e}"
    except Exception as e:
        print(f"Error generating hypothetical answer: {e}")
        print(f"Traceback: {traceback.format_exc()}")
        return "Failed to generate a hypothetical answer."

def query_llm_with_context(query, context, top_n=3):
    """Answer the query with the LLM, grounded in the retrieved documents."""
    # Unpack the (documents, similarity_scores) context tuple
    documents, similarity_scores = context
    # Use only the top N documents
    top_docs = documents[:top_n]
    # Create a context string by joining the top documents
    context_text = "\n\n===Document Boundary===\n\n".join(top_docs)
    # Create a prompt with the context and query
    prompt = f"""
Context information is below.
---------------------
{context_text}
---------------------
Given the context information and not prior knowledge, answer the following query:
Query: {query}
"""
    # OpenAI-compatible chat completions endpoint on the Hugging Face inference router
    api_url = "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-2-7b-chat-hf/v1/chat/completions"
    # Get API token from environment variable
    api_token = os.getenv("HUGGINGFACE_API_TOKEN")
    if not api_token:
        print("Error: HUGGINGFACE_API_TOKEN environment variable not set")
        return "Error: HUGGINGFACE_API_TOKEN environment variable not set"
    # Headers for the API request
    headers = {
        "Authorization": f"Bearer {api_token}",
        "Content-Type": "application/json",
    }
    # Request payload in the chat completions format this endpoint expects
    payload = {
        "model": "meta-llama/Llama-2-7b-chat-hf",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 512,
        "temperature": 0.7,
        "top_p": 0.95,
    }
    try:
        # Make the API request to Hugging Face
        print("Sending request to Hugging Face API...")
        print(f"API URL: {api_url}")
        # Note: `headers` is deliberately not printed; it contains the bearer token
        print(f"Payload: {json.dumps(payload, indent=2)}")
        start_time = time.time()
        # Set a longer timeout (5 minutes) to allow a cold model to respond
        response = requests.post(api_url, headers=headers, json=payload, timeout=300)
        end_time = time.time()
        print(f"Received response from Hugging Face API in {end_time - start_time:.2f} seconds")
        print(f"Response status code: {response.status_code}")
        print(f"Response content: {response.text[:1000]}...")  # First 1000 chars for debugging
        response.raise_for_status()  # Raise an exception for HTTP errors
        # Parse the chat completions response and extract the generated text
        result = response.json()
        choices = result.get("choices", [])
        generated_text = choices[0]["message"]["content"] if choices else ""
        return generated_text.strip()
    except requests.exceptions.Timeout:
        print("Request to Hugging Face API timed out after 5 minutes")
        return "The request timed out. The model is taking too long to respond. Please try again with a simpler query or fewer context documents."
    except requests.exceptions.ConnectionError:
        print("Could not connect to Hugging Face API")
        return "Could not connect to the Hugging Face API. Please check your internet connection."
    except requests.exceptions.HTTPError as e:
        print(f"HTTP error occurred: {e}")
        print(f"Response status code: {e.response.status_code}")
        print(f"Response content: {e.response.text}")
        if e.response.status_code == 401:
            return "Authentication error. Please check your Hugging Face API token."
        elif e.response.status_code == 429:
            return "Rate limit exceeded. Please try again later."
        return f"HTTP error occurred: {e}"
    except Exception as e:
        print(f"Error querying LLM with context: {e}")
        print(f"Traceback: {traceback.format_exc()}")
        return "Failed to generate an answer with the provided context."