# Spaces: Runtime error
import requests | |
import os | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.metrics.pairwise import cosine_similarity | |
from datasets import load_dataset | |
from transformers import AutoTokenizer, AutoModelForCausalLM | |
# API token read from the environment; None when the variable is unset.
my_token = os.environ.get('my_repo_token')
def find_most_relevant_context(contexts, question, max_features=10000):
    """Return the context whose TF-IDF vector is closest to the question.

    The question and every context are vectorized together by a single
    ``TfidfVectorizer`` so they share one vocabulary; the contexts are then
    ranked by cosine similarity against the question.

    Args:
        contexts: Iterable of candidate context strings (list, tuple,
            generator, ...).
        question: The question text to match against.
        max_features: Upper bound on the TF-IDF vocabulary size.

    Returns:
        The element of ``contexts`` with the highest similarity score.

    Raises:
        ValueError: If ``contexts`` is empty.
    """
    # Materialize once so any iterable works and can be indexed afterwards.
    candidates = list(contexts)
    if not candidates:
        raise ValueError("contexts must contain at least one entry")

    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    # Row 0 is the question; rows 1..n are the contexts in their given order.
    tfidf_matrix = tfidf_vectorizer.fit_transform([question, *candidates])

    # One similarity score per context, in the same order as ``candidates``.
    similarity_scores = cosine_similarity(
        tfidf_matrix[0:1], tfidf_matrix[1:]
    ).flatten()

    return candidates[int(similarity_scores.argmax())]
def load_and_prepare_dataset(file_path):
    """Load a JSON file and return the ``train`` split of its ``data`` field.

    Args:
        file_path: Path passed to ``load_dataset`` as ``data_files``.

    Returns:
        The ``'train'`` entry of the loaded dataset.
    """
    splits = load_dataset('json', data_files=file_path, field='data')
    return splits['train']
# Inference endpoint and bearer-token header used by the request helper.
API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
headers = {"Authorization": f"Bearer {my_token}"}

# Read the training file, take the first question of the first record and
# the first-paragraph context of at most the first three records.
dataset = load_and_prepare_dataset('./train.json')
Question = dataset[0]['paragraphs'][0]['qas'][0]['question']
contexts = [
    dataset[row]['paragraphs'][0]['context']
    for row in range(min(3, len(dataset)))
]

# Pick the context that best matches the question.
most_relevant_context = find_most_relevant_context(contexts, Question)

# Prompt = first 300 characters of that context, a space, then the question.
instruction = " ".join((most_relevant_context[:300], Question))
def query(payload, timeout=120):
    """POST ``payload`` as JSON to ``API_URL`` and return the decoded reply.

    Args:
        payload: JSON-serializable request body.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.post``.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection and the script would hang.
    response = requests.post(
        API_URL, headers=headers, json=payload, timeout=timeout
    )
    # HTTP status is not checked here: whatever JSON body the server sent is
    # handed back to the caller unchanged.
    return response.json()
# Send the prompt to the model and print the raw reply.
request_body = {"inputs": instruction}
output = query(request_body)
print(output)