# NOTE: the original file began with scraped Hugging Face Spaces status text
# ("Spaces: / Sleeping / Sleeping") — UI residue, not code; kept as a comment.
# Standard library
import csv
import os

# Must be set BEFORE importing sentence_transformers / transformers —
# the cache location may already be resolved at import time, in which case
# a later assignment is silently ignored.
os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface_cache'

# Third-party
import faiss
import numpy as np
import pandas as pd
import requests
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
# URLs of the dataset artifacts hosted on the Hugging Face Hub.
base_url = "https://huggingface.co/datasets/manhteky123/LawVietnamese/resolve/main/"
data_url = f"{base_url}data.csv"                # CSV with the law documents
faiss_index_url = f"{base_url}faiss_index.bin"  # prebuilt FAISS index
vectors_url = f"{base_url}vectors.npy"          # precomputed embedding matrix
# Function to download files to disk
def download_to_disk(url, filename):
    """Download *url* and write the response body to *filename*.

    Args:
        url: Full HTTP(S) URL to fetch.
        filename: Local path the payload is written to (binary mode).

    Raises:
        Exception: if the server responds with a non-200 status code.
    """
    # Stream so the large binary artifacts (FAISS index, vectors) are not
    # buffered fully in memory; timeout prevents an indefinite hang.
    response = requests.get(url, stream=True, timeout=60)
    if response.status_code == 200:
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)
        # BUG FIX: the original message printed the literal "(unknown)"
        # instead of the destination filename.
        print(f"Downloaded {url} to {filename}.")
    else:
        raise Exception(f"Failed to download {url}: {response.status_code}")
# Local paths the artifacts are saved to.
data_file_path = 'data.csv'
faiss_index_file_path = 'faiss_index.bin'
vectors_file_path = 'vectors.npy'

# Fetch dataset, FAISS index and embedding matrix from the Hub.
download_to_disk(data_url, data_file_path)
download_to_disk(faiss_index_url, faiss_index_file_path)
download_to_disk(vectors_url, vectors_file_path)

# Document corpus; retrieval returns values from this column.
df = pd.read_csv(data_file_path)
column_name = 'truncated_text'

# Query encoder — must be the same model that produced vectors.npy and the
# FAISS index, otherwise the similarity scores are meaningless.
model = SentenceTransformer('intfloat/multilingual-e5-small')

# Prebuilt nearest-neighbour index over the document embeddings.
index = faiss.read_index(faiss_index_file_path)

# Raw embedding matrix (loaded but unused by retrieve_documents; kept for
# compatibility with any downstream code outside this chunk).
vectors = np.load(vectors_file_path)
def retrieve_documents(query, k=5, threshold=0.7):
    """Return up to *k* documents similar enough to *query*.

    Args:
        query: Free-text search query.
        k: Number of nearest neighbours requested from the FAISS index.
        threshold: Minimum similarity score, computed as 1 / (1 + distance),
            required to keep a hit.

    Returns:
        List of document texts taken from ``df[column_name]`` (possibly empty).
    """
    # NOTE(review): e5-family models are trained with "query: "/"passage: "
    # prefixes — confirm the index was built with the matching convention.
    query_vector = model.encode([query], convert_to_tensor=True).cpu().numpy()
    D, I = index.search(query_vector, k)
    # Map distances into (0, 1]; assumes an L2-style index where a smaller
    # distance means a closer match.
    similarities = 1 / (1 + D[0])
    filtered_documents = []
    for i, similarity in enumerate(similarities):
        # ROBUSTNESS FIX: FAISS pads results with -1 when fewer than k
        # neighbours exist; df.iloc[-1] would silently return the LAST row,
        # so skip padded slots explicitly.
        if I[0][i] != -1 and similarity >= threshold:
            filtered_documents.append(df.iloc[I[0][i]][column_name])
    return filtered_documents