import os

# Point the Hugging Face cache at a writable location *before* importing
# sentence_transformers, so the setting takes effect when the library loads.
os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface_cache'

import faiss
import numpy as np
import pandas as pd
import requests
from sentence_transformers import SentenceTransformer
# URLs of files from Hugging Face
base_url = "https://huggingface.co/datasets/manhteky123/LawVietnamese/resolve/main/"
data_url = f"{base_url}data.csv"
faiss_index_url = f"{base_url}faiss_index.bin"
vectors_url = f"{base_url}vectors.npy"
# Function to download a file from a URL to local disk
def download_to_disk(url, filename):
    response = requests.get(url)
    if response.status_code == 200:
        with open(filename, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded {url} to {filename}.")
    else:
        raise Exception(f"Failed to download {url}: {response.status_code}")
# Download the necessary files to disk
data_file_path = 'data.csv'
faiss_index_file_path = 'faiss_index.bin'
vectors_file_path = 'vectors.npy'
download_to_disk(data_url, data_file_path)
download_to_disk(faiss_index_url, faiss_index_file_path)
download_to_disk(vectors_url, vectors_file_path)
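# Quick confirmation that the downloads produced non-empty files
for path in (data_file_path, faiss_index_file_path, vectors_file_path):
    print(f"{path}: {os.path.getsize(path)} bytes")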
# Read the CSV data from the downloaded file
df = pd.read_csv(data_file_path)
# Use the 'truncated_text' column
column_name = 'truncated_text'
# Load SentenceTransformer
model = SentenceTransformer('intfloat/multilingual-e5-small')
# Read FAISS index from file
index = faiss.read_index(faiss_index_file_path)
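# The 1/(1 + distance) scoring used below suggests the index was built as a
# flat L2 index over the saved embeddings, roughly (an assumption about the
# dataset's build step, shown for reference only):
#   index = faiss.IndexFlatL2(vectors.shape[1])
#   index.add(vectors)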
# Load the precomputed embedding matrix (not used by the retrieval below,
# but downloaded alongside the index)
vectors = np.load(vectors_file_path)
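# Sanity check: the index, embedding matrix, and CSV should describe the same
# corpus. This layout is an assumption about the dataset, so report rather
# than assert.
print(f"FAISS index holds {index.ntotal} vectors; "
      f"vectors.npy has {vectors.shape[0]} rows; data.csv has {len(df)} rows.")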
def retrieve_documents(query, k=5, threshold=0.7):
    # Note: E5-family models are usually queried with a "query: " prefix; that
    # only helps if the index was built with matching "passage: " prefixes, so
    # the raw query text is kept here.
    query_vector = model.encode([query], convert_to_tensor=True).cpu().numpy()
    # D holds distances (squared L2 for a flat L2 index), I the row ids
    D, I = index.search(query_vector, k)
    # Map distances into (0, 1]; an ad hoc monotone transform, not cosine
    # similarity, so the threshold is on this transformed scale
    similarities = 1 / (1 + D[0])
    filtered_documents = []
    for i, similarity in enumerate(similarities):
        if similarity >= threshold:
            filtered_documents.append(df.iloc[I[0][i]][column_name])
    return filtered_documents
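# Example usage. The query text is illustrative only; the corpus is Vietnamese
# legal text, so any Vietnamese legal question fits.
if __name__ == "__main__":
    results = retrieve_documents("Quy định về hợp đồng lao động là gì?", k=5)
    for rank, doc in enumerate(results, start=1):
        print(f"[{rank}] {doc[:200]}...")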