import os

# Point the Hugging Face cache at a writable location. This must be set
# before sentence_transformers (and transformers) are imported, because the
# cache path is read when those libraries are first loaded.
os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface_cache'

import pandas as pd
import numpy as np
import faiss
import requests
from sentence_transformers import SentenceTransformer

# URLs of files from Hugging Face
base_url = "https://huggingface.co/datasets/manhteky123/LawVietnamese/resolve/main/"
data_url = f"{base_url}data.csv"
faiss_index_url = f"{base_url}faiss_index.bin"
vectors_url = f"{base_url}vectors.npy"

# Function to download files to disk
def download_to_disk(url, filename):
    response = requests.get(url)
    if response.status_code == 200:
        with open(filename, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded {url} to {filename}.")
    else:
        raise Exception(f"Failed to download {url}: {response.status_code}")

# Download the necessary files to disk
data_file_path = 'data.csv'
faiss_index_file_path = 'faiss_index.bin'
vectors_file_path = 'vectors.npy'

download_to_disk(data_url, data_file_path)
download_to_disk(faiss_index_url, faiss_index_file_path)
download_to_disk(vectors_url, vectors_file_path)

# Read the CSV data from the downloaded file
df = pd.read_csv(data_file_path)

# Use the 'truncated_text' column
column_name = 'truncated_text'

# Load SentenceTransformer
model = SentenceTransformer('intfloat/multilingual-e5-small')

# Read FAISS index from file
index = faiss.read_index(faiss_index_file_path)

# Load vectors
vectors = np.load(vectors_file_path)

def retrieve_documents(query, k=5, threshold=0.7):
    # Embed the query with the same model that produced the indexed vectors.
    query_vector = model.encode([query], convert_to_tensor=True).cpu().numpy()
    # D holds distances to the k nearest neighbours, I their row indices in df.
    D, I = index.search(query_vector, k)
    # Convert L2 distances to a (0, 1] score so the threshold is easier to tune.
    similarities = 1 / (1 + D[0])
    filtered_documents = []
    for idx, similarity in zip(I[0], similarities):
        if similarity >= threshold:
            filtered_documents.append(df.iloc[idx][column_name])
    return filtered_documents
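
# --- Example usage (a minimal sketch; the query string below is a hypothetical
# --- placeholder, not taken from the original project) ---
if __name__ == "__main__":
    # "Quy định về hợp đồng lao động" ~ "regulations on labour contracts"
    sample_query = "Quy định về hợp đồng lao động"
    results = retrieve_documents(sample_query, k=5, threshold=0.7)
    for rank, doc in enumerate(results, start=1):
        # Show only the first 200 characters of each retrieved passage.
        print(f"[{rank}] {doc[:200]}")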