File size: 1,374 Bytes
1581c72
19b90a7
1581c72
 
 
 
 
 
 
 
 
 
 
 
19b90a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1581c72
 
19b90a7
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import logging
import gradio as gr
import datasets
import sentence_transformers

# Suppress every log record at CRITICAL severity and below for the whole
# process (presumably to keep the libraries' start-up output quiet).
logging.disable(logging.CRITICAL)

# Sentence encoder used to embed incoming queries. device=None lets
# sentence-transformers pick the device itself (a GPU when one is available,
# otherwise the CPU) instead of failing at import time on hosts without CUDA.
model = sentence_transformers.SentenceTransformer(
    "dangvantuan/sentence-camembert-large", device=None
)

# Load the corpus from the local JSON file and attach the FAISS index stored
# in "index.faiss" under the name "embeddings", which search() queries.
dataset = datasets.load_dataset("json", data_files=["./data/dataset.json"], split="train")
dataset.load_faiss_index("embeddings", "index.faiss")

def search(query: str, k: int):
    """Return the ``k`` dataset passages nearest to ``query``.

    The query is embedded with the module-level ``model`` and looked up in the
    "embeddings" index attached to the module-level ``dataset``.

    Args:
        query: Free-text search query.
        k: Number of nearest passages to retrieve. The value is converted with
            ``int()`` because the Gradio "number" input can deliver a float,
            while the nearest-neighbour lookup needs an integer count; a
            fractional value is truncated.

    Returns:
        A list with one dict per retrieved passage, each holding the keys
        ``"title"``, ``"transcript"`` (the passage text prefixed with its
        ``[start ====> end]`` span) and ``"link"``.
    """
    query_embedding = model.encode(query)
    # get_nearest_examples returns (scores, examples); the scores are unused.
    _, retrieved_examples = dataset.get_nearest_examples(
        "embeddings",
        query_embedding,
        k=int(k),
    )

    # retrieved_examples is column-oriented (one list per field), so zip the
    # columns back into per-passage rows.
    return [
        {
            "title": title,
            "transcript": f"[{start} ====> {end}] {text}",
            "link": url,
        }
        for text, start, end, title, url in zip(
            retrieved_examples["text"],
            retrieved_examples["start"],
            retrieved_examples["end"],
            retrieved_examples["title"],
            retrieved_examples["url"],
        )
    ]

# Web UI: a text box for the query and a number box for k; the list returned
# by search() is rendered as JSON.
iface = gr.Interface(
    fn=search,
    inputs=["text", "number"],
    # String shortcut for the JSON output component, in the same style as the
    # input shortcuts; it replaces the deprecated gr.outputs.JSON() accessor.
    outputs="json",
    title="Search Dataset",
    description="Search a dataset using Camembert and Faiss.",
    # The keyword is `examples` (plural). With two inputs, every example is
    # its own [query, k] row inside the outer list.
    examples=[
        ["Enter a query to search for.", 5],
    ],
)

# Start the Gradio app only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    iface.launch()