"""Gradio demo: semantic sentence search inside a selected court decision.

The user picks a decision from a dropdown; its precomputed sentence
embeddings are loaded into a FAISS inner-product index. Free-text queries
are then embedded with the same sentence-transformer model and matched
against the sentences of that decision.
"""

import html

import faiss
import gradio as gr
import numpy as np
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

# State shared between the Gradio callbacks. It is (re)built by
# `on_selection_change` and read by `faiss_search`.
idx = 0  # positional row of the selected decision in `df`
index = None  # FAISS index over the selected decision's sentences
newdoc = None  # one row per sentence of the selected decision

dataset = load_dataset("tollefj/rettsavgjoerelser_100samples_embeddings")
model = SentenceTransformer("NbAiLab/nb-sbert-base")

df = dataset["train"].to_pandas()


def build_doc_frame(df, idx):
    """Return a per-sentence frame for the document at position ``idx``.

    Args:
        df: Frame with at least the columns ``url``, ``sentences`` and
            ``embedding``; the latter two hold equally long sequences.
        idx: Positional row index of the document in ``df``.

    Returns:
        A DataFrame with columns ``url``, ``sentences`` and ``embedding``
        where every row is a single sentence and its embedding.
    """
    doc = df.iloc[idx]
    # Turn the row (a Series) back into a one-row frame so it can be exploded.
    doc_df = pd.DataFrame(doc).T
    doc_df = doc_df[["url", "sentences", "embedding"]]
    # Unpack the parallel sentence/embedding sequences into separate rows.
    return doc_df.explode(["sentences", "embedding"])


def get_doc_embeddings(doc):
    """Stack the ``embedding`` column of ``doc`` into a float32 matrix."""
    return np.array(doc.embedding.tolist(), dtype="float32")


def faiss_search(query_str, K=5):
    """Search the currently selected decision for sentences similar to a query.

    Args:
        query_str: Free-text query.
        K: Number of hits requested; coerced to ``int`` because UI sliders
            may deliver numeric values that are not plain ints.

    Returns:
        A printable string: a header line followed by one
        ``Score: <score>\\t\\t<sentence>`` line per hit, or a short hint when
        no decision has been selected yet.
    """
    # Nothing to search until a decision has been selected.
    if index is None or newdoc is None:
        return "Velg en rettsavgjørelse først."

    K = int(K)
    top_k_str = f"Top {K} results for: {query_str}"

    # FAISS pads missing hits with label -1 when asked for more neighbours
    # than there are vectors, so never request more than the index holds.
    k = min(K, index.ntotal)
    if k < 1:
        return f"{top_k_str}\n"

    # normalize_L2 works in place and needs a contiguous float32 matrix.
    query_emb = np.ascontiguousarray(
        model.encode([query_str]), dtype="float32"
    ).reshape(1, -1)
    faiss.normalize_L2(query_emb)

    D, I = index.search(query_emb, k)
    print(list(zip(D[0], I[0])))

    # The loop variable is deliberately not called `idx`, so the module-level
    # selection state is left untouched.
    pretty_results = [
        (round(float(score), 3), newdoc.iloc[int(hit)].sentences)
        for hit, score in zip(I[0], D[0])
        if hit >= 0
    ]
    pretty_results_str = "\n".join(
        [f"Score: {score}\t\t{sent}" for score, sent in pretty_results]
    )
    return f"{top_k_str}\n{pretty_results_str}"


def on_selection_change(selected_case):
    """Rebuild the search index for the decision chosen in the dropdown.

    Args:
        selected_case: URL of the chosen decision, or ``None``/empty when the
            dropdown has been cleared.

    Returns:
        A ``(summary_html, url_html)`` tuple for the two HTML components.
        Both are empty strings when the selection does not match a decision;
        the previously built index is kept in that case.
    """
    global idx
    global index
    global newdoc

    print("Selection changed!")
    print(selected_case)

    # Use positions, not index labels, because the row is read with `iloc`.
    matches = np.flatnonzero((df["url"] == selected_case).to_numpy())
    if matches.size == 0:
        return "", ""
    selected_idx = int(matches[0])

    doc_frame = build_doc_frame(df, selected_idx)
    embeddings = get_doc_embeddings(doc_frame)
    faiss.normalize_L2(embeddings)
    # Inner product on L2-normalised vectors equals cosine similarity.
    # The dimension is read from the data instead of being hard-coded.
    new_index = faiss.IndexFlatIP(embeddings.shape[1])
    new_index.add(embeddings)

    # Publish the new state only once everything has been built, so a failure
    # above cannot leave `index` and `newdoc` out of sync.
    idx, index, newdoc = selected_idx, new_index, doc_frame

    # Render the predefined summary as an HTML bullet list; the values come
    # from the dataset, so they are escaped before being embedded in markup.
    # NOTE(review): assumes `summary` holds a sequence of strings - confirm.
    summary = df.iloc[selected_idx].summary.tolist()
    items = "".join(f"<li>{html.escape(str(point))}</li>" for point in summary)
    summary_html = f"<ul>{items}</ul>" if items else ""

    url_html = html.escape(str(selected_case))
    return summary_html, url_html


dropdown_opts = df["url"].tolist()

with gr.Blocks() as demo:
    gr.HTML(
        """

Lovdata rettsavgjørelser - semantisk søk

"""
    )

    with gr.Row():
        with gr.Column():
            case_dropdown = gr.Dropdown(
                label="Velg en rettsavgjørelse",
                choices=dropdown_opts,
            )
            # NOTE(review): this literal was broken across lines in the file;
            # the <p> wrapper is a reconstruction - confirm the markup.
            summary_html = gr.HTML(
                label="Predefinert oppsummering",
                placeholder="<p>Velg en sak først</p>",
            )
            case_url = gr.HTML(
                label="URL til rettsavgjørelse",
                placeholder="https://lovdata.no/...",
            )

        with gr.Column():
            query = gr.Textbox(
                label="Søk etter setninger",
                lines=1,
                placeholder="Kollisjon mellom to kjøretøy.",
            )
            k_slider = gr.Slider(
                minimum=1,
                maximum=10,
                label="Antall treff",
                value=5,
                step=1,
            )
            search_btn = gr.Button("Søk")
            output = gr.Textbox(label="Resultater", lines=10)

    case_dropdown.change(
        on_selection_change,
        inputs=[case_dropdown],
        outputs=[summary_html, case_url],
    )
    search_btn.click(faiss_search, inputs=[query, k_slider], outputs=[output])

demo.launch()