import gradio as gr
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import os

# Workaround: clear the CA bundle to avoid SSL certificate errors when downloading from the Hub
os.environ['CURL_CA_BUNDLE'] = ''

# Load dataset
issues_dataset = load_dataset("gvozdev/subspace-info-v2", split="train")

# Load tokenizer and model
model_ckpt = "sentence-transformers/all-MiniLM-L12-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt, trust_remote_code=True)

# Text concatenation - not used in this case, as mapping only on "subject" returns better results
# def concatenate_text(examples):
#     return {
#         "text": examples["subject"]
#         + " \n "
#         + examples["details"]
#     }
# issues_dataset = issues_dataset.map(concatenate_text)

# To speed up embedding for larger models, switch the device to "cuda" to run on a GPU
device = torch.device("cpu")
model.to(device)

# CLS pooling on the model's outputs: collect the last hidden state for the special [CLS] token
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

# Tokenize a list of documents, place the tensors on the CPU/GPU, feed them to the model,
# and apply CLS pooling to the outputs
def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

# Test if the function works
# embedding = get_embeddings(issues_dataset["details"][0])
# print(embedding.shape)

# Use Dataset.map() to apply get_embeddings() to each row and create a new "embeddings" column.
# Convert the embeddings to NumPy arrays: Datasets requires this format to index them with FAISS.
embeddings_dataset = issues_dataset.map(
    lambda x: {"embeddings": get_embeddings(x["subject"]).detach().cpu().numpy()[0]}
)

# Create a FAISS index
embeddings_dataset.add_faiss_index(column="embeddings")

def answer_question(question):
    # Get an embedding for the question
    question_embedding = get_embeddings([question]).cpu().detach().numpy()
    # Find the nearest neighbor in our dataset
    scores, samples = embeddings_dataset.get_nearest_examples(
        "embeddings", question_embedding, k=1
    )
    samples_df = pd.DataFrame.from_dict(samples)
    # These two lines are only needed when k > 1
    # samples_df["scores"] = scores
    # samples_df.sort_values("scores", ascending=False, inplace=True)
    return samples_df["details"].values[0]
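
# Quick smoke test for the whole retrieval pipeline, kept commented out like the
# get_embeddings() test above (assumes the dataset download and FAISS indexing succeeded):
# print(answer_question("What is Subspace Network?"))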

# Gradio interface
title = "Subspace Docs bot"
description = (
    "This is a bot trained on Subspace Network documentation "
    "to answer the most common questions about the project"
)
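
# The "state" input/output pair below threads the conversation history through
# successive calls: chat() returns the updated history twice, once for the
# chatbot display and once to persist as state for the next turn.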
def chat(message, history):
    history = history or []
    response = answer_question(message)
    history.append((message, response))
    return history, history

iface = gr.Interface(
    chat,
    ["text", "state"],
    ["chatbot", "state"],
    allow_flagging="never",
    title=title,
    description=description,
    theme="Monochrome",
    examples=[
        "What is Subspace Network?",
        "Do you have a token?",
        "System requirements",
    ],
)

iface.launch(share=False)
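
# Note: share=False serves the app locally only; launching with share=True instead
# asks Gradio to generate a temporary public URL, which can be handy for quick demos.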