import gradio as gr
import numpy as np
import pandas as pd
from rank_bm25 import BM25Okapi
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Combine rows that already carry a `label` with rows whose `classifier_label`
# was predicted with probability > 0.9, then shuffle the merged frame.
dataset = pd.read_csv("filtered_133k_data_cleanlab.csv")
df1 = dataset[["text", "label", "Chat_ID", "x", "y"]].dropna()
df2 = dataset[["text", "classifier_label", "Chat_ID", "scores_proba_countvectr", "x", "y"]].dropna()
df2 = df2[df2.scores_proba_countvectr > 0.9]

df2 = df2[["text", "classifier_label", "Chat_ID", "x", "y"]]
df2.columns = ["text", "label", "Chat_ID", "x", "y"]
dataset = pd.concat((df1, df2)).reset_index(drop=True)
dataset = dataset.sample(frac=1).reset_index(drop=True)
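
# Sanity check (illustrative only): after the merge, the frame should hold
# exactly the five working columns, with both label sources combined, e.g.
#   dataset.columns.tolist()     # ['text', 'label', 'Chat_ID', 'x', 'y']
#   dataset.label.value_counts() # label distribution after merging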



class KeyWordSearch:

    def __init__(self, corpus: pd.DataFrame, tokenizer=None):
        """
        corpus    : DataFrame with a 'text' column to index with BM25.
        tokenizer : optional callable; whitespace splitting is used by default.
        """
        self.corpus = corpus
        self.tokenizer = tokenizer
        self.tokenized_corpus = [doc.split(" ") for doc in self.corpus['text']]
        self.search_engine = BM25Okapi(self.tokenized_corpus)

    def get_top_10(self, query):
        """Return the 10 best-matching texts mapped to their BM25 scores."""
        tokenized_query = query.split(" ")
        scores = self.search_engine.get_scores(tokenized_query)
        # argsort is ascending, so reverse it and keep the first 10 indices.
        top_indices = np.argsort(scores)[::-1][:10]

        top_results = []
        for top_index in top_indices:
            top_results.append({
                "positive": query,
                "look_up": self.corpus['text'].iloc[top_index],
                "score": scores[top_index],
            })
        top_results = pd.DataFrame(top_results)
        return dict(zip(top_results.look_up.tolist(), top_results.score.tolist()))
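
# Example usage (illustrative only; assumes the `dataset` frame above and a
# made-up query):
#   kw = KeyWordSearch(dataset)
#   kw.get_top_10("refund status")
#   -> {matching_text: bm25_score, ...} for the 10 highest-scoring texts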
    
class VectorSearch:

    def __init__(self, corpus):
        """
        corpus : list of texts to chunk, embed, and index in FAISS.
        """
        # Split long texts into overlapping chunks before embedding.
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
        self.docs = self.text_splitter.create_documents(corpus)

        modelPath = "omarelsayeed/bert_large_mnr"
        model_kwargs = {'device': 'cpu'}
        encode_kwargs = {'normalize_embeddings': False}

        self.embeddings = HuggingFaceEmbeddings(
            model_name=modelPath,
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs
        )

        self.db = FAISS.from_documents(self.docs, self.embeddings)
        self.retriever = self.db.as_retriever()

    def search_query(self, query):
        """Return (scored similarity hits, MMR-diversified hits) for a query."""
        scored = pd.DataFrame(
            [[doc.page_content, score]
             for doc, score in self.db.similarity_search_with_score(query, k=10)],
            columns=["text", "score"],
        )
        # max_marginal_relevance_search returns Documents only; it does not
        # accept a return_score flag.
        diverse = self.db.max_marginal_relevance_search(query, k=10)
        return scored, diverse
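
# Example usage (illustrative only; indexes a small slice to keep it fast):
#   vs = VectorSearch(dataset.text.head(100).tolist())
#   scored, diverse = vs.search_query("late delivery")
#   `scored` pairs chunk text with a FAISS score (a distance for the default
#   index, so lower means closer); `diverse` is a list of MMR-reranked
#   Documents.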

# Note: the app filters the raw CSV directly rather than the merged
# `dataset` built above.
df = pd.read_csv('filtered_133k_data_cleanlab.csv')

class CurrentLabel:
    # Remembers which label the FAISS index was last built for, so the
    # expensive VectorSearch setup only reruns when the label changes.
    current_label = None

class VCC:
    def __init__(self):
        self.vcc = None
        self.current_label = None

    def filter_corpus(self, label, search_query, search_method):
        corpus = df[df['label'] == label]

        if search_method == "BM25":
            # BM25 indexing is cheap, so the index is rebuilt per request.
            kw = KeyWordSearch(corpus)
            return kw.get_top_10(search_query)

        if search_method == "Semantic":
            # Embedding and FAISS indexing are expensive: only rebuild the
            # vector store when the selected label changes.
            if CurrentLabel.current_label != label:
                CurrentLabel.current_label = label
                self.vcc = VectorSearch(corpus.text.tolist())

            results = self.vcc.db.similarity_search_with_score(search_query, k=10)
            return {doc.page_content: float(score) for doc, score in results}

        return "No results found."
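
# Example call (illustrative only; the label must exist in df['label']):
#   VCC().filter_corpus("some_label", "late delivery", "BM25")
#   -> {text: bm25_score, ...}, a mapping a gr.Label component can display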

v = VCC()


# Build the Gradio interface. The gr.inputs / gr.outputs namespaces are
# deprecated; the top-level components below are the current equivalents.
label_dropdown = gr.Dropdown(choices=list(df['label'].unique()), label="Select Label")
search_query_input = gr.Textbox(label="Search Query")
search_method_radio = gr.Radio(choices=["BM25", "Semantic"], label="Search Method")


search_interface = gr.Interface(
    fn=v.filter_corpus,
    inputs=[label_dropdown, search_query_input, search_method_radio],
    outputs=gr.Label(label="Search Results"),
    title="Search and Filter Corpus"
)

search_interface.launch()