ShawnAI committed on
Commit
f0d1783
1 Parent(s): 2aba62d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +156 -0
app.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings, OpenAIEmbeddings
4
+ from langchain.vectorstores import Pinecone
5
+ import pinecone
6
+ import os
7
# Set TOKENIZERS_PARALLELISM=false for this process
# (presumably to silence the HuggingFace tokenizers fork warning -- confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Pinecone connection settings; each one can be overridden by an environment
# variable of the same name.
PINECONE_KEY = os.environ.get("PINECONE_KEY", "")
PINECONE_ENV = os.environ.get("PINECONE_ENV", "asia-northeast1-gcp")
PINECONE_INDEX = os.environ.get("PINECONE_INDEX", "3gpp-r16")

# Embedding model name, the name of the class used to load it, and the loader
# names offered in the configuration dropdown.
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "hkunlp/instructor-large")
EMBEDDING_LOADER = os.environ.get("EMBEDDING_LOADER", "HuggingFaceInstructEmbeddings")
EMBEDDING_LIST = ["HuggingFaceInstructEmbeddings", "HuggingFaceEmbeddings"]

# Return top-k text chunks from the vector store.
TOP_K_DEFAULT = 15
TOP_K_MAX = 30
SCORE_DEFAULT = 0.33

# Vector store handle shared by the UI callbacks; stays None until init_db()
# assigns it. (A module-level ``global`` statement is a no-op, so none is used.)
g_db = None
25
+
26
def init_db(emb_name, emb_loader, db_api_key, db_env, db_index):
    """Open an existing Pinecone index and store it in the module-level ``g_db``.

    Args:
        emb_name: Model name handed to the embedding class as ``model_name``.
        emb_loader: Name of the embedding class to instantiate. It must be one
            of the embedding classes this module imports from
            ``langchain.embeddings``.
        db_api_key: Pinecone API key.
        db_env: Pinecone environment.
        db_index: Name of the existing Pinecone index to open.

    Returns:
        ``str(g_db)`` for the freshly created vector store.

    Raises:
        ValueError: If ``emb_loader`` is not one of the supported loader names.
    """
    global g_db

    # SECURITY: emb_loader comes from a UI control (and therefore from the
    # client). It used to be passed to eval(), which would execute any Python
    # expression it contained. Only known class names are resolved now.
    supported_loaders = (
        "HuggingFaceInstructEmbeddings",
        "HuggingFaceEmbeddings",
        "OpenAIEmbeddings",
    )
    if emb_loader not in supported_loaders:
        raise ValueError(f"Unsupported embedding loader: {emb_loader!r}")

    # The name is allow-listed above, so a plain module-level lookup is safe.
    loader_cls = globals()[emb_loader]
    embeddings = loader_cls(model_name=emb_name)

    pinecone.init(api_key=db_api_key, environment=db_env)

    g_db = Pinecone.from_existing_index(index_name=db_index, embedding=embeddings)
    return str(g_db)
38
+
39
+
40
def get_db():
    """Return the module-level ``g_db`` (``None`` until ``init_db`` assigns it)."""
    return g_db
42
+
43
+
44
def remove_duplicates(documents, score_min):
    """Filter scored documents by threshold and drop repeated page content.

    Args:
        documents: Iterable of ``(doc, score)`` pairs, where ``doc`` exposes a
            ``page_content`` attribute.
        score_min: Minimum score (inclusive) a pair needs in order to be kept.

    Returns:
        A list of the ``doc`` objects (scores stripped) in their original
        order, keeping only the first qualifying occurrence of each distinct
        ``page_content``.
    """
    kept = []
    known_texts = set()
    for entry in documents:
        doc, score = entry
        text = doc.page_content
        # Skip content that was already emitted, or that scores too low.
        # Low-scoring content is NOT remembered, so a later higher-scoring
        # copy of the same text can still be kept.
        if text in known_texts or not score >= score_min:
            continue
        known_texts.add(text)
        kept.append(doc)
    return kept
52
+
53
+
54
def get_data(query, top_k, score):
    """Run a similarity search against the initialised vector store.

    Args:
        query: Free-text search query.
        top_k: Maximum number of chunks to request from the vector store.
        score: Minimum similarity score a chunk needs in order to be returned.

    Returns:
        A list of de-duplicated documents whose score is at least ``score``,
        or a short instruction string when the database has not been
        initialised or the query is empty.
    """
    # The "init db" hint belongs to the uninitialised-database case. Checking
    # it first avoids an AttributeError on None when Run is pressed before Init.
    if g_db is None:
        return "Please init db in configuration"
    if not query:
        return "Please enter a query"

    print("Use db: " + str(g_db))

    # top_k comes from a UI slider; coerce it in case it arrives as a float
    # (NOTE(review): assumes the vector store requires an integer k -- confirm).
    scored_docs = g_db.similarity_search_with_score(query=query, k=int(top_k))
    return remove_duplicates(scored_docs, score)
66
+
67
# Gradio UI: a "Matching" tab for querying and a "Configuration" tab for
# choosing the embedding model and connecting to Pinecone.
# NOTE(review): the layout nesting below was reconstructed from the `with`
# statements of a whitespace-stripped source -- confirm it against the
# rendered app.
with gr.Blocks(
    title="3GPP Database",
    theme="Base",
    css=""".bigbox {
    min-height:250px;
}
""",
) as demo:
    with gr.Tab("Matching"):
        with gr.Accordion("Vector similarity"):
            with gr.Row():
                with gr.Column():
                    top_k = gr.Slider(
                        1,
                        TOP_K_MAX,
                        value=TOP_K_DEFAULT,
                        step=1,
                        label="Vector similarity top_k",
                        interactive=True,
                    )
                with gr.Column():
                    score = gr.Slider(
                        0.01,
                        0.99,
                        value=SCORE_DEFAULT,
                        step=0.01,
                        label="Vector similarity score",
                        interactive=True,
                    )

        with gr.Row():
            inp = gr.Textbox(
                label="Input",
                placeholder="What are you looking for?",
            )
            out = gr.Textbox(label="Output")

        btn_run = gr.Button("Run", variant="primary")

    with gr.Tab("Configuration"):
        with gr.Row():
            # The textbox value is the get_db callable, so the box shows
            # whatever get_db() returns when the value is evaluated.
            loading = gr.Textbox(get_db, max_lines=1, show_label=False)
            btn_init = gr.Button("Init")

        with gr.Accordion("Embedding"):
            with gr.Row():
                with gr.Column():
                    # Plain text field: a model repo id is not an e-mail address.
                    emb_textbox = gr.Textbox(
                        label="Embedding Model",
                        value=EMBEDDING_MODEL,
                        placeholder="Paste Your Embedding Model Repo on HuggingFace",
                        lines=1,
                        interactive=True,
                        type="text",
                    )

                with gr.Column():
                    emb_dropdown = gr.Dropdown(
                        EMBEDDING_LIST,
                        value=EMBEDDING_LOADER,
                        multiselect=False,
                        interactive=True,
                        label="Embedding Loader",
                    )

        with gr.Accordion("Pinecone Database"):
            with gr.Row():
                # Masked input so the API key is not displayed in clear text.
                db_api_textbox = gr.Textbox(
                    label="Pinecone API Key",
                    value=PINECONE_KEY,
                    placeholder="Paste Your Pinecone API Key (xx-xx-xx-xx-xx) and Hit ENTER",
                    lines=1,
                    interactive=True,
                    type="password",
                )
            with gr.Row():
                db_env_textbox = gr.Textbox(
                    label="Pinecone Environment",
                    value=PINECONE_ENV,
                    placeholder="Paste Your Pinecone Environment (xx-xx-xx) and Hit ENTER",
                    lines=1,
                    interactive=True,
                    type="text",
                )
                db_index_textbox = gr.Textbox(
                    label="Pinecone Index",
                    value=PINECONE_INDEX,
                    placeholder="Paste Your Pinecone Index (xxxx) and Hit ENTER",
                    lines=1,
                    interactive=True,
                    type="text",
                )

    # Event wiring: "Init" (re)connects the vector store and shows its string
    # form in the status box; "Run" performs the similarity search.
    btn_init.click(
        fn=init_db,
        inputs=[emb_textbox, emb_dropdown, db_api_textbox, db_env_textbox, db_index_textbox],
        outputs=loading,
    )
    btn_run.click(fn=get_data, inputs=[inp, top_k, score], outputs=out)
153
+
154
if __name__ == "__main__":
    # Script entry point: turn on the request queue, then launch the app
    # with inbrowser=True.
    demo.queue()
    demo.launch(inbrowser=True)