dreji18 committed on
Commit
01ace26
1 Parent(s): 1ef5d9f
Files changed (1) hide show
  1. app.py +93 -0
app.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Fri Nov 6 16:26:17 2020
4
+
5
+ @author: rejid4996
6
+ """
7
+
8
+ import streamlit as st
9
+ import numpy as np
10
+ import pandas as pd
11
+ import base64
12
+ from io import BytesIO
13
+ from sentence_transformers import SentenceTransformer
14
+ from sklearn.metrics.pairwise import cosine_similarity
15
# Sentence-embedding model, created once when the module is loaded and reused
# by main() to encode both the knowledge-base sentences and the search text.
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
16
+
17
def find_similar(vector_representation, all_representations, k=1):
    """Return the indexes of the rows most similar to a query embedding.

    Args:
        vector_representation: 2-D array whose first row is the query
            embedding (only row 0 of the similarity matrix is used).
        all_representations: 2-D array with one candidate embedding per row.
        k: how many indexes to return. ``None`` yields ``None``.

    Returns:
        For ``k == 1`` a one-element list holding the best-scoring index;
        otherwise an array of up to ``k`` indexes ordered from most to least
        similar (cosine similarity).
    """
    similarity_matrix = cosine_similarity(vector_representation, all_representations)
    # No diagonal masking here: main() passes free search text as the query,
    # not a row of the corpus, so there is no self-match to suppress. Zeroing
    # the diagonal of this 1 x N matrix only wiped entry [0, 0], i.e. the
    # score of the first candidate, which could then never rank correctly.
    similarities = similarity_matrix[0]
    if k is None:
        return None
    if k == 1:
        return [np.argmax(similarities)]
    # argsort is ascending: keep the k largest, then flip to best-first.
    return np.flip(similarities.argsort()[-k:])
25
+
26
def to_excel(df):
    """Serialise a dataframe to an in-memory ``.xlsx`` workbook.

    Args:
        df: pandas DataFrame to write. It is stored on a single sheet named
            ``Sheet1`` using the ``xlsxwriter`` engine.

    Returns:
        bytes: the raw contents of the generated workbook.
    """
    output = BytesIO()
    # Use the writer as a context manager: leaving the block finalises the
    # workbook and releases the writer even if to_excel() raises, and it does
    # not rely on ExcelWriter.save(), which newer pandas releases dropped.
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name='Sheet1')
    return output.getvalue()
33
+
34
def get_table_download_link(df):
    """Build an HTML anchor that lets the user download *df* as an Excel file.

    The dataframe is converted with ``to_excel`` and embedded in the link as a
    base64 ``data:`` URI; the suggested file name is ``extract.xlsx``.

    in:  dataframe
    out: href string
    """
    workbook_bytes = to_excel(df)
    # b64encode returns bytes; decode to text so it can be placed in the URI.
    encoded = base64.b64encode(workbook_bytes).decode()
    return f'<a href="data:application/octet-stream;base64,{encoded}" download="extract.xlsx">Download file</a>'
42
+
43
def main():
    """NLP App with Streamlit.

    Renders the page, lets the user upload an Excel knowledge base in the
    sidebar and, when "Run Extraction" is pressed, displays the sentences
    from the first column that are closest to the search text, together with
    a link to download them as ``extract.xlsx``.
    """
    from PIL import Image

    wallpaper = Image.open('thorteam.jpg')
    wallpaper = wallpaper.resize((700, 350))

    st.sidebar.title("Semantic Search App")
    st.sidebar.success("Please reach out to https://www.linkedin.com/in/deepak-john-reji/ for more queries")
    st.sidebar.subheader("Text extraction using NLP model ")

    st.info("For more contents subscribe to my Youtube Channel https://www.youtube.com/channel/UCgOwsx5injeaB_TKGsVD5GQ")
    st.image(wallpaper)

    uploaded_file = st.sidebar.file_uploader("Choose the Knowledge base file", type="xlsx")

    # Bind df on every path so that pressing the button before a file has
    # been uploaded is handled below instead of hitting an undefined name.
    df = pd.read_excel(uploaded_file) if uploaded_file else None

    search_string = st.sidebar.text_input("your search word", "")

    gcr_config = st.sidebar.slider(label="choose the no of Sentences",
                                   min_value=1,
                                   max_value=10,
                                   step=1)

    run_button = st.sidebar.button(label='Run Extraction')
    if not run_button:
        return

    if df is None:
        st.sidebar.warning("Please upload a knowledge base file first")
        return

    # Remove empty cells *before* encoding (a dropna() on the result whose
    # return value is discarded filters nothing), and renumber the rows so
    # the positions returned by find_similar line up with the series.
    paragraph = df.iloc[:, 0].dropna().reset_index(drop=True)
    embeddings_distilbert = model.encode(paragraph.values)

    distilbert_similar_indexes = find_similar(
        model.encode([search_string]), embeddings_distilbert, gcr_config
    )

    # find_similar returns positions, so index positionally rather than by label.
    output_data = [paragraph.iloc[index] for index in distilbert_similar_indexes]

    output1 = pd.DataFrame(output_data, columns=['extracted text'])

    st.table(output1)

    st.markdown(get_table_download_link(output1), unsafe_allow_html=True)
90
+
91
+
92
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()