Spaces:
Build error
Build error
| # -*- coding: utf-8 -*- | |
| """ | |
| Created on Fri Nov 6 16:26:17 2020 | |
| @author: rejid4996 | |
| """ | |
| import streamlit as st | |
| import numpy as np | |
| import pandas as pd | |
| import base64 | |
| from io import BytesIO | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
# Sentence-embedding model, instantiated once at module level and used by
# main() to encode both the knowledge-base rows and the search query.
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
def find_similar(vector_representation, all_representations, k=1):
    """Return the indices of the candidates most similar to the first query.

    Cosine similarity is computed between every row of
    ``vector_representation`` and every row of ``all_representations``;
    only the first row of that matrix (the first query) is ranked.

    Args:
        vector_representation: 2-D array of query embeddings.
        all_representations: 2-D array of candidate embeddings.
        k: Number of indices to return. ``None`` disables ranking.

    Returns:
        A one-element list holding the best index when ``k == 1``, a NumPy
        array of at most ``k`` indices ordered from most to least similar
        when ``k > 1``, or ``None`` when ``k`` is ``None``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    similarity_matrix = cosine_similarity(vector_representation, all_representations)

    # Masking the diagonal only makes sense when a set of vectors is compared
    # with itself (square matrix), where entry [i, i] is a trivial self-match.
    # For a single query against a corpus the matrix is 1 x N, and zeroing
    # [0, 0] would wrongly suppress the first candidate.
    row_count, column_count = similarity_matrix.shape
    if row_count == column_count and row_count > 1:
        np.fill_diagonal(similarity_matrix, 0)

    similarities = similarity_matrix[0]

    if k is None:
        return None
    if k < 1:
        # A slice such as [-0:] would silently return every index.
        raise ValueError(f"k must be at least 1, got {k!r}")
    if k == 1:
        return [np.argmax(similarities)]
    # argsort is ascending; reverse it and keep the k best matches.
    return similarities.argsort()[::-1][:k]
def to_excel(df):
    """Serialize a DataFrame to an in-memory Excel workbook.

    Args:
        df: The pandas DataFrame to export.

    Returns:
        The raw bytes of the workbook, with ``df`` written to a sheet named
        ``Sheet1``.
    """
    output = BytesIO()
    # Leaving the ``with`` block saves and closes the workbook.
    # ``ExcelWriter.save()`` was deprecated in pandas 1.5 and removed in 2.0.
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name='Sheet1')
    return output.getvalue()
def get_table_download_link(df):
    """Build an HTML anchor that downloads a DataFrame as an Excel file.

    The DataFrame is exported through ``to_excel`` and embedded in the link
    as a base64 ``data:`` URI.

    in:  dataframe
    out: href string
    """
    workbook_bytes = to_excel(df)
    # b64encode yields bytes; decode to text before placing it in the href.
    encoded = base64.b64encode(workbook_bytes).decode()
    return f'<a href="data:application/octet-stream;base64,{encoded}" download="extract.xlsx">Download file</a>'
def main():
    """NLP App with Streamlit.

    Renders the sidebar and banner, lets the user upload an ``.xlsx``
    knowledge base, and on "Run Extraction" shows the rows of its first
    column that are most similar to the search text, together with a link
    to download them as an Excel file.
    """
    from PIL import Image

    # Resize inside the context manager so the image file handle is released;
    # resize() returns a new, fully loaded image.
    with Image.open('thorteam.jpg') as source_image:
        wallpaper = source_image.resize((700, 350))

    st.sidebar.title("Semantic Search App")
    st.sidebar.success("Please reach out to https://www.linkedin.com/in/deepak-john-reji/ for more queries")
    st.sidebar.subheader("Text extraction using NLP model ")

    st.info("For more contents subscribe to my Youtube Channel https://www.youtube.com/channel/UCgOwsx5injeaB_TKGsVD5GQ")
    st.image(wallpaper)

    uploaded_file = st.sidebar.file_uploader("Choose the Knowledge base file", type="xlsx")
    if not uploaded_file:
        # Nothing to search until a knowledge base has been uploaded.
        return

    df = pd.read_excel(uploaded_file)

    search_string = st.sidebar.text_input("your search word", "")
    gcr_config = st.sidebar.slider(
        label="choose the no of Sentences",
        min_value=1,
        max_value=10,
        step=1,
    )
    run_button = st.sidebar.button(label='Run Extraction')
    if not run_button:
        return

    # Drop empty cells before encoding so only real text is embedded.
    paragraph = df.iloc[:, 0].dropna()
    embeddings_distilbert = model.encode(paragraph.values)
    query_embedding = model.encode([search_string])

    distilbert_similar_indexes = find_similar(query_embedding, embeddings_distilbert, gcr_config)

    # The returned indices are positions in the embedding array, so look the
    # rows up by position (.iloc) rather than by index label.
    output_data = [paragraph.iloc[index] for index in distilbert_similar_indexes]
    output1 = pd.DataFrame(output_data, columns=['extracted text'])

    st.table(output1)
    st.markdown(get_table_download_link(output1), unsafe_allow_html=True)
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()