# app file
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 6 16:26:17 2020

@author: rejid4996
"""
import streamlit as st
import numpy as np
import pandas as pd
import base64
from io import BytesIO
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Sentence-embedding model shared by the app: built once here at module level
# and used by main() (via model.encode) for both the knowledge-base rows and
# the search string, so both are embedded by the same model.
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
def find_similar(vector_representation, all_representations, k=1):
    """Return the indexes of the ``k`` entries most similar to a query vector.

    Similarity is the cosine similarity between the first row of
    ``vector_representation`` and every row of ``all_representations``.

    Args:
        vector_representation: 2-D array-like holding the query embedding in
            its first row (only row 0 is used).
        all_representations: 2-D array-like with one candidate embedding per
            row.
        k: Number of indexes to return. ``None`` returns ``None`` (kept for
            backward compatibility with the previous fall-through).

    Returns:
        A one-element list for ``k == 1``; otherwise a NumPy array of at most
        ``k`` indexes into ``all_representations``, ordered from most to
        least similar. An empty array is returned when ``k <= 0``.
    """
    similarity_matrix = cosine_similarity(vector_representation, all_representations)
    # NOTE: the matrix is (n_queries, n_candidates), not a square self-similarity
    # matrix. Zeroing its "diagonal" would only wipe the score of candidate 0
    # and make the first candidate impossible to match, so no masking is done.
    similarities = similarity_matrix[0]

    if k is None:
        return None
    if k == 1:
        return [np.argmax(similarities)]
    if k <= 0:
        # A slice of [-0:] would select *every* index; nothing was asked for.
        return np.array([], dtype=np.intp)
    # argsort is ascending: take the k largest, then reverse to best-first.
    return similarities.argsort()[-k:][::-1]
def to_excel(df):
    """Serialize a DataFrame to an in-memory ``.xlsx`` workbook.

    Args:
        df: The pandas DataFrame to write; it is stored on sheet ``Sheet1``.

    Returns:
        The raw bytes of the generated workbook.
    """
    output = BytesIO()
    # The workbook is only written into the buffer when the writer is closed;
    # the context manager guarantees that happens (even on error) before the
    # buffer is read back.
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name='Sheet1')
    return output.getvalue()
def get_table_download_link(df):
    """Generate an HTML link that downloads the given DataFrame as an Excel file.

    The workbook bytes produced by ``to_excel`` are base64-encoded and embedded
    in a ``data:`` URI, so the link carries the file content itself.

    Args:
        df: The pandas DataFrame to offer for download.

    Returns:
        An ``<a>`` element (as a string) whose ``download`` attribute names
        the file ``extract.xlsx``.
    """
    val = to_excel(df)
    b64 = base64.b64encode(val)  # bytes (b'...'); decoded to str for the URI
    return f'<a href="data:application/octet-stream;base64,{b64.decode()}" download="extract.xlsx">Download file</a>'
def main():
    """NLP App with Streamlit.

    Builds the sidebar (knowledge-base upload, search text, number of
    sentences, run button). When the run button is pressed, the first column
    of the uploaded workbook is embedded, the rows most similar to the search
    string are collected, and a download link for them is rendered.
    """
    from PIL import Image

    # NOTE(review): the image is opened and resized, but nothing visible in
    # this function displays it — confirm whether an st.image(...) call is
    # missing. Kept as-is so a missing file still fails as before.
    wallpaper = Image.open('thorteam.jpg')
    wallpaper = wallpaper.resize((700, 350))

    st.sidebar.subheader("Text extraction using NLP model ")

    uploaded_file = st.sidebar.file_uploader("Choose the Knowledge base file", type="xlsx")
    # Keep df defined even without an upload so the run branch can check it
    # instead of failing with a NameError.
    df = pd.read_excel(uploaded_file) if uploaded_file else None

    search_string = st.sidebar.text_input("your search word", "")
    # NOTE(review): the slider bounds were not present in the source; 1..10
    # with a step of 1 is an assumption — confirm the intended range.
    gcr_config = st.sidebar.slider(label="choose the no of Sentences",
                                   min_value=1, max_value=10, step=1)
    run_button = st.sidebar.button(label='Run Extraction')

    if not run_button:
        return
    if df is None:
        st.sidebar.warning("Please upload a knowledge base file first.")
        return

    # Only the first column is searched; blank cells are dropped and the rest
    # coerced to text because the encoder works on strings.
    paragraph = df.iloc[:, 0].dropna().astype(str)
    if paragraph.empty:
        st.sidebar.warning("The knowledge base file contains no text to search.")
        return

    embeddings_distilbert = model.encode(paragraph.tolist())
    distilbert_similar_indexes = find_similar(
        model.encode([search_string]), embeddings_distilbert, gcr_config
    )

    # The indexes are positions in the embedded list, so look rows up by
    # position (iloc), not by label.
    output_data = [paragraph.iloc[index] for index in distilbert_similar_indexes]

    output1 = pd.DataFrame(output_data, columns=['extracted text'])
    st.markdown(get_table_download_link(output1), unsafe_allow_html=True)
if __name__ == "__main__":