|
import streamlit as st |
|
import os |
|
import PyPDF2 |
|
import nltk |
|
nltk.download('punkt') |
|
from nltk.tokenize import sent_tokenize |
|
from sentence_transformers import SentenceTransformer |
|
import numpy as np |
|
from numpy.linalg import norm |
|
|
|
url = "https://www.independentschoolparent.com/wp-content/uploads/2018/01/AI.jpg"

# Sentence-embedding model used to compare the query with document sentences.
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

# Number of best-matching sentences shown to the user.
TOP_N = 3


def _load_model():
    """Return a ``SentenceTransformer`` instance for ``MODEL_NAME``."""
    return SentenceTransformer(MODEL_NAME)


# Streamlit re-runs this script on every interaction, so constructing the
# model inline would repeat that work on each button click. Cache the loader
# when the installed Streamlit provides ``cache_resource``; otherwise keep the
# uncached loader so older installations still work.
if hasattr(st, "cache_resource"):
    _load_model = st.cache_resource(_load_model)


def _extract_sentences(pdf_files):
    """Return the sentences of every PDF in *pdf_files*, in reading order.

    Uploads whose ``type`` is not ``"application/pdf"`` are skipped. Each
    page's text is split with ``sent_tokenize`` and newlines inside a
    sentence are replaced by spaces. Sentences from all files are pooled
    into one flat list.
    """
    sentences = []
    for pdf_file in pdf_files:
        if pdf_file.type != "application/pdf":
            continue
        reader = PyPDF2.PdfReader(pdf_file)
        for page in reader.pages:
            # Guard against a page that yields no text at all.
            page_text = page.extract_text() or ""
            sentences.extend(
                sentence.replace('\n', ' ')
                for sentence in sent_tokenize(page_text)
            )
    return sentences


def _rank_sentences(query_embedding, sentence_embeddings, top_n):
    """Return indices of the *top_n* rows most similar to the query.

    Similarity is the cosine between *query_embedding* (1-D) and each row
    of *sentence_embeddings* (2-D). Indices are ordered best match first;
    equal scores keep their original relative order.
    """
    sentence_embeddings = np.asarray(sentence_embeddings)
    denominators = norm(sentence_embeddings, axis=1) * norm(query_embedding)
    # A zero-length vector has no direction: score it 0 rather than divide
    # by zero and propagate NaN into the ranking.
    scores = np.divide(
        sentence_embeddings @ query_embedding,
        denominators,
        out=np.zeros(len(sentence_embeddings)),
        where=denominators > 0,
    )
    ranked = sorted(range(len(scores)), key=lambda idx: scores[idx], reverse=True)
    return ranked[:top_n]


st.title("AIP-S³")
st.write("AI Powered Smart Search System")
st.image(url)

st.markdown('_Welcome to Question Answering System 🧠 🤖_')

# Data-source selector; this name is never reassigned below so that any later
# check of the selected source still sees the user's choice.
a = st.sidebar.radio("SELECT -", ['PDF', 'Website'])

if a == 'PDF':
    uploaded_files = st.file_uploader(
        "Upload files - ", accept_multiple_files=True, type=['pdf']
    )
    query = st.text_input('Ask me anything!', placeholder='Type.....')

    if st.button("Confirm!"):
        if not uploaded_files:
            st.warning("Please upload at least one PDF file.")
        elif not query.strip():
            st.warning("Please type a question.")
        else:
            filt1_list = _extract_sentences(uploaded_files)
            if not filt1_list:
                st.warning("No readable text was found in the uploaded PDF(s).")
            else:
                model = _load_model()
                indexes_final = _rank_sentences(
                    model.encode(query), model.encode(filt1_list), TOP_N
                )
                for idx in indexes_final:
                    st.write(filt1_list[idx])
                    st.write("")
|
|
|
|
|
|