"""Streamlit app: semantic sentence search over uploaded PDF files.

Uploaded PDFs are split into sentences, each sentence is embedded with a
SentenceTransformer model, and the sentences most similar to the user's
query (by cosine similarity) are displayed.
"""

import os

import nltk
import numpy as np
import PyPDF2
import streamlit as st
from nltk.tokenize import sent_tokenize
from numpy.linalg import norm
from sentence_transformers import SentenceTransformer

nltk.download('punkt')

url = "https://www.independentschoolparent.com/wp-content/uploads/2018/01/AI.jpg"

MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
TOP_N = 3  # number of best-matching sentences shown per query

# Keys used to keep processed data alive across Streamlit reruns.
_SENTENCES_KEY = "pdf_sentences"
_EMBEDDINGS_KEY = "pdf_embeddings"


@st.cache_resource
def load_model():
    """Return the sentence-embedding model, loaded once per server process.

    Streamlit re-executes the whole script on every widget interaction, so
    constructing the model at module level without caching would reload it
    on each rerun.
    """
    return SentenceTransformer(MODEL_NAME)


class PDFProcessor:
    """Extract newline-free sentences from uploaded PDF files."""

    def __init__(self):
        # Sentences collected so far, in file/page/sentence order.
        self.filt1_list = []

    def process_files(self, uploaded_files) -> None:
        """Append every sentence found in the uploaded PDFs to ``filt1_list``.

        Args:
            uploaded_files: Iterable of uploaded file objects exposing a
                ``type`` attribute (MIME type) and readable by
                ``PyPDF2.PdfReader``. ``None`` or an empty list is a no-op.
                Entries whose type is not ``application/pdf`` are skipped.
        """
        for uploaded in uploaded_files or []:
            if uploaded.type != "application/pdf":
                continue
            reader = PyPDF2.PdfReader(uploaded)
            for page in reader.pages:
                # Pages without extractable text (e.g. scanned images) may
                # yield nothing; treat them as empty.
                page_text = page.extract_text() or ""
                for sentence in sent_tokenize(page_text):
                    self.filt1_list.append(sentence.replace('\n', ' '))


def top_matches(query_embedding, embeddings, top_n=TOP_N):
    """Return indexes of the ``top_n`` rows most similar to the query.

    Similarity is the cosine between ``query_embedding`` and each row of
    ``embeddings``. Rows (or a query) with zero norm score 0 instead of
    producing NaN. Ties keep their original order.

    Args:
        query_embedding: 1-D vector for the query.
        embeddings: 2-D array with one row per sentence.
        top_n: Maximum number of indexes to return.

    Returns:
        A list of row indexes, best match first.
    """
    matrix = np.asarray(embeddings, dtype=float)
    query_vec = np.asarray(query_embedding, dtype=float)
    if top_n <= 0 or matrix.ndim != 2 or matrix.shape[0] == 0:
        return []

    dots = matrix @ query_vec
    denominators = norm(matrix, axis=1) * norm(query_vec)
    scores = np.divide(
        dots, denominators, out=np.zeros_like(dots), where=denominators > 0
    )
    # Stable sort on the negated scores == descending order with ties kept
    # in input order, matching sorted(..., reverse=True).
    ranked = np.argsort(-scores, kind="stable")[:top_n]
    return [int(idx) for idx in ranked]


st.title("AIP-S³")
st.write("AI Powered Smart Search System")
st.image(url)
st.markdown('_Welcome to Question Answering System 🧠 🤖_')

# NOTE: only the 'PDF' choice has a handler below; 'Website' currently
# renders nothing.
a = st.sidebar.radio("SELECT -", ['PDF', 'Website'])

model = load_model()

if a == 'PDF':
    uploaded_files = st.file_uploader(
        "Upload files - ", accept_multiple_files=True, type=['pdf']
    )

    if st.button("Process!"):
        pdf_processor = PDFProcessor()
        pdf_processor.process_files(uploaded_files)
        filt1_list = pdf_processor.filt1_list
        if filt1_list:
            # Persist results: the button is only True for this one rerun,
            # so anything kept in plain variables is gone when the user
            # later presses "Confirm!".
            st.session_state[_SENTENCES_KEY] = filt1_list
            st.session_state[_EMBEDDINGS_KEY] = model.encode(filt1_list)
            st.write("Process Completed")
        else:
            st.session_state.pop(_SENTENCES_KEY, None)
            st.session_state.pop(_EMBEDDINGS_KEY, None)
            st.warning("No text could be extracted from the uploaded files.")

    query = st.text_input('Ask me anything!', placeholder='Type.....')

    if st.button("Confirm!"):
        filt1_list = st.session_state.get(_SENTENCES_KEY, [])
        embeddings = st.session_state.get(_EMBEDDINGS_KEY)
        if not filt1_list or embeddings is None:
            st.warning("Please upload and process at least one PDF first.")
        elif not query.strip():
            st.warning("Please type a question first.")
        else:
            query_embedding = model.encode(query)
            for idx in top_matches(query_embedding, embeddings, TOP_N):
                st.write(filt1_list[idx])
                st.write("")