# AIP_pdf / app.py
import streamlit as st
from txtai.pipeline import Textractor
from txtai.embeddings import Embeddings
import nltk
nltk.download('punkt')  # sentence tokenizer model used by nltk.tokenize.sent_tokenize below

# Web scraping
import bs4 as bs
import urllib.request
import re
import docx2txt
from PyPDF2 import PdfReader
from nltk import tokenize
# Create embeddings model, backed by sentence-transformers & transformers
embeddings = Embeddings({"path": "sentence-transformers/nli-mpnet-base-v2"})
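# Note: txtai's Embeddings.similarity(query, texts) scores every text against the
# query and returns (index, score) pairs sorted best match first, which is why the
# top results below are taken with a simple seq[0:3] slice. Illustrative call
# (actual indices/scores depend on the model; kept as a comment so it never runs):
#   results = embeddings.similarity("what is AI?", ["AI is ...", "The weather is ..."])
#   best_index, best_score = results[0]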
url = "https://cdn.pixabay.com/photo/2022/02/25/09/23/background-7033808_1280.jpg"
st.title("AIP-S³")
st.write("AI Powered Smart Search System")
st.image(url)
st.markdown('_Welcome to the Question Answering System 🧠 🤖_')
a = st.sidebar.radio("SELECT -", ['PDF', 'Website'])
## Web scrape function: fetches each URL in locations_max, splits the pages into
## sentences with Textractor and shows the three sentences closest to the query.
## locations_max and quer are set in the Website branch below before this runs.
def my_web():
    textract = Textractor(sentences=True)
    data_lines = []
    article_text = " "
    for link in locations_max:
        # Fetch and parse the page; article_text is collected but only kept for reference
        scraped_data = urllib.request.urlopen(link)
        article = scraped_data.read()
        parsed_article = bs.BeautifulSoup(article, 'lxml')
        paragraphs = parsed_article.find_all('p')
        for p in paragraphs:
            article_text += p.text
        # Textractor also accepts a URL directly and returns a list of sentences
        lines = textract(link)
        data_lines.append(lines)
    # Flatten the per-page sentence lists into a single list
    total_lines = []
    for page_lines in data_lines:
        total_lines += page_lines
    # Rank every sentence against the query and show the three best matches
    seq = embeddings.similarity(quer, total_lines)
    three_most = seq[0:3]
    indexes = []
    for i in three_most:
        indexes.append(i[0])
    for j in indexes:
        st.write(total_lines[j])
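# The top-3 ranking inside my_web() is repeated in the PDF branch below. A small
# helper along these lines could factor it out; this is only a sketch and is not
# wired into the app:
def top_matches(query, sentences, k=3):
    """Return the k sentences most similar to query, best match first (sketch)."""
    ranked = embeddings.similarity(query, sentences)  # (index, score) pairs, sorted
    return [sentences[idx] for idx, _score in ranked[:k]]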
if a == 'PDF':
    uploaded_files = st.file_uploader("Upload files - ", accept_multiple_files=True,
                                      type=['pdf', 'docx', 'txt'])
    # Query input
    quer = st.text_input('ask me anything!', placeholder='ex - what is AI?')
    st.write('Your query is - ', quer)
    if st.button("Process"):
        for uploaded_file in uploaded_files:
            if uploaded_file is not None:
                file_details = {"Filename": uploaded_file.name,
                                "FileType": uploaded_file.type,
                                "FileSize": uploaded_file.size}
                #st.write(file_details)
                if uploaded_file.type == "text/plain":
                    # Plain text: show the file contents as-is
                    raw_text = str(uploaded_file.read(), "utf-8")
                    st.write(raw_text)
                elif uploaded_file.type == "application/pdf":
                    # PDF: extract text page by page, split into sentences and
                    # rank each sentence against the query
                    reader = PdfReader(uploaded_file)
                    text = ""
                    for page in reader.pages:
                        # extract_text() can return None for image-only pages
                        text += (page.extract_text() or "") + "\n"
                    data_lines = tokenize.sent_tokenize(text)
                    seq = embeddings.similarity(quer, data_lines)
                    three_most = seq[0:3]  # (index, score) pairs, best match first
                    indexes = []
                    for i in three_most:
                        indexes.append(i[0])
                    for j in indexes:
                        st.write(data_lines[j])
                elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
                    # Word document: show the extracted text
                    raw_text = docx2txt.process(uploaded_file)
                    st.write(raw_text)
## Website branch
else:
    number = st.number_input('Insert a number of Links -', value=1, step=1)
    st.write('Number of web pages - ', number)
    st.markdown("---")
    locations_max = []
    for i in range(int(number)):
        loc = st.text_input('Enter the URL :', placeholder='ex - https://', key=i)
        locations_max.append(loc)
    # Query input
    quer = st.text_input('ask me anything!', placeholder='ex - what is AI?')
    st.write('Your query is - ', quer)
    if st.button('Confirm!'):
        st.write('Confirmed')
        my_web()
    else:
        st.write('')
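# To try the app locally (assuming the dependencies imported above are installed):
#   streamlit run app.py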