import docx2txt
import nltk
import streamlit as st
from nltk import tokenize
from PyPDF2 import PdfReader
from txtai.embeddings import Embeddings
from txtai.pipeline import Textractor
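
# Dependencies, assuming the standard PyPI package names (Textractor may
# also need txtai's pipeline extras):
#   pip install streamlit nltk docx2txt PyPDF2 "txtai[pipeline]"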

# Punkt models are required by nltk.tokenize.sent_tokenize below
nltk.download('punkt')

# Sentence-transformers model that ranks extracted sentences against the query
embeddings = Embeddings({"path": "sentence-transformers/nli-mpnet-base-v2"})

url = "https://cdn.pixabay.com/photo/2022/02/25/09/23/background-7033808_1280.jpg"

st.title("AIP-S³")
st.write("AI Powered Smart Search System")
st.image(url)

st.markdown('_Welcome to the Question Answering System 🧠 🤖_')

a = st.sidebar.radio("SELECT -", ['PDF', 'Website'])


def my_web():
    # Textractor fetches each URL and splits its text into sentences,
    # so no separate scraping pass is needed
    textract = Textractor(sentences=True)

    data_lines = []
    for loc in locations_max:
        data_lines.append(textract(loc))

    # Flatten the per-page sentence lists into a single list
    total_lines = []
    for lines in data_lines:
        total_lines += lines

    # similarity() returns (index, score) pairs sorted best-first;
    # show the three highest-scoring sentences
    seq = embeddings.similarity(quer, total_lines)
    for index, _score in seq[0:3]:
        st.write(total_lines[index])
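

# The top-3 ranking above is repeated verbatim in the PDF branch below.
# A helper along these lines (a sketch; the app as written does not call it)
# could factor out the duplication:
def show_top_matches(query, lines, k=3):
    # embeddings.similarity() yields (index, score) pairs, best match first
    for index, _score in embeddings.similarity(query, lines)[:k]:
        st.write(lines[index])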


if a == 'PDF':
    uploaded_files = st.file_uploader("Upload files - ", accept_multiple_files=True,
                                      type=['pdf', 'docx', 'txt'])

    quer = st.text_input('ask me anything!', placeholder='ex - what is AI?')
    st.write('Your query is - ', quer)

    if st.button("Process"):
        for uploaded_file in uploaded_files:
            if uploaded_file is not None:
                # Basic metadata about the upload (kept for debugging/display)
                file_details = {"Filename": uploaded_file.name,
                                "FileType": uploaded_file.type,
                                "FileSize": uploaded_file.size}

                # Plain text files are shown as-is
                if uploaded_file.type == "text/plain":
                    raw_text = str(uploaded_file.read(), "utf-8")
                    st.write(raw_text)

                # PDFs: extract text page by page, then split into sentences
                elif uploaded_file.type == "application/pdf":
                    reader = PdfReader(uploaded_file)
                    text = ""
                    for page in reader.pages:
                        # extract_text() can return None for image-only pages
                        text += (page.extract_text() or "") + "\n"

                    data_lines = tokenize.sent_tokenize(text)

                    # Same ranking as my_web(): top three sentences by similarity
                    seq = embeddings.similarity(quer, data_lines)
                    for index, _score in seq[0:3]:
                        st.write(data_lines[index])

                # Word documents: extract and display the raw text
                elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
                    raw_text = docx2txt.process(uploaded_file)
                    st.write(raw_text)

else:
    number = st.number_input('Insert a number of Links -', value=1, step=1)
    st.write('Number of web pages - ', number)
    st.markdown("---")

    # One text input per URL; the key keeps each widget distinct
    locations_max = []
    for i in range(number):
        loc = st.text_input('Enter the URL :', placeholder='ex- https://', key=i)
        locations_max.append(loc)

    quer = st.text_input('ask me anything!', placeholder='ex - what is AI?')
    st.write('Your query is - ', quer)

    if st.button('Confirm!'):
        st.write('Confirmed')
        my_web()
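

# To run the app locally (assuming this file is saved as app.py):
#   streamlit run app.py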