import docx2txt
import nltk
import streamlit as st
from nltk import tokenize
from PyPDF2 import PdfReader
from txtai.embeddings import Embeddings
from txtai.pipeline import Textractor
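
# Dependencies, assuming the standard PyPI package names (Textractor may
# also need txtai's pipeline extras):
#   pip install streamlit nltk docx2txt PyPDF2 "txtai[pipeline]"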

# Punkt models are required by nltk.tokenize.sent_tokenize below
nltk.download('punkt')

# Sentence-transformers model that ranks extracted sentences against the query
embeddings = Embeddings({"path": "sentence-transformers/nli-mpnet-base-v2"})

url = "https://cdn.pixabay.com/photo/2022/02/25/09/23/background-7033808_1280.jpg"

st.title("AIP-S³")
st.write("AI Powered Smart Search System")
st.image(url)

st.markdown('_Welcome to the Question Answering System 🧠 🤖_')

a = st.sidebar.radio("SELECT -", ['PDF', 'Website'])


def my_web():
    # Textractor fetches each URL and splits its text into sentences,
    # so no separate scraping pass is needed
    textract = Textractor(sentences=True)

    data_lines = []
    for loc in locations_max:
        data_lines.append(textract(loc))

    # Flatten the per-page sentence lists into a single list
    total_lines = []
    for lines in data_lines:
        total_lines += lines

    # similarity() returns (index, score) pairs sorted best-first;
    # show the three highest-scoring sentences
    seq = embeddings.similarity(quer, total_lines)
    for index, _score in seq[0:3]:
        st.write(total_lines[index])
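

# The top-3 ranking above is repeated verbatim in the PDF branch below.
# A helper along these lines (a sketch; the app as written does not call it)
# could factor out the duplication:
def show_top_matches(query, lines, k=3):
    # embeddings.similarity() yields (index, score) pairs, best match first
    for index, _score in embeddings.similarity(query, lines)[:k]:
        st.write(lines[index])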


if a == 'PDF':
    uploaded_files = st.file_uploader("Upload files - ", accept_multiple_files=True,
                                      type=['pdf', 'docx', 'txt'])

    quer = st.text_input('ask me anything!', placeholder='ex - what is AI?')
    st.write('Your query is - ', quer)

    if st.button("Process"):
        for uploaded_file in uploaded_files:
            if uploaded_file is not None:
                # Basic metadata about the upload (kept for debugging/display)
                file_details = {"Filename": uploaded_file.name,
                                "FileType": uploaded_file.type,
                                "FileSize": uploaded_file.size}

                # Plain text files are shown as-is
                if uploaded_file.type == "text/plain":
                    raw_text = str(uploaded_file.read(), "utf-8")
                    st.write(raw_text)

                # PDFs: extract text page by page, then split into sentences
                elif uploaded_file.type == "application/pdf":
                    reader = PdfReader(uploaded_file)
                    text = ""
                    for page in reader.pages:
                        # extract_text() can return None for image-only pages
                        text += (page.extract_text() or "") + "\n"

                    data_lines = tokenize.sent_tokenize(text)

                    # Same ranking as my_web(): top three sentences by similarity
                    seq = embeddings.similarity(quer, data_lines)
                    for index, _score in seq[0:3]:
                        st.write(data_lines[index])

                # Word documents: extract and display the raw text
                elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
                    raw_text = docx2txt.process(uploaded_file)
                    st.write(raw_text)

else:
    number = st.number_input('Insert a number of Links -', value=1, step=1)
    st.write('Number of web pages - ', number)
    st.markdown("---")

    # One text input per URL; the key keeps each widget distinct
    locations_max = []
    for i in range(number):
        loc = st.text_input('Enter the URL :', placeholder='ex- https://', key=i)
        locations_max.append(loc)

    quer = st.text_input('ask me anything!', placeholder='ex - what is AI?')
    st.write('Your query is - ', quer)

    if st.button('Confirm!'):
        st.write('Confirmed')
        my_web()
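

# To run the app locally (assuming this file is saved as app.py):
#   streamlit run app.py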