# Olive_scrapper / app.py
import justext
import streamlit as st
from lxml import etree
# import streamlit.components.v1 as components
# File Processing pkgs
from PIL import Image
import requests
# import xml.dom.minidom
from bs4 import BeautifulSoup
# import json
import docx2txt
# import textract
from PyPDF2 import PdfFileReader
import pdfplumber
import os
# ---- LOAD ASSETS ----
img_page_icon = Image.open("./olive_webscrapping.jpg")
# Find more emojis here: https://www.webfx.com/tools/emoji-cheat-sheet/
st.set_page_config(page_title="OdiaGenAI ", page_icon=img_page_icon, layout="wide")
# Load CSS file
def load_css(file_path):
    with open(file_path) as f:
        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)


load_css('styles.css')
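# Usage note (illustrative, not taken from styles.css): any rule in the file is
# injected inline, so a hypothetical entry such as
#   .title { color: #6a994e; }
# would restyle the <h1 class='title'> rendered further below.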
# ----- FUNCTIONS ----
# function to check whether the url is a sitemap or not
def check_sitemap(url):
    # Check the URL's ending
    if url.lower().endswith(('sitemap.xml', 'sitemap_index.xml', 'sitemap')):
        try:
            # Parse the content as XML
            response = requests.get(url)
            xml_content = etree.fromstring(response.content)
            # Check for sitemap-specific root elements; compare the local name,
            # since sitemap roots are usually namespaced and a bare tag match fails
            if etree.QName(xml_content).localname in ('urlset', 'sitemapindex'):
                return True
        except etree.XMLSyntaxError:
            pass
    # Additional condition for identifying sitemaps
    if 'sitemap' in url.lower():
        # Fall back to the naming convention alone
        return True
    return False
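# A minimal sketch of what check_sitemap accepts (hypothetical URLs):
#   check_sitemap("https://example.com/sitemap.xml")       # True: ending plus XML root match
#   check_sitemap("https://example.com/sitemap_news.xml")  # True: via the 'sitemap' substring fallback
#   check_sitemap("https://example.com/about.html")        # False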
# function to get urls from the sitemap and extract their data
def extract_urls_from_sitemaps(xml_url):
    # Make a GET request to the URL and extract the XML content
    response = requests.get(xml_url)
    soup = BeautifulSoup(response.text, 'xml')
    extracted_urls = []
    # check if the sitemap contains nested sitemaps
    sitemap_tags = soup.find_all('sitemap')
    if sitemap_tags:
        # Process nested sitemaps recursively
        for sitemap_tag in sitemap_tags:
            print("sitemap_tag:", sitemap_tag)
            nested_url = sitemap_tag.find('loc').text
            print('nested_url:', nested_url)
            nested_urls = extract_urls_from_sitemaps(nested_url)
            extracted_urls.extend(nested_urls)
    else:
        # Extract URLs from the current sitemap
        loc_tags = soup.find_all('loc')
        for loc_tag in loc_tags:
            # if loc_tag.parent.name != 'image':
            url = loc_tag.text
            if url.endswith(('.pdf', '.jpg', '.jpeg')):
                print(f"url skipped because it is a {url.split('.')[-1]}")
            else:
                print('url:', url)
                extracted_urls.append(url)
    return extracted_urls
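# Sketch of the recursion, assuming a hypothetical two-level sitemap index:
#   sitemap_index.xml contains <sitemap><loc>.../posts.xml</loc></sitemap>
#   posts.xml contains <url><loc>https://example.com/post-1</loc></url>
# extract_urls_from_sitemaps("https://example.com/sitemap_index.xml") would
# descend into posts.xml and return ["https://example.com/post-1"].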
# function to check whether the entered url is valid and reachable
def valid_url(url):
    try:
        # Make a GET request to the URL and check the response status
        response = requests.get(url)
        return response.status_code == 200
    except requests.exceptions.RequestException:
        return False
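# Design note: this issues a full GET just to test reachability. A HEAD request
# (requests.head(url, allow_redirects=True)) would be cheaper, but some servers
# reject HEAD, so GET is the safer default kept here.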
# function to create a custom stoplist for justext
def custom_stoplist():
    odia_stopwords = [
        "ଏହି", "ଏକ", "ଏକାଉଣଟ", "ମୁଁ", "ମୋର", "ମୁଁ ନିଜେ", "ଆମେ", "ଆମର", "ଆମର", "ଆମେ ନିଜେ", "ତୁମେ", "ତୁମର", "ତୁମର",
        "ନିଜେ", "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର",
        "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର", "ନିଜେ", "ଏହା", "ଏହାର", "ନିଜେ |", "ସେମାନେ", "ସେଗୁଡିକ", "ସେମାନଙ୍କର",
        "ସେମାନଙ୍କର", "ନିଜେ |", "କଣ", "ଯାହା", "କିଏ", "କାହାକୁ",
        "ଏହା", "ତାହା", "ଏଗୁଡ଼ିକ", "ସେଗୁଡ଼ିକ", "ମୁଁ", "ହେଉଛି", "ହେଉଛି |", "ଥିଲା", "ଥିଲା |", "ହୁଅ", "ହୋଇସାରିଛି |", "ହେବା",
        "ଅଛି", "ଅଛି", "ଥିଲା", "ଅଛି", "କର", "କରେ |",
        "କରିଛନ୍ତି", "କରିବା", "ଏବଂ", "କିନ୍ତୁ", "ଯଦି", "କିମ୍ବା", "କାରଣ", "ଯେପରି", "ପର୍ଯ୍ୟନ୍ତ", "ଯେତେବେଳେ", "ର", "ପାଇଁ",
        "ସହିତ", "ବିଷୟରେ", "ବିପକ୍ଷରେ", "ମଧ୍ୟରେ", "ଭିତରକୁ", "ମାଧ୍ୟମରେ",
        "ସମୟରେ", "ପୂର୍ବରୁ", "ପରେ", "ଉପରେ", "ନିମ୍ନରେ |", "କୁ", "ଠାରୁ", "ଅପ୍", "ତଳକୁ", "ଭିତରେ", "ବାହାରେ", "ଉପରେ", "ବନ୍ଦ",
        "ସମାପ୍ତ", "ତଳେ |", "ପୁନର୍ବାର", "ଆଗକୁ",
        "ତାପରେ", "ଥରେ |", "ଏଠାରେ", "ସେଠାରେ", "କେବେ", "କେଉଁଠାରେ", "କିପରି", "ସମସ୍ତ", "ଉଭୟ", "ପ୍ରତ୍ୟେକ", "ଅଳ୍ପ", "ଅଧିକ",
        "ଅଧିକାଂଶ", "ଅନ୍ୟ", "କେତେକ", "ଏହିପରି",
        "ନୁହେଁ |", "କେବଳ", "ନିଜର", "ସମାନ", "ତେଣୁ", "ଅପେକ୍ଷା", "ମଧ୍ୟ", "ବହୁତ", "କରିପାରିବେ |", "ଇଚ୍ଛା", "କେବଳ",
        "କରିବା ଉଚିତ", "ବର୍ତ୍ତମାନ"
    ]
    return frozenset(odia_stopwords)
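# Why a custom stoplist: justext scores paragraphs by stop-word density, and it
# does not appear to ship a stoplist for Odia. Passing this frozenset in place
# of a built-in one (e.g. justext.get_stoplist("English")) is what lets the
# boilerplate classifier work on Odia text.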
# function to extract data from url using justext
def extract_data_from_url_(url):
    response = requests.get(url)
    response.raise_for_status()
    page = response.content
    data_url = ""
    para = ""
    # positional args correspond to: length_low=70, length_high=140, stopwords_low=0.0,
    # stopwords_high=0.02, max_link_density=0.5, max_heading_distance=150, no_headings=False
    paragraphs = justext.justext(page, custom_stoplist(), 70, 140, 0.0, 0.02, 0.5, 150, False)
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            para = para + '\n' + paragraph.text
    data_url = '\n\nFrom url:' + url + '\n' + para + '\n'
    return data_url
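# Shape of the returned string (hypothetical URL and paragraphs):
#   extract_data_from_url_("https://example.com/post")
#   -> "\n\nFrom url:https://example.com/post\n\n<paragraph 1>\n<paragraph 2>\n"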
sitemap_data = ""
# function to get the page count from pdf using PyPDF2 (text extraction is commented out)
def read_pdf(file):
    pdfReader = PdfFileReader(file)
    count = pdfReader.numPages
    # all_page_text = ""
    # for i in range(count):
    #     page = pdfReader.getPage(i)
    #     all_page_text += page.extractText()
    # return all_page_text
    return count
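# Compatibility note: PdfFileReader/numPages is the legacy PyPDF2 1.x API. If
# this app were moved to PyPDF2 3.x, the equivalent would be
#   reader = PdfReader(file); count = len(reader.pages)
# Kept as-is to match the version the import above targets.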
# function to run the Enter button
def run_function(url, documents):
    data = ""
    # Check if the user has provided a URL
    if url:
        if valid_url(url):
            data = extract_data_from_url_(url)
            st.text_area("Extracted Text", value=data, height=200)
            # return extract status and the data extracted
            return True, data
        else:
            return False, data
    # Check if the user has provided documents
    elif documents is not None:
        for document in documents:
            document_details = {
                "filename": document.name,
                "filetype": document.type,
                "filesize": document.size
            }
            st.write(document_details)
            # Extract content from the txt file
            if document.type == "text/plain":
                # Read as bytes and decode
                data += str(document.read(), "utf-8")
            # Extract content from the pdf file
            elif document.type == "application/pdf":
                # using PyPDF2
                # data += read_pdf(document)
                # using pdfplumber
                try:
                    with pdfplumber.open(document) as pdf:
                        all_text = ""
                        for page in pdf.pages:
                            text = page.extract_text()
                            # extract_text() returns None for pages without a text layer
                            all_text += (text or "") + "\n"
                        data += all_text
                except Exception:
                    st.write("None")
            # Extract content from the docx file
            elif document.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
                data += docx2txt.process(document)
        # Display the extracted text content from the files
        st.write("attached")
        st.text_area("Extracted Text", value=data, height=200)
        # return extract status and the data extracted
        return True, data
    else:
        st.error("Error: An error occurred while fetching content.")
        # return extract status and the data extracted
        return False, data
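# Usage sketch (hypothetical inputs): run_function returns (status, text), e.g.
#   ok, text = run_function("https://example.com/article", None)  # URL branch
#   ok, text = run_function("", uploaded_files)                   # file branch
# so callers can branch on ok before persisting text.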
def main():
    # ---- HEADER SECTION ----
    with st.container():
        st.subheader("Hi!! :wave:")
        st.write("##")
        st.markdown("<h5 class='text'>OdiaGenAI is a collaborative initiative that conducts research on</h5>",
                    unsafe_allow_html=True)
        st.markdown("<h5>Generative AI and LLM for the Odia Language.</h5>", unsafe_allow_html=True)
        # st.title("Odia Generative AI")
        st.markdown("<h1 class='title'>Odia Generative AI</h1>", unsafe_allow_html=True)
    # ---- BODY SECTION ----
    with st.container():
        st.subheader("Collecting monolingual data (Odia or any Indic Languages)")
        # divide the body section into 3 columns: url input, attach button, and enter button
        col1, col2, col3 = st.columns([0.6, 0.2, 0.2])
        # url/xml input
        with col1:
            url_or_xml = st.text_input(label='', placeholder="Enter URL")
            is_a_sitemap = check_sitemap(url_or_xml)
        # attached files
        with col2:
            documents = st.file_uploader("", type=["pdf", "txt", "docx"], accept_multiple_files=True)
            if not documents:
                documents = None
            else:
                for doc in documents:
                    if doc.name.split(".")[-1].lower() not in ["pdf", "txt", "docx"]:
                        # the document is not a supported type
                        st.error("Unsupported file: " + doc.name)
        # Initialize state of button Enter
        with col3:
            st.write('##')
            if "button_enter" not in st.session_state:
                st.session_state.button_enter = False
            if st.button("Enter"):
                st.session_state.button_enter = True
                # st.write("session state true")
        if "extracted" not in st.session_state:
            st.session_state.extracted = False
        data = ""
        # the enter button
        if st.session_state.button_enter:
            # check if it is a sitemap or not
            if is_a_sitemap:
                if "Initial" not in st.session_state:
                    st.session_state.Initial = True
                # check whether this is the initial run for the sitemap
                if st.session_state.Initial:
                    # print("\n\n\n\n1)Initial State", st.session_state.Initial, "\n\n\n\n\n")
                    xml = url_or_xml
                    st.write("It is a sitemap")
                    stored_sitemap_urls = extract_urls_from_sitemaps(xml)
                    print('\nno. of urls: ', len(stored_sitemap_urls))
                    if stored_sitemap_urls:
                        print(stored_sitemap_urls)
                        for sitemap_url in stored_sitemap_urls:
                            if valid_url(sitemap_url):
                                print(sitemap_url)
                                # using justext to extract data
                                data = data + extract_data_from_url_(sitemap_url)
                            else:
                                st.error("Couldn't extract data from " + sitemap_url)
                        if "sitemap_data" not in st.session_state:
                            st.session_state.sitemap_data = data
                        # print("\n\n\nst.session.data ", st.session_state.sitemap_data)
                        # print("\n\n\n\nRUNNING \n\n\n\n")
                        st.session_state.Initial = False
                        print("\n\n\n\n2)Initial State", st.session_state.Initial, "\n\n\n\n\n")
                        st.session_state.extracted = True
                        # st.text_area("Extracted Text", value=st.session_state.sitemap_data, height=300)
                    else:
                        st.error("Error: Invalid sitemap.")
            else:
                url = url_or_xml
                st.session_state.extracted, data = run_function(url, documents)
            if st.session_state.extracted:
                if is_a_sitemap:
                    st.text_area("Extracted Text", value=st.session_state.sitemap_data, height=300)
                col1, col2 = st.columns([0.5, 0.5])
                with col1:
                    saved_button = False
                    if is_a_sitemap:
                        saved_data = st.session_state.sitemap_data
                        if st.download_button(label="Save", data=saved_data):
                            saved_button = True
                    else:
                        if st.download_button(label="Save", data=data):
                            saved_button = True
                with col2:
                    if st.button("Clear"):
                        st.session_state.button_enter = False
                        st.session_state.Initial = True
                        st.session_state.extracted = False
                        if 'sitemap_data' in st.session_state:
                            del st.session_state['sitemap_data']
                        st.experimental_rerun()
                if saved_button:
                    # Confirmation message
                    st.success("File saved successfully.")
            else:
                st.warning("Data not extracted")
if __name__ == "__main__":
    main()