import justext
import streamlit as st
from lxml import etree
# import streamlit.components.v1 as components

# File Processing pkgs
from PIL import Image
import requests
# import xml.dom.minidom
from bs4 import BeautifulSoup
# import json
import docx2txt
# import textract
from PyPDF2 import PdfFileReader
import pdfplumber
import os

# ---- LOAD ASSETS ----
img_page_icon = Image.open("./olive_webscrapping.jpg")

# Find more emojis here: https://www.webfx.com/tools/emoji-cheat-sheet/
st.set_page_config(page_title="OdiaGenAI ", page_icon=img_page_icon, layout="wide")


# Load CSS file
def load_css(file_path):
    with open(file_path) as f:
        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)


# Load CSS file
load_css('styles.css')


# ----- FUNCTIONS ----

# function to check whether the url is a sitemap or not
def check_sitemap(url):
    # Check the URL's ending
    if url.lower().endswith(('sitemap.xml', 'sitemap_index.xml', 'sitemap')):
        try:
            # Parse the content as XML
            response = requests.get(url)
            xml_content = etree.fromstring(response.content)
            # Check for sitemap-specific elements (ignore the sitemap XML namespace)
            root_tag = etree.QName(xml_content.tag).localname
            if root_tag == 'urlset' or root_tag == 'sitemapindex':
                return True
        except etree.XMLSyntaxError:
            pass

    # Additional conditions for identifying sitemaps
    if 'sitemap' in url.lower():
        # Perform additional checks specific to the website's structure or naming conventions
        return True

    return False


# function to get urls from the sitemap and extract those data
def extract_urls_from_sitemaps(xml_url):
    # Make a GET request to the URL and extract the xml content
    response = requests.get(xml_url)
    soup = BeautifulSoup(response.text, 'xml')
    extracted_urls = []

    # check if the sitemap contains nested sitemaps
    sitemap_tags = soup.find_all('sitemap')
    if sitemap_tags:
        # Process nested sitemaps recursively
        for sitemap_tag in sitemap_tags:
            print("sitemap_tag:", sitemap_tag)
            nested_url = sitemap_tag.find('loc').text
            print('nested_url:', nested_url)
            nested_urls = extract_urls_from_sitemaps(nested_url)
            extracted_urls.extend(nested_urls)
    else:
        # Extract URLs from the current sitemap
        loc_tags = soup.find_all('loc')
        for loc_tag in loc_tags:
            # if loc_tag.parent.name != 'image':
            url = loc_tag.text
            if url.endswith('.pdf') or url.endswith('.jpg') or url.endswith('.jpeg'):
                print(f"url skipped because it is a {url.split('.')[-1]}")
            else:
                print('url:', url)
                extracted_urls.append(url)

    return extracted_urls


# function to check whether the entered url is valid
def valid_url(url):
    try:
        # Make a GET request to the URL and check the response status
        response = requests.get(url)
        if response.status_code == 200:
            return True
    except requests.exceptions.RequestException:
        return False
    return False


# function to create a custom stoplist for justext
def custom_stoplist():
    odia_stopwords = [
        "ଏହି", "ଏକ", "ଏକାଉଣଟ", "ମୁଁ", "ମୋର", "ମୁଁ ନିଜେ", "ଆମେ", "ଆମର", "ଆମର", "ଆମେ ନିଜେ",
        "ତୁମେ", "ତୁମର", "ତୁମର", "ନିଜେ", "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର", "ନିଜେ", "ସେ",
        "ତାଙ୍କୁ", "ତାଙ୍କର", "ନିଜେ", "ଏହା", "ଏହାର", "ନିଜେ |", "ସେମାନେ", "ସେଗୁଡିକ", "ସେମାନଙ୍କର",
        "ସେମାନଙ୍କର", "ନିଜେ |", "କଣ", "ଯାହା", "କିଏ", "କାହାକୁ", "ଏହା", "ତାହା", "ଏଗୁଡ଼ିକ",
        "ସେଗୁଡ଼ିକ", "ମୁଁ", "ହେଉଛି", "ହେଉଛି |", "ଥିଲା", "ଥିଲା |", "ହୁଅ", "ହୋଇସାରିଛି |", "ହେବା",
        "ଅଛି", "ଅଛି", "ଥିଲା", "ଅଛି", "କର", "କରେ |", "କରିଛନ୍ତି", "କରିବା", "ଏବଂ", "କିନ୍ତୁ",
        "ଯଦି", "କିମ୍ବା", "କାରଣ", "ଯେପରି", "ପର୍ଯ୍ୟନ୍ତ", "ଯେତେବେଳେ", "ର", "ପାଇଁ", "ସହିତ",
        "ବିଷୟରେ", "ବିପକ୍ଷରେ", "ମଧ୍ୟରେ", "ଭିତରକୁ", "ମାଧ୍ୟମରେ", "ସମୟରେ", "ପୂର୍ବରୁ", "ପରେ",
        "ଉପରେ", "ନିମ୍ନରେ |", "କୁ", "ଠାରୁ", "ଅପ୍", "ତଳକୁ", "ଭିତରେ", "ବାହାରେ", "ଉପରେ", "ବନ୍ଦ",
        "ସମାପ୍ତ", "ତଳେ |", "ପୁନର୍ବାର", "ଆଗକୁ", "ତାପରେ", "ଥରେ |", "ଏଠାରେ", "ସେଠାରେ",
        "କେବେ", "କେଉଁଠାରେ", "କିପରି", "ସମସ୍ତ", "ଉଭୟ", "ପ୍ରତ୍ୟେକ", "ଅଳ୍ପ", "ଅଧିକ", "ଅଧିକାଂଶ",
        "ଅନ୍ୟ", "କେତେକ", "ଏହିପରି", "ନୁହେଁ |", "କେବଳ", "ନିଜର", "ସମାନ", "ତେଣୁ", "ଅପେକ୍ଷା",
        "ମଧ୍ୟ", "ବହୁତ", "କରିପାରିବେ |", "ଇଚ୍ଛା", "କେବଳ", "କରିବା ଉଚିତ", "ବର୍ତ୍ତମାନ"
    ]
    return frozenset(odia_stopwords)


# function to extract data from url using justext
def extract_data_from_url_(url):
    response = requests.get(url)
    response.raise_for_status()
    page = response.content

    data_url = ""
    para = ""
    # length_low=70, length_high=140, stopwords_low=0.0, stopwords_high=0.02,
    # max_link_density=0.5, max_heading_distance=150, no_headings=False
    paragraphs = justext.justext(page, custom_stoplist(), 70, 140, 0.0, 0.02, 0.5, 150, False)
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            para = para + '\n' + paragraph.text

    data_url = ('\n\nFrom url:' + url + '\n' + para + '\n')

    return data_url


sitemap_data = ""


# function to get the page count from pdf using PyPDF2 (text extraction is commented out)
def read_pdf(file):
    pdfReader = PdfFileReader(file)
    count = pdfReader.numPages
    # all_page_text = ""
    # for i in range(count):
    #     page = pdfReader.getPage(i)
    #     all_page_text += page.extractText()
    #
    # return all_page_text
    return count


# function to run the enter button
def run_function(url, documents):
    data = ""
    # Check if the user has provided a URL
    if url:
        if valid_url(url):
            data = extract_data_from_url_(url)
            st.text_area("Extracted Text", value=data, height=200)
            # return extract status, and the data extracted
            return True, data
        else:
            return False, data

    # Check if the user has provided a document
    elif documents is not None:
        for document in documents:
            document_details = {
                "filename": document.name,
                "filetype": document.type,
                "filesize": document.size
            }
            st.write(document_details)

            # Extract content from the txt file
            if document.type == "text/plain":
                # Read as bytes
                data += str(document.read(), "utf-8")

            # Extract content from the pdf file
            elif document.type == "application/pdf":
                # using PyPDF2
                # data += read_pdf(document)

                # using pdfplumber
                try:
                    with pdfplumber.open(document) as pdf:
                        all_text = ""
                        for page in pdf.pages:
                            # extract_text() can return None for pages without text
                            text = page.extract_text() or ""
                            all_text += text + "\n"
                        data += all_text
                except Exception:  # pdfplumber failures are not RequestExceptions
                    st.write("None")

            # Extract content from the docx file
            elif document.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
                data += docx2txt.process(document)

        # Display the extracted text content from file
        st.write("attached")
        st.text_area("Extracted Text", value=data, height=200)
        # return extract status, and the data extracted
        return True, data

    else:
        st.error("Error: An error occurred while fetching content.")
        # return extract status, and the data extracted
        return False, data


def main():
    # ---- HEADER SECTION ----
    with st.container():
        st.subheader("Hi!! :wave:")
        st.write("##")
        st.markdown("