Synced repo using 'sync_with_huggingface' Github Action
- .streamlit/config.toml +3 -0
- .vscode/settings.json +6 -0
- User Interface/User-Interface-multipages.pdf +0 -0
- app.py +34 -340
- pages/1_URLs.py +375 -0
- pages/2_Documents.py +138 -0
- requirements.txt +1 -0
- styles.css +27 -67
- utils/__pycache__/footer.cpython-38.pyc +0 -0
- utils/footer.py +38 -0
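
This commit restructures the single-page app.py into a Streamlit multipage app: app.py becomes a small landing page, while URL/sitemap extraction and document extraction move into pages/1_URLs.py and pages/2_Documents.py, with a shared footer in utils/footer.py. For orientation, the resulting layout looks roughly like this (paths taken from the file list above; the annotations are editorial, not part of the commit):

    app.py                    # landing page: title, description, contributors
    .streamlit/config.toml    # theme configuration
    pages/
        1_URLs.py             # extract text from a URL or a sitemap
        2_Documents.py        # extract text from uploaded PDF/TXT/DOCX files
    utils/
        footer.py             # cust_footer(), rendered on every page
    styles.css                # shared CSS loaded via load_css()

Streamlit treats every script under pages/ as a separate sidebar page, with the numeric filename prefix controlling the order.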
.streamlit/config.toml
ADDED
@@ -0,0 +1,3 @@
+[theme]
+base="light"
+
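Pinning base="light" keeps the app on Streamlit's light theme regardless of the viewer's system preference.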
.vscode/settings.json
ADDED
@@ -0,0 +1,6 @@
+{
+    "[python]": {
+        "editor.defaultFormatter": "ms-python.black-formatter"
+    },
+    "python.formatting.provider": "none"
+}
User Interface/User-Interface-multipages.pdf
ADDED
Binary file (103 kB).
app.py
CHANGED
@@ -1,29 +1,9 @@
-import justext
 import streamlit as st
-from lxml import etree
-# import streamlit.components.v1 as components
 
+# setting page config. for centered mode
+st.set_page_config(layout="centered")
 
+from utils.footer import cust_footer
-
-from PIL import Image
-import requests
-# import xml.dom.minidom
-from bs4 import BeautifulSoup
-# import json
-import docx2txt
-# import textract
-from PyPDF2 import PdfFileReader
-import pdfplumber
-import os
-
-
-
-# ---- LOAD ASSETS ----
-img_page_icon = Image.open("./olive_webscrapping.jpg")
-
-# Find more emojis here: https://www.webfx.com/tools/emoji-cheat-sheet/
-st.set_page_config(page_title="OdiaGenAI ", page_icon=img_page_icon, layout="wide")
-
 
 # Load CSS file
 def load_css(file_path):
@@ -34,324 +14,38 @@ def load_css(file_path):
 # Load CSS file
 load_css('styles.css')
 
-
-# ----- FUNCTIONS ----
-# function to check whether the url is a sitemap or not
-def check_sitemap(url):
-    # Check the URL's ending
-    if url.lower().endswith(('sitemap.xml', 'sitemap_index.xml', 'sitemap')):
-        try:
-            # Parse the content as XML
-            response = requests.get(url)
-            xml_content = etree.fromstring(response.content)
-            # Check for sitemap-specific elements
-            if xml_content.tag == 'urlset' or xml_content.tag == 'sitemapindex':
-                return True
-        except etree.XMLSyntaxError:
-            pass
-
-    # Additional conditions for identifying sitemaps
-    if 'sitemap' in url.lower():
-        # Perform additional checks specific to the website's structure or naming conventions
-        return True
-
-    return False
-
-
-# function to get urls from the site map and extract those data
-def extract_urls_from_sitemaps(xml_url):
-    # Make a GET request to the URL and extract the xml content
-    response = requests.get(xml_url)
-
-    soup = BeautifulSoup(response.text, 'xml')
-    extracted_urls = []
-
-    # check if the sitemap contains nested sitemaps
-    sitemap_tags = soup.find_all('sitemap')
-    if sitemap_tags:
-        # Process nested sitemaps
-        for sitemap_tag in sitemap_tags:
-            print("sitemap_tags:" + sitemap_tag)
-            nested_url = sitemap_tag.find('loc').text
-            print('nested_url:', nested_url)
-            nested_urls = extract_urls_from_sitemaps(nested_url)
-            extracted_urls.extend(nested_urls)
-    else:
-        # Extract URLs from the current sitemap
-        loc_tags = soup.find_all('loc')
-        for loc_tag in loc_tags:
-            # if loc_tag.parent.name != 'image':
-            url = loc_tag.text
-            if url.endswith('.pdf') or url.endswith('.jpg') or url.endswith('.jpeg'):
-                print(f"url skipped because it is a {url.split('.')[-1]}")
-            else:
-                print('url:', url)
-                extracted_urls.append(url)
-
-    return extracted_urls
-
-
-# function to check whether the entered url is valid
-def valid_url(url):
-    try:
-        # Make a GET request to the URL and extract the text content
-        response = requests.get(url)
-        if response.status_code == 200:
-            return True
-
-    except requests.exceptions.RequestException as e:
-        return False
-
-
-# function to create a custom stoplist for justext
-def custom_stoplist():
-    odia_stopwords = [
-        "ଏହି", "ଏକ", "ଏକାଉଣଟ", "ମୁଁ", "ମୋର", "ମୁଁ ନିଜେ", "ଆମେ", "ଆମର", "ଆମର", "ଆମେ ନିଜେ", "ତୁମେ", "ତୁମର", "ତୁମର",
-        "ନିଜେ", "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର",
-        "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର", "ନିଜେ", "ଏହା", "ଏହାର", "ନିଜେ |", "ସେମାନେ", "ସେଗୁଡିକ", "ସେମାନଙ୍କର",
-        "ସେମାନଙ୍କର", "ନିଜେ |", "କଣ", "ଯାହା", "କିଏ", "କାହାକୁ",
-        "ଏହା", "ତାହା", "ଏଗୁଡ଼ିକ", "ସେଗୁଡ଼ିକ", "ମୁଁ", "ହେଉଛି", "ହେଉଛି |", "ଥିଲା", "ଥିଲା |", "ହୁଅ", "ହୋଇସାରିଛି |", "ହେବା",
-        "ଅଛି", "ଅଛି", "ଥିଲା", "ଅଛି", "କର", "କରେ |",
-        "କରିଛନ୍ତି", "କରିବା", "ଏବଂ", "କିନ୍ତୁ", "ଯଦି", "କିମ୍ବା", "କାରଣ", "ଯେପରି", "ପର୍ଯ୍ୟନ୍ତ", "ଯେତେବେଳେ", "ର", "ପାଇଁ",
-        "ସହିତ", "ବିଷୟରେ", "ବିପକ୍ଷରେ", "ମଧ୍ୟରେ", "ଭିତରକୁ", "ମାଧ୍ୟମରେ",
-        "ସମୟରେ", "ପୂର୍ବରୁ", "ପରେ", "ଉପରେ", "ନିମ୍ନରେ |", "କୁ", "ଠାରୁ", "ଅପ୍", "ତଳକୁ", "ଭିତରେ", "ବାହାରେ", "ଉପରେ", "ବନ୍ଦ",
-        "ସମାପ୍ତ", "ତଳେ |", "ପୁନର୍ବାର", "ଆଗକୁ",
-        "ତାପରେ", "ଥରେ |", "ଏଠାରେ", "ସେଠାରେ", "କେବେ", "କେଉଁଠାରେ", "କିପରି", "ସମସ୍ତ", "ଉଭୟ", "ପ୍ରତ୍ୟେକ", "ଅଳ୍ପ", "ଅଧିକ",
-        "ଅଧିକାଂଶ", "ଅନ୍ୟ", "କେତେକ", "ଏହିପରି",
-        "ନୁହେଁ |", "କେବଳ", "ନିଜର", "ସମାନ", "ତେଣୁ", "ଅପେକ୍ଷା", "ମଧ୍ୟ", "ବହୁତ", "କରିପାରିବେ |", "ଇଚ୍ଛା", "କେବଳ",
-        "କରିବା ଉଚିତ", "ବର୍ତ୍ତମାନ"
-    ]
-    return frozenset(odia_stopwords)
-
-
-# function to extract data from url using justext
-def extract_data_from_url_(url):
-    response = requests.get(url)
-    response.raise_for_status()
-    page = response.content
-
-    data_url = ""
-    para = ""
-    paragraphs = justext.justext(page, custom_stoplist(), 70, 140, 0.0, 0.02, 0.5, 150, False)
-    for paragraph in paragraphs:
-        if not paragraph.is_boilerplate:
-            para = para + '\n' + paragraph.text
-
-    data_url = ('\n\nFrom url:' + url + '\n' + para + '\n')
-
-    return data_url
-
-
-sitemap_data = ""
-
-
-# function to get the text from pdf using PyPDF2
-def read_pdf(file):
-    pdfReader = PdfFileReader(file)
-    count = pdfReader.numPages
-    # all_page_text = ""
-    # for i in range(count):
-    #     page = pdfReader.getPage(i)
-    #     all_page_text += page.extractText()
-    #
-    # return all_page_text
-    return count
-
-
-# function to run the enter button
-def run_function(url, documents):
-    data = ""
-    # Check if the user has provided a URL
-    if url:
-        if valid_url(url):
-            data = extract_data_from_url_(url)
-            st.text_area("Extracted Text", value=data, height=200)
-            # return extract status, and the data extracted
-            return True, data
-        else:
-            return False, data
-
-    # Check if the user has provided a document
-    elif documents is not None:
-        for document in documents:
-            document_details = {
-                "filename": document.name,
-                "filetype": document.type,
-                "filesize": document.size
-            }
-            st.write(document_details)
-
-            # Extract content from the txt file
-            if document.type == "text/plain":
-                # Read as bytes
-                data += str(document.read(), "utf-8")
-
-            # Extract content from the pdf file
-            elif document.type == "application/pdf":
-                # using PyPDF2
-                # data += read_pdf(document)
-
-                # using pdfplumber
-                try:
-                    with pdfplumber.open(document) as pdf:
-                        all_text = ""
-                        for page in pdf.pages:
-                            text = page.extract_text()
-                            all_text += text + "\n"
-                        data += all_text
-                except requests.exceptions.RequestException as e:
-                    st.write("None")
-
-            # Extract content from the docx file
-            elif document.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
-                data += docx2txt.process(document)
-
-        # Display the extracted text content from file
-        st.write("attached")
-        st.text_area("Extracted Text", value=data, height=200)
-        # return extract status, and the data extracted
-        return True, data
-
-    else:
-        st.error("Error: An error occurred while fetching content.")
-        # return extract status, and the data extracted
-        return False, data
-
-
 def main():
-    #
-        if doc.name.split(".")[-1].lower() not in ["pdf", "txt", "docx"]:
-            # if documents is not the relevant type
-            st.error("Unsupported file: " + doc.name)
-
-    # Initialize state of button Enter
-    with col3:
-        st.write('##')
-        if "button_enter" not in st.session_state:
-            st.session_state.button_enter = False
-
-        if st.button("Enter"):
-            st.session_state.button_enter = True
-            # st.write("session state true")
-
-    if "extracted" not in st.session_state:
-        st.session_state.extracted = False
-    data = ""
-
-    # the enter button
-    if st.session_state.button_enter:
-        # check if it is a sitemap or not
-        if is_a_sitemap:
-            if "Initial" not in st.session_state:
-                st.session_state.Initial = True
-            # check whether its the initial state
-            if st.session_state.Initial == True:
-                # print("\n\n\n\n1)Initial State", st.session_state.Initial, "\n\n\n\n\n")
-                xml = url_or_xml
-                st.write("It is a sitemap")
-                stored_sitemap_urls = extract_urls_from_sitemaps(xml)
-                print('\nno. of urls: ', len(stored_sitemap_urls))
-
-                if stored_sitemap_urls:
-                    print(stored_sitemap_urls)
-                    for sitemap_url in stored_sitemap_urls:
-
-                        if valid_url(sitemap_url):
-                            print(sitemap_url)
-                            # using justext to extract data
-                            data = data + extract_data_from_url_(sitemap_url)
-                        else:
-                            st.error("Couldnt extract data from " + sitemap_url)
-
-                    if "sitemap_data" not in st.session_state:
-                        st.session_state.sitemap_data = data
-                    # print("\n\n\nst.session.data ", st.session_state.sitemap_data)
-                    # print("\n\n\n\nRUNNING \n\n\n\n")
-                    st.session_state.Initial = False
-                    print("\n\n\n\n2)Initial State", st.session_state.Initial, "\n\n\n\n\n")
-                    st.session_state.extracted = True
-                    # st.text_area("Extracted Text", value=st.session_state.sitemap_data, height=300)
-
-                else:
-                    st.error("Error: Invalid sitemap.")
-
-        else:
-            url = url_or_xml
-            st.session_state.extracted, data = run_function(url, documents)
-
-        if st.session_state.extracted:
-            if is_a_sitemap:
-                st.text_area("Extracted Text", value=st.session_state.sitemap_data, height=300)
-            col1, col2 = st.columns([0.5, 0.5])
-
-            with col1:
-                saved_button = False
-
-                if is_a_sitemap:
-                    saved_data = st.session_state.sitemap_data
-
-                    if st.download_button(
-                            label="Save",
-                            data=saved_data
-                    ):
-                        saved_button = True
-
-                else:
-                    if st.download_button(
-                            label="Save",
-                            data=data
-                    ):
-                        saved_button = True
-
-            with col2:
-                if st.button("Clear"):
-                    st.session_state.button_enter = False
-                    st.session_state.Initial = True
-                    st.session_state.extracted = False
-                    if 'sitemap_data' in st.session_state:
-                        del st.session_state['sitemap_data']
-                    st.session_state.button_enter = False
-                    st.experimental_rerun()
-
-            if saved_button:
-                # Confirmation message
-                st.success(f"File saved as {file_name} in the current directory.")
-
-        else:
-            st.warning("Data not extracted")
-
+    # Title of the app and description.
+    title = """
+    <div>
+        <p class="title">Olive Scrapper</p>
+    </div>
+    """
+    st.markdown(title, unsafe_allow_html=True)
+    st.write("#")
+    st.write("#")
+
+    introduction = """
+    <div>
+        <p class="text">Olive Scraper is a web scraping tool developed by OdiaGenAI for web scraping Odia contents from different sources (e.g., websites, PDF, DOC, etc.)</p>
+    </div>
+    """
+    st.markdown(introduction, unsafe_allow_html=True)
+
+    st.write("#")
+    st.write("###")
+    contributors = """
+    <div>
+        <p class="text">Contributors: Dr. Shantipriya Parida, Sambit, A.R. Kamaldeen, Prosper</p>
+    </div>
+    """
+    st.markdown(contributors, unsafe_allow_html=True)
+
+    # Add a success message to the sidebar
+    st.sidebar.success("Select a page above.")
+
+    # importing the custom footer from utils
+    cust_footer()
 
 if __name__ == "__main__":
     main()
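Both the old and the new app.py call st.set_page_config() immediately after importing Streamlit; Streamlit requires it to be the first Streamlit command executed in a script, which is presumably why the new files place it above the remaining imports. A minimal sketch of the pattern (illustrative, not part of the commit):

    import streamlit as st

    # Must run before any other st.* call in this script.
    st.set_page_config(layout="centered")

    from utils.footer import cust_footer  # later imports are fine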
pages/1_URLs.py
ADDED
@@ -0,0 +1,375 @@
+import streamlit as st
+
+# setting page config. for centered mode
+st.set_page_config(layout="centered")
+
+from utils.footer import cust_footer
+
+from lxml import etree
+import justext
+import concurrent.futures
+import datetime
+import requests
+from bs4 import BeautifulSoup
+import json
+
+# ----- FUNCTIONS -----
+# function to check whether the url is a sitemap or not
+def check_sitemap(url):
+    # Check the URL's ending
+    if url.lower().endswith(('sitemap.xml', 'sitemap_index.xml', 'sitemap')):
+        try:
+            # Parse the content as XML
+            response = requests.get(url)
+            xml_content = etree.fromstring(response.content)
+            # Check for sitemap-specific elements
+            if xml_content.tag == 'urlset' or xml_content.tag == 'sitemapindex':
+                return True
+        except etree.XMLSyntaxError:
+            pass
+
+    # Additional conditions for identifying sitemaps
+    if 'sitemap' in url.lower():
+        # Perform additional checks specific to the website's structure or naming conventions
+        return True
+
+    return False
+
+
+def extract_urls_from_sitemaps(xml_url):
+    # Make a GET request to the URL and extract the xml content
+    response = requests.get(xml_url)
+
+    soup = BeautifulSoup(response.text, 'xml')
+    extracted_urls = []
+
+    # check if the sitemap contains nested sitemaps
+    sitemap_tags = soup.find_all('sitemap')
+    if sitemap_tags:
+        # Process nested sitemaps
+        for sitemap_tag in sitemap_tags:
+            print("sitemap_tags:" + str(sitemap_tag))
+            nested_url = sitemap_tag.find('loc').text
+            print('nested_url:', nested_url)
+            nested_urls = extract_urls_from_sitemaps(nested_url)
+            extracted_urls.extend(nested_urls)
+    else:
+        # Extract URLs from the current sitemap
+        loc_tags = soup.find_all('loc')
+        for loc_tag in loc_tags:
+            # if loc_tag.parent.name != 'image':
+            url = loc_tag.text
+            if url.endswith('.pdf') or url.endswith('.jpg') or url.endswith('.jpeg'):
+                print(f"url skipped because it is a {url.split('.')[-1]}")
+            else:
+                print('url:', url)
+                extracted_urls.append(url)
+
+    return extracted_urls
+
+
+# function to check whether the entered url is valid
+def valid_url(url):
+    try:
+        # Make a GET request to the URL and extract the text content
+        response = requests.get(url)
+        if response.status_code == 200:
+            return True
+
+    except requests.exceptions.RequestException as e:
+        return False
+
+
+# function to create a custom stoplist for justext
+def custom_stoplist():
+    odia_stopwords = [
+        "ଏହି", "ଏକ", "ଏକାଉଣଟ", "ମୁଁ", "ମୋର", "ମୁଁ ନିଜେ", "ଆମେ", "ଆମର", "ଆମର", "ଆମେ ନିଜେ", "ତୁମେ", "ତୁମର", "ତୁମର",
+        "ନିଜେ", "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର",
+        "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର", "ନିଜେ", "ଏହା", "ଏହାର", "ନିଜେ |", "ସେମାନେ", "ସେଗୁଡିକ", "ସେମାନଙ୍କର",
+        "ସେମାନଙ୍କର", "ନିଜେ |", "କଣ", "ଯାହା", "କିଏ", "କାହାକୁ",
+        "ଏହା", "ତାହା", "ଏଗୁଡ଼ିକ", "ସେଗୁଡ଼ିକ", "ମୁଁ", "ହେଉଛି", "ହେଉଛି |", "ଥିଲା", "ଥିଲା |", "ହୁଅ", "ହୋଇସାରିଛି |", "ହେବା",
+        "ଅଛି", "ଅଛି", "ଥିଲା", "ଅଛି", "କର", "କରେ |",
+        "କରିଛନ୍ତି", "କରିବା", "ଏବଂ", "କିନ୍ତୁ", "ଯଦି", "କିମ୍ବା", "କାରଣ", "ଯେପରି", "ପର୍ଯ୍ୟନ୍ତ", "ଯେତେବେଳେ", "ର", "ପାଇଁ",
+        "ସହିତ", "ବିଷୟରେ", "ବିପକ୍ଷରେ", "ମଧ୍ୟରେ", "ଭିତରକୁ", "ମାଧ୍ୟମରେ",
+        "ସମୟରେ", "ପୂର୍ବରୁ", "ପରେ", "ଉପରେ", "ନିମ୍ନରେ |", "କୁ", "ଠାରୁ", "ଅପ୍", "ତଳକୁ", "ଭିତରେ", "ବାହାରେ", "ଉପରେ", "ବନ୍ଦ",
+        "ସମାପ୍ତ", "ତଳେ |", "ପୁନର୍ବାର", "ଆଗକୁ",
+        "ତାପରେ", "ଥରେ |", "ଏଠାରେ", "ସେଠାରେ", "କେବେ", "କେଉଁଠାରେ", "କିପରି", "ସମସ୍ତ", "ଉଭୟ", "ପ୍ରତ୍ୟେକ", "ଅଳ୍ପ", "ଅଧିକ",
+        "ଅଧିକାଂଶ", "ଅନ୍ୟ", "କେତେକ", "ଏହିପରି",
+        "ନୁହେଁ |", "କେବଳ", "ନିଜର", "ସମାନ", "ତେଣୁ", "ଅପେକ୍ଷା", "ମଧ୍ୟ", "ବହୁତ", "କରିପାରିବେ |", "ଇଚ୍ଛା", "କେବଳ",
+        "କରିବା ଉଚିତ", "ବର୍ତ୍ତମାନ"
+    ]
+    return frozenset(odia_stopwords)
+
+
+# function to extract data from url using justext
+def extract_data_from_url_(url):
+    response = requests.get(url)
+    response.raise_for_status()
+    page = response.content
+
+    para = ""
+    paragraphs = justext.justext(page, custom_stoplist(), 70, 140, 0.0, 0.02, 0.5, 150, False)
+    for paragraph in paragraphs:
+        if not paragraph.is_boilerplate:
+            para = para + '\n' + paragraph.text
+
+    return para
+
+
+sitemap_data = ""
+
+
+# function to process a batch of URLS in sitemaps
+def process_urls(sitemap_urls):
+
+    extracted_txt = ""
+    extracted_jsonl_list = []
+    for url in sitemap_urls:
+        if valid_url(url):
+            print(url)
+            # using justext to extract data
+            temp_para = extract_data_from_url_(url)
+            temp_txt_data = ('\n\nFrom url:' + url + '\n' + temp_para + '\n')
+            temp_jsonl_data = {"text": temp_para, "url": url}
+            extracted_txt += temp_txt_data
+            extracted_jsonl_list.append(temp_jsonl_data)
+        else:
+            st.error("Couldnt extract data from " + url)
+
+    # Convert data_list to JSONL string
+    extracted_jsonl_list_encoded = [json.dumps(data, ensure_ascii=False) for data in extracted_jsonl_list]
+    extracted_jsonl = '\n'.join(extracted_jsonl_list_encoded)
+
+    return extracted_txt, extracted_jsonl
+
+
+# function to process for a single URL
+def run_function(url):
+    extracted_txt = ""
+    # Check if the user has provided a URL
+    if url:
+        if valid_url(url):
+            temp_para = extract_data_from_url_(url)
+            temp_txt_data = ('\n\nFrom url:' + url + '\n' + temp_para + '\n')
+            extracted_txt = temp_txt_data
+            extracted_jsonl = {"text": str(temp_para), "url": str(url)}
+
+            # displaying extracted txt for single URL
+            st.text_area("Extracted Text", value=extracted_txt, height=200)
+
+            extracted_jsonl = json.dumps(extracted_jsonl, ensure_ascii=False)
+
+            # return extract status, and the data extracted
+            return True, extracted_txt, extracted_jsonl
+        else:
+            return False, None, None
+    else:
+        st.error("Error: An error occurred while fetching content.")
+        # return extract status, and the data extracted
+        return False, None, None
+
+
+def main():
+    st.subheader("Extract Data from URLs")
+
+    # dividing the body section into 2 columns for url and enter button
+    col1, col2 = st.columns([0.7, 0.3])
+
+    with col1:
+        url_or_xml = st.text_input(label='', placeholder="Enter URL")
+        is_a_sitemap = check_sitemap(url_or_xml)
+
+    with col2:
+        st.write('##')
+        if "button_enter_url" not in st.session_state:
+            st.session_state.button_enter_url = False
+
+        if st.button("Enter"):
+            st.session_state.button_enter_url = True
+
+    if "extracted_url" not in st.session_state:
+        st.session_state.extracted_url = False
+    data = ""
+
+    # the enter button
+    if st.session_state.button_enter_url:
+        # check if it is a sitemap or not
+        if is_a_sitemap:
+            if "Initial" not in st.session_state:
+                st.session_state.Initial = True
+            # check whether its the initial state
+            if st.session_state.Initial == True:
+
+                xml = url_or_xml
+                st.write("It is a sitemap")
+                stored_sitemap_urls = extract_urls_from_sitemaps(xml)
+                print('\nno. of urls: ', len(stored_sitemap_urls))
+                st.write('no. of urls {}', format(len(stored_sitemap_urls)))
+
+                if stored_sitemap_urls:
+                    print(stored_sitemap_urls)
+                    current_time = datetime.datetime.now()
+                    print(current_time)
+                    st.write(current_time)
+                    # for sitemap_url in stored_sitemap_urls:
+                    #     if valid_url(sitemap_url):
+                    #         print(sitemap_url)
+                    #         # using justext to extract data
+                    #         data = data + extract_data_from_url_(sitemap_url)
+                    #     else:
+                    #         st.error("Couldnt extract data from " + sitemap_url)
+
+                    num_threads = 16  # Number of threads to use
+
+                    # Calculate the split size for each thread
+                    split_size = len(stored_sitemap_urls) // num_threads
+
+                    # Create a ThreadPoolExecutor with maximum `num_threads` threads
+                    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
+                        futures = []
+                        for i in range(num_threads):
+                            start_index = i * split_size
+                            end_index = start_index + split_size if i != num_threads - 1 else None
+                            temp_urls = stored_sitemap_urls[start_index:end_index]
+                            future = executor.submit(process_urls, temp_urls)
+                            futures.append(future)
+
+                        # Retrieve the extracted data from each thread
+                        text_data = []
+                        jsonl_data = []
+                        for future in futures:
+                            text_result, jsonl_result = future.result()
+                            text_data.append(text_result)
+                            jsonl_data.append(jsonl_result)
+
+                    # Combine the extracted data from all threads
+                    combined_text_data = ''.join(text_data)
+                    combined_jsonl_data = '\n'.join(jsonl_data)
+
+                    # Use the combined data as needed
+                    # print("Combined Text Data:")
+                    # print(combined_text_data)
+                    # print("Combined JSONL Data:")
+                    # print(combined_jsonl_data)
+
+                    if "sitemap_data_jsonl" not in st.session_state:
+                        st.session_state.sitemap_data_jsonl = combined_jsonl_data
+                    if "sitemap_data_text" not in st.session_state:
+                        st.session_state.sitemap_data_text = combined_text_data
+
+                    current_time = datetime.datetime.now()
+                    print(current_time)
+                    st.write(current_time)
+                    st.session_state.Initial = False
+                    print("\n\n\n\n2)Initial State", st.session_state.Initial, "\n\n\n\n\n")
+                    st.session_state.extracted_url = True
+
+                else:
+                    st.error("Error: Invalid sitemap.")
+
+        else:
+            url = url_or_xml
+            st.session_state.extracted_url, data_txt, data_jsonl = run_function(url)
+
+        if st.session_state.extracted_url:
+            # displaying extracted txt for sitemaps
+            if is_a_sitemap:
+                st.text_area("Extracted Text", value=st.session_state.sitemap_data_text, height=300)
+
+            save_as, checkbox_c1, checkbox_c2 = st.columns([0.33, 0.33, 0.33])
+
+            # initializing the checbox bool
+            save_as_txt = False
+            save_as_json = False
+            saved_successfully = False
+
+            with save_as:
+                st.write("Save as ")
+            with checkbox_c1:
+                save_as_txt = st.checkbox("text", value=False)
+
+            with checkbox_c2:
+                save_as_json = st.checkbox("jsonl", value=False)
+
+            if not save_as_txt and not save_as_json:
+                if st.button("Clear"):
+                    st.session_state.button_enter_url = False
+                    st.session_state.Initial = True
+                    st.session_state.extracted_url = False
+                    if 'sitemap_data_text' in st.session_state:
+                        del st.session_state['sitemap_data_text']
+                    if 'sitemap_data_jsonl' in st.session_state:
+                        del st.session_state['sitemap_data_jsonl']
+                    st.session_state.button_enter_url = False
+                    st.experimental_rerun()
+            else:
+                col1, col2 = st.columns([0.5, 0.5])
+                # save column
+                with col1:
+                    if is_a_sitemap:
+                        if save_as_txt:
+                            if st.download_button(label="Save as txt", data=st.session_state.sitemap_data_text):
+                                saved_successfully = True
+                        if save_as_json:
+                            if st.download_button(label="Save as jsonl", data=st.session_state.sitemap_data_jsonl, mime="application/json"):
+                                saved_successfully = True
+                    else:
+                        if save_as_txt:
+                            if st.download_button(label="Save as txt", data=data_txt):
+                                saved_successfully = True
+                        if save_as_json:
+                            if st.download_button(label="Save as jsonl", data=data_jsonl, mime="application/json"):
+                                saved_successfully = True
+
+                # clear column
+                with col2:
+                    if st.button("Clear"):
+                        st.session_state.button_enter_url = False
+                        st.session_state.Initial = True
+                        st.session_state.extracted_url = False
+                        if 'sitemap_data_text' in st.session_state:
+                            del st.session_state['sitemap_data_text']
+                        if 'sitemap_data_jsonl' in st.session_state:
+                            del st.session_state['sitemap_data_jsonl']
+                        st.session_state.button_enter_url = False
+                        st.experimental_rerun()
+
+            if saved_successfully:
+                # Confirmation message
+                st.success(f"File saved successfully.")
+
+        else:
+            st.warning("Data not extracted")
+            if st.button("clear"):
+                st.session_state.button_enter_url = False
+                st.session_state.extracted_url = False
+                st.experimental_rerun()
+
+    # Add a success message to the sidebar
+    st.sidebar.success("Select a page above.")
+
+    # importing the custom footer from utils
+    cust_footer()
+
+
+if __name__ == "__main__":
+    main()
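The sitemap branch above fans the collected URLs out over a fixed pool of 16 threads: the list is cut into num_threads slices, each slice is handed to process_urls() via executor.submit(), and the last slice uses an end index of None so it also absorbs any remainder. A self-contained sketch of the same splitting logic, with a hypothetical fetch_batch() standing in for process_urls():

    import concurrent.futures

    def fetch_batch(urls):
        # placeholder worker: return one result per URL
        return [len(u) for u in urls]

    urls = [f"https://example.com/page{i}" for i in range(50)]
    num_threads = 16
    split_size = len(urls) // num_threads  # 3 in this example

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = []
        for i in range(num_threads):
            start = i * split_size
            end = start + split_size if i != num_threads - 1 else None
            futures.append(executor.submit(fetch_batch, urls[start:end]))
        results = [f.result() for f in futures]

One edge case worth noting when reading the page's output: with fewer URLs than threads, split_size becomes 0, so every slice except the last is empty and the final slice carries the whole list.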
pages/2_Documents.py
ADDED
@@ -0,0 +1,138 @@
+import streamlit as st
+
+# setting page config. for centered mode
+st.set_page_config(layout="centered")
+
+
+from utils.footer import cust_footer
+import docx2txt
+import requests
+import pdfplumber
+
+# function to run the enter button
+def run_function(documents):
+    data = ""
+    if documents is not None:
+        for document in documents:
+            document_details = {
+                "filename": document.name,
+                "filetype": document.type,
+                "filesize": document.size
+            }
+            st.write(document_details)
+
+            # Extract content from the txt file
+            if document.type == "text/plain":
+                # Read as bytes
+                data += str(document.read(), "utf-8")
+
+            # Extract content from the pdf file
+            elif document.type == "application/pdf":
+                # using PyPDF2
+                # data += read_pdf(document)
+
+                # using pdfplumber
+                try:
+                    with pdfplumber.open(document) as pdf:
+                        all_text = ""
+                        for page in pdf.pages:
+                            text = page.extract_text()
+                            all_text += text + "\n"
+                        data += all_text
+                except requests.exceptions.RequestException as e:
+                    st.write("None")
+
+            # Extract content from the docx file
+            elif document.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+                data += docx2txt.process(document)
+
+        # Display the extracted text content from file
+        st.text_area("Extracted Text", value=data, height=200)
+        # return extract status, and the data extracted
+        return True, data
+
+    else:
+        st.error("Error: An error occurred while fetching content.")
+        # return extract status, and the data extracted
+        return False, data
+
+
+def main():
+
+    st.subheader("Extract Data from Documents")
+
+    documents = st.file_uploader(
+        "", type=["pdf", "txt", "docx"], accept_multiple_files=True
+    )
+
+    if "button_enter_doc" not in st.session_state:
+        st.session_state.button_enter_doc = False
+
+    if "extracted_doc" not in st.session_state:
+        st.session_state.extracted_doc = False
+    data = ""
+
+    if st.button("Enter"):
+        st.session_state.button_enter_doc = True
+
+    # the enter button
+    if st.session_state.button_enter_doc:
+        # check if it is a sitemap or not
+        if not documents:
+            documents = None
+        else:
+            for doc in documents:
+                if doc.name.split(".")[-1].lower() not in ["pdf", "txt", "docx"]:
+                    # if documents is not the relevant type
+                    st.error("Unsupported file: " + doc.name)
+
+        st.session_state.extracted_doc, data = run_function(documents)
+
+        if st.session_state.extracted_doc:
+            col1, col2 = st.columns([0.5, 0.5])
+            with col1:
+                saved_button = False
+
+                if st.download_button(
+                        label="Save",
+                        data=data
+                ):
+                    saved_button = True
+
+            with col2:
+                if st.button("Clear"):
+                    st.session_state.button_enter_doc = False
+                    st.session_state.extracted_doc = False
+                    st.experimental_rerun()
+
+            if saved_button:
+                # Confirmation message
+                st.success(f"File saved successfully.")
+
+        else:
+            st.warning("Data not extracted")
+            if st.button("clear"):
+                st.session_state.button_enter_doc = False
+                st.session_state.extracted_doc = False
+                st.experimental_rerun()
+
+    # Add a success message to the sidebar
+    st.sidebar.success("Select a page above.")
+
+    # importing the custom footer from utils
+    cust_footer()
+
+
+if __name__ == "__main__":
+    main()
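One caveat in the pdfplumber loop above (and in the identical loop removed from app.py): page.extract_text() can return None for pages with no extractable text, and all_text += text + "\n" then raises a TypeError that the surrounding except requests.exceptions.RequestException clause does not catch. A more defensive variant, as an editorial sketch rather than what the commit does:

    import pdfplumber

    def pdf_to_text(file_like):
        # Collect text page by page, skipping image-only pages instead of crashing.
        parts = []
        with pdfplumber.open(file_like) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if text:  # extract_text() may return None or an empty string
                    parts.append(text)
        return "\n".join(parts)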
requirements.txt
CHANGED
@@ -6,3 +6,4 @@ pdfplumber
 justext
 Pillow
 requests
+lxml
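The only dependency change is the new lxml entry, which the sitemap check in pages/1_URLs.py needs for its etree.fromstring() parse.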
styles.css
CHANGED
@@ -1,75 +1,35 @@
-.
-    margin-left: 15%;
-}
-
-/* drag drop div */
-.exg6vvm14 {
-    display: none;
-}
-/* resizing the attach button */
-.exg6vvm15 > button {
-    width: 100%;
-    height: 100%;
-}
-/* div which wraps attach button */
-.exg6vvm15 {
-    padding: 0;
-    margin-left: -15px;
-    height: 40px;
-}
-
-#root > div:nth-child(1) > div.withScreencast > div > div > div > section > div:nth-child(1) > div:nth-child(1) > div > div:nth-child(4) > div > div:nth-child(2) > div:nth-child(3) > div:nth-child(1) > div > div:nth-child(2) > div > button :hover,:active{
-    color: white;
-}
-
-/*
-    height: 40px;
-    margin-top: 20px;
-    width: 70%;
-    margin-left: 30%;
-}
-#root > div:nth-child(1) > div.withScreencast > div > div > div > section > div:nth-child(1) > div:nth-child(1) > div > div:nth-child(4) > div > div:nth-child(4) > div:nth-child(1) > div:nth-child(1) > div > div > div > button:hover, :active {
-    border: rgb(0,255,127);
-    color: white;
-}
-
-/*
-    color: black;
-    height: 40px;
-    margin-top: 20px;
-    width: 70%;
-    margin-right: 30%;
-}
-#root > div:nth-child(1) > div.withScreencast > div > div > div > section > div:nth-child(1) > div:nth-child(1) > div > div:nth-child(4) > div > div:nth-child(4) > div:nth-child(2) > div:nth-child(1) > div > div > div > button:hover, :active {
-    color: white;
-}
+.title {
+    font-family: "Source Sans Pro", sans-serif;
+    font-weight: 600;
+    padding: 1.25rem 0px 1rem;
+    margin: 0px;
+    margin-left: -2%;
+    line-height: 1.2;
+    font-size: 70px;
+    text-decoration: underline 2px;
+    text-underline-position: under;
 }
 
+@media (max-width: 768px) {
+    .title {
+        font-size: xx-large;
+        margin-left: auto;
+    }
 }
 
+.text {
+    font-family: "Source Sans Pro", sans-serif;
+    font-size: 1.25pc;
+    padding: 1.25rem 0px 1rem;
+    margin: 0px;
 }
 
+/* #root > div:nth-child(1) > div.withScreencast > div > div > div > section > div:nth-child(1) {
+    background-color: #3498db;
+    padding: 5px;
+} */
 
+/* #root > div:nth-child(1) > div.withScreencast > div > div > div > section.css-1cypcdb.e1fqkh3o11 > div.css-6qob1r.e1fqkh3o3 > div.css-1b9x38r.e1fqkh3o2 > button{
+    visibility: hidden;
+} */
utils/__pycache__/footer.cpython-38.pyc
ADDED
Binary file (1.1 kB).
utils/footer.py
ADDED
@@ -0,0 +1,38 @@
+import streamlit as st
+
+def cust_footer():
+    footer = """
+    <style>
+        footer {
+            visibility: hidden !important;
+        }
+
+        .divfooter {
+            display: flex;
+            justify-content: center;
+            align-items: center;
+            position: fixed;
+            left: 0;
+            bottom: 0;
+            width: 100%;
+            padding: 10px;
+            border-top: 2px solid grey;
+            background: white;
+        }
+        @media (min-width: 768px) {
+            .divfooter {
+                justify-content: center;
+                padding-left: 10%;
+            }
+        }
+    </style>
+    <div class="divfooter">
+        <p style="margin-bottom: 0px">© 2023 Odia Generative AI</p>
+    </div>
+    """
+    st.markdown(footer, unsafe_allow_html=True)