Spaces:

OdiaGenAI
/

web_scrapper_odia

Runtime error

App Files Files Community

sam2ai committed on Jun 5, 2023

Commit

a5adcd2

•

1 Parent(s): c4a251a

Upload 2 files

Browse files

Files changed (2) hide show

app.py +162 -0
styles.css +78 -0

app.py ADDED Viewed

	@@ -0,0 +1,162 @@

+# installed pip packages
+# pip install streamlit
+# pip install beautifulsoup4
+# pip install docx2txt
+# pip install pypdf2
+# pip install pdfplumber
+import streamlit as st
+# File Processing pkgs
+from PIL import Image
+import requests
+from bs4 import BeautifulSoup
+import json
+import docx2txt
+# import textract
+from PyPDF2 import PdfFileReader
+import pdfplumber
+# ---- LOAD ASSETS ----
+# Image passed to set_page_config below as the page icon; the path is
+# relative, so it is resolved against the process's working directory.
+img_page_icon = Image.open("images/web_icon.jpeg")
+# Find more emojis here: https://www.webfx.com/tools/emoji-cheat-sheet/
+# NOTE(review): Streamlit expects set_page_config to be the first st.* call
+# in the script — confirm before adding any Streamlit call above this line.
+st.set_page_config(page_title="OdiaGenAI ", page_icon=img_page_icon, layout="wide")
+# Load CSS file
+def load_css(file_path):
+    with open(file_path) as f:
+        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
+# Load CSS file
+load_css('styles.css')
+# ---- HEADER SECTION ----
+with st.container():
+    st.subheader("Hi, username :wave:")
+    st.write("##")
+    st.markdown("<h5 class='text'>OdiaGenAI is a collaborative initiative that conducts research on </h5>",
+                unsafe_allow_html=True)
+    st.markdown("<h5>Generative AI and LLM for the Odia Language.</h5>", unsafe_allow_html=True)
+    # st.title("Odia Generative AI")
+    st.markdown("<h1 class='title'>Odia Generative AI</h1>", unsafe_allow_html=True)
+# ---- BODY SECTION ----
+# The helper functions, the input widgets and the button handling below are
+# all written inside this one container block.
+with st.container():
+    st.subheader("Collecting monolingual data (Odia or any Indic Languages)")
+    # ----- FUNCTIONS ----
+    # function to get the text from pdf using PyPDF2
+    def read_pdf(file):
+        pdfReader = PdfFileReader(file)
+        count = pdfReader.numPages
+        # all_page_text = ""
+        # for i in range(count):
+        #     page = pdfReader.getPage(i)
+        #     all_page_text += page.extractText()
+        #
+        # return all_page_text
+        return count
+    # function to run the enter button
+    def run_function(url , documents):
+        news = ""
+        # Check if the user has provided a URL
+        if url:
+            try:
+                # Make a GET request to the URL and extract the text content
+                response = requests.get(url)
+                if response.status_code == 200:
+                    text_content = response.text
+                    soup = BeautifulSoup(text_content, 'html.parser')
+                    # Extracting the header
+                    # Extracting the script tag which includes the heading
+                    heading = soup.find('script', type='application/ld+json')
+                    # Extract the JSON data from the script tag
+                    json_data_heading = heading.string
+                    # Load the JSON data into a Python dictionary
+                    data = json.loads(json_data_heading)
+                    headline = data['headline']
+                    body = soup.find('div', class_='oi-article-lt')
+                    # Find all <p> tags within the div_tag
+                    p_tags = body.find_all('p')
+                    # Extract the text content from each <p> tag
+                    paragraphs = [p.get_text(strip=True) for p in p_tags]
+                    paragraphs = '\n'.join(paragraphs)
+                    news = news + (headline + '\n\n' + paragraphs)
+                    # Display the extracted text content from url
+                    st.text_area("Extracted Text", value=news, height=200)
+                else:
+                    st.error("Error: Unable to fetch content from the provided URL.")
+            except requests.exceptions.RequestException as e:
+                st.error("Error: An exception occurred while fetching content from the URL.")
+        # Check if the user has provided a document
+        elif documents is not None:
+            for document in documents:
+                document_details = {
+                    "filename":document.name,
+                    "filetype":document.type,
+                    "filesize":document.size
+                }
+                st.write(document_details)
+                # Extract content from the txt file
+                if document.type == "text/plain":
+                    # Read as bytes
+                    news += str(document.read(), "utf-8")
+                # Extract content from the pdf file
+                elif document.type == "application/pdf":
+                    # using PyPDF2
+                    # news += read_pdf(document)
+                    # using pdfplumber
+                    try:
+                        with pdfplumber.open(document) as pdf:
+                            all_text = ""
+                            for page in pdf.pages:
+                                text = page.extract_text()
+                                all_text += text + "\n"
+                            news += all_text
+                    except:
+                        st.write("None")
+                # Extract content from the docx file
+                else:
+                    news += docx2txt.process(document)
+            # Display the extracted text content from file
+            st.text_area("Extracted Text", value=news, height=200)
+        else:
+            st.error("Error: An error occurred while fetching content .")
+    col1, col2, col3 = st.columns([0.6, 0.2, 0.2])
+    with col1:
+        url = st.text_input(label='', placeholder="Enter URL")
+    with col2:
+        documents = st.file_uploader("", type=["png", "jpg", "jpeg", "pdf", "txt", "docx"], accept_multiple_files=True)
+    with col3:
+        b = st.button("Enter")
+    if b:
+        run_function(url, documents)

styles.css ADDED Viewed

	@@ -0,0 +1,78 @@

+/* styles.css */
+/* Custom overrides for the Streamlit page; app.py embeds this file in a
+   <style> tag via load_css('styles.css'). */
+/* NOTE(review): selectors such as .css-ocqkz7, .exg6vvm14 and the long
+   .st-* chain look like generated class names — confirm they still match
+   the markup produced by the Streamlit version in use. */
+/* app background: black */
+body .stApp {
+    background-color: black;
+}
+/* page title ("Odia Generative AI", class='title' in app.py): centered with
+   100px of space above and below */
+.title {
+    text-align: center;
+    margin-top: 100px;
+    margin-bottom: 100px;
+}
+/* first blurb line (class='text' in app.py): spacing below it */
+.text {
+    padding-bottom: 10px;
+}
+/* div which contains all the 3 columns: 75% wide, centered via a 12.5% left margin */
+.css-ocqkz7.e1tzin5v3 {
+    width: 75%;
+    margin-left: 12.5%;
+}
+/* column 1 */
+/* hiding the "press enter to apply" hint of the text input */
+.css-1if5ada {
+    visibility: hidden;
+}
+/* vertical padding of the URL input text box */
+input.st-bg.st-c1.st-c2.st-c3.st-c4.st-c5.st-c6.st-c7.st-c8.st-c9.st-ca.st-b8.st-cb.st-cc.st-cd.st-ce.st-cf.st-cg.st-ch.st-ci.st-ae.st-af.st-ag.st-cj.st-ai.st-aj.st-c0.st-ck.st-cl.st-cm {
+    padding-top: 0.6rem;
+    padding-bottom: 0.6rem;
+}
+/* column 2 */
+/* styling for the file uploader ("browse file") */
+/* removing the drag-and-drop area */
+.css-u8hs99.exg6vvm14 {
+    display: none;
+}
+/* the section which wraps the browse button: no padding, shifted left */
+section.css-z8f339.exg6vvm15 {
+    padding: 0rem;
+    margin-left: -1.2rem;
+}
+/* the browse button: fill the column width */
+button.css-b3z5c9.edgvbvh10 {
+    width: 100%;
+}
+/* the div which shows the attached files: fixed width, scrolls on overflow */
+.css-fis6aj.exg6vvm10 {
+    overflow: auto;
+    width: 20rem;
+    margin-left: -5rem;
+}
+/* column 3 */
+/* Enter button: red, full column width; the top margin pushes it down
+   (NOTE(review): presumably to line up with the inputs — confirm visually) */
+div.stButton > button:first-child {
+    background-color: rgb(204, 49, 49);
+    width: 100%;
+    margin-top: 32px;
+}
+/* padding for elements carrying the same class as the browse button above */
+.css-b3z5c9 {
+    padding: 0.5rem 0.75rem;
+}
+/* text area that shows the extracted text: spacing above it */
+.stTextArea {
+    margin-top: 1rem;
+}