Spaces:
Runtime error
Runtime error
Upload 2 files
Browse files- app.py +162 -0
- styles.css +78 -0
app.py
ADDED
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# installed pip packages
|
2 |
+
# pip install streamlit
|
3 |
+
# pip install beautifulsoup4
|
4 |
+
# pip install docx2txt
|
5 |
+
# pip install pypdf2
|
6 |
+
# pip install pdfplumber
|
7 |
+
|
8 |
+
import streamlit as st
|
9 |
+
|
10 |
+
# File Processing pkgs
|
11 |
+
from PIL import Image
|
12 |
+
import requests
|
13 |
+
from bs4 import BeautifulSoup
|
14 |
+
import json
|
15 |
+
import docx2txt
|
16 |
+
# import textract
|
17 |
+
from PyPDF2 import PdfFileReader
|
18 |
+
import pdfplumber
|
19 |
+
|
20 |
+
|
21 |
+
# ---- LOAD ASSETS ----
|
22 |
+
# Image used as the page icon passed to st.set_page_config below.
img_page_icon = Image.open("images/web_icon.jpeg")

# Page-level configuration: title, icon and a wide layout.
st.set_page_config(
    page_title="OdiaGenAI ",
    page_icon=img_page_icon,
    layout="wide",
)
|
26 |
+
|
27 |
+
# Load CSS file
|
28 |
+
def load_css(file_path):
    """Inject the stylesheet stored at ``file_path`` into the Streamlit page.

    The file contents are wrapped in a ``<style>`` element and handed to
    ``st.markdown`` with ``unsafe_allow_html=True``.

    Args:
        file_path: Path of the CSS file to read.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, encoding="utf-8") as css_file:
        st.markdown(f"<style>{css_file.read()}</style>", unsafe_allow_html=True)


# Apply the app's custom styling.
load_css('styles.css')
|
34 |
+
|
35 |
+
|
36 |
+
|
37 |
+
# ---- HEADER SECTION ----
|
38 |
+
with st.container():
    st.subheader("Hi, username :wave:")
    st.write("##")  # presumably a vertical spacer - confirm

    # Introductory text followed by the page title, emitted in order as raw HTML
    # so the classes defined in styles.css ('text', 'title') can be attached.
    for _html_snippet in (
        "<h5 class='text'>OdiaGenAI is a collaborative initiative that conducts research on </h5>",
        "<h5>Generative AI and LLM for the Odia Language.</h5>",
        "<h1 class='title'>Odia Generative AI</h1>",
    ):
        st.markdown(_html_snippet, unsafe_allow_html=True)
|
47 |
+
|
48 |
+
|
49 |
+
|
50 |
+
# ---- BODY SECTION ----
|
51 |
+
# Sub-header introducing the data-collection part of the page.
st.container().subheader("Collecting monolingual data (Odia or any Indic Languages)")
|
53 |
+
|
54 |
+
# ----- FUNCTIONS ----
|
55 |
+
# function to get the number of pages in a pdf using PyPDF2
|
56 |
+
def read_pdf(file):
    """Return the number of pages in the PDF ``file`` as reported by PyPDF2.

    Despite the name, no text is extracted here: only the page count is
    returned.

    NOTE(review): ``PdfFileReader`` / ``numPages`` look like the legacy
    PyPDF2 names - confirm they exist in the PyPDF2 version this app pins.
    """
    return PdfFileReader(file).numPages
|
66 |
+
|
67 |
+
# function to run the enter button
|
68 |
+
# Seconds to wait on the remote server before abandoning a URL fetch.
_REQUEST_TIMEOUT = 10

# MIME type reported for .docx uploads.
_DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"


def _extract_article(html):
    """Return ``headline``, a blank line, then the article paragraphs from ``html``.

    The headline is read from the first ``application/ld+json`` script tag and
    the body from the ``<p>`` tags inside ``div.oi-article-lt``.

    Raises:
        ValueError: If either of those structures is missing or malformed.
    """
    soup = BeautifulSoup(html, 'html.parser')

    heading = soup.find('script', type='application/ld+json')
    if heading is None or not heading.string:
        raise ValueError("no JSON-LD metadata block found")
    try:
        headline = json.loads(heading.string)['headline']
    except (json.JSONDecodeError, KeyError, TypeError) as exc:
        # TypeError covers JSON-LD whose top level is a list rather than a dict.
        raise ValueError("JSON-LD metadata has no usable headline") from exc

    body = soup.find('div', class_='oi-article-lt')
    if body is None:
        raise ValueError("article body container not found")

    paragraphs = '\n'.join(p.get_text(strip=True) for p in body.find_all('p'))
    return f"{headline}\n\n{paragraphs}"


def _extract_pdf_text(document):
    """Return the text of every page of ``document``, one page per line group."""
    with pdfplumber.open(document) as pdf:
        # extract_text() may yield None for a page without a text layer;
        # treat that as an empty page instead of failing the whole file.
        return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)


def run_function(url, documents):
    """Extract text from ``url`` or from the uploaded ``documents`` and show it.

    A non-empty ``url`` takes precedence: the page is fetched and its headline
    and article paragraphs are displayed. Otherwise each uploaded file is
    described with ``st.write`` and its text (txt, pdf or docx) is appended to
    one combined text area. Problems are reported with ``st.error`` /
    ``st.warning`` rather than raised.

    Args:
        url: Address typed by the user; may be empty.
        documents: Uploaded files (objects exposing ``name``, ``type``,
            ``size`` and ``read()``); may be ``None`` or empty.

    Returns:
        None. All output goes to the Streamlit page.
    """
    news = ""
    url = url.strip() if url else ""

    # Check if the user has provided a URL
    if url:
        try:
            # A timeout keeps an unresponsive server from hanging the app.
            response = requests.get(url, timeout=_REQUEST_TIMEOUT)
        except requests.exceptions.RequestException:
            st.error("Error: An exception occurred while fetching content from the URL.")
            return

        if response.status_code != 200:
            st.error("Error: Unable to fetch content from the provided URL.")
            return

        try:
            news = _extract_article(response.text)
        except ValueError as exc:
            # Pages that do not follow the expected article layout end up here.
            st.error(f"Error: Unable to extract the article from the page ({exc}).")
            return

        # Display the extracted text content from url
        st.text_area("Extracted Text", value=news, height=200)

    # Check if the user has provided at least one document. Truthiness is used
    # because a multi-file uploader yields an empty list when nothing is chosen.
    elif documents:
        for document in documents:
            st.write({
                "filename": document.name,
                "filetype": document.type,
                "filesize": document.size,
            })

            # Extract content from the txt file
            if document.type == "text/plain":
                try:
                    news += document.read().decode("utf-8")
                except UnicodeDecodeError:
                    st.error(f"Error: '{document.name}' is not valid UTF-8 text.")

            # Extract content from the pdf file
            elif document.type == "application/pdf":
                try:
                    news += _extract_pdf_text(document)
                except Exception as exc:
                    # Broad on purpose: one unreadable PDF should not abort
                    # the other uploads, but the failure must be visible.
                    st.error(f"Error: Unable to read PDF '{document.name}': {exc}")

            # Extract content from the docx file
            elif document.type == _DOCX_MIME or document.name.lower().endswith(".docx"):
                news += docx2txt.process(document)

            # Anything else (e.g. images) has no text extractor here.
            else:
                st.warning(f"Skipping '{document.name}': unsupported file type '{document.type}'.")

        # Display the extracted text content from file
        st.text_area("Extracted Text", value=news, height=200)

    else:
        # Neither a URL nor a document was supplied.
        st.error("Error: Please enter a URL or upload at least one document.")
|
148 |
+
|
149 |
+
|
150 |
+
|
151 |
+
# One row of inputs: URL box (60%), file uploader (20%), Enter button (20%).
url_column, upload_column, button_column = st.columns([0.6, 0.2, 0.2])

url = url_column.text_input(label='', placeholder="Enter URL")

documents = upload_column.file_uploader(
    "",
    type=["png", "jpg", "jpeg", "pdf", "txt", "docx"],
    accept_multiple_files=True,
)

# Run the extraction only on the rerun triggered by pressing the button.
if button_column.button("Enter"):
    run_function(url, documents)
|
styles.css
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/* styles.css */

/*
 * NOTE(review): selectors of the form .css-xxxxxx / .st-xx look like class
 * names generated by the Streamlit front-end build and are presumably
 * version-specific - re-check them after any Streamlit upgrade.
 */

/* body background */
body .stApp {
    background-color: black;
}


/* title - Odia Gen AI */
.title {
    text-align: center;
    margin-top: 100px;
    margin-bottom: 100px;
}


/* text in web */
.text {
    padding-bottom: 10px;
}


/* div which contains all the 3 columns */
.css-ocqkz7.e1tzin5v3 {
    width: 75%;
    margin-left: 12.5%;
}


/* column 1 */
/* hiding the "press enter to apply" hint text */
.css-1if5ada {
    visibility: hidden;
}
/* vertical padding of the input text box */
input.st-bg.st-c1.st-c2.st-c3.st-c4.st-c5.st-c6.st-c7.st-c8.st-c9.st-ca.st-b8.st-cb.st-cc.st-cd.st-ce.st-cf.st-cg.st-ch.st-ci.st-ae.st-af.st-ag.st-cj.st-ai.st-aj.st-c0.st-ck.st-cl.st-cm {
    padding-top: 0.6rem;
    padding-bottom: 0.6rem;
}

/* column 2 */
/* styling for the file browser */
/* removing the drag-and-drop area */
.css-u8hs99.exg6vvm14 {
    display: none;
}
/* the section which wraps the browse button */
section.css-z8f339.exg6vvm15 {
    padding: 0rem;
    margin-left: -1.2rem;
}
/* the browse button */
button.css-b3z5c9.edgvbvh10 {
    width: 100%;
}
/* the div which shows the attached files */
.css-fis6aj.exg6vvm10 {
    overflow: auto;
    width: 20rem;
    margin-left: -5rem;
}

/* column 3 */
/* Enter button */
div.stButton > button:first-child {
    background-color: rgb(204, 49, 49);
    width: 100%;
    margin-top: 32px;
}
.css-b3z5c9 {
    padding: 0.5rem 0.75rem;
}


/* text area */
.stTextArea {
    margin-top: 1rem;
}
|