Synced repo using 'sync_with_huggingface' Github Action
- .streamlit/config.toml +3 -0
- .vscode/settings.json +6 -0
- User Interface/User-Interface-multipages.pdf +0 -0
- app.py +34 -340
- pages/1_URLs.py +375 -0
- pages/2_Documents.py +138 -0
- requirements.txt +1 -0
- styles.css +27 -67
- utils/__pycache__/footer.cpython-38.pyc +0 -0
- utils/footer.py +38 -0
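
This commit restructures the single-page app.py into a Streamlit multipage app: app.py becomes a small landing page, while URL/sitemap extraction and document extraction move into pages/1_URLs.py and pages/2_Documents.py, with a shared footer in utils/footer.py. For orientation, the resulting layout looks roughly like this (paths taken from the file list above; the annotations are editorial, not part of the commit):

    app.py                    # landing page: title, description, contributors
    .streamlit/config.toml    # theme configuration
    pages/
        1_URLs.py             # extract text from a URL or a sitemap
        2_Documents.py        # extract text from uploaded PDF/TXT/DOCX files
    utils/
        footer.py             # cust_footer(), rendered on every page
    styles.css                # shared CSS loaded via load_css()

Streamlit treats every script under pages/ as a separate sidebar page, with the numeric filename prefix controlling the order.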
.streamlit/config.toml
ADDED
@@ -0,0 +1,3 @@
+[theme]
+base="light"
+
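Pinning base="light" keeps the app on Streamlit's light theme regardless of the viewer's system preference.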
.vscode/settings.json
ADDED
@@ -0,0 +1,6 @@
+{
+    "[python]": {
+        "editor.defaultFormatter": "ms-python.black-formatter"
+    },
+    "python.formatting.provider": "none"
+}
User Interface/User-Interface-multipages.pdf
ADDED
Binary file (103 kB).
app.py
CHANGED
@@ -1,29 +1,9 @@
-import justext
 import streamlit as st
-from lxml import etree
-# import streamlit.components.v1 as components
 
+# setting page config. for centered mode
+st.set_page_config(layout="centered")
 
+from utils.footer import cust_footer
-
-from PIL import Image
-import requests
-# import xml.dom.minidom
-from bs4 import BeautifulSoup
-# import json
-import docx2txt
-# import textract
-from PyPDF2 import PdfFileReader
-import pdfplumber
-import os
-
-
-
-# ---- LOAD ASSETS ----
-img_page_icon = Image.open("./olive_webscrapping.jpg")
-
-# Find more emojis here: https://www.webfx.com/tools/emoji-cheat-sheet/
-st.set_page_config(page_title="OdiaGenAI ", page_icon=img_page_icon, layout="wide")
-
 
 # Load CSS file
 def load_css(file_path):
@@ -34,324 +14,38 @@ def load_css(file_path):
 # Load CSS file
 load_css('styles.css')
 
-
-# ----- FUNCTIONS ----
-# function to check whether the url is a sitemap or not
-def check_sitemap(url):
-    # Check the URL's ending
-    if url.lower().endswith(('sitemap.xml', 'sitemap_index.xml', 'sitemap')):
-        try:
-            # Parse the content as XML
-            response = requests.get(url)
-            xml_content = etree.fromstring(response.content)
-            # Check for sitemap-specific elements
-            if xml_content.tag == 'urlset' or xml_content.tag == 'sitemapindex':
-                return True
-        except etree.XMLSyntaxError:
-            pass
-
-    # Additional conditions for identifying sitemaps
-    if 'sitemap' in url.lower():
-        # Perform additional checks specific to the website's structure or naming conventions
-        return True
-
-    return False
-
-
-# function to get urls from the site map and extract those data
-def extract_urls_from_sitemaps(xml_url):
-    # Make a GET request to the URL and extract the xml content
-    response = requests.get(xml_url)
-
-    soup = BeautifulSoup(response.text, 'xml')
-    extracted_urls = []
-
-    # check if the sitemap contains nested sitemaps
-    sitemap_tags = soup.find_all('sitemap')
-    if sitemap_tags:
-        # Process nested sitemaps
-        for sitemap_tag in sitemap_tags:
-            print("sitemap_tags:" + sitemap_tag)
-            nested_url = sitemap_tag.find('loc').text
-            print('nested_url:', nested_url)
-            nested_urls = extract_urls_from_sitemaps(nested_url)
-            extracted_urls.extend(nested_urls)
-    else:
-        # Extract URLs from the current sitemap
-        loc_tags = soup.find_all('loc')
-        for loc_tag in loc_tags:
-            # if loc_tag.parent.name != 'image':
-            url = loc_tag.text
-            if url.endswith('.pdf') or url.endswith('.jpg') or url.endswith('.jpeg'):
-                print(f"url skipped because it is a {url.split('.')[-1]}")
-            else:
-                print('url:', url)
-                extracted_urls.append(url)
-
-    return extracted_urls
-
-
-# function to check whether the entered url is valid
-def valid_url(url):
-    try:
-        # Make a GET request to the URL and extract the text content
-        response = requests.get(url)
-        if response.status_code == 200:
-            return True
-
-    except requests.exceptions.RequestException as e:
-        return False
-
-
-# function to create a custom stoplist for justext
-def custom_stoplist():
-    odia_stopwords = [
-        "ଏହି", "ଏକ", "ଏକାଉଣଟ", "ମୁଁ", "ମୋର", "ମୁଁ ନିଜେ", "ଆମେ", "ଆମର", "ଆମର", "ଆମେ ନିଜେ", "ତୁମେ", "ତୁମର", "ତୁମର",
-        "ନିଜେ", "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର",
-        "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର", "ନିଜେ", "ଏହା", "ଏହାର", "ନିଜେ |", "ସେମାନେ", "ସେଗୁଡିକ", "ସେମାନଙ୍କର",
-        "ସେମାନଙ୍କର", "ନିଜେ |", "କଣ", "ଯାହା", "କିଏ", "କାହାକୁ",
-        "ଏହା", "ତାହା", "ଏଗୁଡ଼ିକ", "ସେଗୁଡ଼ିକ", "ମୁଁ", "ହେଉଛି", "ହେଉଛି |", "ଥିଲା", "ଥିଲା |", "ହୁଅ", "ହୋଇସାରିଛି |", "ହେବା",
-        "ଅଛି", "ଅଛି", "ଥିଲା", "ଅଛି", "କର", "କରେ |",
-        "କରିଛନ୍ତି", "କରିବା", "ଏବଂ", "କିନ୍ତୁ", "ଯଦି", "କିମ୍ବା", "କାରଣ", "ଯେପରି", "ପର୍ଯ୍ୟନ୍ତ", "ଯେତେବେଳେ", "ର", "ପାଇଁ",
-        "ସହିତ", "ବିଷୟରେ", "ବିପକ୍ଷରେ", "ମଧ୍ୟରେ", "ଭିତରକୁ", "ମାଧ୍ୟମରେ",
-        "ସମୟରେ", "ପୂର୍ବରୁ", "ପରେ", "ଉପରେ", "ନିମ୍ନରେ |", "କୁ", "ଠାରୁ", "ଅପ୍", "ତଳକୁ", "ଭିତରେ", "ବାହାରେ", "ଉପରେ", "ବନ୍ଦ",
-        "ସମାପ୍ତ", "ତଳେ |", "ପୁନର୍ବାର", "ଆଗକୁ",
-        "ତାପରେ", "ଥରେ |", "ଏଠାରେ", "ସେଠାରେ", "କେବେ", "କେଉଁଠାରେ", "କିପରି", "ସମସ୍ତ", "ଉଭୟ", "ପ୍ରତ୍ୟେକ", "ଅଳ୍ପ", "ଅଧିକ",
-        "ଅଧିକାଂଶ", "ଅନ୍ୟ", "କେତେକ", "ଏହିପରି",
-        "ନୁହେଁ |", "କେବଳ", "ନିଜର", "ସମାନ", "ତେଣୁ", "ଅପେକ୍ଷା", "ମଧ୍ୟ", "ବହୁତ", "କରିପାରିବେ |", "ଇଚ୍ଛା", "କେବଳ",
-        "କରିବା ଉଚିତ", "ବର୍ତ୍ତମାନ"
-    ]
-    return frozenset(odia_stopwords)
-
-
-# function to extract data from url using justext
-def extract_data_from_url_(url):
-    response = requests.get(url)
-    response.raise_for_status()
-    page = response.content
-
-    data_url = ""
-    para = ""
-    paragraphs = justext.justext(page, custom_stoplist(), 70, 140, 0.0, 0.02, 0.5, 150, False)
-    for paragraph in paragraphs:
-        if not paragraph.is_boilerplate:
-            para = para + '\n' + paragraph.text
-
-    data_url = ('\n\nFrom url:' + url + '\n' + para + '\n')
-
-    return data_url
-
-
-sitemap_data = ""
-
-
-# function to get the text from pdf using PyPDF2
-def read_pdf(file):
-    pdfReader = PdfFileReader(file)
-    count = pdfReader.numPages
-    # all_page_text = ""
-    # for i in range(count):
-    #     page = pdfReader.getPage(i)
-    #     all_page_text += page.extractText()
-    #
-    # return all_page_text
-    return count
-
-
-# function to run the enter button
-def run_function(url, documents):
-    data = ""
-    # Check if the user has provided a URL
-    if url:
-        if valid_url(url):
-            data = extract_data_from_url_(url)
-            st.text_area("Extracted Text", value=data, height=200)
-            # return extract status, and the data extracted
-            return True, data
-        else:
-            return False, data
-
-    # Check if the user has provided a document
-    elif documents is not None:
-        for document in documents:
-            document_details = {
-                "filename": document.name,
-                "filetype": document.type,
-                "filesize": document.size
-            }
-            st.write(document_details)
-
-            # Extract content from the txt file
-            if document.type == "text/plain":
-                # Read as bytes
-                data += str(document.read(), "utf-8")
-
-            # Extract content from the pdf file
-            elif document.type == "application/pdf":
-                # using PyPDF2
-                # data += read_pdf(document)
-
-                # using pdfplumber
-                try:
-                    with pdfplumber.open(document) as pdf:
-                        all_text = ""
-                        for page in pdf.pages:
-                            text = page.extract_text()
-                            all_text += text + "\n"
-                        data += all_text
-                except requests.exceptions.RequestException as e:
-                    st.write("None")
-
-            # Extract content from the docx file
-            elif document.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
-                data += docx2txt.process(document)
-
-        # Display the extracted text content from file
-        st.write("attached")
-        st.text_area("Extracted Text", value=data, height=200)
-        # return extract status, and the data extracted
-        return True, data
-
-    else:
-        st.error("Error: An error occurred while fetching content.")
-        # return extract status, and the data extracted
-        return False, data
-
-
 def main():
-    #
-        if doc.name.split(".")[-1].lower() not in ["pdf", "txt", "docx"]:
-            # if documents is not the relevant type
-            st.error("Unsupported file: " + doc.name)
-
-    # Initialize state of button Enter
-    with col3:
-        st.write('##')
-        if "button_enter" not in st.session_state:
-            st.session_state.button_enter = False
-
-        if st.button("Enter"):
-            st.session_state.button_enter = True
-            # st.write("session state true")
-
-    if "extracted" not in st.session_state:
-        st.session_state.extracted = False
-    data = ""
-
-    # the enter button
-    if st.session_state.button_enter:
-        # check if it is a sitemap or not
-        if is_a_sitemap:
-            if "Initial" not in st.session_state:
-                st.session_state.Initial = True
-            # check whether its the initial state
-            if st.session_state.Initial == True:
-                # print("\n\n\n\n1)Initial State", st.session_state.Initial, "\n\n\n\n\n")
-                xml = url_or_xml
-                st.write("It is a sitemap")
-                stored_sitemap_urls = extract_urls_from_sitemaps(xml)
-                print('\nno. of urls: ', len(stored_sitemap_urls))
-
-                if stored_sitemap_urls:
-                    print(stored_sitemap_urls)
-                    for sitemap_url in stored_sitemap_urls:
-
-                        if valid_url(sitemap_url):
-                            print(sitemap_url)
-                            # using justext to extract data
-                            data = data + extract_data_from_url_(sitemap_url)
-                        else:
-                            st.error("Couldnt extract data from " + sitemap_url)
-
-                    if "sitemap_data" not in st.session_state:
-                        st.session_state.sitemap_data = data
-                    # print("\n\n\nst.session.data ", st.session_state.sitemap_data)
-                    # print("\n\n\n\nRUNNING \n\n\n\n")
-                    st.session_state.Initial = False
-                    print("\n\n\n\n2)Initial State", st.session_state.Initial, "\n\n\n\n\n")
-                    st.session_state.extracted = True
-                    # st.text_area("Extracted Text", value=st.session_state.sitemap_data, height=300)
-
-                else:
-                    st.error("Error: Invalid sitemap.")
-
-        else:
-            url = url_or_xml
-            st.session_state.extracted, data = run_function(url, documents)
-
-        if st.session_state.extracted:
-            if is_a_sitemap:
-                st.text_area("Extracted Text", value=st.session_state.sitemap_data, height=300)
-            col1, col2 = st.columns([0.5, 0.5])
-
-            with col1:
-                saved_button = False
-
-                if is_a_sitemap:
-                    saved_data = st.session_state.sitemap_data
-
-                    if st.download_button(
-                            label="Save",
-                            data=saved_data
-                    ):
-                        saved_button = True
-
-                else:
-                    if st.download_button(
-                            label="Save",
-                            data=data
-                    ):
-                        saved_button = True
-
-            with col2:
-                if st.button("Clear"):
-                    st.session_state.button_enter = False
-                    st.session_state.Initial = True
-                    st.session_state.extracted = False
-                    if 'sitemap_data' in st.session_state:
-                        del st.session_state['sitemap_data']
-                    st.session_state.button_enter = False
-                    st.experimental_rerun()
-
-            if saved_button:
-                # Confirmation message
-                st.success(f"File saved as {file_name} in the current directory.")
-
-        else:
-            st.warning("Data not extracted")
-
+    # Title of the app and description.
+    title = """
+    <div>
+        <p class="title">Olive Scrapper</p>
+    </div>
+    """
+    st.markdown(title, unsafe_allow_html=True)
+    st.write("#")
+    st.write("#")
+
+    introduction = """
+    <div>
+        <p class="text">Olive Scraper is a web scraping tool developed by OdiaGenAI for web scraping Odia contents from different sources (e.g., websites, PDF, DOC, etc.)</p>
+    </div>
+    """
+    st.markdown(introduction, unsafe_allow_html=True)
+
+    st.write("#")
+    st.write("###")
+    contributors = """
+    <div>
+        <p class="text">Contributors: Dr. Shantipriya Parida, Sambit, A.R. Kamaldeen, Prosper</p>
+    </div>
+    """
+    st.markdown(contributors, unsafe_allow_html=True)
+
+    # Add a success message to the sidebar
+    st.sidebar.success("Select a page above.")
+
+    # importing the custom footer from utils
+    cust_footer()
 
 if __name__ == "__main__":
     main()
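Both the old and the new app.py call st.set_page_config() immediately after importing Streamlit; Streamlit requires it to be the first Streamlit command executed in a script, which is presumably why the new files place it above the remaining imports. A minimal sketch of the pattern (illustrative, not part of the commit):

    import streamlit as st

    # Must run before any other st.* call in this script.
    st.set_page_config(layout="centered")

    from utils.footer import cust_footer  # later imports are fine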
pages/1_URLs.py
ADDED
@@ -0,0 +1,375 @@
+import streamlit as st
+
+# setting page config. for centered mode
+st.set_page_config(layout="centered")
+
+from utils.footer import cust_footer
+
+from lxml import etree
+import justext
+import concurrent.futures
+import datetime
+import requests
+from bs4 import BeautifulSoup
+import json
+
+# ----- FUNCTIONS -----
+# function to check whether the url is a sitemap or not
+def check_sitemap(url):
+    # Check the URL's ending
+    if url.lower().endswith(('sitemap.xml', 'sitemap_index.xml', 'sitemap')):
+        try:
+            # Parse the content as XML
+            response = requests.get(url)
+            xml_content = etree.fromstring(response.content)
+            # Check for sitemap-specific elements
+            if xml_content.tag == 'urlset' or xml_content.tag == 'sitemapindex':
+                return True
+        except etree.XMLSyntaxError:
+            pass
+
+    # Additional conditions for identifying sitemaps
+    if 'sitemap' in url.lower():
+        # Perform additional checks specific to the website's structure or naming conventions
+        return True
+
+    return False
+
+
+def extract_urls_from_sitemaps(xml_url):
+    # Make a GET request to the URL and extract the xml content
+    response = requests.get(xml_url)
+
+    soup = BeautifulSoup(response.text, 'xml')
+    extracted_urls = []
+
+    # check if the sitemap contains nested sitemaps
+    sitemap_tags = soup.find_all('sitemap')
+    if sitemap_tags:
+        # Process nested sitemaps
+        for sitemap_tag in sitemap_tags:
+            print("sitemap_tags:" + str(sitemap_tag))
+            nested_url = sitemap_tag.find('loc').text
+            print('nested_url:', nested_url)
+            nested_urls = extract_urls_from_sitemaps(nested_url)
+            extracted_urls.extend(nested_urls)
+    else:
+        # Extract URLs from the current sitemap
+        loc_tags = soup.find_all('loc')
+        for loc_tag in loc_tags:
+            # if loc_tag.parent.name != 'image':
+            url = loc_tag.text
+            if url.endswith('.pdf') or url.endswith('.jpg') or url.endswith('.jpeg'):
+                print(f"url skipped because it is a {url.split('.')[-1]}")
+            else:
+                print('url:', url)
+                extracted_urls.append(url)
+
+    return extracted_urls
+
+
+# function to check whether the entered url is valid
+def valid_url(url):
+    try:
+        # Make a GET request to the URL and extract the text content
+        response = requests.get(url)
+        if response.status_code == 200:
+            return True
+
+    except requests.exceptions.RequestException as e:
+        return False
+
+
+# function to create a custom stoplist for justext
+def custom_stoplist():
+    odia_stopwords = [
+        "ଏହି", "ଏକ", "ଏକାଉଣଟ", "ମୁଁ", "ମୋର", "ମୁଁ ନିଜେ", "ଆମେ", "ଆମର", "ଆମର", "ଆମେ ନିଜେ", "ତୁମେ", "ତୁମର", "ତୁମର",
+        "ନିଜେ", "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର",
+        "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର", "ନିଜେ", "ଏହା", "ଏହାର", "ନିଜେ |", "ସେମାନେ", "ସେଗୁଡିକ", "ସେମାନଙ୍କର",
+        "ସେମାନଙ୍କର", "ନିଜେ |", "କଣ", "ଯାହା", "କିଏ", "କାହାକୁ",
+        "ଏହା", "ତାହା", "ଏଗୁଡ଼ିକ", "ସେଗୁଡ଼ିକ", "ମୁଁ", "ହେଉଛି", "ହେଉଛି |", "ଥିଲା", "ଥିଲା |", "ହୁଅ", "ହୋଇସାରିଛି |", "ହେବା",
+        "ଅଛି", "ଅଛି", "ଥିଲା", "ଅଛି", "କର", "କରେ |",
+        "କରିଛନ୍ତି", "କରିବା", "ଏବଂ", "କିନ୍ତୁ", "ଯଦି", "କିମ୍ବା", "କାରଣ", "ଯେପରି", "ପର୍ଯ୍ୟନ୍ତ", "ଯେତେବେଳେ", "ର", "ପାଇଁ",
+        "ସହିତ", "ବିଷୟରେ", "ବିପକ୍ଷରେ", "ମଧ୍ୟରେ", "ଭିତରକୁ", "ମାଧ୍ୟମରେ",
+        "ସମୟରେ", "ପୂର୍ବରୁ", "ପରେ", "ଉପରେ", "ନିମ୍ନରେ |", "କୁ", "ଠାରୁ", "ଅପ୍", "ତଳକୁ", "ଭିତରେ", "ବାହାରେ", "ଉପରେ", "ବନ୍ଦ",
+        "ସମାପ୍ତ", "ତଳେ |", "ପୁନର୍ବାର", "ଆଗକୁ",
+        "ତାପରେ", "ଥରେ |", "ଏଠାରେ", "ସେଠାରେ", "କେବେ", "କେଉଁଠାରେ", "କିପରି", "ସମସ୍ତ", "ଉଭୟ", "ପ୍ରତ୍ୟେକ", "ଅଳ୍ପ", "ଅଧିକ",
+        "ଅଧିକାଂଶ", "ଅନ୍ୟ", "କେତେକ", "ଏହିପରି",
+        "ନୁହେଁ |", "କେବଳ", "ନିଜର", "ସମାନ", "ତେଣୁ", "ଅପେକ୍ଷା", "ମଧ୍ୟ", "ବହୁତ", "କରିପାରିବେ |", "ଇଚ୍ଛା", "କେବଳ",
+        "କରିବା ଉଚିତ", "ବର୍ତ୍ତମାନ"
+    ]
+    return frozenset(odia_stopwords)
+
+
+# function to extract data from url using justext
+def extract_data_from_url_(url):
+    response = requests.get(url)
+    response.raise_for_status()
+    page = response.content
+
+    para = ""
+    paragraphs = justext.justext(page, custom_stoplist(), 70, 140, 0.0, 0.02, 0.5, 150, False)
+    for paragraph in paragraphs:
+        if not paragraph.is_boilerplate:
+            para = para + '\n' + paragraph.text
+
+    return para
+
+
+sitemap_data = ""
+
+
+# function to process a batch of URLS in sitemaps
+def process_urls(sitemap_urls):
+
+    extracted_txt = ""
+    extracted_jsonl_list = []
+    for url in sitemap_urls:
+        if valid_url(url):
+            print(url)
+            # using justext to extract data
+            temp_para = extract_data_from_url_(url)
+            temp_txt_data = ('\n\nFrom url:' + url + '\n' + temp_para + '\n')
+            temp_jsonl_data = {"text": temp_para, "url": url}
+            extracted_txt += temp_txt_data
+            extracted_jsonl_list.append(temp_jsonl_data)
+        else:
+            st.error("Couldnt extract data from " + url)
+
+    # Convert data_list to JSONL string
+    extracted_jsonl_list_encoded = [json.dumps(data, ensure_ascii=False) for data in extracted_jsonl_list]
+    extracted_jsonl = '\n'.join(extracted_jsonl_list_encoded)
+
+    return extracted_txt, extracted_jsonl
+
+
+# function to process for a single URL
+def run_function(url):
+    extracted_txt = ""
+    # Check if the user has provided a URL
+    if url:
+        if valid_url(url):
+            temp_para = extract_data_from_url_(url)
+            temp_txt_data = ('\n\nFrom url:' + url + '\n' + temp_para + '\n')
+            extracted_txt = temp_txt_data
+            extracted_jsonl = {"text": str(temp_para), "url": str(url)}
+
+            # displaying extracted txt for single URL
+            st.text_area("Extracted Text", value=extracted_txt, height=200)
+
+            extracted_jsonl = json.dumps(extracted_jsonl, ensure_ascii=False)
+
+            # return extract status, and the data extracted
+            return True, extracted_txt, extracted_jsonl
+        else:
+            return False, None, None
+    else:
+        st.error("Error: An error occurred while fetching content.")
+        # return extract status, and the data extracted
+        return False, None, None
+
+
+def main():
+    st.subheader("Extract Data from URLs")
+
+    # dividing the body section into 2 columns for url and enter button
+    col1, col2 = st.columns([0.7, 0.3])
+
+    with col1:
+        url_or_xml = st.text_input(label='', placeholder="Enter URL")
+        is_a_sitemap = check_sitemap(url_or_xml)
+
+    with col2:
+        st.write('##')
+        if "button_enter_url" not in st.session_state:
+            st.session_state.button_enter_url = False
+
+        if st.button("Enter"):
+            st.session_state.button_enter_url = True
+
+    if "extracted_url" not in st.session_state:
+        st.session_state.extracted_url = False
+    data = ""
+
+    # the enter button
+    if st.session_state.button_enter_url:
+        # check if it is a sitemap or not
+        if is_a_sitemap:
+            if "Initial" not in st.session_state:
+                st.session_state.Initial = True
+            # check whether its the initial state
+            if st.session_state.Initial == True:
+
+                xml = url_or_xml
+                st.write("It is a sitemap")
+                stored_sitemap_urls = extract_urls_from_sitemaps(xml)
+                print('\nno. of urls: ', len(stored_sitemap_urls))
+                st.write('no. of urls {}', format(len(stored_sitemap_urls)))
+
+                if stored_sitemap_urls:
+                    print(stored_sitemap_urls)
+                    current_time = datetime.datetime.now()
+                    print(current_time)
+                    st.write(current_time)
+                    # for sitemap_url in stored_sitemap_urls:
+                    #     if valid_url(sitemap_url):
+                    #         print(sitemap_url)
+                    #         # using justext to extract data
+                    #         data = data + extract_data_from_url_(sitemap_url)
+                    #     else:
+                    #         st.error("Couldnt extract data from " + sitemap_url)
+
+                    num_threads = 16  # Number of threads to use
+
+                    # Calculate the split size for each thread
+                    split_size = len(stored_sitemap_urls) // num_threads
+
+                    # Create a ThreadPoolExecutor with maximum `num_threads` threads
+                    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
+                        futures = []
+                        for i in range(num_threads):
+                            start_index = i * split_size
+                            end_index = start_index + split_size if i != num_threads - 1 else None
+                            temp_urls = stored_sitemap_urls[start_index:end_index]
+                            future = executor.submit(process_urls, temp_urls)
+                            futures.append(future)
+
+                        # Retrieve the extracted data from each thread
+                        text_data = []
+                        jsonl_data = []
+                        for future in futures:
+                            text_result, jsonl_result = future.result()
+                            text_data.append(text_result)
+                            jsonl_data.append(jsonl_result)
+
+                    # Combine the extracted data from all threads
+                    combined_text_data = ''.join(text_data)
+                    combined_jsonl_data = '\n'.join(jsonl_data)
+
+                    # Use the combined data as needed
+                    # print("Combined Text Data:")
+                    # print(combined_text_data)
+                    # print("Combined JSONL Data:")
+                    # print(combined_jsonl_data)
+
+                    if "sitemap_data_jsonl" not in st.session_state:
+                        st.session_state.sitemap_data_jsonl = combined_jsonl_data
+                    if "sitemap_data_text" not in st.session_state:
+                        st.session_state.sitemap_data_text = combined_text_data
+
+                    current_time = datetime.datetime.now()
+                    print(current_time)
+                    st.write(current_time)
+                    st.session_state.Initial = False
+                    print("\n\n\n\n2)Initial State", st.session_state.Initial, "\n\n\n\n\n")
+                    st.session_state.extracted_url = True
+
+                else:
+                    st.error("Error: Invalid sitemap.")
+
+        else:
+            url = url_or_xml
+            st.session_state.extracted_url, data_txt, data_jsonl = run_function(url)
+
+        if st.session_state.extracted_url:
+            # displaying extracted txt for sitemaps
+            if is_a_sitemap:
+                st.text_area("Extracted Text", value=st.session_state.sitemap_data_text, height=300)
+
+            save_as, checkbox_c1, checkbox_c2 = st.columns([0.33, 0.33, 0.33])
+
+            # initializing the checbox bool
+            save_as_txt = False
+            save_as_json = False
+            saved_successfully = False
+
+            with save_as:
+                st.write("Save as ")
+            with checkbox_c1:
+                save_as_txt = st.checkbox("text", value=False)
+
+            with checkbox_c2:
+                save_as_json = st.checkbox("jsonl", value=False)
+
+            if not save_as_txt and not save_as_json:
+                if st.button("Clear"):
+                    st.session_state.button_enter_url = False
+                    st.session_state.Initial = True
+                    st.session_state.extracted_url = False
+                    if 'sitemap_data_text' in st.session_state:
+                        del st.session_state['sitemap_data_text']
+                    if 'sitemap_data_jsonl' in st.session_state:
+                        del st.session_state['sitemap_data_jsonl']
+                    st.session_state.button_enter_url = False
+                    st.experimental_rerun()
+            else:
+                col1, col2 = st.columns([0.5, 0.5])
+                # save column
+                with col1:
+                    if is_a_sitemap:
+                        if save_as_txt:
+                            if st.download_button(label="Save as txt", data=st.session_state.sitemap_data_text):
+                                saved_successfully = True
+                        if save_as_json:
+                            if st.download_button(label="Save as jsonl", data=st.session_state.sitemap_data_jsonl, mime="application/json"):
+                                saved_successfully = True
+                    else:
+                        if save_as_txt:
+                            if st.download_button(label="Save as txt", data=data_txt):
+                                saved_successfully = True
+                        if save_as_json:
+                            if st.download_button(label="Save as jsonl", data=data_jsonl, mime="application/json"):
+                                saved_successfully = True
+
+                # clear column
+                with col2:
+                    if st.button("Clear"):
+                        st.session_state.button_enter_url = False
+                        st.session_state.Initial = True
+                        st.session_state.extracted_url = False
+                        if 'sitemap_data_text' in st.session_state:
+                            del st.session_state['sitemap_data_text']
+                        if 'sitemap_data_jsonl' in st.session_state:
+                            del st.session_state['sitemap_data_jsonl']
+                        st.session_state.button_enter_url = False
+                        st.experimental_rerun()
+
+            if saved_successfully:
+                # Confirmation message
+                st.success(f"File saved successfully.")
+
+        else:
+            st.warning("Data not extracted")
+            if st.button("clear"):
+                st.session_state.button_enter_url = False
+                st.session_state.extracted_url = False
+                st.experimental_rerun()
+
+    # Add a success message to the sidebar
+    st.sidebar.success("Select a page above.")
+
+    # importing the custom footer from utils
+    cust_footer()
+
+
+if __name__ == "__main__":
+    main()
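The sitemap branch above fans the collected URLs out over a fixed pool of 16 threads: the list is cut into num_threads slices, each slice is handed to process_urls() via executor.submit(), and the last slice uses an end index of None so it also absorbs any remainder. A self-contained sketch of the same splitting logic, with a hypothetical fetch_batch() standing in for process_urls():

    import concurrent.futures

    def fetch_batch(urls):
        # placeholder worker: return one result per URL
        return [len(u) for u in urls]

    urls = [f"https://example.com/page{i}" for i in range(50)]
    num_threads = 16
    split_size = len(urls) // num_threads  # 3 in this example

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = []
        for i in range(num_threads):
            start = i * split_size
            end = start + split_size if i != num_threads - 1 else None
            futures.append(executor.submit(fetch_batch, urls[start:end]))
        results = [f.result() for f in futures]

One edge case worth noting when reading the page's output: with fewer URLs than threads, split_size becomes 0, so every slice except the last is empty and the final slice carries the whole list.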
pages/2_Documents.py
ADDED
@@ -0,0 +1,138 @@
+import streamlit as st
+
+# setting page config. for centered mode
+st.set_page_config(layout="centered")
+
+
+from utils.footer import cust_footer
+import docx2txt
+import requests
+import pdfplumber
+
+# function to run the enter button
+def run_function(documents):
+    data = ""
+    if documents is not None:
+        for document in documents:
+            document_details = {
+                "filename": document.name,
+                "filetype": document.type,
+                "filesize": document.size
+            }
+            st.write(document_details)
+
+            # Extract content from the txt file
+            if document.type == "text/plain":
+                # Read as bytes
+                data += str(document.read(), "utf-8")
+
+            # Extract content from the pdf file
+            elif document.type == "application/pdf":
+                # using PyPDF2
+                # data += read_pdf(document)
+
+                # using pdfplumber
+                try:
+                    with pdfplumber.open(document) as pdf:
+                        all_text = ""
+                        for page in pdf.pages:
+                            text = page.extract_text()
+                            all_text += text + "\n"
+                        data += all_text
+                except requests.exceptions.RequestException as e:
+                    st.write("None")
+
+            # Extract content from the docx file
+            elif document.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+                data += docx2txt.process(document)
+
+        # Display the extracted text content from file
+        st.text_area("Extracted Text", value=data, height=200)
+        # return extract status, and the data extracted
+        return True, data
+
+    else:
+        st.error("Error: An error occurred while fetching content.")
+        # return extract status, and the data extracted
+        return False, data
+
+
+def main():
+
+    st.subheader("Extract Data from Documents")
+
+    documents = st.file_uploader(
+        "", type=["pdf", "txt", "docx"], accept_multiple_files=True
+    )
+
+    if "button_enter_doc" not in st.session_state:
+        st.session_state.button_enter_doc = False
+
+    if "extracted_doc" not in st.session_state:
+        st.session_state.extracted_doc = False
+    data = ""
+
+    if st.button("Enter"):
+        st.session_state.button_enter_doc = True
+
+    # the enter button
+    if st.session_state.button_enter_doc:
+        # check if it is a sitemap or not
+        if not documents:
+            documents = None
+        else:
+            for doc in documents:
+                if doc.name.split(".")[-1].lower() not in ["pdf", "txt", "docx"]:
+                    # if documents is not the relevant type
+                    st.error("Unsupported file: " + doc.name)
+
+        st.session_state.extracted_doc, data = run_function(documents)
+
+        if st.session_state.extracted_doc:
+            col1, col2 = st.columns([0.5, 0.5])
+            with col1:
+                saved_button = False
+
+                if st.download_button(
+                        label="Save",
+                        data=data
+                ):
+                    saved_button = True
+
+            with col2:
+                if st.button("Clear"):
+                    st.session_state.button_enter_doc = False
+                    st.session_state.extracted_doc = False
+                    st.experimental_rerun()
+
+            if saved_button:
+                # Confirmation message
+                st.success(f"File saved successfully.")
+
+        else:
+            st.warning("Data not extracted")
+            if st.button("clear"):
+                st.session_state.button_enter_doc = False
+                st.session_state.extracted_doc = False
+                st.experimental_rerun()
+
+    # Add a success message to the sidebar
+    st.sidebar.success("Select a page above.")
+
+    # importing the custom footer from utils
+    cust_footer()
+
+
+if __name__ == "__main__":
+    main()
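One caveat in the pdfplumber loop above (and in the identical loop removed from app.py): page.extract_text() can return None for pages with no extractable text, and all_text += text + "\n" then raises a TypeError that the surrounding except requests.exceptions.RequestException clause does not catch. A more defensive variant, as an editorial sketch rather than what the commit does:

    import pdfplumber

    def pdf_to_text(file_like):
        # Collect text page by page, skipping image-only pages instead of crashing.
        parts = []
        with pdfplumber.open(file_like) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if text:  # extract_text() may return None or an empty string
                    parts.append(text)
        return "\n".join(parts)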
requirements.txt
CHANGED
@@ -6,3 +6,4 @@ pdfplumber
 justext
 Pillow
 requests
+lxml
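The only dependency change is the new lxml entry, which the sitemap check in pages/1_URLs.py needs for its etree.fromstring() parse.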
styles.css
CHANGED
@@ -1,75 +1,35 @@
-.
-    margin-left: 15%;
-}
-
-/* drag drop div */
-.exg6vvm14 {
-    display: none;
-}
-/* resizing the attach button */
-.exg6vvm15 > button {
-    width: 100%;
-    height: 100%;
-}
-/* div which wraps attach button */
-.exg6vvm15 {
-    padding: 0;
-    margin-left: -15px;
-    height: 40px;
-}
-
-#root > div:nth-child(1) > div.withScreencast > div > div > div > section > div:nth-child(1) > div:nth-child(1) > div > div:nth-child(4) > div > div:nth-child(2) > div:nth-child(3) > div:nth-child(1) > div > div:nth-child(2) > div > button :hover,:active{
-    color: white;
-}
-
-/*
-    height: 40px;
-    margin-top: 20px;
-    width: 70%;
-    margin-left: 30%;
-}
-#root > div:nth-child(1) > div.withScreencast > div > div > div > section > div:nth-child(1) > div:nth-child(1) > div > div:nth-child(4) > div > div:nth-child(4) > div:nth-child(1) > div:nth-child(1) > div > div > div > button:hover, :active {
-    border: rgb(0,255,127);
-    color: white;
-}
-
-/*
-    color: black;
-    height: 40px;
-    margin-top: 20px;
-    width: 70%;
-    margin-right: 30%;
-}
-#root > div:nth-child(1) > div.withScreencast > div > div > div > section > div:nth-child(1) > div:nth-child(1) > div > div:nth-child(4) > div > div:nth-child(4) > div:nth-child(2) > div:nth-child(1) > div > div > div > button:hover, :active {
-    color: white;
-}
+.title {
+    font-family: "Source Sans Pro", sans-serif;
+    font-weight: 600;
+    padding: 1.25rem 0px 1rem;
+    margin: 0px;
+    margin-left: -2%;
+    line-height: 1.2;
+    font-size: 70px;
+    text-decoration: underline 2px;
+    text-underline-position: under;
 }
 
+@media (max-width: 768px) {
+    .title {
+        font-size: xx-large;
+        margin-left: auto;
+    }
 }
 
+.text {
+    font-family: "Source Sans Pro", sans-serif;
+    font-size: 1.25pc;
+    padding: 1.25rem 0px 1rem;
+    margin: 0px;
 }
 
+/* #root > div:nth-child(1) > div.withScreencast > div > div > div > section > div:nth-child(1) {
+    background-color: #3498db;
+    padding: 5px;
+} */
 
+/* #root > div:nth-child(1) > div.withScreencast > div > div > div > section.css-1cypcdb.e1fqkh3o11 > div.css-6qob1r.e1fqkh3o3 > div.css-1b9x38r.e1fqkh3o2 > button{
+    visibility: hidden;
+} */
utils/__pycache__/footer.cpython-38.pyc
ADDED
Binary file (1.1 kB).
utils/footer.py
ADDED
@@ -0,0 +1,38 @@
+import streamlit as st
+
+def cust_footer():
+    footer = """
+    <style>
+        footer {
+            visibility: hidden !important;
+        }
+
+        .divfooter {
+            display: flex;
+            justify-content: center;
+            align-items: center;
+            position: fixed;
+            left: 0;
+            bottom: 0;
+            width: 100%;
+            padding: 10px;
+            border-top: 2px solid grey;
+            background: white;
+        }
+        @media (min-width: 768px) {
+            .divfooter {
+                justify-content: center;
+                padding-left: 10%;
+            }
+        }
+    </style>
+    <div class="divfooter">
+        <p style="margin-bottom: 0px">© 2023 Odia Generative AI</p>
+    </div>
+    """
+    st.markdown(footer, unsafe_allow_html=True)