sam2ai committed
Commit
26998f0
1 Parent(s): d611569

Synced repo using 'sync_with_huggingface' Github Action

.streamlit/config.toml ADDED
@@ -0,0 +1,3 @@
+ [theme]
+ base="light"
+
.vscode/settings.json ADDED
@@ -0,0 +1,6 @@
+ {
+     "[python]": {
+         "editor.defaultFormatter": "ms-python.black-formatter"
+     },
+     "python.formatting.provider": "none"
+ }
User Interface/User-Interface-multipages.pdf ADDED
Binary file (103 kB).
 
app.py CHANGED
@@ -1,29 +1,9 @@
- import justext
  import streamlit as st
- from lxml import etree
- # import streamlit.components.v1 as components

+ # setting page config. for centered mode
+ st.set_page_config(layout="centered")

- # File Processing pkgs
- from PIL import Image
- import requests
- # import xml.dom.minidom
- from bs4 import BeautifulSoup
- # import json
- import docx2txt
- # import textract
- from PyPDF2 import PdfFileReader
- import pdfplumber
- import os
-
-
-
- # ---- LOAD ASSETS ----
- img_page_icon = Image.open("./olive_webscrapping.jpg")
-
- # Find more emojis here: https://www.webfx.com/tools/emoji-cheat-sheet/
- st.set_page_config(page_title="OdiaGenAI ", page_icon=img_page_icon, layout="wide")
-
+ from utils.footer import cust_footer

  # Load CSS file
  def load_css(file_path):
@@ -34,324 +14,38 @@ def load_css(file_path):
  # Load CSS file
  load_css('styles.css')

-
- # ----- FUNCTIONS ----
- # function to check whether the url is a sitemap or not
- def check_sitemap(url):
-     # Check the URL's ending
-     if url.lower().endswith(('sitemap.xml', 'sitemap_index.xml', 'sitemap')):
-         try:
-             # Parse the content as XML
-             response = requests.get(url)
-             xml_content = etree.fromstring(response.content)
-             # Check for sitemap-specific elements
-             if xml_content.tag == 'urlset' or xml_content.tag == 'sitemapindex':
-                 return True
-         except etree.XMLSyntaxError:
-             pass
-
-     # Additional conditions for identifying sitemaps
-     if 'sitemap' in url.lower():
-         # Perform additional checks specific to the website's structure or naming conventions
-         return True
-
-     return False
-
-
- # function to get urls from the site map and extract those data
- def extract_urls_from_sitemaps(xml_url):
-     # Make a GET request to the URL and extract the xml content
-     response = requests.get(xml_url)
-
-     soup = BeautifulSoup(response.text, 'xml')
-     extracted_urls = []
-
-     # check if the sitemap contains nested sitemaps
-     sitemap_tags = soup.find_all('sitemap')
-     if sitemap_tags:
-         # Process nested sitemaps
-         for sitemap_tag in sitemap_tags:
-             print("sitemap_tags:" + sitemap_tag)
-             nested_url = sitemap_tag.find('loc').text
-             print('nested_url:', nested_url)
-             nested_urls = extract_urls_from_sitemaps(nested_url)
-             extracted_urls.extend(nested_urls)
-     else:
-         # Extract URLs from the current sitemap
-         loc_tags = soup.find_all('loc')
-         for loc_tag in loc_tags:
-             # if loc_tag.parent.name != 'image':
-             url = loc_tag.text
-             if url.endswith('.pdf') or url.endswith('.jpg') or url.endswith('.jpeg'):
-                 print(f"url skipped because it is a {url.split('.')[-1]}")
-             else:
-                 print('url:', url)
-                 extracted_urls.append(url)
-
-     return extracted_urls
-
-
- # function to check whether the entered url is valid
- def valid_url(url):
-     try:
-         # Make a GET request to the URL and extract the text content
-         response = requests.get(url)
-         if response.status_code == 200:
-             return True
-
-     except requests.exceptions.RequestException as e:
-         return False
-
-
- # function to create a custom stoplist for justext
- def custom_stoplist():
-     odia_stopwords = [
-         "ଏହି", "ଏକ", "ଏକାଉଣଟ", "ମୁଁ", "ମୋର", "ମୁଁ ନିଜେ", "ଆମେ", "ଆମର", "ଆମର", "ଆମେ ନିଜେ", "ତୁମେ", "ତୁମର", "ତୁମର",
-         "ନିଜେ", "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର",
-         "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର", "ନିଜେ", "ଏହା", "ଏହାର", "ନିଜେ |", "ସେମାନେ", "ସେଗୁଡିକ", "ସେମାନଙ୍କର",
-         "ସେମାନଙ୍କର", "ନିଜେ |", "କଣ", "ଯାହା", "କିଏ", "କାହାକୁ",
-         "ଏହା", "ତାହା", "ଏଗୁଡ଼ିକ", "ସେଗୁଡ଼ିକ", "ମୁଁ", "ହେଉଛି", "ହେଉଛି |", "ଥିଲା", "ଥିଲା |", "ହୁଅ", "ହୋଇସାରିଛି |", "ହେବା",
-         "ଅଛି", "ଅଛି", "ଥିଲା", "ଅଛି", "କର", "କରେ |",
-         "କରିଛନ୍ତି", "କରିବା", "ଏବଂ", "କିନ୍ତୁ", "ଯଦି", "କିମ୍ବା", "କାରଣ", "ଯେପରି", "ପର୍ଯ୍ୟନ୍ତ", "ଯେତେବେଳେ", "ର", "ପାଇଁ",
-         "ସହିତ", "ବିଷୟରେ", "ବିପକ୍ଷରେ", "ମଧ୍ୟରେ", "ଭିତରକୁ", "ମାଧ୍ୟମରେ",
-         "ସମୟରେ", "ପୂର୍ବରୁ", "ପରେ", "ଉପରେ", "ନିମ୍ନରେ |", "କୁ", "ଠାରୁ", "ଅପ୍", "ତଳକୁ", "ଭିତରେ", "ବାହାରେ", "ଉପରେ", "ବନ୍ଦ",
-         "ସମାପ୍ତ", "ତଳେ |", "ପୁନର୍ବାର", "ଆଗକୁ",
-         "ତାପରେ", "ଥରେ |", "ଏଠାରେ", "ସେଠାରେ", "କେବେ", "କେଉଁଠାରେ", "କିପରି", "ସମସ୍ତ", "ଉଭୟ", "ପ୍ରତ୍ୟେକ", "ଅଳ୍ପ", "ଅଧିକ",
-         "ଅଧିକାଂଶ", "ଅନ୍ୟ", "କେତେକ", "ଏହିପରି",
-         "ନୁହେଁ |", "କେବଳ", "ନିଜର", "ସମାନ", "ତେଣୁ", "ଅପେକ୍ଷା", "ମଧ୍ୟ", "ବହୁତ", "କରିପାରିବେ |", "ଇଚ୍ଛା", "କେବଳ",
-         "କରିବା ଉଚିତ", "ବର୍ତ୍ତମାନ"
-     ]
-     return frozenset(odia_stopwords)
-
-
- # function to extract data from url using justext
- def extract_data_from_url_(url):
-     response = requests.get(url)
-     response.raise_for_status()
-     page = response.content
-
-     data_url = ""
-     para = ""
-     paragraphs = justext.justext(page, custom_stoplist(), 70, 140, 0.0, 0.02, 0.5, 150, False)
-     for paragraph in paragraphs:
-         if not paragraph.is_boilerplate:
-             para = para + '\n' + paragraph.text
-
-     data_url = ('\n\nFrom url:' + url + '\n' + para + '\n')
-
-     return data_url
-
-
- sitemap_data = ""
-
-
- # function to get the text from pdf using PyPDF2
- def read_pdf(file):
-     pdfReader = PdfFileReader(file)
-     count = pdfReader.numPages
-     # all_page_text = ""
-     # for i in range(count):
-     #     page = pdfReader.getPage(i)
-     #     all_page_text += page.extractText()
-     #
-     # return all_page_text
-     return count
-
-
- # function to run the enter button
- def run_function(url, documents):
-     data = ""
-     # Check if the user has provided a URL
-     if url:
-         if valid_url(url):
-             data = extract_data_from_url_(url)
-             st.text_area("Extracted Text", value=data, height=200)
-             # return extract status, and the data extracted
-             return True, data
-         else:
-             return False, data
-
-
-     # Check if the user has provided a document
-     elif documents is not None:
-         for document in documents:
-             document_details = {
-                 "filename": document.name,
-                 "filetype": document.type,
-                 "filesize": document.size
-             }
-             st.write(document_details)
-
-             # Extract content from the txt file
-             if document.type == "text/plain":
-                 # Read as bytes
-                 data += str(document.read(), "utf-8")
-
-             # Extract content from the pdf file
-             elif document.type == "application/pdf":
-                 # using PyPDF2
-                 # data += read_pdf(document)
-
-                 # using pdfplumber
-                 try:
-                     with pdfplumber.open(document) as pdf:
-                         all_text = ""
-                         for page in pdf.pages:
-                             text = page.extract_text()
-                             all_text += text + "\n"
-                         data += all_text
-                 except requests.exceptions.RequestException as e:
-                     st.write("None")
-
-             # Extract content from the docx file
-             elif document.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
-                 data += docx2txt.process(document)
-
-         # Display the extracted text content from file
-         st.write("attached")
-         st.text_area("Extracted Text", value=data, height=200)
-         # return extract status, and the data extracted
-         return True, data
-
-     else:
-         st.error("Error: An error occurred while fetching content.")
-         # return extract status, and the data extracted
-         return False, data
-
-
  def main():
-     # ---- HEADER SECTION ----
-     with st.container():
-         st.subheader("Hi!! :wave:")
-         st.write("##")
-         st.markdown("<h5 class='text'>OdiaGenAI is a collaborative initiative that conducts research on </h5>",
-                     unsafe_allow_html=True)
-         st.markdown("<h5>Generative AI and LLM for the Odia Language.</h5>", unsafe_allow_html=True)
-         # st.title("Odia Generative AI")
-
-         st.markdown("<h1 class='title'>Odia Generative AI</h1>", unsafe_allow_html=True)
-
-     # ---- BODY SECTION ----
-     with st.container():
-         st.subheader("Collecting monolingual data (Odia or any Indic Languages)")
-
-         # dividing the body section into 3 columns for url, attach button and enter button
-         col1, col2, col3 = st.columns([0.6, 0.2, 0.2])
-         # url/xml
-         with col1:
-
-             url_or_xml = st.text_input(label='', placeholder="Enter URL")
-             is_a_sitemap = check_sitemap(url_or_xml)
-
-         # attached files
-         with col2:
-
-             documents = st.file_uploader("", type=["pdf", "txt", "docx"], accept_multiple_files=True)
-             if not documents:
-                 documents = None
-             else:
-                 for doc in documents:
-                     if doc.name.split(".")[-1].lower() not in ["pdf", "txt", "docx"]:
-                         # if documents is not the relevant type
-                         st.error("Unsupported file: " + doc.name)
-
-         # Initialize state of button Enter
-         with col3:
-             st.write('##')
-             if "button_enter" not in st.session_state:
-                 st.session_state.button_enter = False
-
-             if st.button("Enter"):
-                 st.session_state.button_enter = True
-                 # st.write("session state true")
-
-     if "extracted" not in st.session_state:
-         st.session_state.extracted = False
-     data = ""
-
-     # the enter button
-     if st.session_state.button_enter:
-         # check if it is a sitemap or not
-         if is_a_sitemap:
-             if "Initial" not in st.session_state:
-                 st.session_state.Initial = True
-             # check whether its the initial state
-             if st.session_state.Initial == True:
-                 # print("\n\n\n\n1)Initial State", st.session_state.Initial, "\n\n\n\n\n")
-                 xml = url_or_xml
-                 st.write("It is a sitemap")
-                 stored_sitemap_urls = extract_urls_from_sitemaps(xml)
-                 print('\nno. of urls: ', len(stored_sitemap_urls))
-
-                 if stored_sitemap_urls:
-                     print(stored_sitemap_urls)
-                     for sitemap_url in stored_sitemap_urls:
-
-                         if valid_url(sitemap_url):
-                             print(sitemap_url)
-                             # using justext to extract data
-                             data = data + extract_data_from_url_(sitemap_url)
-                         else:
-                             st.error("Couldnt extract data from " + sitemap_url)
-
-                     if "sitemap_data" not in st.session_state:
-                         st.session_state.sitemap_data = data
-                         # print("\n\n\nst.session.data ", st.session_state.sitemap_data)
-                     # print("\n\n\n\nRUNNING \n\n\n\n")
-                     st.session_state.Initial = False
-                     print("\n\n\n\n2)Initial State", st.session_state.Initial, "\n\n\n\n\n")
-                     st.session_state.extracted = True
-                     # st.text_area("Extracted Text", value=st.session_state.sitemap_data, height=300)
-
-                 else:
-                     st.error("Error: Invalid sitemap.")
-
-
-         else:
-             url = url_or_xml
-             st.session_state.extracted, data = run_function(url, documents)
-
-         if st.session_state.extracted:
-             if is_a_sitemap:
-                 st.text_area("Extracted Text", value=st.session_state.sitemap_data, height=300)
-             col1, col2 = st.columns([0.5, 0.5])
-
-             with col1:
-                 saved_button = False
-
-                 if is_a_sitemap:
-                     saved_data = st.session_state.sitemap_data
-
-                     if st.download_button(
-                         label="Save",
-                         data=saved_data
-                     ):
-                         saved_button = True
-
-                 else:
-                     if st.download_button(
-                         label="Save",
-                         data=data
-                     ):
-                         saved_button = True
-
-             with col2:
-                 if st.button("Clear"):
-                     st.session_state.button_enter = False
-                     st.session_state.Initial = True
-                     st.session_state.extracted = False
-                     if 'sitemap_data' in st.session_state:
-                         del st.session_state['sitemap_data']
-                     st.session_state.button_enter = False
-                     st.experimental_rerun()
-
-             if saved_button:
-                 # Confirmation message
-                 st.success(f"File saved as {file_name} in the current directory.")
-
-         else:
-             st.warning("Data not extracted")
-
+     # Title of the app and description.
+     title = """
+     <div>
+         <p class="title">Olive Scrapper</p>
+     </div>
+     """
+     st.markdown(title, unsafe_allow_html=True)
+     st.write("#")
+     st.write("#")
+
+     introduction = """
+     <div>
+         <p class="text">Olive Scraper is a web scraping tool developed by OdiaGenAI for web scraping Odia contents from different sources (e.g., websites, PDF, DOC, etc.)</p>
+     </div>
+     """
+     st.markdown(introduction, unsafe_allow_html=True)
+
+     st.write("#")
+     st.write("###")
+     contributors = """
+     <div>
+         <p class="text">Contributors: Dr. Shantipriya Parida, Sambit, A.R. Kamaldeen, Prosper</p>
+     </div>
+     """
+     st.markdown(contributors, unsafe_allow_html=True)
+
+     # Add a success message to the sidebar
+     st.sidebar.success("Select a page above.")
+
+     # importing the custom footer from utils
+     cust_footer()

  if __name__ == "__main__":
      main()
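
Note: this commit converts the single-page scraper into Streamlit's multipage layout: app.py becomes the landing page, and scripts added under pages/ below (1_URLs.py, 2_Documents.py) are discovered automatically and listed in the sidebar, with the numeric prefix controlling their order. A minimal sketch of the pattern (illustrative only, not part of the commit):

    import streamlit as st

    # st.set_page_config must be the first Streamlit call in a script,
    # which is why every page in this commit invokes it right after the import.
    st.set_page_config(layout="centered")

    st.title("Landing page")
    st.sidebar.success("Select a page above.")  # pages/*.py entries appear above this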
pages/1_URLs.py ADDED
@@ -0,0 +1,375 @@
+ import streamlit as st
+
+ # setting page config. for centered mode
+ st.set_page_config(layout="centered")
+
+ from utils.footer import cust_footer
+
+ from lxml import etree
+ import justext
+ import concurrent.futures
+ import datetime
+ import requests
+ from bs4 import BeautifulSoup
+ import json
+
+ # ----- FUNCTIONS -----
+ # function to check whether the url is a sitemap or not
+ def check_sitemap(url):
+     # Check the URL's ending
+     if url.lower().endswith(('sitemap.xml', 'sitemap_index.xml', 'sitemap')):
+         try:
+             # Parse the content as XML
+             response = requests.get(url)
+             xml_content = etree.fromstring(response.content)
+             # Check for sitemap-specific elements
+             if xml_content.tag == 'urlset' or xml_content.tag == 'sitemapindex':
+                 return True
+         except etree.XMLSyntaxError:
+             pass
+
+     # Additional conditions for identifying sitemaps
+     if 'sitemap' in url.lower():
+         # Perform additional checks specific to the website's structure or naming conventions
+         return True
+
+     return False
+
+
+
+ def extract_urls_from_sitemaps(xml_url):
+     # Make a GET request to the URL and extract the xml content
+     response = requests.get(xml_url)
+
+     soup = BeautifulSoup(response.text, 'xml')
+     extracted_urls = []
+
+     # check if the sitemap contains nested sitemaps
+     sitemap_tags = soup.find_all('sitemap')
+     if sitemap_tags:
+         # Process nested sitemaps
+         for sitemap_tag in sitemap_tags:
+             print("sitemap_tags:" + str(sitemap_tag))
+             nested_url = sitemap_tag.find('loc').text
+             print('nested_url:', nested_url)
+             nested_urls = extract_urls_from_sitemaps(nested_url)
+             extracted_urls.extend(nested_urls)
+     else:
+         # Extract URLs from the current sitemap
+         loc_tags = soup.find_all('loc')
+         for loc_tag in loc_tags:
+             # if loc_tag.parent.name != 'image':
+             url = loc_tag.text
+             if url.endswith('.pdf') or url.endswith('.jpg') or url.endswith('.jpeg'):
+                 print(f"url skipped because it is a {url.split('.')[-1]}")
+             else:
+                 print('url:', url)
+                 extracted_urls.append(url)
+
+     return extracted_urls
+
+
+
+ # function to check whether the entered url is valid
+ def valid_url(url):
+     try:
+         # Make a GET request to the URL and extract the text content
+         response = requests.get(url)
+         if response.status_code == 200:
+             return True
+
+     except requests.exceptions.RequestException as e:
+         return False
+
+
+
+ # function to create a custom stoplist for justext
+ def custom_stoplist():
+     odia_stopwords = [
+         "ଏହି", "ଏକ", "ଏକାଉଣଟ", "ମୁଁ", "ମୋର", "ମୁଁ ନିଜେ", "ଆମେ", "ଆମର", "ଆମର", "ଆମେ ନିଜେ", "ତୁମେ", "ତୁମର", "ତୁମର",
+         "ନିଜେ", "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର",
+         "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର", "ନିଜେ", "ଏହା", "ଏହାର", "ନିଜେ |", "ସେମାନେ", "ସେଗୁଡିକ", "ସେମାନଙ୍କର",
+         "ସେମାନଙ୍କର", "ନିଜେ |", "କଣ", "ଯାହା", "କିଏ", "କାହାକୁ",
+         "ଏହା", "ତାହା", "ଏଗୁଡ଼ିକ", "ସେଗୁଡ଼ିକ", "ମୁଁ", "ହେଉଛି", "ହେଉଛି |", "ଥିଲା", "ଥିଲା |", "ହୁଅ", "ହୋଇସାରିଛି |", "ହେବା",
+         "ଅଛି", "ଅଛି", "ଥିଲା", "ଅଛି", "କର", "କରେ |",
+         "କରିଛନ୍ତି", "କରିବା", "ଏବଂ", "କିନ୍ତୁ", "ଯଦି", "କିମ୍ବା", "କାରଣ", "ଯେପରି", "ପର୍ଯ୍ୟନ୍ତ", "ଯେତେବେଳେ", "ର", "ପାଇଁ",
+         "ସହିତ", "ବିଷୟରେ", "ବିପକ୍ଷରେ", "ମଧ୍ୟରେ", "ଭିତରକୁ", "ମାଧ୍ୟମରେ",
+         "ସମୟରେ", "ପୂର୍ବରୁ", "ପରେ", "ଉପରେ", "ନିମ୍ନରେ |", "କୁ", "ଠାରୁ", "ଅପ୍", "ତଳକୁ", "ଭିତରେ", "ବାହାରେ", "ଉପରେ", "ବନ୍ଦ",
+         "ସମାପ୍ତ", "ତଳେ |", "ପୁନର୍ବାର", "ଆଗକୁ",
+         "ତାପରେ", "ଥରେ |", "ଏଠାରେ", "ସେଠାରେ", "କେବେ", "କେଉଁଠାରେ", "କିପରି", "ସମସ୍ତ", "ଉଭୟ", "ପ୍ରତ୍ୟେକ", "ଅଳ୍ପ", "ଅଧିକ",
+         "ଅଧିକାଂଶ", "ଅନ୍ୟ", "କେତେକ", "ଏହିପରି",
+         "ନୁହେଁ |", "କେବଳ", "ନିଜର", "ସମାନ", "ତେଣୁ", "ଅପେକ୍ଷା", "ମଧ୍ୟ", "ବହୁତ", "କରିପାରିବେ |", "ଇଚ୍ଛା", "କେବଳ",
+         "କରିବା ଉଚିତ", "ବର୍ତ୍ତମାନ"
+     ]
+     return frozenset(odia_stopwords)
+
+
+
+ # function to extract data from url using justext
+ def extract_data_from_url_(url):
+     response = requests.get(url)
+     response.raise_for_status()
+     page = response.content
+
+     para = ""
+     paragraphs = justext.justext(page, custom_stoplist(), 70, 140, 0.0, 0.02, 0.5, 150, False)
+     for paragraph in paragraphs:
+         if not paragraph.is_boilerplate:
+             para = para + '\n' + paragraph.text
+
+     return para
+
+
+ sitemap_data = ""
+
+
+
+ # function to process a batch of URLS in sitemaps
+ def process_urls(sitemap_urls):
+
+     extracted_txt = ""
+     extracted_jsonl_list = []
+     for url in sitemap_urls:
+         if valid_url(url):
+             print(url)
+             # using justext to extract data
+             temp_para = extract_data_from_url_(url)
+             temp_txt_data = ('\n\nFrom url:' + url + '\n' + temp_para + '\n')
+             temp_jsonl_data = {"text": temp_para, "url": url}
+             extracted_txt += temp_txt_data
+             extracted_jsonl_list.append(temp_jsonl_data)
+         else:
+             st.error("Couldnt extract data from " + url)
+
+     # Convert data_list to JSONL string
+     extracted_jsonl_list_encoded = [json.dumps(data, ensure_ascii=False) for data in extracted_jsonl_list]
+     extracted_jsonl = '\n'.join(extracted_jsonl_list_encoded)
+
+     return extracted_txt, extracted_jsonl
+
+
+
+ # function to process for a single URL
+ def run_function(url):
+     extracted_txt = ""
+     # Check if the user has provided a URL
+     if url:
+         if valid_url(url):
+             temp_para = extract_data_from_url_(url)
+             temp_txt_data = ('\n\nFrom url:' + url + '\n' + temp_para + '\n')
+             extracted_txt = temp_txt_data
+             extracted_jsonl = {"text": str(temp_para), "url": str(url)}
+
+             # displaying extracted txt for single URL
+             st.text_area("Extracted Text", value=extracted_txt, height=200)
+
+
+             extracted_jsonl = json.dumps(extracted_jsonl, ensure_ascii=False)
+
+             # return extract status, and the data extracted
+             return True, extracted_txt, extracted_jsonl
+         else:
+             return False, None, None
+     else:
+         st.error("Error: An error occurred while fetching content.")
+         # return extract status, and the data extracted
+         return False, None, None
+
+
+
+ def main():
+     st.subheader("Extract Data from URLs")
+
+     # dividing the body section into 2 columns for url and enter button
+     col1, col2 = st.columns([0.7, 0.3])
+
+     with col1:
+         url_or_xml = st.text_input(label='', placeholder="Enter URL")
+         is_a_sitemap = check_sitemap(url_or_xml)
+
+     with col2:
+         st.write('##')
+         if "button_enter_url" not in st.session_state:
+             st.session_state.button_enter_url = False
+
+         if st.button("Enter"):
+             st.session_state.button_enter_url = True
+
+     if "extracted_url" not in st.session_state:
+         st.session_state.extracted_url = False
+     data = ""
+
+     # the enter button
+     if st.session_state.button_enter_url:
+         # check if it is a sitemap or not
+         if is_a_sitemap:
+             if "Initial" not in st.session_state:
+                 st.session_state.Initial = True
+             # check whether its the initial state
+             if st.session_state.Initial == True:
+
+                 xml = url_or_xml
+                 st.write("It is a sitemap")
+                 stored_sitemap_urls = extract_urls_from_sitemaps(xml)
+                 print('\nno. of urls: ', len(stored_sitemap_urls))
+                 st.write('no. of urls {}', format(len(stored_sitemap_urls)))
+
+                 if stored_sitemap_urls:
+                     print(stored_sitemap_urls)
+                     current_time = datetime.datetime.now()
+                     print(current_time)
+                     st.write(current_time)
+                     # for sitemap_url in stored_sitemap_urls:
+
+                     #     if valid_url(sitemap_url):
+                     #         print(sitemap_url)
+                     #         # using justext to extract data
+                     #         data = data + extract_data_from_url_(sitemap_url)
+                     #     else:
+                     #         st.error("Couldnt extract data from " + sitemap_url)
+
+                     num_threads = 16  # Number of threads to use
+
+                     # Calculate the split size for each thread
+                     split_size = len(stored_sitemap_urls) // num_threads
+
+                     # Create a ThreadPoolExecutor with maximum `num_threads` threads
+                     with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
+                         futures = []
+                         for i in range(num_threads):
+                             start_index = i * split_size
+                             end_index = start_index + split_size if i != num_threads - 1 else None
+                             temp_urls = stored_sitemap_urls[start_index:end_index]
+                             future = executor.submit(process_urls, temp_urls)
+                             futures.append(future)
+
+                         # Retrieve the extracted data from each thread
+                         text_data = []
+                         jsonl_data = []
+                         for future in futures:
+                             text_result, jsonl_result = future.result()
+                             text_data.append(text_result)
+                             jsonl_data.append(jsonl_result)
+
+                     # Combine the extracted data from all threads
+                     combined_text_data = ''.join(text_data)
+                     combined_jsonl_data = '\n'.join(jsonl_data)
+
+                     # Use the combined data as needed
+                     # print("Combined Text Data:")
+                     # print(combined_text_data)
+                     # print("Combined JSONL Data:")
+                     # print(combined_jsonl_data)
+
+
+
+                     if "sitemap_data_jsonl" not in st.session_state:
+                         st.session_state.sitemap_data_jsonl = combined_jsonl_data
+                     if "sitemap_data_text" not in st.session_state:
+                         st.session_state.sitemap_data_text = combined_text_data
+
+
+
+
+                     current_time = datetime.datetime.now()
+                     print(current_time)
+                     st.write(current_time)
+                     st.session_state.Initial = False
+                     print("\n\n\n\n2)Initial State", st.session_state.Initial, "\n\n\n\n\n")
+                     st.session_state.extracted_url = True
+
+                 else:
+                     st.error("Error: Invalid sitemap.")
+
+
+         else:
+             url = url_or_xml
+             st.session_state.extracted_url, data_txt, data_jsonl = run_function(url)
+
+
+         if st.session_state.extracted_url:
+             # displaying extracted txt for sitemaps
+             if is_a_sitemap:
+                 st.text_area("Extracted Text", value=st.session_state.sitemap_data_text, height=300)
+
+             save_as, checkbox_c1, checkbox_c2 = st.columns([0.33, 0.33, 0.33])
+
+             # initializing the checbox bool
+             save_as_txt = False
+             save_as_json = False
+             saved_successfully = False
+
+             with save_as:
+                 st.write("Save as ")
+             with checkbox_c1:
+                 save_as_txt = st.checkbox("text", value=False)
+
+             with checkbox_c2:
+                 save_as_json = st.checkbox("jsonl", value=False)
+
+             if not save_as_txt and not save_as_json:
+                 if st.button("Clear"):
+                     st.session_state.button_enter_url = False
+                     st.session_state.Initial = True
+                     st.session_state.extracted_url = False
+                     if 'sitemap_data_text' in st.session_state:
+                         del st.session_state['sitemap_data_text']
+                     if 'sitemap_data_jsonl' in st.session_state:
+                         del st.session_state['sitemap_data_jsonl']
+                     st.session_state.button_enter_url = False
+                     st.experimental_rerun()
+             else:
+                 col1, col2 = st.columns([0.5, 0.5])
+                 # save column
+                 with col1:
+
+                     if is_a_sitemap:
+
+                         if save_as_txt:
+                             if st.download_button(label="Save as txt", data=st.session_state.sitemap_data_text):
+                                 saved_successfully = True
+                         if save_as_json:
+                             if st.download_button(label="Save as jsonl", data=st.session_state.sitemap_data_jsonl, mime="application/json"):
+                                 saved_successfully = True
+                     else:
+                         if save_as_txt:
+                             if st.download_button(label="Save as txt", data=data_txt):
+                                 saved_successfully = True
+                         if save_as_json:
+                             if st.download_button(label="Save as jsonl", data=data_jsonl, mime="application/json"):
+                                 saved_successfully = True
+
+                 # clear column
+                 with col2:
+                     if st.button("Clear"):
+                         st.session_state.button_enter_url = False
+                         st.session_state.Initial = True
+                         st.session_state.extracted_url = False
+                         if 'sitemap_data_text' in st.session_state:
+                             del st.session_state['sitemap_data_text']
+                         if 'sitemap_data_jsonl' in st.session_state:
+                             del st.session_state['sitemap_data_jsonl']
+                         st.session_state.button_enter_url = False
+                         st.experimental_rerun()
+
+             if saved_successfully:
+                 # Confirmation message
+                 st.success(f"File saved successfully.")
+
+         else:
+             st.warning("Data not extracted")
+             if st.button("clear"):
+                 st.session_state.button_enter_url = False
+                 st.session_state.extracted_url = False
+                 st.experimental_rerun()
+
+
+     # Add a success message to the sidebar
+     st.sidebar.success("Select a page above.")
+
+     # importing the custom footer from utils
+     cust_footer()
+
+
+ if __name__ == "__main__":
+     main()
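
Note: the batching above relies on integer division, so when a sitemap yields fewer URLs than num_threads, split_size is 0 and every slice except the last is empty (the final thread then gets all the work). A sketch of an even-chunking alternative; chunk and process_batch are hypothetical stand-ins for this page's process_urls pipeline, not part of the commit:

    import concurrent.futures

    def chunk(items, n):
        # split items into at most n nearly equal, non-empty batches
        k, r = divmod(len(items), n)
        batches, start = [], 0
        for i in range(n):
            end = start + k + (1 if i < r else 0)
            if start < end:
                batches.append(items[start:end])
            start = end
        return batches

    def process_batch(urls):
        # stand-in for process_urls(); returns (text, jsonl) for one batch
        return "", ""

    urls = ["https://example.com/a", "https://example.com/b"]
    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
        # executor.map preserves input order, like the futures list above
        results = list(executor.map(process_batch, chunk(urls, 16)))
    combined_text = "".join(t for t, _ in results)
    combined_jsonl = "\n".join(j for _, j in results)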
pages/2_Documents.py ADDED
@@ -0,0 +1,138 @@
+ import streamlit as st
+
+ # setting page config. for centered mode
+ st.set_page_config(layout="centered")
+
+
+ from utils.footer import cust_footer
+ import docx2txt
+ import requests
+ import pdfplumber
+
+ # function to run the enter button
+ def run_function(documents):
+     data = ""
+     if documents is not None:
+         for document in documents:
+             document_details = {
+                 "filename": document.name,
+                 "filetype": document.type,
+                 "filesize": document.size
+             }
+             st.write(document_details)
+
+             # Extract content from the txt file
+             if document.type == "text/plain":
+                 # Read as bytes
+                 data += str(document.read(), "utf-8")
+
+             # Extract content from the pdf file
+             elif document.type == "application/pdf":
+                 # using PyPDF2
+                 # data += read_pdf(document)
+
+                 # using pdfplumber
+                 try:
+                     with pdfplumber.open(document) as pdf:
+                         all_text = ""
+                         for page in pdf.pages:
+                             text = page.extract_text()
+                             all_text += text + "\n"
+                         data += all_text
+                 except requests.exceptions.RequestException as e:
+                     st.write("None")
+
+             # Extract content from the docx file
+             elif document.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+                 data += docx2txt.process(document)
+
+         # Display the extracted text content from file
+         st.text_area("Extracted Text", value=data, height=200)
+         # return extract status, and the data extracted
+         return True, data
+
+
+
+     else:
+         st.error("Error: An error occurred while fetching content.")
+         # return extract status, and the data extracted
+         return False, data
+
+
+ def main():
+
+     st.subheader("Extract Data from Documents")
+
+     documents = st.file_uploader(
+         "", type=["pdf", "txt", "docx"], accept_multiple_files=True
+     )
+
+     if "button_enter_doc" not in st.session_state:
+         st.session_state.button_enter_doc = False
+
+     if "extracted_doc" not in st.session_state:
+         st.session_state.extracted_doc = False
+     data = ""
+
+
+     if st.button("Enter"):
+         st.session_state.button_enter_doc = True
+
+
+
+     # the enter button
+     if st.session_state.button_enter_doc:
+         # check if it is a sitemap or not
+         if not documents:
+             documents = None
+         else:
+             for doc in documents:
+                 if doc.name.split(".")[-1].lower() not in ["pdf", "txt", "docx"]:
+                     # if documents is not the relevant type
+                     st.error("Unsupported file: " + doc.name)
+
+         st.session_state.extracted_doc, data = run_function(documents)
+
+         if st.session_state.extracted_doc:
+             col1, col2 = st.columns([0.5, 0.5])
+             with col1:
+                 saved_button = False
+
+                 if st.download_button(
+                     label="Save",
+                     data=data
+                 ):
+                     saved_button = True
+
+             with col2:
+                 if st.button("Clear"):
+                     st.session_state.button_enter_doc = False
+                     st.session_state.extracted_doc = False
+                     st.experimental_rerun()
+
+             if saved_button:
+                 # Confirmation message
+                 st.success(f"File saved successfully.")
+
+         else:
+             st.warning("Data not extracted")
+             if st.button("clear"):
+                 st.session_state.button_enter_doc = False
+                 st.session_state.extracted_doc = False
+                 st.experimental_rerun()
+
+
+
+
+
+
+
+     # Add a success message to the sidebar
+     st.sidebar.success("Select a page above.")
+
+     # importing the custom footer from utils
+     cust_footer()
+
+
+ if __name__ == "__main__":
+     main()
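
Note: pdfplumber's page.extract_text() returns None for pages with no extractable text (e.g., scanned images), which would make the `all_text += text + "\n"` line above raise a TypeError; the surrounding except clause only catches requests exceptions, so it would not help here. A small defensive sketch (the helper name is illustrative, not part of the commit):

    import pdfplumber

    def pdf_to_text(file):
        # concatenate text page by page, skipping pages that yield no text
        all_text = ""
        with pdfplumber.open(file) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if text:  # extract_text() can return None
                    all_text += text + "\n"
        return all_text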
requirements.txt CHANGED
@@ -6,3 +6,4 @@ pdfplumber
  justext
  Pillow
  requests
+ lxml
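
Note: lxml is presumably added because pages/1_URLs.py both imports it directly and requests BeautifulSoup's "xml" parser, which requires lxml to be installed:

    from lxml import etree                            # direct use in check_sitemap
    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<urlset></urlset>", "xml")  # the "xml" feature is backed by lxml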
styles.css CHANGED
@@ -1,75 +1,35 @@
- .e1tzin5v2 >.row-widget.stButton > button {
-     width: 100%;
-     height: 100%;
-
- }
-
-
- /* div that wraps all 3 columns */
- .e1tzin5v3 {
-     width: 70%;
-     margin-left: 15%;
- }
-
-
-
-
+ .title {
+     font-family: "Source Sans Pro", sans-serif;
+     font-weight: 600;
+     padding: 1.25rem 0px 1rem;
+     margin: 0px;
+     margin-left: -2%;
+     line-height: 1.2;
+     font-size: 70px;
+     text-decoration: underline 2px;
+     text-underline-position: under;

- /* drag drop div */
- .exg6vvm14 {
-     display: none;
- }
- /* resizing the attach button */
- .exg6vvm15 > button {
-     width: 100%;
-     height: 100%;
- }
- /* div which wraps attach button */
- .exg6vvm15 {
-     padding: 0;
-     margin-left: -15px;
-     height: 40px;
  }

- /* div that wraps the enter button */
- .e1tzin5v2 >.row-widget.stButton {
-     height: 40px;
+ @media (max-width: 768px) {
+     .title {
+         font-size: xx-large;
+         margin-left: auto;
+     }
  }

- /* resizing the enter button */
- #root > div:nth-child(1) > div.withScreencast > div > div > div > section > div:nth-child(1) > div:nth-child(1) > div > div:nth-child(4) > div > div:nth-child(2) > div:nth-child(3) > div:nth-child(1) > div > div:nth-child(2) > div > button{
-     background: rgb(204, 49, 49);
-     color: black;
+ .text {
+     font-family: "Source Sans Pro", sans-serif;
+     font-size: 1.25pc;
+     padding: 1.25rem 0px 1rem;
+     margin: 0px;
  }
- #root > div:nth-child(1) > div.withScreencast > div > div > div > section > div:nth-child(1) > div:nth-child(1) > div > div:nth-child(4) > div > div:nth-child(2) > div:nth-child(3) > div:nth-child(1) > div > div:nth-child(2) > div > button :hover,:active{
-     color: white;
- }
-

- /* save button */
- #root > div:nth-child(1) > div.withScreencast > div > div > div > section > div:nth-child(1) > div:nth-child(1) > div > div:nth-child(4) > div > div:nth-child(4) > div:nth-child(1) > div:nth-child(1) > div > div > div > button {
-     background-color: rgb(0,255,127);
-     color: black;
-     height: 40px;
-     margin-top: 20px;
-     width: 70%;
-     margin-left: 30%;
- }
- #root > div:nth-child(1) > div.withScreencast > div > div > div > section > div:nth-child(1) > div:nth-child(1) > div > div:nth-child(4) > div > div:nth-child(4) > div:nth-child(1) > div:nth-child(1) > div > div > div > button:hover, :active {
-     border: rgb(0,255,127);
-     color: white;
-
- }
+ /* #root > div:nth-child(1) > div.withScreencast > div > div > div > section > div:nth-child(1) {
+     background-color: #3498db;
+     padding: 5px;
+ } */

- /* clear button */
- #root > div:nth-child(1) > div.withScreencast > div > div > div > section > div:nth-child(1) > div:nth-child(1) > div > div:nth-child(4) > div > div:nth-child(4) > div:nth-child(2) > div:nth-child(1) > div > div > div > button {
-     background: rgb(204, 49, 49);
-     color: black;
-     height: 40px;
-     margin-top: 20px;
-     width: 70%;
-     margin-right: 30%;
- }
- #root > div:nth-child(1) > div.withScreencast > div > div > div > section > div:nth-child(1) > div:nth-child(1) > div > div:nth-child(4) > div > div:nth-child(4) > div:nth-child(2) > div:nth-child(1) > div > div > div > button:hover, :active {
-     color: white;
- }
+ /* #root > div:nth-child(1) > div.withScreencast > div > div > div > section.css-1cypcdb.e1fqkh3o11 > div.css-6qob1r.e1fqkh3o3 > div.css-1b9x38r.e1fqkh3o2 > button{
+     visibility: hidden;
+ } */
utils/__pycache__/footer.cpython-38.pyc ADDED
Binary file (1.1 kB).
 
utils/footer.py ADDED
@@ -0,0 +1,38 @@
+ import streamlit as st
+
+ def cust_footer():
+     footer = """
+     <style>
+         footer {
+             visibility: hidden !important;
+         }
+
+         .divfooter {
+             display: flex;
+             justify-content: center;
+             align-items: center;
+             position: fixed;
+             left: 0;
+             bottom: 0;
+             width: 100%;
+             padding: 10px;
+             border-top: 2px solid grey;
+             background: white;
+
+
+         }
+         @media (min-width: 768px) {
+             .divfooter {
+                 justify-content: center;
+                 padding-left: 10%;
+             }
+
+         }
+
+
+     </style>
+     <div class="divfooter">
+         <p style="margin-bottom: 0px">© 2023 Odia Generative AI</p>
+     </div>
+     """
+     st.markdown(footer, unsafe_allow_html=True)