sam2ai committed on
Commit
a5adcd2
1 Parent(s): c4a251a

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +162 -0
  2. styles.css +78 -0
app.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # installed pip packages
2
+ # pip install streamlit
3
+ # pip install beautifulsoup4
4
+ # pip install docx2txt
5
+ # pip install pypdf2
6
+ # pip install pdfplumber
7
+
8
+ import streamlit as st
9
+
10
+ # File Processing pkgs
11
+ from PIL import Image
12
+ import requests
13
+ from bs4 import BeautifulSoup
14
+ import json
15
+ import docx2txt
16
+ # import textract
17
+ from PyPDF2 import PdfFileReader
18
+ import pdfplumber
19
+
20
+
21
+ # ---- LOAD ASSETS ----
22
# Icon shown in the browser tab; loaded from the repository's images folder.
img_page_icon = Image.open("images/web_icon.jpeg")

# Page-wide settings: title, tab icon and a wide layout.
# More emojis: https://www.webfx.com/tools/emoji-cheat-sheet/
st.set_page_config(
    page_title="OdiaGenAI ",
    page_icon=img_page_icon,
    layout="wide",
)
26
+
27
+ # Load CSS file
28
def load_css(file_path):
    """Inject the stylesheet stored at *file_path* into the Streamlit page.

    The file's text is wrapped in a ``<style>`` element and rendered through
    ``st.markdown`` with ``unsafe_allow_html=True``.

    Args:
        file_path: Path of the CSS file to read.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, encoding="utf-8") as css_file:
        st.markdown(f"<style>{css_file.read()}</style>", unsafe_allow_html=True)
31
+
32
+ # Load CSS file
33
+ load_css('styles.css')
34
+
35
+
36
+
37
+ # ---- HEADER SECTION ----
38
with st.container():
    st.subheader("Hi, username :wave:")
    st.write("##")

    # Raw HTML snippets, rendered top to bottom: two intro lines, then the title.
    header_html = (
        "<h5 class='text'>OdiaGenAI is a collaborative initiative that conducts research on </h5>",
        "<h5>Generative AI and LLM for the Odia Language.</h5>",
        "<h1 class='title'>Odia Generative AI</h1>",
    )
    for snippet in header_html:
        st.markdown(snippet, unsafe_allow_html=True)
47
+
48
+
49
+
50
+ # ---- BODY SECTION ----
51
+ with st.container():
52
+ st.subheader("Collecting monolingual data (Odia or any Indic Languages)")
53
+
54
+ # ----- FUNCTIONS ----
55
+ # function to get the text from pdf using PyPDF2
56
def read_pdf(file):
    """Return the number of pages PyPDF2 reports for the PDF *file*.

    Despite its name, this helper does not extract any text; it only opens
    the document with ``PdfFileReader`` and reads its page count.
    """
    reader = PdfFileReader(file)
    return reader.numPages
66
+
67
+ # function to run the enter button
68
# MIME type reported for uploaded .docx files.
_DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"


def _extract_article(html):
    """Return the headline and paragraph text parsed from an article page.

    The headline is read from the page's ``application/ld+json`` script tag
    and the paragraphs from the ``div.oi-article-lt`` element (a layout
    specific to the site this scraper targets).

    Raises:
        ValueError: If either element is missing or the JSON has no usable
            ``headline`` (``json.JSONDecodeError`` is a ``ValueError`` too).
    """
    soup = BeautifulSoup(html, 'html.parser')

    # soup.find() returns None when nothing matches, so check before use.
    heading = soup.find('script', type='application/ld+json')
    if heading is None or not heading.string:
        raise ValueError("no application/ld+json metadata found")

    data = json.loads(heading.string)
    if not isinstance(data, dict) or 'headline' not in data:
        raise ValueError("metadata does not contain a headline")
    headline = data['headline']

    body = soup.find('div', class_='oi-article-lt')
    if body is None:
        raise ValueError("article body not found")

    paragraphs = '\n'.join(p.get_text(strip=True) for p in body.find_all('p'))
    return f"{headline}\n\n{paragraphs}"


def _extract_document_text(document):
    """Return the text of one uploaded file, or None if its type is unsupported.

    Handles plain text (decoded as UTF-8), PDF (via pdfplumber) and docx
    (via docx2txt). Any parsing error propagates to the caller.
    """
    if document.type == "text/plain":
        return str(document.read(), "utf-8")

    if document.type == "application/pdf":
        with pdfplumber.open(document) as pdf:
            # extract_text() may yield None for a page without a text layer.
            return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)

    if document.type == _DOCX_MIME or document.name.lower().endswith(".docx"):
        return docx2txt.process(document)

    return None


def run_function(url, documents):
    """Extract text from a URL or from uploaded files and show it on the page.

    If *url* is non-empty it takes precedence: the page is fetched and the
    article headline and paragraphs are displayed. Otherwise every file in
    *documents* is processed and the combined text is displayed. With
    neither input an error message is shown.

    Args:
        url: Address of the article to scrape; may be empty.
        documents: Uploaded files (objects exposing ``name``, ``type``,
            ``size`` and ``read()``); may be ``None`` or an empty list.

    Returns:
        None. All results and errors are reported through Streamlit widgets.
    """
    if url:
        try:
            # A timeout keeps the app from hanging on an unresponsive host.
            response = requests.get(url, timeout=30)
        except requests.exceptions.RequestException:
            st.error("Error: An exception occurred while fetching content from the URL.")
            return

        if response.status_code != 200:
            st.error("Error: Unable to fetch content from the provided URL.")
            return

        try:
            news = _extract_article(response.text)
        except ValueError as exc:
            st.error(f"Error: Unable to extract the article from the page ({exc}).")
            return

        # Display the extracted text content from url
        st.text_area("Extracted Text", value=news, height=200)

    # The multi-file uploader yields an empty list when nothing is attached,
    # so test truthiness rather than ``is not None``.
    elif documents:
        parts = []
        for document in documents:
            st.write({
                "filename": document.name,
                "filetype": document.type,
                "filesize": document.size,
            })

            try:
                text = _extract_document_text(document)
            except Exception as exc:  # parser errors vary by library; report per file
                st.error(f"Error: Unable to read '{document.name}': {exc}")
                continue

            if text is None:
                # e.g. images, which the uploader accepts but nothing here can parse
                st.warning(f"Skipping unsupported file type: {document.name}")
                continue

            parts.append(text)

        # Display the extracted text content from file
        st.text_area("Extracted Text", value="".join(parts), height=200)

    else:
        st.error("Error: Please enter a URL or upload at least one document.")
148
+
149
+
150
+
151
# One row of inputs: URL box (60%), file picker (20%), submit button (20%).
url_col, upload_col, submit_col = st.columns([0.6, 0.2, 0.2])

with url_col:
    url = st.text_input(label='', placeholder="Enter URL")

with upload_col:
    documents = st.file_uploader(
        "",
        type=["png", "jpg", "jpeg", "pdf", "txt", "docx"],
        accept_multiple_files=True,
    )

with submit_col:
    b = st.button("Enter")

# Run the extraction only on the rerun triggered by the button press.
if b:
    run_function(url, documents)
styles.css ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* styles.css */
2
+
3
+ /* body background */
4
+ body .stApp {
5
+ background-color: black;
6
+ }
7
+
8
+
9
+ /* title - Odia Gen AI */
10
+ .title {
11
+ text-align: center;
12
+ margin-top: 100px;
13
+ margin-bottom: 100px;
14
+ }
15
+
16
+
17
+ /* text in web */
18
+ .text {
19
+ padding-bottom: 10px;
20
+ }
21
+
22
+
23
+ /* div which contains all the 3 columns */
24
+ .css-ocqkz7.e1tzin5v3 {
25
+ width: 75%;
26
+ margin-left: 12.5%;
27
+ }
28
+
29
+
30
+ /* column 1 */
31
+ /* removing press enter to apply text */
32
+ .css-1if5ada {
33
+ visibility: hidden;
34
+ }
35
+ /* editing Input text box */
36
+ input.st-bg.st-c1.st-c2.st-c3.st-c4.st-c5.st-c6.st-c7.st-c8.st-c9.st-ca.st-b8.st-cb.st-cc.st-cd.st-ce.st-cf.st-cg.st-ch.st-ci.st-ae.st-af.st-ag.st-cj.st-ai.st-aj.st-c0.st-ck.st-cl.st-cm {
37
+ padding-top: 0.6rem;
38
+ padding-bottom: 0.6rem;
39
+ }
40
+
41
+ /* column 2 */
42
+ /* editing for browse file */
43
+ /* removing drag drop area */
44
+ .css-u8hs99.exg6vvm14 {
45
+ display: none;
46
+ }
47
+ /* editing the div which wraps the browse button */
48
+ section.css-z8f339.exg6vvm15 {
49
+ padding: 0rem;
50
+ margin-left: -1.2rem;
51
+ }
52
+ /* editing the browse button */
53
+ button.css-b3z5c9.edgvbvh10 {
54
+ width: 100%;
55
+ }
56
+ /* editing the div which shows the attached files */
57
+ .css-fis6aj.exg6vvm10 {
58
+ overflow: auto;
59
+ width: 20rem;
60
+ margin-left: -5rem;
61
+ }
62
+
63
+ /* column 3 */
64
+ /* Enter button */
65
+ div.stButton > button:first-child {
66
+ background-color: rgb(204, 49, 49);
67
+ width: 100%;
68
+ margin-top: 32px;
69
+ }
70
+ .css-b3z5c9 {
71
+ padding: 0.5rem 0.75rem;
72
+ }
73
+
74
+
75
+ /* text area */
76
+ .stTextArea {
77
+ margin-top: 1rem;
78
+ }