gopiashokan committed on
Commit
ab4fb4c
·
verified ·
1 Parent(s): 2bd80ed

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +162 -161
app.py CHANGED
@@ -1,161 +1,162 @@
1
- import os
2
- import requests
3
- import streamlit as st
4
- import streamlit.components.v1 as components
5
- from streamlit_extras.add_vertical_space import add_vertical_space
6
- from bs4 import BeautifulSoup
7
- from dotenv import load_dotenv
8
- from warnings import filterwarnings
9
- filterwarnings('ignore')
10
-
11
-
12
- def streamlit_config():
13
-
14
- # page configuration
15
- st.set_page_config(page_title='Document Classification', layout='centered')
16
-
17
- # page header transparent color
18
- page_background_color = """
19
- <style>
20
-
21
- [data-testid="stHeader"]
22
- {
23
- background: rgba(0,0,0,0);
24
- }
25
-
26
- </style>
27
- """
28
- st.markdown(page_background_color, unsafe_allow_html=True)
29
-
30
- # title and position
31
- st.markdown(f'<h1 style="text-align: center;">Financial Document Classification</h1>',
32
- unsafe_allow_html=True)
33
- add_vertical_space(2)
34
-
35
-
36
- def display_html_document(input_file):
37
-
38
- # Read the file content
39
- html_content = input_file.getvalue().decode("utf-8")
40
-
41
- # Define CSS to control the container size and center content
42
- styled_html = f"""
43
- <div style="width: 610px; height: 300px;
44
- overflow: auto; border: 1px solid #ddd;
45
- padding: 10px; background-color: white;
46
- color: black; white-space: normal;
47
- display: block;">
48
- {html_content}
49
- </div>
50
- """
51
-
52
- # Display the HTML content inside a fixed-size container
53
- components.html(styled_html, height=320, width=650, scrolling=False)
54
-
55
-
56
- def text_extract_from_html(html_file):
57
-
58
- # Read the uploaded HTML file
59
- html_content = html_file.read().decode('utf-8')
60
-
61
- # Parse the HTML Content
62
- soup = BeautifulSoup(html_content, 'html.parser')
63
-
64
- # Extract the Text
65
- text = soup.get_text()
66
-
67
- # Split the Text and Remove Unwanted Space
68
- result = [i.strip() for i in text.split()]
69
- result = ' '.join(result)
70
-
71
- return result
72
-
73
-
74
- def classify_text_with_huggingface_api(extracted_text):
75
-
76
- # Load environment variables from .env file
77
- load_dotenv()
78
-
79
- # Retrieve the Hugging Face API token from environment variables
80
- hf_token = os.getenv("HUGGINGFACE_TOKEN")
81
-
82
- # Define the Hugging Face API URL for the model
83
- API_URL = "https://api-inference.huggingface.co/models/gopiashokan/Financial-Document-Classification-using-Deep-Learning"
84
-
85
- # Set the authorization headers with the Hugging Face token
86
- HEADERS = {"Authorization": f"Bearer {hf_token}"}
87
-
88
- # Send a POST request to the Hugging Face API with the extracted text
89
- response = requests.post(API_URL, headers=HEADERS, json={"inputs": extracted_text})
90
-
91
- # Parse and return the JSON response
92
- if response.status_code == 200:
93
- result = response.json()
94
- return result[0]
95
-
96
- else:
97
- return None
98
-
99
-
100
- def prediction(input_file):
101
-
102
- # Extract text from the uploaded HTML file
103
- extracted_text = text_extract_from_html(input_file)
104
-
105
- # Limit the extracted text to the first 512 characters to avoid API input limits
106
- extracted_text = extracted_text[0:512]
107
-
108
- # Classify the extracted text using the Hugging Face API
109
- result = classify_text_with_huggingface_api(extracted_text)
110
-
111
- if result is not None:
112
- # Select the prediction with the highest confidence score
113
- prediction = max(result, key=lambda x: x['score'])
114
-
115
- # Map model labels to human-readable class names
116
- label_mapping = {'LABEL_0':'Others', 'LABEL_1':'Balance Sheets', 'LABEL_2':'Notes', 'LABEL_3':'Cash Flow', 'LABEL_4':'Income Statement'}
117
-
118
- # Get the predicted class name based on the model output
119
- predicted_class = label_mapping[prediction['label']]
120
-
121
- # Convert the confidence score to a percentage
122
- confidence = prediction['score'] * 100
123
-
124
- # Display the prediction results
125
- add_vertical_space(1)
126
- st.markdown(f"""
127
- <div style="text-align: center; line-height: 1; padding: 0px;">
128
- <h4 style="color: orange; margin: 0px; padding: 0px;">{confidence:.2f}% Match Found</h4>
129
- <h3 style="color: green; margin-top: 10px; padding: 0px;">Predicted Class = {predicted_class}</h3>
130
- </div>
131
- """, unsafe_allow_html=True)
132
-
133
-
134
- else:
135
- add_vertical_space(1)
136
- st.markdown(f'<h4 style="text-align: center; color: orange; margin-top: 10px;">Refresh the Page and Try Again</h4>',
137
- unsafe_allow_html=True)
138
-
139
-
140
-
141
- # Streamlit Configuration Setup
142
- streamlit_config()
143
-
144
-
145
- try:
146
-
147
- # File uploader to upload the HTML file
148
- input_file = st.file_uploader('Upload an HTML file', type='html')
149
-
150
- if input_file is not None:
151
-
152
- # Display the HTML Document to User Interface
153
- display_html_document(input_file)
154
-
155
- # Predict the Class and Confidence Score
156
- with st.spinner('Processing'):
157
- prediction(input_file)
158
-
159
-
160
- except Exception as e:
161
- st.markdown(f'<h3 style="text-align: center;">{e}</h3>', unsafe_allow_html=True)
 
 
1
+ import os
2
+ import requests
3
+ import streamlit as st
4
+ import streamlit.components.v1 as components
5
+ from streamlit_extras.add_vertical_space import add_vertical_space
6
+ from bs4 import BeautifulSoup
7
+ from dotenv import load_dotenv
8
+ from warnings import filterwarnings
9
+ filterwarnings('ignore')
10
+
11
+
12
+ def streamlit_config():
13
+
14
+ # page configuration
15
+ st.set_page_config(page_title='Document Classification', layout='centered')
16
+
17
+ # page header transparent color
18
+ page_background_color = """
19
+ <style>
20
+
21
+ [data-testid="stHeader"]
22
+ {
23
+ background: rgba(0,0,0,0);
24
+ }
25
+
26
+ </style>
27
+ """
28
+ st.markdown(page_background_color, unsafe_allow_html=True)
29
+
30
+ # title and position
31
+ st.markdown(f'<h1 style="text-align: center;">Financial Document Classification</h1>',
32
+ unsafe_allow_html=True)
33
+ add_vertical_space(2)
34
+
35
+
36
+ def display_html_document(input_file):
37
+
38
+ # Read the file content
39
+ html_content = input_file.getvalue().decode("utf-8")
40
+
41
+ # Define CSS to control the container size and center content
42
+ styled_html = f"""
43
+ <div style="width: 610px; height: 300px;
44
+ overflow: auto; border: 1px solid #ddd;
45
+ padding: 10px; background-color: white;
46
+ color: black; white-space: normal;
47
+ display: block;">
48
+ {html_content}
49
+ </div>
50
+ """
51
+
52
+ # Display the HTML content inside a fixed-size container
53
+ components.html(styled_html, height=320, width=650, scrolling=False)
54
+
55
+
56
+ def text_extract_from_html(html_file):
57
+
58
+ # Read the uploaded HTML file
59
+ html_content = html_file.read().decode('utf-8')
60
+
61
+ # Parse the HTML Content
62
+ soup = BeautifulSoup(html_content, 'html.parser')
63
+
64
+ # Extract the Text
65
+ text = soup.get_text()
66
+
67
+ # Split the Text and Remove Unwanted Space
68
+ result = [i.strip() for i in text.split()]
69
+ result = ' '.join(result)
70
+
71
+ return result
72
+
73
+
74
+ def classify_text_with_huggingface_api(extracted_text):
75
+
76
+ # Load environment variables from .env file
77
+ load_dotenv()
78
+
79
+ # Retrieve the Hugging Face API token from environment variables
80
+ hf_token = os.getenv("HUGGINGFACE_TOKEN")
81
+
82
+ # Define the Hugging Face API URL for the model
83
+ API_URL = "https://api-inference.huggingface.co/models/gopiashokan/Financial-Document-Classification-using-Deep-Learning"
84
+
85
+ # Set the authorization headers with the Hugging Face token
86
+ HEADERS = {"Authorization": f"Bearer {hf_token}"}
87
+
88
+ # Send a POST request to the Hugging Face API with the extracted text
89
+ response = requests.post(API_URL, headers=HEADERS, json={"inputs": extracted_text})
90
+
91
+ # Parse and return the JSON response
92
+ if response.status_code == 200:
93
+ result = response.json()
94
+ return result[0]
95
+
96
+ else:
97
+ return None
98
+
99
+
100
+ def prediction(input_file):
101
+
102
+ # Extract text from the uploaded HTML file
103
+ extracted_text = text_extract_from_html(input_file)
104
+
105
+ # Limit the extracted text to the first 512 characters to avoid API input limits
106
+ extracted_text = extracted_text[0:512]
107
+
108
+ # Classify the extracted text using the Hugging Face API
109
+ result = classify_text_with_huggingface_api(extracted_text)
110
+
111
+ if result is not None:
112
+ # Select the prediction with the highest confidence score
113
+ prediction = max(result, key=lambda x: x['score'])
114
+
115
+ # Map model labels to human-readable class names
116
+ label_mapping = {'LABEL_0':'Others', 'LABEL_1':'Balance Sheets', 'LABEL_2':'Notes', 'LABEL_3':'Cash Flow', 'LABEL_4':'Income Statement'}
117
+
118
+ # Get the predicted class name based on the model output
119
+ predicted_class = label_mapping[prediction['label']]
120
+
121
+ # Convert the confidence score to a percentage
122
+ confidence = prediction['score'] * 100
123
+
124
+ # Display the prediction results
125
+ add_vertical_space(1)
126
+ st.markdown(f"""
127
+ <div style="text-align: center; line-height: 1; padding: 0px;">
128
+ <h4 style="color: orange; margin: 0px; padding: 0px;">{confidence:.2f}% Match Found</h4>
129
+ <h3 style="color: green; margin-top: 10px; padding: 0px;">Predicted Class = {predicted_class}</h3>
130
+ </div>
131
+ """, unsafe_allow_html=True)
132
+
133
+
134
+ else:
135
+ add_vertical_space(1)
136
+ st.markdown(f'<h4 style="text-align: center; color: orange; margin-top: 10px;">Refresh the Page and Try Again</h4>',
137
+ unsafe_allow_html=True)
138
+
139
+
140
+
141
+ # Streamlit Configuration Setup
142
+ streamlit_config()
143
+
144
+
145
+ try:
146
+
147
+ # File uploader to upload the HTML file
148
+ input_file = st.file_uploader('Upload an HTML file', type='html')
149
+
150
+ if input_file is not None:
151
+
152
+ # Display the HTML Document to User Interface
153
+ display_html_document(input_file)
154
+
155
+ # Predict the Class and Confidence Score
156
+ with st.spinner('Processing'):
157
+ prediction(input_file)
158
+ add_vertical_space(1)
159
+
160
+
161
+ except Exception as e:
162
+ st.markdown(f'<h3 style="text-align: center;">{e}</h3>', unsafe_allow_html=True)