File size: 12,076 Bytes
3b0cd25
d97c686
a7b3a3f
3b0cd25
276f5d0
 
 
 
 
 
 
 
 
 
 
 
a7b3a3f
 
276f5d0
3b0cd25
 
 
276f5d0
 
 
3a279ad
 
 
30aabc7
3a279ad
 
30aabc7
3a279ad
30aabc7
 
3a279ad
 
30aabc7
 
276f5d0
 
 
 
 
 
 
 
 
 
 
 
 
 
7b3350f
 
 
 
 
 
 
 
 
 
 
 
30aabc7
31496fa
 
 
30aabc7
301d94b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1ebb79
31496fa
 
 
 
 
 
 
301d94b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94fc3e6
 
 
 
 
 
301d94b
 
 
 
94fc3e6
301d94b
 
 
3a279ad
276f5d0
e345f11
 
 
276f5d0
 
 
e345f11
276f5d0
31496fa
4a5f717
31496fa
4a5f717
 
ec3cb98
 
a589b68
 
 
 
 
ec3cb98
 
a589b68
ec3cb98
 
a589b68
 
 
 
 
d1ebb79
276f5d0
d1ebb79
4a5f717
276f5d0
4a5f717
9b93bcb
 
 
 
 
 
 
 
 
 
 
 
 
 
4a5f717
276f5d0
 
8bf2642
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276f5d0
8bf2642
 
 
 
 
 
 
 
 
 
 
 
 
06ce905
 
b8e1a4e
 
 
 
 
 
 
 
 
 
fc152bd
 
 
b8e1a4e
 
 
61d26e7
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
import streamlit as st
import pandas as pd
import json

import streamlit as st
import pandas as pd
import json

# Page intro: define "data" first, then let the reader choose which category
# of data the rest of the page should explain.
st.subheader("**What is Data?**")
st.write(
    "Data refers to information, facts, or statistics that are collected, "
    "stored, and analyzed to derive meaningful insights. It represents raw, "
    "unprocessed values that can be used for decision-making, analysis, and "
    "predictions."
)

st.subheader("**Types of Data**")
data_type = st.radio(
    label="**Select a type of data:**",
    options=["Structured", "Unstructured", "Semi-Structured"],
    index=0,
)

# The "Unstructured" branch never assigns a format, so bind the name up front
# to keep it defined whichever branch runs.
format_selected = None

# Structured Data Section
def _offer_guide_download(label, file_name):
    """Render the "Download Coding Guide" heading and a download button.

    The notebook is read from ``file_name`` (relative to the working
    directory) and offered under that same name. The download button is
    rendered directly instead of being nested inside ``st.button``: a nested
    widget only exists for the single rerun that follows the outer click.
    A missing file is reported with a warning instead of crashing the page.

    Args:
        label: Text shown on the download button.
        file_name: Path of the notebook to serve; also used as the
            suggested name of the downloaded file.
    """
    st.markdown("### Download Coding Guide:")
    try:
        with open(file_name, "rb") as guide:
            guide_bytes = guide.read()
    except FileNotFoundError:
        st.warning(f"The guide `{file_name}` is not available for download.")
        return
    st.download_button(
        label=label,
        data=guide_bytes,
        file_name=file_name,
        mime="application/octet-stream",
    )


if data_type == "Structured":
    st.write("### Structured Data")
    st.write("Structured data is organized in a predefined format, such as rows and columns.")

    format_selected = st.radio("Select a data format to learn more:", ["CSV", "Excel", "XML"], index=0)

    # Each format owns its complete walkthrough (what / how to read / issues /
    # fixes / download), so only the selected format's content is rendered.
    if format_selected == "Excel":
        st.write("#### Excel Format")
        st.subheader("What is Excel?")
        st.write("""
        Excel is a popular file format used for storing structured data in tabular form.
        It is commonly used in business and data analysis, and its file extensions include `.xlsx` and `.xls`.
        """)
        st.subheader("How to Read Excel Files?")
        st.code("""
        import pandas as pd
        # Read an Excel file
        df = pd.read_excel("file.xlsx")
        print(df.head())
        """)
        st.subheader("Common Issues Encountered When Handling Excel Files")
        st.write("""
        - **Missing Data**: Some cells may contain empty or null values.
        - **Encoding Problems**: Files saved in non-standard formats may have encoding issues.
        - **File Corruption**: The file may become unreadable if improperly saved or transferred.
        - **Large Files**: Handling very large Excel files may exceed memory limits.
        """)
        st.subheader("How to Overcome These Issues?")
        st.write("""
        - **Missing Data**: Use data imputation techniques to fill in missing values.
        - **Encoding Problems**: Specify the encoding format when reading the file, e.g., `encoding='utf-8'`.
        - **File Corruption**: Use repair tools or convert to a compatible format like CSV.
        - **Large Files**: Process the file in chunks using `pandas` or optimize it using external tools.
        """)
        _offer_guide_download("Download Excel Guide", "Excel_guide.ipynb")

    elif format_selected == "CSV":
        st.write("#### CSV Format")
        st.subheader("What is CSV?")
        st.write("""
        CSV (Comma-Separated Values) is a plain-text file format used to store tabular data,
        where each row corresponds to a record, and fields are separated by commas.
        It is widely used for data exchange due to its simplicity and compatibility across systems.
        Common file extensions include `.csv`.
        """)
        # Part (b) How to Read These Files
        st.subheader("How to Read CSV Files?")
        st.code("""
        import pandas as pd
        # Reading a CSV file
        df = pd.read_csv("file.csv")
        print(df.head())

        # Reading a CSV file with custom delimiter
        df = pd.read_csv("file.csv", sep=";")
        """)
        # Part (c) Issues Encountered
        st.subheader("Common Issues Encountered When Handling CSV Files")
        st.write("""
        - **Incorrect Delimiters**: Files may use delimiters other than commas, e.g., semicolons or tabs.
        - **Encoding Problems**: Files with different encodings (e.g., UTF-8, ISO-8859-1) may cause errors.
        - **Missing or Corrupted Data**: Blank fields or inconsistencies in data.
        - **Header Issues**: Missing headers or extra/unexpected columns.
        - **Large File Sizes**: Memory limitations when processing large datasets.
        """)
        # Part (d) How to Overcome These Issues
        st.subheader("How to Overcome These Issues?")
        st.write("""
        - **Incorrect Delimiters**: Specify the correct delimiter when reading:
          ```python
          df = pd.read_csv("file.csv", sep=";")
          ```
        - **Encoding Problems**: Specify the encoding explicitly:
          ```python
          df = pd.read_csv("file.csv", encoding="utf-8")
          ```
        - **Missing or Corrupted Data**: Handle missing values using pandas:
          ```python
          df.fillna("NA", inplace=True)
          ```
        - **Header Issues**: Assign custom headers or skip problematic rows:
          ```python
          df = pd.read_csv("file.csv", header=None)
          df.columns = ["Column1", "Column2", "Column3"]
          ```
        - **Large Files**: Use chunk processing for large files:
          ```python
          chunks = pd.read_csv("file.csv", chunksize=1000)
          for chunk in chunks:
              process(chunk)
          ```
        """)
        _offer_guide_download("Download CSV Guide", "CSV_guide.ipynb")

    elif format_selected == "XML":
        st.write("#### XML Format")
        st.subheader("What is XML?")
        st.write("""
        XML (eXtensible Markup Language) is a markup language designed to store and transport data.
        It uses a hierarchical structure and tags, making it both human-readable and machine-readable.
        """)
        st.write("#### b. How to Read XML Files")
        st.code("""
        import xml.etree.ElementTree as ET

        # Parse an XML file
        tree = ET.parse("file.xml")
        root = tree.getroot()

        # Access elements
        for child in root:
            print(child.tag, child.text)
        """, language="python")

        st.write("#### c. Issues Encountered When Handling XML Files")
        st.write("""
        1. **Complex Structures:** XML files may have deeply nested hierarchies.
        2. **Large File Sizes:** Memory-intensive parsing for large files.
        3. **Data Inconsistency:** Missing or unexpected tags may cause parsing errors.
        4. **Encoding Issues:** Files with non-standard encodings can fail to parse.
        """)

        st.write("#### d. How to Overcome These Issues")
        st.code("""
        from lxml import etree

        # Handle large XML files using event-driven parsing
        for event, element in etree.iterparse("large_file.xml", events=("end",)):
            print(element.tag, element.text)
            element.clear()
        """, language="python")
        _offer_guide_download("Download XML Guide", "XML_guide.ipynb")

# Semi-Structured Data Section
elif data_type == "Semi-Structured":
    st.write("### Semi-Structured Data")
    st.write("Semi-structured data does not have a predefined format but contains tags or markers to separate data elements.")

    format_selected = st.radio("Choose a format to explore:", ["JSON", "HTML"], index=0)

    if format_selected == "JSON":
        st.write("#### JSON Format")
        st.subheader("What is JSON?")
        st.write("""
        JSON (JavaScript Object Notation) is a lightweight, text-based data format used for data interchange.
        """)

        st.subheader("How to Read JSON Files")
        st.code("""
        import json
        with open('file.json') as f:
            data = json.load(f)
        print(data)
        """)
        st.subheader("Common Issues with JSON Files")
        st.write("Issues include complex nested structures and inconsistent data.")

        st.subheader("How to Overcome These Issues?")
        st.write("""
        - Use `json_normalize` to flatten nested structures.
        - Handle missing or inconsistent data with default values or conditionals.
        """)


    elif format_selected == "HTML":
        st.write("#### HTML Format")
        st.subheader("What is HTML?")
        st.write("""
        HTML (HyperText Markup Language) is a standard language used to structure content on web pages.
        """)
        st.subheader("How to Read HTML Files")
        st.code("""
        import pandas as pd
        df = pd.read_html('file.html')[0]  # Reading the first table
        print(df.head())
        """)
        st.subheader("Common Issues with HTML Files")
        st.write("Issues include multiple tables and improper tags.")
        st.subheader("How to Overcome These Issues?")
        st.write("""
        - Specify the correct table index when reading multiple tables.
        - Use `BeautifulSoup` to clean improperly formatted HTML.
        """)


# Unstructured Data Section
elif data_type == "Unstructured":
    st.write("### **Unstructured Data**")
    st.write("Unstructured data refers to information that does not follow a predefined format or organizational schema. Unlike structured data (organized in rows and columns, such as in databases), unstructured data is more flexible but harder to analyze using traditional methods.")
    st.write("""
    Examples of Unstructured Data:
    1. Text Documents: Emails, PDFs, Word documents.
    2. Multimedia: Images, videos, audio recordings.
    3. Social Media Content: Posts, comments, likes, and shares.
    4. Sensor Data: Logs, IoT device readings without a fixed schema.
    """)
    
    st.write("**Characteristics of Unstructured Data:**")
    st.write("""
    1.Lack of Predefined Format: The data does not fit into traditional tables or relational databases.
    2.High Volume: Unstructured data is often generated in large amounts (e.g., social media, video content).
    3.Variety: It comes in diverse forms (e.g., images, free-form text, logs).
    4.Complex Analysis: Requires advanced tools like Natural Language Processing (NLP) for text or Computer Vision for images.
    """)

    st.write("**How to Work with Unstructured Data:**")
    st.write("""
    1.Storage Solutions: Use scalable systems like AWS S3, Hadoop, or NoSQL databases.
    2.Preprocessing: Convert unstructured data into analyzable formats using:
        - Text data: Tokenization, stemming, and lemmatization.
        - Image data: Feature extraction using tools like OpenCV.
    3.Tools for Analysis:
        - Text Analysis: NLP libraries (e.g., spaCy, NLTK).
        - Image/Video Analysis: TensorFlow, PyTorch, or OpenCV.
        - Big Data Processing: Apache Spark, Hadoop.
    """)

    
    if st.button("**Image**"):
        st.switch_page("pages/Image.py")

    if st.button("**Basic operations of Image with the help of Open cv**"):
        st.switch_page("pages/Basic operations of Image with the help of Open cv.py")

    if st.button("**How to work on Image**"):
        st.switch_page("pages/How to work on Image.py")

    if st.button("**How to handle Videos**"):
        st.switch_page("pages/How to handle Videos.py")

    if st.button("**Transformation**"):
        st.switch_page("pages/Transformation.py")

    if st.button("**Projects**"):
        st.switch_page("pages/Projects")