Spaces:

shwetashweta05
/

Zero_to_Hero_Machine_Learning

Sleeping

App Files Community

shwetashweta05 committed on Dec 17, 2024

Commit

3a279ad

verified ·

1 Parent(s): 3dd3579

Update pages/6.Data Collection.py

Browse files

Files changed (1) hide show

pages/6.Data Collection.py +167 -46

pages/6.Data Collection.py CHANGED Viewed

@@ -20,66 +20,187 @@ if data_type == "Structured":
     # Now, add the format selection in this section
     format_selected = st.radio("Select a data format to learn more:", ["CSV", "Excel", "XML"])
-    # CSV Section
-    if format_selected == "CSV":
-        st.write("### CSV Format")
-        st.subheader("What is CSV?")
-        st.write("CSV (Comma-Separated Values) is a plain-text file format used to store tabular data.")
-        st.subheader("How to Read CSV Files")
-        st.code("""
-        import pandas as pd
-        df = pd.read_csv('file.csv')
-        print(df.head())
-        """)
-        st.subheader("Common Issues with CSV Files")
-        st.write("Issues include incorrect delimiters, encoding problems, and missing data.")
-        st.subheader("How to Overcome These Issues?")
         st.write("""
-        - Use the correct delimiter with `sep=";"` if needed.
-        - Specify encoding to resolve encoding problems, like `encoding="utf-8"`.
         """)
-    # Excel Section
-    elif format_selected == "Excel":
-        st.write("### Excel Format")
-        st.subheader("What is Excel?")
-        st.write("Excel is a widely used file format for structured data. It stores data in tabular form.")
-        st.subheader("How to Read Excel Files")
         st.code("""
         import pandas as pd
-        df = pd.read_excel('file.xlsx')
         print(df.head())
         """)
-        st.subheader("Common Issues with Excel Files")
-        st.write("Issues include missing data, encoding problems, and large files.")
-        st.subheader("How to Overcome These Issues?")
         st.write("""
-        - Handle missing data using imputation techniques.
-        - Specify encoding with `encoding="utf-8"`.
-        - Use chunk processing for large files.
         """)
-    # XML Section
-    elif format_selected == "XML":
-        st.write("### XML Format")
-        st.subheader("What is XML?")
-        st.write("XML is a markup language used to store and transport data in a hierarchical format.")
-        st.subheader("How to Read XML Files")
-        st.code("""
-        import xml.etree.ElementTree as ET
-        tree = ET.parse('file.xml')
-        root = tree.getroot()
-        for child in root:
-            print(child.tag, child.text)
-        """)
-        st.subheader("Common Issues with XML Files")
-        st.write("Issues include large file sizes and missing or incorrect tags.")
         st.subheader("How to Overcome These Issues?")
         st.write("""
-        - Use event-driven parsing for large XML files.
-        - Ensure tags are properly formed or use `BeautifulSoup` for cleaning.
         """)
 # If the user selects Semi-Structured Data
 elif data_type == "Semi-Structured":
     st.write("### Semi-Structured Data")

     # Now, add the format selection in this section
     format_selected = st.radio("Select a data format to learn more:", ["CSV", "Excel", "XML"])
+   # Excel Format Section
+    if format_selected == "Excel":
+        st.write("#### Excel Format")
+        # Part (a) What it is
+        st.subheader("What is Excel?")
         st.write("""
+        Excel is a popular file format used for storing structured data in tabular form.
+        It is commonly used in business and data analysis, and its file extensions include `.xlsx` and `.xls`.
         """)
+        # Part (b) How to read these files
+        st.subheader("How to Read Excel Files?")
         st.code("""
         import pandas as pd
+        # Read an Excel file
+        df = pd.read_excel("file.xlsx")
         print(df.head())
         """)
+        # Part (c) Issues encountered
+        st.subheader("Common Issues Encountered When Handling Excel Files")
         st.write("""
+        - **Missing Data**: Some cells may contain empty or null values.
+        - **Encoding Problems**: Files saved in non-standard formats may have encoding issues.
+        - **File Corruption**: The file may become unreadable if improperly saved or transferred.
+        - **Large Files**: Handling very large Excel files may exceed memory limits.
         """)
+        # Part (d) How to overcome these errors/issues
         st.subheader("How to Overcome These Issues?")
         st.write("""
+        - **Missing Data**: Use data imputation techniques to fill in missing values.
+        - **Encoding Problems**: Specify the encoding format when reading the file, e.g., `encoding='utf-8'`.
+        - **File Corruption**: Use repair tools or convert to a compatible format like CSV.
+        - **Large Files**: Process the file in chunks using `pandas` or optimize it using external tools.
         """)
+        # Downloadable Guide Button
+        st.markdown("### Download Coding Guide:")
+        if st.button("Download Excel Guide"):
+            # Provide a downloadable file
+            file_path = "Excel_guide.ipynb"  # Ensure this file exists in the app directory
+            with open(file_path, "rb") as file:
+                st.download_button(
+                    label="Download Excel Guide",
+                    data=file,
+                    file_name="Excel_guide.ipynb",
+                    mime="application/octet-stream",
+                )
+# CSV Format Content
+if format_selected == "CSV":
+    st.write("#### CSV Format")
+    # Part (a) What it is
+    st.subheader("What is CSV?")
+    st.write("""
+    CSV (Comma-Separated Values) is a plain-text file format used to store tabular data,
+    where each row corresponds to a record, and fields are separated by commas.
+    It is widely used for data exchange due to its simplicity and compatibility across systems.
+    Common file extensions include `.csv`.
+    """)
+    # Part (b) How to Read These Files
+    st.subheader("How to Read CSV Files?")
+    st.code("""
+    import pandas as pd
+    # Reading a CSV file
+    df = pd.read_csv("file.csv")
+    print(df.head())
+    # Reading a CSV file with custom delimiter
+    df = pd.read_csv("file.csv", sep=";")
+    """)
+    # Part (c) Issues Encountered
+    st.subheader("Common Issues Encountered When Handling CSV Files")
+    st.write("""
+    - **Incorrect Delimiters**: Files may use delimiters other than commas, e.g., semicolons or tabs.
+    - **Encoding Problems**: Files with different encodings (e.g., UTF-8, ISO-8859-1) may cause errors.
+    - **Missing or Corrupted Data**: Blank fields or inconsistencies in data.
+    - **Header Issues**: Missing headers or extra/unexpected columns.
+    - **Large File Sizes**: Memory limitations when processing large datasets.
+    """)
+    # Part (d) How to Overcome These Issues
+    st.subheader("How to Overcome These Issues?")
+    st.write("""
+    - **Incorrect Delimiters**: Specify the correct delimiter when reading:
+      ```python
+      df = pd.read_csv("file.csv", sep=";")
+      ```
+    - **Encoding Problems**: Specify the encoding explicitly:
+      ```python
+      df = pd.read_csv("file.csv", encoding="utf-8")
+      ```
+    - **Missing or Corrupted Data**: Handle missing values using pandas:
+      ```python
+      df.fillna("NA", inplace=True)
+      ```
+    - **Header Issues**: Assign custom headers or skip problematic rows:
+      ```python
+      df = pd.read_csv("file.csv", header=None)
+      df.columns = ["Column1", "Column2", "Column3"]
+      ```
+    - **Large Files**: Use chunk processing for large files:
+      ```python
+      chunks = pd.read_csv("file.csv", chunksize=1000)
+      for chunk in chunks:
+          process(chunk)
+      ```
+    """)
+    # Downloadable Guide Button
+    st.markdown("### Download Coding Guide:")
+    if st.button("Download CSV Guide"):
+        # Provide a downloadable Jupyter Notebook file
+        file_path = "CSV_guide.ipynb"  # Replace with the actual file path
+        with open(file_path, "rb") as file:
+            st.download_button(
+                label="Download CSV Guide",
+                data=file,
+                file_name="CSV_guide.ipynb",
+                mime="application/octet-stream",
+            )
+# Main Section
+st.title("XML Data Format Guide")
+# XML Explanation Sections
+st.write("#### a. What is XML?")
+st.write("""
+XML (eXtensible Markup Language) is a markup language designed to store and transport data.
+It uses a hierarchical structure and tags, making it both human-readable and machine-readable.
+""")
+st.write("#### b. How to Read XML Files")
+st.code("""
+import xml.etree.ElementTree as ET
+# Parse an XML file
+tree = ET.parse("file.xml")
+root = tree.getroot()
+# Access elements
+for child in root:
+    print(child.tag, child.text)
+""", language="python")
+st.write("#### c. Issues Encountered When Handling XML Files")
+st.write("""
+1. **Complex Structures:** XML files may have deeply nested hierarchies.
+2. **Large File Sizes:** Memory-intensive parsing for large files.
+3. **Data Inconsistency:** Missing or unexpected tags may cause parsing errors.
+4. **Encoding Issues:** Files with non-standard encodings can fail to parse.
+""")
+st.write("#### d. How to Overcome These Issues")
+st.code("""
+from lxml import etree
+# Handle large XML files using event-driven parsing
+for event, element in etree.iterparse("large_file.xml", events=("end",)):
+    print(element.tag, element.text)
+    element.clear()
+""", language="python")
+# Downloadable Guide Button
+st.markdown("### Download Coding Guide")
+if st.button("Download XML Guide"):
+    file_path = "XML_guide.ipynb"  # Replace with the actual file path
+    with open(file_path, "rb") as file:
+        st.download_button(
+            label="Download XML Guide",
+            data=file,
+            file_name="XML_guide.ipynb",
+            mime="application/octet-stream",
+        )
 # If the user selects Semi-Structured Data
 elif data_type == "Semi-Structured":
     st.write("### Semi-Structured Data")