Update pages/6.Data Collection.py
Browse files- pages/6.Data Collection.py +167 -46
pages/6.Data Collection.py
CHANGED
@@ -20,66 +20,187 @@ if data_type == "Structured":
|
|
20 |
# Now, add the format selection in this section
|
21 |
format_selected = st.radio("Select a data format to learn more:", ["CSV", "Excel", "XML"])
|
22 |
|
23 |
-
|
24 |
-
if format_selected == "
|
25 |
-
st.write("
|
26 |
-
|
27 |
-
|
28 |
-
st.subheader("
|
29 |
-
st.code("""
|
30 |
-
import pandas as pd
|
31 |
-
df = pd.read_csv('file.csv')
|
32 |
-
print(df.head())
|
33 |
-
""")
|
34 |
-
st.subheader("Common Issues with CSV Files")
|
35 |
-
st.write("Issues include incorrect delimiters, encoding problems, and missing data.")
|
36 |
-
st.subheader("How to Overcome These Issues?")
|
37 |
st.write("""
|
38 |
-
|
39 |
-
|
40 |
""")
|
41 |
|
42 |
-
|
43 |
-
|
44 |
-
st.write("### Excel Format")
|
45 |
-
st.subheader("What is Excel?")
|
46 |
-
st.write("Excel is a widely used file format for structured data. It stores data in tabular form.")
|
47 |
-
st.subheader("How to Read Excel Files")
|
48 |
st.code("""
|
49 |
import pandas as pd
|
50 |
-
|
|
|
51 |
print(df.head())
|
52 |
""")
|
53 |
-
|
54 |
-
|
55 |
-
st.subheader("
|
56 |
st.write("""
|
57 |
-
-
|
58 |
-
-
|
59 |
-
-
|
|
|
60 |
""")
|
61 |
|
62 |
-
|
63 |
-
elif format_selected == "XML":
|
64 |
-
st.write("### XML Format")
|
65 |
-
st.subheader("What is XML?")
|
66 |
-
st.write("XML is a markup language used to store and transport data in a hierarchical format.")
|
67 |
-
st.subheader("How to Read XML Files")
|
68 |
-
st.code("""
|
69 |
-
import xml.etree.ElementTree as ET
|
70 |
-
tree = ET.parse('file.xml')
|
71 |
-
root = tree.getroot()
|
72 |
-
for child in root:
|
73 |
-
print(child.tag, child.text)
|
74 |
-
""")
|
75 |
-
st.subheader("Common Issues with XML Files")
|
76 |
-
st.write("Issues include large file sizes and missing or incorrect tags.")
|
77 |
st.subheader("How to Overcome These Issues?")
|
78 |
st.write("""
|
79 |
-
- Use
|
80 |
-
-
|
|
|
|
|
81 |
""")
|
82 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
# If the user selects Semi-Structured Data
|
84 |
elif data_type == "Semi-Structured":
|
85 |
st.write("### Semi-Structured Data")
|
|
|
20 |
# Now, add the format selection in this section
|
21 |
format_selected = st.radio("Select a data format to learn more:", ["CSV", "Excel", "XML"])
|
22 |
|
23 |
+
# Excel Format Section
# NOTE(review): the surrounding diff context suggests this section lives inside
# the `data_type == "Structured"` branch -- re-indent to match when applying.
if format_selected == "Excel":
    st.write("#### Excel Format")

    # Part (a) What it is
    st.subheader("What is Excel?")
    st.write("""
Excel is a popular file format used for storing structured data in tabular form.
It is commonly used in business and data analysis, and its file extensions include `.xlsx` and `.xls`.
""")

    # Part (b) How to read these files
    st.subheader("How to Read Excel Files?")
    st.code("""
import pandas as pd
# Read an Excel file
df = pd.read_excel("file.xlsx")
print(df.head())
""")

    # Part (c) Issues encountered
    st.subheader("Common Issues Encountered When Handling Excel Files")
    st.write("""
- **Missing Data**: Some cells may contain empty or null values.
- **Encoding Problems**: Files saved in non-standard formats may have encoding issues.
- **File Corruption**: The file may become unreadable if improperly saved or transferred.
- **Large Files**: Handling very large Excel files may exceed memory limits.
""")

    # Part (d) How to overcome these errors/issues
    # The remedies below only mention options that `pd.read_excel` accepts:
    # it has no `encoding` argument and cannot read a workbook in chunks.
    st.subheader("How to Overcome These Issues?")
    st.write("""
- **Missing Data**: Use data imputation techniques to fill in missing values.
- **Encoding Problems**: `pd.read_excel` has no `encoding` argument; choose the matching engine (e.g., `engine='openpyxl'` for `.xlsx`) or re-save the file as a standard `.xlsx` workbook.
- **File Corruption**: Use repair tools or convert to a compatible format like CSV.
- **Large Files**: `pd.read_excel` cannot read in chunks; load less with `usecols`, `nrows`, and `skiprows`, or convert the file to CSV and process it in chunks.
""")

    # Downloadable guide.
    # The download button is rendered directly instead of under `st.button`:
    # a plain button is only True for the single rerun after its click, so a
    # widget nested beneath it disappears as soon as the user interacts with it.
    st.markdown("### Download Coding Guide:")
    file_path = "Excel_guide.ipynb"  # Must exist in the app's working directory
    try:
        with open(file_path, "rb") as file:
            guide_bytes = file.read()
    except FileNotFoundError:
        # Keep the page usable when the notebook was not deployed with the app.
        st.warning(f"{file_path} is not available for download.")
    else:
        st.download_button(
            label="Download Excel Guide",
            data=guide_bytes,
            file_name="Excel_guide.ipynb",
            mime="application/octet-stream",
        )
|
73 |
+
|
74 |
+
# CSV Format Content
# NOTE(review): the surrounding diff context suggests this section lives inside
# the `data_type == "Structured"` branch -- re-indent to match when applying.
if format_selected == "CSV":
    st.write("#### CSV Format")

    # Part (a) What it is
    st.subheader("What is CSV?")
    st.write("""
CSV (Comma-Separated Values) is a plain-text file format used to store tabular data,
where each row corresponds to a record, and fields are separated by commas.
It is widely used for data exchange due to its simplicity and compatibility across systems.
Common file extensions include `.csv`.
""")

    # Part (b) How to Read These Files
    st.subheader("How to Read CSV Files?")
    st.code("""
import pandas as pd
# Reading a CSV file
df = pd.read_csv("file.csv")
print(df.head())

# Reading a CSV file with custom delimiter
df = pd.read_csv("file.csv", sep=";")
""")

    # Part (c) Issues Encountered
    st.subheader("Common Issues Encountered When Handling CSV Files")
    st.write("""
- **Incorrect Delimiters**: Files may use delimiters other than commas, e.g., semicolons or tabs.
- **Encoding Problems**: Files with different encodings (e.g., UTF-8, ISO-8859-1) may cause errors.
- **Missing or Corrupted Data**: Blank fields or inconsistencies in data.
- **Header Issues**: Missing headers or extra/unexpected columns.
- **Large File Sizes**: Memory limitations when processing large datasets.
""")

    # Part (d) How to Overcome These Issues
    # The loop body in the last sample is indented so the displayed snippet is
    # valid Python when a reader copies it.
    st.subheader("How to Overcome These Issues?")
    st.write("""
- **Incorrect Delimiters**: Specify the correct delimiter when reading:
```python
df = pd.read_csv("file.csv", sep=";")
```
- **Encoding Problems**: Specify the encoding explicitly:
```python
df = pd.read_csv("file.csv", encoding="utf-8")
```
- **Missing or Corrupted Data**: Handle missing values using pandas:
```python
df.fillna("NA", inplace=True)
```
- **Header Issues**: Assign custom headers or skip problematic rows:
```python
df = pd.read_csv("file.csv", header=None)
df.columns = ["Column1", "Column2", "Column3"]
```
- **Large Files**: Use chunk processing for large files:
```python
chunks = pd.read_csv("file.csv", chunksize=1000)
for chunk in chunks:
    process(chunk)
```
""")

    # Downloadable guide.
    # The download button is rendered directly instead of under `st.button`:
    # a plain button is only True for the single rerun after its click, so a
    # widget nested beneath it disappears as soon as the user interacts with it.
    st.markdown("### Download Coding Guide:")
    file_path = "CSV_guide.ipynb"  # Must exist in the app's working directory
    try:
        with open(file_path, "rb") as file:
            guide_bytes = file.read()
    except FileNotFoundError:
        # Keep the page usable when the notebook was not deployed with the app.
        st.warning(f"{file_path} is not available for download.")
    else:
        st.download_button(
            label="Download CSV Guide",
            data=guide_bytes,
            file_name="CSV_guide.ipynb",
            mime="application/octet-stream",
        )
|
149 |
+
|
150 |
+
# XML Format Content
# NOTE(review): the surrounding diff context suggests this section lives inside
# the `data_type == "Structured"` branch -- re-indent to match when applying.
# The section is guarded by the radio selection so it is shown only when "XML"
# is chosen, like the Excel and CSV sections; the page-level st.title() call
# is replaced by a section heading in the same style as those sections.
if format_selected == "XML":
    st.write("#### XML Format")

    # XML Explanation Sections
    st.write("#### a. What is XML?")
    st.write("""
XML (eXtensible Markup Language) is a markup language designed to store and transport data.
It uses a hierarchical structure and tags, making it both human-readable and machine-readable.
""")

    st.write("#### b. How to Read XML Files")
    st.code("""
import xml.etree.ElementTree as ET

# Parse an XML file
tree = ET.parse("file.xml")
root = tree.getroot()

# Access elements
for child in root:
    print(child.tag, child.text)
""", language="python")

    st.write("#### c. Issues Encountered When Handling XML Files")
    st.write("""
1. **Complex Structures:** XML files may have deeply nested hierarchies.
2. **Large File Sizes:** Memory-intensive parsing for large files.
3. **Data Inconsistency:** Missing or unexpected tags may cause parsing errors.
4. **Encoding Issues:** Files with non-standard encodings can fail to parse.
""")

    st.write("#### d. How to Overcome These Issues")
    st.code("""
from lxml import etree

# Handle large XML files using event-driven parsing
for event, element in etree.iterparse("large_file.xml", events=("end",)):
    print(element.tag, element.text)
    element.clear()
""", language="python")

    # Downloadable guide.
    # The download button is rendered directly instead of under `st.button`:
    # a plain button is only True for the single rerun after its click, so a
    # widget nested beneath it disappears as soon as the user interacts with it.
    st.markdown("### Download Coding Guide")
    file_path = "XML_guide.ipynb"  # Must exist in the app's working directory
    try:
        with open(file_path, "rb") as file:
            guide_bytes = file.read()
    except FileNotFoundError:
        # Keep the page usable when the notebook was not deployed with the app.
        st.warning(f"{file_path} is not available for download.")
    else:
        st.download_button(
            label="Download XML Guide",
            data=guide_bytes,
            file_name="XML_guide.ipynb",
            mime="application/octet-stream",
        )
|
202 |
+
|
203 |
+
|
204 |
# If the user selects Semi-Structured Data
|
205 |
elif data_type == "Semi-Structured":
|
206 |
st.write("### Semi-Structured Data")
|