shwetashweta05 committed on
Commit
3a279ad
·
verified ·
1 Parent(s): 3dd3579

Update pages/6.Data Collection.py

Browse files
Files changed (1) hide show
  1. pages/6.Data Collection.py +167 -46
pages/6.Data Collection.py CHANGED
@@ -20,66 +20,187 @@ if data_type == "Structured":
20
  # Now, add the format selection in this section
21
  format_selected = st.radio("Select a data format to learn more:", ["CSV", "Excel", "XML"])
22
 
23
- # CSV Section
24
- if format_selected == "CSV":
25
- st.write("### CSV Format")
26
- st.subheader("What is CSV?")
27
- st.write("CSV (Comma-Separated Values) is a plain-text file format used to store tabular data.")
28
- st.subheader("How to Read CSV Files")
29
- st.code("""
30
- import pandas as pd
31
- df = pd.read_csv('file.csv')
32
- print(df.head())
33
- """)
34
- st.subheader("Common Issues with CSV Files")
35
- st.write("Issues include incorrect delimiters, encoding problems, and missing data.")
36
- st.subheader("How to Overcome These Issues?")
37
  st.write("""
38
- - Use the correct delimiter with `sep=";"` if needed.
39
- - Specify encoding to resolve encoding problems, like `encoding="utf-8"`.
40
  """)
41
 
42
- # Excel Section
43
- elif format_selected == "Excel":
44
- st.write("### Excel Format")
45
- st.subheader("What is Excel?")
46
- st.write("Excel is a widely used file format for structured data. It stores data in tabular form.")
47
- st.subheader("How to Read Excel Files")
48
  st.code("""
49
  import pandas as pd
50
- df = pd.read_excel('file.xlsx')
 
51
  print(df.head())
52
  """)
53
- st.subheader("Common Issues with Excel Files")
54
- st.write("Issues include missing data, encoding problems, and large files.")
55
- st.subheader("How to Overcome These Issues?")
56
  st.write("""
57
- - Handle missing data using imputation techniques.
58
- - Specify encoding with `encoding="utf-8"`.
59
- - Use chunk processing for large files.
 
60
  """)
61
 
62
- # XML Section
63
- elif format_selected == "XML":
64
- st.write("### XML Format")
65
- st.subheader("What is XML?")
66
- st.write("XML is a markup language used to store and transport data in a hierarchical format.")
67
- st.subheader("How to Read XML Files")
68
- st.code("""
69
- import xml.etree.ElementTree as ET
70
- tree = ET.parse('file.xml')
71
- root = tree.getroot()
72
- for child in root:
73
- print(child.tag, child.text)
74
- """)
75
- st.subheader("Common Issues with XML Files")
76
- st.write("Issues include large file sizes and missing or incorrect tags.")
77
  st.subheader("How to Overcome These Issues?")
78
  st.write("""
79
- - Use event-driven parsing for large XML files.
80
- - Ensure tags are properly formed or use `BeautifulSoup` for cleaning.
 
 
81
  """)
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  # If the user selects Semi-Structured Data
84
  elif data_type == "Semi-Structured":
85
  st.write("### Semi-Structured Data")
 
20
  # Now, add the format selection in this section
21
  format_selected = st.radio("Select a data format to learn more:", ["CSV", "Excel", "XML"])
22
 
23
+ # Excel Format Section
24
+ if format_selected == "Excel":
25
+ st.write("#### Excel Format")
26
+
27
+ # Part (a) What it is
28
+ st.subheader("What is Excel?")
 
 
 
 
 
 
 
 
29
  st.write("""
30
+ Excel is a popular file format used for storing structured data in tabular form.
31
+ It is commonly used in business and data analysis, and its file extensions include `.xlsx` and `.xls`.
32
  """)
33
 
34
+ # Part (b) How to read these files
35
+ st.subheader("How to Read Excel Files?")
 
 
 
 
36
  st.code("""
37
  import pandas as pd
38
+ # Read an Excel file
39
+ df = pd.read_excel("file.xlsx")
40
  print(df.head())
41
  """)
42
+
43
+ # Part (c) Issues encountered
44
+ st.subheader("Common Issues Encountered When Handling Excel Files")
45
  st.write("""
46
+ - **Missing Data**: Some cells may contain empty or null values.
47
+ - **Encoding Problems**: Files saved in non-standard formats may have encoding issues.
48
+ - **File Corruption**: The file may become unreadable if improperly saved or transferred.
49
+ - **Large Files**: Handling very large Excel files may exceed memory limits.
50
  """)
51
 
52
+ # Part (d) How to overcome these errors/issues
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  st.subheader("How to Overcome These Issues?")
54
  st.write("""
55
+ - **Missing Data**: Use data imputation techniques to fill in missing values.
56
+ - **Encoding Problems**: `pd.read_excel` has no `encoding` argument; re-save the workbook as `.xlsx`, or export it to CSV and read that with `pd.read_csv("file.csv", encoding="utf-8")`.
57
+ - **File Corruption**: Use repair tools or convert to a compatible format like CSV.
58
+ - **Large Files**: `pd.read_excel` has no `chunksize` option; load only the slice you need with `nrows`/`skiprows`/`usecols` using `pandas`, or optimize the file using external tools.
59
  """)
60
 
61
+ # Downloadable Guide Button
62
+ st.markdown("### Download Coding Guide:")
63
+ if st.button("Download Excel Guide"):
64
+ # Provide a downloadable file
65
+ file_path = "Excel_guide.ipynb" # Ensure this file exists in the app directory
66
+ with open(file_path, "rb") as file:
67
+ st.download_button(
68
+ label="Download Excel Guide",
69
+ data=file,
70
+ file_name="Excel_guide.ipynb",
71
+ mime="application/octet-stream",
72
+ )
73
+
74
+ # CSV Format Content
75
+ if format_selected == "CSV":
76
+ st.write("#### CSV Format")
77
+
78
+ # Part (a) What it is
79
+ st.subheader("What is CSV?")
80
+ st.write("""
81
+ CSV (Comma-Separated Values) is a plain-text file format used to store tabular data,
82
+ where each row corresponds to a record, and fields are separated by commas.
83
+ It is widely used for data exchange due to its simplicity and compatibility across systems.
84
+ Common file extensions include `.csv`.
85
+ """)
86
+
87
+ # Part (b) How to Read These Files
88
+ st.subheader("How to Read CSV Files?")
89
+ st.code("""
90
+ import pandas as pd
91
+ # Reading a CSV file
92
+ df = pd.read_csv("file.csv")
93
+ print(df.head())
94
+
95
+ # Reading a CSV file with custom delimiter
96
+ df = pd.read_csv("file.csv", sep=";")
97
+ """)
98
+
99
+ # Part (c) Issues Encountered
100
+ st.subheader("Common Issues Encountered When Handling CSV Files")
101
+ st.write("""
102
+ - **Incorrect Delimiters**: Files may use delimiters other than commas, e.g., semicolons or tabs.
103
+ - **Encoding Problems**: Files with different encodings (e.g., UTF-8, ISO-8859-1) may cause errors.
104
+ - **Missing or Corrupted Data**: Blank fields or inconsistencies in data.
105
+ - **Header Issues**: Missing headers or extra/unexpected columns.
106
+ - **Large File Sizes**: Memory limitations when processing large datasets.
107
+ """)
108
+
109
+ # Part (d) How to Overcome These Issues
110
+ st.subheader("How to Overcome These Issues?")
111
+ st.write("""
112
+ - **Incorrect Delimiters**: Specify the correct delimiter when reading:
113
+ ```python
114
+ df = pd.read_csv("file.csv", sep=";")
115
+ ```
116
+ - **Encoding Problems**: Specify the encoding explicitly:
117
+ ```python
118
+ df = pd.read_csv("file.csv", encoding="utf-8")
119
+ ```
120
+ - **Missing or Corrupted Data**: Handle missing values using pandas:
121
+ ```python
122
+ df.fillna("NA", inplace=True)
123
+ ```
124
+ - **Header Issues**: Assign custom headers or skip problematic rows:
125
+ ```python
126
+ df = pd.read_csv("file.csv", header=None)
127
+ df.columns = ["Column1", "Column2", "Column3"]
128
+ ```
129
+ - **Large Files**: Use chunk processing for large files:
130
+ ```python
131
+ chunks = pd.read_csv("file.csv", chunksize=1000)
132
+ for chunk in chunks:
133
+ process(chunk)
134
+ ```
135
+ """)
136
+
137
+ # Downloadable Guide Button
138
+ st.markdown("### Download Coding Guide:")
139
+ if st.button("Download CSV Guide"):
140
+ # Provide a downloadable Jupyter Notebook file
141
+ file_path = "CSV_guide.ipynb" # Replace with the actual file path
142
+ with open(file_path, "rb") as file:
143
+ st.download_button(
144
+ label="Download CSV Guide",
145
+ data=file,
146
+ file_name="CSV_guide.ipynb",
147
+ mime="application/octet-stream",
148
+ )
149
+
150
+ # Main Section
151
+ st.title("XML Data Format Guide")
152
+
153
+ # XML Explanation Sections
154
+ st.write("#### a. What is XML?")
155
+ st.write("""
156
+ XML (eXtensible Markup Language) is a markup language designed to store and transport data.
157
+ It uses a hierarchical structure and tags, making it both human-readable and machine-readable.
158
+ """)
159
+
160
+ st.write("#### b. How to Read XML Files")
161
+ st.code("""
162
+ import xml.etree.ElementTree as ET
163
+
164
+ # Parse an XML file
165
+ tree = ET.parse("file.xml")
166
+ root = tree.getroot()
167
+
168
+ # Access elements
169
+ for child in root:
170
+ print(child.tag, child.text)
171
+ """, language="python")
172
+
173
+ st.write("#### c. Issues Encountered When Handling XML Files")
174
+ st.write("""
175
+ 1. **Complex Structures:** XML files may have deeply nested hierarchies.
176
+ 2. **Large File Sizes:** Memory-intensive parsing for large files.
177
+ 3. **Data Inconsistency:** Missing or unexpected tags may cause parsing errors.
178
+ 4. **Encoding Issues:** Files with non-standard encodings can fail to parse.
179
+ """)
180
+
181
+ st.write("#### d. How to Overcome These Issues")
182
+ st.code("""
183
+ from lxml import etree
184
+
185
+ # Handle large XML files using event-driven parsing
186
+ for event, element in etree.iterparse("large_file.xml", events=("end",)):
187
+ print(element.tag, element.text)
188
+ element.clear()
189
+ """, language="python")
190
+
191
+ # Downloadable Guide Button
192
+ st.markdown("### Download Coding Guide")
193
+ if st.button("Download XML Guide"):
194
+ file_path = "XML_guide.ipynb" # Replace with the actual file path
195
+ with open(file_path, "rb") as file:
196
+ st.download_button(
197
+ label="Download XML Guide",
198
+ data=file,
199
+ file_name="XML_guide.ipynb",
200
+ mime="application/octet-stream",
201
+ )
202
+
203
+
204
  # If the user selects Semi-Structured Data
205
  elif data_type == "Semi-Structured":
206
  st.write("### Semi-Structured Data")