File size: 12,076 Bytes
3b0cd25 d97c686 a7b3a3f 3b0cd25 276f5d0 a7b3a3f 276f5d0 3b0cd25 276f5d0 3a279ad 30aabc7 3a279ad 30aabc7 3a279ad 30aabc7 3a279ad 30aabc7 276f5d0 7b3350f 30aabc7 31496fa 30aabc7 301d94b d1ebb79 31496fa 301d94b 94fc3e6 301d94b 94fc3e6 301d94b 3a279ad 276f5d0 e345f11 276f5d0 e345f11 276f5d0 31496fa 4a5f717 31496fa 4a5f717 ec3cb98 a589b68 ec3cb98 a589b68 ec3cb98 a589b68 d1ebb79 276f5d0 d1ebb79 4a5f717 276f5d0 4a5f717 9b93bcb 4a5f717 276f5d0 8bf2642 276f5d0 8bf2642 06ce905 b8e1a4e fc152bd b8e1a4e 61d26e7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 |
import streamlit as st
import pandas as pd
import json
import streamlit as st
import pandas as pd
import json
# Introductory section: explain what data is, then ask which family of data
# the reader wants to explore.

# Default for the format picker; the branches further down the script
# overwrite it once a data type has been selected.
format_selected = None

data_definition = (
    "Data refers to information, facts, or statistics that are collected, "
    "stored, and analyzed to derive meaningful insights. It represents raw, "
    "unprocessed values that can be used for decision-making, analysis, and "
    "predictions."
)
data_type_options = ["Structured", "Unstructured", "Semi-Structured"]

st.subheader("**What is Data?**")
st.write(data_definition)

st.subheader("**Types of Data**")
data_type = st.radio(
    "**Select a type of data:**",
    data_type_options,
    index=0,  # "Structured" is pre-selected
)
# Structured Data Section
#
# Tutorial text and code samples are kept in module-level constants so that
# their content starts at column 0 regardless of how deeply the rendering
# code is nested.

_EXCEL_OVERVIEW = """
Excel is a popular file format used for storing structured data in tabular form.
It is commonly used in business and data analysis, and its file extensions include `.xlsx` and `.xls`.
"""

_EXCEL_READ_SNIPPET = """
import pandas as pd
# Read an Excel file
df = pd.read_excel("file.xlsx")
print(df.head())
"""

_EXCEL_ISSUES = """
- **Missing Data**: Some cells may contain empty or null values.
- **Encoding Problems**: Files saved in non-standard formats may have encoding issues.
- **File Corruption**: The file may become unreadable if improperly saved or transferred.
- **Large Files**: Handling very large Excel files may exceed memory limits.
"""

# NOTE(review): current pandas `read_excel` does not appear to accept
# `encoding` or `chunksize` -- confirm this advice before publishing.
_EXCEL_FIXES = """
- **Missing Data**: Use data imputation techniques to fill in missing values.
- **Encoding Problems**: Specify the encoding format when reading the file, e.g., `encoding='utf-8'`.
- **File Corruption**: Use repair tools or convert to a compatible format like CSV.
- **Large Files**: Process the file in chunks using `pandas` or optimize it using external tools.
"""

_CSV_OVERVIEW = """
CSV (Comma-Separated Values) is a plain-text file format used to store tabular data,
where each row corresponds to a record, and fields are separated by commas.
It is widely used for data exchange due to its simplicity and compatibility across systems.
Common file extensions include `.csv`.
"""

_CSV_READ_SNIPPET = """
import pandas as pd
# Reading a CSV file
df = pd.read_csv("file.csv")
print(df.head())
# Reading a CSV file with custom delimiter
df = pd.read_csv("file.csv", sep=";")
"""

_CSV_ISSUES = """
- **Incorrect Delimiters**: Files may use delimiters other than commas, e.g., semicolons or tabs.
- **Encoding Problems**: Files with different encodings (e.g., UTF-8, ISO-8859-1) may cause errors.
- **Missing or Corrupted Data**: Blank fields or inconsistencies in data.
- **Header Issues**: Missing headers or extra/unexpected columns.
- **Large File Sizes**: Memory limitations when processing large datasets.
"""

# The loop body in the last sample is indented so the snippet is valid Python
# when a reader copies it.
_CSV_FIXES = """
- **Incorrect Delimiters**: Specify the correct delimiter when reading:
```python
df = pd.read_csv("file.csv", sep=";")
```
- **Encoding Problems**: Specify the encoding explicitly:
```python
df = pd.read_csv("file.csv", encoding="utf-8")
```
- **Missing or Corrupted Data**: Handle missing values using pandas:
```python
df.fillna("NA", inplace=True)
```
- **Header Issues**: Assign custom headers or skip problematic rows:
```python
df = pd.read_csv("file.csv", header=None)
df.columns = ["Column1", "Column2", "Column3"]
```
- **Large Files**: Use chunk processing for large files:
```python
chunks = pd.read_csv("file.csv", chunksize=1000)
for chunk in chunks:
    process(chunk)
```
"""

_XML_OVERVIEW = """
XML (eXtensible Markup Language) is a markup language designed to store and transport data.
It uses a hierarchical structure and tags, making it both human-readable and machine-readable.
"""

_XML_READ_SNIPPET = """
import xml.etree.ElementTree as ET
# Parse an XML file
tree = ET.parse("file.xml")
root = tree.getroot()
# Access elements
for child in root:
    print(child.tag, child.text)
"""

_XML_ISSUES = """
1. **Complex Structures:** XML files may have deeply nested hierarchies.
2. **Large File Sizes:** Memory-intensive parsing for large files.
3. **Data Inconsistency:** Missing or unexpected tags may cause parsing errors.
4. **Encoding Issues:** Files with non-standard encodings can fail to parse.
"""

_XML_LARGE_FILE_SNIPPET = """
from lxml import etree
# Handle large XML files using event-driven parsing
for event, element in etree.iterparse("large_file.xml", events=("end",)):
    print(element.tag, element.text)
    element.clear()
"""


def _offer_guide_download(label, notebook_name):
    """Render the "Download Coding Guide" heading and a one-click download.

    The download button is rendered directly instead of behind a regular
    ``st.button``: a plain button is only true for the single rerun after
    the click, so nesting the download button inside it forces a second
    click and makes the download button vanish on the next interaction.

    Args:
        label: Text shown on the download button.
        notebook_name: Relative path of the notebook to serve; also used as
            the suggested file name for the download.
    """
    st.markdown("### Download Coding Guide:")
    try:
        with open(notebook_name, "rb") as guide:
            payload = guide.read()
    except OSError:
        # A missing or unreadable notebook should not take the page down.
        st.warning(f"`{notebook_name}` is not available for download right now.")
        return
    st.download_button(
        label=label,
        data=payload,
        file_name=notebook_name,
        mime="application/octet-stream",
    )


def _show_excel_format():
    """Render the Excel tutorial: overview, reading, issues, fixes, guide."""
    st.write("#### Excel Format")
    st.subheader("What is Excel?")
    st.write(_EXCEL_OVERVIEW)
    st.subheader("How to Read Excel Files?")
    st.code(_EXCEL_READ_SNIPPET)
    st.subheader("Common Issues Encountered When Handling Excel Files")
    st.write(_EXCEL_ISSUES)
    st.subheader("How to Overcome These Issues?")
    st.write(_EXCEL_FIXES)
    _offer_guide_download("Download Excel Guide", "Excel_guide.ipynb")


def _show_csv_format():
    """Render the CSV tutorial: overview, reading, issues, fixes, guide."""
    st.write("#### CSV Format")
    st.subheader("What is CSV?")
    st.write(_CSV_OVERVIEW)
    # Part (b) How to Read These Files
    st.subheader("How to Read CSV Files?")
    st.code(_CSV_READ_SNIPPET)
    # Part (c) Issues Encountered
    st.subheader("Common Issues Encountered When Handling CSV Files")
    st.write(_CSV_ISSUES)
    # Part (d) How to Overcome These Issues
    st.subheader("How to Overcome These Issues?")
    st.write(_CSV_FIXES)
    _offer_guide_download("Download CSV Guide", "CSV_guide.ipynb")


def _show_xml_format():
    """Render the XML tutorial: overview, reading, issues, fixes, guide."""
    st.write("#### XML Format")
    st.subheader("What is XML?")
    st.write(_XML_OVERVIEW)
    st.write("#### b. How to Read XML Files")
    st.code(_XML_READ_SNIPPET, language="python")
    st.write("#### c. Issues Encountered When Handling XML Files")
    st.write(_XML_ISSUES)
    st.write("#### d. How to Overcome These Issues")
    st.code(_XML_LARGE_FILE_SNIPPET, language="python")
    _offer_guide_download("Download XML Guide", "XML_guide.ipynb")


if data_type == "Structured":
    st.write("### Structured Data")
    st.write("Structured data is organized in a predefined format, such as rows and columns.")
    format_selected = st.radio(
        "Select a data format to learn more:",
        ["CSV", "Excel", "XML"],
        index=0,
    )
    if format_selected == "Excel":
        _show_excel_format()
    elif format_selected == "CSV":
        _show_csv_format()
    elif format_selected == "XML":
        _show_xml_format()
# Semi-Structured Data Section
#
# Text and code samples are module-level constants so their content starts at
# column 0 independent of the nesting of the code that renders them.

_JSON_OVERVIEW = """
JSON (JavaScript Object Notation) is a lightweight, text-based data format used for data interchange.
"""

# The body of the `with` statement is indented so the sample is valid Python
# when a reader copies it.
_JSON_READ_SNIPPET = """
import json
with open('file.json') as f:
    data = json.load(f)
print(data)
"""

_JSON_FIXES = """
- Use `json_normalize` to flatten nested structures.
- Handle missing or inconsistent data with default values or conditionals.
"""

_HTML_OVERVIEW = """
HTML (HyperText Markup Language) is a standard language used to structure content on web pages.
"""

_HTML_READ_SNIPPET = """
import pandas as pd
df = pd.read_html('file.html')[0] # Reading the first table
print(df.head())
"""

_HTML_FIXES = """
- Specify the correct table index when reading multiple tables.
- Use `BeautifulSoup` to clean improperly formatted HTML.
"""


def _show_json_format():
    """Render the JSON tutorial: overview, reading, issues and fixes."""
    st.write("#### JSON Format")
    st.subheader("What is JSON?")
    st.write(_JSON_OVERVIEW)
    st.subheader("How to Read JSON Files")
    st.code(_JSON_READ_SNIPPET)
    st.subheader("Common Issues with JSON Files")
    st.write("Issues include complex nested structures and inconsistent data.")
    st.subheader("How to Overcome These Issues?")
    st.write(_JSON_FIXES)


def _show_html_format():
    """Render the HTML tutorial: overview, reading, issues and fixes."""
    st.write("#### HTML Format")
    st.subheader("What is HTML?")
    st.write(_HTML_OVERVIEW)
    st.subheader("How to Read HTML Files")
    st.code(_HTML_READ_SNIPPET)
    st.subheader("Common Issues with HTML Files")
    st.write("Issues include multiple tables and improper tags.")
    st.subheader("How to Overcome These Issues?")
    st.write(_HTML_FIXES)


# A standalone `if` is equivalent to the former `elif`: `data_type` holds
# exactly one of the radio options and no branch reassigns it.
if data_type == "Semi-Structured":
    st.write("### Semi-Structured Data")
    st.write("Semi-structured data does not have a predefined format but contains tags or markers to separate data elements.")
    format_selected = st.radio(
        "Choose a format to explore:",
        ["JSON", "HTML"],
        index=0,
    )
    if format_selected == "JSON":
        _show_json_format()
    elif format_selected == "HTML":
        _show_html_format()
# Unstructured Data Section
#
# Markdown text is kept in module-level constants so it starts at column 0.
# Numbered items carry a space after the period ("1. ") and sub-bullets are
# indented under their parent item so Markdown renders them as real lists.

_UNSTRUCTURED_EXAMPLES = """
Examples of Unstructured Data:
1. Text Documents: Emails, PDFs, Word documents.
2. Multimedia: Images, videos, audio recordings.
3. Social Media Content: Posts, comments, likes, and shares.
4. Sensor Data: Logs, IoT device readings without a fixed schema.
"""

_UNSTRUCTURED_TRAITS = """
1. Lack of Predefined Format: The data does not fit into traditional tables or relational databases.
2. High Volume: Unstructured data is often generated in large amounts (e.g., social media, video content).
3. Variety: It comes in diverse forms (e.g., images, free-form text, logs).
4. Complex Analysis: Requires advanced tools like Natural Language Processing (NLP) for text or Computer Vision for images.
"""

_UNSTRUCTURED_WORKFLOW = """
1. Storage Solutions: Use scalable systems like AWS S3, Hadoop, or NoSQL databases.
2. Preprocessing: Convert unstructured data into analyzable formats using:
   - Text data: Tokenization, stemming, and lemmatization.
   - Image data: Feature extraction using tools like OpenCV.
3. Tools for Analysis:
   - Text Analysis: NLP libraries (e.g., spaCy, NLTK).
   - Image/Video Analysis: TensorFlow, PyTorch, or OpenCV.
   - Big Data Processing: Apache Spark, Hadoop.
"""

# (button label, page script) pairs, in display order. Every target is given
# as a page file path including the ".py" suffix; the "Projects" entry
# previously lacked it.
# NOTE(review): assumes the page file is named "pages/Projects.py" -- confirm
# against the app's pages/ directory.
_UNSTRUCTURED_PAGE_LINKS = (
    ("**Image**", "pages/Image.py"),
    (
        "**Basic operations of Image with the help of Open cv**",
        "pages/Basic operations of Image with the help of Open cv.py",
    ),
    ("**How to work on Image**", "pages/How to work on Image.py"),
    ("**How to handle Videos**", "pages/How to handle Videos.py"),
    ("**Transformation**", "pages/Transformation.py"),
    ("**Projects**", "pages/Projects.py"),
)

# A standalone `if` is equivalent to the former `elif`: `data_type` holds
# exactly one of the radio options and no branch reassigns it.
if data_type == "Unstructured":
    st.write("### **Unstructured Data**")
    st.write(
        "Unstructured data refers to information that does not follow a "
        "predefined format or organizational schema. Unlike structured data "
        "(organized in rows and columns, such as in databases), unstructured "
        "data is more flexible but harder to analyze using traditional methods."
    )
    st.write(_UNSTRUCTURED_EXAMPLES)
    st.write("**Characteristics of Unstructured Data:**")
    st.write(_UNSTRUCTURED_TRAITS)
    st.write("**How to Work with Unstructured Data:**")
    st.write(_UNSTRUCTURED_WORKFLOW)

    # One navigation button per related page; clicking a button switches to
    # the page script paired with it.
    for link_label, page_script in _UNSTRUCTURED_PAGE_LINKS:
        if st.button(link_label):
            st.switch_page(page_script)
|