import json
from pathlib import Path

import pandas as pd
import streamlit as st

# Page introduction: define "data", then let the reader choose a data category.
data_definition = (
    "Data refers to information, facts, or statistics that are collected, stored, "
    "and analyzed to derive meaningful insights. It represents raw, unprocessed "
    "values that can be used for decision-making, analysis, and predictions."
)
st.subheader("**What is Data?**")
st.write(data_definition)

data_type_options = ["Structured", "Unstructured", "Semi-Structured"]
st.subheader("**Types of Data**")
data_type = st.radio("**Select a type of data:**", data_type_options, index=0)

# Default until a category branch further down shows its own format picker.
format_selected = None

def _offer_guide_download(label: str, guide_file: str) -> None:
    """Render a download button for a bundled notebook guide.

    Args:
        label: Text shown on the download button.
        guide_file: Path of the notebook, resolved against the current working
            directory. Its base name is offered as the download file name.

    When the notebook does not exist, a warning is displayed instead of
    raising, so the rest of the page keeps rendering.
    """
    guide_path = Path(guide_file)
    if not guide_path.is_file():
        st.warning(f"{guide_path.name} is not available for download.")
        return
    # Rendered unconditionally: st.button is only True for the single rerun
    # after a click, so a download button nested under it needs two clicks
    # and disappears again on the next interaction.
    st.download_button(
        label=label,
        data=guide_path.read_bytes(),
        file_name=guide_path.name,
        mime="application/octet-stream",
    )


if data_type == "Structured":
    # Structured-data walkthrough: short definition, then one tutorial per format.
    st.write("### Structured Data")
    st.write("Structured data is organized in a predefined format, such as rows and columns.")

    format_selected = st.radio("Select a data format to learn more:", ["CSV", "Excel", "XML"], index=0)

    if format_selected == "Excel":
        st.write("#### Excel Format")
        st.subheader("What is Excel?")
        st.write("""
        Excel is a popular file format used for storing structured data in tabular form.
        It is commonly used in business and data analysis, and its file extensions include `.xlsx` and `.xls`.
        """)
        st.subheader("How to Read Excel Files?")
        st.code("""
        import pandas as pd
        # Read an Excel file
        df = pd.read_excel("file.xlsx")
        print(df.head())
        """)
        st.subheader("Common Issues Encountered When Handling Excel Files")
        st.write("""
        - **Missing Data**: Some cells may contain empty or null values.
        - **Encoding Problems**: Files saved in non-standard formats may have encoding issues.
        - **File Corruption**: The file may become unreadable if improperly saved or transferred.
        - **Large Files**: Handling very large Excel files may exceed memory limits.
        """)
        st.subheader("How to Overcome These Issues?")
        # NOTE(review): current pandas.read_excel does not accept an `encoding`
        # argument; confirm the "Encoding Problems" tip below before publishing.
        st.write("""
        - **Missing Data**: Use data imputation techniques to fill in missing values.
        - **Encoding Problems**: Specify the encoding format when reading the file, e.g., `encoding='utf-8'`.
        - **File Corruption**: Use repair tools or convert to a compatible format like CSV.
        - **Large Files**: Process the file in chunks using `pandas` or optimize it using external tools.
        """)

        st.markdown("### Download Coding Guide:")
        _offer_guide_download("Download Excel Guide", "Excel_guide.ipynb")

    elif format_selected == "CSV":
        st.write("#### CSV Format")
        st.subheader("What is CSV?")
        st.write("""
        CSV (Comma-Separated Values) is a plain-text file format used to store tabular data,
        where each row corresponds to a record, and fields are separated by commas.
        It is widely used for data exchange due to its simplicity and compatibility across systems.
        Common file extensions include `.csv`.
        """)

        st.subheader("How to Read CSV Files?")
        st.code("""
        import pandas as pd
        # Reading a CSV file
        df = pd.read_csv("file.csv")
        print(df.head())

        # Reading a CSV file with custom delimiter
        df = pd.read_csv("file.csv", sep=";")
        """)

        st.subheader("Common Issues Encountered When Handling CSV Files")
        st.write("""
        - **Incorrect Delimiters**: Files may use delimiters other than commas, e.g., semicolons or tabs.
        - **Encoding Problems**: Files with different encodings (e.g., UTF-8, ISO-8859-1) may cause errors.
        - **Missing or Corrupted Data**: Blank fields or inconsistencies in data.
        - **Header Issues**: Missing headers or extra/unexpected columns.
        - **Large File Sizes**: Memory limitations when processing large datasets.
        """)

        st.subheader("How to Overcome These Issues?")
        # Fenced samples are indented under their bullet so Markdown keeps
        # each snippet inside the list item it illustrates.
        st.write("""
        - **Incorrect Delimiters**: Specify the correct delimiter when reading:
          ```python
          df = pd.read_csv("file.csv", sep=";")
          ```
        - **Encoding Problems**: Specify the encoding explicitly:
          ```python
          df = pd.read_csv("file.csv", encoding="utf-8")
          ```
        - **Missing or Corrupted Data**: Handle missing values using pandas:
          ```python
          df.fillna("NA", inplace=True)
          ```
        - **Header Issues**: Assign custom headers or skip problematic rows:
          ```python
          df = pd.read_csv("file.csv", header=None)
          df.columns = ["Column1", "Column2", "Column3"]
          ```
        - **Large Files**: Use chunk processing for large files:
          ```python
          chunks = pd.read_csv("file.csv", chunksize=1000)
          for chunk in chunks:
              process(chunk)
          ```
        """)

        st.markdown("### Download Coding Guide:")
        _offer_guide_download("Download CSV Guide", "CSV_guide.ipynb")

    elif format_selected == "XML":
        st.write("#### XML Format")
        st.subheader("What is XML?")
        st.write("""
        XML (eXtensible Markup Language) is a markup language designed to store and transport data.
        It uses a hierarchical structure and tags, making it both human-readable and machine-readable.
        """)
        st.write("#### b. How to Read XML Files")
        st.code("""
        import xml.etree.ElementTree as ET

        # Parse an XML file
        tree = ET.parse("file.xml")
        root = tree.getroot()

        # Access elements
        for child in root:
            print(child.tag, child.text)
        """, language="python")

        st.write("#### c. Issues Encountered When Handling XML Files")
        st.write("""
        1. **Complex Structures:** XML files may have deeply nested hierarchies.
        2. **Large File Sizes:** Memory-intensive parsing for large files.
        3. **Data Inconsistency:** Missing or unexpected tags may cause parsing errors.
        4. **Encoding Issues:** Files with non-standard encodings can fail to parse.
        """)

        st.write("#### d. How to Overcome These Issues")
        st.code("""
        from lxml import etree

        # Handle large XML files using event-driven parsing
        for event, element in etree.iterparse("large_file.xml", events=("end",)):
            print(element.tag, element.text)
            element.clear()
        """, language="python")

        st.markdown("### Download Coding Guide:")
        _offer_guide_download("Download XML Guide", "XML_guide.ipynb")

elif data_type == "Semi-Structured": |
|
st.write("### Semi-Structured Data") |
|
st.write("Semi-structured data does not have a predefined format but contains tags or markers to separate data elements.") |
|
|
|
format_selected = st.radio("Choose a format to explore:", ["JSON", "HTML"], index=0) |
|
|
|
if format_selected == "JSON": |
|
st.write("#### JSON Format") |
|
st.subheader("What is JSON?") |
|
st.write(""" |
|
JSON (JavaScript Object Notation) is a lightweight, text-based data format used for data interchange. |
|
""") |
|
|
|
st.subheader("How to Read JSON Files") |
|
st.code(""" |
|
import json |
|
with open('file.json') as f: |
|
data = json.load(f) |
|
print(data) |
|
""") |
|
st.subheader("Common Issues with JSON Files") |
|
st.write("Issues include complex nested structures and inconsistent data.") |
|
|
|
st.subheader("How to Overcome These Issues?") |
|
st.write(""" |
|
- Use `json_normalize` to flatten nested structures. |
|
- Handle missing or inconsistent data with default values or conditionals. |
|
""") |
|
|
|
|
|
elif format_selected == "HTML": |
|
st.write("#### HTML Format") |
|
st.subheader("What is HTML?") |
|
st.write(""" |
|
HTML (HyperText Markup Language) is a standard language used to structure content on web pages. |
|
""") |
|
st.subheader("How to Read HTML Files") |
|
st.code(""" |
|
import pandas as pd |
|
df = pd.read_html('file.html')[0] # Reading the first table |
|
print(df.head()) |
|
""") |
|
st.subheader("Common Issues with HTML Files") |
|
st.write("Issues include multiple tables and improper tags.") |
|
st.subheader("How to Overcome These Issues?") |
|
st.write(""" |
|
- Specify the correct table index when reading multiple tables. |
|
- Use `BeautifulSoup` to clean improperly formatted HTML. |
|
""") |
|
|
|
|
|
|
|
elif data_type == "Unstructured": |
|
st.write("### **Unstructured Data**") |
|
st.write("Unstructured data refers to information that does not follow a predefined format or organizational schema. Unlike structured data (organized in rows and columns, such as in databases), unstructured data is more flexible but harder to analyze using traditional methods.") |
|
st.write(""" |
|
Examples of Unstructured Data: |
|
1. Text Documents: Emails, PDFs, Word documents. |
|
2. Multimedia: Images, videos, audio recordings. |
|
3. Social Media Content: Posts, comments, likes, and shares. |
|
4. Sensor Data: Logs, IoT device readings without a fixed schema. |
|
""") |
|
|
|
st.write("**Characteristics of Unstructured Data:**") |
|
st.write(""" |
|
1.Lack of Predefined Format: The data does not fit into traditional tables or relational databases. |
|
2.High Volume: Unstructured data is often generated in large amounts (e.g., social media, video content). |
|
3.Variety: It comes in diverse forms (e.g., images, free-form text, logs). |
|
4.Complex Analysis: Requires advanced tools like Natural Language Processing (NLP) for text or Computer Vision for images. |
|
""") |
|
|
|
st.write("**How to Work with Unstructured Data:**") |
|
st.write(""" |
|
1.Storage Solutions: Use scalable systems like AWS S3, Hadoop, or NoSQL databases. |
|
2.Preprocessing: Convert unstructured data into analyzable formats using: |
|
- Text data: Tokenization, stemming, and lemmatization. |
|
- Image data: Feature extraction using tools like OpenCV. |
|
3.Tools for Analysis: |
|
- Text Analysis: NLP libraries (e.g., spaCy, NLTK). |
|
- Image/Video Analysis: TensorFlow, PyTorch, or OpenCV. |
|
- Big Data Processing: Apache Spark, Hadoop. |
|
""") |
|
|
|
|
|
if st.button("**Image**"): |
|
st.switch_page("pages/Image.py") |
|
|
|
if st.button("**Basic operations of Image with the help of Open cv**"): |
|
st.switch_page("pages/Basic operations of Image with the help of Open cv.py") |
|
|
|
if st.button("**How to work on Image**"): |
|
st.switch_page("pages/How to work on Image.py") |
|
|
|
if st.button("**How to handle Videos**"): |
|
st.switch_page("pages/How to handle Videos.py") |
|
|
|
if st.button("**Transformation**"): |
|
st.switch_page("pages/Transformation.py") |
|
|
|
if st.button("**Projects**"): |
|
st.switch_page("pages/Projects") |
|
|
|
|
|
|