Spaces:
Running
Running
victormiller
committed on
Commit
•
7381c06
1
Parent(s):
9fd7ac0
Update curated.py
Browse files- curated.py +21 -5
curated.py
CHANGED
@@ -1041,10 +1041,7 @@ def curated(request):
|
|
1041 |
),
|
1042 |
)
|
1043 |
|
1044 |
-
|
1045 |
-
pipeline. It involves cleaning and transforming raw data into a format that
|
1046 |
-
is suitable for analysis. This process includes handling missing values,
|
1047 |
-
normalizing data, encoding categorical variables, and more.""")
|
1048 |
|
1049 |
preprocessing_steps = pd.DataFrame(
|
1050 |
{
|
@@ -1098,7 +1095,26 @@ def curated(request):
|
|
1098 |
|
1099 |
table_html = preprocessing_steps.to_html(index=False, border=0)
|
1100 |
table_div = Div(NotStr(table_html), style="margin: 40px;")
|
1101 |
-
data_preprocessing_div = Div(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1102 |
|
1103 |
return Div(
|
1104 |
overview,
|
|
|
1041 |
),
|
1042 |
)
|
1043 |
|
1044 |
+
|
|
|
|
|
|
|
1045 |
|
1046 |
preprocessing_steps = pd.DataFrame(
|
1047 |
{
|
|
|
1095 |
|
1096 |
table_html = preprocessing_steps.to_html(index=False, border=0)
|
1097 |
table_div = Div(NotStr(table_html), style="margin: 40px;")
|
1098 |
+
data_preprocessing_div = Div(
|
1099 |
+
H2("Data Preprocessing"),
|
1100 |
+
P("Data preprocessing is a crucial step in the data science pipeline. It involves cleaning and transforming raw data into a format that is suitable for analysis. This process includes handling missing values, normalizing data, encoding categorical variables, and more."),
|
1101 |
+
H3("Language Filter"),
|
1102 |
+
P("The Language Filter removes documents in unwanted languages. This step improves data quality by removing irrelevant documents."),
|
1103 |
+
H3("Minimum Word Count Filter"),
|
1104 |
+
P("The Minimum Word Count Filter sets a threshold for required words within a document. This step filters out low-quality or incomplete documents. However, this step may remove documents that contain valuable information, so a proper analysis is important for each data source."),
|
1105 |
+
H3("Unigram Log Probability"),
|
1106 |
+
P("The Unigram Log Probability Filter calculates the log probability of each unigram to measure the significance of individual words. This step quantifies the importance of individual words but may not capture the semantic meaning of words."),
|
1107 |
+
H2("Data Processing for S2ORC"),
|
1108 |
+
P("The formatting of the S2ORC dataset required special filters to be applied. These filters were not applied to the other data sources."),
|
1109 |
+
H3("Title Abstract Filter"),
|
1110 |
+
P("The Title Abstract Filter extracts information from the title and abstract. This step provides additional information for analysis but may introduce bias in the analysis."),
|
1111 |
+
H3("Majority Language Filter"),
|
1112 |
+
P("The Majority Language Filter identifies the majority language in the dataset. This step displays the distribution of languages in the dataset to enable language-specific analysis and insights."),
|
1113 |
+
H3("Paragraph Count Filter"),
|
1114 |
+
P("The Paragraph Count Filter counts the number of paragraphs in each document. This step helps to analyze the structure and length of documents, which can be a useful heuristic for document complexity."),
|
1115 |
+
H3("Frequency Filter"),
|
1116 |
+
P("The Frequency Filter calculates the frequency of each word in the dataset. This step serves to identify important words and topics in the dataset but may be sensitive to noise and outliers."),
|
1117 |
+
)
|
1118 |
|
1119 |
return Div(
|
1120 |
overview,
|