victormiller
committed on
Commit
•
2018e3d
1
Parent(s):
e93fc1a
Update curated.py
Browse files
- curated.py +50 -0
curated.py
CHANGED
@@ -856,6 +856,55 @@ fig.update_layout(
|
|
856 |
# Show the plot
|
857 |
diff_stacked_bar = fig
|
858 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
859 |
|
860 |
def curated(request):
|
861 |
|
@@ -992,6 +1041,7 @@ def curated(request):
|
|
992 |
plotly2fasthtml(get_chart_new()),
|
993 |
plotly2fasthtml(stacked_bar),
|
994 |
plotly2fasthtml(diff_stacked_bar),
|
|
|
995 |
H2("Curated Sources Processing"),
|
996 |
filtering_process,
|
997 |
data_preparation_div,
|
|
|
856 |
# Show the plot
|
857 |
diff_stacked_bar = fig
|
858 |
|
859 |
+
# Data for the stacked bar chart
|
860 |
+
data = {
|
861 |
+
'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
|
862 |
+
'Wikipedia': [61614907, 61614907, 60468491, 60468491],
|
863 |
+
'Freelaw': [75971288, 73690766, 68171834, 68123174],
|
864 |
+
'DM Maths': [112559888, 112559888, 112559888, 112559888],
|
865 |
+
'USPTO': [6880276, 6878964, 6749922, 6749389],
|
866 |
+
'PG19': [28752, 28683, 28682, 28632],
|
867 |
+
'Hackernews': [2064931, 2010802, 2010488, 2003636],
|
868 |
+
'Ubuntu IRC': [37966, 23501, 23468, 23205],
|
869 |
+
'Europarl': [69814, 69814, 69814, 69814],
|
870 |
+
'StackExchange': [23246548, 23246548, 23246352, 23246352],
|
871 |
+
'Arxiv': [1911867, 1869441, 1763840, 1762661],
|
872 |
+
'S2ORC': [12963563, 12963563, 12963563, 12963563],
|
873 |
+
'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
|
874 |
+
'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
|
875 |
+
'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
|
876 |
+
'Phil Papers': [49389, 39175, 39175, 39128]
|
877 |
+
}
|
878 |
+
|
879 |
+
# Creating a dataframe
|
880 |
+
df = pd.DataFrame(data)
|
881 |
+
|
882 |
+
# Creating the stacked bar chart
|
883 |
+
fig = go.Figure()
|
884 |
+
|
885 |
+
# Add trace for each dataset
|
886 |
+
for dataset in df.columns[1:]:
|
887 |
+
fig.add_trace(go.Bar(
|
888 |
+
name=dataset,
|
889 |
+
x=df['Filter'],
|
890 |
+
y=df[dataset]
|
891 |
+
))
|
892 |
+
|
893 |
+
# Update the layout
|
894 |
+
fig.update_layout(
|
895 |
+
barmode='stack',
|
896 |
+
title='Stacked Bar Chart of Line Reductions by Filter for Each Dataset',
|
897 |
+
xaxis_title='Filter',
|
898 |
+
yaxis_title='Number of Lines',
|
899 |
+
legend_title='Dataset',
|
900 |
+
height=600,
|
901 |
+
width=1000
|
902 |
+
)
|
903 |
+
|
904 |
+
# Show the plot
|
905 |
+
diff2_stacked_bar = fig
|
906 |
+
|
907 |
+
|
908 |
|
909 |
def curated(request):
|
910 |
|
|
|
1041 |
plotly2fasthtml(get_chart_new()),
|
1042 |
plotly2fasthtml(stacked_bar),
|
1043 |
plotly2fasthtml(diff_stacked_bar),
|
1044 |
+
plotly2fasthtml(diff2_stacked_bar),
|
1045 |
H2("Curated Sources Processing"),
|
1046 |
filtering_process,
|
1047 |
data_preparation_div,
|