TxT360

Sleeping

App Files Files Community

victormiller committed on Oct 1

Commit

2018e3d

•

1 Parent(s): e93fc1a

Update curated.py

Browse files

Files changed (1) hide show

curated.py +50 -0

curated.py CHANGED Viewed

@@ -856,6 +856,55 @@ fig.update_layout(
 # Show the plot
 diff_stacked_bar = fig
 def curated(request):
@@ -992,6 +1041,7 @@ def curated(request):
             plotly2fasthtml(get_chart_new()),
             plotly2fasthtml(stacked_bar),
             plotly2fasthtml(diff_stacked_bar),
             H2("Curated Sources Processing"),
             filtering_process,
             data_preparation_div,

 # Show the plot
 diff_stacked_bar = fig
+# Chart input: 'Filter' holds the four stage labels (x-axis); every other key is a dataset whose i-th value pairs with the i-th label (presumably lines remaining after that stage -- TODO confirm)
+data = {
+    'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
+    'Wikipedia': [61614907, 61614907, 60468491, 60468491],
+    'Freelaw': [75971288, 73690766, 68171834, 68123174],
+    'DM Maths': [112559888, 112559888, 112559888, 112559888],
+    'USPTO': [6880276, 6878964, 6749922, 6749389],
+    'PG19': [28752, 28683, 28682, 28632],
+    'Hackernews': [2064931, 2010802, 2010488, 2003636],
+    'Ubuntu IRC': [37966, 23501, 23468, 23205],
+    'Europarl': [69814, 69814, 69814, 69814],
+    'StackExchange': [23246548, 23246548, 23246352, 23246352],
+    'Arxiv': [1911867, 1869441, 1763840, 1762661],
+    'S2ORC': [12963563, 12963563, 12963563, 12963563],
+    'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
+    'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
+    'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
+    'Phil Papers': [49389, 39175, 39175, 39128]
+}
+# Build a DataFrame from the dict: one 'Filter' label column followed by one numeric column per dataset
+df = pd.DataFrame(data)
+# Start a new empty figure; this rebinds the module-level name fig (diff_stacked_bar was assigned from the previous fig before this point)
+fig = go.Figure()
+# Add one Bar trace per dataset column; columns[1:] skips the leading 'Filter' column, which supplies the x values for every trace
+for dataset in df.columns[1:]:
+    fig.add_trace(go.Bar(
+        name=dataset,
+        x=df['Filter'],
+        y=df[dataset]
+    ))
+# barmode='stack' stacks the dataset bars at each filter label; the rest sets the chart/axis/legend titles and a fixed height=600, width=1000
+fig.update_layout(
+    barmode='stack',
+    title='Stacked Bar Chart of Line Reductions by Filter for Each Dataset',
+    xaxis_title='Filter',
+    yaxis_title='Number of Lines',
+    legend_title='Dataset',
+    height=600,
+    width=1000
+)
+# Nothing is displayed here: the figure is kept under the name diff2_stacked_bar, which curated() passes to plotly2fasthtml
+diff2_stacked_bar = fig
 def curated(request):
             plotly2fasthtml(get_chart_new()),
             plotly2fasthtml(stacked_bar),
             plotly2fasthtml(diff_stacked_bar),
+            plotly2fasthtml(diff2_stacked_bar),
             H2("Curated Sources Processing"),
             filtering_process,
             data_preparation_div,