meg HF staff committed on
Commit
2fa1451
·
verified ·
1 Parent(s): 4f084e5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -9
app.py CHANGED
@@ -1,15 +1,26 @@
1
  from sklearn.feature_extraction.text import CountVectorizer
2
  import numpy as np
3
  from datasets import load_dataset
 
 
4
 
5
  text_dataset = load_dataset("HuggingFaceFW/fineweb", name="sample-10BT", split="train", streaming=True, columns=['text'])
6
 
7
- bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), analyzer="word")
8
- co_occurrences = bigram_vectorizer.fit_transform(doc['text'] for doc in text_dataset)
9
- print('Printing sparse matrix:')
10
- print(co_occurrences)
11
- print('Printing dense matrix')
12
- print(co_occurrences.todense())
13
- sum_occ = np.sum(co_occurrences.todense(), axis=0)
14
- print('Sum of word-word occurrences:')
15
- print(sum_occ)
 
 
 
 
 
 
 
 
 
 
1
  from sklearn.feature_extraction.text import CountVectorizer
2
  import numpy as np
3
  from datasets import load_dataset
4
+ import gradio as gr
5
+
6
 
7
  text_dataset = load_dataset("HuggingFaceFW/fineweb", name="sample-10BT", split="train", streaming=True, columns=['text'])
8
 
9
def update(text_dataset):
    """Count word bigrams across a text dataset and return per-bigram totals.

    Args:
        text_dataset: Iterable of records; each record's text is read from
            its ``'text'`` key.

    Returns:
        The result of summing the dense document-by-bigram count matrix
        over axis 0, i.e. one total per bigram across all documents read.
    """
    bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), analyzer="word")
    # A generator is passed so documents are pulled from the dataset one at
    # a time rather than collected into a list first.
    co_occurrences = bigram_vectorizer.fit_transform(
        doc['text'] for doc in text_dataset
    )
    print('Printing sparse matrix:')
    print(co_occurrences)

    # Densify once and reuse the result for both the printout and the sum;
    # the dense matrix has one cell per (document, bigram) pair, so building
    # it a second time is wasted work.
    dense_counts = co_occurrences.todense()
    print('Printing dense matrix')
    print(dense_counts)

    sum_occ = np.sum(dense_counts, axis=0)
    print('Sum of word-word occurrences:')
    print(sum_occ)
    return sum_occ
20
+
21
# Build the UI: a single button that runs the bigram count and shows the
# result as text.
with gr.Blocks() as app:
    gr.Markdown("Click **Run** to start calculating.")
    btn = gr.Button("Run")
    # The output component has to exist before it can be wired to the click
    # event; the original referenced an undefined name here.
    out = gr.Textbox(label="Sum of word-word occurrences")
    # Event inputs must be Gradio components, so the module-level dataset is
    # bound through a closure instead of being passed as `inputs`. The result
    # is stringified so the textbox can display it.
    btn.click(fn=lambda: str(update(text_dataset)), inputs=None, outputs=out)

app.launch()