File size: 618 Bytes
4f084e5
0b7b04f
 
 
a186abb
0125fff
4f084e5
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from datasets import load_dataset

# Stream only the "text" column of the FineWeb "sample-10BT" train split, so
# documents are pulled one at a time while the vectorizer iterates over them.
text_dataset = load_dataset("HuggingFaceFW/fineweb", name="sample-10BT", split="train", streaming=True, columns=['text'])

# ngram_range=(2, 2) with analyzer="word" restricts the vocabulary to word
# bigrams; fit_transform returns a sparse document-by-bigram count matrix.
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), analyzer="word")
co_occurrences = bigram_vectorizer.fit_transform(doc['text'] for doc in text_dataset)
print('Printing sparse matrix:')
print(co_occurrences)
print('Printing dense matrix')
# NOTE: todense() allocates every (document, bigram) cell, zeros included, so
# this print is only practical when the corpus and vocabulary are small.
print(co_occurrences.todense())
# Sum each bigram column directly on the sparse matrix. This yields the same
# totals as summing the dense copy, without building a second dense matrix.
sum_occ = co_occurrences.sum(axis=0)
print('Sum of word-word occurrences:')
print(sum_occ)