add sample
app.py CHANGED
@@ -1,15 +1,7 @@
 import streamlit as st
 import requests
-import os
 
-enable_xorbits =
-
-if enable_xorbits:
-    import xorbits
-    xorbits.init()
-    import xorbits.pandas as pd
-else:
-    import pandas as pd
+enable_xorbits = True
 
 st.set_page_config(page_title="Analyzing Text Corpus on Hugging Face", page_icon=":bar_chart:", layout="wide")
 st.sidebar.title('A Tool for Analyzing Text Corpus on Hugging Face')
@@ -25,28 +17,39 @@ st.sidebar.header("Please Paste The HF Dataset Name Here:")
 
 #@st.cache_data
 def load_dataset(j, name, fraction):
+    import os
+
+    if enable_xorbits:
+        import xorbits
+        xorbits.init()
+        import xorbits.pandas as pd
+    else:
+        import pandas as pd
 
-    if not os.path.exists('train.gzip'):
+    if not os.path.exists('%s-train.gzip' % name):
         with st.spinner('Downloading file from remote server'):
             import pandas
             train_urls = [f['url'] for f in j['parquet_files'] if f['config'] == name and f['split'] == 'train']
             train_dataset = pandas.concat([pandas.read_parquet(url, engine='pyarrow') for url in train_urls], ignore_index=True)
-            train_dataset.to_parquet('train.gzip')
+            train_dataset.to_parquet('%s-train.gzip' % name)
 
-    if not os.path.exists('test.gzip'):
+    if not os.path.exists('%s-test.gzip' % name):
         with st.spinner('Downloading file from remote server'):
             import pandas
             test_urls = [f['url'] for f in j['parquet_files'] if f['config'] == name and f['split'] == 'validation']
            test_dataset = pandas.concat([pandas.read_parquet(url, engine='pyarrow') for url in test_urls], ignore_index=True)
-            test_dataset.to_parquet('test.gzip')
+            test_dataset.to_parquet('%s-test.gzip' % name)
 
-    train_dataset = pd.read_parquet('train.gzip', engine='pyarrow')
+    train_dataset = pd.read_parquet('%s-train.gzip' % name, engine='pyarrow')
+    test_dataset = pd.read_parquet('%s-test.gzip' % name, engine='pyarrow')
 
-
+    if enable_xorbits:
+        train_dataset.rebalance()
+        test_dataset.rebalance()
 
     dataset = {
-        "train": train_dataset
-        "test": test_dataset
+        "train": train_dataset.sample(frac=fraction),
+        "test": test_dataset.sample(frac=fraction),
     }
 
     return dataset
@@ -351,9 +354,9 @@ data was heavily used in their benchmark datasets.
 
 def process_data(df):
     minhashes = {}
-    for idx,
+    for idx, text in enumerate(df['text']):
         minhash = MinHash(num_perm=128)
-        for d in ngrams(
+        for d in ngrams(text, 13):
             s = "".join(d).encode('utf-8')
             minhash.update(s)
         minhashes[idx] = minhash
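
For context on what the corrected process_data loop computes, below is a minimal, self-contained sketch of MinHash-based near-duplicate detection over character 13-grams. It assumes the datasketch and nltk packages (which the MinHash and ngrams calls suggest); the find_near_duplicates helper and the 0.85 threshold are illustrative only and are not part of this Space's code.

# Illustrative sketch; datasketch/nltk are assumed, and find_near_duplicates
# is a hypothetical helper, not code taken from this commit.
import pandas as pd
from datasketch import MinHash, MinHashLSH
from nltk.util import ngrams


def process_data(df):
    # One 128-permutation MinHash signature per row, fed with character 13-grams.
    minhashes = {}
    for idx, text in enumerate(df['text']):
        minhash = MinHash(num_perm=128)
        for d in ngrams(text, 13):
            s = "".join(d).encode('utf-8')
            minhash.update(s)
        minhashes[idx] = minhash
    return minhashes


def find_near_duplicates(minhashes, threshold=0.85):
    # Index every signature in an LSH structure, then query it back to collect
    # row pairs whose estimated Jaccard similarity exceeds the threshold.
    lsh = MinHashLSH(threshold=threshold, num_perm=128)
    for idx, m in minhashes.items():
        lsh.insert(str(idx), m)
    near_duplicates = {}
    for idx, m in minhashes.items():
        matches = [int(key) for key in lsh.query(m) if int(key) != idx]
        if matches:
            near_duplicates[idx] = matches
    return near_duplicates


if __name__ == "__main__":
    sample = pd.DataFrame({'text': [
        "the quick brown fox jumps over the lazy dog",
        "the quick brown fox jumps over the lazy dog!",
        "a completely unrelated sentence about parquet files",
    ]})
    # The first two rows share almost all of their 13-grams, so they are reported
    # as near-duplicates of each other; the third row is not.
    print(find_near_duplicates(process_data(sample)))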
|