Dreamsome committed
Commit 4ff688d · 1 Parent(s): 7351996

add sample

Files changed (1): app.py +22 -19
app.py CHANGED
@@ -1,15 +1,7 @@
 import streamlit as st
 import requests
-import os

-enable_xorbits = False
-
-if enable_xorbits:
-    import xorbits
-    xorbits.init()
-    import xorbits.pandas as pd
-else:
-    import pandas as pd
+enable_xorbits = True

 st.set_page_config(page_title="Analyzing Text Corpus on Hugging Face", page_icon=":bar_chart:", layout="wide")
 st.sidebar.title('A Tool for Analyzing Text Corpus on Hugging Face')
@@ -25,28 +17,39 @@ st.sidebar.header("Please Paste The HF Dataset Name Here:")

 #@st.cache_data
 def load_dataset(j, name, fraction):
+    import os
+
+    if enable_xorbits:
+        import xorbits
+        xorbits.init()
+        import xorbits.pandas as pd
+    else:
+        import pandas as pd

-    if not os.path.exists('train.gzip'):
+    if not os.path.exists('%s-train.gzip' % name):
         with st.spinner('Downloading file from remote server'):
             import pandas
             train_urls = [f['url'] for f in j['parquet_files'] if f['config'] == name and f['split'] == 'train']
             train_dataset = pandas.concat([pandas.read_parquet(url, engine='pyarrow') for url in train_urls], ignore_index=True)
-            train_dataset.to_parquet('train.gzip')
+            train_dataset.to_parquet('%s-train.gzip' % name)

-    if not os.path.exists('test.gzip'):
+    if not os.path.exists('%s-test.gzip' % name):
         with st.spinner('Downloading file from remote server'):
             import pandas
             test_urls = [f['url'] for f in j['parquet_files'] if f['config'] == name and f['split'] == 'validation']
             test_dataset = pandas.concat([pandas.read_parquet(url, engine='pyarrow') for url in test_urls], ignore_index=True)
-            test_dataset.to_parquet('test.gzip')
+            test_dataset.to_parquet('%s-test.gzip' % name)

-    train_dataset = pd.read_parquet('train.gzip', engine='pyarrow')
+    train_dataset = pd.read_parquet('%s-train.gzip' % name, engine='pyarrow')
+    test_dataset = pd.read_parquet('%s-test.gzip' % name, engine='pyarrow')

-    test_dataset = pd.read_parquet('test.gzip', engine='pyarrow')
+    if enable_xorbits:
+        train_dataset.rebalance()
+        test_dataset.rebalance()

     dataset = {
-        "train": train_dataset[:int(len(train_dataset)*fraction)],
-        "test": test_dataset[:int(len(test_dataset)*fraction)],
+        "train": train_dataset.sample(frac=fraction),
+        "test": test_dataset.sample(frac=fraction),
     }

     return dataset
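The hunk above assumes the caller already holds the parquet file listing `j` for the chosen dataset; that part is not shown in this commit. Below is a minimal sketch of how `load_dataset` might be driven, assuming `j` comes from the Hub's datasets-server `/parquet` endpoint (whose `parquet_files` entries carry the `config`, `split`, and `url` fields the function filters on); the dataset and config names are hypothetical placeholders.

import requests

dataset_name = "wikitext"           # hypothetical name pasted into the sidebar
config_name = "wikitext-2-raw-v1"   # hypothetical config; must match f['config'] in the listing

# Fetch the parquet file listing for the dataset (assumed source of `j`).
resp = requests.get(
    "https://datasets-server.huggingface.co/parquet",
    params={"dataset": dataset_name},
)
j = resp.json()  # expected to contain a 'parquet_files' list

# Load a 1% sample of the train/validation splits via the function above.
dataset = load_dataset(j, config_name, fraction=0.01)
print(dataset["train"].shape, dataset["test"].shape)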
 
@@ -351,9 +354,9 @@ data was heavily used in their benchmark datasets.

 def process_data(df):
     minhashes = {}
-    for idx, r in df.iterrows():
+    for idx, text in enumerate(df['text']):
         minhash = MinHash(num_perm=128)
-        for d in ngrams(r['text'], 13):
+        for d in ngrams(text, 13):
             s = "".join(d).encode('utf-8')
             minhash.update(s)
         minhashes[idx] = minhash
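The `process_data` hunk stops at building one MinHash signature per document and does not show how the signatures are consumed. Below is a sketch of a plausible next step, assuming `ngrams` is nltk's utility (as the call signature suggests) and that datasketch's MinHashLSH index is used to flag near-duplicates; the threshold, key naming, and helper function are illustrative, not part of this commit.

from datasketch import MinHash, MinHashLSH
from nltk import ngrams

def find_near_duplicates(df, threshold=0.8, num_perm=128):
    # Build one MinHash per row over character 13-grams, mirroring process_data above.
    minhashes = {}
    for idx, text in enumerate(df['text']):
        m = MinHash(num_perm=num_perm)
        for gram in ngrams(text, 13):
            m.update("".join(gram).encode('utf-8'))
        minhashes[idx] = m

    # Index every signature, then query each one against the rest.
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    for idx, m in minhashes.items():
        lsh.insert(str(idx), m)

    duplicates = {}
    for idx, m in minhashes.items():
        hits = [k for k in lsh.query(m) if k != str(idx)]
        if hits:
            duplicates[idx] = hits
    return duplicates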