Reshinth Adithyan committed on
Commit
4d992f2
2 Parent(s): d1b0126 8e50efb

Merge branch 'main' of https://huggingface.co/spaces/CarperAI/pile-v2-eda

Browse files
Files changed (1) hide show
  1. app.py +12 -11
app.py CHANGED
@@ -14,7 +14,7 @@ else:
14
  contribution_json = "contributors.json"
15
 
16
  contribution_dict = json.load(open(contribution_json,"r"))
17
- IGNORE_LIST = ["Bible","Tanzil",""]
18
 
19
  splits = [split for split in os.listdir(CACHE_DIR) if split not in IGNORE_LIST]
20
 
@@ -44,17 +44,18 @@ def load_page(split):
44
  meta = data["meta"]
45
  with st.expander("Render Content"):
46
  st.write(content)
47
- st.write("### Content:")
48
- st.text(content)
49
- st.write("### Meta:")
50
- st.write(ast.literal_eval(meta))
51
- #Tokenizer related count
52
- tokenized = tokenizer(content, return_length=True)['length'][0]
53
- token_count_metric = st.metric("Token Count(compared to 2048)",value=tokenized,delta=4096-tokenized)
 
54
  #Word related count
55
- split_words = re.findall(r'\w+', content)
56
- word_count_metric = st.metric("Word Count",value=len(split_words))
57
-
58
 
59
 
60
  demo_name = st.sidebar.selectbox("Choose a demo", splits)
 
14
  contribution_json = "contributors.json"
15
 
16
  contribution_dict = json.load(open(contribution_json,"r"))
17
+ IGNORE_LIST = ["Bible","Tanzil","GNOME"]
18
 
19
  splits = [split for split in os.listdir(CACHE_DIR) if split not in IGNORE_LIST]
20
 
 
44
  meta = data["meta"]
45
  with st.expander("Render Content"):
46
  st.write(content)
47
+ with st.expander("Raw Content"):
48
+ st.text(content)
49
+ with st.expander("Metadata and Metrics"):
50
+ st.write("### Meta:")
51
+ st.write(ast.literal_eval(meta))
52
+ # Tokenizer-related count
53
+ tokenized = tokenizer(content, return_length=True)['length'][0]
54
+ token_count_metric = st.metric("Token Count(compared to 2048)",value=tokenized,delta=4096-tokenized)
55
  #Word related count
56
+ split_words = re.findall(r'\w+', content)
57
+ word_count_metric = st.metric("Word Count",value=len(split_words))
58
+
59
 
60
 
61
  demo_name = st.sidebar.selectbox("Choose a demo", splits)