luminoussg committed on
Commit
e7d39a8
1 Parent(s): e30fc8a

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -0
app.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import tiktoken
3
+ import json
4
+
5
+ # Function to load JSON/JSONL file and count tokens for each entry
6
def count_tokens(json_file, encoding_name):
    """Count tokens for every entry of a JSON or JSONL file.

    Args:
        json_file: Either a path string or an object whose ``name``
            attribute holds the path of the file to read.
        encoding_name: Name of the tiktoken encoding handed to
            ``tiktoken.get_encoding``.

    Returns:
        A list with one dict per entry, each holding the entry's ``'text'``
        and its ``'token_count'``. For dict entries the text is the
        space-joined string values; any other entry is converted with
        ``str``.

    Raises:
        ValueError: If ``json_file`` is ``None``, the encoding name is
            rejected by tiktoken, or the file content is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        OSError: If the file cannot be opened.
    """
    if json_file is None:
        raise ValueError("No file provided")

    encoding = tiktoken.get_encoding(encoding_name)

    # Accept a plain path as well as an object carrying the path in `.name`.
    path = json_file if isinstance(json_file, str) else json_file.name

    # "utf-8-sig" reads ordinary UTF-8 and also strips a leading BOM, which
    # json would otherwise reject.
    with open(path, 'r', encoding='utf-8-sig') as f:
        if path.lower().endswith('.jsonl'):
            # Skip blank lines (e.g. a trailing newline) instead of failing
            # to parse them.
            data = [json.loads(line) for line in f if line.strip()]
        else:
            data = json.load(f)

    # A document whose top level is a single object or scalar is one entry;
    # iterating it directly would walk dict keys or string characters.
    if not isinstance(data, list):
        data = [data]

    token_counts = []
    for item in data:
        if isinstance(item, dict):
            text = ' '.join(v for v in item.values() if isinstance(v, str))
        else:
            text = str(item)

        # disallowed_special=() makes special-token strings such as
        # "<|endoftext|>" encode as ordinary text rather than raising.
        num_tokens = len(encoding.encode(text, disallowed_special=()))
        token_counts.append({
            'text': text,
            'token_count': num_tokens,
        })

    return token_counts
28
+
29
+ # Gradio interface
30
def token_counter(json_file, encoding_name):
    """Gradio callback that returns per-entry token counts for an upload.

    Args:
        json_file: The value supplied by the file input; ``None`` when
            nothing has been uploaded.
        encoding_name: The encoding name chosen in the dropdown.

    Returns:
        Whatever ``count_tokens`` returns for the given file and encoding.

    Raises:
        gr.Error: If no file was uploaded, or if reading, parsing, or
            encoding the file fails, so the UI shows a readable message.
    """
    if json_file is None:
        raise gr.Error("Please upload a JSON or JSONL file.")

    try:
        return count_tokens(json_file, encoding_name)
    except (OSError, ValueError) as err:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError
        # subclasses, so malformed files are reported here as well.
        raise gr.Error(f"Could not process the file: {err}") from err
33
+
34
+ # Gradio UI
35
# Assemble the web UI: a file upload plus an encoding selector feed
# token_counter, and its result is rendered as JSON.
encoding_choices = ["r50k_base", "p50k_base", "cl100k_base", "o200k_base"]

file_input = gr.File(label="Upload JSON/JSONL File")
encoding_input = gr.Dropdown(
    encoding_choices,
    label="Select Encoding",
    value="cl100k_base",
)
counts_output = gr.JSON(label="Token Counts")

demo = gr.Interface(
    fn=token_counter,
    inputs=[file_input, encoding_input],
    outputs=counts_output,
)
demo.launch()