luminoussg committed on
Commit
0878173
·
verified ·
1 Parent(s): 1232554

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -15
app.py CHANGED
@@ -2,36 +2,37 @@ import gradio as gr
2
  import tiktoken
3
  import json
4
 
5
- # Function to load JSON/JSONL file and count tokens for each entry
6
  def count_tokens(json_file, encoding_name):
7
  encoding = tiktoken.get_encoding(encoding_name)
8
 
9
  # Load the JSON or JSONL data
10
  with open(json_file.name, 'r') as f:
11
- data = [json.loads(line) for line in f.readlines()] if json_file.name.endswith('.jsonl') else json.load(f)
12
-
13
- # Token counting for each item in the dataset
14
  token_counts = []
15
- for item in data:
16
- if isinstance(item, dict):
17
- text = ' '.join([str(v) for v in item.values() if isinstance(v, str)])
18
- else:
19
- text = str(item)
20
-
21
- num_tokens = len(encoding.encode(text))
 
 
22
  token_counts.append({
23
- 'text': text,
24
- 'token_count': num_tokens
25
  })
26
 
27
  return token_counts
28
 
29
- # Gradio interface
30
  def token_counter(json_file, encoding_name):
31
  token_data = count_tokens(json_file, encoding_name)
32
  return token_data
33
 
34
- # Gradio UI
35
  gr.Interface(
36
  fn=token_counter,
37
  inputs=[
 
2
  import tiktoken
3
  import json
4
 
5
+ # Function to count tokens in the dataset based on the "messages" field
6
  def count_tokens(json_file, encoding_name):
7
  encoding = tiktoken.get_encoding(encoding_name)
8
 
9
  # Load the JSON or JSONL data
10
  with open(json_file.name, 'r') as f:
11
+ data = json.load(f) if json_file.name.endswith('.json') else [json.loads(line) for line in f.readlines()]
12
+
 
13
  token_counts = []
14
+ for entry in data:
15
+ conversation_token_count = 0
16
+ conversation_texts = []
17
+ if "messages" in entry:
18
+ for message in entry["messages"]:
19
+ content = message.get("content", "")
20
+ conversation_texts.append(content)
21
+ conversation_token_count += len(encoding.encode(content))
22
+
23
  token_counts.append({
24
+ 'conversation': ' '.join(conversation_texts),
25
+ 'token_count': conversation_token_count
26
  })
27
 
28
  return token_counts
29
 
30
+ # Gradio interface function
31
  def token_counter(json_file, encoding_name):
32
  token_data = count_tokens(json_file, encoding_name)
33
  return token_data
34
 
35
+ # Gradio UI setup
36
  gr.Interface(
37
  fn=token_counter,
38
  inputs=[