# GraphGen/webui/count_tokens.py
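"""Token-counting helper for the GraphGen web UI: loads an uploaded
.jsonl/.json/.txt file, counts its tokens, and writes the statistics back
into a pandas DataFrame."""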
import json
import os
import sys

import pandas as pd

# pylint: disable=wrong-import-position
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_dir)

from graphgen.models import Tokenizer

def count_tokens(file, tokenizer_name, data_frame):
    """Count the tokens in ``file`` with ``tokenizer_name`` and return an updated
    copy of ``data_frame`` holding the statistics; if the file is missing, the
    DataFrame is returned unchanged."""
    if not file or not os.path.exists(file):
        return data_frame

if file.endswith(".jsonl"):
with open(file, "r", encoding='utf-8') as f:
data = [json.loads(line) for line in f]
elif file.endswith(".json"):
with open(file, "r", encoding='utf-8') as f:
data = json.load(f)
data = [item for sublist in data for item in sublist]
elif file.endswith(".txt"):
with open(file, "r", encoding='utf-8') as f:
data = f.read()
chunks = [
data[i:i + 512] for i in range(0, len(data), 512)
]
data = [{"content": chunk} for chunk in chunks]
else:
raise ValueError(f"Unsupported file type: {file}")
    tokenizer = Tokenizer(tokenizer_name)

    # Count tokens
    token_count = 0
    for item in data:
        if isinstance(item, dict):
            content = item.get("content", "")
        else:
            content = item
        token_count += len(tokenizer.encode_string(content))

    # One row for the stats table: the token count, the count scaled by 50,
    # and a placeholder for the last column.
    _update_data = [
        [
            str(token_count),
            str(token_count * 50),
            "N/A",
        ]
    ]

    try:
        new_df = pd.DataFrame(_update_data, columns=data_frame.columns)
        data_frame = new_df
    except Exception as e:  # pylint: disable=broad-except
        print("[ERROR] DataFrame operation failed:", str(e))

    return data_frame
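

if __name__ == "__main__":
    # Minimal manual check, a sketch rather than part of the web UI: the column
    # names below are placeholder assumptions (the real UI labels may differ), and
    # "cl100k_base" is assumed to be a tokenizer name accepted by
    # graphgen.models.Tokenizer.
    _df = pd.DataFrame(
        [["0", "0", "N/A"]],
        columns=["Token Count", "Estimated Usage", "Expected Usage"],
    )
    sample_file = sys.argv[1] if len(sys.argv) > 1 else ""
    print(count_tokens(sample_file, "cl100k_base", _df))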