import json
import os
import sys

import pandas as pd

# Make the repository root importable so graphgen resolves when this
# script runs from a subdirectory.
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_dir)

# pylint: disable=wrong-import-position
from graphgen.models import Tokenizer
def count_tokens(file, tokenizer_name, data_frame):
    """Count tokens in the uploaded file and return an updated stats DataFrame."""
    if not file or not os.path.exists(file):
        return data_frame

    if file.endswith(".jsonl"):
        # JSON Lines: one JSON object per line.
        with open(file, "r", encoding="utf-8") as f:
            data = [json.loads(line) for line in f]
    elif file.endswith(".json"):
        # A JSON array of lists; flatten one level to get the items.
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)
        data = [item for sublist in data for item in sublist]
    elif file.endswith(".txt"):
        # Plain text: split into fixed-size 512-character chunks.
        with open(file, "r", encoding="utf-8") as f:
            data = f.read()
        chunks = [data[i : i + 512] for i in range(0, len(data), 512)]
        data = [{"content": chunk} for chunk in chunks]
    else:
        raise ValueError(f"Unsupported file type: {file}")
    tokenizer = Tokenizer(tokenizer_name)

    # Count tokens across all items; entries may be dicts carrying a
    # "content" field or bare strings.
    token_count = 0
    for item in data:
        if isinstance(item, dict):
            content = item.get("content", "")
        else:
            content = item
        token_count += len(tokenizer.encode_string(content))
    _update_data = [
        [str(token_count), str(token_count * 50), "N/A"]
    ]
    try:
        # Rebuild the stats table with the same columns as the incoming frame.
        new_df = pd.DataFrame(_update_data, columns=data_frame.columns)
        data_frame = new_df
    except Exception as e:  # pylint: disable=broad-except
        print("[ERROR] DataFrame update failed:", str(e))

    return data_frame
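

# --- Usage sketch (illustrative, not part of the original module) ---
# count_tokens is shaped like a UI callback that replaces a stats table.
# The column names, the "cl100k_base" tokenizer name, and input.jsonl
# below are assumptions for demonstration, not values confirmed by this
# file; substitute whatever your deployment actually uses.
if __name__ == "__main__":
    stats = pd.DataFrame(
        [["0", "0", "N/A"]],
        columns=["Token Count", "Estimated Tokens", "Note"],  # hypothetical headers
    )
    # input.jsonl is a hypothetical path; each line holds a JSON object
    # with a "content" field, matching the .jsonl branch above.
    stats = count_tokens("input.jsonl", "cl100k_base", stats)
    print(stats)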