import json
import os
import sys

import pandas as pd

# Make the repository root importable so graphgen resolves when this
# script runs from a subdirectory.
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_dir)

# pylint: disable=wrong-import-position
from graphgen.models import Tokenizer
def count_tokens(file, tokenizer_name, data_frame):
    """Count tokens in the uploaded file and return an updated stats DataFrame."""
    if not file or not os.path.exists(file):
        return data_frame

    if file.endswith(".jsonl"):
        # JSON Lines: one JSON object per line.
        with open(file, "r", encoding="utf-8") as f:
            data = [json.loads(line) for line in f]
    elif file.endswith(".json"):
        # A JSON array of lists; flatten one level to get the items.
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)
        data = [item for sublist in data for item in sublist]
    elif file.endswith(".txt"):
        # Plain text: split into fixed-size 512-character chunks.
        with open(file, "r", encoding="utf-8") as f:
            data = f.read()
        chunks = [data[i : i + 512] for i in range(0, len(data), 512)]
        data = [{"content": chunk} for chunk in chunks]
    else:
        raise ValueError(f"Unsupported file type: {file}")
    tokenizer = Tokenizer(tokenizer_name)

    # Count tokens across all items; entries may be dicts carrying a
    # "content" field or bare strings.
    token_count = 0
    for item in data:
        if isinstance(item, dict):
            content = item.get("content", "")
        else:
            content = item
        token_count += len(tokenizer.encode_string(content))
    _update_data = [
        [str(token_count), str(token_count * 50), "N/A"]
    ]
    try:
        # Rebuild the stats table with the same columns as the incoming frame.
        new_df = pd.DataFrame(_update_data, columns=data_frame.columns)
        data_frame = new_df
    except Exception as e:  # pylint: disable=broad-except
        print("[ERROR] DataFrame update failed:", str(e))

    return data_frame
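

# --- Usage sketch (illustrative, not part of the original module) ---
# count_tokens is shaped like a UI callback that replaces a stats table.
# The column names, the "cl100k_base" tokenizer name, and input.jsonl
# below are assumptions for demonstration, not values confirmed by this
# file; substitute whatever your deployment actually uses.
if __name__ == "__main__":
    stats = pd.DataFrame(
        [["0", "0", "N/A"]],
        columns=["Token Count", "Estimated Tokens", "Note"],  # hypothetical headers
    )
    # input.jsonl is a hypothetical path; each line holds a JSON object
    # with a "content" field, matching the .jsonl branch above.
    stats = count_tokens("input.jsonl", "cl100k_base", stats)
    print(stats)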