"""Count tokens in a .jsonl, .json, or .txt corpus file and report the
totals in a pandas DataFrame."""
import os
import sys
import json

import pandas as pd

# Make the repository root importable so graphgen resolves when this
# script is run directly.
# pylint: disable=wrong-import-position
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_dir)
from graphgen.models import Tokenizer

def count_tokens(file, tokenizer_name, data_frame):
    """Count tokens in `file` with the named tokenizer and return a
    one-row DataFrame (same columns as `data_frame`) holding the totals."""
    if not file or not os.path.exists(file):
        return data_frame

    if file.endswith(".jsonl"):
        # One JSON object per line.
        with open(file, "r", encoding="utf-8") as f:
            data = [json.loads(line) for line in f]
    elif file.endswith(".json"):
        # A JSON array of lists; flatten it into a single list of items.
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)
            data = [item for sublist in data for item in sublist]
    elif file.endswith(".txt"):
        # Plain text: split into fixed 512-character chunks.
        with open(file, "r", encoding="utf-8") as f:
            data = f.read()
            chunks = [data[i : i + 512] for i in range(0, len(data), 512)]
            data = [{"content": chunk} for chunk in chunks]
    else:
        raise ValueError(f"Unsupported file type: {file}")

    tokenizer = Tokenizer(tokenizer_name)

    # Sum token counts across all items: dict items carry their text under
    # the "content" key, plain strings are encoded directly.
    token_count = 0
    for item in data:
        if isinstance(item, dict):
            content = item.get("content", "")
        else:
            content = item
        token_count += len(tokenizer.encode_string(content))

    # Build a single replacement row: the raw token count, the count scaled
    # by a fixed factor of 50, and a placeholder for the last column.
    _update_data = [[str(token_count), str(token_count * 50), "N/A"]]

    try:
        data_frame = pd.DataFrame(_update_data, columns=data_frame.columns)
    except Exception as e:  # pylint: disable=broad-except
        print("[ERROR] DataFrame update failed:", str(e))

    return data_frame
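

if __name__ == "__main__":
    # Minimal usage sketch (an assumption, not part of the original script).
    # The file path, tokenizer name, and column names below are hypothetical
    # placeholders; the real caller supplies a DataFrame whose columns match
    # its own table layout and a tokenizer name its Tokenizer accepts.
    example_df = pd.DataFrame(
        [["0", "0", "N/A"]],
        columns=["Token Count", "Scaled Count", "Status"],
    )
    result = count_tokens("data/corpus.jsonl", "cl100k_base", example_df)
    print(result)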