File size: 914 Bytes
1552dd9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
from dataPipeline import DataPipeline
from my_tokenize import Database 
from yeni_tokenize import TokenizerProcessor
from transformers import BertTokenizer 

# Set up the tokenization pipeline around a BERT tokenizer.
tokenizer_name = 'bert-base-cased'
pipeline = DataPipeline(tokenizer_name=tokenizer_name, max_length=100)

# Pull the prompt texts out of MongoDB.
input_texts = [record["Prompt"] for record in Database.get_input_texts()]

# Tokenize every text and print each one next to its source.
tokenized_texts = pipeline.tokenize_texts(input_texts)
print("Tokenized Texts:")
for original, token_seq in zip(input_texts, tokenized_texts):
    print(f"Original Text: {original}")
    print(f"Tokenized Text: {token_seq}")

# Encode every text and print the flattened input-id list.
encoded_texts = pipeline.encode_texts(input_texts)
print("Encoded Texts:")
for original, enc in zip(input_texts, encoded_texts):
    print(f"Original Text: {original}")
    print(f"Encoded Text: {enc['input_ids'].squeeze().tolist()}")