from my_tokenize import Database
from yeni_tokenize import TokenizerProcessor

class DataPipeline:
    def __init__(self, tokenizer_name='bert-base-uncased', max_length=512):
        self.tokenizer_processor = TokenizerProcessor(tokenizer_name)
        self.max_length = max_length

    def prepare_data(self):
        # Pull the raw input/output text pairs from the database and encode them together.
        input_texts = Database.get_input_texts()
        output_texts = Database.get_output_texts()
        encoded_data = self.tokenizer_processor.pad_and_truncate_pairs(
            input_texts, output_texts, self.max_length
        )
        return encoded_data

    def tokenize_texts(self, texts):
        # Run each text through the underlying tokenizer.
        return [self.tokenizer_processor.tokenizer(text) for text in texts]

    def encode_texts(self, texts):
        # Encode each text to fixed-length tensors.
        return [self.tokenizer_processor.encode(text, self.max_length) for text in texts]
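
The `my_tokenize` and `yeni_tokenize` modules are not included in the snippet, so the pipeline is not runnable on its own. Below is a minimal sketch of the interfaces the calls above imply, assuming `TokenizerProcessor` wraps a Hugging Face `transformers` tokenizer and `Database` fronts the MongoDB collection; every method body here is an assumption inferred from the call sites, not the original implementation:

# Hypothetical sketch of the two helper modules, inferred from the calls above.
from transformers import AutoTokenizer

class TokenizerProcessor:
    def __init__(self, tokenizer_name):
        # Load a pretrained tokenizer by name.
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def encode(self, text, max_length):
        # Return PyTorch tensors so callers can use .squeeze() on input_ids.
        return self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt',
        )

    def pad_and_truncate_pairs(self, input_texts, output_texts, max_length):
        # Encode input/output text pairs to a fixed length.
        inputs = self.tokenizer(input_texts, padding='max_length', truncation=True,
                                max_length=max_length, return_tensors='pt')
        outputs = self.tokenizer(output_texts, padding='max_length', truncation=True,
                                 max_length=max_length, return_tensors='pt')
        return {'inputs': inputs, 'outputs': outputs}

class Database:
    # Placeholder for the MongoDB-backed queries used in the original code.
    @staticmethod
    def get_input_texts():
        return ["example input text"]

    @staticmethod
    def get_output_texts():
        return ["example output text"]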
# Initialize the tokenizer
pipeline = DataPipeline(tokenizer_name='bert-base-cased', max_length=512)

# Fetch the input texts from MongoDB
input_texts = Database.get_input_texts()

# Tokenize the texts
tokenized_texts = pipeline.tokenize_texts(input_texts)

print("Tokenized Texts:")
for text, tokens in zip(input_texts, tokenized_texts):
    print(f"Original Text: {text}")
    print(f"Tokenized Text: {tokens}")

# Encode the texts
encoded_texts = pipeline.encode_texts(input_texts)

print("Encoded Texts:")
for text, encoded in zip(input_texts, encoded_texts):
    print(f"Original Text: {text}")
    print(f"Encoded Text: {encoded['input_ids'].squeeze().tolist()}")