from datasets import load_dataset
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForQuestionAnswering, BertConfig, AutoModelForCausalLM
from pymongo import MongoClient
import torchtext

torchtext.disable_torchtext_deprecation_warning()
from torchtext.data import get_tokenizer
# Local module providing the TokenizerProcessor used by DataPipeline below
from yeni_tokenize import TokenizerProcessor
class Database:
    # MongoDB connection settings

    @staticmethod
    def get_collection(database_name='yeniDatabase', collection_name='test', host='localhost', port=27017):
        """
        MongoDB connection and collection selection
        """
        client = MongoClient(f'mongodb://{host}:{port}/')
        db = client[database_name]
        collection = db[collection_name]
        return collection

    @staticmethod
    def get_mongodb():
        # Returns the MongoDB connection info (URL, database name, collection name)
        return 'mongodb://localhost:27017/', 'yeniDatabase', 'train'
    @staticmethod
    def get_input_texts():
        # Get the MongoDB connection info
        mongo_url, db_name, collection_name = Database.get_mongodb()
        # Connect to MongoDB
        client = MongoClient(mongo_url)
        db = client[db_name]
        collection = db[collection_name]
        # Define the query
        query = {"Prompt": {"$exists": True}}
        # Run the query and fetch the documents
        cursor = collection.find(query, {"Prompt": 1, "_id": 0})
        # Convert the cursor into a plain list of prompt strings and return it
        input_texts_from_db = [doc['Prompt'] for doc in cursor]
        return input_texts_from_db
    @staticmethod
    def get_output_texts():
        # Get the MongoDB connection info
        mongo_url, db_name, collection_name = Database.get_mongodb()
        # Connect to MongoDB
        client = MongoClient(mongo_url)
        db = client[db_name]
        collection = db[collection_name]
        # Define the query
        query = {"Response": {"$exists": True}}
        # Run the query and fetch the documents
        cursor = collection.find(query, {"Response": 1, "_id": 0})
        # Convert the cursor into a plain list of response strings and return it
        output_texts_from_db = [doc['Response'] for doc in cursor]
        return output_texts_from_db
    @staticmethod
    def get_average_prompt_token_length():
        # Get the MongoDB connection info
        mongo_url, db_name, collection_name = Database.get_mongodb()
        # Connect to MongoDB
        client = MongoClient(mongo_url)
        db = client[db_name]
        collection = db[collection_name]
        # Fetch all documents, keeping only the 'Prompt_token_length' field
        docs = collection.find({}, {'Prompt_token_length': 1})
        # Sum and count the 'Prompt_token_length' values
        total_length = 0
        count = 0
        for doc in docs:
            if 'Prompt_token_length' in doc:
                total_length += doc['Prompt_token_length']
                count += 1
        # Compute the average
        average_length = total_length / count if count > 0 else 0
        return int(average_length)
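    # Assumed document shape in the 'train' collection that the helpers above
    # rely on; the field names come from the queries, the values are examples:
    #   {
    #       "Prompt": "input text ...",
    #       "Response": "output text ...",
    #       "Prompt_token_length": 42
    #   }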
# Loading the tokenizer and the model.
# The TokenizerProcessor below is an earlier, disabled local version; the active
# implementation is imported from yeni_tokenize at the top of the file.
"""
class TokenizerProcessor:
    def __init__(self, tokenizer_name='bert-base-uncased'):
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name)

    def tokenize_and_encode(self, input_texts, output_texts, max_length=100):
        encoded = self.tokenizer.batch_encode_plus(
            text_pair=list(zip(input_texts, output_texts)),
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return encoded

    # Leftover example of pairwise encoding and classification:
    paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, return_tensors="pt")
    not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, return_tensors="pt")
    paraphrase_classification_logits = model(**paraphrase)[0]
    not_paraphrase_classification_logits = model(**not_paraphrase)[0]

    def custom_padding(self, input_ids_list, max_length=100, pad_token_id=0):
        padded_inputs = []
        for ids in input_ids_list:
            if len(ids) < max_length:
                padded_ids = ids + [pad_token_id] * (max_length - len(ids))
            else:
                padded_ids = ids[:max_length]
            padded_inputs.append(padded_ids)
        return padded_inputs

    def pad_and_truncate_pairs(self, input_texts, output_texts, max_length=100):
        # Equalize the lengths of the input and output sequences
        inputs = self.tokenizer(input_texts, padding=False, truncation=False, return_tensors=None)
        outputs = self.tokenizer(output_texts, padding=False, truncation=False, return_tensors=None)
        input_ids = self.custom_padding(inputs['input_ids'], max_length, self.tokenizer.pad_token_id)
        output_ids = self.custom_padding(outputs['input_ids'], max_length, self.tokenizer.pad_token_id)
        input_ids_tensor = torch.tensor(input_ids)
        output_ids_tensor = torch.tensor(output_ids)
        input_attention_mask = (input_ids_tensor != self.tokenizer.pad_token_id).long()
        output_attention_mask = (output_ids_tensor != self.tokenizer.pad_token_id).long()
        return {
            'input_ids': input_ids_tensor,
            'input_attention_mask': input_attention_mask,
            'output_ids': output_ids_tensor,
            'output_attention_mask': output_attention_mask
        }
"""
# TODO: the sentences need to be pulled one by one from the input and output data
# def tokenize_and_pad_sequences(sequence_1, sequence_2, ...)
class DataPipeline:
    def __init__(self, tokenizer_name='bert-base-uncased', max_length=100):
        self.tokenizer_processor = TokenizerProcessor(tokenizer_name)
        self.max_length = max_length

    def prepare_data(self):
        # Pull prompts and responses from MongoDB and encode them as padded tensor pairs
        input_texts = Database.get_input_texts()
        output_texts = Database.get_output_texts()
        encoded_data = self.tokenizer_processor.pad_and_truncate_pairs(input_texts, output_texts, self.max_length)
        return encoded_data

    def tokenize_texts(self, texts):
        # Delegate to the underlying tokenizer (assumes TokenizerProcessor exposes it as .tokenizer)
        return [self.tokenizer_processor.tokenizer.tokenize(text) for text in texts]

    def encode_texts(self, texts):
        return [self.tokenizer_processor.tokenizer.encode(text, max_length=self.max_length, truncation=True) for text in texts]
# Example usage
if __name__ == "__main__":
    data_pipeline = DataPipeline()
    encoded_data = data_pipeline.prepare_data()
    print(encoded_data)
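    # Minimal sketch of feeding the encoded pairs to a DataLoader, assuming the
    # imported TokenizerProcessor returns the same keys as the disabled local
    # version above ('input_ids', 'input_attention_mask', 'output_ids',
    # 'output_attention_mask'); the batch size is an arbitrary example value.
    dataset = TensorDataset(
        encoded_data['input_ids'],
        encoded_data['input_attention_mask'],
        encoded_data['output_ids'],
        encoded_data['output_attention_mask'],
    )
    dataloader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=8)
    for input_ids, input_mask, output_ids, output_mask in dataloader:
        # Each batch is a tuple of tensors shaped (batch_size, max_length)
        print(input_ids.shape, output_ids.shape)
        break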