Spaces:

mericberktas
/

asistan

Runtime error

asistan / semantic_chunker.py

Upload folder using huggingface_hub

ae9c08b verified 4 months ago

1.55 kB

	import os
	import nltk
	from transformers import AutoTokenizer

	# Resolve the NLTK data directory portably instead of hard-coding one user's
	# Windows profile: use %APPDATA%/nltk_data when APPDATA is set (this is the
	# directory the original absolute path pointed at on the author's machine),
	# otherwise fall back to ~/nltk_data.
	nltk_data_path = os.path.join(
	    os.environ.get("APPDATA") or os.path.expanduser("~"),
	    "nltk_data",
	)
	# Make sure NLTK searches this directory when loading resources.
	nltk.data.path.append(nltk_data_path)

	# Fetch the punkt sentence-tokenizer data into that directory (per the original
	# author's note, an already-present copy is left untouched).
	# NOTE(review): recent NLTK releases appear to load sentence-tokenizer data from
	# 'punkt_tab' rather than 'punkt' - confirm against the installed NLTK version.
	nltk.download('punkt', download_dir=nltk_data_path)

	# Tokenizer used for counting tokens; pick a model close to the embedding model
	# in use (change if needed).
	tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

	def chunk_text_semantic(text, max_tokens=256):
	    """Split text into sentence-aligned chunks of at most ``max_tokens`` tokens.

	    Sentences produced by ``nltk.sent_tokenize`` are appended greedily to the
	    current chunk for as long as the chunk's token count stays within
	    ``max_tokens``. Token counts come from the module-level ``tokenizer`` with
	    special tokens excluded.

	    Args:
	        text: Text to split. Empty or ``None`` input yields an empty list.
	        max_tokens: Token budget for each chunk.

	    Returns:
	        A list of whitespace-stripped chunk strings in their original order.
	        A sentence that exceeds ``max_tokens`` on its own is not split; it is
	        emitted whole as a separate chunk.
	    """
	    # Nothing to split: avoid calling the sentence tokenizer on empty/None input.
	    if not text:
	        return []

	    chunks = []
	    current_chunk = ""

	    for sent in nltk.sent_tokenize(text):
	        # Tentatively extend the current chunk with this sentence and measure it.
	        candidate = f"{current_chunk} {sent}".strip() if current_chunk else sent
	        token_count = len(tokenizer.encode(candidate, add_special_tokens=False))

	        if token_count <= max_tokens:
	            current_chunk = candidate
	            continue

	        if current_chunk:
	            # Budget exceeded: close the current chunk and start a new one
	            # with the sentence that did not fit.
	            chunks.append(current_chunk.strip())
	            current_chunk = sent
	        else:
	            # Even a single sentence is too long: emit it as-is. The current
	            # chunk is already empty here, so there is nothing to reset.
	            chunks.append(sent.strip())

	    # Flush whatever is left in the last, partially filled chunk.
	    if current_chunk:
	        chunks.append(current_chunk.strip())

	    return chunks

	def add_header_to_chunk(header, chunk):
	    """Prefix a chunk with a bracketed header line (category, intent, etc.).

	    Args:
	        header: Label placed inside square brackets on the first line.
	        chunk: Chunk text placed on the line after the header.

	    Returns:
	        The string ``"[<header>]"`` followed by a newline and ``chunk``.
	    """
	    return f"[{header}]\n{chunk}"