code-completion / archive-misc /ascii_percentage.py

Upload folder using huggingface_hub

a8639ac verified about 2 months ago

1.81 kB

	import os


	def calculate_ascii_percentage(file_path):
	    """Return the percentage of bytes in a file that are ASCII (0-127).

	    The file is opened in binary mode, so the figure is byte-based: every
	    byte of a multi-byte UTF-8 sequence counts as non-ASCII. The file is
	    streamed in fixed-size chunks so large corpora are never held in memory
	    all at once.

	    Args:
	        file_path (str): Path to the file to inspect.

	    Returns:
	        The percentage (0-100) of ASCII bytes, 0.0 for an empty file, or
	        None if the file could not be opened or read. In the latter case
	        the error is printed as "Error: <message>". Only I/O failures
	        (OSError) are handled; other exceptions propagate.
	    """
	    # Deleting every ASCII byte from a chunk leaves exactly the non-ASCII
	    # bytes, so their count is a single C-level pass per chunk.
	    ascii_bytes = bytes(range(128))
	    chunk_size = 1 << 20  # 1 MiB per read

	    total_bytes = 0
	    non_ascii_bytes = 0
	    try:
	        with open(file_path, "rb") as f:
	            # iter() with a sentinel stops at the empty read that marks EOF.
	            for chunk in iter(lambda: f.read(chunk_size), b""):
	                total_bytes += len(chunk)
	                non_ascii_bytes += len(chunk.translate(None, ascii_bytes))
	    except OSError as e:
	        print(f"Error: {e}")
	        return None

	    if total_bytes == 0:
	        # Avoid dividing by zero; an empty file has no ASCII content.
	        return 0.0

	    return ((total_bytes - non_ascii_bytes) / total_bytes) * 100


	# Corpus file analysed by this script; "~" is expanded to the current
	# user's home directory.
	file_path = os.path.expanduser(
	    "~/torch_datasets/github-python/corpus/data/corpus_processed.txt"
	)

	# calculate_ascii_percentage returns None when it fails, so the summary
	# line is printed only when a percentage was actually computed.
	ascii_percentage = calculate_ascii_percentage(file_path)
	if ascii_percentage is not None:
	    print(f"Percentage of ASCII characters: {ascii_percentage:.2f}%")


	def find_unicode_passages(file_path, threshold=0.5, min_length=20):
	    """Print lines that have a high density of non-ASCII characters.

	    Each line is stripped of surrounding whitespace, and both the length
	    check and the density are computed on that stripped text, so the
	    reported density always lies between 0% and 100%. The file is decoded
	    as UTF-8 and undecodable bytes are dropped. For every flagged line the
	    line number, the stripped text, and its density are printed.

	    Args:
	        file_path (str): Path to the input file.
	        threshold (float): Proportion of non-ASCII characters above which a
	            line is flagged.
	        min_length (int): Minimum stripped length of a line to be
	            considered.

	    Returns:
	        None. If the file cannot be opened or read, the error is printed as
	        "Error: <message>". Only I/O failures (OSError) are handled.
	    """
	    try:
	        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
	            for line_num, line in enumerate(f, start=1):
	                text = line.strip()
	                total_chars = len(text)
	                # Empty lines are always skipped: they have no density, and
	                # this prevents a division by zero when min_length <= 0.
	                if total_chars == 0 or total_chars < min_length:
	                    continue

	                # Count on the stripped text so stripped non-ASCII
	                # whitespace (e.g. U+00A0) cannot inflate the ratio.
	                non_ascii_count = sum(1 for c in text if ord(c) >= 128)
	                density = non_ascii_count / total_chars
	                if density > threshold:
	                    print(f"Line {line_num}: {text}")
	                    print(f" -> Non-ASCII Density: {density:.2%}")
	    except OSError as e:
	        print(f"Error: {e}")


	# Example usage: print the corpus lines that are at least 20 characters
	# long (after stripping) and in which more than half of the characters
	# are non-ASCII.
	find_unicode_passages(
	    file_path,
	    threshold=0.5,
	    min_length=20,
	)