aiisc-watermarking-modelv3

Sleeping

App Files Files Community

aiisc-watermarking-modelv3 / lcs.py

jgyasu

Upload folder using huggingface_hub

63b3783 verified 4 months ago

raw

history blame

1.76 kB

	import re
	from nltk.corpus import stopwords

	def find_common_subsequences(sentence, str_list):
	    """Find word n-grams of ``sentence`` that occur in every string of ``str_list``.

	    Every input is lower-cased, stripped of all characters that are neither
	    word characters nor whitespace, and filtered against NLTK's English
	    stop-word list before matching. Candidate n-grams are tried from 5
	    words down to 1. A candidate is skipped when it already appears, as a
	    whole-word sequence, inside a previously accepted n-gram.

	    Args:
	        sentence: Text whose n-grams are the candidates.
	        str_list: Strings that must all contain a candidate for it to be
	            accepted. If it is empty, every candidate is treated as common.

	    Returns:
	        A list of ``(index, ngram)`` tuples ordered by the n-gram's word
	        position in the cleaned sentence; ``index`` counts from 1.
	    """
	    stop_words = set(stopwords.words('english'))

	    def clean(text):
	        # Punctuation is removed before the stop-word filter, then the
	        # surviving words are rejoined with single spaces.
	        text = re.sub(r'[^\w\s]', '', text.lower())
	        return " ".join(word for word in text.split() if word not in stop_words)

	    words = clean(sentence).split()
	    targets = [clean(s) for s in str_list]

	    def occurs_in_all_targets(phrase):
	        pattern = re.compile(r'\b' + re.escape(phrase) + r'\b')
	        return all(pattern.search(target) for target in targets)

	    common_grams = []  # (start word index in cleaned sentence, phrase)
	    # Accepted phrases wrapped in single spaces. Comparing padded strings
	    # makes the containment test match whole words only, so "art" is not
	    # considered covered by an accepted "party games".
	    padded_accepted = set()

	    # Longest n-grams first so that shorter ones nested inside them are skipped.
	    for n in range(5, 0, -1):
	        for i in range(len(words) - n + 1):
	            phrase = " ".join(words[i:i + n])
	            padded = " " + phrase + " "
	            # Cheap containment test first; the regex scan only runs for
	            # phrases not already covered by a longer accepted phrase.
	            if any(padded in longer for longer in padded_accepted):
	                continue
	            if occurs_in_all_targets(phrase):
	                common_grams.append((i, phrase))
	                padded_accepted.add(padded)

	    # Order by first appearance in the cleaned sentence, then number from 1.
	    common_grams.sort(key=lambda item: item[0])
	    return [(rank, phrase) for rank, (_, phrase) in enumerate(common_grams, start=1)]

	# Example usage
	# sentence = "Donald Trump said at a campaign rally event in Wilkes-Barre, Pennsylvania, that there has “never been a more dangerous time since the Holocaust” to be Jewish in the United States."
	# str_list = ['']

	# print(find_common_subsequences(sentence, str_list))