# -*- coding: utf-8 -*-
"""
Created on Tue Jun 14 23:53:47 2022
@author: UTKARSH
"""
import numpy as np
import re
import glob
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
try:
    from src.clean import clean_license_text
except ImportError:
    from clean import clean_license_text
nlp = spacy.load("en_core_web_sm")  # used only by the optional lemmatization step below
TOP_N_WORDS = 100

def tfidf_preprocess(text):
    """
    Cleans text by lowercasing it and replacing every run of non-letter
    characters (digits, punctuation, etc.) with a single space.

    Parameters
    ----------
    text : str
        Raw license text.

    Returns
    -------
    str
        Cleaned and lowercased license text.
    """
    text = text.lower()
    # Replace runs of non-letter characters with a single space
    text = re.sub("[^a-zA-Z]+", " ", text)
    # Optional lemmatization step (uses the spaCy model loaded above)
    # text = " ".join([token.lemma_.lower().strip() for token in nlp(text)])
    return text.strip()
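
# Sanity check on a hypothetical snippet: only letter sequences survive,
# lowercased and whitespace-normalized.
assert tfidf_preprocess("GPL-3.0, (c) 2022!") == "gpl c"
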
corpus = list()
filepaths = glob.glob("../notebooks/licenses/NOASSERTION/*.txt")
# filepaths.extend(glob.glob("../notebooks/licenses/OTHER/*.txt"))
for file_path in filepaths:
    with open(file_path, "r", encoding="utf-8") as f:
        # Skip the URL and blank line at the start of each file
        f.readline()
        f.readline()
        # Read the remaining content
        content = f.read()
    cleaned_license_text, _ = clean_license_text(content)
    corpus.append(cleaned_license_text)
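
# Hypothetical safeguard: scikit-learn raises a ValueError when fitting an
# empty corpus, so fail early if the glob matched no files.
if not corpus:
    raise SystemExit("No license files found to build the corpus.")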

# Note: passing a custom preprocessor overrides scikit-learn's built-in
# lowercasing, so tfidf_preprocess handles lowercasing itself.
vectorizer = TfidfVectorizer(
    lowercase=True,
    preprocessor=tfidf_preprocess,
    stop_words="english"
)
tfidf = vectorizer.fit_transform(corpus)
feature_array = np.array(vectorizer.get_feature_names_out())
# Rank terms by total TF-IDF weight across all documents; flattening the
# argsort of the raw 2-D matrix would mix per-document orderings together.
tfidf_sorting = np.argsort(tfidf.toarray().sum(axis=0))
top_n = feature_array[tfidf_sorting][-TOP_N_WORDS:][::-1]
bottom_n = feature_array[tfidf_sorting][:TOP_N_WORDS]
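
# Hypothetical inspection step: print the ranked vocabulary when the module
# is run directly.
if __name__ == "__main__":
    print(f"Top {TOP_N_WORDS} terms by total TF-IDF: {top_n}")
    print(f"Bottom {TOP_N_WORDS} terms by total TF-IDF: {bottom_n}")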