# -*- coding: utf-8 -*-
"""
Created on Tue Jun 14 23:53:47 2022

@author: UTKARSH
"""

import glob
import re

import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

try:
    from src.clean import clean_license_text
except ImportError:
    from clean import clean_license_text

# Only needed if the optional lemmatization step below is re-enabled.
nlp = spacy.load("en_core_web_sm")

TOP_N_WORDS = 100


def tfidf_preprocess(text):
    """
    Clean text by lowercasing it and removing all special characters.

    Parameters
    ----------
    text : str
        Raw license text.

    Returns
    -------
    str
        Cleaned and lowercased license text.
    """
    text = text.lower()

    # Replace every run of non-letter characters with a single space
    text = re.sub("[^a-zA-Z]+", " ", text)

    # Optionally lemmatize the words
    # text = " ".join([token.lemma_.lower().strip() for token in nlp(text)])

    return text.strip()


corpus = []

filepaths = glob.glob("../notebooks/licenses/NOASSERTION/*.txt")
# filepaths.extend(glob.glob("../notebooks/licenses/OTHER/*.txt"))

for file_path in filepaths:
    with open(file_path, "r", encoding="utf-8") as f:
        # Skip the URL and blank line at the start of each file
        f.readline()
        f.readline()
        # Read the remaining content
        content = f.read()
    cleaned_license_text, _ = clean_license_text(content)
    corpus.append(cleaned_license_text)

vectorizer = TfidfVectorizer(
    lowercase=True, preprocessor=tfidf_preprocess, stop_words="english"
)
tfidf = vectorizer.fit_transform(corpus)

feature_array = vectorizer.get_feature_names_out()

# Aggregate each word's TF-IDF score over the whole corpus, then sort the
# vocabulary by that aggregate score (ascending).
tfidf_scores = np.asarray(tfidf.sum(axis=0)).ravel()
tfidf_sorting = np.argsort(tfidf_scores)

# Highest-scoring words first in top_n; lowest-scoring words in bottom_n.
top_n = feature_array[tfidf_sorting][-TOP_N_WORDS:][::-1]
bottom_n = feature_array[tfidf_sorting][:TOP_N_WORDS]
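
# A minimal usage sketch, not part of the original pipeline: when the module
# is run as a script, print the ranked vocabularies so the TF-IDF ordering
# can be inspected by eye.
if __name__ == "__main__":
    print(f"Top {TOP_N_WORDS} words by aggregate TF-IDF score:")
    print(", ".join(top_n))
    print(f"\nBottom {TOP_N_WORDS} words by aggregate TF-IDF score:")
    print(", ".join(bottom_n))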