# -*- coding: utf-8 -*-
"""
Created on Tue Jun 14 23:53:47 2022
@author: UTKARSH
"""
import numpy as np
import re
import glob
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
try:
    from src.clean import clean_license_text
except ImportError:
    from clean import clean_license_text
nlp = spacy.load("en_core_web_sm")  # used only by the optional lemmatization step below
TOP_N_WORDS = 100

def tfidf_preprocess(text):
    """
    Cleans text by lowercasing it and replacing every run of non-letter
    characters (digits, punctuation, etc.) with a single space.

    Parameters
    ----------
    text : str
        Raw license text.

    Returns
    -------
    str
        Cleaned and lowercased license text.
    """
    text = text.lower()
    # Replace runs of non-letter characters with a single space
    text = re.sub("[^a-zA-Z]+", " ", text)
    # Optional lemmatization step (uses the spaCy model loaded above)
    # text = " ".join([token.lemma_.lower().strip() for token in nlp(text)])
    return text.strip()
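
# Sanity check on a hypothetical snippet: only letter sequences survive,
# lowercased and whitespace-normalized.
assert tfidf_preprocess("GPL-3.0, (c) 2022!") == "gpl c"
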
corpus = list()
filepaths = glob.glob("../notebooks/licenses/NOASSERTION/*.txt")
# filepaths.extend(glob.glob("../notebooks/licenses/OTHER/*.txt"))
for file_path in filepaths:
    with open(file_path, "r", encoding="utf-8") as f:
        # Skip the URL and blank line at the start of each file
        f.readline()
        f.readline()
        # Read the remaining content
        content = f.read()
    cleaned_license_text, _ = clean_license_text(content)
    corpus.append(cleaned_license_text)
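
# Hypothetical safeguard: scikit-learn raises a ValueError when fitting an
# empty corpus, so fail early if the glob matched no files.
if not corpus:
    raise SystemExit("No license files found to build the corpus.")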

# Note: passing a custom preprocessor overrides scikit-learn's built-in
# lowercasing, so tfidf_preprocess handles lowercasing itself.
vectorizer = TfidfVectorizer(
    lowercase=True,
    preprocessor=tfidf_preprocess,
    stop_words="english"
)
tfidf = vectorizer.fit_transform(corpus)
feature_array = np.array(vectorizer.get_feature_names_out())
# Rank terms by total TF-IDF weight across all documents; flattening the
# argsort of the raw 2-D matrix would mix per-document orderings together.
tfidf_sorting = np.argsort(tfidf.toarray().sum(axis=0))
top_n = feature_array[tfidf_sorting][-TOP_N_WORDS:][::-1]
bottom_n = feature_array[tfidf_sorting][:TOP_N_WORDS]
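
# Hypothetical inspection step: print the ranked vocabulary when the module
# is run directly.
if __name__ == "__main__":
    print(f"Top {TOP_N_WORDS} terms by total TF-IDF: {top_n}")
    print(f"Bottom {TOP_N_WORDS} terms by total TF-IDF: {bottom_n}")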