Spaces:
Runtime error
Runtime error
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 14 23:53:47 2022

@author: UTKARSH
"""
import glob
import re

import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

# The cleaning helper is importable either as ``src.clean`` or as ``clean``.
# Only a failed import triggers the fallback, so genuine errors raised while
# importing the module are no longer silently swallowed.
try:
    from src.clean import clean_license_text
except ImportError:
    from clean import clean_license_text

# spaCy English pipeline, loaded once at import time.
nlp = spacy.load("en_core_web_sm")

# Number of terms kept in each of the ``top_n`` / ``bottom_n`` rankings.
TOP_N_WORDS = 100
def tfidf_preprocess(text):
    """
    Cleans text by lowercasing it and removing all the special characters.

    Every run of characters that are not ASCII letters (digits, punctuation,
    whitespace, non-ASCII symbols) is collapsed into a single space, and
    leading/trailing whitespace is stripped.

    Parameters
    ----------
    text : str
        Raw license text.

    Returns
    -------
    str
        Cleaned and lowercased license text.
    """
    text = text.lower()

    # Replace each run of non-letter characters with one space.
    text = re.sub(r"[^a-zA-Z]+", " ", text)

    ## Lemmatize the words (currently disabled)
    # text = " ".join([token.lemma_.lower().strip() for token in nlp(text)])

    return text.strip()
# Cleaned license texts, one entry per file found below.
corpus = []

# glob.glob returns matches in arbitrary order; sorting keeps the document
# order of the corpus reproducible between runs and machines.
filepaths = sorted(glob.glob("../notebooks/licenses/NOASSERTION/*.txt"))
# filepaths.extend(glob.glob("../notebooks/licenses/OTHER/*.txt"))

for file_path in filepaths:
    with open(file_path, "r", encoding="utf-8") as f:
        # To eliminate the url and blank line from start of the files
        f.readline()
        f.readline()
        # Reading the remaining content
        content = f.read()

    # clean_license_text returns a pair; only its first element is used here.
    cleaned_license_text, _ = clean_license_text(content)
    corpus.append(cleaned_license_text)
# NOTE(review): per the scikit-learn docs a custom ``preprocessor`` overrides
# the built-in preprocessing stage, so ``lowercase=True`` presumably has no
# effect here; lowercasing is done inside ``tfidf_preprocess`` - confirm.
vectorizer = TfidfVectorizer(
    lowercase=True,
    preprocessor=tfidf_preprocess,
    stop_words="english",
)

# Sparse matrix with one row per document and one column per vocabulary term.
tfidf = vectorizer.fit_transform(corpus)
feature_array = np.array(vectorizer.get_feature_names_out())

# Rank terms by their TF-IDF score summed over the whole corpus.  Sorting the
# dense matrix row by row and flattening the result (the previous approach)
# concatenated per-document rankings, so the "top" slice only described the
# last document(s) and could contain duplicate terms.  Summing on the sparse
# matrix also avoids materialising a dense copy.
term_scores = np.asarray(tfidf.sum(axis=0)).ravel()

# Stable sort so that terms with equal scores keep a deterministic order.
tfidf_sorting = np.argsort(term_scores, kind="stable")

# Highest-scoring terms first.
top_n = feature_array[tfidf_sorting][-TOP_N_WORDS:][::-1]
# Lowest-scoring terms first.
bottom_n = feature_array[tfidf_sorting][:TOP_N_WORDS]