import logging
import pickle
from typing import List, Text

import pandas as pd
from summa import keywords

# from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# import nltk
# nltk.download('stopwords')
# from nltk.corpus import stopwords

try:
    import streamlit as st
except ImportError:
    # Bind the name so the rest of the module can test for Streamlit instead
    # of failing with a NameError when the decorator below is evaluated.
    st = None
    logging.info("Streamlit not installed")


def _cache_with_streamlit(func):
    """Wrap ``func`` in ``st.cache(allow_output_mutation=True)`` if possible.

    When Streamlit could not be imported the function is returned unchanged,
    so the module stays importable and the function simply runs uncached.
    """
    if st is None:
        return func
    return st.cache(allow_output_mutation=True)(func)


def sort_coo(coo_matrix):
    """Return the (column, value) pairs of a COO matrix, highest value first.

    Reference: https://kavita-ganesan.com/python-keyword-extraction/#.Y2-TFHbMJPb

    Parameters
    ----------
    coo_matrix: object exposing parallel ``col`` and ``data`` sequences
        (e.g. the result of calling ``.tocoo()`` on a sparse matrix).

    Returns
    -------
    list of ``(col, value)`` tuples sorted in descending order by value;
    ties on value are broken by the larger column index first.
    """
    pairs = zip(coo_matrix.col, coo_matrix.data)
    return sorted(pairs, key=lambda pair: (pair[1], pair[0]), reverse=True)


def extract_topn_from_vector(feature_names, sorted_items, top_n=10):
    """Map the first ``top_n`` sorted items to their feature names and scores.

    Parameters
    ----------
    feature_names: sequence of words, indexable by the column indices found
        in ``sorted_items``.
    sorted_items: list of ``(index, score)`` tuples as returned by
        ``sort_coo``.
    top_n: number of leading items of ``sorted_items`` to keep.

    Returns
    -------
    dict mapping feature name to its score rounded to 3 decimals, in the
    order of ``sorted_items``. If a feature occurs more than once among the
    kept items, it keeps its first position and takes the last score seen.
    """
    return {
        feature_names[idx]: round(score, 3)
        for idx, score in sorted_items[:top_n]
    }


def _top_tfidf_keywords(documents, vectorizer, tfidfmodel, top_n):
    """Run the vectorizer -> TF-IDF -> top-n pipeline shared by the public API.

    A bare string is wrapped in a list first: ``vectorizer.transform`` is
    given an iterable of documents, and a string would otherwise be iterated
    character by character or rejected by the vectorizer.
    """
    if isinstance(documents, str):
        documents = [documents]
    features = vectorizer.get_feature_names_out()
    tf_idf_vector = tfidfmodel.transform(vectorizer.transform(documents))
    sorted_items = sort_coo(tf_idf_vector.tocoo())
    # Iterating the dict yields its keys, i.e. the keywords in ranked order.
    return list(extract_topn_from_vector(features, sorted_items, top_n))


def tfidf_keyword(textdata: str, vectorizer, tfidfmodel, top_n):
    """Extract the top keywords of ``textdata`` using given TF-IDF models.

    Parameters
    ----------
    textdata: text to extract keywords from. A single string is treated as
        one document; an iterable of strings is passed through unchanged.
    vectorizer: trained count-vectorizer model; must provide
        ``get_feature_names_out()`` and ``transform()``.
    tfidfmodel: TF-IDF transformer model; must provide ``transform()``.
    top_n: number of top keywords to be extracted.

    Returns
    -------
    list of the top extracted keywords, highest TF-IDF score first.
    """
    return _top_tfidf_keywords(textdata, vectorizer, tfidfmodel, top_n)


def keyword_extraction(sdg: int, sdgdata: List[Text], top_n: int = 10):
    """Extract the top keywords of ``sdgdata`` with the stored model of an SDG.

    The vectorizer and TF-IDF model are unpickled from
    ``docStore/sdg<sdg>/vectorizer.pkl`` and
    ``docStore/sdg<sdg>/tfidfmodel.pkl``.

    Parameters
    ----------
    sdg: number of the SDG whose TF-IDF model is to be used.
    sdgdata: text data from which keywords are extracted.
    top_n: number of top keywords to be extracted.

    Returns
    -------
    list of the top extracted keywords, highest TF-IDF score first.

    Raises
    ------
    OSError (e.g. FileNotFoundError) if a model file cannot be opened.
    """
    model_path = "docStore/sdg{}/".format(sdg)
    # NOTE(security): unpickling executes arbitrary code from the file; these
    # model files must only ever come from a trusted source.
    # Context managers make sure the file handles are closed after loading.
    with open(model_path + 'vectorizer.pkl', 'rb') as model_file:
        vectorizer = pickle.load(model_file)
    with open(model_path + 'tfidfmodel.pkl', 'rb') as model_file:
        tfidfmodel = pickle.load(model_file)

    return _top_tfidf_keywords(sdgdata, vectorizer, tfidfmodel, top_n)


@_cache_with_streamlit
def textrank(textdata: Text, ratio: float = 0.1, words: int = 0) -> List[str]:
    """Wrapper around summa's TextRank keyword extraction.

    The number of keywords is limited either by a fixed word count or by a
    ratio.
    Reference: https://github.com/summanlp/textrank/blob/master/summa/keywords.py

    Parameters
    ----------
    textdata: text data to perform TextRank on.
    ratio: float passed to summa as ``ratio`` to limit the number of
        keywords; used when ``words`` is 0 or as the fallback.
    words: number of keywords to be extracted. Takes priority over ``ratio``
        if non-zero. If the extraction with a fixed word count raises, the
        extraction is retried with ``ratio`` instead.

    Returns
    -------
    list of extracted keywords, obtained by splitting summa's
    newline-separated output.
    """
    if words == 0:
        logging.info(
            "Textrank using ratio value = %s, as no words limit given", ratio)
        return keywords.keywords(textdata, ratio=ratio).split("\n")

    try:
        return keywords.keywords(textdata, words=words).split("\n")
    except Exception as err:
        # Deliberate best-effort fallback, but no longer silent and no longer
        # swallowing KeyboardInterrupt/SystemExit like a bare ``except``.
        logging.warning(
            "Textrank with words = %s failed (%s); falling back to ratio = %s",
            words, err, ratio)
        return keywords.keywords(textdata, ratio=ratio).split("\n")