Spaces:

GIZ
/

SDSN-demo

Running on CPU Upgrade

App Files Files Community

SDSN-demo / utils /keyword_extraction.py

prashant

changing logo file

b114d3b almost 2 years ago

raw

history blame

No virus

4.31 kB

	import pandas as pd
	# from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
	# import nltk
	# nltk.download('stopwords')
	# from nltk.corpus import stopwords
	import pickle
	from typing import List, Text
	import logging
	from summa import keywords

	try:
	import streamlit as st
	except ImportError:
	logging.info("Streamlit not installed")


	def sort_coo(coo_matrix):
	    """
	    Rank the stored entries of a coordinate-format (COO) sparse matrix.

	    Every column index is paired with its stored value and the pairs are
	    ordered from the largest value to the smallest; entries with equal
	    values are ordered by the larger column index first.
	    1. https://kavita-ganesan.com/python-keyword-extraction/#.Y2-TFHbMJPb

	    Params
	    ---------
	    coo_matrix: object exposing parallel `col` and `data` sequences, \
	    e.g. a scipy sparse matrix in COO format

	    Return
	    ----------
	    list of (column index, value) tuples in descending order
	    """
	    ranked = list(zip(coo_matrix.col, coo_matrix.data))
	    # Reversing a (column, value) pair gives the (value, column) sort key.
	    ranked.sort(key=lambda pair: pair[::-1], reverse=True)
	    return ranked

	def extract_topn_from_vector(feature_names, sorted_items, top_n=10):
	    """get the feature names and tf-idf score of top n items

	    Every feature name appears at most once in the result. When the same
	    feature index occurs several times in sorted_items (for example once
	    per row of a multi-document matrix), only its first occurrence, i.e.
	    its highest score, is kept, and the scan continues until top_n
	    distinct features have been collected or the items are exhausted.

	    Params
	    ---------
	    feature_names: list of words from vectorizer, indexable by feature index
	    sorted_items: (feature index, score) tuples ordered by descending \
	    score, as returned by sort_coo function defined in keyword_extraction.py
	    top_n: number of distinct words to be extracted using tfidf; a \
	    non-positive value yields an empty result

	    Return
	    ----------
	    results: dict mapping each extracted keyword to its score rounded to \
	    3 decimals, in descending score order

	    """
	    results = {}
	    if top_n <= 0:
	        return results

	    # word index and corresponding tf-idf score
	    for idx, score in sorted_items:
	        feature = feature_names[idx]
	        if feature in results:
	            # Already recorded with a higher (earlier) score; do not let a
	            # lower duplicate overwrite it or consume one of the top_n slots.
	            continue
	        results[feature] = round(score, 3)
	        if len(results) >= top_n:
	            break

	    return results


	def tfidf_keyword(textdata:str, vectorizer, tfidfmodel, top_n):
	    """
	    TFIDF based keywords extraction

	    Params
	    ---------
	    vectorizer: trained count vectorizer model
	    tfidfmodel: TFIDF Transformer model
	    top_n: Top N keywords to be extracted
	    textdata: text data which needs keyword extraction; either a single \
	    string (treated as one document) or an iterable of strings

	    Return
	    ----------
	    keywords: list of top extracted keywords, highest tf-idf score first

	    """
	    # The vectorizer is handed a collection of documents. A bare string is
	    # wrapped so it is processed as one document; scikit-learn vectorizers
	    # expect an iterable of documents and reject a plain str.
	    documents = [textdata] if isinstance(textdata, str) else textdata

	    features = vectorizer.get_feature_names_out()
	    tf_idf_vector = tfidfmodel.transform(vectorizer.transform(documents))
	    sorted_items = sort_coo(tf_idf_vector.tocoo())
	    results = extract_topn_from_vector(features, sorted_items, top_n)
	    # Iterating the dict yields the keywords in insertion (score) order.
	    return list(results)

	def keyword_extraction(sdg:int,sdgdata:List[Text], top_n:int=10):
	    """
	    TFIDF based keywords extraction using the stored models of one SDG

	    Loads the pickled vectorizer and tf-idf model found under
	    "docStore/sdg<sdg>/" and delegates the extraction to tfidf_keyword.

	    Params
	    ---------
	    sdg: which sdg tfidf model to be used
	    sdgdata: text data which needs keyword extraction
	    top_n: Top N keywords to be extracted


	    Return
	    ----------
	    keywords: top extracted keywords

	    """
	    model_path = "docStore/sdg{}/".format(sdg)
	    # NOTE: pickle can execute arbitrary code while loading; these files
	    # must only ever come from a trusted source.
	    # `with` closes each file handle as soon as the model is loaded.
	    with open(model_path+'vectorizer.pkl', 'rb') as vectorizer_file:
	        vectorizer = pickle.load(vectorizer_file)
	    with open(model_path+'tfidfmodel.pkl', 'rb') as tfidf_file:
	        tfidfmodel = pickle.load(tfidf_file)

	    # Same transform / sort / top-n pipeline as tfidf_keyword, so reuse it.
	    return tfidf_keyword(sdgdata, vectorizer, tfidfmodel, top_n)

	def _cache_if_streamlit(func):
	    """Wrap func with st.cache when streamlit was imported, else return it as is."""
	    try:
	        return st.cache(allow_output_mutation=True)(func)
	    except NameError:
	        # The guarded import at the top of this module leaves `st` undefined
	        # when streamlit is not installed; run uncached in that case.
	        return func


	@_cache_if_streamlit
	def textrank(textdata:Text, ratio:float = 0.1, words:int = 0)->List[str]:
	    """
	    wrapper function to perform textrank, uses either ratio or wordcount to
	    extract top keywords limited by words or ratio.
	    1. https://github.com/summanlp/textrank/blob/master/summa/keywords.py

	    Params
	    --------
	    textdata: text data to perform the textrank.
	    ratio: float to limit the number of keywords as proportion of total token \
	    in textdata
	    words: number of keywords to be extracted. Takes priority over ratio if \
	    non zero. However, if the word-limited extraction raises (e.g. fewer \
	    keywords are available than requested), the ratio is used instead.

	    Return
	    --------
	    results: extracted keywords, obtained by splitting the summa output on \
	    newlines
	    """
	    if words == 0:
	        logging.info("Textrank using ratio value = %s, as no words limit given", ratio)
	        return keywords.keywords(textdata, ratio=ratio).split("\n")

	    try:
	        return keywords.keywords(textdata, words=words).split("\n")
	    except Exception:
	        # Deliberate best-effort fallback; `Exception` (not a bare except)
	        # lets KeyboardInterrupt and SystemExit propagate.
	        logging.info("Textrank with words = %s failed, falling back to ratio = %s", words, ratio)
	        return keywords.keywords(textdata, ratio=ratio).split("\n")