Spaces:

jeffeux
/

spacy-streamlit-haowenchiang

Runtime error

App Files Community

spacy-streamlit-haowenchiang / jieba / analyse / tfidf.py

jeffeux

Migrate to HF Space

d825710 about 3 years ago

raw

history blame contribute delete

4.31 kB

	# encoding=utf-8
	from __future__ import absolute_import
	import os
	import jieba
	import jieba.posseg
	from operator import itemgetter

	def _get_module_path(path):
	    """Return the normalized location of *path* relative to this module.

	    The module directory is joined onto the current working directory, so
	    a relative ``__file__`` still produces an absolute result; when
	    ``__file__`` is already absolute, ``os.path.join`` discards the
	    working-directory prefix.
	    """
	    module_dir = os.path.dirname(__file__)
	    return os.path.normpath(os.path.join(os.getcwd(), module_dir, path))


	# Re-export jieba's path resolver under a module-local name.
	_get_abs_path = jieba._get_abs_path

	# Path of the IDF table used when the caller does not supply one.
	DEFAULT_IDF = _get_module_path("idf.txt")


	class KeywordExtractor(object):
	    """Base class for keyword extractors that share a stop-word list.

	    ``STOP_WORDS`` holds the default English stop words.
	    ``set_stop_words`` adds to a per-instance ``stop_words`` set; this base
	    class does not create that attribute itself, so it must be assigned on
	    the instance before ``set_stop_words`` is called.
	    """

	    STOP_WORDS = set((
	        "the", "of", "is", "and", "to", "in", "that", "we", "for", "an", "are",
	        "by", "be", "as", "on", "with", "can", "if", "from", "which", "you", "it",
	        "this", "then", "at", "have", "all", "not", "one", "has", "or",
	    ))

	    def set_stop_words(self, stop_words_path):
	        """Add every line of a UTF-8 text file to ``self.stop_words``.

	        Parameter:
	            - stop_words_path: path of the stop-word file; it is resolved
	              through ``_get_abs_path`` before being opened.

	        Raises ``Exception`` when the resolved path is not an existing file.
	        """
	        abs_path = _get_abs_path(stop_words_path)
	        if not os.path.isfile(abs_path):
	            raise Exception("jieba: file does not exist: " + abs_path)
	        # Read as bytes and decode explicitly so the result does not depend
	        # on the platform default encoding; ``with`` closes the handle.
	        with open(abs_path, 'rb') as stop_file:
	            content = stop_file.read().decode('utf-8')
	        for line in content.splitlines():
	            self.stop_words.add(line)

	    def extract_tags(self, *args, **kwargs):
	        """Extract keywords; must be overridden by subclasses.

	        Any positional and keyword arguments are accepted so that calls
	        written against a subclass signature reach this method and fail
	        with ``NotImplementedError`` rather than a ``TypeError``.
	        """
	        raise NotImplementedError


	class IDFLoader(object):
	    """Load an IDF table from a text file and remember which file is loaded.

	    Every non-blank line of the file must be ``<word> <weight>`` separated
	    by a single space, with ``<weight>`` parseable by ``float``.
	    """

	    def __init__(self, idf_path=None):
	        """Create an empty loader and, if *idf_path* is truthy, load it."""
	        self.path = ""
	        self.idf_freq = {}
	        self.median_idf = 0.0
	        if idf_path:
	            self.set_new_path(idf_path)

	    def set_new_path(self, new_idf_path):
	        """Load the IDF table at *new_idf_path* unless it is already loaded.

	        The loader's state is replaced only after the whole file has been
	        read and parsed, so a failed load leaves the previous table and
	        path intact and a later retry with the same path is not skipped.

	        Raises ``IOError``/``OSError`` if the file cannot be opened and
	        ``ValueError`` if a non-blank line is not ``<word> <weight>``.
	        """
	        if self.path == new_idf_path:
	            return
	        # Read as bytes and decode explicitly; ``with`` closes the handle.
	        with open(new_idf_path, 'rb') as idf_file:
	            content = idf_file.read().decode('utf-8')
	        idf_freq = {}
	        for line in content.splitlines():
	            line = line.strip()
	            if not line:
	                # Tolerate blank lines (e.g. a trailing empty line).
	                continue
	            word, freq = line.split(' ')
	            idf_freq[word] = float(freq)
	        if idf_freq:
	            # Upper-middle element of the sorted weights.
	            median_idf = sorted(idf_freq.values())[len(idf_freq) // 2]
	        else:
	            # An empty table has no median; use the constructor's default.
	            median_idf = 0.0
	        self.idf_freq = idf_freq
	        self.median_idf = median_idf
	        self.path = new_idf_path

	    def get_idf(self):
	        """Return ``(idf_freq, median_idf)`` for the currently loaded table."""
	        return self.idf_freq, self.median_idf


	class TFIDF(KeywordExtractor):
	    """Keyword extractor that ranks tokens by term frequency times IDF."""

	    def __init__(self, idf_path=None):
	        """Bind jieba's default tokenizers and load an IDF table.

	        Parameter:
	            - idf_path: IDF file to load; ``DEFAULT_IDF`` is used when this
	              is falsy.
	        """
	        self.tokenizer = jieba.dt
	        self.postokenizer = jieba.posseg.dt
	        self.stop_words = self.STOP_WORDS.copy()
	        loader = IDFLoader(idf_path or DEFAULT_IDF)
	        self.idf_loader = loader
	        self.idf_freq, self.median_idf = loader.get_idf()

	    def set_idf_path(self, idf_path):
	        """Switch to the IDF table stored at *idf_path*.

	        The path is resolved through ``_get_abs_path``; ``Exception`` is
	        raised when the resolved path is not an existing file.
	        """
	        resolved = _get_abs_path(idf_path)
	        if not os.path.isfile(resolved):
	            raise Exception("jieba: file does not exist: " + resolved)
	        loader = self.idf_loader
	        loader.set_new_path(resolved)
	        self.idf_freq, self.median_idf = loader.get_idf()

	    def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=(), withFlag=False):
	        """
	        Extract keywords from sentence using the TF-IDF algorithm.
	        Parameter:
	            - topK: how many top keywords to return; a falsy value
	                    (e.g. `None`) returns every candidate.
	            - withWeight: if True, return a list of (keyword, weight);
	                          if False, return a list of keywords.
	            - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v','nr'].
	                        Tokens whose POS flag is not listed are dropped.
	                        When empty, no POS filtering is done.
	            - withFlag: only takes effect when allowPOS is not empty.
	                        if True, keywords are the token objects produced by
	                        the POS tokenizer (carrying `.word` and `.flag`);
	                        if False, keywords are plain words.
	        """
	        if allowPOS:
	            allowPOS = frozenset(allowPOS)
	            tokens = self.postokenizer.cut(sentence)
	        else:
	            tokens = self.tokenizer.cut(sentence)
	        # True when result keys are POS-tagged token objects, not strings.
	        keep_pairs = allowPOS and withFlag

	        counts = {}
	        for token in tokens:
	            if allowPOS:
	                if token.flag not in allowPOS:
	                    continue
	                if not withFlag:
	                    token = token.word
	            text = token.word if keep_pairs else token
	            # Skip tokens shorter than two characters and stop words.
	            if len(text.strip()) < 2:
	                continue
	            if text.lower() in self.stop_words:
	                continue
	            counts[token] = counts.get(token, 0.0) + 1.0

	        total = sum(counts.values())
	        idf_of = self.idf_freq.get
	        fallback = self.median_idf
	        # Words missing from the IDF table are weighted by the median IDF.
	        weights = dict(
	            (key, count * (idf_of(key.word if keep_pairs else key, fallback) / total))
	            for key, count in counts.items()
	        )

	        # One stable descending sort on weight serves both output shapes.
	        ranked = sorted(weights.items(), key=itemgetter(1), reverse=True)
	        if not withWeight:
	            ranked = [key for key, _ in ranked]
	        if topK:
	            return ranked[:topK]
	        return ranked