import math
from pathlib import Path
from typing import Dict, List

import spacy
from spacy import Language

# spaCy pipeline shared by the module; loaded once, at import time.
NLP: Language = spacy.load("hu_core_news_trf")


def _compute_idf(freq_file: Path) -> Dict[str, float]:
    """Build a word -> IDF-like weight table from a frequency list.

    Every non-blank line of ``freq_file`` is split on whitespace; the first
    field is taken as the word and the last field as its integer frequency.
    If a word appears on several lines, the last occurrence wins.

    The weight of a word is ``log2(ref / (freq + 1)) + 1`` where ``ref`` is
    the frequency listed for the word ``"a"`` (presumably the most frequent
    entry of the list -- TODO confirm against the resource file).

    Args:
        freq_file: Path of the whitespace-separated frequency list.

    Returns:
        Mapping from each word in the list to its weight.

    Raises:
        KeyError: If the list contains no entry for ``"a"``.
        ValueError: If the last field of a line is not an integer.
    """
    freqs: Dict[str, int] = {}
    # Explicit encoding so decoding does not depend on the platform locale.
    # NOTE(review): assumes the list is stored as UTF-8 -- confirm.
    with freq_file.open(encoding="utf-8") as lines:
        for line in lines:
            data: List[str] = line.split()
            if not data:
                # Blank (or whitespace-only) line.
                continue
            word: str = data[0]
            freq: int = int(data[-1])
            # NOTE(review): the original guarded this assignment with
            # `if not line.isalpha()`. That tested the whole line, which
            # always contains the digits parsed just above, so the guard was
            # always true and every entry was stored. That behaviour is kept.
            # If only alphabetic words were meant to be kept, filter with
            # `word.isalpha()` instead.
            freqs[word] = freq

    # Reference frequency used to normalise every other entry.
    max_freq: int = freqs["a"]
    # The +1 in the denominator avoids division by zero for zero counts.
    idfs: Dict[str, float] = {
        entry: math.log2(max_freq / (float(count) + 1)) + 1
        for entry, count in freqs.items()
    }
    return idfs


# Word weights computed once at import time from
# <directory of this file>/../resources/freq.list.
IDF: Dict[str, float] = _compute_idf(Path(__file__).parent.parent / "resources" / "freq.list")