# Source: Hugging Face file view (author: oroszgy).
# Commit 9193ed5 (unverified): "refactor(keyphrases): moving examples around".
# File size at that commit: 815 bytes.
import math
from pathlib import Path
from typing import Dict, List
import spacy
from spacy import Language
# Shared spaCy pipeline, loaded once when this module is imported.
# NOTE(review): the model name suggests a Hungarian transformer pipeline;
# confirm "hu_core_news_trf" is installed wherever this module is imported,
# since spacy.load runs at import time.
NLP: Language = spacy.load("hu_core_news_trf")
def _compute_idf(freq_file: Path) -> Dict[str, float]:
    """Build an IDF-like weight for every word in a frequency list.

    Each non-blank line of *freq_file* is split on whitespace; the first
    token is taken as the word and the last token as its integer frequency.
    The weight of a word is ``log2(max_freq / (freq + 1)) + 1``, where
    ``max_freq`` is the frequency of the reference word ``"a"``.  When the
    list has no ``"a"`` entry, the largest frequency in the list is used
    instead.

    Args:
        freq_file: Path to a whitespace-separated ``word ... frequency``
            file, read as UTF-8.

    Returns:
        Mapping from word to weight; empty when the file has no entries.

    Raises:
        ValueError: If the last token of a non-blank line is not an integer.
    """
    freqs: Dict[str, int] = {}
    # Explicit encoding so the word list is decoded the same way regardless
    # of the platform's locale default.
    with freq_file.open(encoding="utf-8") as lines:
        for line in lines:
            data: List[str] = line.split()
            if not data:
                continue
            # NOTE(review): the original guarded this assignment with
            # `if not line.isalpha()`, which is always true once the last
            # token parsed as an integer, so every entry was kept. If the
            # intent was to keep only alphabetic words (`word.isalpha()`),
            # that filter still needs to be added deliberately.
            freqs[data[0]] = int(data[-1])

    if not freqs:
        return {}

    # "a" is the reference entry; fall back to the largest frequency so a
    # list without it still yields weights instead of a bare KeyError.
    max_freq: int = freqs["a"] if "a" in freqs else max(freqs.values())
    return {
        word: math.log2(max_freq / (freq + 1)) + 1
        for word, freq in freqs.items()
    }
# Word -> weight table, computed once at import time from the frequency list
# at "resources/freq.list" under the parent of this file's directory.
IDF: Dict[str, float] = _compute_idf(
    Path(__file__).parent.parent.joinpath("resources", "freq.list")
)