File size: 815 Bytes
e2e35eb
 
 
 
f214d73
e2e35eb
 
08879de
e2e35eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f214d73
e2e35eb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import math
from pathlib import Path
from typing import Dict, List

import spacy
from spacy import Language

# Shared spaCy pipeline, loaded once when this module is imported.
# NOTE(review): "hu_core_news_trf" is presumably the Hungarian transformer
# model package — confirm it is installed wherever this module is imported.
NLP: Language = spacy.load("hu_core_news_trf")


def _compute_idf(freq_file: Path, reference_word: str = "a") -> Dict[str, float]:
    """Build an IDF-like score table from a word-frequency list.

    Each non-blank line of *freq_file* is split on whitespace; the first
    token is taken as the word and the last token as its integer count.
    Every word is then scored as ``log2(ref / (count + 1)) + 1``, where
    ``ref`` is the count recorded for *reference_word*.

    Args:
        freq_file: Path to the frequency list. It is decoded as UTF-8 so the
            result does not depend on the platform's default encoding.
            NOTE(review): assumes the list is stored as UTF-8 — confirm.
        reference_word: Word whose count is used as the numerator of every
            score. Defaults to ``"a"``.

    Returns:
        Mapping from word to score. If a word appears on several lines, the
        count from the last such line is used.

    Raises:
        KeyError: If *reference_word* does not occur in *freq_file*.
        ValueError: If the last token of a non-blank line is not an integer.
    """
    counts: Dict[str, int] = {}
    with freq_file.open(encoding="utf-8") as handle:
        for raw_line in handle:
            # split() with no arguments already ignores leading/trailing
            # whitespace, so blank lines yield an empty list.
            fields: List[str] = raw_line.split()
            if not fields:
                continue
            # Every parsed entry is kept. A `not line.isalpha()` guard would
            # be a no-op here, because the line always contains the numeric
            # count token.
            # NOTE(review): if only alphabetic words are wanted, filter on
            # `fields[0].isalpha()` — confirm intent first, since that
            # changes which keys the table contains.
            counts[fields[0]] = int(fields[-1])

    try:
        reference_count: int = counts[reference_word]
    except KeyError:
        raise KeyError(
            f"reference word {reference_word!r} not found in {freq_file}"
        ) from None

    # The +1 in the denominator means the reference word itself scores
    # slightly below 1 rather than exactly 1.
    return {
        word: math.log2(reference_count / (float(count) + 1)) + 1
        for word, count in counts.items()
    }


# Word -> score table, computed once at import time from
# "resources/freq.list" under the directory one level above this file's
# own directory.
IDF: Dict[str, float] = _compute_idf(Path(__file__).parent.parent / "resources" / "freq.list")