File size: 3,596 Bytes
2080fde
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# This reads a CMUDict formatted dictionary as a dictionary object
import re
import h2p_parser.format_ph as ph
from . import DATA_PATH


# File name of the default dictionary; DictReader joins it onto DATA_PATH
# when it is created without an explicit filename
_dict_primary = 'cmudict.dict'


def read_dict(filename: str) -> list:
    """Load a dictionary file and return its lines minus ';;;' comment lines.

    Args:
        filename: Path of the dictionary file; it is opened as UTF-8 text.

    Returns:
        The remaining lines in file order, each still carrying its line
        ending.
    """
    with open(filename, encoding='utf-8', mode='r') as handle:
        # Iterating the handle yields the same lines readlines() would give
        return [row for row in handle if not row.startswith(';;;')]


def _detect_dict_format(lines: list) -> str:
    """Return 'DSD' or 'SSD' depending on the delimiter used by the first lines.

    Format 1, 'DSD' (Double Space Delimited):
    - Comment allowed to start with ";;;"
    WORD  W ER1 D

    Format 2, 'SSD' (Single Space Delimited):
    - Comment allowed at end of any line using "#"
    WORD W ER1 D # Comment
    """
    # Only the first 10 lines are inspected; SSD is the default unless a
    # double space shows up in one of them
    for line in lines[:10]:
        if '  ' in line.strip():
            return 'DSD'
    return 'SSD'


def parse_dict(lines: list) -> dict:
    """Parse CMUDict-formatted lines into a word -> pronunciations mapping.

    Args:
        lines: Raw dictionary lines, e.g. as returned by read_dict.

    Returns:
        A dict keyed by lowercased word. Each value is a list of
        pronunciations, where one pronunciation is whatever ph.to_list
        returns for the phoneme part of the line.

        Numbered variants such as "word(2)" are appended to the entry of the
        base word when that entry already exists, and are additionally stored
        under their original "word(2)" key. An un-numbered (or "(0)") entry
        whose word is already present is ignored, so the first one wins.

    Lines that are empty, contain no space, or cannot be split with the
    detected delimiter are skipped.
    """
    parsed_dict = {}
    dict_form = _detect_dict_format(lines)
    # Matches the "(N)" suffix that marks an alternate pronunciation
    variant_pattern = re.compile(r"\((\d+)\)")

    for line in lines:
        line = line.strip()
        # Skip empty lines and lines with no space: without a delimiter there
        # is no phoneme part to split off (an empty string has no space either)
        if ' ' not in line:
            continue

        # Split depending on format
        if dict_form == 'DSD':
            pairs = line.split('  ')
            if len(pairs) < 2:
                # Single-space line inside a double-space file: it cannot be
                # split into word and phonemes, so skip it instead of crashing
                continue
        else:
            # Split on the first space, then drop a trailing "#" comment
            head, _, tail = line.partition(' ')
            pairs = head, tail.split('#')[0]

        word = pairs[0].lower()
        # Wrap in a nested list: one entry per pronunciation
        phonemes = [ph.to_list(pairs[1])]
        word_num = 0
        word_orig = None

        # Detect a numbered variant entry such as "word(2)"
        match = variant_pattern.search(word)
        if match is not None:
            word_orig = word
            # Remove the bracketed part from the word
            word = re.sub(r"\(.*\)", "", word)
            word_num = int(match.group(1))

        if word in parsed_dict:
            # A repeated entry with number 0 adds nothing new
            if word_num == 0:
                continue
            # Append this pronunciation to the existing entry
            parsed_dict[word].extend(phonemes)
            # Keep the variant reachable under its original key as well
            if word_orig is not None:
                parsed_dict[word_orig] = phonemes
        else:
            parsed_dict[word] = phonemes

    return parsed_dict


class DictReader:
    """Load a CMUDict-formatted dictionary into the ``dict`` attribute.

    Args:
        filename: Path of the dictionary file to read. When None, the file
            named by ``_dict_primary`` under ``DATA_PATH`` is read instead.
    """

    def __init__(self, filename=None):
        self.filename = filename
        # Parsed entries, as returned by parse_dict
        self.dict = {}
        if filename is not None:
            self.dict = parse_dict(read_dict(filename))
        else:
            # Hand the joined path straight to read_dict, which opens the file
            # itself. Entering the path with `with` is unnecessary and fails on
            # Python 3.13+, where pathlib.Path is no longer a context manager.
            # NOTE(review): assumes DATA_PATH.joinpath() returns a path-like
            # object accepted by open() — confirm against the package __init__
            default_path = DATA_PATH.joinpath(_dict_primary)
            self.dict = parse_dict(read_dict(default_path))