File size: 1,290 Bytes
d2ed505
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import pandas as pd
import torch
def preparing_data(text: str, domain: int):
    """Wrap one user text and its domain in a two-row DataFrame.

    A fixed dummy row (``'hello world'`` with domain ``0``) is always placed
    first; per the original author's note, the downstream model cannot run
    inference on a single example, so the dummy row pads the batch.

    Args:
        text (_str_): input text from the user
        domain (_int_): output domain from domain identification pipeline

    Returns:
        _DataFrame_: frame with columns ``text`` and ``domain``; row 0 is the
        dummy example and row 1 is the supplied ``text``/``domain`` pair
    """
    # Dummy example first, caller's example second.
    rows = {
        "text": ["hello world", text],
        "domain": [0, domain],
    }
    return pd.DataFrame(rows)


def loading_data(tokenizer, df: pd.DataFrame):
    """Tokenize every text in ``df`` and batch the results into tensors.

    Args:
        tokenizer: callable invoked as ``tokenizer(text)`` for each row. Its
            result is indexed with the keys ``"token_id"`` and ``"mask"``,
            and both values must be tensors that ``torch.cat`` can join
            along dimension 0.
        df (pd.DataFrame): frame with a ``text`` column and a ``domain``
            column. Rows are read in positional order, so any index is
            accepted (not only the default ``RangeIndex``).

    Returns:
        tuple: ``(input_ids, input_masks, input_domains)`` where the first
        two are the per-row ``"token_id"`` / ``"mask"`` tensors concatenated
        along dimension 0 and the third is ``torch.tensor`` of the domain
        column.

    Raises:
        ValueError: if ``df`` has no rows (there is nothing to concatenate).
    """
    if len(df) == 0:
        raise ValueError("loading_data requires a DataFrame with at least one row")

    # Iterate the column values directly: label-based lookups such as
    # texts[i] fail when the frame's index is not 0..n-1.
    tokens = [tokenizer(text) for text in df["text"]]

    # Build each batch tensor once, after all rows have been tokenized,
    # instead of re-concatenating on every loop iteration.
    input_ids = torch.cat([token["token_id"] for token in tokens], dim=0)
    input_masks = torch.cat([token["mask"] for token in tokens], dim=0)

    # list(ndarray) keeps NumPy scalars as elements, matching what
    # element-wise Series indexing produced before.
    input_domains = torch.tensor(list(df["domain"].to_numpy()))

    return input_ids, input_masks, input_domains