Spaces:

menikev
/

TestApp

Running

File size: 1,290 Bytes

d2ed505

import pandas as pd
import torch
def preparing_data(text: str, domain: int):
    """
    Wrap a single user input into a two-row DataFrame.

    Args:
        text (_str_): input text from the user
        domain (_int_): output domain from domain identification pipeline

    Returns:
        _DataFrame_: two rows with columns "text" and "domain"; row 0 is a
        fixed placeholder ('hello world', 0) and row 1 is the given input
    """
    # The model can't do inference with only one example, so a dummy
    # example is always placed ahead of the real one.
    placeholder_text, placeholder_domain = "hello world", 0

    return pd.DataFrame(
        {
            "text": [placeholder_text, text],
            "domain": [placeholder_domain, domain],
        }
    )


def loading_data(tokenizer, df: pd.DataFrame):
    """
    Tokenize every row of ``df`` and pack the results into batch tensors.

    Args:
        tokenizer: callable invoked as ``tokenizer(text)``; its result is
            indexed with the keys "token_id" and "mask", and both values
            must be tensors that ``torch.cat`` can join along dim 0
        df (_DataFrame_): frame with a "text" column and a "domain" column

    Returns:
        _tuple_: ``(input_ids, input_masks, input_domains)`` where the first
        two are the per-row tokenizer outputs concatenated along dim 0 and
        the third is a tensor built from the "domain" column, in row order

    Raises:
        KeyError: if ``df`` lacks a "text" or "domain" column
        ValueError: if ``df`` has no rows
    """
    texts = df["text"]
    domains = df["domain"]

    # torch.cat cannot join an empty list, so fail early with a clear message
    # instead of reaching the return statement with nothing to return.
    if df.empty:
        raise ValueError("loading_data requires a DataFrame with at least one row")

    ids = []
    masks = []
    domain_list = []

    # Access rows by position (.iloc) rather than by label so that frames
    # whose index is not 0..n-1 (e.g. after filtering) are handled correctly.
    for position in range(len(df)):
        token = tokenizer(texts.iloc[position])
        ids.append(token["token_id"])
        masks.append(token["mask"])
        domain_list.append(domains.iloc[position])

    # Assemble the batch tensors once, after all rows have been tokenized.
    input_ids = torch.cat(ids, dim=0)
    input_masks = torch.cat(masks, dim=0)
    input_domains = torch.tensor(domain_list)

    return input_ids, input_masks, input_domains