
HeConE: a Hebrew conclusion extraction model based on token classification

How to use

from transformers import RobertaTokenizerFast, AutoModelForTokenClassification
from datasets import load_dataset

def split_into_windows(examples):
    # Group a batch of `window_size` sentences (and their labels) into a single window example.
    return {'sentences': [examples['sentence']], 'labels': [examples['label']]}

def concatenate_dict_value(dict_obj):
    # Flatten the per-sentence token lists of a window into one sequence,
    # keeping at most 512 tokens in total.
    concatenated_dict = {}
    for key, value in dict_obj.items():
        flattened_list = []
        for sublist in value:
            if len(flattened_list) + len(sublist) <= 512:
                for item in sublist:
                    flattened_list.append(item)
            else:
                print("Not all sentences were processed due to length")
                break
        concatenated_dict[key] = flattened_list
    return concatenated_dict

def tokenize_and_align_labels(examples):
    # Tokenize each sentence of the window separately, then concatenate the results.
    tokenized_inputs = tokenizer(examples["sentences"], truncation=True, max_length=512)
    tokenized_inp_concat = concatenate_dict_value(tokenized_inputs)
    tokenized_inputs["input_ids"] = tokenized_inp_concat['input_ids']
    tokenized_inputs["attention_mask"] = tokenized_inp_concat['attention_mask']
    # Assign each sentence-level label to that sentence's closing </s> token (token id 2);
    # all other tokens are masked with -100 so they are ignored by the loss.
    token_ids = tokenized_inputs["input_ids"]
    labels = []
    count = 0
    for token_id in token_ids:
        if token_id == 2:
            labels.append(examples["labels"][count])
            count = count + 1
        else:
            labels.append(-100)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

model = AutoModelForTokenClassification.from_pretrained('HeTree/HeConE')
tokenizer = RobertaTokenizerFast.from_pretrained('HeTree/HeConE')
raw_dataset = load_dataset('HeTree/MevakerConcSen')
window_size = 5
raw_dataset_window = raw_dataset.map(split_into_windows, batched=True, batch_size=window_size, remove_columns=raw_dataset['train'].column_names)
tokenized_dataset = raw_dataset_window.map(tokenize_and_align_labels, batched=False)
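
The snippet above stops after preprocessing. A minimal inference sketch is shown below; it reuses `model`, `tokenizer`, and `tokenized_dataset` from the snippet, and the idea of reading one prediction per sentence from the logits at each </s> token (id 2) is an assumption consistent with the label alignment above, not part of the official example.

import torch

# Minimal inference sketch (assumption, not from the original card):
# run the model on one preprocessed window and take the argmax of the logits
# at every </s> token, yielding one predicted label per sentence in the window.
example = tokenized_dataset['train'][0]
inputs = {
    'input_ids': torch.tensor([example['input_ids']]),
    'attention_mask': torch.tensor([example['attention_mask']]),
}
with torch.no_grad():
    logits = model(**inputs).logits[0]          # shape: (sequence_length, num_labels)

predictions = []
for position, token_id in enumerate(example['input_ids']):
    if token_id == 2:                           # </s> closes a sentence
        predictions.append(logits[position].argmax(-1).item())
print(predictions)                              # one predicted label per sentence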

Citing

If you use HeConE in your research, please cite Mevaker: Conclusion Extraction and Allocation Resources for the Hebrew Language.

@article{shalumov2024mevaker,
      title={Mevaker: Conclusion Extraction and Allocation Resources for the Hebrew Language}, 
      author={Vitaly Shalumov and Harel Haskey and Yuval Solaz},
      year={2024},
      eprint={2403.09719},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}