Instructions to use Sigurdur/ice-roberta with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use Sigurdur/ice-roberta with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("fill-mask", model="Sigurdur/ice-roberta")

# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("Sigurdur/ice-roberta")
model = AutoModelForMaskedLM.from_pretrained("Sigurdur/ice-roberta")
```
- Notebooks
- Google Colab
- Kaggle
| #!/usr/bin/python3 | |
| import xml.etree.ElementTree as ET | |
| from os import listdir, walk, remove | |
| from os.path import isfile, join | |
| import sys | |
| from glob import glob | |
| import urllib.request | |
def download_data(urls=None):
    """Download the corpus files and report where each one was stored.

    Args:
        urls: Optional iterable of URLs to fetch. When omitted, the three
            repository links hard-coded below are used.

    Returns:
        A list with one local file path per URL, in the order the URLs
        were given.
    """
    if urls is None:
        urls = [
            "http://hdl.handle.net/20.500.12537/192",
            "https://repository.clarin.is/repository/xmlui/handle/20.500.12537/32/allzip",
            "https://repository.clarin.is/repository/xmlui/bitstream/handle/20.500.12537/96/icesum.json?sequence=1&isAllowed=y",
        ]
    local_paths = []
    for url in urls:
        # No target filename is passed, so urlretrieve() picks the storage
        # location itself and reports it as the first item of its return
        # value. Keep that path; without it the download cannot be found.
        local_path, _headers = urllib.request.urlretrieve(url)
        local_paths.append(local_path)
    return local_paths
def get_contents(path='./{http://www.tei-c.org/ns/1.0}text/*/*/*', root=None):
    """Collect the text of every element matched by *path* under *root*.

    Args:
        path: ElementTree path expression handed to ``root.findall``. The
            default selects the elements three levels below the
            TEI-namespaced ``text`` child of *root*.
        root: Element to search. Required, despite the ``None`` default.

    Returns:
        A single string in which each matched element contributes all of
        its nested text followed by a newline. Empty when nothing matches.

    Raises:
        ValueError: If *root* is ``None``.
    """
    if root is None:
        raise ValueError('get_contents() needs an XML element passed as root')
    # Build the result in one pass instead of repeated string concatenation.
    return ''.join(
        ''.join(element.itertext()) + '\n' for element in root.findall(path)
    )
def save(path, data):
    """Write *data* to *path* as UTF-8 text, replacing any existing file.

    Args:
        path: Destination file path. A missing parent directory is created.
        data: The string to write.
    """
    # Function-scope imports: these names are not in the module's import
    # block, and importing here keeps the helper self-contained.
    from os import makedirs
    from os.path import dirname

    parent = dirname(path)
    if parent:
        makedirs(parent, exist_ok=True)
    # The context manager closes the handle even if write() raises.
    with open(path, 'w', encoding='utf8') as f:
        f.write(data)
def delete_contents_of_dir(path='./data/train/*'):
    """Remove every path matched by the glob pattern *path*.

    Args:
        path: Glob pattern; each match is passed to ``os.remove``.
    """
    for matched in glob(path):
        remove(matched)
def path_to_files(path='./CC_BY/IGC-News1-21.05.TEI/frettabladid_is'):
    """List every regular file found under *path*, at any depth.

    Args:
        path: Directory to walk.

    Returns:
        A list of ``'<directory>/<name>'`` strings, one per entry that
        ``isfile`` accepts, in directory-walk order.
    """
    collected = []
    for folder, _subdirs, _names in walk(path):
        # Re-list the directory and keep only entries that are real files.
        collected.extend(
            f'{folder}/{entry}'
            for entry in listdir(folder)
            if isfile(join(folder, entry))
        )
    return collected
def convert_n_format(paths):
    """Convert XML files to plain text, split 80/20 into train and test.

    Each file in *paths* is parsed, its text is extracted with
    ``get_contents``, and the result is written with ``save``. The first
    80% of the list goes to ``./data/train/<n>_news.txt``; the remainder
    goes to ``./data/test/<k>_news.txt``, with ``k`` counting from 0.

    Neither output directory is emptied first, so files left over from an
    earlier run with more inputs may remain.

    Args:
        paths: Sequence of paths to the XML files to convert.
    """
    # Number of files that belong to the training split.
    train = int(len(paths) * .8)
    label = 'news'
    for n, path in enumerate(paths):
        root = ET.parse(path).getroot()
        contents = get_contents(root=root)
        # Compare against train, not train - 1: otherwise the last training
        # file lands in the test directory under the index -1.
        if n < train:
            output_path = f'./data/train/{n}_{label}.txt'
        else:
            output_path = f'./data/test/{n - train}_{label}.txt'
        save(output_path, contents)
# Script body: runs the whole preprocessing pipeline when the file executes.
print("Beginning data preproccessing")

# Gather the path of every source file under the news corpus directory.
print("Collecting paths")
all_paths = path_to_files(path='./CC_BY/IGC-News1-21.05.TEI/frettabladid_is')
print("Found ", len(all_paths))

# Turn each XML file into a plain-text file in the train or test directory.
print("converting files...")
convert_n_format(paths=all_paths)
print("\nDone!")