File size: 1,258 Bytes
7e6bce5
 
d02038d
7e6bce5
b2c2c22
 
7e6bce5
b3c0d6f
 
b1bc515
023db76
b1bc515
023db76
 
 
 
 
 
b2c2c22
4f9f819
1c8f137
4f9f819
 
 
 
 
 
 
b2c2c22
4f9f819
 
 
 
7e6bce5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from datasets import load_dataset
import pandas as pd
import numpy as np
import streamlit as st
from transformers import AutoTokenizer
import matplotlib.pyplot as plt

# Page setup: use the wide layout. Kept as the first Streamlit call in the script.
st.set_page_config(layout="wide")

# Sidebar control that picks which split to inspect ('dev' or 'devtest').
subset = st.sidebar.selectbox('subset', ('dev', 'devtest'))

# Load the tokenizer and the FLORES eng/ukr pair, then count tokens per sentence
# for the split picked in the sidebar (`subset`).
#
# The body runs inside `st.echo()`, which prints the enclosed source on the page
# as well as executing it, so the statements below double as on-page
# documentation. Keep explanatory comments out here: anything written inside the
# `with` body would be rendered in the app too.
#
# `eng_num_tokens` / `ukr_num_tokens` hold one value per row: the length of the
# tokenizer's `input_ids` for that row's English / Ukrainian sentence.
# NOTE(review): the tokenizer is called with default arguments, so any special
# tokens it adds by default are presumably included in each count - confirm.
# NOTE(review): this runs at module top level with no Streamlit caching, so it
# presumably re-executes on every rerun of the script - consider
# `st.cache_resource` / `st.cache_data` if load time becomes noticeable.
with st.echo():
    tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
    flores = load_dataset("facebook/flores", "eng_Latn-ukr_Cyrl")
    dataset = flores[subset]
    eng_num_tokens = dataset.map(lambda x: {'num_tokens':len(tokenizer(x['sentence_eng_Latn'])['input_ids'])})['num_tokens']
    ukr_num_tokens = dataset.map(lambda x: {'num_tokens':len(tokenizer(x['sentence_ukr_Cyrl'])['input_ids'])})['num_tokens']

# Sidebar: two stacked histograms of per-sentence token counts (English on top,
# Ukrainian below), each titled with the total token count of the chosen split.
with st.sidebar:
    # 2 rows x 1 column, so the axes are named by content rather than position.
    fig, (ax_eng, ax_ukr) = plt.subplots(2, 1, figsize=(3, 6))
    ax_eng.hist(eng_num_tokens)
    ax_eng.set_title(f'eng mistral tokens ({np.sum(eng_num_tokens)} total)')
    ax_ukr.hist(ukr_num_tokens)
    ax_ukr.set_title(f'ukr mistral tokens ({np.sum(ukr_num_tokens)} total)')
    st.pyplot(fig)
    # pyplot keeps every figure it creates registered until it is explicitly
    # closed, and this script body runs again on each Streamlit rerun. Close the
    # figure once it has been rendered so figures do not pile up in memory.
    plt.close(fig)

# Free-text filter shown above the table; an empty string means "show everything".
keyword = st.text_input("Filter by text", value="")


def _mentions_keyword(row):
    """Return True when `keyword` occurs in the English or the Ukrainian sentence.

    The match is a plain, case-sensitive substring test; the English column is
    checked first and the Ukrainian one only if that misses.
    """
    return any(
        keyword in row[column]
        for column in ('sentence_eng_Latn', 'sentence_ukr_Cyrl')
    )


# Pick the rows to display, then render them through a single dataframe call.
visible_rows = dataset.filter(_mentions_keyword) if keyword else dataset
st.dataframe(pd.DataFrame(visible_rows))