| import streamlit as st |
| from transformers import pipeline |
| import os |
|
|
| |
# Must run before any other st.* call (Streamlit requirement).
st.set_page_config(page_title="Kriolu AI Hub", layout="wide")
# Hugging Face access token for gated/private repos.
# NOTE(review): the env var is literally named "token" — confirm the deployment
# secret uses this exact name (HF Spaces convention is usually HF_TOKEN).
token = os.environ.get("token")
|
|
| |
@st.cache_resource
def load_pipeline(task, model_path, **kwargs):
    """Build a transformers pipeline, memoized across reruns by st.cache_resource.

    The same repo path supplies both model weights and tokenizer; the
    module-level Hugging Face ``token`` is forwarded for gated/private repos.
    Extra keyword arguments are passed straight through to ``pipeline``.
    """
    return pipeline(
        task,
        model=model_path,
        tokenizer=model_path,
        token=token,
        **kwargs,
    )
|
|
| |
def instantiate_gpt2(model_name, max_length_, num_return_sequences, text):
    """Sample completions for *text* from a decoder model under Iscte-Sintra/.

    Returns the raw pipeline output: a list of dicts with 'generated_text'.
    """
    generator = load_pipeline("text-generation", f'Iscte-Sintra/{model_name}')

    # Nucleus + top-k sampling; max_length_ counts newly generated tokens only.
    sampling_args = {
        "max_new_tokens": max_length_,
        "num_return_sequences": num_return_sequences,
        "do_sample": True,
        "top_p": 0.95,
        "top_k": 50,
    }
    return generator(text, **sampling_args)
|
|
| |
def build_chatbot_page(model_name):
    """Streamlit chat page: stores the dialogue in session_state and generates replies.

    Loads the cached text-generation pipeline for Iscte-Sintra/<model_name>,
    flattens the running conversation with the tokenizer's chat template
    (installing a minimal ChatML-style template when the model ships none),
    and appends the assistant's reply back into the history.
    """
    st.title(f"🤖 {model_name}: Chatbot")

    # Conversation history must survive Streamlit reruns, so it lives in session_state.
    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Replay the stored conversation on every rerun.
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    if prompt := st.chat_input("Diz algo..."):
        # Record and echo the user's turn immediately.
        st.session_state.messages.append({"role": "user", "content": prompt})

        with st.chat_message("user"):
            st.markdown(prompt)

        with st.chat_message("assistant"):
            with st.spinner("A pensar..."):
                try:

                    pipe = load_pipeline("text-generation", f"Iscte-Sintra/{model_name}")
                    if pipe is None:
                        st.error("Falha ao inicializar o pipeline.")
                        return

                    tokenizer = pipe.tokenizer

                    # Models without a chat template get a minimal ChatML-style one
                    # so apply_chat_template below does not fail.
                    if getattr(tokenizer, "chat_template", None) is None:
                        tokenizer.chat_template = (
                            "{% for message in messages %}"
                            "{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>\\n'}}"
                            "{% endfor %}"
                            "{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}"
                        )

                    # Prepend a fixed system message, then flatten to one prompt string.
                    conversation = [{"role": "system", "content": "You are a helpful assistant."}] + st.session_state.messages

                    prompt_str = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)

                    # Some tokenizer versions return a list here; unwrap to a single string.
                    if isinstance(prompt_str, list):
                        prompt_str = prompt_str[0]

                    # Choose a pad token id: eos if available (unwrapping list-valued
                    # ids), else fall back to 50256 (GPT-2's eos id).
                    pad_id = tokenizer.eos_token_id
                    if isinstance(pad_id, list):
                        pad_id = pad_id[0]
                    if pad_id is None:
                        pad_id = 50256

                    # return_full_text=False keeps only the newly generated tokens.
                    response = pipe(
                        str(prompt_str),
                        max_new_tokens=150,
                        do_sample=True,
                        temperature=0.7,
                        pad_token_id=pad_id,
                        return_full_text=False
                    )

                    # Pipelines may return [dict] or [[dict]]; handle both, and fall
                    # back to str() for anything unexpected.
                    if isinstance(response, list) and len(response) > 0:
                        if isinstance(response[0], list):
                            generated_text = response[0][0].get('generated_text', '')
                        else:
                            generated_text = response[0].get('generated_text', '')
                    else:
                        generated_text = str(response)

                    generated_text = generated_text.strip()

                    st.markdown(generated_text)
                    st.session_state.messages.append({"role": "assistant", "content": generated_text})

                except Exception as e:
                    st.error(f"Erro na geração: {e}")
| |
| |
def instantiate_encoder(model_name, top_k, text):
    """Run fill-mask inference, returning the top_k candidate completions."""
    model_path = f"Iscte-Sintra/{model_name}"
    fill_mask = load_pipeline("fill-mask", model_path)
    return fill_mask(text, top_k=top_k)
|
|
| |
def instantiate_translation_model(model_name, text, src_lg, tgt_lg):
    """Translate *text* with an Iscte-Sintra seq2seq model, dispatching on family.

    Parameters
    ----------
    model_name : repo name under Iscte-Sintra/ (e.g. "nllb-v2.0", "m2m100-v2.1").
    text       : source-language input string.
    src_lg / tgt_lg : language codes in the convention of the model family
                      (see the lang_map in build_translation_page).

    Returns the translated string, or None if the target language token is
    unknown to an m2m100 tokenizer (an error is shown in the UI in that case).
    """
    model_path = f'Iscte-Sintra/{model_name}'

    if "m2m100" in model_name:
        # M2M100 pipelines don't take src/tgt kwargs: the source language is set
        # on the tokenizer and the target language is forced as the first
        # generated token.
        pipe = load_pipeline("translation", model_path)
        pipe.tokenizer.src_lang = src_lg

        tgt_lang_id = pipe.tokenizer.convert_tokens_to_ids(tgt_lg)
        if tgt_lang_id == pipe.tokenizer.unk_token_id:
            st.error(f"Erro: O token {tgt_lg} não foi encontrado no vocabulário do modelo!")
            return None

        result = pipe(
            text,
            forced_bos_token_id=tgt_lang_id
        )
        return result[0]["translation_text"]

    # NLLB and every other family accept the language pair directly as pipeline
    # kwargs (forwarded through load_pipeline's **kwargs). Previously these
    # branches were duplicated and built an *uncached* pipeline on every call,
    # unlike the m2m100 branch; now all families share the cached loader.
    pipe = load_pipeline("translation", model_path, src_lang=src_lg, tgt_lang=tgt_lg)
    return pipe(text)[0]["translation_text"]
|
|
| |
def build_translation_page(model_name):
    """Streamlit page for the translation (encoder-decoder) models."""
    st.title(f"🌍 {model_name}: Tradução")

    # Each model family spells the same two UI languages with different codes.
    # NOTE(review): the m2m100/mbart maps reuse the *English* code for
    # Kabuverdianu — presumably the fine-tune repurposed that token; confirm.
    if "nllb" in model_name:
        lang_map = {"Português": "por_Latn", "Kabuverdianu": "kea_Latn"}
    elif "m2m100" in model_name:
        lang_map = {"Português": "__pt__", "Kabuverdianu": "__en__"}
    else:
        lang_map = {"Português": "pt_XX", "Kabuverdianu": "en_XX"}

    labels = list(lang_map.keys())
    col1, col2 = st.columns(2)
    src_label = col1.selectbox("Língua de Origem", labels)
    tgt_label = col2.selectbox("Língua de Destino", labels)

    text = st.text_area("Texto de entrada", "Katxór sta trás di pórta.", height=100)

    if not st.button("Traduzir"):
        return
    if not text.strip():
        st.warning("Introduza texto!")
        return

    with st.spinner("A traduzir..."):
        try:
            result = instantiate_translation_model(
                model_name,
                text,
                lang_map[src_label],
                lang_map[tgt_label],
            )
            st.success("Resultado:")
            st.write(result)
        except Exception as e:
            st.error(f"Erro: {e}")
|
|
| |
def build_decoder_page(model_name):
    """Streamlit page for free-form text generation with a decoder model."""
    st.title(f"✍️ {model_name}: Geração de Texto")

    # Generation controls live in the sidebar.
    max_length = st.sidebar.slider("Máximo de Tokens", 10, 200, 50)
    num_seq = st.sidebar.number_input("Sequências", 1, 5, 1)
    text = st.text_area("Prompt", "Katxór sta trás di pórta.")

    if not st.button("Gerar"):
        return

    with st.spinner("A processar..."):
        try:
            for sample in instantiate_gpt2(model_name, max_length, num_seq, text):
                st.info(sample["generated_text"])
        except Exception as e:
            st.error(f"Erro: {e}")
|
|
| |
def build_encoder_page(model_name):
    """Streamlit page for masked-token prediction with an encoder model."""
    st.title(f"🔍 {model_name}: Fill-Mask")
    top_k = st.sidebar.slider("Top K sugestões", 1, 5, 3)

    # RoBERTa tokenizers use <mask>; BERT-style tokenizers use [MASK].
    mask_token = "<mask>" if "RoBERTa" in model_name else "[MASK]"
    st.write(f"Use o token **{mask_token}** para a palavra em falta.")

    input_text = st.text_input("Frase", f"Katxór sta trás di {mask_token}.")

    if st.button("Prever"):
        try:
            results = instantiate_encoder(model_name, top_k, input_text)
            for res in results:
                st.write(f"✅ **{res['token_str']}** ({res['score']:.2%})")
        except Exception as e:
            # Previously the real exception was swallowed and only the
            # mask-token hint was shown, hiding load/auth failures. Keep the
            # hint (wrong mask token is the common user error) but surface
            # the underlying error as well.
            st.error(f"Certifique-se que usou o token {mask_token} (erro: {e})")
|
|
| |
# Maps each published model to its architecture family, which selects the UI
# page in the dispatch below. Insertion order is the sidebar display order.
model_dict = {
    "RoBERTa-Kriolu": "Encoder",
    "GPT2_v1.18": "Decoder",
    "LLM-kea-v1.0": "Decoder",
    "Modelo-Traducao-kea-ptpt-v1.0": "Encoder-Decoder",
    "nllb-v1.0": "Encoder-Decoder",
    "m2m100-v1.0": "Encoder-Decoder",
    "mbart-v2.0": "Encoder-Decoder",
    "m2m100-v2.0": "Encoder-Decoder",
    "mbart-v2.1": "Encoder-Decoder",
    "nllb-v2.0": "Encoder-Decoder",
    "m2m100-v2.1": "Encoder-Decoder",  # was listed twice; duplicate literal removed
    "mbart-v2.3": "Encoder-Decoder",
    "nllb-kea-v2.1": "Encoder-Decoder",
    "portuguese-experiment": "Chatbot",
}
|
|
# Sidebar model picker; the chosen model's architecture selects the page.
selected_model = st.sidebar.selectbox("Escolha o Modelo", list(model_dict.keys()))
arch = model_dict[selected_model]

_PAGE_BUILDERS = {
    "Chatbot": build_chatbot_page,
    "Encoder": build_encoder_page,
    "Encoder-Decoder": build_translation_page,
}
# Unknown architectures fall through to the decoder page (matches the
# original if/elif/else chain's final else branch).
_PAGE_BUILDERS.get(arch, build_decoder_page)(selected_model)
|
|