import json
import re

import spacy
from transformers import (
    AutoModel,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)
import streamlit as st
import whisper

# Third-person pronoun matchers used by ``preprocess_text``; all are
# whole-word and case-insensitive.
he_regex = re.compile(r'\b(he|him|himself)\b', flags=re.IGNORECASE)
his_regex = re.compile(r'\b(his)\b', flags=re.IGNORECASE)
she_regex = re.compile(r'\b(she|herself)\b', flags=re.IGNORECASE)
her_regex = re.compile(r'\b(her)\b', flags=re.IGNORECASE)


def hide_footer():
    """Push the ``hide_st_style`` markup into the page as raw HTML."""
    # NOTE(review): the payload below is whitespace-only, so this call
    # currently injects nothing visible — presumably the CSS was lost at some
    # point; confirm and restore the intended <style> block.
    hide_st_style = """ """
    st.markdown(hide_st_style, unsafe_allow_html=True)


@st.cache_resource
def get_whisper_model(model_url: str = 'tiny'):
    """Load a Whisper model on the CPU and return it.

    Args:
        model_url: Value handed to ``whisper.load_model`` as the model to load.

    Returns:
        The object returned by ``whisper.load_model``.
    """
    print("--------------------------------------------")
    print("Attempting to load Whisper ...")
    model = whisper.load_model(model_url, device='cpu')
    print("Succesfully loaded Whisper")
    return model


@st.cache_resource
def get_seq2seq_model(model_id):
    """Return ``AutoModelForSeq2SeqLM.from_pretrained(model_id)``."""
    return AutoModelForSeq2SeqLM.from_pretrained(model_id)


@st.cache_resource
def get_causal_model(model_id):
    """Return ``AutoModelForCausalLM.from_pretrained(model_id)``.

    ``trust_remote_code=True`` is passed through, so only call this with
    model ids whose repository code is trusted.
    """
    return AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)


@st.cache_resource
def get_auto_model(model_id):
    """Return ``AutoModel.from_pretrained(model_id)``."""
    return AutoModel.from_pretrained(model_id)


@st.cache_resource
def get_tokenizer(model_id):
    """Return ``AutoTokenizer.from_pretrained(model_id)``."""
    return AutoTokenizer.from_pretrained(model_id)


@st.cache_data
def get_celeb_data(fpath):
    """Parse the JSON file at ``fpath`` and return the decoded object."""
    # JSON is UTF-8 by specification; do not depend on the platform's
    # locale encoding.
    with open(fpath, encoding="utf-8") as json_file:
        return json.load(json_file)


@st.cache_resource
def get_spacy_model(model_id):
    """Return ``spacy.load(model_id)``, loaded once per ``model_id``."""
    return spacy.load(model_id)


def _name_regex(person: str) -> "re.Pattern[str]":
    """Compile a case-sensitive, whole-token matcher for the literal ``person``."""
    # Lookarounds instead of ``\b`` so a name that ends in a non-word
    # character (e.g. "Jr.") can still match before whitespace.
    return re.compile(rf"(?<!\w){re.escape(person)}(?!\w)")


def _possessive_regex(person: str) -> "re.Pattern[str]":
    """Compile a matcher for the possessive form of the literal ``person``.

    Accepts a typographic or straight apostrophe. Names ending in "s" match
    both "James’" and "James’s"; all other names require the trailing "s".
    """
    suffix = "s?" if person.endswith("s") else "s"
    # ``\b`` after a bare apostrophe would demand a following word character
    # and therefore miss "James’ book"; a negative lookahead does not.
    return re.compile(rf"(?<!\w){re.escape(person)}[’']{suffix}(?!\w)")


@st.cache_resource
def preprocess_text(name, gender, text, model_id):
    """Rewrite third-person ``text`` about ``name`` into first person and
    split it into sentences.

    Args:
        name: Full name of the person the text is about. The last
            whitespace-separated token is treated as the surname.
        gender: ``"M"`` rewrites he/him/himself -> "I" and his -> "my";
            ``"F"`` rewrites she/herself -> "I" and her -> "my"; any other
            value leaves pronouns untouched.
        text: The text to rewrite.
        model_id: Identifier passed to ``spacy.load``.

    Returns:
        A ``(spacy_model, texts)`` tuple: the loaded spaCy pipeline and the
        list of stripped sentence strings it produced from the rewritten text.
    """
    if gender == "M":
        text = he_regex.sub("I", text)
        text = his_regex.sub("my", text)
    elif gender == "F":
        text = she_regex.sub("I", text)
        text = her_regex.sub("my", text)

    full_name = name.strip()
    # An empty name would compile to a pattern matching at every position
    # and scatter "I" through the text, so skip name rewriting.
    if full_name:
        last_name = full_name.split()[-1]
        # Possessives go first: otherwise the bare-name rules would consume
        # the name and leave a dangling "’s" behind.
        rewrites = (
            (_possessive_regex(full_name), "my"),
            (_possessive_regex(last_name), "my"),
            (_name_regex(full_name), "I"),
            (_name_regex(last_name), "I"),
        )
        for pattern, replacement in rewrites:
            text = pattern.sub(replacement, text)

    spacy_model = get_spacy_model(model_id)
    texts = [sentence.text.strip() for sentence in spacy_model(text).sents]
    return spacy_model, texts