import pandas as pd
import streamlit as st
import torch
from PIL import Image
from transformers import (
    AutoImageProcessor,
    AutoTokenizer,
    ViTForImageClassification,
    VitsModel,
)

# Streamlit application title
st.title("Describe Traffic Signs Aloud")

# Traffic sign classification: a ViT fine-tuned on traffic signs, paired with
# the base ViT image processor for resizing and normalization.
model = ViTForImageClassification.from_pretrained("Rae1230/Traffic_Signs_Classification")
processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")

uploaded_file = st.file_uploader("Choose a PNG image...", type="png", accept_multiple_files=False)

if uploaded_file is not None:
    img = Image.open(uploaded_file)
    st.image(img, caption="Uploaded Image.", use_column_width=True)

    # Preprocess the image and run the classifier; the predicted class is the
    # index of the highest logit.
    inputs = processor(img.convert("RGB"), return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    img_class_idx = logits.argmax(-1).item()

    # Map the predicted class index to a human-readable label via labels.csv,
    # which holds ClassId -> Name pairs.
    df = pd.read_csv("labels.csv")
    text_value = df.loc[df["ClassId"] == img_class_idx, "Name"].values[0]

    st.header("Classified traffic sign:")
    st.write(text_value)

    # Speak the traffic sign label: synthesize audio with Meta's MMS English
    # text-to-speech model.
    model2 = VitsModel.from_pretrained("facebook/mms-tts-eng")
    tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
    inputs = tokenizer(text_value, return_tensors="pt")
    with torch.no_grad():
        waveform = model2(**inputs).waveform

    st.header("Spoken description of this traffic sign:")
    st.audio(waveform.numpy(), sample_rate=model2.config.sampling_rate)
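
# A minimal way to run this app locally (a sketch, assuming this file is saved
# as app.py and labels.csv, the ClassId -> Name mapping, sits alongside it):
#
#   pip install streamlit transformers torch pillow pandas
#   streamlit run app.py
#
# Both models are downloaded from the Hugging Face Hub on first use, so the
# initial launch needs network access.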