# Spaces: Sleeping  (appears to be the hosting page's status banner captured with the file; not part of the app)
import streamlit as st | |
from PIL import Image | |
import torch | |
from transformers import AutoImageProcessor | |
import pandas as pd | |
from transformers import ViTForImageClassification | |
from transformers import VitsModel, AutoTokenizer | |
import torch | |
from IPython.display import Audio | |
# Streamlit application: classify an uploaded traffic-sign image, show the
# predicted label, and read the label aloud with a text-to-speech model.
st.title("Speech the Traffic Signs")


@st.cache_resource
def _load_classifier():
    """Return the traffic-sign classifier and its image processor.

    Wrapped in ``st.cache_resource`` because Streamlit re-executes the whole
    script on every widget interaction; without caching both objects would be
    rebuilt from the checkpoint on each rerun.
    """
    classifier = ViTForImageClassification.from_pretrained('Rae1230/Traffic_Signs_Classification')
    image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
    return classifier, image_processor


@st.cache_resource
def _load_tts():
    """Return the text-to-speech model and its tokenizer (cached across reruns)."""
    tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
    tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
    return tts_model, tts_tokenizer


# Traffic sign classification
model, processor = _load_classifier()

uploaded_file = st.file_uploader("Choose a PNG image...", type="png", accept_multiple_files=False)

if uploaded_file is not None:
    img = Image.open(uploaded_file)
    st.image(img, caption='Uploaded Image.', use_column_width=True)

    # The image is converted to RGB before preprocessing, as in the original
    # script (uploaded PNGs may be palette/RGBA/greyscale).
    image_inputs = processor(img.convert('RGB'), return_tensors="pt")

    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(**image_inputs).logits
    img_class_idx = logits.argmax(-1).item()

    # Map the predicted class index to its human-readable name via labels.csv
    # (columns used: 'ClassId' and 'Name').
    with open("labels.csv", "r") as file:
        df = pd.read_csv(file)
    matching_names = df.loc[df['ClassId'] == img_class_idx, 'Name']

    if matching_names.empty:
        # Previously this case surfaced as a bare IndexError from `.values[0]`.
        st.error(f"Predicted class id {img_class_idx} was not found in labels.csv.")
    else:
        text_value = matching_names.values[0]
        st.write("Predicted class:", text_value)

        # Speak the traffic sign name.
        model2, tokenizer = _load_tts()
        text = str(text_value)
        text_inputs = tokenizer(text, return_tensors="pt")
        with torch.no_grad():
            output = model2(**text_inputs).waveform

        st.audio(output.numpy(), sample_rate=model2.config.sampling_rate)