# TrafficSigns_Classification / Traffic_Signs_Classification.py
import streamlit as st
import torch
import pandas as pd
from PIL import Image
from transformers import (
    AutoImageProcessor,
    AutoTokenizer,
    ViTForImageClassification,
    VitsModel,
)
# Streamlit application title
st.title("Speak the Traffic Sign")
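# App flow: upload a PNG of a traffic sign, classify it with a
# fine-tuned ViT model, map the predicted class id to its name via
# labels.csv, then synthesize the name as speech with MMS-TTS.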
# Traffic sign classification: a ViT fine-tuned on traffic signs, paired
# with the image processor of its google/vit-base-patch16-224 base model.
model = ViTForImageClassification.from_pretrained('Rae1230/Traffic_Signs_Classification')
processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
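# Note: both models here are reloaded on every Streamlit rerun. A cached
# loader would keep them in memory across reruns (a minimal sketch,
# assuming Streamlit >= 1.18 for st.cache_resource):
#
#     @st.cache_resource
#     def load_classifier():
#         return ViTForImageClassification.from_pretrained(
#             "Rae1230/Traffic_Signs_Classification")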
uploaded_file = st.file_uploader("Choose a PNG image...", type="png", accept_multiple_files=False)
if uploaded_file is not None:
    # Show the uploaded image.
    img = Image.open(uploaded_file)
    st.image(img, caption='Uploaded Image.', use_column_width=True)

    # Preprocess and classify; the predicted class is the argmax logit.
    inputs = processor(img.convert('RGB'), return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    img_class_idx = logits.argmax(-1).item()

    # Map the class id to its human-readable name; labels.csv is
    # expected to have 'ClassId' and 'Name' columns.
    df = pd.read_csv("labels.csv")
    text_value = df.loc[df['ClassId'] == img_class_idx, 'Name'].values[0]
    st.write("Predicted class:", text_value)
    # Speak the traffic sign name with Facebook's MMS English TTS model.
    model2 = VitsModel.from_pretrained("facebook/mms-tts-eng")
    tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
    inputs = tokenizer(text_value, return_tensors="pt")
    with torch.no_grad():
        output = model2(**inputs).waveform
    # The waveform has shape (1, samples); flatten it to 1-D for st.audio.
    st.audio(output.squeeze().numpy(), sample_rate=model2.config.sampling_rate)
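# Usage: launch the app from the repo root, so labels.csv is found in
# the working directory:
#
#     streamlit run Traffic_Signs_Classification.py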