import librosa
import numpy as np
import torch
from collections import Counter
import nltk
import string
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Tokenizer and POS-tagger resources (both old and new NLTK resource names are
# downloaded so the script works across NLTK versions)
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('averaged_perceptron_tagger')

# Run the ASR model on GPU in half precision when available, otherwise on CPU in float32
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

def get_pitch_list(y, sr):
    # hop_length sets the spacing between analysis frames: sr / 30 gives roughly
    # 30 pitch estimates per second of audio
    hop_length = int(sr / 30)
    # Extract pitch candidates (F0) and their magnitudes using librosa's piptrack
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr, hop_length=hop_length)
    # For each frame, keep the pitch of the bin with the strongest magnitude
    pitch_frequencies = []
    for t in range(pitches.shape[1]):
        index = magnitudes[:, t].argmax()  # index of the maximum magnitude in this frame
        pitch = pitches[index, t]
        pitch_frequencies.append(pitch)
    # One pitch value per frame; 0.0 where piptrack found no pitch
    pitch_frequencies = np.array(pitch_frequencies)
    print("shape: ", pitch_frequencies.shape)
    return pitch_frequencies
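
# Quick sanity check for get_pitch_list (a sketch, not part of the pipeline):
# a pure 440 Hz tone should produce frames whose strongest pitch sits near 440 Hz.
# The tone parameters below are illustrative assumptions.
def _demo_pitch_extraction():
    sr = 16000
    t = np.linspace(0, 1.0, sr, endpoint=False)
    tone = 0.5 * np.sin(2 * np.pi * 440.0 * t)  # 1 s sine at 440 Hz
    pitch_track = get_pitch_list(tone, sr)      # ~30 frames for 1 s of audio
    voiced = pitch_track[pitch_track > 0]       # drop frames with no detected pitch
    print("median detected pitch (Hz):", np.median(voiced))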

def extract_audio_features(audio_path, asrmodel, asrproc, sentipipe, duration, wordcloud_path):
    # Load audio at 16 kHz, the sampling rate Whisper-style ASR models expect.
    # Note: the `duration` argument is currently unused; the length is recomputed below.
    y, sr = librosa.load(audio_path, sr=16000)
    inputs = asrproc(y, sampling_rate=sr, return_tensors="pt").input_features
    inputs = inputs.to(device, dtype=torch_dtype)
    with torch.no_grad():
        generated_ids = asrmodel.generate(inputs)
    transcript = asrproc.batch_decode(generated_ids, skip_special_tokens=True)[0]
    # Sound intensity (mean RMS energy)
    rms = librosa.feature.rms(y=y)
    sound_intensity = np.mean(rms)
    # Pitch list (one value per ~33 ms frame)
    pitches = get_pitch_list(y, sr)
    # Fundamental frequency (F0) via pYIN, averaged over voiced frames
    f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
    fundamental_frequency = np.nanmean(f0)
    # Spectral energy (mean per-frame energy of the STFT)
    S = np.abs(librosa.stft(y))
    spectral_energy = np.mean(np.sum(S ** 2, axis=0))
    # Spectral centroid
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
    avg_spectral_centroid = np.mean(spectral_centroid)
    # Zero-crossing rate
    zcr = librosa.feature.zero_crossing_rate(y)
    zero_crossing_rate = np.mean(zcr)
# Pause detection | |
silence_threshold = -40 | |
silent_intervals = librosa.effects.split(y, top_db=silence_threshold) | |
pause_duration = 0 | |
for start, end in silent_intervals: | |
pause_duration += (end - start) / sr | |
total_duration = librosa.get_duration(y=y, sr=sr) | |
pause_rate = (pause_duration / total_duration) * 60 # Convert to pauses per minute | |
    # Transcript processing: tokenize, lowercase, and drop punctuation tokens
    words = nltk.word_tokenize(transcript)
    words = [word.lower() for word in words if word not in string.punctuation]
    num_words = len(words)
    unique_words = len(set(words))
    word_frequencies = Counter(words)
    # Speaking-rate metrics, normalized per minute
    duration_minutes = total_duration / 60
    avg_words_per_minute = num_words / duration_minutes
    avg_unique_words_per_minute = unique_words / duration_minutes
    # Filler word detection. Multi-word fillers ('you know', 'i mean', ...) cannot
    # be looked up in the single-token Counter, so they are counted as substrings
    # of the lowercased transcript instead.
    filler_words = [
        'uh', 'um', 'like', 'you know', 'ah', 'er', 'hmm', 'well', 'so',
        'i mean', 'okay', 'right', 'actually', 'basically', 'you see',
        'sort of', 'kind of', 'yeah', 'literally', 'just', 'i guess',
        'totally', 'honestly', 'seriously', 'alright'
    ]
    transcript_lower = transcript.lower()
    filler_word_count = sum(
        word_frequencies.get(filler, 0) if ' ' not in filler
        else transcript_lower.count(filler)
        for filler in filler_words
    )
    filler_words_per_minute = filler_word_count / duration_minutes
    # POS tagging over the lowercased, punctuation-stripped tokens
    pos_tags = nltk.pos_tag(words)
    nouns = [word for word, pos in pos_tags if pos.startswith('NN')]
    adjectives = [word for word, pos in pos_tags if pos.startswith('JJ')]
    verbs = [word for word, pos in pos_tags if pos.startswith('VB')]
    # Sentiment analysis. The label mapping assumes a LABEL_0/1/2-style classifier
    # (e.g. cardiffnlp/twitter-roberta-base-sentiment); .get() keeps the raw label
    # if the model already returns a readable one.
    sentiment = sentipipe(transcript)
    sentiment_mapping = {
        "LABEL_0": "Negative",
        "LABEL_1": "Neutral",
        "LABEL_2": "Positive"
    }
    sentiment[0]['label'] = sentiment_mapping.get(sentiment[0]['label'], sentiment[0]['label'])
    # Generate a word cloud from the token frequencies and save it as a PNG
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_frequencies)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.savefig(wordcloud_path, format='png')
    plt.close()
print("Nouns: ", nouns) | |
print("Adjectives: ", adjectives) | |
print("Verbs: ", verbs) | |
print("Sentiment: ", sentiment) | |
return { | |
"transcript": transcript, | |
"sentiment": sentiment, | |
"sound_intensity": float(sound_intensity), | |
"fundamental_frequency": float(fundamental_frequency), | |
"spectral_energy": float(spectral_energy), | |
"spectral_centroid": float(avg_spectral_centroid), | |
"zero_crossing_rate": float(zero_crossing_rate), | |
"avg_words_per_minute": float(avg_words_per_minute), | |
"avg_unique_words_per_minute": float(avg_unique_words_per_minute), | |
"unique_word_count": int(unique_words), | |
"filler_words_per_minute": float(filler_words_per_minute), | |
"noun_count": len(nouns), | |
"adjective_count": len(adjectives), | |
"verb_count": len(verbs), | |
"pause_rate": float(pause_rate) | |
},pitches | |
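
# Example wiring (a minimal sketch, not part of the original app). The checkpoint
# names below are assumptions: any Whisper checkpoint and any 3-class sentiment
# model that emits LABEL_0/1/2 should slot in the same way.
if __name__ == "__main__":
    from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline

    asrproc = WhisperProcessor.from_pretrained("openai/whisper-small")
    asrmodel = WhisperForConditionalGeneration.from_pretrained(
        "openai/whisper-small", torch_dtype=torch_dtype
    ).to(device)
    sentipipe = pipeline(
        "text-classification",
        model="cardiffnlp/twitter-roberta-base-sentiment",
        device=0 if torch.cuda.is_available() else -1,
    )

    # "sample.wav" and "wordcloud.png" are placeholder paths
    features, pitch_track = extract_audio_features(
        "sample.wav", asrmodel, asrproc, sentipipe,
        duration=None, wordcloud_path="wordcloud.png",
    )
    print(features["transcript"])
    print("words/minute:", features["avg_words_per_minute"])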