import librosa
import numpy as np
import torch
from collections import Counter
import nltk
import string
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# NLTK data used by nltk.word_tokenize and nltk.pos_tag further down in this
# module. nltk.download is a no-op when a resource is already present locally.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('averaged_perceptron_tagger')

# Run the ASR model on the first CUDA device when one is available; half
# precision is only selected together with CUDA, CPU falls back to float32.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
def get_pitch_list(y, sr):
    """Return the dominant pitch of every analysis frame of an audio signal.

    The signal is analysed with ``librosa.piptrack`` at roughly 30 frames per
    second; for each frame the pitch candidate with the largest magnitude is
    selected.

    Args:
        y: Audio time series, as accepted by ``librosa.piptrack``.
        sr: Sampling rate of ``y`` in Hz.

    Returns:
        A 1-D NumPy array with one pitch value (Hz) per frame. A frame in
        which piptrack reports no pitch at the selected bin yields 0.
    """
    # hop_length determines how far apart the frames are (about 30 per second).
    hop_length = int(sr / 30)

    # Extract pitch candidates and their magnitudes per frequency bin / frame.
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr, hop_length=hop_length)

    # For every frame pick the bin with the maximum magnitude and read the
    # pitch stored at that bin. (The original loop computed this value but
    # never stored it, so the result was always empty.)
    strongest_bins = magnitudes.argmax(axis=0)
    frame_indices = np.arange(pitches.shape[1])
    pitch_frequencies = np.asarray(pitches[strongest_bins, frame_indices])

    print("shape : ", pitch_frequencies.shape)
    return pitch_frequencies
# Filler expressions counted by extract_audio_features. Each entry is stored
# as a tuple of lower-case tokens so that multi-word fillers ("you know",
# "I mean") can be matched against the lower-cased token stream.
_FILLER_PHRASES = tuple(
    tuple(phrase.lower().split())
    for phrase in (
        'uh', 'um', 'like', 'you know', 'ah', 'er', 'hmm', 'well', 'so',
        'I mean', 'okay', 'right', 'actually', 'basically', 'you see',
        'sort of', 'kind of', 'yeah', 'literally', 'just', 'I guess',
        'totally', 'honestly', 'seriously', 'alright',
    )
)

# Raw pipeline labels -> human-readable sentiment names.
_SENTIMENT_LABELS = {
    "LABEL_0": "Negative",
    "LABEL_1": "Neutral",
    "LABEL_2": "Positive",
}


def _count_filler_phrases(words):
    """Count occurrences of every filler phrase in a list of lower-case tokens."""
    total = 0
    for phrase in _FILLER_PHRASES:
        size = len(phrase)
        for start in range(len(words) - size + 1):
            if tuple(words[start:start + size]) == phrase:
                total += 1
    return total


def _per_minute(count, minutes):
    """Return count / minutes, or 0.0 when there is no audio to normalise by."""
    return count / minutes if minutes > 0 else 0.0


def extract_audio_features(audio_path, asrmodel, asrproc, sentipipe, duration, wordcloud_path):
    """Transcribe an audio file and compute acoustic and lexical features.

    Args:
        audio_path: Path of the audio file; it is loaded resampled to 16 kHz.
        asrmodel: Speech recognition model exposing ``generate(inputs)``.
        asrproc: Processor that is callable on the waveform (returning an
            object with ``input_features``) and exposes ``batch_decode``.
            NOTE(review): presumably a Hugging Face speech-to-text
            processor/model pair - confirm against the caller.
        sentipipe: Callable returning a list of dicts with a ``'label'`` key
            for a given text.
        duration: Unused; kept for backward compatibility with callers.
        wordcloud_path: Destination of the PNG word cloud image. Nothing is
            written when the transcript contains no words.

    Returns:
        A dict with the transcript, the sentiment result (label mapped to
        Negative/Neutral/Positive when it is a known ``LABEL_n``), averaged
        acoustic measures, per-minute word statistics, POS counts and
        ``pause_rate`` (seconds of silence per minute of audio).
    """
    y, sr = librosa.load(audio_path, sr=16000)

    # --- Speech recognition ---
    inputs = asrproc(y, sampling_rate=sr, return_tensors="pt").input_features
    inputs = inputs.to(device, dtype=torch_dtype)
    with torch.no_grad():
        generated_ids = asrmodel.generate(inputs)
    transcript = asrproc.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # --- Acoustic features ---
    # Sound intensity (RMS)
    sound_intensity = np.mean(librosa.feature.rms(y=y))

    # Fundamental frequency (F0). sr must be passed explicitly: pyin assumes
    # 22050 Hz by default, which would mis-scale F0 for 16 kHz audio.
    f0, _voiced_flag, _voiced_probs = librosa.pyin(
        y,
        fmin=librosa.note_to_hz('C2'),
        fmax=librosa.note_to_hz('C7'),
        sr=sr,
    )
    # Unvoiced frames are NaN; with no voiced frame at all report NaN directly
    # instead of triggering nanmean's "mean of empty slice" warning.
    if np.any(np.isfinite(f0)):
        fundamental_frequency = np.nanmean(f0)
    else:
        fundamental_frequency = float("nan")

    # Spectral energy (based on STFT)
    magnitude_spectrum = np.abs(librosa.stft(y))
    spectral_energy = np.mean(np.sum(magnitude_spectrum ** 2, axis=0))

    # Spectral centroid
    avg_spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))

    # Zero-crossing rate
    zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(y))

    # --- Pause detection ---
    # librosa.effects.split returns the NON-silent intervals and expects a
    # positive top_db (dB below the peak); silence is what remains.
    total_duration = librosa.get_duration(y=y, sr=sr)
    non_silent_intervals = librosa.effects.split(y, top_db=40)
    non_silent_samples = np.sum(non_silent_intervals[:, 1] - non_silent_intervals[:, 0])
    pause_duration = max(total_duration - non_silent_samples / sr, 0.0)
    # Seconds of silence per minute of audio.
    pause_rate = (pause_duration / total_duration) * 60 if total_duration > 0 else 0.0

    # --- Transcript processing ---
    punctuation = set(string.punctuation)
    # Drop tokens made up solely of punctuation (",", "...", "''", ...).
    words = [
        token.lower()
        for token in nltk.word_tokenize(transcript)
        if not all(char in punctuation for char in token)
    ]
    num_words = len(words)
    unique_words = len(set(words))
    word_frequencies = Counter(words)

    duration_minutes = total_duration / 60
    avg_words_per_minute = _per_minute(num_words, duration_minutes)
    avg_unique_words_per_minute = _per_minute(unique_words, duration_minutes)

    # Filler word detection
    filler_word_count = _count_filler_phrases(words)
    filler_words_per_minute = _per_minute(filler_word_count, duration_minutes)

    # POS tagging
    pos_tags = nltk.pos_tag(words)
    nouns = [word for word, pos in pos_tags if pos.startswith('NN')]
    adjectives = [word for word, pos in pos_tags if pos.startswith('JJ')]
    verbs = [word for word, pos in pos_tags if pos.startswith('VB')]

    # --- Sentiment analysis ---
    sentiment = sentipipe(transcript)
    raw_label = sentiment[0]['label']
    # Labels outside the mapping are passed through unchanged.
    sentiment[0]['label'] = _SENTIMENT_LABELS.get(raw_label, raw_label)

    # --- Word cloud ---
    # WordCloud raises on an empty frequency table, so skip the image then.
    if word_frequencies:
        wordcloud = WordCloud(
            width=800, height=400, background_color='white'
        ).generate_from_frequencies(word_frequencies)
        figure = plt.figure(figsize=(10, 5))
        try:
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.savefig(wordcloud_path, format='png')
        finally:
            # Release the figure so repeated calls do not accumulate memory.
            plt.close(figure)

    print("Nouns: ", nouns)
    print("Adjectives: ", adjectives)
    print("Verbs: ", verbs)
    print("Sentiment: ", sentiment)

    return {
        "transcript": transcript,
        "sentiment": sentiment,
        "sound_intensity": float(sound_intensity),
        "fundamental_frequency": float(fundamental_frequency),
        "spectral_energy": float(spectral_energy),
        "spectral_centroid": float(avg_spectral_centroid),
        "zero_crossing_rate": float(zero_crossing_rate),
        "avg_words_per_minute": float(avg_words_per_minute),
        "avg_unique_words_per_minute": float(avg_unique_words_per_minute),
        "unique_word_count": int(unique_words),
        "filler_words_per_minute": float(filler_words_per_minute),
        "noun_count": len(nouns),
        "adjective_count": len(adjectives),
        "verb_count": len(verbs),
        "pause_rate": float(pause_rate),
    }