In [9]:
pip install librosa soundfile torch transformers scikit-learn numpy pandas




In [10]:
import numpy as np
import librosa
import soundfile as sf
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from sklearn.ensemble import RandomForestClassifier
import pickle
import warnings
warnings.filterwarnings('ignore')

# ============================================================
# AUDIO FEATURE EXTRACTION
# ============================================================

def extract_audio_features(audio_file, fmin=50.0, fmax=500.0):
    """
    Extract audio features that indicate emotional tone:
    - Pitch (fundamental frequency)
    - Energy/Intensity
    - Zero crossing rate, spectral centroid and MFCC statistics
    - Tempo (beat-tracker estimate, used as a speaking-rate proxy)
    - Duration

    Args:
        audio_file: Passed straight to ``librosa.load`` (resampled to 16 kHz).
        fmin: Lowest frequency in Hz the pitch tracker may report.
        fmax: Highest frequency in Hz the pitch tracker may report.

    Returns:
        dict mapping feature name to a scalar value. The pitch features are
        all 0 when no frame yields a pitch estimate.
    """
    # Load audio
    y, sr = librosa.load(audio_file, sr=16000)

    features = {}

    # 1. PITCH FEATURES (Low pitch often indicates sadness/depression)
    # librosa.piptrack defaults to fmin=150 Hz, which silently drops every
    # pitch below 150 Hz and makes the "pitch_mean < 150" low-tone rules in
    # classify_vocal_tone unreachable. Bound the search to a speech range.
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr, fmin=fmin, fmax=fmax)

    # For every frame keep the pitch of the strongest bin (vectorised form of
    # the per-frame argmax loop), then discard frames with no estimate.
    strongest_bin = magnitudes.argmax(axis=0)
    frame_pitch = pitches[strongest_bin, np.arange(pitches.shape[1])]
    pitch_values = frame_pitch[frame_pitch > 0]

    if pitch_values.size > 0:
        features['pitch_mean'] = np.mean(pitch_values)
        features['pitch_std'] = np.std(pitch_values)
        features['pitch_min'] = np.min(pitch_values)
        features['pitch_max'] = np.max(pitch_values)
    else:
        # Sentinel: no pitch could be estimated in any frame.
        features['pitch_mean'] = 0
        features['pitch_std'] = 0
        features['pitch_min'] = 0
        features['pitch_max'] = 0

    # 2. ENERGY FEATURES (Low energy indicates low mood)
    rms = librosa.feature.rms(y=y)[0]
    features['energy_mean'] = np.mean(rms)
    features['energy_std'] = np.std(rms)
    features['energy_max'] = np.max(rms)

    # 3. ZERO CROSSING RATE (Voice quality indicator)
    zcr = librosa.feature.zero_crossing_rate(y)[0]
    features['zcr_mean'] = np.mean(zcr)
    features['zcr_std'] = np.std(zcr)

    # 4. SPECTRAL FEATURES
    spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
    features['spectral_centroid_mean'] = np.mean(spectral_centroids)
    features['spectral_centroid_std'] = np.std(spectral_centroids)

    # 5. MFCC (Mel-frequency cepstral coefficients) - Voice timbre
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    for i, coefficient in enumerate(mfccs):
        features[f'mfcc_{i}_mean'] = np.mean(coefficient)
        features[f'mfcc_{i}_std'] = np.std(coefficient)

    # 6. TEMPO/SPEAKING RATE
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    # Depending on the librosa version the tempo comes back as a float or as
    # a 1-element array; store a plain float so that comparisons and
    # "{:.1f}" formatting work either way.
    features['tempo'] = float(np.atleast_1d(tempo).ravel()[0])

    # 7. DURATION (Longer pauses might indicate hesitation/sadness)
    features['duration'] = librosa.get_duration(y=y, sr=sr)

    return features

def classify_vocal_tone(audio_features):
    """
    Classify emotional state based on audio features.

    Rule-of-thumb scoring:
    - Depression/Sadness: Low pitch, low energy, slow tempo, monotone
    - Anxiety: Higher pitch variation, moderate-high energy
    - Anger: High energy, high pitch, fast tempo
    - Happiness: Moderate-high pitch, high energy, varied pitch

    Args:
        audio_features: Mapping with at least 'pitch_mean', 'pitch_std',
            'energy_mean' and 'tempo'. A 'pitch_mean' of 0 is read as
            "no pitch detected", so the pitch-based rules are skipped.

    Returns:
        (dominant_tone, confidence, scores): the winning label, its share of
        the total score (0 when no rule fired) and the per-label score dict.
        When no rule fires at all the label is 'neutral'.
    """
    pitch_mean = audio_features['pitch_mean']
    pitch_std = audio_features['pitch_std']
    energy_mean = audio_features['energy_mean']
    tempo = audio_features['tempo']

    # Decision logic
    scores = {
        'sad': 0,
        'anxious': 0,
        'angry': 0,
        'happy': 0,
        'neutral': 0,
    }

    # pitch_mean == 0 is the extractor's sentinel for "no pitch found"; it
    # must not be mistaken for an extremely low, monotone voice.
    has_pitch = pitch_mean > 0
    low_pitch = has_pitch and pitch_mean < 150

    # LOW TONE DETECTION: low pitch + low energy
    if low_pitch and energy_mean < 0.05:
        scores['sad'] += 3
        print(" šŸ”“ ALERT: Low vocal tone detected (possible sadness/depression)")

    # Pitch indicators
    if low_pitch:
        scores['sad'] += 2
    elif pitch_mean > 200:
        scores['happy'] += 1
        scores['anxious'] += 1

    # Energy indicators
    if energy_mean < 0.03:
        scores['sad'] += 2
    elif energy_mean > 0.1:
        scores['happy'] += 1
        scores['angry'] += 1

    # Pitch variation (monotone vs expressive) - meaningless without pitch
    if has_pitch:
        if pitch_std < 20:  # Monotone
            scores['sad'] += 2
            scores['neutral'] += 1
        elif pitch_std > 50:  # Very expressive
            scores['happy'] += 2
            scores['anxious'] += 1

    # Tempo indicators
    if tempo < 80:  # Slow speaking
        scores['sad'] += 1
    elif tempo > 120:  # Fast speaking
        scores['anxious'] += 2
        scores['angry'] += 1

    total = sum(scores.values())
    if total == 0:
        # max() over all-zero scores would return the first key ('sad');
        # with no evidence at all the honest answer is 'neutral'.
        return 'neutral', 0, scores

    # Get dominant emotion (ties resolve to the earliest key in `scores`)
    dominant_tone = max(scores, key=scores.get)
    confidence = scores[dominant_tone] / total

    return dominant_tone, confidence, scores

# ============================================================
# MULTIMODAL DETECTION (Text + Audio)
# ============================================================

def multimodal_tone_detection(text, audio_file, text_model, tokenizer):
    """
    Combine text-based NLP with audio analysis and report any disagreement.

    The text is classified with ``text_model``; the audio file is analysed
    with ``extract_audio_features`` + ``classify_vocal_tone``. When the two
    labels differ the audio label wins. A console report is printed along
    the way.

    Args:
        text: What the user said (transcript).
        audio_file: Audio file handed to ``extract_audio_features``.
        text_model: Sequence-classification model; called with the tokenizer
            output and expected to return an object exposing ``.logits``.
        tokenizer: Tokenizer matching ``text_model``.

    Returns:
        dict with keys 'text_tone', 'audio_tone', 'final_tone',
        'final_confidence', 'conflict_detected', 'risk_level',
        'audio_features' and 'tone_scores'.
    """
    print("\n" + "=" * 60)
    print("MULTIMODAL TONE DETECTION")
    print("=" * 60)

    # 1. TEXT-BASED ANALYSIS
    print("\nšŸ“ TEXT ANALYSIS:")
    print(f" Input text: '{text}'")

    encoding = tokenizer(
        text,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )

    # The tokenizer produces CPU tensors; move them next to the model so the
    # forward pass does not fail when the model lives on another device.
    device = getattr(text_model, 'device', None)
    if device is not None:
        encoding = {name: tensor.to(device) for name, tensor in encoding.items()}

    with torch.no_grad():
        outputs = text_model(**encoding)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        text_prediction = torch.argmax(probs, dim=1).item()
        text_confidence = probs[0][text_prediction].item()

    # NOTE(review): assumes the model's class indices follow this
    # alphabetical label order - confirm against the training setup.
    text_tone_map = {0: 'angry', 1: 'anxious', 2: 'happy', 3: 'neutral', 4: 'sad'}
    text_tone = text_tone_map.get(text_prediction, 'neutral')

    print(f" Text-only prediction: {text_tone} ({text_confidence:.2%})")

    # 2. AUDIO-BASED ANALYSIS
    print("\nšŸŽ¤ AUDIO ANALYSIS:")
    audio_features = extract_audio_features(audio_file)

    # Tempo may arrive as a 1-element array depending on the librosa
    # version; "{:.1f}" raises TypeError on such an array.
    tempo_bpm = float(np.atleast_1d(audio_features['tempo']).ravel()[0])

    print(f" Pitch (mean): {audio_features['pitch_mean']:.1f} Hz")
    print(f" Energy (mean): {audio_features['energy_mean']:.4f}")
    print(f" Pitch variation (std): {audio_features['pitch_std']:.1f}")
    print(f" Tempo: {tempo_bpm:.1f} BPM")

    audio_tone, audio_confidence, tone_scores = classify_vocal_tone(audio_features)
    print(f" Audio-only prediction: {audio_tone} ({audio_confidence:.2%})")

    # 3. CONFLICT DETECTION
    conflict_detected = text_tone != audio_tone
    print("\nāš ļø CONFLICT ANALYSIS:")
    if conflict_detected:
        print(" āš ļø MISMATCH DETECTED!")
        print(f" Text says: {text_tone}")
        print(f" Voice indicates: {audio_tone}")
        print(" → User may be hiding true feelings")

        # When there's conflict, trust audio more (vocal tone is harder to fake)
        final_tone = audio_tone
        final_confidence = audio_confidence * 0.7 + (1 - text_confidence) * 0.3

        print(f"\n āœ“ Final assessment: {final_tone} ({final_confidence:.2%})")
        print(" (Prioritizing audio cues over text)")
    else:
        # When aligned, combine confidences
        final_tone = text_tone
        final_confidence = (text_confidence + audio_confidence) / 2
        print(" āœ“ Text and audio aligned")
        print(f" āœ“ Final assessment: {final_tone} ({final_confidence:.2%})")

    # 4. RISK ASSESSMENT
    print("\nšŸ„ MENTAL HEALTH RISK ASSESSMENT:")
    risk_level = "LOW"

    if audio_tone == 'sad' and text_tone in ['happy', 'neutral']:
        risk_level = "MEDIUM-HIGH"
        print(f" āš ļø Risk Level: {risk_level}")
        print(" User is masking sadness/depression")
        print(" Recommendation: Gentle probing, express concern")
    elif final_tone == 'sad' and audio_features['energy_mean'] < 0.03:
        risk_level = "MEDIUM"
        print(f" āš ļø Risk Level: {risk_level}")
        print(" Low energy and sad tone detected")
        print(" Recommendation: Check for depression symptoms")
    else:
        print(f" āœ“ Risk Level: {risk_level}")

    return {
        'text_tone': text_tone,
        'audio_tone': audio_tone,
        'final_tone': final_tone,
        'final_confidence': final_confidence,
        'conflict_detected': conflict_detected,
        'risk_level': risk_level,
        'audio_features': audio_features,
        'tone_scores': tone_scores,
    }

# ============================================================
# EXAMPLE USAGE
# ============================================================

def create_example_scenario():
    """
    Print a walkthrough of "I'm good" spoken in three different tones.

    Purely illustrative: the figures are hard-coded and nothing is analysed.
    Returns None.
    """
    heavy_rule = "=" * 60
    light_rule = "-" * 60

    # Header shown once, before the scenarios.
    intro = (
        "\n" + heavy_rule,
        "SCENARIO DEMONSTRATION",
        heavy_rule,
        "\nUser says: 'I'm good'",
        "\nBut how do they REALLY sound?",
        light_rule,
    )

    # One tuple of console lines per scenario: happy, masked sadness, anxiety.
    scenarios = (
        (
            "\nšŸ“Š SCENARIO 1: 'I'm good' (genuinely happy voice)",
            " - High pitch: 220 Hz",
            " - High energy: 0.12",
            " - Varied pitch: std 45",
            " → TEXT: happy, AUDIO: happy",
            " → RESULT: āœ“ User is genuinely doing well",
        ),
        (
            "\nšŸ“Š SCENARIO 2: 'I'm good' (low, flat, sad voice)",
            " - Low pitch: 130 Hz",
            " - Low energy: 0.02",
            " - Monotone: std 15",
            " → TEXT: happy/neutral, AUDIO: sad",
            " → RESULT: āš ļø CONFLICT! User is masking depression",
            " → ACTION: Bot should gently probe and show concern",
        ),
        (
            "\nšŸ“Š SCENARIO 3: 'I'm good' (anxious, shaky voice)",
            " - Variable pitch: mean 180 Hz, std 60",
            " - Moderate energy: 0.07",
            " - Fast tempo: 130 BPM",
            " → TEXT: happy/neutral, AUDIO: anxious",
            " → RESULT: āš ļø CONFLICT! User is anxious but hiding it",
            " → ACTION: Bot should acknowledge possible stress",
        ),
    )

    for section in (intro, *scenarios):
        for line in section:
            print(line)

# ============================================================
# TRAINING DATA GENERATION FOR MULTIMODAL MODEL
# ============================================================

def create_multimodal_training_data():
    """
    Build a small hand-written table pairing phrases with audio-feature
    patterns and the emotion they stand for, save it as
    'multimodal_training_data.csv' in the working directory, and return it.

    Returns:
        pandas.DataFrame with columns text, pitch_mean, energy_mean,
        pitch_std, tempo, true_emotion (11 rows).
    """
    column_names = ['text', 'pitch_mean', 'energy_mean', 'pitch_std', 'tempo', 'true_emotion']

    # Each record follows the column order above.
    records = [
        # Pattern 1: Masking sadness
        ("I'm good", 130, 0.02, 15, 80, 'sad'),
        ("I'm fine", 125, 0.025, 12, 75, 'sad'),
        ("Everything's okay", 135, 0.03, 18, 85, 'sad'),
        ("I'm doing well", 128, 0.022, 14, 78, 'sad'),
        # Pattern 2: Genuinely happy
        ("I'm good", 220, 0.12, 45, 110, 'happy'),
        ("I'm great", 215, 0.11, 48, 115, 'happy'),
        ("I'm doing amazing", 225, 0.13, 50, 120, 'happy'),
        ("Everything's wonderful", 218, 0.115, 46, 112, 'happy'),
        # Pattern 3: Masking anxiety
        ("I'm fine", 185, 0.08, 65, 135, 'anxious'),
        ("It's okay", 180, 0.075, 62, 130, 'anxious'),
        ("I'm managing", 190, 0.085, 68, 140, 'anxious'),
    ]

    import pandas as pd
    df = pd.DataFrame(records, columns=column_names)
    df.to_csv('multimodal_training_data.csv', index=False)

    sample_count = len(df)
    print("\nāœ“ Multimodal training data saved to 'multimodal_training_data.csv'")
    print(f" Total samples: {sample_count}")
    print(" Features: text + 4 audio features")

    return df

# ============================================================
# MAIN
# ============================================================

# Script entry point: print the walkthrough, a summary of the idea, the
# implementation checklist, and finally write the example training CSV.
if __name__ == "__main__":
    # Banner lines are emitted in one call; sep="\n" keeps one line each.
    print("\n" + "=" * 60, "ANSWERING: 'I'm good' in low tone detection", "=" * 60, sep="\n")

    create_example_scenario()

    print("\n\n" + "=" * 60, "KEY INSIGHTS", "=" * 60, sep="\n")
    print("""
1. TEXT-ONLY MODEL LIMITATION:
 - Sees words: "I'm good" → predicts: happy/neutral
 - MISSES: vocal tone indicating sadness

2. AUDIO FEATURES REVEAL TRUTH:
 - Low pitch (< 150 Hz) → sadness/depression
 - Low energy (< 0.05) → low mood
 - Monotone (low pitch std) → emotional flatness

3. SOLUTION - MULTIMODAL DETECTION:
 - Analyze BOTH text and audio
 - Detect conflicts between words and tone
 - Prioritize audio when conflict exists
 - Audio is harder to fake than words

4. CLINICAL SIGNIFICANCE:
 - People often mask depression with phrases like "I'm fine"
 - Vocal tone reveals true emotional state
 - This is called "alexithymia" or emotional masking
 - Critical for mental health chatbots to detect
 """)

    print("\n" + "=" * 60, "TO IMPLEMENT THIS:", "=" * 60, sep="\n")
    print("""
1. Install audio processing libraries:
 pip install librosa soundfile

2. Record user's voice (Web Speech API)

3. Extract audio features (pitch, energy, tempo)

4. Compare text sentiment vs audio indicators

5. Flag conflicts for mental health concern
 """)

    # Write the hand-made text + audio-feature examples to CSV.
    create_multimodal_training_data()


ANSWERING: 'I'm good' in low tone detection

SCENARIO DEMONSTRATION

User says: 'I'm good'

But how do they REALLY sound?
------------------------------------------------------------

šŸ“Š SCENARIO 1: 'I'm good' (genuinely happy voice)
 - High pitch: 220 Hz
 - High energy: 0.12
 - Varied pitch: std 45
 → TEXT: happy, AUDIO: happy
 → RESULT: āœ“ User is genuinely doing well

šŸ“Š SCENARIO 2: 'I'm good' (low, flat, sad voice)
 - Low pitch: 130 Hz
 - Low energy: 0.02
 - Monotone: std 15
 → TEXT: happy/neutral, AUDIO: sad
 → RESULT: āš ļø CONFLICT! User is masking depression
 → ACTION: Bot should gently probe and show concern

šŸ“Š SCENARIO 3: 'I'm good' (anxious, shaky voice)
 - Variable pitch: mean 180 Hz, std 60
 - Moderate energy: 0.07
 - Fast tempo: 130 BPM
 → TEXT: happy/neutral, AUDIO: anxious
 → RESULT: āš ļø CONFLICT! User is anxious but hiding it
 → ACTION: Bot should acknowledge possible stress


KEY INSIGHTS

1. TEXT-ONLY MODEL LIMITATION:
 - Sees words: "I'm good" → predict

In [11]:
import numpy as np
import librosa
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# ============================================================
# AUDIO FEATURE EXTRACTION
# ============================================================

def extract_audio_features(audio_file, fmin=50.0, fmax=500.0):
    """
    Extract audio features from a .wav file:
    - Pitch (fundamental frequency)
    - Energy/Intensity
    - Speaking rate (tempo)
    - Voice quality (pitch variation)

    Args:
        audio_file: Passed straight to ``librosa.load`` (resampled to 16 kHz).
        fmin: Lowest frequency in Hz the pitch tracker may report.
        fmax: Highest frequency in Hz the pitch tracker may report.

    Returns:
        dict with pitch_mean/std/min/max, energy_mean/std/max,
        pitch_variation (same value as pitch_std), tempo and duration.
        The pitch entries are 0 when no frame yields a pitch estimate.
    """
    y, sr = librosa.load(audio_file, sr=16000)
    features = {}

    # Pitch features
    # librosa.piptrack defaults to fmin=150 Hz, so without an explicit range
    # no pitch below 150 Hz is ever reported and the "pitch_mean < 150"
    # rules in classify_vocal_tone can never trigger.
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr, fmin=fmin, fmax=fmax)

    # Per frame: pitch of the strongest bin, kept only where that bin
    # actually carries energy (one argmax pass instead of argmax + max).
    frame_index = np.arange(pitches.shape[1])
    strongest_bin = magnitudes.argmax(axis=0)
    has_peak = magnitudes[strongest_bin, frame_index] > 0
    pitch_values = pitches[strongest_bin, frame_index][has_peak]

    voiced = pitch_values.size > 0
    features['pitch_mean'] = np.mean(pitch_values) if voiced else 0
    features['pitch_std'] = np.std(pitch_values) if voiced else 0
    features['pitch_min'] = np.min(pitch_values) if voiced else 0
    features['pitch_max'] = np.max(pitch_values) if voiced else 0

    # Energy
    rms = librosa.feature.rms(y=y)[0]
    features['energy_mean'] = np.mean(rms)
    features['energy_std'] = np.std(rms)
    features['energy_max'] = np.max(rms)

    # Pitch variation indicates monotone vs expressive
    features['pitch_variation'] = features['pitch_std']

    # Tempo / Speaking rate
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    # Tempo may be a float or a 1-element array depending on the librosa
    # version; normalise to a plain float for comparisons and formatting.
    features['tempo'] = float(np.atleast_1d(tempo).ravel()[0])

    # Duration
    features['duration'] = librosa.get_duration(y=y, sr=sr)

    return features

# ============================================================
# AUDIO TONE CLASSIFICATION
# ============================================================

def classify_vocal_tone(audio_features):
    """
    Classify emotion from audio features with additive rule scoring.

    Args:
        audio_features: Mapping with at least 'pitch_mean', 'pitch_std',
            'energy_mean' and 'tempo'. A 'pitch_mean' of 0 is read as
            "no pitch detected", so the pitch-based rules are skipped.

    Returns:
        (dominant_tone, confidence, scores): the winning label, its share of
        the total score (0 when no rule fired) and the per-label score dict.
        When no rule fires at all the label is 'neutral'.
    """
    pitch_mean = audio_features['pitch_mean']
    pitch_std = audio_features['pitch_std']
    energy_mean = audio_features['energy_mean']
    tempo = audio_features['tempo']

    scores = {'sad': 0, 'happy': 0, 'anxious': 0, 'neutral': 0, 'angry': 0}

    # pitch_mean == 0 is the extractor's "no pitch found" sentinel and must
    # not be scored as a very low, monotone voice.
    has_pitch = pitch_mean > 0
    low_pitch = has_pitch and pitch_mean < 150

    # Low tone detection: low pitch combined with low energy
    if low_pitch and energy_mean < 0.05:
        scores['sad'] += 3

    # Pitch level
    if low_pitch:
        scores['sad'] += 2
    elif pitch_mean > 200:
        scores['happy'] += 1
        scores['anxious'] += 1

    # Energy level
    if energy_mean < 0.03:
        scores['sad'] += 2
    elif energy_mean > 0.1:
        scores['happy'] += 1
        scores['angry'] += 1

    # Pitch variation (monotone vs expressive) - only when pitch exists
    if has_pitch:
        if pitch_std < 20:
            scores['sad'] += 2
            scores['neutral'] += 1
        elif pitch_std > 50:
            scores['happy'] += 2
            scores['anxious'] += 1

    # Tempo (slow vs fast)
    if tempo < 80:
        scores['sad'] += 1
    elif tempo > 120:
        scores['anxious'] += 2
        scores['angry'] += 1

    total = sum(scores.values())
    if total == 0:
        # max() over all-zero scores would pick the first key ('sad');
        # with no evidence the result should be 'neutral'.
        return 'neutral', 0, scores

    # Ties resolve to the earliest key in `scores`.
    dominant_tone = max(scores, key=scores.get)
    confidence = scores[dominant_tone] / total

    return dominant_tone, confidence, scores

# ============================================================
# MULTIMODAL TRAINING DATA GENERATION (AUDIO ONLY)
# ============================================================

def generate_audio_training_data(n_samples=100, output_path='multimodal_audio_dataset.csv', seed=42):
    """
    Generate a synthetic audio-feature dataset for 5 emotions.

    For every emotion, ``n_samples`` rows are drawn from hand-picked
    distributions: normal for pitch_mean/pitch_std, uniform for
    energy_mean/tempo.

    Args:
        n_samples: Rows to generate per emotion.
        output_path: CSV file to write; pass None to skip writing.
        seed: Seed for the private random generator; the default of 42
            yields the same draws as the legacy ``np.random.seed(42)``.

    Returns:
        pandas.DataFrame with columns pitch_mean, pitch_std, energy_mean,
        tempo, true_emotion, ordered sad, happy, anxious, neutral, angry.
    """
    # A private RandomState yields the same stream as np.random.seed(seed)
    # followed by np.random.* calls, without clobbering the caller's global
    # NumPy random state.
    rng = np.random.RandomState(seed)

    # emotion -> (pitch_mean ~ N(mu, sigma), pitch_std ~ N(mu, sigma),
    #             energy_mean ~ U(lo, hi), tempo ~ U(lo, hi))
    profiles = {
        'sad': ((130, 5), (15, 5), (0.015, 0.04), (70, 90)),
        'happy': ((220, 10), (45, 10), (0.1, 0.15), (100, 130)),
        'anxious': ((180, 10), (60, 10), (0.06, 0.09), (120, 150)),
        'neutral': ((160, 10), (25, 5), (0.05, 0.08), (90, 110)),
        'angry': ((210, 10), (50, 10), (0.12, 0.18), (120, 160)),
    }

    data = []
    for emo, (pitch_mean_params, pitch_std_params, energy_range, tempo_range) in profiles.items():
        for _ in range(n_samples):
            # Draw order is fixed (pitch_mean, pitch_std, energy, tempo) so a
            # given seed always reproduces the same dataset.
            pitch_mean = rng.normal(*pitch_mean_params)
            # A standard deviation cannot be negative; clip the rare
            # negative tail of the normal draw.
            pitch_std = max(0.0, rng.normal(*pitch_std_params))
            energy_mean = rng.uniform(*energy_range)
            tempo = rng.uniform(*tempo_range)

            data.append([pitch_mean, pitch_std, energy_mean, tempo, emo])

    df = pd.DataFrame(data, columns=['pitch_mean', 'pitch_std', 'energy_mean', 'tempo', 'true_emotion'])

    if output_path is not None:
        df.to_csv(output_path, index=False)
        print(f"āœ“ Multimodal audio dataset saved as '{output_path}'")

    return df

# ============================================================
# EXAMPLE USAGE
# ============================================================

# Script entry point for this cell: only the synthetic dataset is generated;
# classifying a real recording is left as a commented-out example.
if __name__ == "__main__":
 # Generate the synthetic dataset with 100 rows per emotion; the function
 # also writes it to 'multimodal_audio_dataset.csv'.
 df = generate_audio_training_data(n_samples=100)

 # Example: classify a sample audio file (replace 'example.wav' with your file).
 # Left disabled because no audio file ships with the notebook.
 # audio_features = extract_audio_features('example.wav')
 # tone, confidence, scores = classify_vocal_tone(audio_features)
 # print("Predicted tone:", tone, "Confidence:", confidence)


āœ“ Multimodal audio dataset saved as 'multimodal_audio_dataset.csv'


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pickle

# Train, evaluate and persist a random-forest emotion classifier on the
# audio-feature CSV.

# Load CSV (same file name that generate_audio_training_data writes).
# NOTE(review): if this is that synthetic dataset, the metrics below only
# show how separable the generated clusters are, not real-speech accuracy.
df = pd.read_csv('multimodal_audio_dataset.csv')

# Features and labels: every column except 'true_emotion' is a feature.
X = df.drop(columns=['true_emotion'])
y = df['true_emotion']

# Split train/test: 80/20, stratified so each emotion keeps its proportion
# in both parts; fixed random_state for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
 X, y, test_size=0.2, random_state=42, stratify=y
)

# Train Random Forest (200 trees, fixed seed for repeatable results)
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate on the held-out 20%
y_pred = rf_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Save model as a pickle in the working directory.
# Only unpickle this file from a trusted source: loading a pickle can
# execute arbitrary code.
with open('audio_emotion_rf_model.pkl', 'wb') as f:
 pickle.dump(rf_model, f)

print("āœ“ Model saved as 'audio_emotion_rf_model.pkl'")


In [12]:
# NOTE(review): `files` is never imported or defined in the visible cells, so
# this call raises NameError. Presumably it is meant to be the Colab helper
# (`from google.colab import files`) - confirm the runtime and add the import
# before calling.
files.download('audio_emotion_rf_model.pkl')



NameError: name 'files' is not defined