{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "s1RBHgfAsDCN", "outputId": "565d6b86-5df5-4ec4-be5c-5d24079ca536" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: librosa in /usr/local/lib/python3.12/dist-packages (0.11.0)\n", "Requirement already satisfied: soundfile in /usr/local/lib/python3.12/dist-packages (0.13.1)\n", "Requirement already satisfied: torch in /usr/local/lib/python3.12/dist-packages (2.8.0+cu126)\n", "Requirement already satisfied: transformers in /usr/local/lib/python3.12/dist-packages (4.57.1)\n", "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.12/dist-packages (1.6.1)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (2.2.2)\n", "Requirement already satisfied: audioread>=2.1.9 in /usr/local/lib/python3.12/dist-packages (from librosa) (3.1.0)\n", "Requirement already satisfied: numba>=0.51.0 in /usr/local/lib/python3.12/dist-packages (from librosa) (0.60.0)\n", "Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.12/dist-packages (from librosa) (1.16.3)\n", "Requirement already satisfied: joblib>=1.0 in /usr/local/lib/python3.12/dist-packages (from librosa) (1.5.2)\n", "Requirement already satisfied: decorator>=4.3.0 in /usr/local/lib/python3.12/dist-packages (from librosa) (4.4.2)\n", "Requirement already satisfied: pooch>=1.1 in /usr/local/lib/python3.12/dist-packages (from librosa) (1.8.2)\n", "Requirement already satisfied: soxr>=0.3.2 in /usr/local/lib/python3.12/dist-packages (from librosa) (1.0.0)\n", "Requirement already satisfied: typing_extensions>=4.1.1 in /usr/local/lib/python3.12/dist-packages (from librosa) (4.15.0)\n", "Requirement already satisfied: lazy_loader>=0.1 in /usr/local/lib/python3.12/dist-packages (from librosa) (0.4)\n", "Requirement already satisfied: msgpack>=1.0 in /usr/local/lib/python3.12/dist-packages (from librosa) (1.1.2)\n", "Requirement already satisfied: cffi>=1.0 in /usr/local/lib/python3.12/dist-packages (from soundfile) (2.0.0)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from torch) (3.20.0)\n", "Requirement already satisfied: setuptools in /usr/local/lib/python3.12/dist-packages (from torch) (75.2.0)\n", "Requirement already satisfied: sympy>=1.13.3 in /usr/local/lib/python3.12/dist-packages (from torch) (1.13.3)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.12/dist-packages (from torch) (3.5)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.12/dist-packages (from torch) (3.1.6)\n", "Requirement already satisfied: fsspec in /usr/local/lib/python3.12/dist-packages (from torch) (2025.3.0)\n", "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.6.77 in /usr/local/lib/python3.12/dist-packages (from torch) (12.6.77)\n", "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.6.77 in /usr/local/lib/python3.12/dist-packages (from torch) (12.6.77)\n", "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.6.80 in /usr/local/lib/python3.12/dist-packages (from torch) (12.6.80)\n", "Requirement already satisfied: nvidia-cudnn-cu12==9.10.2.21 in 
/usr/local/lib/python3.12/dist-packages (from torch) (9.10.2.21)\n", "Requirement already satisfied: nvidia-cublas-cu12==12.6.4.1 in /usr/local/lib/python3.12/dist-packages (from torch) (12.6.4.1)\n", "Requirement already satisfied: nvidia-cufft-cu12==11.3.0.4 in /usr/local/lib/python3.12/dist-packages (from torch) (11.3.0.4)\n", "Requirement already satisfied: nvidia-curand-cu12==10.3.7.77 in /usr/local/lib/python3.12/dist-packages (from torch) (10.3.7.77)\n", "Requirement already satisfied: nvidia-cusolver-cu12==11.7.1.2 in /usr/local/lib/python3.12/dist-packages (from torch) (11.7.1.2)\n", "Requirement already satisfied: nvidia-cusparse-cu12==12.5.4.2 in /usr/local/lib/python3.12/dist-packages (from torch) (12.5.4.2)\n", "Requirement already satisfied: nvidia-cusparselt-cu12==0.7.1 in /usr/local/lib/python3.12/dist-packages (from torch) (0.7.1)\n", "Requirement already satisfied: nvidia-nccl-cu12==2.27.3 in /usr/local/lib/python3.12/dist-packages (from torch) (2.27.3)\n", "Requirement already satisfied: nvidia-nvtx-cu12==12.6.77 in /usr/local/lib/python3.12/dist-packages (from torch) (12.6.77)\n", "Requirement already satisfied: nvidia-nvjitlink-cu12==12.6.85 in /usr/local/lib/python3.12/dist-packages (from torch) (12.6.85)\n", "Requirement already satisfied: nvidia-cufile-cu12==1.11.1.6 in /usr/local/lib/python3.12/dist-packages (from torch) (1.11.1.6)\n", "Requirement already satisfied: triton==3.4.0 in /usr/local/lib/python3.12/dist-packages (from torch) (3.4.0)\n", "Requirement already satisfied: huggingface-hub<1.0,>=0.34.0 in /usr/local/lib/python3.12/dist-packages (from transformers) (0.36.0)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from transformers) (25.0)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.12/dist-packages (from transformers) (6.0.3)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.12/dist-packages (from transformers) (2024.11.6)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.12/dist-packages (from transformers) (2.32.4)\n", "Requirement already satisfied: tokenizers<=0.23.0,>=0.22.0 in /usr/local/lib/python3.12/dist-packages (from transformers) (0.22.1)\n", "Requirement already satisfied: safetensors>=0.4.3 in /usr/local/lib/python3.12/dist-packages (from transformers) (0.6.2)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.12/dist-packages (from transformers) (4.67.1)\n", "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn) (3.6.0)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas) (2.9.0.post0)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.2)\n", "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.2)\n", "Requirement already satisfied: pycparser in /usr/local/lib/python3.12/dist-packages (from cffi>=1.0->soundfile) (2.23)\n", "Requirement already satisfied: hf-xet<2.0.0,>=1.1.3 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<1.0,>=0.34.0->transformers) (1.2.0)\n", "Requirement already satisfied: llvmlite<0.44,>=0.43.0dev0 in /usr/local/lib/python3.12/dist-packages (from numba>=0.51.0->librosa) (0.43.0)\n", "Requirement already satisfied: platformdirs>=2.5.0 in /usr/local/lib/python3.12/dist-packages (from pooch>=1.1->librosa) (4.5.0)\n", 
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n", "Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests->transformers) (3.4.4)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.12/dist-packages (from requests->transformers) (3.11)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests->transformers) (2.5.0)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.12/dist-packages (from requests->transformers) (2025.10.5)\n", "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.12/dist-packages (from sympy>=1.13.3->torch) (1.3.0)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.12/dist-packages (from jinja2->torch) (3.0.3)\n" ] } ], "source": [ "pip install librosa soundfile torch transformers scikit-learn numpy pandas\n" ] }, { "cell_type": "code", "source": [ "import numpy as np\n", "import librosa\n", "import soundfile as sf\n", "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n", "import torch\n", "from sklearn.ensemble import RandomForestClassifier\n", "import pickle\n", "import warnings\n", "warnings.filterwarnings('ignore')\n", "\n", "# ============================================================\n", "# AUDIO FEATURE EXTRACTION\n", "# ============================================================\n", "\n", "def extract_audio_features(audio_file):\n", " \"\"\"\n", " Extract audio features that indicate emotional tone:\n", " - Pitch (fundamental frequency)\n", " - Energy/Intensity\n", " - Speaking rate\n", " - Voice quality indicators\n", " \"\"\"\n", " # Load audio\n", " y, sr = librosa.load(audio_file, sr=16000)\n", "\n", " features = {}\n", "\n", " # 1. PITCH FEATURES (Low pitch often indicates sadness/depression)\n", " pitches, magnitudes = librosa.piptrack(y=y, sr=sr)\n", " pitch_values = []\n", " for t in range(pitches.shape[1]):\n", " index = magnitudes[:, t].argmax()\n", " pitch = pitches[index, t]\n", " if pitch > 0:\n", " pitch_values.append(pitch)\n", "\n", " if len(pitch_values) > 0:\n", " features['pitch_mean'] = np.mean(pitch_values)\n", " features['pitch_std'] = np.std(pitch_values)\n", " features['pitch_min'] = np.min(pitch_values)\n", " features['pitch_max'] = np.max(pitch_values)\n", " else:\n", " features['pitch_mean'] = 0\n", " features['pitch_std'] = 0\n", " features['pitch_min'] = 0\n", " features['pitch_max'] = 0\n", "\n", " # 2. ENERGY FEATURES (Low energy indicates low mood)\n", " rms = librosa.feature.rms(y=y)[0]\n", " features['energy_mean'] = np.mean(rms)\n", " features['energy_std'] = np.std(rms)\n", " features['energy_max'] = np.max(rms)\n", "\n", " # 3. ZERO CROSSING RATE (Voice quality indicator)\n", " zcr = librosa.feature.zero_crossing_rate(y)[0]\n", " features['zcr_mean'] = np.mean(zcr)\n", " features['zcr_std'] = np.std(zcr)\n", "\n", " # 4. SPECTRAL FEATURES\n", " spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)[0]\n", " features['spectral_centroid_mean'] = np.mean(spectral_centroids)\n", " features['spectral_centroid_std'] = np.std(spectral_centroids)\n", "\n", " # 5. 
MFCC (Mel-frequency cepstral coefficients) - Voice timbre\n", " mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)\n", " for i in range(13):\n", " features[f'mfcc_{i}_mean'] = np.mean(mfccs[i])\n", " features[f'mfcc_{i}_std'] = np.std(mfccs[i])\n", "\n", " # 6. TEMPO/SPEAKING RATE\n", " tempo, _ = librosa.beat.beat_track(y=y, sr=sr)\n", " features['tempo'] = tempo\n", "\n", " # 7. DURATION (Longer pauses might indicate hesitation/sadness)\n", " features['duration'] = librosa.get_duration(y=y, sr=sr)\n", "\n", " return features\n", "\n", "def classify_vocal_tone(audio_features):\n", " \"\"\"\n", " Classify emotional state based on audio features\n", "\n", " Rules based on psychological research:\n", " - Depression/Sadness: Low pitch, low energy, slow tempo, monotone\n", " - Anxiety: Higher pitch variation, moderate-high energy\n", " - Anger: High energy, high pitch, fast tempo\n", " - Happiness: Moderate-high pitch, high energy, varied pitch\n", " \"\"\"\n", "\n", " pitch_mean = audio_features['pitch_mean']\n", " pitch_std = audio_features['pitch_std']\n", " energy_mean = audio_features['energy_mean']\n", " tempo = audio_features['tempo']\n", "\n", " # Decision logic\n", " scores = {\n", " 'sad': 0,\n", " 'anxious': 0,\n", " 'angry': 0,\n", " 'happy': 0,\n", " 'neutral': 0\n", " }\n", "\n", " # LOW TONE DETECTION (Key for your question!)\n", " if pitch_mean < 150 and energy_mean < 0.05: # Low pitch + low energy\n", " scores['sad'] += 3\n", " print(\" šŸ”“ ALERT: Low vocal tone detected (possible sadness/depression)\")\n", "\n", " # Pitch indicators\n", " if pitch_mean < 150:\n", " scores['sad'] += 2\n", " elif pitch_mean > 200:\n", " scores['happy'] += 1\n", " scores['anxious'] += 1\n", "\n", " # Energy indicators\n", " if energy_mean < 0.03:\n", " scores['sad'] += 2\n", " elif energy_mean > 0.1:\n", " scores['happy'] += 1\n", " scores['angry'] += 1\n", "\n", " # Pitch variation (monotone vs expressive)\n", " if pitch_std < 20: # Monotone\n", " scores['sad'] += 2\n", " scores['neutral'] += 1\n", " elif pitch_std > 50: # Very expressive\n", " scores['happy'] += 2\n", " scores['anxious'] += 1\n", "\n", " # Tempo indicators\n", " if tempo < 80: # Slow speaking\n", " scores['sad'] += 1\n", " elif tempo > 120: # Fast speaking\n", " scores['anxious'] += 2\n", " scores['angry'] += 1\n", "\n", " # Get dominant emotion\n", " dominant_tone = max(scores, key=scores.get)\n", " confidence = scores[dominant_tone] / sum(scores.values()) if sum(scores.values()) > 0 else 0\n", "\n", " return dominant_tone, confidence, scores\n", "\n", "# ============================================================\n", "# MULTIMODAL DETECTION (Text + Audio)\n", "# ============================================================\n", "\n", "def multimodal_tone_detection(text, audio_file, text_model, tokenizer):\n", " \"\"\"\n", " Combine text-based NLP with audio analysis for accurate detection\n", "\n", " This solves your problem: \"I'm good\" in text vs \"I'm good\" in low voice\n", " \"\"\"\n", "\n", " print(\"\\n\" + \"=\"*60)\n", " print(\"MULTIMODAL TONE DETECTION\")\n", " print(\"=\"*60)\n", "\n", " # 1. 
TEXT-BASED ANALYSIS\n", " print(\"\\nšŸ“ TEXT ANALYSIS:\")\n", " print(f\" Input text: '{text}'\")\n", "\n", " encoding = tokenizer(\n", " text,\n", " max_length=128,\n", " padding='max_length',\n", " truncation=True,\n", " return_tensors='pt'\n", " )\n", "\n", " with torch.no_grad():\n", " outputs = text_model(**encoding)\n", " probs = torch.nn.functional.softmax(outputs.logits, dim=1)\n", " text_prediction = torch.argmax(probs, dim=1).item()\n", " text_confidence = probs[0][text_prediction].item()\n", "\n", " text_tone_map = {0: 'angry', 1: 'anxious', 2: 'happy', 3: 'neutral', 4: 'sad'}\n", " text_tone = text_tone_map.get(text_prediction, 'neutral')\n", "\n", " print(f\" Text-only prediction: {text_tone} ({text_confidence:.2%})\")\n", "\n", " # 2. AUDIO-BASED ANALYSIS\n", " print(\"\\nšŸŽ¤ AUDIO ANALYSIS:\")\n", " audio_features = extract_audio_features(audio_file)\n", "\n", " print(f\" Pitch (mean): {audio_features['pitch_mean']:.1f} Hz\")\n", " print(f\" Energy (mean): {audio_features['energy_mean']:.4f}\")\n", " print(f\" Pitch variation (std): {audio_features['pitch_std']:.1f}\")\n", " print(f\" Tempo: {audio_features['tempo']:.1f} BPM\")\n", "\n", " audio_tone, audio_confidence, tone_scores = classify_vocal_tone(audio_features)\n", " print(f\" Audio-only prediction: {audio_tone} ({audio_confidence:.2%})\")\n", "\n", " # 3. CONFLICT DETECTION\n", " print(\"\\nāš ļø CONFLICT ANALYSIS:\")\n", " if text_tone != audio_tone:\n", " print(f\" āš ļø MISMATCH DETECTED!\")\n", " print(f\" Text says: {text_tone}\")\n", " print(f\" Voice indicates: {audio_tone}\")\n", " print(f\" → User may be hiding true feelings\")\n", "\n", " # When there's conflict, trust audio more (vocal tone is harder to fake)\n", " final_tone = audio_tone\n", " final_confidence = audio_confidence * 0.7 + (1 - text_confidence) * 0.3\n", "\n", " print(f\"\\n āœ“ Final assessment: {final_tone} ({final_confidence:.2%})\")\n", " print(f\" (Prioritizing audio cues over text)\")\n", "\n", " else:\n", " # When aligned, combine confidences\n", " final_tone = text_tone\n", " final_confidence = (text_confidence + audio_confidence) / 2\n", " print(f\" āœ“ Text and audio aligned\")\n", " print(f\" āœ“ Final assessment: {final_tone} ({final_confidence:.2%})\")\n", "\n", " # 4. 
RISK ASSESSMENT\n", " print(\"\\nšŸ„ MENTAL HEALTH RISK ASSESSMENT:\")\n", " risk_level = \"LOW\"\n", "\n", " if audio_tone == 'sad' and text_tone in ['happy', 'neutral']:\n", " risk_level = \"MEDIUM-HIGH\"\n", " print(f\" āš ļø Risk Level: {risk_level}\")\n", " print(f\" User is masking sadness/depression\")\n", " print(f\" Recommendation: Gentle probing, express concern\")\n", " elif final_tone == 'sad' and audio_features['energy_mean'] < 0.03:\n", " risk_level = \"MEDIUM\"\n", " print(f\" āš ļø Risk Level: {risk_level}\")\n", " print(f\" Low energy and sad tone detected\")\n", " print(f\" Recommendation: Check for depression symptoms\")\n", " else:\n", " print(f\" āœ“ Risk Level: {risk_level}\")\n", "\n", " return {\n", " 'text_tone': text_tone,\n", " 'audio_tone': audio_tone,\n", " 'final_tone': final_tone,\n", " 'final_confidence': final_confidence,\n", " 'conflict_detected': text_tone != audio_tone,\n", " 'risk_level': risk_level,\n", " 'audio_features': audio_features,\n", " 'tone_scores': tone_scores\n", " }\n", "\n", "# ============================================================\n", "# EXAMPLE USAGE\n", "# ============================================================\n", "\n", "def create_example_scenario():\n", " \"\"\"\n", " Demonstrate the scenario: \"I'm good\" said in different tones\n", " \"\"\"\n", "\n", " print(\"\\n\" + \"=\"*60)\n", " print(\"SCENARIO DEMONSTRATION\")\n", " print(\"=\"*60)\n", " print(\"\\nUser says: 'I'm good'\")\n", " print(\"\\nBut how do they REALLY sound?\")\n", " print(\"-\"*60)\n", "\n", " # Scenario 1: Said in genuinely happy tone\n", " print(\"\\nšŸ“Š SCENARIO 1: 'I'm good' (genuinely happy voice)\")\n", " print(\" - High pitch: 220 Hz\")\n", " print(\" - High energy: 0.12\")\n", " print(\" - Varied pitch: std 45\")\n", " print(\" → TEXT: happy, AUDIO: happy\")\n", " print(\" → RESULT: āœ“ User is genuinely doing well\")\n", "\n", " # Scenario 2: Said in low, depressed tone (YOUR QUESTION!)\n", " print(\"\\nšŸ“Š SCENARIO 2: 'I'm good' (low, flat, sad voice)\")\n", " print(\" - Low pitch: 130 Hz\")\n", " print(\" - Low energy: 0.02\")\n", " print(\" - Monotone: std 15\")\n", " print(\" → TEXT: happy/neutral, AUDIO: sad\")\n", " print(\" → RESULT: āš ļø CONFLICT! User is masking depression\")\n", " print(\" → ACTION: Bot should gently probe and show concern\")\n", "\n", " # Scenario 3: Said in anxious tone\n", " print(\"\\nšŸ“Š SCENARIO 3: 'I'm good' (anxious, shaky voice)\")\n", " print(\" - Variable pitch: mean 180 Hz, std 60\")\n", " print(\" - Moderate energy: 0.07\")\n", " print(\" - Fast tempo: 130 BPM\")\n", " print(\" → TEXT: happy/neutral, AUDIO: anxious\")\n", " print(\" → RESULT: āš ļø CONFLICT! 
User is anxious but hiding it\")\n", " print(\" → ACTION: Bot should acknowledge possible stress\")\n", "\n", "# ============================================================\n", "# TRAINING DATA GENERATION FOR MULTIMODAL MODEL\n", "# ============================================================\n", "\n", "def create_multimodal_training_data():\n", " \"\"\"\n", " Create training data that includes audio feature patterns\n", " \"\"\"\n", "\n", " data = {\n", " 'text': [],\n", " 'pitch_mean': [],\n", " 'energy_mean': [],\n", " 'pitch_std': [],\n", " 'tempo': [],\n", " 'true_emotion': []\n", " }\n", "\n", " # Pattern 1: Masking sadness\n", " masking_sadness = [\n", " (\"I'm good\", 130, 0.02, 15, 80, 'sad'),\n", " (\"I'm fine\", 125, 0.025, 12, 75, 'sad'),\n", " (\"Everything's okay\", 135, 0.03, 18, 85, 'sad'),\n", " (\"I'm doing well\", 128, 0.022, 14, 78, 'sad'),\n", " ]\n", "\n", " # Pattern 2: Genuinely happy\n", " genuinely_happy = [\n", " (\"I'm good\", 220, 0.12, 45, 110, 'happy'),\n", " (\"I'm great\", 215, 0.11, 48, 115, 'happy'),\n", " (\"I'm doing amazing\", 225, 0.13, 50, 120, 'happy'),\n", " (\"Everything's wonderful\", 218, 0.115, 46, 112, 'happy'),\n", " ]\n", "\n", " # Pattern 3: Masking anxiety\n", " masking_anxiety = [\n", " (\"I'm fine\", 185, 0.08, 65, 135, 'anxious'),\n", " (\"It's okay\", 180, 0.075, 62, 130, 'anxious'),\n", " (\"I'm managing\", 190, 0.085, 68, 140, 'anxious'),\n", " ]\n", "\n", " all_patterns = masking_sadness + genuinely_happy + masking_anxiety\n", "\n", " for text, pitch, energy, pitch_std, tempo, emotion in all_patterns:\n", " data['text'].append(text)\n", " data['pitch_mean'].append(pitch)\n", " data['energy_mean'].append(energy)\n", " data['pitch_std'].append(pitch_std)\n", " data['tempo'].append(tempo)\n", " data['true_emotion'].append(emotion)\n", "\n", " import pandas as pd\n", " df = pd.DataFrame(data)\n", " df.to_csv('multimodal_training_data.csv', index=False)\n", " print(\"\\nāœ“ Multimodal training data saved to 'multimodal_training_data.csv'\")\n", " print(f\" Total samples: {len(df)}\")\n", " print(f\" Features: text + 4 audio features\")\n", "\n", " return df\n", "\n", "# ============================================================\n", "# MAIN\n", "# ============================================================\n", "\n", "if __name__ == \"__main__\":\n", " print(\"\\n\" + \"=\"*60)\n", " print(\"ANSWERING: 'I'm good' in low tone detection\")\n", " print(\"=\"*60)\n", "\n", " create_example_scenario()\n", "\n", " print(\"\\n\\n\" + \"=\"*60)\n", " print(\"KEY INSIGHTS\")\n", " print(\"=\"*60)\n", " print(\"\"\"\n", "1. TEXT-ONLY MODEL LIMITATION:\n", " - Sees words: \"I'm good\" → predicts: happy/neutral\n", " - MISSES: vocal tone indicating sadness\n", "\n", "2. AUDIO FEATURES REVEAL TRUTH:\n", " - Low pitch (< 150 Hz) → sadness/depression\n", " - Low energy (< 0.05) → low mood\n", " - Monotone (low pitch std) → emotional flatness\n", "\n", "3. SOLUTION - MULTIMODAL DETECTION:\n", " - Analyze BOTH text and audio\n", " - Detect conflicts between words and tone\n", " - Prioritize audio when conflict exists\n", " - Audio is harder to fake than words\n", "\n", "4. 
CLINICAL SIGNIFICANCE:\n", " - People often mask depression with phrases like \"I'm fine\"\n", " - Vocal tone reveals true emotional state\n", " - This is called \"alexithymia\" or emotional masking\n", " - Critical for mental health chatbots to detect\n", " \"\"\")\n", "\n", " print(\"\\n\" + \"=\"*60)\n", " print(\"TO IMPLEMENT THIS:\")\n", " print(\"=\"*60)\n", " print(\"\"\"\n", "1. Install audio processing libraries:\n", " pip install librosa soundfile\n", "\n", "2. Record user's voice (Web Speech API)\n", "\n", "3. Extract audio features (pitch, energy, tempo)\n", "\n", "4. Compare text sentiment vs audio indicators\n", "\n", "5. Flag conflicts for mental health concern\n", " \"\"\")\n", "\n", " # Generate training data\n", " create_multimodal_training_data()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "LBO899-WsXfW", "outputId": "62bca492-4abb-422e-e034-869c0e01c643" }, "execution_count": 10, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "============================================================\n", "ANSWERING: 'I'm good' in low tone detection\n", "============================================================\n", "\n", "============================================================\n", "SCENARIO DEMONSTRATION\n", "============================================================\n", "\n", "User says: 'I'm good'\n", "\n", "But how do they REALLY sound?\n", "------------------------------------------------------------\n", "\n", "šŸ“Š SCENARIO 1: 'I'm good' (genuinely happy voice)\n", " - High pitch: 220 Hz\n", " - High energy: 0.12\n", " - Varied pitch: std 45\n", " → TEXT: happy, AUDIO: happy\n", " → RESULT: āœ“ User is genuinely doing well\n", "\n", "šŸ“Š SCENARIO 2: 'I'm good' (low, flat, sad voice)\n", " - Low pitch: 130 Hz\n", " - Low energy: 0.02\n", " - Monotone: std 15\n", " → TEXT: happy/neutral, AUDIO: sad\n", " → RESULT: āš ļø CONFLICT! User is masking depression\n", " → ACTION: Bot should gently probe and show concern\n", "\n", "šŸ“Š SCENARIO 3: 'I'm good' (anxious, shaky voice)\n", " - Variable pitch: mean 180 Hz, std 60\n", " - Moderate energy: 0.07\n", " - Fast tempo: 130 BPM\n", " → TEXT: happy/neutral, AUDIO: anxious\n", " → RESULT: āš ļø CONFLICT! User is anxious but hiding it\n", " → ACTION: Bot should acknowledge possible stress\n", "\n", "\n", "============================================================\n", "KEY INSIGHTS\n", "============================================================\n", "\n", "1. TEXT-ONLY MODEL LIMITATION:\n", " - Sees words: \"I'm good\" → predicts: happy/neutral\n", " - MISSES: vocal tone indicating sadness\n", " \n", "2. AUDIO FEATURES REVEAL TRUTH:\n", " - Low pitch (< 150 Hz) → sadness/depression\n", " - Low energy (< 0.05) → low mood\n", " - Monotone (low pitch std) → emotional flatness\n", " \n", "3. SOLUTION - MULTIMODAL DETECTION:\n", " - Analyze BOTH text and audio\n", " - Detect conflicts between words and tone\n", " - Prioritize audio when conflict exists\n", " - Audio is harder to fake than words\n", " \n", "4. CLINICAL SIGNIFICANCE:\n", " - People often mask depression with phrases like \"I'm fine\"\n", " - Vocal tone reveals true emotional state\n", " - This is called \"alexithymia\" or emotional masking\n", " - Critical for mental health chatbots to detect\n", " \n", "\n", "============================================================\n", "TO IMPLEMENT THIS:\n", "============================================================\n", "\n", "1. 
Install audio processing libraries:\n", " pip install librosa soundfile\n", "\n", "2. Record user's voice (Web Speech API)\n", "\n", "3. Extract audio features (pitch, energy, tempo)\n", "\n", "4. Compare text sentiment vs audio indicators\n", "\n", "5. Flag conflicts for mental health concern\n", " \n", "\n", "āœ“ Multimodal training data saved to 'multimodal_training_data.csv'\n", " Total samples: 11\n", " Features: text + 4 audio features\n" ] } ] }, { "cell_type": "code", "source": [ "import numpy as np\n", "import librosa\n", "import pandas as pd\n", "import warnings\n", "warnings.filterwarnings('ignore')\n", "\n", "# ============================================================\n", "# AUDIO FEATURE EXTRACTION\n", "# ============================================================\n", "\n", "def extract_audio_features(audio_file):\n", " \"\"\"\n", " Extract audio features from a .wav file:\n", " - Pitch (fundamental frequency)\n", " - Energy/Intensity\n", " - Speaking rate (tempo)\n", " - Voice quality (pitch variation)\n", " \"\"\"\n", " y, sr = librosa.load(audio_file, sr=16000)\n", " features = {}\n", "\n", " # Pitch features\n", " pitches, magnitudes = librosa.piptrack(y=y, sr=sr)\n", " pitch_values = [pitches[magnitudes[:, t].argmax(), t]\n", " for t in range(pitches.shape[1])\n", " if magnitudes[:, t].max() > 0]\n", "\n", " features['pitch_mean'] = np.mean(pitch_values) if pitch_values else 0\n", " features['pitch_std'] = np.std(pitch_values) if pitch_values else 0\n", " features['pitch_min'] = np.min(pitch_values) if pitch_values else 0\n", " features['pitch_max'] = np.max(pitch_values) if pitch_values else 0\n", "\n", " # Energy\n", " rms = librosa.feature.rms(y=y)[0]\n", " features['energy_mean'] = np.mean(rms)\n", " features['energy_std'] = np.std(rms)\n", " features['energy_max'] = np.max(rms)\n", "\n", " # Pitch variation indicates monotone vs expressive\n", " features['pitch_variation'] = features['pitch_std']\n", "\n", " # Tempo / Speaking rate\n", " tempo, _ = librosa.beat.beat_track(y=y, sr=sr)\n", " features['tempo'] = tempo\n", "\n", " # Duration\n", " features['duration'] = librosa.get_duration(y=y, sr=sr)\n", "\n", " return features\n", "\n", "# ============================================================\n", "# AUDIO TONE CLASSIFICATION\n", "# ============================================================\n", "\n", "def classify_vocal_tone(audio_features):\n", " \"\"\"\n", " Classify emotion from audio features.\n", " \"\"\"\n", " pitch_mean = audio_features['pitch_mean']\n", " pitch_std = audio_features['pitch_std']\n", " energy_mean = audio_features['energy_mean']\n", " tempo = audio_features['tempo']\n", "\n", " scores = {'sad':0, 'happy':0, 'anxious':0, 'neutral':0, 'angry':0}\n", "\n", " # Low tone detection\n", " if pitch_mean < 150 and energy_mean < 0.05:\n", " scores['sad'] += 3\n", "\n", " if pitch_mean < 150:\n", " scores['sad'] += 2\n", " elif pitch_mean > 200:\n", " scores['happy'] += 1\n", " scores['anxious'] += 1\n", "\n", " if energy_mean < 0.03:\n", " scores['sad'] += 2\n", " elif energy_mean > 0.1:\n", " scores['happy'] += 1\n", " scores['angry'] += 1\n", "\n", " if pitch_std < 20:\n", " scores['sad'] += 2\n", " scores['neutral'] += 1\n", " elif pitch_std > 50:\n", " scores['happy'] += 2\n", " scores['anxious'] += 1\n", "\n", " if tempo < 80:\n", " scores['sad'] += 1\n", " elif tempo > 120:\n", " scores['anxious'] += 2\n", " scores['angry'] += 1\n", "\n", " dominant_tone = max(scores, key=scores.get)\n", " confidence = scores[dominant_tone] / 
sum(scores.values()) if sum(scores.values()) > 0 else 0\n", "\n",
"    return dominant_tone, confidence, scores\n", "\n",
"# ============================================================\n",
"# MULTIMODAL TRAINING DATA GENERATION (AUDIO ONLY)\n",
"# ============================================================\n", "\n",
"def generate_audio_training_data(n_samples=100):\n",
"    \"\"\"\n",
"    Generate synthetic audio feature dataset for 5 emotions\n",
"    \"\"\"\n",
"    np.random.seed(42)\n",
"    emotions = ['sad', 'happy', 'anxious', 'neutral', 'angry']\n",
"    data = []\n", "\n",
"    for emo in emotions:\n",
"        for _ in range(n_samples):\n",
"            if emo == 'sad':\n",
"                pitch_mean = np.random.normal(130, 5)\n",
"                pitch_std = np.random.normal(15, 5)\n",
"                energy_mean = np.random.uniform(0.015, 0.04)\n",
"                tempo = np.random.uniform(70, 90)\n",
"            elif emo == 'happy':\n",
"                pitch_mean = np.random.normal(220, 10)\n",
"                pitch_std = np.random.normal(45, 10)\n",
"                energy_mean = np.random.uniform(0.1, 0.15)\n",
"                tempo = np.random.uniform(100, 130)\n",
"            elif emo == 'anxious':\n",
"                pitch_mean = np.random.normal(180, 10)\n",
"                pitch_std = np.random.normal(60, 10)\n",
"                energy_mean = np.random.uniform(0.06, 0.09)\n",
"                tempo = np.random.uniform(120, 150)\n",
"            elif emo == 'neutral':\n",
"                pitch_mean = np.random.normal(160, 10)\n",
"                pitch_std = np.random.normal(25, 5)\n",
"                energy_mean = np.random.uniform(0.05, 0.08)\n",
"                tempo = np.random.uniform(90, 110)\n",
"            elif emo == 'angry':\n",
"                pitch_mean = np.random.normal(210, 10)\n",
"                pitch_std = np.random.normal(50, 10)\n",
"                energy_mean = np.random.uniform(0.12, 0.18)\n",
"                tempo = np.random.uniform(120, 160)\n", "\n",
"            data.append([pitch_mean, pitch_std, energy_mean, tempo, emo])\n", "\n",
"    df = pd.DataFrame(data, columns=['pitch_mean','pitch_std','energy_mean','tempo','true_emotion'])\n",
"    df.to_csv('multimodal_audio_dataset.csv', index=False)\n",
"    print(\"✓ Multimodal audio dataset saved as 'multimodal_audio_dataset.csv'\")\n",
"    return df\n", "\n",
"# ============================================================\n",
"# EXAMPLE USAGE\n",
"# ============================================================\n", "\n",
"if __name__ == \"__main__\":\n",
"    # Generate dataset\n",
"    df = generate_audio_training_data(n_samples=100)\n", "\n",
"    # Example: classify a sample audio file (replace 'example.wav' with your file)\n",
"    # audio_features = extract_audio_features('example.wav')\n",
"    # tone, confidence, scores = classify_vocal_tone(audio_features)\n",
"    # print(\"Predicted tone:\", tone, \"Confidence:\", confidence)\n"
], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "df52EtECuh_s", "outputId": "5992a531-9fc1-4234-d65b-557e59b9a618" }, "execution_count": 11, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "✓ Multimodal audio dataset saved as 'multimodal_audio_dataset.csv'\n" ] } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "IC7SCFueu8NM" }, "execution_count": 11, "outputs": [] }, { "cell_type": "code", "source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import classification_report, confusion_matrix, accuracy_score\n",
"import pickle\n", "\n",
"# Load CSV\n",
"df = pd.read_csv('multimodal_audio_dataset.csv')\n", "\n",
"# Features and labels\n",
"X = df.drop(columns=['true_emotion'])\n",
"y = df['true_emotion']\n", "\n",
"# Split train/test\n",
"X_train, X_test, y_train, y_test = 
train_test_split(\n",
"    X, y, test_size=0.2, random_state=42, stratify=y\n",
")\n", "\n",
"# Train Random Forest\n",
"rf_model = RandomForestClassifier(n_estimators=200, random_state=42)\n",
"rf_model.fit(X_train, y_train)\n", "\n",
"# Evaluate\n",
"y_pred = rf_model.predict(X_test)\n",
"print(\"Accuracy:\", accuracy_score(y_test, y_pred))\n",
"print(\"\\nClassification Report:\\n\", classification_report(y_test, y_pred))\n",
"print(\"\\nConfusion Matrix:\\n\", confusion_matrix(y_test, y_pred))\n", "\n",
"# Save model\n",
"with open('audio_emotion_rf_model.pkl', 'wb') as f:\n",
"    pickle.dump(rf_model, f)\n", "\n",
"print(\"✓ Model saved as 'audio_emotion_rf_model.pkl'\")\n"
], "metadata": { "id": "Kl4svI-KvTUw" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from google.colab import files\n", "\n", "files.download('audio_emotion_rf_model.pkl')\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 158 }, "id": "dTPvn4aSvg56", "outputId": "c3b311b0-9a09-44bc-8bd1-3a2fc6b31471" }, "execution_count": null, "outputs": [] }
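, { "cell_type": "markdown", "source": [ "Optional sketch: the next cell is a minimal, hedged example of how the pieces above could be combined at inference time. It assumes the earlier cells have already been run in this session (so `extract_audio_features`, `classify_vocal_tone`, and `audio_emotion_rf_model.pkl` exist) and that 'example.wav' is only a placeholder path for a real recording.\n" ], "metadata": {} }, { "cell_type": "code", "source": [
"# Sketch: score one recording with both the rule-based classifier and the trained Random Forest.\n",
"# Assumes the functions defined above; 'example.wav' is a placeholder for a real 16 kHz recording.\n",
"import pickle\n",
"import numpy as np\n",
"import pandas as pd\n", "\n",
"with open('audio_emotion_rf_model.pkl', 'rb') as f:\n",
"    rf_model = pickle.load(f)\n", "\n",
"feats = extract_audio_features('example.wav')\n",
"# librosa may return tempo as a 1-element array; coerce it to a plain float\n",
"feats['tempo'] = float(np.atleast_1d(feats['tempo'])[0])\n", "\n",
"rule_tone, rule_conf, _ = classify_vocal_tone(feats)\n", "\n",
"# Keep the same four feature columns (and order) the Random Forest was trained on\n",
"row = pd.DataFrame([[feats['pitch_mean'], feats['pitch_std'], feats['energy_mean'], feats['tempo']]],\n",
"                   columns=['pitch_mean', 'pitch_std', 'energy_mean', 'tempo'])\n",
"rf_tone = rf_model.predict(row)[0]\n",
"rf_conf = rf_model.predict_proba(row).max()\n", "\n",
"print(f\"Rule-based tone: {rule_tone} ({rule_conf:.2%})\")\n",
"print(f\"Random Forest tone: {rf_tone} ({rf_conf:.2%})\")\n"
], "metadata": {}, "execution_count": null, "outputs": [] } ] }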