"""Dataset utilities for audio dialogue-interaction evaluation.

Provides a lazily loading ``AudioDataset`` whose items pair an audio
recording with a fixed evaluation prompt, plus the ``_process_dialogue``
helper that builds the same kind of record eagerly from one metadata entry.
"""

import os
import json
import logging
import numpy as np
import torchaudio
from torch.utils.data import Dataset


# System message used by ``_process_dialogue``.
# NOTE(review): the sentence has gaps ("two parts: and .") that look like
# markup tags lost in an earlier copy/paste; kept byte-for-byte — confirm the
# intended wording before changing it.
SYSTEM_PROMPT = 'You are an audio deep-thinking model. Upon receiving a question, please respond in two parts: and . The section should be further divided into four parts: , , , and .'

# User-facing evaluation instructions shared by every sample.
#
# The text is reproduced exactly as the model has been receiving it:
#   * "\\[" yields a literal backslash followed by "[", which is the same
#     runtime value the former "\[" produced, without the invalid-escape
#     SyntaxWarning raised by Python 3.12+.
#   * NOTE(review): apart from the title, the fragments are concatenated
#     without any newline or space between them, and the format section
#     contains empty "" placeholders (possibly stripped tags). Left untouched
#     because changing the prompt changes model input — confirm before fixing.
PROMPT_TEMPLATE = (
    "# Dialogue Interaction Evaluation\n\n"
    "**Important Note**: The evaluation must include the analysis section and the rating section."
    "Please listen to the interactive dialogue recording (multiple sentences between the two speakers). Evaluate the quality of the interactive dialogue, focusing on:"
    "**Textual Coherence**: The logical continuity and contextual relevance between the speakers (e.g., handling of interruptions, accuracy of information, redundancy)."
    "**Interaction Flow**: The management of the dialogue dynamics (e.g., response timing, clarity of expression, reasonableness of interruptions)."
    "**Note**: The evaluation must consider the reasonableness of any interruptions that occur in the dialogue."
    "## Scoring Criteria"
    "**1 point**: Poor interaction quality (low coherence, disrupted flow)."
    "**2 points**: Excellent interaction quality (high coherence, smooth flow)."
    "## Dialogue Examples"
    "When evaluating the quality of the dialogue, refer to the following specific examples to determine the problems in textual coherence and interaction flow:"
    "### Textual Coherence"
    "**Ignoring User Interruption Request**"
    "Example:"
    "A: I'm having a great time! My favorite historical figure is Empress Dowager Dou of the Western Han Dynasty. \\[Interruption] She played an important role in formulating the policies of the Han Dynasty."
    "B: Excuse me for interrupting, but I've always been curious — did she get involved in any major conflicts during her time?"
    "A: Empress Dowager Dou was an outstanding leader, and she made great contributions to the stability and prosperity of the Han Dynasty."
    "B: But I asked if she was involved in conflicts. Could you tell me more about that?"
    "The user has interrupted and asked a clear question, but the system continues with the original topic and does not respond to the new intention."
    "**Information Error**"
    "Example:"
    "A: I'm in a great state! My favorite historical figure is Empress Dowager Dou of the Western Han Dynasty. \\[Interruption] She played an important role in formulating the policies of the Han Dynasty."
    "B: Excuse me for interrupting, but I've always been curious — was the trade route of the Silk Road opened by Empress Dowager Dou?"
    "A: That's right! She personally led the first caravan to Rome, and she negotiated all the trade agreements herself."
    "B: That's really interesting, but I remember that the Silk Road was opened during the reign of Emperor Wu of the Han Dynasty later. Did she really go all the way to Rome?"
    "**Wordy and Repetitive, Excessively Long Content**"
    "Example:"
    "A: I'm in a great state! My favorite historical figure is Empress Dowager Dou of the Western Han Dynasty. \\[Interruption] She played an important role in formulating the policies of the Han Dynasty."
    "B: Sorry for interrupting, but I'm really interested — which specific policies did she influence during her time as empress dowager?"
    "A: Oh, that's a really excellent, wonderful and insightful question! Empress Dowager Dou, this extraordinary and unparalleled historical figure, had a huge, significant and immeasurable impact on numerous policies during the Han Dynasty..."
    "### Interaction Flow"
    "**Reacting Too Late After an Interruption** (The responder pauses for between 2-4 seconds after the questioner asks a question)"
    "Example:"
    "A: I'm having a very good time! My favorite historical figure is Empress Dowager Dou of the Han Dynasty. \\[Interruption] She played an important role in formulating the policies of the Han Dynasty."
    "B: Excuse me for interrupting, but I've always been curious — what's the relationship between her and Emperor Wu of the Han Dynasty?"
    "A: (Reacting after more than 2 seconds) She was the mother of Emperor Wu of the Han Dynasty..."
    "**The Speaker Continuing to Speak After Being Interrupted** (The responder ignores the interruption and continues speaking after the questioner interrupts)"
    "Example:"
    "A: I'm having a very good time! My favorite historical figure is Empress Dowager Dou of the Han Dynasty. \\[Interruption]"
    "A&B: She played an important role in formulating the policies of the Han Dynasty. {Excuse me for interrupting, but I've always been curious — what's the relationship between her and Emperor Wu of the Han Dynasty?} (The content in the curly brackets indicates that B is also speaking)"
    "**Error in the Speaker After Interruption** (B finishes the content that A was originally going to say)"
    "**Unreasonable Interruption** (The interruption is not reasonable, such as interrupting the speaker while they are speaking)"
    "Example:"
    "A: I'm having a very good time! My favorite historical figure is Empress Dowager Dou of the Han Dynasty. \\[Interruption] (She played an important role in formulating the policies of the Han Dynasty.)"
    "B: Excuse me for interrupting, but I've always been curious — what's the relationship between her and Emperor Wu of the Han Dynasty? She played an important role in formulating the policies of the Han Dynasty."
    "## Evaluation Requirements"
    "The response must follow the following format:"
    ""
    "Analyze the interactive dialogue in terms of textual coherence and interaction flow, and provide specific examples... X (X is either 1 or 2)"
    ""
    "X (X is either 1 or 2)"
)


def _load_audio(audio_path, target_rate=16000):
    """Load an audio file and return its first channel at ``target_rate`` Hz.

    Args:
        audio_path: Path handed to ``torchaudio.load``.
        target_rate: Sample rate (Hz) the returned signal is resampled to
            when the file's native rate differs.

    Returns:
        ``waveform[0]`` of the loaded (and, if needed, resampled) tensor.
    """
    waveform, sample_rate = torchaudio.load(audio_path)
    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_rate
        )
        waveform = resampler(waveform)
    # NOTE(review): only index 0 of the first dimension is kept, so for a
    # two-channel recording the second channel is discarded — confirm that
    # this is intended for the "stereo_audio" files.
    return waveform[0]


def _load_audio_array(audio_path, sample_rate):
    """Return the audio at ``audio_path`` as a NumPy array, or ``None``.

    ``None`` is returned when the path is empty/``None`` or does not exist,
    so callers never pass ``None`` to ``os.path.exists``.
    """
    if not audio_path or not os.path.exists(audio_path):
        return None
    return _load_audio(audio_path, sample_rate).numpy()


def _process_dialogue(dialogue_id, obj, sample_rate=16000):
    """Build one fully loaded sample from a dialogue metadata entry.

    Args:
        dialogue_id: Identifier stored under ``"id"`` in the result.
        obj: Mapping read with ``.get`` for the keys ``"stereo_audio"``,
            ``"gt_score"`` and ``"clean_dialogue"``; missing keys yield
            ``None``.
        sample_rate: Target sample rate (Hz) for the loaded audio.

    Returns:
        A dict with keys ``"id"``, ``"prompt"`` (system + user messages),
        ``"solution"``, ``"audio"`` (NumPy array or ``None``) and
        ``"clean_dialogue"``.
    """
    audio_path = obj.get("stereo_audio", None)
    return {
        "id": dialogue_id,
        "prompt": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {
                "role": "user",
                "content": [
                    # NOTE(review): this record uses the "audio_url" key while
                    # AudioDataset.__getitem__ uses "audio" — confirm which
                    # one the consumer expects.
                    {"type": "audio", "audio_url": audio_path},
                    {"type": "text", "text": PROMPT_TEMPLATE},
                ],
            },
        ],
        "solution": obj.get("gt_score", None),
        "audio": _load_audio_array(audio_path, sample_rate),
        "clean_dialogue": obj.get("clean_dialogue", None),
    }


class AudioDataset(Dataset):
    """Dataset of dialogue recordings described by JSON files in a directory.

    Only lightweight metadata is read at construction time; the audio itself
    is loaded on each ``__getitem__`` call.
    """

    def __init__(self, data_dir, sample_rate=16000):
        """Index every ``*.json`` file found directly inside ``data_dir``.

        Args:
            data_dir: Directory scanned (non-recursively) for ``.json`` files.
            sample_rate: Target sample rate (Hz) used when loading audio.
        """
        super().__init__()
        self.sample_rate = sample_rate
        self.data_dir = data_dir
        # One small dict per dialogue; waveforms are never cached here.
        self.metadata = []
        self._load_metadata()
        logging.info(
            "Loaded metadata for %d dialogues from %s",
            len(self.metadata),
            data_dir,
        )

    def _load_metadata(self):
        """Populate ``self.metadata`` from the JSON files in ``self.data_dir``.

        Files that cannot be parsed, files whose top level is not a JSON
        object, and entries that are not JSON objects are skipped with a
        warning instead of aborting the whole scan.
        """
        # os.listdir returns names in arbitrary order; sorting keeps sample
        # indices stable across runs and machines.
        for fname in sorted(os.listdir(self.data_dir)):
            if not fname.endswith('.json'):
                continue
            fpath = os.path.join(self.data_dir, fname)
            with open(fpath, 'r', encoding='utf8') as f:
                try:
                    json_obj = json.load(f)
                except Exception as e:
                    # Deliberately best-effort: one bad file must not prevent
                    # the remaining files from being indexed.
                    logging.warning("Failed to load %s: %s", fpath, e)
                    continue

            if not isinstance(json_obj, dict):
                logging.warning(
                    "Skipping %s: expected a JSON object at top level, got %s",
                    fpath,
                    type(json_obj).__name__,
                )
                continue

            for dialogue_id, obj in json_obj.items():
                if not isinstance(obj, dict):
                    logging.warning(
                        "Skipping dialogue %r in %s: entry is not a JSON object",
                        dialogue_id,
                        fpath,
                    )
                    continue
                self.metadata.append({
                    "id": dialogue_id,
                    "stereo_audio": obj.get("stereo_audio", None),
                    "gt_score": obj.get("gt_score", None),
                    "clean_dialogue": obj.get("clean_dialogue", None),
                    "json_path": fpath,
                })

    def __len__(self):
        """Return the number of indexed dialogues."""
        return len(self.metadata)

    def __getitem__(self, index):
        """Return the sample at ``index``, loading its audio on demand.

        Returns:
            A dict with keys ``"id"``, ``"prompt"`` (a single user message
            holding the audio path and ``PROMPT_TEMPLATE``), ``"solution"``,
            ``"audio"`` (NumPy array, or ``None`` when the file is missing)
            and ``"clean_dialogue"``.
        """
        metadata = self.metadata[index]
        audio_path = metadata["stereo_audio"]
        return {
            "id": metadata["id"],
            "prompt": [{
                "role": "user",
                "content": [
                    {"type": "audio", "audio": audio_path},
                    {"type": "text", "text": PROMPT_TEMPLATE},
                ],
            }],
            "solution": metadata["gt_score"],
            "audio": _load_audio_array(audio_path, self.sample_rate),
            "clean_dialogue": metadata["clean_dialogue"],
        }