| import json | |
| import logging | |
| import torchaudio | |
| from torch.utils.data import Dataset | |
def _handle_wav(wav_path, target_rate=16000):
    """Load one wav file and return its first channel as a 1-D waveform.

    Args:
        wav_path: path to an audio file readable by ``torchaudio.load``.
        target_rate: desired sample rate in Hz; the waveform is resampled
            to this rate when the file's native rate differs.

    Returns:
        1-D torch.Tensor holding the first channel of the (possibly
        resampled) waveform.
    """
    waveform, sample_rate = torchaudio.load(wav_path)
    # Bug fix: compare against target_rate, not the hard-coded 16000 —
    # previously a non-default target_rate was ignored whenever the file
    # was already at 16 kHz (and 16 kHz files were resampled needlessly
    # when target_rate was 16000 anyway, which happened to be a no-op).
    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_rate
        )
        waveform = resampler(waveform)
    # Keep only the first channel (mono output expected downstream).
    audio = waveform[0]
    return audio
| def _handle_qa(obj, is_think=True, think_max_len=50): | |
| if is_think: | |
| prompt_template = ( | |
| "# Dialogue Response Evaluation\n\n" | |
| "**IMPORTANT:** Evaluation must include `<think>` analysis and `<score>` rating.\n\n" | |
| "Listen to the dialogue recording (two sentences, 1-second pause in between). Evaluate the quality of the **second sentence** as a response to the first, focusing on **text relevance** and the **appropriateness** of **Linguistic information (a range of paralinguistic information such as emotion/age/pitch/speed/volume)**.\n" | |
| "**Note:** Focus on evaluating the appropriateness of the second sentence relative to the first, even if the first sentence itself contains contradictory information.\n\n" | |
| "## Scoring Criteria\n\n" | |
| "**1 points**: Text content is irrelevant or incorrect or illogical.(low intelligence)\n" | |
| "**3 points**: Text is relevant, but paralinguistic information is **inappropriate** for the context.(low emotional quotient)\n" | |
| "**5 points**: Text is relevant, and paralinguistic information is **appropriate** for the context, resulting in effective communication.(High intelligence and emotional intelligence.)\n\n" | |
| "## Evaluation Requirements\n\n" | |
| "Response **MUST** follow this format:\n\n" | |
| "<think>\n" | |
| f"Analysing text relevance and paralinguistic information **Appropriateness** and reasons for scoring...(less than {think_max_len} words)\n" | |
| "</think>\n\n" | |
| "<score>X</score> (**X is 1, 3, or 5**)\n\n") | |
| else: | |
| prompt_template = ( | |
| "# Dialogue Response Evaluation\n\n" | |
| "**IMPORTANT:** Evaluation must include`<score>` rating.\n\n" | |
| "Listen to the dialogue recording (two sentences, 1-second pause in between). Evaluate the quality of the **second sentence** as a response to the first, focusing on **text relevance** and the **appropriateness** of **Linguistic information (a range of paralinguistic information such as emotion/age/pitch/speed/volume)**.\n" | |
| "**Note:** Focus on evaluating the appropriateness of the second sentence relative to the first, even if the first sentence itself contains contradictory information.\n\n" | |
| "## Scoring Criteria\n\n" | |
| "**1 points**: Text content is irrelevant or incorrect or illogical.(low intelligence)\n" | |
| "**3 points**: Text is relevant, but paralinguistic information is **inappropriate** for the context.(low emotional quotient)\n" | |
| "**5 points**: Text is relevant, and paralinguistic information is **appropriate** for the context, resulting in effective communication.(High intelligence and emotional intelligence.)\n\n" | |
| "## Evaluation Requirements\n\n" | |
| "Response **MUST** follow this format:\n\n" | |
| "<score>X</score> (**X is 1, 3, or 5**)\n\n") | |
| obj["prompt"] = [{"role": "user", "content": [ | |
| {"type": "audio", "audio": obj["merge_wav"]}, | |
| {"type": "text", "text": prompt_template} | |
| ]}] | |
| obj["solution"] = obj["gt_score"] | |
| return obj | |
class AudioDataset(Dataset):
    """Map-style dataset that turns a JSON list of dialogue items into
    QA-style evaluation prompts.

    Each item in ``data_file`` is a dict with at least ``merge_wav`` and
    ``gt_score``; ``__getitem__`` delegates to ``_handle_qa`` (and
    optionally ``_handle_wav`` when ``load_audio`` is set).
    """

    def __init__(self, data_file, sample_rate=16000, is_think=True, think_max_len=50, load_audio=False):
        super().__init__()
        # Load the whole annotation list into memory up front.
        with open(data_file, 'r', encoding='utf8') as fin:
            self.data = json.load(fin)
        self.sample_rate = sample_rate      # target rate for optional decoding
        self.is_think = is_think            # include <think> section in prompts
        self.think_max_len = think_max_len  # word cap quoted inside the prompt
        self.load_audio = load_audio        # decode wav into item["audio"]?
        logging.info(f"Loaded {len(self.data)} items from {data_file}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sample = self.data[index]
        if self.load_audio:
            # Eagerly decode (and resample) the merged wav for this item.
            sample["audio"] = _handle_wav(sample["merge_wav"], self.sample_rate)
        return _handle_qa(sample, is_think=self.is_think, think_max_len=self.think_max_len)