interactSpeech / dataset /dataset3.py
Student0809's picture
Add files using upload-large-folder tool
8613355 verified
raw
history blame
6.19 kB
import os
import json
import logging
import numpy as np
import torchaudio
from torch.utils.data import Dataset
def _handle_wav(wav_path, target_rate=16000):
    """Load one wav file and return its first channel at ``target_rate``.

    Args:
        wav_path: Path of the audio file, passed to ``torchaudio.load``.
        target_rate: Desired sampling rate in Hz. The waveform is resampled
            only when the file's own rate differs from this value.

    Returns:
        The first entry (index 0) of the loaded waveform, i.e. a torch
        tensor rather than a numpy array; callers that need numpy must
        convert it themselves. Presumably this is the first audio channel
        (``torchaudio.load`` defaults to a channels-first layout -- confirm
        against the installed torchaudio version).
    """
    signal, native_rate = torchaudio.load(wav_path)

    needs_resampling = native_rate != target_rate
    if needs_resampling:
        resampler = torchaudio.transforms.Resample(
            orig_freq=native_rate,
            new_freq=target_rate,
        )
        signal = resampler(signal)

    # Only index 0 is kept; any further channels are discarded, not mixed down.
    return signal[0]
def _handle_qa(obj, is_think=True, think_max_len=50):
if is_think:
prompt_template = (
"# Dialogue Response Evaluation\n\n"
"**IMPORTANT:** Evaluation must include `<think>` analysis and `<score>` rating.\n\n"
"Listen to the dialogue recording (two sentences, 1-second pause in between). Evaluate the quality of the **second sentence** as a response to the first, focusing on **text relevance** and the **appropriateness** of **Linguistic information (a range of paralinguistic information such as emotion/age/pitch/speed/volume)**.\n"
"**Note:** Focus on evaluating the appropriateness of the second sentence relative to the first, even if the first sentence itself contains contradictory information.\n\n"
"## Scoring Criteria\n\n"
"**1 points**: Text content is irrelevant or incorrect or illogical.(low intelligence)\n"
"**3 points**: Text is relevant, but paralinguistic information is **inappropriate** for the context.(low emotional quotient)\n"
"**5 points**: Text is relevant, and paralinguistic information is **appropriate** for the context, resulting in effective communication.(High intelligence and emotional intelligence.)\n\n"
"## Evaluation Requirements\n\n"
"Response **MUST** follow this format:\n\n"
"<think>\n"
f"Analysing text relevance and paralinguistic information **Appropriateness** and reasons for scoring...(less than {think_max_len} words)\n"
"</think>\n\n"
"<score>X</score> (**X is 1, 3, or 5**)\n\n")
else:
prompt_template = (
"# Dialogue Response Evaluation\n\n"
"**IMPORTANT:** Evaluation must include`<score>` rating.\n\n"
"Listen to the dialogue recording (two sentences, 1-second pause in between). Evaluate the quality of the **second sentence** as a response to the first, focusing on **text relevance** and the **appropriateness** of **Linguistic information (a range of paralinguistic information such as emotion/age/pitch/speed/volume)**.\n"
"**Note:** Focus on evaluating the appropriateness of the second sentence relative to the first, even if the first sentence itself contains contradictory information.\n\n"
"## Scoring Criteria\n\n"
"**1 points**: Text content is irrelevant or incorrect or illogical.(low intelligence)\n"
"**3 points**: Text is relevant, but paralinguistic information is **inappropriate** for the context.(low emotional quotient)\n"
"**5 points**: Text is relevant, and paralinguistic information is **appropriate** for the context, resulting in effective communication.(High intelligence and emotional intelligence.)\n\n"
"## Evaluation Requirements\n\n"
"Response **MUST** follow this format:\n\n"
"<score>X</score> (**X is 1, 3, or 5**)\n\n")
# 构建处理后的对象
processed_obj = {
"id": obj["id"],
"prompt": [{"role": "user", "content": [
{"type": "audio", "audio": obj["merge_wav"]},
{"type": "text", "text": prompt_template}
]}],
"solution": obj["gt_score"],
"audio": obj.get("audio", None),
"clean_dialogue": obj.get("clean_dialogue", None)
}
return processed_obj
class AudioDataset(Dataset):
    """Dialogue-evaluation samples indexed from the JSON files of a directory.

    Only lightweight metadata is collected at construction time; the prompt
    (and, optionally, the audio) for a sample is built in ``__getitem__``.

    Each ``*.json`` file directly inside ``data_dir`` is expected to hold a
    JSON object mapping a dialogue id to a record with the optional keys
    ``"merge_wav"``, ``"gt_score"`` and ``"clean_dialogue"``.
    """

    def __init__(self, data_dir, sample_rate=16000, is_think=True, think_max_len=50, load_audio=False):
        """Index every dialogue found in ``data_dir``.

        Args:
            data_dir: Directory that is scanned (non-recursively) for
                ``*.json`` files.
            sample_rate: Target sampling rate handed to ``_handle_wav`` when
                audio is loaded.
            is_think: Forwarded to ``_handle_qa`` to select the prompt variant.
            think_max_len: Forwarded to ``_handle_qa``.
            load_audio: When true, ``__getitem__`` also loads the waveform
                of samples whose ``merge_wav`` file exists.
        """
        super().__init__()
        self.sample_rate = sample_rate
        self.data_dir = data_dir
        self.is_think = is_think
        self.think_max_len = think_max_len
        self.load_audio = load_audio
        self.metadata = []  # Store only metadata instead of full data
        self._load_metadata()
        logging.info("Loaded metadata for %s dialogues from %s", len(self.metadata), data_dir)

    def _load_metadata(self):
        """Fill ``self.metadata`` with one entry per dialogue record.

        Files that cannot be parsed, files whose top level is not a JSON
        object, and entries that are not JSON objects are skipped with a
        warning instead of aborting the whole load.
        """
        # os.listdir returns names in arbitrary order; sorting keeps the
        # index -> sample mapping reproducible across runs and machines.
        for fname in sorted(os.listdir(self.data_dir)):
            if not fname.endswith('.json'):
                continue
            fpath = os.path.join(self.data_dir, fname)
            with open(fpath, 'r', encoding='utf8') as f:
                try:
                    json_obj = json.load(f)
                except Exception as e:
                    # Deliberate best-effort: one unreadable file must not
                    # prevent the remaining files from being indexed.
                    logging.warning(f"Failed to load {fpath}: {e}")
                    continue

            if not isinstance(json_obj, dict):
                logging.warning(
                    "Skipping %s: expected a JSON object at top level, got %s",
                    fpath, type(json_obj).__name__,
                )
                continue

            for dialogue_id, obj in json_obj.items():
                if not isinstance(obj, dict):
                    logging.warning(
                        "Skipping dialogue %r in %s: expected a JSON object, got %s",
                        dialogue_id, fpath, type(obj).__name__,
                    )
                    continue
                # Store only essential metadata
                self.metadata.append({
                    "id": dialogue_id,
                    "merge_wav": obj.get("merge_wav"),
                    "gt_score": obj.get("gt_score"),
                    "clean_dialogue": obj.get("clean_dialogue"),
                    "json_path": fpath,
                })

    def __len__(self):
        """Return the number of indexed dialogues."""
        return len(self.metadata)

    def __getitem__(self, index):
        """Build the processed sample for ``index``.

        Returns:
            The dict produced by ``_handle_qa``. Its ``"audio"`` entry is a
            numpy array only when ``load_audio`` is set and the sample's
            ``merge_wav`` path exists on disk; otherwise it is ``None``.
        """
        metadata = self.metadata[index]

        # Build the full record expected by _handle_qa.
        item = {
            "id": metadata["id"],
            "merge_wav": metadata["merge_wav"],
            "gt_score": metadata["gt_score"],
            "clean_dialogue": metadata["clean_dialogue"],
        }

        # Load the waveform only on request and only when the file is present.
        wav_path = metadata["merge_wav"]
        if self.load_audio and wav_path and os.path.exists(wav_path):
            item["audio"] = _handle_wav(wav_path, self.sample_rate).numpy()

        # Let _handle_qa turn the record into the final sample.
        return _handle_qa(
            item,
            is_think=self.is_think,
            think_max_len=self.think_max_len,
        )