# Spaces: Running  (web-page banner captured during extraction; not part of the program)
import json | |
import random | |
import pandas as pd | |
import re | |
from datetime import timedelta | |
from pathlib import Path | |
# === Template loading ===
def load_templates_json(templates_dir, emotion):
    """Load the JSON template file for one emotion.

    The file is looked up as ``<templates_dir>/<emotion>.json`` and parsed
    as UTF-8 encoded JSON.

    Args:
        templates_dir: Directory that holds the per-emotion template files.
        emotion: Emotion name; used verbatim as the file stem.

    Returns:
        The parsed JSON content of the template file.

    Raises:
        FileNotFoundError: If the path does not point to a regular file.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    path = Path(templates_dir) / f"{emotion}.json"
    # is_file() instead of exists(): a directory that happens to be named
    # "<emotion>.json" must produce the same descriptive error as a missing
    # file rather than failing later inside open().
    if not path.is_file():
        raise FileNotFoundError(f"Шаблон для эмоции '{emotion}' не найден: {path}")
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)
# === Text generation with seeding and duplicate avoidance ===
def generate_emotion_batch(n, template_data, seed=None):
    """Build up to ``n`` distinct phrases by randomly filling templates.

    Each attempt picks one subject, verb, context, interjection and template
    at random, formats the template with the fields ``s``, ``v``, ``c`` and
    ``i``, and then normalises punctuation and whitespace. Sampling stops as
    soon as ``n`` distinct phrases exist or after ``n * 50`` attempts.

    Args:
        n: Number of distinct phrases requested.
        template_data: Mapping with the lists ``"subjects"``, ``"verbs"``,
            ``"contexts"`` and ``"templates"``, plus an optional
            ``"interjections"`` list.
        seed: When not ``None``, the module-level ``random`` generator is
            reseeded with this value before sampling.

    Returns:
        A list of distinct phrases in the order they were first generated.
        It is shorter than ``n`` when the attempt budget runs out; a warning
        is printed in that case.

    Raises:
        KeyError: If a required key is missing from ``template_data``.
    """
    if seed is not None:
        random.seed(seed)

    subjects = template_data["subjects"]
    verbs = template_data["verbs"]
    contexts = template_data["contexts"]
    # A present-but-empty (or null) list would make random.choice() fail, so
    # it falls back to the same "no interjection" default as a missing key.
    interjections = template_data.get("interjections") or [""]
    templates = template_data["templates"]

    # Sound tags accepted by DIA-TTS.
    dia_tags = (
        "(laughs)", "(clears throat)", "(sighs)", "(gasps)", "(coughs)",
        "(singing)", "(sings)", "(mumbles)", "(beep)", "(groans)", "(sniffs)",
        "(claps)", "(screams)", "(inhales)", "(exhales)", "(applause)",
        "(burps)", "(humming)", "(sneezes)", "(chuckle)", "(whistles)",
    )

    def has_tag(text):
        return any(tag in text for tag in dia_tags)

    def remove_tags(text):
        for tag in dia_tags:
            text = text.replace(tag, "")
        return text.strip()

    # Compiled once; the loop below may run up to n * 50 times.
    space_before_punct = re.compile(r"\s+([,.!?])")
    lone_double_dot = re.compile(r"(?<!\.)\.\.(?!\.)")
    tag_glued_to_word = re.compile(r"\)(?=\w)")
    repeated_space = re.compile(r"\s{2,}")

    # `seen` gives O(1) duplicate checks while `phrases` keeps generation
    # order. Returning list(set) would order the result by string hash,
    # which changes between interpreter runs and defeats the seed.
    seen, phrases, attempts = set(), [], 0
    max_attempts = n * 50
    while len(phrases) < n and attempts < max_attempts:
        s, v = random.choice(subjects), random.choice(verbs)
        c, i = random.choice(contexts), random.choice(interjections)
        t = random.choice(templates)

        # Allow at most one sound tag per phrase: when both the interjection
        # and the context carry one, strip the tags from a random side.
        if has_tag(i) and has_tag(c):
            if random.random() < .5:
                c = remove_tags(c)
            else:
                i = remove_tags(i)

        phrase = t.format(s=s, v=v, c=c, i=i)

        # --- Clean-up that leaves real ellipses intact ---
        # 1) drop whitespace in front of punctuation marks
        phrase = space_before_punct.sub(r"\1", phrase)
        # 2) collapse a double dot that is NOT part of an ellipsis
        phrase = lone_double_dot.sub(".", phrase)
        # 3) insert a space when a word directly follows a closing tag
        phrase = tag_glued_to_word.sub(") ", phrase)
        # 4) squeeze runs of whitespace and trim both ends
        phrase = repeated_space.sub(" ", phrase).strip()

        if phrase not in seen:
            seen.add(phrase)
            phrases.append(phrase)
        attempts += 1

    if len(phrases) < n:
        print(f"⚠️ Только {len(phrases)} уникальных фраз из {n} запрошенных — возможно, исчерпан пул шаблонов.")
    return phrases
# === Timestamp generation ===
def generate_dummy_timestamps(n):
    """Produce ``n`` placeholder (start, end) timestamp pairs.

    Entry ``idx`` starts at ``idx * 6`` seconds and ends 5 seconds later.
    Both values are rendered as ``H:MM:SS,000``.

    Args:
        n: Number of timestamp pairs to produce.

    Returns:
        A list of ``(start, end)`` string tuples; empty when ``n <= 0``.
    """
    def render(total_seconds):
        # Formatting from integer seconds reproduces str(timedelta) for
        # anything below 24 h ("0:00:06") and avoids its "1 day, 0:00:00"
        # form for longer offsets, which would corrupt the timestamp column.
        hours, remainder = divmod(total_seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        return f"{hours}:{minutes:02d}:{seconds:02d},000"

    return [(render(idx * 6), render(idx * 6 + 5)) for idx in range(n)]
# === Final assembly and CSV export ===
def create_emotion_csv(template_path, emotion_label, out_file, n=1000, seed=None):
    """Generate phrases for one emotion and write them to a CSV file.

    The CSV has the columns ``video_name``, ``start_time``, ``end_time``,
    ``sentiment``, one float column per known emotion (1.0 for
    ``emotion_label``, 0.0 otherwise) and ``text``. Rows whose text occurs
    more than once are additionally written to
    ``duplicates_<emotion_label>.csv`` next to ``out_file``.

    Args:
        template_path: Directory containing the per-emotion template files.
        emotion_label: Emotion to generate; selects the template file and
            the one-hot label column.
        out_file: Destination path of the CSV file.
        n: Number of phrases requested.
        seed: Optional seed forwarded to the phrase generator.

    Returns:
        None. The result is written to disk and progress is printed.
    """
    data = load_templates_json(template_path, emotion_label)
    phrases = generate_emotion_batch(n, data, seed)

    # The generator may deliver fewer than n phrases when the template pool
    # is exhausted. Size every column from what was actually produced so the
    # DataFrame columns always have equal length.
    total = len(phrases)
    timeline = generate_dummy_timestamps(total)

    emotions = ["neutral", "happy", "sad", "anger", "surprise", "disgust", "fear"]
    label_mask = {e: float(e == emotion_label) for e in emotions}

    df = pd.DataFrame({
        "video_name": [f"dia_{emotion_label}_utt{i}_synt" for i in range(total)],
        "start_time": [s for s, _ in timeline],
        "end_time": [e for _, e in timeline],
        "sentiment": [0] * total,
        **{e: [label_mask[e]] * total for e in emotions},
        "text": phrases,
    })
    df.to_csv(out_file, index=False)
    print(f"✅ Сохранено {len(df)} строк → {out_file}")

    # --- Duplicate check ---
    dupes = df[df.duplicated("text", keep=False)]
    if not dupes.empty:
        dupe_file = Path(out_file).with_name(f"duplicates_{emotion_label}.csv")
        dupes.to_csv(dupe_file, index=False)
        print(f"⚠️ Найдено {len(dupes)} повторов → {dupe_file}")
    else:
        print("✅ Дубликатов нет.")
# === Entry point ===
if __name__ == "__main__":
    # Number of phrases to request for each emotion label.
    emotion_config = dict(
        anger=3600,
        disgust=4438,
        fear=4441,
        happy=2966,
        sad=4026,
        surprise=3504,
    )

    seed = 42
    template_path = "emotion_templates"
    out_dir = "synthetic_data"

    # Make sure the output directory exists before any file is written.
    Path(out_dir).mkdir(parents=True, exist_ok=True)

    # One CSV per emotion, all generated with the same seed.
    for emotion in emotion_config:
        n = emotion_config[emotion]
        out_csv = Path(out_dir) / f"meld_synthetic_{emotion}_{n}.csv"
        print(f"\n🔄 Генерация: {emotion} ({n} фраз)")
        create_emotion_csv(template_path, emotion, str(out_csv), n, seed)