commit-message-editing-visualization / generate_synthetic_dataset.py
Petr Tsvetkov
- New version of the end->start synthetic samples generation
a8a595d
raw
history blame
637 Bytes
import config
from api_wrappers import hf_data_loader
from generation_steps import synthetic_end_to_start
def run():
    """Build the synthetic dataset and save it as a CSV artifact.

    Loads the processed rewriting dataset through ``hf_data_loader``, prints
    the end -> start generation settings, passes the data through
    ``synthetic_end_to_start.transform`` and writes the result to
    ``config.SYNTHETIC_DATASET_ARTIFACT``.
    """
    dataset = hf_data_loader.load_processed_rewriting_dataset_as_pandas()

    # Echo the generation settings so they are visible in the run output.
    print("End -> start synthesis:")
    for setting in (
        "GENERATION_MULTIPLIER",
        "REL_INSERTIONS_THRESHOLD",
        "GENERATION_ATTEMPTS",
    ):
        print(f"{setting} = {getattr(synthetic_end_to_start, setting)}")

    dataset = synthetic_end_to_start.transform(dataset)
    print("Done")

    dataset.to_csv(config.SYNTHETIC_DATASET_ARTIFACT)
# Run the generation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    run()