|
import pandas as pd |
|
import re |
|
import os |
|
import tempfile |
|
|
|
|
|
|
|
def process_text(row, column_name):
    """Split one manuscript cell into script, question and choices text.

    The cell ``row[column_name]`` is searched for a ``## リスニングスクリプト``
    heading (case-insensitive, optional whitespace after ``##``). The text
    after the first such heading is cut at the first ``##選択肢`` marker into
    a script part and a choices part, and the script part is cut again at
    the first ``Question:`` marker.

    Args:
        row: Record supporting ``row[column_name]`` lookup (for example one
            DataFrame row passed by ``DataFrame.apply(..., axis=1)``). It is
            not modified.
        column_name: Key of the cell to split; also used as the prefix of
            the returned labels.

    Returns:
        A ``pandas.Series`` labelled ``<column_name>_スクリプト``,
        ``<column_name>_Question`` and ``<column_name>_選択肢``. All three
        values are empty strings when the cell is not a string (e.g. NaN
        from an empty CSV field), has no script heading, or has no choices
        marker after the heading.
    """
    script_start_pattern = r"##\s*リスニングスクリプト"
    question_start = "Question:"
    choice_start = "##選択肢"
    correct_choice = "【正解選択肢】"

    script_text = ""
    question_text = ""
    choice_text = ""

    text = row[column_name]
    # Missing cells arrive as float NaN; `in` / re.split would raise on them.
    if isinstance(text, str):
        # Some cells have the correct-answer tag but no choices heading;
        # insert the heading so the split below still finds a choices part.
        # Work on a local copy instead of writing back into the caller's row.
        if correct_choice in text and choice_start not in text:
            text = text.replace(correct_choice, choice_start + correct_choice)

        script_parts = re.split(script_start_pattern, text, flags=re.IGNORECASE)
        # Only the segment right after the first script heading is used.
        if len(script_parts) > 1 and choice_start in script_parts[1]:
            before_choices, _, after_choices = script_parts[1].partition(choice_start)
            choice_text = after_choices.strip()

            # partition leaves the tail empty when "Question:" is absent.
            # NOTE: the question text is returned unstripped (it keeps any
            # space following "Question:"), matching the existing output.
            script_head, _, question_text = before_choices.strip().partition(question_start)
            script_text = script_head.strip()

    return pd.Series({
        f'{column_name}_スクリプト': script_text,
        f'{column_name}_Question': question_text,
        f'{column_name}_選択肢': choice_text,
    })
|
|
|
|
|
# Input CSV columns that manuscript_conversion splits into parts and then drops.
columns_to_process = ['問題1', '問題2', '日本語訳_問題1', '日本語訳_問題2']
|
|
|
|
|
def manuscript_conversion(csv_file):
    """Split the manuscript columns of a CSV file and write the result as CSV.

    Each column listed in ``columns_to_process`` is passed row by row
    through ``process_text``; the resulting part columns are joined onto
    the frame and the original columns are dropped.

    Args:
        csv_file: Any object whose ``name`` attribute is the path of the
            input CSV file.

    Returns:
        Path of the written file. It is always named ``output.csv`` and is
        placed in the directory where the temporary file was created.
    """
    df = pd.read_csv(csv_file.name)

    for column in columns_to_process:
        split_parts = df.apply(lambda row: process_text(row, column), axis=1)
        df = df.join(split_parts)

    df = df.drop(columns=columns_to_process)

    # Only a unique path is needed; close the handle before pandas reopens
    # the file by name.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.csv') as tmp:
        output_path = tmp.name

    # errors='ignore' silently drops characters that cp932 cannot encode.
    df.to_csv(output_path, index=False, encoding='cp932', errors='ignore')

    new_path = os.path.join(os.path.dirname(output_path), "output.csv")
    # os.replace overwrites a leftover output.csv from an earlier call on
    # every platform; os.rename raises FileExistsError on Windows there.
    # NOTE: the name is fixed, so concurrent calls overwrite each other.
    os.replace(output_path, new_path)
    return new_path