# -*- coding: utf-8 -*-
"""module1.py

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1AYXXKXRzUU4DWKWbJqvyjSwQ0dVQMS7Y
"""
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
class MisconceptionModel:
    def __init__(self, model_name, misconception_mapping_path, misconception_embs_paths):
        # Initialize the sentence-embedding model
        self.model = SentenceTransformer(model_name)
        # The mapping parquet is expected to provide "MisconceptionId" and "MisconceptionName" columns
        self.misconception_mapping = pd.read_parquet(misconception_mapping_path)
        self.misconception_names = self.misconception_mapping.set_index("MisconceptionId")["MisconceptionName"]
        # Pre-computed misconception embedding matrices (.npy), one per path
        self.misconception_embs = [
            np.load(path) for path in misconception_embs_paths
        ]
    def preprocess(self, df):
        """Preprocess the data."""
        df_new = df.copy()
        # Strip leading/trailing whitespace from every string (object) column
        for col in df.columns[df.dtypes == "object"]:
            df_new[col] = df_new[col].str.strip()
        # Normalize a stray line break after "Only" in the answer texts
        for option in ["A", "B", "C", "D"]:
            df_new[f"Answer{option}Text"] = df_new[f"Answer{option}Text"].str.replace("Only\n", "Only ")
        return df_new
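    # Illustrative note (example values are hypothetical, not from the source):
    # an answer text like "Only\none of them" comes out of preprocess() as
    # "Only one of them", and surrounding whitespace is trimmed from all
    # string columns before prompts are built.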
    def wide_to_long(self, df):
        """Convert the data from wide format to long format."""
        rows = []
        for _, row in df.iterrows():
            correct_option = row["CorrectAnswer"]
            correct_text = row[f"Answer{correct_option}Text"]
            # Emit one row per incorrect answer option
            for option in ["A", "B", "C", "D"]:
                if option == correct_option:
                    continue
                misconception_id = row.get(f"Misconception{option}Id", np.nan)
                # Keep the shared question columns (up to and including "QuestionText")
                row_new = row[:"QuestionText"].copy()
                row_new["CorrectAnswerText"] = correct_text
                row_new["Answer"] = option
                row_new["AnswerText"] = row[f"Answer{option}Text"]
                if not pd.isna(misconception_id):
                    row_new["MisconceptionId"] = int(misconception_id)
                rows.append(row_new)
        df_long = pd.DataFrame(rows).reset_index(drop=True)
        df_long.insert(0, "QuestionId_Answer", df_long["QuestionId"].astype(str) + "_" + df_long["Answer"])
        return df_long
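    # Illustrative sketch (hypothetical values): a single wide row such as
    #   QuestionId=1869, CorrectAnswer="B", AnswerAText=..., MisconceptionAId=42.0, ...
    # becomes three long rows keyed by QuestionId_Answer ("1869_A", "1869_C",
    # "1869_D"), one per incorrect option, each carrying CorrectAnswerText,
    # AnswerText, and, when available, an integer MisconceptionId.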
    def predict(self, test_df):
        """Run prediction on the test data."""
        test_df_long = self.wide_to_long(test_df)
        prompt = (
            "Subject: {SubjectName}\n"
            "Construct: {ConstructName}\n"
            "Question: {QuestionText}\n"
            "Incorrect Answer: {AnswerText}"
        )
        test_df_long["anchor"] = [
            prompt.format(
                SubjectName=row["SubjectName"],
                ConstructName=row["ConstructName"],
                QuestionText=row["QuestionText"],
                AnswerText=row["AnswerText"]
            ) for _, row in test_df_long.iterrows()
        ]
        # Embed the test queries
        embs_test_query = self.model.encode(test_df_long["anchor"], normalize_embeddings=True)
        # Compute cosine similarities and convert them to per-row ranks
        # (0 = most similar) for each set of misconception embeddings
        rank_test = np.array([
            np.argsort(np.argsort(-cosine_similarity(embs_test_query, embs_misconception)), axis=1, kind="stable")
            for embs_misconception in self.misconception_embs
        ])
        # Ensemble the ranks across embedding sets with a fourth-root power mean, then re-rank
        rank_ave_test = np.mean(rank_test ** (1 / 4), axis=0)
        argsort_test = np.argsort(rank_ave_test, axis=1, kind="stable")
        # Keep the 25 best-ranked misconception indices per row
        test_df_long["PredictedMisconceptions"] = [argsort_test[i, :25].tolist() for i in range(len(argsort_test))]
        return test_df_long
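

# Minimal usage sketch (not part of the generated notebook): the model name,
# file paths, and CSV layout below are hypothetical placeholders, assuming the
# test CSV follows the wide format expected by preprocess()/wide_to_long().
if __name__ == "__main__":
    model = MisconceptionModel(
        model_name="sentence-transformers/all-MiniLM-L6-v2",   # hypothetical checkpoint
        misconception_mapping_path="misconception_mapping.parquet",  # hypothetical path
        misconception_embs_paths=["misconception_embs_0.npy"],       # hypothetical path(s)
    )
    test_df = pd.read_csv("test.csv")  # hypothetical path
    predictions = model.predict(model.preprocess(test_df))
    print(predictions[["QuestionId_Answer", "PredictedMisconceptions"]].head())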