Spaces:
Running
Running
Upload 3 files
Browse files- example/ex1_final.py +140 -0
- example/ex1_init.py +61 -0
- example/example_data.json +1 -0
example/ex1_final.py
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
import torch
|
5 |
+
import torch.nn as nn
|
6 |
+
import torch.optim as optim
|
7 |
+
from torch.utils.data import DataLoader, Dataset
|
8 |
+
from transformers import BertTokenizer, BertModel
|
9 |
+
|
10 |
+
# Define constants
|
11 |
+
DIMENSIONS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
|
12 |
+
|
13 |
+
class EssayDataset(Dataset):
    """Map-style dataset that pairs essay texts with their score rows.

    Tokenization happens lazily in ``__getitem__``; every sample is padded or
    truncated to ``max_len`` tokens by the tokenizer.

    Args:
        texts: Indexable, sized collection of essay strings.
        targets: Indexable, sized collection of numeric score rows, aligned
            index-for-index with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, ...)`` that returns a
            mapping holding ``'input_ids'`` and ``'attention_mask'`` tensors
            (e.g. a Hugging Face tokenizer).
        max_len: Token length every sample is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``targets`` differ in length.
    """

    def __init__(self, texts, targets, tokenizer, max_len):
        if len(texts) != len(targets):
            raise ValueError(
                f'texts and targets must have the same length, '
                f'got {len(texts)} and {len(targets)}'
            )
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of essays."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the raw text, encoded inputs and float targets of one essay."""
        text = self.texts[item]
        target = self.targets[item]

        # Call the tokenizer directly: ``encode_plus`` is deprecated in favour
        # of ``__call__``, which accepts the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )

        return {
            'text': text,
            # return_tensors='pt' yields a leading batch axis; flatten it away
            # so the DataLoader can stack samples itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.float),
        }
|
44 |
+
|
45 |
+
class EssayScoreRegressor(nn.Module):
    """BERT encoder followed by dropout and a linear regression head.

    Args:
        n_outputs: Number of values the linear head emits per input.
        model_name: Checkpoint name or path handed to
            ``BertModel.from_pretrained``. Defaults to ``'bert-base-uncased'``.
        dropout: Dropout probability applied to the pooled encoder output
            before the head. Defaults to ``0.3``.
    """

    def __init__(self, n_outputs, model_name='bert-base-uncased', dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=dropout)
        # Head width follows the encoder config, so other BERT sizes work too.
        self.out = nn.Linear(self.bert.config.hidden_size, n_outputs)

    def forward(self, input_ids, attention_mask):
        """Return the head's output for the encoder's ``pooler_output``.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
        """
        pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )['pooler_output']
        output = self.drop(pooled_output)
        return self.out(output)
|
59 |
+
|
60 |
+
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Run one optimisation pass over ``data_loader`` and return the mean loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'targets'`` tensors.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar
            tensor.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch, right after
            the optimizer.
        n_examples: Unused; kept so existing callers keep working.

    Returns:
        Mean of the per-batch loss values, or ``nan`` when ``data_loader``
        yields no batches.
    """
    model = model.train()
    losses = []

    # Discard gradients left on the parameters by earlier code; otherwise they
    # would be added to the first batch's gradients and skew the first update.
    optimizer.zero_grad()

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, targets)

        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    if not losses:
        # np.mean([]) would emit a RuntimeWarning; return nan explicitly.
        return float('nan')
    return np.mean(losses)
|
80 |
+
|
81 |
+
def train_model(train_data, val_data, tokenizer, model, optimizer, scheduler, device, epochs, batch_size, max_len):
    """Train ``model`` for ``epochs`` passes and report train/validation loss.

    Args:
        train_data: DataFrame with a ``'full_text'`` column and one column per
            entry of ``DIMENSIONS``; used for optimisation.
        val_data: DataFrame with the same columns; only evaluated, never
            trained on.
        tokenizer: Tokenizer handed to ``EssayDataset``.
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        optimizer: Optimizer passed through to ``train_epoch``.
        scheduler: Learning-rate scheduler passed through to ``train_epoch``.
        device: Device the loss module and batch tensors are moved to.
        epochs: Number of passes over the training data.
        batch_size: Batch size of both data loaders.
        max_len: Token length each essay is padded/truncated to.

    Returns:
        Dict with ``'train_loss'`` and ``'val_loss'`` lists, one entry per
        epoch. The validation entry is ``nan`` when ``val_data`` yields no
        batches.
    """
    train_dataset = EssayDataset(
        texts=train_data['full_text'].to_numpy(),
        targets=train_data[DIMENSIONS].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )

    val_dataset = EssayDataset(
        texts=val_data['full_text'].to_numpy(),
        targets=val_data[DIMENSIONS].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )

    train_data_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
    )

    val_data_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
    )

    loss_fn = nn.MSELoss().to(device)
    history = {'train_loss': [], 'val_loss': []}

    for epoch in range(epochs):
        print(f'Epoch {epoch + 1}/{epochs}')
        print('-' * 10)

        train_loss = train_epoch(
            model,
            train_data_loader,
            loss_fn,
            optimizer,
            device,
            scheduler,
            len(train_dataset),
        )

        print(f'Train loss {train_loss}')
        history['train_loss'].append(train_loss)

        # Score the held-out split; previously the validation loader was built
        # but never read, so overfitting could not be observed.
        val_losses = []
        model.eval()
        with torch.no_grad():
            for batch in val_data_loader:
                outputs = model(
                    input_ids=batch['input_ids'].to(device),
                    attention_mask=batch['attention_mask'].to(device),
                )
                val_losses.append(loss_fn(outputs, batch['targets'].to(device)).item())
        # Restore training mode so the model leaves this function in the same
        # mode as before validation was added.
        model.train()

        if val_losses:
            val_loss = np.mean(val_losses)
            print(f'Val loss {val_loss}')
        else:
            val_loss = float('nan')
        history['val_loss'].append(val_loss)

    return history
|
125 |
+
|
126 |
+
if __name__ == "__main__":
    # Hyperparameters are declared once so the scheduler length and the
    # training call below cannot drift apart.
    epochs = 5
    batch_size = 16
    max_len = 160

    df = pd.read_csv('train.csv')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = EssayScoreRegressor(n_outputs=len(DIMENSIONS))
    model = model.to(device)

    # 80/20 split; the fixed seed keeps the partition reproducible.
    train_data = df.sample(frac=0.8, random_state=42)
    val_data = df.drop(train_data.index)

    optimizer = optim.Adam(model.parameters(), lr=2e-5)

    # The scheduler is stepped once per training batch, so its length is
    # derived from the training split (ceil division: the last partial batch
    # counts) rather than from the full dataframe.
    steps_per_epoch = (len(train_data) + batch_size - 1) // batch_size
    total_steps = max(1, steps_per_epoch * epochs)
    # NOTE: with step_size equal to the total step count, the decay only fires
    # after the final update, so the learning rate is effectively constant.
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=total_steps, gamma=0.1)

    train_model(
        train_data,
        val_data,
        tokenizer,
        model,
        optimizer,
        scheduler,
        device,
        epochs=epochs,
        batch_size=batch_size,
        max_len=max_len,
    )
|
example/ex1_init.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
|
3 |
+
import numpy as np
|
4 |
+
import random
|
5 |
+
import torch
|
6 |
+
from sklearn.model_selection import train_test_split
|
7 |
+
|
8 |
+
DIMENSIONS = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
|
9 |
+
SEED = 42
|
10 |
+
|
11 |
+
random.seed(SEED)
|
12 |
+
torch.manual_seed(SEED)
|
13 |
+
np.random.seed(SEED)
|
14 |
+
|
15 |
+
def compute_metrics_for_regression(y_test, y_test_pred):
    """Return the root-mean-squared error of every scoring dimension.

    Args:
        y_test: 2-D array-like of true scores, one row per sample and one
            column per entry of ``DIMENSIONS`` (in that order).
        y_test_pred: 2-D array-like of predicted scores with the same shape.

    Returns:
        Dict mapping ``"rmse_<dimension>"`` to that column's RMSE as a float.

    Raises:
        ValueError: If the inputs are not 2-D or their shapes differ.
    """
    targets = np.asarray(y_test, dtype=float)
    preds = np.asarray(y_test_pred, dtype=float)
    if targets.ndim != 2 or targets.shape != preds.shape:
        raise ValueError(
            f"expected two 2-D arrays of equal shape, "
            f"got {targets.shape} and {preds.shape}"
        )

    metrics = {}
    for col, task in enumerate(DIMENSIONS):
        # RMSE is computed with NumPy: the ``squared=False`` keyword of
        # sklearn's mean_squared_error is no longer accepted by recent
        # scikit-learn releases.
        errors = targets[:, col] - preds[:, col]
        metrics[f"rmse_{task}"] = float(np.sqrt(np.mean(errors ** 2)))

    return metrics
|
26 |
+
|
27 |
+
def train_model(X_train, y_train, X_valid, y_valid):
    """Training hook that is still a stub.

    None of the arguments are used and no model is fitted; the function
    always returns ``None``.
    """
    # Placeholder for model training
    return None
|
30 |
+
|
31 |
+
def predict(model, X):
    """Return uniform random scores, one row per input in ``X``.

    ``model`` is ignored. The result has ``len(X)`` rows and
    ``len(DIMENSIONS)`` columns, drawn with ``np.random.rand``.
    """
    n_rows = len(X)
    n_cols = len(DIMENSIONS)
    return np.random.rand(n_rows, n_cols)
|
34 |
+
|
35 |
+
if __name__ == '__main__':

    # Load the labelled essays, indexed by text_id; rows with any missing
    # value are dropped.
    ellipse_df = pd.read_csv('train.csv',
                             header=0, names=['text_id', 'full_text', 'Cohesion', 'Syntax',
                                              'Vocabulary', 'Phraseology', 'Grammar', 'Conventions'],
                             index_col='text_id')
    ellipse_df = ellipse_df.dropna(axis=0)

    data_df = ellipse_df
    X = list(data_df.full_text.to_numpy())
    # Drop the text column once and take the score matrix directly, instead
    # of re-dropping the column for every row.
    y = data_df.drop(columns=['full_text']).to_numpy()

    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.10, random_state=SEED)

    model = train_model(X_train, y_train, X_valid, y_valid)

    y_valid_pred = predict(model, X_valid)
    metrics = compute_metrics_for_regression(y_valid, y_valid_pred)
    print(metrics)
    print("final MCRMSE on validation set: ", np.mean(list(metrics.values())))

    submission_df = pd.read_csv('test.csv', header=0, names=['text_id', 'full_text'], index_col='text_id')
    X_submission = list(submission_df.full_text.to_numpy())
    y_submission = predict(model, X_submission)
    # Reuse the text_id index read from test.csv; rebuilding the frame without
    # it would write row numbers 0..n-1 under the text_id header.
    submission_df = pd.DataFrame(y_submission, columns=DIMENSIONS, index=submission_df.index)
    submission_df.to_csv('submission.csv', index_label='text_id')
|
example/example_data.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"1": {"title": "Dataset and Baseline for Automatic Student Feedback Analysis", "abstract": "This paper presents a student feedback corpus containing 3000 instances of feedback written by university students. The dataset has been annotated for aspect terms, opinion terms, polarities of the opinion terms towards targeted aspects, document-level opinion polarities, and sentence separations. A hierarchical taxonomy for aspect categorization covering all areas of the teaching-learning process was developed. Both implicit and explicit aspects were annotated using this taxonomy. The paper discusses the annotation methodology, difficulties faced during the annotation, and details about aspect term categorization. The annotated corpus can be used for Aspect Extraction, Aspect Level Sentiment Analysis, and Document Level Sentiment Analysis. Baseline results for all three tasks are provided.", "research_tasks": "The primary research tasks include the creation of a comprehensive student feedback corpus, aspect term annotation, opinion polarity annotation, and the development of a hierarchical taxonomy.", "research_gaps": "Gaps include the lack of detailed aspect-level annotations in existing datasets and the focus on document-level sentiment analysis.", "keywords": "Student Feedback Corpus, Aspect Terms, Opinion Terms, Polarity, Hierarchical Taxonomy, Aspect Extraction, Aspect Level Sentiment Analysis, Document Level Sentiment Analysis", "recent_works": ["Students feedback analysis model using deep learning-based method and linguistic knowledge for intelligent educational systems.", "An Automated Approach for Analysing Students Feedback Using Sentiment Analysis Techniques."], "hypothesis": "\n Method: Advanced Aspect-Level Sentiment Analysis of Student Feedback Using a Hybrid Deep Learning Approach\n\n Step 1: Dataset Enhancement \n\n Data Collection and Preprocessing\n * Collect additional student feedback from multiple universities to expand the existing dataset.\n * 
Preprocess the data to ensure uniformity in annotation and eliminate noise, such as redundant information and grammatical errors.\n Annotation Refinement\n * Use advanced NLP techniques to further refine the aspect terms, opinion terms, and polarities.\n * Incorporate semi-supervised learning methods to improve annotation accuracy, utilizing both manual and automated processes.\n\n Step 2: Model Development\n Hybrid Model Architecture\n * Develop a hybrid model that integrates CNN, BiLSTM, and attention mechanisms, similar to the DTLP approach mentioned in the recent work by DTLP (Deep Learning and Teaching Process).\n * Incorporate a Transformer-based model (like BERT) to capture contextual nuances and improve the understanding of implicit aspects.\n Feature Integration\n * Enhance the feature set by combining statistical, linguistic, and sentiment knowledge features with word embeddings.\n * Include sentiment shifter rules and contextual polarity indicators to address challenges in sentiment analysis.\n\n Step 3: Training and Validation\n Model Training\n * Train the hybrid model using the enhanced dataset.\n * Use cross-validation techniques to ensure robustness and prevent overfitting.\n Baseline Comparisons\n * Compare the model's performance with baseline results provided in the original study and other recent works.\n * Use metrics such as accuracy, precision, recall, and F1-score to evaluate model performance across different tasks, including Aspect Extraction, Aspect Level Sentiment Analysis, and Document Level Sentiment Analysis.\n\n Step 4: Iterative Refinement\n Feedback Loop\n * Implement an iterative feedback loop where the model's predictions are reviewed and corrected, improving the model iteratively.\n * Engage domain experts in the review process to ensure the relevance and accuracy of the feedback. 
Continuous Learning\n * Utilize active learning techniques to continuously update the model with new data, ensuring it remains up-to-date with current trends in student feedback.\n\n Step 5: Deployment and Application\n Integration with Educational Systems\n * Deploy the model as a part of an intelligent educational system to analyze student feedback in real-time.\n * Provide actionable insights to educators and administrators to improve teaching methods and curriculum design. User Interface Development\n * Develop an intuitive user interface that allows educators to interact with the model, view feedback analysis, and generate reports.\n ", "experiment_plan": "\n Experiment: Validating the Hybrid Deep Learning Approach for Aspect-Level Sentiment Analysis of Student Feedback\n\n Objective:\n To validate the effectiveness of the proposed hybrid deep learning approach (combining CNN, BiLSTM, and Transformer models) for aspect-level sentiment analysis of student feedback by comparing its performance with baseline methods and recent works.\n Research Problem:\n Current sentiment analysis models for student feedback lack detailed aspect-level annotations and fail to address implicit aspects and contextual nuances in feedback data.\n Proposed Method:\n A hybrid deep learning model integrating CNN, BiLSTM, and Transformer-based models (like BERT) to enhance aspect-level sentiment analysis. The method incorporates sentiment shifter rules and contextual polarity indicators to address challenges in sentiment analysis.\n\n Experiment Design:\n 1. Dataset Preparation:\n * Existing Dataset: Use the dataset provided by Herath et al. (2022) with 3000 instances of student feedback, annotated for aspect terms, opinion terms, polarities, and document-level sentiments.\n * Data Augmentation: Expand the dataset by collecting additional feedback from multiple universities, ensuring diversity in feedback data.\n 2. 
Preprocessing:\n * Clean the data to remove noise and inconsistencies.\n * Tokenize the text and apply part-of-speech tagging.\n * Annotate additional feedback instances using the refined hierarchical taxonomy.\n 3. Model Training:\n * Baseline Models: Implement and train traditional machine learning models (e.g., SVM, Naive Bayes) and existing deep learning models (e.g., LSTM, BiLSTM) for sentiment analysis.\n * Proposed Hybrid Model: Train the proposed hybrid model combining CNN, BiLSTM, and Transformer (BERT) layers. Use pre-trained embeddings and fine-tune on the feedback dataset.\n 4. Feature Extraction:\n * Extract features using word embeddings, sentiment shifter rules, and contextual polarity indicators.\n * Integrate statistical, linguistic, and sentiment knowledge features with word embeddings to form a comprehensive feature set.\n 5. Evaluation Metrics:\n * Measure the performance of models using accuracy, precision, recall, and F1-score.\n * Perform aspect-level evaluation by analyzing the accuracy of aspect term extraction and sentiment classification.\n 6. Experiment Execution:\n * Training Phase: Train the baseline models and the proposed hybrid model on the training dataset.\n * Validation Phase: Validate the models using cross-validation techniques to ensure robustness and prevent overfitting.\n * Testing Phase: Evaluate the models on a held-out test set to compare their performance.\n 7. Comparison and Analysis:\n * Compare the performance of the proposed hybrid model with baseline models and recent works, such as DTLP and other sentiment analysis techniques.\n * Analyze the results to identify strengths and weaknesses of the proposed model in handling aspect-level sentiment analysis and implicit aspects.\n 8. 
Iterative Refinement:\n * Implement an iterative feedback loop where predictions are reviewed and corrected, improving model performance over iterations.\n * Engage domain experts to review the model's predictions and provide feedback for further refinement.\n 9. Deployment:\n * Integrate the validated model into an intelligent educational system for real-time feedback analysis.\n * Develop a user interface to allow educators to interact with the model, view feedback analysis, and generate reports.\n ", "code_init": "ex1_init.py", "code_final": "ex1_final.py"}, "2": {"title": "An Empirical Study on the Impact of Code Review on Software Quality", "abstract": "This paper presents an empirical study examining the impact of code reviews on the quality of software projects. The study involved analyzing over 500,000 code reviews across 20 open-source projects on GitHub. The analysis was conducted to assess the relationship between code review practices and key software quality metrics, such as defect density, code churn, and the frequency of post-release defects. The findings suggest that code reviews, particularly when conducted by experienced reviewers, significantly reduce the number of defects in the codebase. 
The paper discusses the methodology used for data collection, the statistical methods employed for analysis, and the implications of these findings for software development practices.", "research_tasks": "The primary research tasks include collecting and analyzing data on code reviews from open-source projects, measuring software quality metrics, and assessing the correlation between code review practices and software quality.", "research_gaps": "Gaps include the lack of large-scale empirical studies that quantify the impact of code reviews on software quality and the limited focus on the role of reviewer expertise in existing literature.", "keywords": "Code Reviews, Software Quality, Defect Density, Code Churn, Post-Release Defects, Empirical Study, Open-Source Projects, GitHub", "recent_works": ["The Effectiveness of Code Reviews in Identifying Defects: A Meta-Analysis of Empirical Studies", "A Study on the Impact of Code Review Tools on Developer Productivity and Software Quality"], "hypothesis": "\n Method: Quantitative Analysis of Code Review Impact on Software Quality Using Statistical Methods\n\n Step 1: Data Collection and Preprocessing\n\n Data Extraction\n * Collect code review data from a variety of open-source projects on GitHub.\n * Ensure that the dataset covers a wide range of projects varying in size, domain, and activity level.\n\n Data Cleaning and Transformation\n * Clean the data by removing duplicates and irrelevant entries.\n * Transform the raw data into a format suitable for statistical analysis, focusing on key metrics such as defect density, code churn, and review frequency.\n\n Step 2: Statistical Analysis\n\n Correlation Analysis\n * Perform a correlation analysis to determine the relationships between code review practices (e.g., number of reviews, reviewer experience) and software quality metrics.\n\n Regression Modeling\n * Develop regression models to predict software quality outcomes based on code review metrics. 
\n * Consider interaction terms to assess the impact of reviewer experience and code complexity on the effectiveness of code reviews.\n\n Step 3: Model Validation\n\n Model Training\n * Train the regression models using cross-validation techniques to avoid overfitting and ensure the generalizability of results.\n\n Model Evaluation\n * Evaluate the models using standard metrics such as R-squared, mean absolute error, and root mean square error. \n\n Step 4: Hypothesis Testing\n\n Statistical Testing\n * Conduct hypothesis tests to evaluate the significance of findings, particularly whether code reviews have a statistically significant impact on defect density and other software quality metrics.\n\n Sensitivity Analysis\n * Perform sensitivity analyses to determine the robustness of results across different subsets of the data (e.g., by project size or reviewer expertise).\n\n Step 5: Reporting and Application\n\n Research Report\n * Document the findings in a detailed research report, including insights into the most influential factors in code reviews that affect software quality.\n\n Best Practice Recommendations\n * Develop a set of best practices for code reviews based on the empirical findings, aimed at improving software quality in both open-source and proprietary software projects.\n ", "experiment_plan": "\n Experiment: Investigating the Impact of Code Review Practices on Software Quality Metrics\n\n Objective:\n To empirically validate the impact of code review practices on software quality by analyzing a large dataset of open-source projects from GitHub.\n Research Problem:\n There is a lack of large-scale empirical studies that quantify the impact of code reviews on software quality metrics such as defect density, code churn, and post-release defects.\n Proposed Method:\n The study will employ quantitative analysis techniques to measure the correlation between code review practices and software quality outcomes. \n\n Experiment Design:\n 1. 
Dataset Preparation:\n * Collect a dataset of over 500,000 code reviews from 20 open-source projects on GitHub, ensuring diversity in project size and domain.\n * Preprocess the data to extract relevant features, including review frequency, reviewer experience, and the number of defects.\n 2. Statistical Analysis:\n * Perform correlation analysis to identify key relationships between code review metrics and software quality metrics.\n * Develop regression models to predict software quality outcomes based on code review practices.\n 3. Model Validation:\n * Train regression models on the collected data using cross-validation techniques to avoid overfitting.\n * Evaluate model performance using metrics such as R-squared, MAE, and RMSE.\n 4. Hypothesis Testing:\n * Conduct statistical tests to assess the significance of the relationships identified.\n * Perform sensitivity analyses to check the robustness of the results across different project types and sizes.\n 5. Reporting:\n * Document the results, highlighting key findings and practical recommendations for improving code review practices to enhance software quality.\n * Develop best practices for software development teams based on the empirical findings of the study.\n ", "code_init": "ex2_init.py", "code_final": "ex2_final.py"}}
|