|
|
""" |
|
|
Root pytest configuration and shared fixtures. |
|
|
|
|
|
This module provides fixtures that are available to all test modules. |
|
|
""" |
|
|
import pytest |
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
import tempfile |
|
|
import sqlite3 |
|
|
from pathlib import Path |
|
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
|
|
|
|
|
|
@pytest.fixture
def sample_text_data():
    """Return five short issue-style sentences for text-processing tests.

    Returns:
        list[str]: A newly built list on every call, so a test may mutate
        its copy without affecting other tests.
    """
    issue_sentences = [
        "Fixed bug in authentication system using OAuth2",
        "Implemented REST API endpoint for user data retrieval",
        "Added unit tests for data processing pipeline",
        "Refactored code to improve performance and reduce memory usage",
        "Updated database schema with new migration scripts",
    ]
    return issue_sentences
|
|
|
|
|
|
|
|
@pytest.fixture
def sample_dirty_text():
    """Return five strings containing noise typical of GitHub issue text.

    The samples cover, in order: an embedded URL, HTML tags with inline
    code, an emoji, a fenced code block, and embedded newlines.

    Returns:
        list[str]: A newly built list on every call.
    """
    with_url = "Fixed bug https://github.com/repo/issues/123 in auth system"
    with_markup = "Added feature with <b>HTML tags</b> and `inline code`"
    with_emoji = "Removed emoji 😀 and special characters"
    # The inner lines of this literal are runtime text, not code; they are
    # kept at column 0 so the string carries no leading indentation.
    with_code_fence = """Updated docs with code block:
```python
def foo():
pass
```
"""
    with_newlines = "Fixed multiple spaces and\n\nnewlines"

    return [with_url, with_markup, with_emoji, with_code_fence, with_newlines]
|
|
|
|
|
|
|
|
@pytest.fixture
def sample_labels():
    """Return a small multi-label target table.

    Returns:
        pandas.DataFrame: Five rows and five columns ('Language',
        'Data Structure', 'Testing', 'API', 'DevOps'), each holding 0/1
        indicator values.
    """
    indicators_by_label = {
        'Language': [1, 1, 1, 0, 1],
        'Data Structure': [1, 0, 0, 1, 1],
        'Testing': [0, 0, 1, 0, 0],
        'API': [1, 1, 0, 0, 0],
        'DevOps': [0, 0, 0, 1, 1],
    }
    return pd.DataFrame(indicators_by_label)
|
|
|
|
|
|
|
|
@pytest.fixture
def sample_dataframe(sample_text_data, sample_labels):
    """Return a five-row frame combining issue metadata, text and labels.

    Args:
        sample_text_data: Sequence whose first five items become the
            'issue text' column.
        sample_labels: DataFrame whose columns are appended as label
            columns.

    Returns:
        pandas.DataFrame: Columns 'Repo Name', 'PR #', 'issue text' and
        'issue description', followed by every column of ``sample_labels``.
    """
    issue_texts = [sample_text_data[position] for position in range(5)]
    issue_descriptions = [
        'Description for issue 1',
        'Description for issue 2',
        'Description for issue 3',
        'Description for issue 4',
        'Description for issue 5',
    ]

    frame = pd.DataFrame({
        'Repo Name': ['repo1', 'repo2', 'repo1', 'repo3', 'repo2'],
        'PR #': [1, 2, 3, 4, 5],
        'issue text': issue_texts,
        'issue description': issue_descriptions,
    })

    # Assign the raw arrays so label values are copied by position rather
    # than aligned on the labels' index.
    for label_name in sample_labels.columns:
        frame[label_name] = sample_labels[label_name].values

    return frame
|
|
|
|
|
|
|
|
@pytest.fixture
def temp_db(sample_dataframe):
    """Yield the path of a temporary SQLite database holding the sample data.

    The frame is written to the table
    ``nlbse_tool_competition_data_by_issue`` (replacing it if present,
    without the index). The database file is removed on teardown.

    Args:
        sample_dataframe: DataFrame to store in the database.

    Yields:
        pathlib.Path: Location of the populated ``.db`` file.
    """
    # delete=False: the handle only reserves a unique path; the file must
    # outlive this ``with`` block so sqlite3 can open it by name.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.db', delete=False) as f:
        db_path = Path(f.name)

    try:
        conn = sqlite3.connect(str(db_path))
        try:
            sample_dataframe.to_sql('nlbse_tool_competition_data_by_issue',
                                    conn, if_exists='replace', index=False)
        finally:
            # Close even when the write fails so the connection never leaks.
            conn.close()

        yield db_path
    finally:
        # Runs on normal teardown and on setup failure alike; missing_ok
        # tolerates a test that already deleted or moved the file.
        db_path.unlink(missing_ok=True)
|
|
|
|
|
|
|
|
@pytest.fixture
def sample_tfidf_vectorizer():
    """Return a new, unfitted TF-IDF vectorizer with a small configuration.

    Returns:
        TfidfVectorizer: Configured with ``max_features=100``,
        ``ngram_range=(1, 2)`` and ``stop_words='english'``.
    """
    settings = {
        'max_features': 100,
        'ngram_range': (1, 2),
        'stop_words': 'english',
    }
    return TfidfVectorizer(**settings)
|
|
|
|
|
|
|
|
@pytest.fixture
def sample_sparse_features():
    """Return a reproducible, mostly-zero feature matrix.

    Each row has between 5 and 10 randomly chosen columns filled with
    uniform values drawn from [0, 1); all other entries are 0. A local,
    seeded generator is used so every test run sees the same data and the
    global ``np.random`` state is left untouched.

    Returns:
        numpy.ndarray: Dense float array of shape (100, 50).
    """
    n_rows, n_cols = 100, 50
    rng = np.random.default_rng(42)

    features = np.zeros((n_rows, n_cols))

    for row in features:  # each ``row`` is a view, so writes land in ``features``
        n_nonzero = rng.integers(5, 11)  # upper bound is exclusive: 5..10
        columns = rng.choice(n_cols, size=n_nonzero, replace=False)
        row[columns] = rng.random(n_nonzero)

    return features
|
|
|
|
|
|
|
|
@pytest.fixture
def sample_multilabel_data():
    """Return a reproducible binary multi-label target matrix.

    Each of the 100 samples is assigned between 1 and 5 distinct labels
    out of 10. A local, seeded generator is used so every test run sees
    the same matrix and the global ``np.random`` state is left untouched.

    Returns:
        numpy.ndarray: Integer array of shape (100, 10) containing 0/1.
    """
    n_samples = 100
    n_labels = 10
    rng = np.random.default_rng(7)

    labels = np.zeros((n_samples, n_labels), dtype=int)

    for sample_row in labels:  # views into ``labels``; assignment is in place
        n_active = rng.integers(1, 6)  # upper bound is exclusive: 1..5
        active = rng.choice(n_labels, size=n_active, replace=False)
        sample_row[active] = 1

    return labels
|
|
|
|
|
|
|
|
@pytest.fixture
def empty_text_samples():
    """Return edge-case inputs: empty, missing, blank and minimal text.

    Returns:
        list: In order, an empty string, ``None``, a single space, a
        newline-only string, and a one-character string.
    """
    empty_string = ""
    missing_value = None
    blank = " "
    only_newlines = "\n\n\n"
    single_char = "a"
    return [empty_string, missing_value, blank, only_newlines, single_char]
|
|
|