# DaCrow13
# Deploy to HF Spaces (Clean)
# 225af6a
"""
Root pytest configuration and shared fixtures.
This module provides fixtures that are available to all test modules.
"""
import pytest
import numpy as np
import pandas as pd
import tempfile
import sqlite3
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
@pytest.fixture
def sample_text_data():
    """Return five short, clean software-change descriptions for tests."""
    change_descriptions = [
        "Fixed bug in authentication system using OAuth2",
        "Implemented REST API endpoint for user data retrieval",
        "Added unit tests for data processing pipeline",
        "Refactored code to improve performance and reduce memory usage",
        "Updated database schema with new migration scripts",
    ]
    return change_descriptions
@pytest.fixture
def sample_dirty_text():
    """Return text samples containing common GitHub noise.

    The samples cover a URL, HTML tags with inline code, an emoji,
    a fenced code block and consecutive newlines.
    """
    # Multi-line sample kept as a single literal; its continuation lines are
    # part of the string value, so they must not be re-indented.
    fenced_block_sample = """Updated docs with code block:
```python
def foo():
pass
```
"""
    noisy_samples = [
        "Fixed bug https://github.com/repo/issues/123 in auth system",
        "Added feature with <b>HTML tags</b> and `inline code`",
        "Removed emoji 😀 and special characters",
        fenced_block_sample,
        "Fixed multiple spaces and\n\nnewlines",
    ]
    return noisy_samples
@pytest.fixture
def sample_labels():
    """Return a five-row DataFrame of 0/1 indicators, one column per label."""
    indicator_columns = {
        'Language': [1, 1, 1, 0, 1],
        'Data Structure': [1, 0, 0, 1, 1],
        'Testing': [0, 0, 1, 0, 0],
        'API': [1, 1, 0, 0, 0],
        'DevOps': [0, 0, 0, 1, 1],
    }
    return pd.DataFrame(indicator_columns)
@pytest.fixture
def sample_dataframe(sample_text_data, sample_labels):
    """Return a five-row DataFrame combining issue metadata, text and labels.

    The first five entries of ``sample_text_data`` become the ``issue text``
    column, and every column of ``sample_labels`` is appended positionally.
    """
    issue_texts = [sample_text_data[position] for position in range(5)]
    issue_descriptions = [
        'Description for issue 1',
        'Description for issue 2',
        'Description for issue 3',
        'Description for issue 4',
        'Description for issue 5',
    ]
    frame = pd.DataFrame({
        'Repo Name': ['repo1', 'repo2', 'repo1', 'repo3', 'repo2'],
        'PR #': [1, 2, 3, 4, 5],
        'issue text': issue_texts,
        'issue description': issue_descriptions,
    })

    # Copy raw values (not Series) so label rows line up by position
    # rather than by index.
    for label_name in sample_labels.columns:
        frame[label_name] = sample_labels[label_name].values
    return frame
@pytest.fixture
def temp_db(sample_dataframe):
    """Yield the path of a temporary SQLite database seeded with sample data.

    The rows of ``sample_dataframe`` are written to the table
    ``nlbse_tool_competition_data_by_issue``. The database file is removed
    when the fixture is torn down, and also when populating it fails.

    Yields:
        Path: location of the temporary ``.db`` file.
    """
    # Only a unique file name is needed here; the handle is closed on leaving
    # the ``with`` block so that SQLite can open the file itself.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.db', delete=False) as f:
        db_path = Path(f.name)

    try:
        conn = sqlite3.connect(str(db_path))
        try:
            sample_dataframe.to_sql(
                'nlbse_tool_competition_data_by_issue',
                conn, if_exists='replace', index=False,
            )
        finally:
            # Close the connection even if the insert fails.
            conn.close()
        yield db_path
    finally:
        # missing_ok: a test may already have removed the file itself.
        db_path.unlink(missing_ok=True)
@pytest.fixture
def sample_tfidf_vectorizer():
    """Return an unfitted ``TfidfVectorizer`` with small test-sized settings."""
    tfidf_options = {
        'max_features': 100,
        'ngram_range': (1, 2),
        'stop_words': 'english',
    }
    return TfidfVectorizer(**tfidf_options)
@pytest.fixture
def sample_sparse_features():
    """Return a dense 100x50 float array in which most entries are zero.

    Each row gets 5 to 10 populated entries at distinct column positions,
    with values drawn uniformly from [0, 1). A locally seeded generator is
    used so every test run sees the same matrix and the global NumPy random
    state is left untouched.

    Returns:
        numpy.ndarray: array of shape ``(100, 50)``.
    """
    n_rows, n_cols = 100, 50
    rng = np.random.default_rng(42)
    features = np.zeros((n_rows, n_cols))
    for row_idx in range(n_rows):
        # ``integers`` excludes the upper bound, so this yields 5..10.
        n_populated = rng.integers(5, 11)
        col_indices = rng.choice(n_cols, size=n_populated, replace=False)
        features[row_idx, col_indices] = rng.random(n_populated)
    return features
@pytest.fixture
def sample_multilabel_data():
    """Return a 100x10 binary label matrix for multi-label tests.

    Every sample (row) has between 1 and 5 distinct labels set to 1. A
    locally seeded generator is used so the matrix is identical on every
    test run and the global NumPy random state is left untouched.

    Returns:
        numpy.ndarray: integer array of shape ``(100, 10)`` holding 0/1.
    """
    n_samples = 100
    n_labels = 10
    rng = np.random.default_rng(42)
    labels = np.zeros((n_samples, n_labels), dtype=int)
    for sample_idx in range(n_samples):
        # ``integers`` excludes the upper bound, so this yields 1..5.
        n_active = rng.integers(1, 6)
        active_labels = rng.choice(n_labels, size=n_active, replace=False)
        labels[sample_idx, active_labels] = 1
    return labels
@pytest.fixture
def empty_text_samples():
    """Return edge-case inputs: empty, ``None``, whitespace-only and minimal text."""
    edge_cases = [
        "",
        None,
        " ",
        "\n\n\n",
        "a",  # Single character
    ]
    return edge_cases