Spaces:

DaCrow13
/

Hopcroft-Skill-Classification

Running

Hopcroft-Skill-Classification / tests /conftest.py

DaCrow13

Deploy to HF Spaces (Clean)

225af6a about 2 months ago

4.33 kB

	"""
	Root pytest configuration and shared fixtures.

	This module provides fixtures that are available to all test modules.
	"""
	import pytest
	import numpy as np
	import pandas as pd
	import tempfile
	import sqlite3
	from pathlib import Path
	from sklearn.feature_extraction.text import TfidfVectorizer


	@pytest.fixture
	def sample_text_data():
	    """Return five short, clean issue-style sentences for text tests.

	    The result is a plain list of strings in a fixed order.
	    """
	    issue_sentences = [
	        "Fixed bug in authentication system using OAuth2",
	        "Implemented REST API endpoint for user data retrieval",
	        "Added unit tests for data processing pipeline",
	        "Refactored code to improve performance and reduce memory usage",
	        "Updated database schema with new migration scripts",
	    ]
	    return issue_sentences


	@pytest.fixture
	def sample_dirty_text():
	    """Return five strings containing the kinds of noise seen in GitHub text.

	    The samples cover, in order: a URL, HTML tags with inline code, an emoji,
	    a fenced code block, and repeated newlines.
	    """
	    # The physical lines of this literal are runtime text, so they are kept
	    # exactly as they appear in the file (leading whitespace included).
	    fenced_block_sample = """Updated docs with code block:
	```python
	def foo():
	pass
	```
	"""
	    noisy_samples = [
	        "Fixed bug https://github.com/repo/issues/123 in auth system",
	        "Added feature with <b>HTML tags</b> and `inline code`",
	        "Removed emoji 😀 and special characters",
	        fenced_block_sample,
	        "Fixed multiple spaces and\n\nnewlines",
	    ]
	    return noisy_samples


	@pytest.fixture
	def sample_labels():
	    """Return a 5-row DataFrame of 0/1 indicators, one column per label."""
	    # Column order below is the order of the resulting DataFrame columns.
	    indicator_columns = {
	        'Language': [1, 1, 1, 0, 1],
	        'Data Structure': [1, 0, 0, 1, 1],
	        'Testing': [0, 0, 1, 0, 0],
	        'API': [1, 1, 0, 0, 0],
	        'DevOps': [0, 0, 0, 1, 1],
	    }
	    return pd.DataFrame(indicator_columns)


	@pytest.fixture
	def sample_dataframe(sample_text_data, sample_labels):
	    """Return a 5-row DataFrame combining metadata, text and label columns.

	    The first four columns are 'Repo Name', 'PR #', 'issue text' (the first
	    five entries of ``sample_text_data``) and 'issue description'. Every
	    column of ``sample_labels`` is then appended in its original order.
	    """
	    metadata_and_text = {
	        'Repo Name': ['repo1', 'repo2', 'repo1', 'repo3', 'repo2'],
	        'PR #': [1, 2, 3, 4, 5],
	        # Explicit indexing: exactly the first five texts are required.
	        'issue text': [sample_text_data[position] for position in range(5)],
	        'issue description': [
	            f'Description for issue {number}' for number in range(1, 6)
	        ],
	    }
	    df = pd.DataFrame(metadata_and_text)

	    # Copy label values positionally so the labels' index does not matter.
	    for label_name in sample_labels.columns:
	        df[label_name] = sample_labels[label_name].values

	    return df


	@pytest.fixture
	def temp_db(sample_dataframe):
	    """Yield the path of a temporary SQLite database holding the sample data.

	    The database contains one table, 'nlbse_tool_competition_data_by_issue',
	    written from ``sample_dataframe``. The file is removed on teardown, and
	    also when populating the database fails.

	    Yields:
	        pathlib.Path: location of the temporary ``.db`` file.
	    """
	    # delete=False: the handle is only used to reserve a unique file name;
	    # sqlite3 reopens the file by path after the handle is closed.
	    with tempfile.NamedTemporaryFile(mode='w', suffix='.db', delete=False) as f:
	        db_path = Path(f.name)

	    try:
	        conn = sqlite3.connect(str(db_path))
	        try:
	            sample_dataframe.to_sql('nlbse_tool_competition_data_by_issue',
	                                    conn, if_exists='replace', index=False)
	        finally:
	            # Always release the connection, even if the insert fails.
	            conn.close()

	        yield db_path
	    finally:
	        # A test may already have deleted the file; that is not an error.
	        db_path.unlink(missing_ok=True)


	@pytest.fixture
	def sample_tfidf_vectorizer():
	    """Return a new, unfitted TfidfVectorizer with a small configuration.

	    Settings: at most 100 features, unigrams and bigrams, English stop words.
	    """
	    vectorizer_options = {
	        'max_features': 100,
	        'ngram_range': (1, 2),
	        'stop_words': 'english',
	    }
	    return TfidfVectorizer(**vectorizer_options)


	@pytest.fixture
	def sample_sparse_features():
	    """Return a mostly-zero dense feature matrix of shape (100, 50).

	    Each row has 5 to 10 entries, at distinct column positions, filled with
	    values drawn from [0, 1); all other entries are 0. A locally seeded
	    generator is used so that every run produces the same matrix and the
	    global NumPy random state is not touched.

	    Returns:
	        numpy.ndarray: float array of shape (100, 50).
	    """
	    n_rows, n_cols = 100, 50
	    rng = np.random.default_rng(42)

	    features = np.zeros((n_rows, n_cols))
	    for row in range(n_rows):
	        # The upper bound is exclusive, so this yields 5..10 filled columns.
	        n_nonzero = int(rng.integers(5, 11))
	        columns = rng.choice(n_cols, size=n_nonzero, replace=False)
	        features[row, columns] = rng.random(n_nonzero)

	    return features


	@pytest.fixture
	def sample_multilabel_data():
	    """Return a binary multi-label indicator matrix of shape (100, 10).

	    Each of the 100 samples has between 1 and 5 distinct labels set to 1.
	    A locally seeded generator is used so that every run produces the same
	    matrix and the global NumPy random state is not touched.

	    Returns:
	        numpy.ndarray: int array of shape (100, 10) containing only 0 and 1.
	    """
	    n_samples = 100
	    n_labels = 10
	    rng = np.random.default_rng(42)

	    labels = np.zeros((n_samples, n_labels), dtype=int)
	    for sample in range(n_samples):
	        # The upper bound is exclusive, so this yields 1..5 labels per sample.
	        n_labels_per_sample = int(rng.integers(1, 6))
	        label_indices = rng.choice(n_labels, size=n_labels_per_sample,
	                                   replace=False)
	        labels[sample, label_indices] = 1

	    return labels


	@pytest.fixture
	def empty_text_samples():
	    """Return edge-case text inputs: empty, None, whitespace-only and tiny.

	    The list order is: empty string, None, a single space, three newlines,
	    and a single character.
	    """
	    blank_like = ["", None, " ", "\n\n\n"]
	    single_character = "a"
	    return blank_like + [single_character]