Spaces:
Running
on
Zero
Running
on
Zero
| """Test suite for new MIT-licensed HuggingFace datasets integration. | |
| Tests ingestion of: | |
| - arxiv-papers: Scholarly papers (2.55M) | |
| - prompt-report: Prompt engineering docs (83) | |
| - generated-novels: Narrative text (20) | |
| - anac-manuals: Technical manuals (52) | |
| - chatenv: Software development chat (SustcZhangYX/ChatEnv) | |
| - portuguese-edu: Multilingual education (21) | |
| - edustories: Educational stories in English (MU-NLPC/Edustories-en) | |
| """ | |
import sys
import pytest
from pathlib import Path
from unittest.mock import patch, MagicMock

# Put the parent of this file's directory on ``sys.path`` *before* the project
# import below. In the previous order the insertion ran after the import, so
# it could never influence how ``warbler_cda`` was resolved.
# NOTE(review): presumably that directory is the checkout root containing the
# ``warbler_cda`` package -- confirm against the repository layout.
sys.path.insert(0, str(Path(__file__).parent.parent))

from warbler_cda.utils.transformers import (  # noqa: E402
    ArxivTransformer,
    PromptReportTransformer,
    NovelsTransformer,
    ManualsTransformer,
    EnterpriseTransformer,
    PortugueseEducationTransformer,
    EdustoriesTransformer,
    WarblerPackBuilder,
)
class TestArxivPapersTransformer:
    """Test arXiv papers dataset transformer.

    Every test patches ``warbler_cda.utils.transformers.arxiv.load_dataset``
    with a ``MagicMock`` so the records fed to the transformer are the
    in-test dictionaries.
    """

    @staticmethod
    def _transform_with_mock(papers, limit):
        """Run ``ArxivTransformer.transform`` against a mocked dataset.

        Args:
            papers: List of paper dicts returned for any ``dataset[...]``
                lookup on the mocked dataset object.
            limit: Value forwarded as ``transform(limit=...)``.

        Returns:
            Whatever ``transform`` returns for the mocked dataset.
        """
        transformer = ArxivTransformer()
        with patch(
            "warbler_cda.utils.transformers.arxiv.load_dataset"
        ) as mock_load:
            mock_dataset = MagicMock()
            # Any split lookup yields the supplied papers; "train" is the only
            # split the mock advertises.
            mock_dataset.__getitem__.return_value = papers
            mock_dataset.keys.return_value = ["train"]
            mock_load.return_value = mock_dataset
            return transformer.transform(limit=limit)

    def test_arxiv_transformer_exists(self):
        """Test that arxiv transformer exists and is callable."""
        transformer = ArxivTransformer()
        assert hasattr(transformer, "transform")
        assert callable(transformer.transform)

    def test_arxiv_output_format(self):
        """Test arXiv transformer produces Warbler-compatible format."""
        mock_paper = {
            "arxiv_id": "2301.00001",
            "title": "Test Paper on Machine Learning",
            "authors": "Author One, Author Two",
            "abstract": "This is a test abstract about ML research.",
            "year": 2023,
            "categories": "cs.LG;cs.AI",
        }
        docs = self._transform_with_mock([mock_paper], limit=1)
        assert len(docs) > 0
        doc = docs[0]
        assert "content_id" in doc
        assert "content" in doc
        assert "metadata" in doc
        assert doc["metadata"]["source_dataset"] == "nick007x/arxiv-papers"
        assert doc["metadata"]["license"] == "MIT"

    def test_arxiv_metadata_fields(self):
        """Test that arXiv metadata contains required fields."""
        mock_paper = {
            "arxiv_id": "2301.00001",
            "title": "Test Paper",
            "authors": "Author",
            "abstract": "Abstract",
            "year": 2023,
            "categories": "cs.LG",
        }
        docs = self._transform_with_mock([mock_paper], limit=1)
        # Fail with a clear assertion instead of an IndexError on docs[0].
        assert len(docs) > 0
        metadata = docs[0]["metadata"]
        assert "pack" in metadata
        assert "arxiv_id" in metadata
        assert "year" in metadata
        assert "categories" in metadata
        assert metadata["realm_type"] == "scholarly"
        assert metadata["realm_label"] == "arxiv"

    def test_arxiv_limit_parameter(self):
        """Test that arxiv transformer respects limit parameter."""
        mock_papers = [
            {
                "arxiv_id": f"2301.{i:05d}",
                "title": f"Paper {i}",
                "authors": f"Author {i}",
                "abstract": f"Abstract {i}",
                "year": 2023,
                "categories": "cs.LG",
            }
            for i in range(10)
        ]
        # Ten records are offered but only five requested.
        docs = self._transform_with_mock(mock_papers, limit=5)
        assert len(docs) <= 5
class TestPromptReportTransformer:
    """Test prompt engineering report dataset transformer."""

    def test_prompt_report_transformer_exists(self):
        """Test that prompt report transformer exists."""
        transformer = PromptReportTransformer()
        assert hasattr(transformer, "transform")
        assert callable(transformer.transform)

    def test_prompt_report_output_format(self):
        """Test prompt report produces Warbler format."""
        transformer = PromptReportTransformer()
        mock_report = {
            "id": "report_001",
            "title": "The Prompt Report: A Systematic Study",
            "text": "This is the full report text about prompting.",
            "category": "prompting",
        }
        with patch(
            "warbler_cda.utils.transformers.prompt_report.load_dataset"
        ) as mock_load:
            # The patched loader hands back a plain list of records; the
            # throwaway MagicMock that used to be assigned first was dead code.
            mock_load.return_value = [mock_report]
            docs = transformer.transform()
        assert len(docs) > 0
        doc = docs[0]
        assert "content_id" in doc
        assert "content" in doc
        assert "metadata" in doc
        assert (
            doc["metadata"]["source_dataset"]
            == "PromptSystematicReview/ThePromptReport"
        )
        assert doc["metadata"]["license"] == "MIT"
class TestGeneratedNovelsTransformer:
    """Test generated novels dataset transformer."""

    def test_novels_transformer_exists(self):
        """Test that novels transformer exists."""
        transformer = NovelsTransformer()
        assert hasattr(transformer, "transform")
        assert callable(transformer.transform)

    def test_novels_chunking_for_long_text(self):
        """Test that documents produced from a long novel are well-formed."""
        transformer = NovelsTransformer()
        # 500 repetitions of one sentence stand in for a long novel body.
        long_text = " ".join(["This is a sentence about a novel."] * 500)
        mock_novel = {"id": "novel_001", "title": "Test Novel", "text": long_text}
        with patch(
            "warbler_cda.utils.transformers.novels.load_dataset"
        ) as mock_load:
            # The patched loader returns a plain list; the MagicMock that was
            # created and immediately overwritten has been dropped.
            mock_load.return_value = [mock_novel]
            docs = transformer.transform()
        # NOTE(review): this loop passes vacuously when ``docs`` is empty and
        # does not check how many chunks were produced -- consider asserting
        # on the chunk count once the transformer's chunk size is confirmed.
        for doc in docs:
            assert "content_id" in doc
            assert "metadata" in doc
            assert (
                doc["metadata"]["source_dataset"]
                == "GOAT-AI/generated-novels"
            )
            assert doc["metadata"]["license"] == "MIT"
# NOTE(review): the class name looks like a typo for ``TestManualsTransformer``;
# it is kept as-is so existing pytest node IDs and ``-k`` selections still match.
class TestManualnsTransformer:
    """Test technical manuals dataset transformer."""

    def test_manuals_transformer_exists(self):
        """Test that manuals transformer exists."""
        transformer = ManualsTransformer()
        assert hasattr(transformer, "transform")
        assert callable(transformer.transform)

    def test_manuals_output_format(self):
        """Test manuals transformer produces Warbler format."""
        transformer = ManualsTransformer()
        mock_manual = {
            "id": "manual_001",
            "title": "Technical Manual",
            "text": "This is technical documentation.",
            "category": "technology",
        }
        with patch(
            "warbler_cda.utils.transformers.manuals.load_dataset"
        ) as mock_load:
            # Return the record list directly; the previously assigned
            # MagicMock was overwritten before use.
            mock_load.return_value = [mock_manual]
            docs = transformer.transform()
        assert len(docs) > 0
        doc = docs[0]
        assert "content_id" in doc
        assert "content" in doc
        assert "metadata" in doc
        assert doc["metadata"]["source_dataset"] == "nlasso/anac-manuals-23"
        assert doc["metadata"]["license"] == "MIT"
class TestEnterpriseTransformer:
    """Test the enterprise transformer (SustcZhangYX/ChatEnv chat data)."""

    def test_enterprise_transformer_exists(self):
        """Test that enterprise transformer exists."""
        transformer = EnterpriseTransformer()
        assert hasattr(transformer, "transform")
        assert callable(transformer.transform)

    def test_enterprise_output_format(self):
        """Test enterprise transformer produces Warbler format."""
        transformer = EnterpriseTransformer()
        mock_conversation = {
            "id": "conv_001",
            "messages": [
                {
                    "role": "user",
                    "content": "Can you help with software development?",
                }
            ],
        }
        with patch(
            "warbler_cda.utils.transformers.enterprise.load_dataset"
        ) as mock_load:
            # Return the conversation list directly; the MagicMock that used
            # to be created first was never read.
            mock_load.return_value = [mock_conversation]
            docs = transformer.transform()
        assert len(docs) > 0
        doc = docs[0]
        assert "content_id" in doc
        assert "content" in doc
        assert "metadata" in doc
        assert doc["metadata"]["source_dataset"] == "SustcZhangYX/ChatEnv"
        assert doc["metadata"]["license"] == "MIT"
        assert doc["metadata"]["realm_type"] == "software_development"
class TestPortugueseEducationTransformer:
    """Test Portuguese education dataset transformer."""

    def test_portuguese_transformer_exists(self):
        """Test that Portuguese education transformer exists."""
        transformer = PortugueseEducationTransformer()
        assert hasattr(transformer, "transform")
        assert callable(transformer.transform)

    def test_portuguese_output_format(self):
        """Test Portuguese education produces Warbler format."""
        transformer = PortugueseEducationTransformer()
        mock_doc = {
            "id": "port_001",
            "title": "Portuguese Education Article",
            "text": "Conteúdo educacional em português",
        }
        with patch(
            "warbler_cda.utils.transformers"
            ".portuguese_education.load_dataset"
        ) as mock_load:
            # Return the record list directly; the MagicMock previously
            # assigned to the same name was overwritten before use.
            mock_load.return_value = [mock_doc]
            docs = transformer.transform()
        assert len(docs) > 0
        doc = docs[0]
        assert "content_id" in doc
        assert "content" in doc
        assert "metadata" in doc
        assert (
            doc["metadata"]["source_dataset"]
            == "Solshine/Portuguese_Language_Education_Texts"
        )
        assert doc["metadata"]["license"] == "MIT"
        assert doc["metadata"]["language"] == "pt"
class TestEdustoriesTransformer:
    """Test educational stories (edustories) transformer."""

    def test_edustories_transformer_exists(self):
        """Test that edustories transformer exists."""
        transformer = EdustoriesTransformer()
        assert hasattr(transformer, "transform")
        assert callable(transformer.transform)

    def test_edustories_metadata_completeness(self):
        """Test that edustories metadata is complete."""
        transformer = EdustoriesTransformer()
        mock_case_study = {
            "id": 123,
            "description": "Classroom with diverse learners.",
            "anamnesis": "Student had learning difficulties.",
            "solution": "Implemented personalized learning approach.",
            "outcome": "Student improved academically.",
            "age, school year": "10 years, 4th grade",
            "hobbies": "Reading, art",
            "diagnoses": "Dyslexia",
            "disorders": "",
            "problems_annotated": "reading_difficulty",
            "solutions_annotated": "reading_intervention",
            "implications_annotated": "literacy_support",
        }
        with patch(
            "warbler_cda.utils.transformers.edustories.load_dataset"
        ) as mock_load:
            # Return the record list directly (the unused MagicMock is gone).
            mock_load.return_value = [mock_case_study]
            docs = transformer.transform()
        assert len(docs) > 0
        doc = docs[0]
        metadata = doc["metadata"]
        # Check for case study metadata
        assert "pack" in metadata
        assert metadata["pack"] == "warbler-pack-edustories"
        assert "source_dataset" in metadata
        assert metadata["source_dataset"] == "MU-NLPC/Edustories-en"
        assert "license" in metadata
        assert metadata["license"] == "MIT"
        # Check for annotations
        assert "problems_annotated" in metadata
        assert metadata["problems_annotated"] == "reading_difficulty"
        assert "solutions_annotated" in metadata
        assert metadata["solutions_annotated"] == "reading_intervention"
        assert "implications_annotated" in metadata
        assert metadata["implications_annotated"] == "literacy_support"
        # Check realm and dialogue type (the pack name is already verified
        # above, so the duplicate assertion was removed).
        assert metadata["realm_label"] == "educational_case_studies"
        assert metadata["dialogue_type"] == "teaching_case_study"

    def test_edustories_content_structure(self):
        """Test that edustories content has structured sections."""
        transformer = EdustoriesTransformer()
        mock_case_study = {
            "id": 789,
            "description": (
                "A diverse classroom with students of varying abilities."
            ),
            "anamnesis": (
                "Student struggled with group work and social interactions."
            ),
            "solution": (
                "Teacher introduced structured cooperative learning "
                "activities."
            ),
            "outcome": (
                "Student became more comfortable working with peers."
            ),
            "age, school year": "9 years, 3rd grade",
            "hobbies": "Video games",
            "diagnoses": "Autism Spectrum Disorder",
            "disorders": "",
            "problems_annotated": "social_skills_deficit",
            "solutions_annotated": "cooperative_learning",
            "implications_annotated": "social_improvement",
        }
        with patch(
            "warbler_cda.utils.transformers.edustories.load_dataset"
        ) as mock_load:
            # Return the record list directly (the unused MagicMock is gone).
            mock_load.return_value = [mock_case_study]
            docs = transformer.transform()
        assert len(docs) > 0
        doc = docs[0]
        content = doc["content"]
        # Check for structured sections
        assert "Background" in content
        assert "Situation" in content
        # "Teacher Intervention" contains "Intervention", so one substring
        # check covers both accepted headings.
        assert "Intervention" in content
        assert "Outcome" in content
        assert "Student Profile" in content
        # Check that actual content is present
        assert "diverse classroom" in content
        assert "struggled with group work" in content
        assert "cooperative learning" in content
        assert "more comfortable working with peers" in content
        # Check for student profile information
        assert "9 years, 3rd grade" in content
        assert "Video games" in content
        assert "Autism Spectrum Disorder" in content
        # Check for annotations section
        assert "Annotations" in content or "Identified Problems" in content
        assert "social_skills_deficit" in content
        assert "cooperative_learning" in content
        # Check for case study marker; the case-insensitive test already
        # accepts the "Case Study" spelling.
        assert "case study" in content.lower()
class TestNewDatasetsIntegrationWithRetrieval:
    """Test that new data integrates with retrieval API."""

    def test_warbler_document_structure(self):
        """Test that transformed documents have proper Warbler structure."""
        transformer = ArxivTransformer()
        mock_paper = {
            "arxiv_id": "2301.00001",
            "title": "Test Paper",
            "authors": "Author",
            "abstract": "Abstract",
            "year": 2023,
            "categories": "cs.LG",
        }
        with patch(
            "warbler_cda.utils.transformers.arxiv.load_dataset"
        ) as mock_load:
            mock_dataset = MagicMock()
            # Any split lookup yields the single paper; "train" is the only
            # split the mock advertises.
            mock_dataset.__getitem__.return_value = [mock_paper]
            mock_dataset.keys.return_value = ["train"]
            mock_load.return_value = mock_dataset
            docs = transformer.transform(limit=1)
        # NOTE(review): this loop passes vacuously if ``docs`` is empty.
        for doc in docs:
            # Identifier and body must be non-blank strings.
            assert "content_id" in doc
            assert isinstance(doc["content_id"], str)
            assert doc["content_id"].strip() != ""
            assert "content" in doc
            assert isinstance(doc["content"], str)
            assert doc["content"].strip() != ""
            assert "metadata" in doc
            metadata = doc["metadata"]
            assert "pack" in metadata
            assert "source_dataset" in metadata
            assert "license" in metadata
            assert metadata["license"] == "MIT"
            assert "realm_type" in metadata
            assert "realm_label" in metadata

    def test_pack_creation_with_new_datasets(self):
        """Test that the pack builder exposes a ``create_pack`` attribute.

        The sample document list that used to be built here was never passed
        to the builder, so it has been removed as an unused local.
        NOTE(review): ``create_pack`` itself is not invoked -- its signature
        is not visible from this file; extend the test once it is confirmed.
        """
        builder = WarblerPackBuilder()
        assert builder is not None
        assert hasattr(builder, "create_pack")
class TestNewDatasetsPerformance:
    """Test performance characteristics of new transformers."""

    def test_arxiv_handles_large_dataset(self):
        """Test that arxiv transformer can handle large limits efficiently.

        Feeds 100 mocked papers through ``transform(limit=100)`` and requires
        the call to finish in under ten seconds.
        """
        import time

        transformer = ArxivTransformer()
        large_dataset = [
            {
                "arxiv_id": f"2301.{i:05d}",
                "title": f"Paper {i}",
                "authors": f"Author {i}",
                "abstract": f"Abstract {i}",
                "year": 2023,
                "categories": "cs.LG",
            }
            for i in range(100)
        ]
        with patch(
            "warbler_cda.utils.transformers.arxiv.load_dataset"
        ) as mock_load:
            mock_dataset = MagicMock()
            mock_dataset.__getitem__.return_value = large_dataset
            mock_dataset.keys.return_value = ["train"]
            mock_load.return_value = mock_dataset
            # perf_counter is monotonic, so the measured interval cannot be
            # skewed by wall-clock adjustments the way time.time() can.
            start = time.perf_counter()
            docs = transformer.transform(limit=100)
            elapsed = time.perf_counter() - start
        assert len(docs) <= 100
        assert elapsed < 10.0
class TestNewDatasetsAllAtOnce:
    """Smoke checks that span every new dataset transformer at once."""

    def test_all_transformers_callable(self):
        """Each transformer class instantiates and has a callable ``transform``."""
        candidate_classes = [
            ArxivTransformer,
            PromptReportTransformer,
            NovelsTransformer,
            ManualsTransformer,
            EnterpriseTransformer,
            PortugueseEducationTransformer,
            EdustoriesTransformer,
        ]
        for cls in candidate_classes:
            instance = cls()
            # Report the offending class by name if the attribute is absent.
            has_transform = hasattr(instance, "transform")
            assert has_transform, f"Missing transform method in {cls.__name__}"
            assert callable(instance.transform)
if __name__ == "__main__":
    # Propagate pytest's exit status so that running this file directly
    # reports failures to the calling shell instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, "-v"]))