Spaces:

George-API
/

phi4training

Sleeping

App Files Files Community

George-API committed on Mar 9

Commit

356ee13

verified ·

1 Parent(s): b3a8a7a

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

dataset_config.json +4 -4
run_transformers_training.py +116 -8

dataset_config.json CHANGED Viewed

@@ -3,8 +3,7 @@
         "name": "George-API/cognitive-data",
         "split": "train",
         "column_mapping": {
-            "text": "conversations",
-            "id": "id"
         },
         "processing": {
             "sort_by_id": true,
@@ -17,7 +16,8 @@
         "roles": {
             "system": "System: {content}\n\n",
             "human": "Human: {content}\n\n",
-            "assistant": "Assistant: {content}\n\n"
         },
         "metadata_handling": {
             "include_paper_id": true,
@@ -29,7 +29,7 @@
         "batch_size": 24,
         "shuffle": false,
         "drop_last": false,
-        "num_workers": 8,
         "pin_memory": true,
         "prefetch_factor": 4
     },

         "name": "George-API/cognitive-data",
         "split": "train",
         "column_mapping": {
+            "conversations": "text"
         },
         "processing": {
             "sort_by_id": true,
         "roles": {
             "system": "System: {content}\n\n",
             "human": "Human: {content}\n\n",
+            "assistant": "Assistant: {content}\n\n",
+            "user": "Human: {content}\n\n"
         },
         "metadata_handling": {
             "include_paper_id": true,
         "batch_size": 24,
         "shuffle": false,
         "drop_last": false,
+        "num_workers": 4,
         "pin_memory": true,
         "prefetch_factor": 4
     },

run_transformers_training.py CHANGED Viewed

@@ -208,15 +208,51 @@ def load_dataset_with_mapping(dataset_config):
         logger.info(f"Loading dataset {dataset_name}, split {dataset_split}")
         dataset = load_dataset(dataset_name, split=dataset_split)
-        # Map columns if specified
         column_mapping = dataset_config.get("dataset", {}).get("column_mapping", {})
         if column_mapping:
-            logger.info(f"Applying column mapping: {column_mapping}")
-            # Rename columns according to mapping
             for target, source in column_mapping.items():
                 if source in dataset.column_names:
-                    dataset = dataset.rename_column(source, target)
         # Sort dataset if required
         sort_by_id = dataset_config.get("dataset", {}).get("processing", {}).get("sort_by_id", False)
@@ -227,8 +263,14 @@ def load_dataset_with_mapping(dataset_config):
             # Log the first few IDs to verify sorting
             sample_ids = [example['id'] for example in dataset.select(range(min(5, len(dataset))))]
             logger.info(f"First few IDs after sorting: {sample_ids}")
         logger.info(f"Dataset loaded successfully with {len(dataset)} examples")
         return dataset
     except Exception as e:
@@ -243,11 +285,13 @@ def format_phi_chat(messages, dataset_config):
     roles = dataset_config.get("data_formatting", {}).get("roles", {
         "system": "System: {content}\n\n",
         "human": "Human: {content}\n\n",
         "assistant": "Assistant: {content}\n\n"
     })
     # Handle research introduction metadata first
-    metadata = next((msg for msg in messages if "[RESEARCH INTRODUCTION]" in msg.get("content", "")), None)
     if metadata:
         system_template = roles.get("system", "System: {content}\n\n")
         formatted_chat = system_template.format(content=metadata['content'])
@@ -255,20 +299,29 @@ def format_phi_chat(messages, dataset_config):
     # Process remaining messages
     for message in messages:
         role = message.get("role", "").lower()
         content = message.get("content", "")
         # Format based on role
         if role == "human" or role == "user":
-            template = roles.get("human", "Human: {content}\n\n")
             formatted_chat += template.format(content=content)
-        elif role == "assistant":
             template = roles.get("assistant", "Assistant: {content}\n\n")
             formatted_chat += template.format(content=content)
         elif role == "system":
             # For system messages, prepend them
             template = roles.get("system", "System: {content}\n\n")
             formatted_chat = template.format(content=content) + formatted_chat
     return formatted_chat.strip()
@@ -284,8 +337,56 @@ class SimpleDataCollator:
         self.include_metadata = dataset_config.get("data_formatting", {}).get("metadata_handling", {}).get("include_paper_id", True)
         self.include_chunk = dataset_config.get("data_formatting", {}).get("metadata_handling", {}).get("include_chunk_number", True)
         self.metadata_format = dataset_config.get("data_formatting", {}).get("metadata_handling", {}).get("metadata_format", "Paper ID: {paper_id} | Chunk: {chunk_number}")
         logger.info(f"SimpleDataCollator initialized - using phi-4 chat format with max_seq_length={self.max_seq_length}")
     def __call__(self, features):
         batch = {"input_ids": [], "attention_mask": [], "labels": []}
@@ -293,7 +394,12 @@ class SimpleDataCollator:
             try:
                 # Get ID and conversation fields
                 paper_id = example.get("id", "")
-                conversation = example.get("conversations", [])
                 if not conversation:
                     self.stats["skipped"] += 1
@@ -346,10 +452,12 @@ class SimpleDataCollator:
                         logger.info(f"Paper ID: {paper_id} | Chunk: {self.paper_counters[paper_id]}")
                         logger.info(f"Token count: {len(inputs['input_ids'])}")
                         logger.info(f"Content preview:\n{formatted_content[:500]}...")
                 else:
                     self.stats["skipped"] += 1
             except Exception as e:
                 logger.warning(f"Error processing example: {str(e)[:100]}...")
                 self.stats["skipped"] += 1
                 continue

         logger.info(f"Loading dataset {dataset_name}, split {dataset_split}")
         dataset = load_dataset(dataset_name, split=dataset_split)
+        # Map columns if specified - with checks to avoid conflicts
         column_mapping = dataset_config.get("dataset", {}).get("column_mapping", {})
         if column_mapping:
+            logger.info(f"Checking column mapping: {column_mapping}")
+            # Only apply mappings for columns that need renaming and don't already exist
+            safe_mappings = {}
             for target, source in column_mapping.items():
                 if source in dataset.column_names:
+                    # Skip if target already exists and is not the same as source
+                    if target in dataset.column_names and target != source:
+                        logger.warning(f"Cannot rename '{source}' to '{target}' - target column already exists")
+                    else:
+                        safe_mappings[source] = target
+            # Apply safe renames
+            if safe_mappings:
+                logger.info(f"Applying safe column mapping: {safe_mappings}")
+                for source, target in safe_mappings.items():
+                    if source != target:  # Only rename if names are different
+                        dataset = dataset.rename_column(source, target)
+        # Verify expected columns exist
+        expected_columns = {"id", "conversations"}
+        for col in expected_columns:
+            if col not in dataset.column_names:
+                # If "conversations" is missing but "text" exists, it might need conversion
+                if col == "conversations" and "text" in dataset.column_names:
+                    logger.info("Converting 'text' field to 'conversations' format")
+                    def convert_text_to_conversations(example):
+                        # Check if text is already a list of conversation turns
+                        if isinstance(example.get("text"), list):
+                            return {"conversations": example["text"]}
+                        # Otherwise, create a simple conversation with the text as user message
+                        else:
+                            return {
+                                "conversations": [
+                                    {"role": "user", "content": str(example.get("text", ""))}
+                                ]
+                            }
+                    dataset = dataset.map(convert_text_to_conversations)
+                else:
+                    logger.warning(f"Expected column '{col}' not found in dataset")
         # Sort dataset if required
         sort_by_id = dataset_config.get("dataset", {}).get("processing", {}).get("sort_by_id", False)
             # Log the first few IDs to verify sorting
             sample_ids = [example['id'] for example in dataset.select(range(min(5, len(dataset))))]
             logger.info(f"First few IDs after sorting: {sample_ids}")
+            # Log example of conversations structure to verify format
+            if "conversations" in dataset.column_names:
+                sample_conv = dataset["conversations"][0] if len(dataset) > 0 else []
+                logger.info(f"Example conversation structure: {sample_conv}")
         logger.info(f"Dataset loaded successfully with {len(dataset)} examples")
+        logger.info(f"Dataset columns: {dataset.column_names}")
         return dataset
     except Exception as e:
     roles = dataset_config.get("data_formatting", {}).get("roles", {
         "system": "System: {content}\n\n",
         "human": "Human: {content}\n\n",
+        "user": "Human: {content}\n\n",
         "assistant": "Assistant: {content}\n\n"
     })
     # Handle research introduction metadata first
+    metadata = next((msg for msg in messages if isinstance(msg, dict) and
+                    "[RESEARCH INTRODUCTION]" in msg.get("content", "")), None)
     if metadata:
         system_template = roles.get("system", "System: {content}\n\n")
         formatted_chat = system_template.format(content=metadata['content'])
     # Process remaining messages
     for message in messages:
+        if not isinstance(message, dict) or "content" not in message:
+            logger.warning(f"Skipping invalid message format: {message}")
+            continue
         role = message.get("role", "").lower()
         content = message.get("content", "")
         # Format based on role
         if role == "human" or role == "user":
+            template = roles.get("user", roles.get("human", "Human: {content}\n\n"))
             formatted_chat += template.format(content=content)
+        elif role == "assistant" or role == "bot":
             template = roles.get("assistant", "Assistant: {content}\n\n")
             formatted_chat += template.format(content=content)
         elif role == "system":
             # For system messages, prepend them
             template = roles.get("system", "System: {content}\n\n")
             formatted_chat = template.format(content=content) + formatted_chat
+        else:
+            # Default to system for unknown roles
+            logger.warning(f"Unknown role '{role}' - treating as system message")
+            template = roles.get("system", "System: {content}\n\n")
+            formatted_chat += template.format(content=content)
     return formatted_chat.strip()
         self.include_metadata = dataset_config.get("data_formatting", {}).get("metadata_handling", {}).get("include_paper_id", True)
         self.include_chunk = dataset_config.get("data_formatting", {}).get("metadata_handling", {}).get("include_chunk_number", True)
         self.metadata_format = dataset_config.get("data_formatting", {}).get("metadata_handling", {}).get("metadata_format", "Paper ID: {paper_id} | Chunk: {chunk_number}")
+        self.roles = dataset_config.get("data_formatting", {}).get("roles", {})
         logger.info(f"SimpleDataCollator initialized - using phi-4 chat format with max_seq_length={self.max_seq_length}")
+    def normalize_conversation(self, conversation):
+        """Normalize a raw conversation into a list of role/content dicts.
+
+        Args:
+            conversation: A list of turns, or a single dict-like turn (any
+                object exposing ``items``). Each turn may be a string, which
+                is treated as a user message, or a dict-like object exposing
+                ``role`` and ``content``.
+
+        Returns:
+            A list of ``{"role": ..., "content": ...}`` dicts. Roles are
+            lower-cased; "human" is mapped to "user" and "bot" to
+            "assistant", and any other role is passed through unchanged.
+            Content is converted with ``str()``. An empty list is returned
+            when the input is neither a list nor dict-like. Falsy turns are
+            dropped silently; turns lacking a usable role or content are
+            dropped with a warning.
+        """
+        # Handle non-list inputs
+        if not isinstance(conversation, list):
+            logger.warning(f"Conversation is not a list: {type(conversation)}")
+            if hasattr(conversation, 'items'):  # It's a dict-like object
+                conversation = [conversation]
+            else:
+                return []
+        # Alternate spellings that collapse onto the canonical role names
+        role_aliases = {'human': 'user', 'bot': 'assistant'}
+        normalized = []
+        for turn in conversation:
+            # Skip empty or None entries
+            if not turn:
+                continue
+            # Handle string entries (convert to user message)
+            if isinstance(turn, str):
+                normalized.append({"role": "user", "content": turn})
+                continue
+            # Copy dict-like (non-dict) entries into a plain dict, keeping
+            # only the keys that actually carry a value
+            if not isinstance(turn, dict) and hasattr(turn, 'get'):
+                turn = {k: turn.get(k) for k in ('role', 'content') if turn.get(k) is not None}
+            # Require a real value for both fields: a key that is present but
+            # None would otherwise crash on .lower() or be stringified into
+            # the literal text "None"
+            if (not isinstance(turn, dict)
+                    or turn.get('role') is None
+                    or turn.get('content') is None):
+                logger.warning(f"Skipping malformatted conversation turn: {turn}")
+                continue
+            # Normalize role field; str() guards against non-string role values
+            role = str(turn['role']).lower()
+            # Add normalized turn
+            normalized.append({
+                "role": role_aliases.get(role, role),
+                "content": str(turn['content'])
+            })
+        return normalized
     def __call__(self, features):
         batch = {"input_ids": [], "attention_mask": [], "labels": []}
             try:
                 # Get ID and conversation fields
                 paper_id = example.get("id", "")
+                # Handle conversation field - could be under 'conversations' or 'text'
+                conversation = example.get("conversations", example.get("text", []))
+                # Normalize conversation format
+                conversation = self.normalize_conversation(conversation)
                 if not conversation:
                     self.stats["skipped"] += 1
                         logger.info(f"Paper ID: {paper_id} | Chunk: {self.paper_counters[paper_id]}")
                         logger.info(f"Token count: {len(inputs['input_ids'])}")
                         logger.info(f"Content preview:\n{formatted_content[:500]}...")
+                        logger.info(f"Conversation structure: {conversation[:2]}...")
                 else:
                     self.stats["skipped"] += 1
             except Exception as e:
                 logger.warning(f"Error processing example: {str(e)[:100]}...")
+                logger.warning(f"Problematic example: {str(example)[:200]}...")
                 self.stats["skipped"] += 1
                 continue