Alexander Watson committed
Commit 0629e69
Parent(s): c918aac

analysis improvements

Files changed (2):
  1. src/utils/analysis.py +296 -95
  2. src/utils/visualization.py +76 -58
src/utils/analysis.py CHANGED

````diff
@@ -1,15 +1,16 @@
-from openai import OpenAI
+import base64
+import datetime
+import io
 import json
-import yaml
 import re
-import datetime
+from collections import Counter
+
+import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
-import pandas as pd
-import base64
-import io
-from collections import Counter
 import tiktoken
+import yaml
+from openai import OpenAI
 
 
 def extract_json_from_response(text: str) -> str:
@@ -130,8 +131,8 @@ def create_distribution_plot(data, column):
 
 def create_wordcloud(data, column):
     """Create a word cloud visualization."""
-    from wordcloud import WordCloud
     import matplotlib.pyplot as plt
+    from wordcloud import WordCloud
 
     try:
         # Handle list columns
@@ -177,16 +178,53 @@ def create_wordcloud(data, column):
         raise e
 
 
-def analyze_dataset_with_openai(client: OpenAI, dataset_sample) -> dict:
-    """Analyze dataset sample using OpenAI API."""
-    # Get a single record for schema inference
-    single_record = (
-        dataset_sample[0] if isinstance(dataset_sample, list) else dataset_sample
-    )
-
-    # Convert the full sample to JSON for overview analysis
-    sample_json = json.dumps(dataset_sample, indent=2)
-    single_record_json = json.dumps(single_record, indent=2)
+def analyze_dataset_with_openai(client: OpenAI, data) -> dict:
+    """Analyze dataset using OpenAI API with improved type inference and efficient sampling."""
+    # Convert dictionary to DataFrame if needed
+    if isinstance(data, dict):
+        df = pd.DataFrame(data)
+    else:
+        df = data
+
+    # Take a very small sample for efficiency
+    sample_size = min(3, len(df))
+    if len(df) > 3:
+        sample_indices = df.index[
+            :sample_size
+        ]  # Take first 3 rows instead of random sampling
+        sample_df = df.loc[sample_indices]
+    else:
+        sample_df = df
+
+    dataset_sample = sample_df.to_dict("records")
+    single_record = dataset_sample[0]
+
+    # Create type hints dictionary - only process the sample
+    type_hints = {}
+    for column in sample_df.columns:
+        # Get the pandas dtype
+        dtype = sample_df[column].dtype
+
+        # Efficiently identify types without complex operations
+        if pd.api.types.is_integer_dtype(dtype):
+            type_hints[column] = "integer"
+        elif pd.api.types.is_float_dtype(dtype):
+            type_hints[column] = "number"
+        elif pd.api.types.is_bool_dtype(dtype):
+            type_hints[column] = "boolean"
+        elif pd.api.types.is_datetime64_any_dtype(dtype):
+            type_hints[column] = "datetime"
+        elif pd.api.types.is_categorical_dtype(dtype):
+            type_hints[column] = "categorical"
+        elif pd.api.types.is_string_dtype(dtype):
+            # Simple check for list-like values
+            first_val = sample_df[column].iloc[0]
+            if isinstance(first_val, list):
+                type_hints[column] = "array"
+            else:
+                type_hints[column] = "string"
+        else:
+            type_hints[column] = "unknown"
 
     prompt = f"""Analyze this dataset sample and provide the following in a JSON response:
 
@@ -195,15 +233,11 @@ def analyze_dataset_with_openai(client: OpenAI, dataset_sample) -> dict:
    - A bullet-pointed list of key features and statistics
    - A brief statement about potential ML/AI applications
 
-2. A schema showing each field's type and description. Use this single record for type inference:
-{single_record_json}
+2. A schema showing each field's type and description. Here is the actual DataFrame type information:
+{json.dumps(type_hints, indent=2)}
 
-For schema types, use precise types like:
-- "string" for text fields
-- "number" for numeric fields
-- "boolean" for true/false
-- "array of X" for arrays where X is the type of elements
-- "object" for nested objects, with nested field descriptions
+And here's a single record for reference:
+{json.dumps(single_record, indent=2)}
 
 3. A formatted example record
 
@@ -220,15 +254,15 @@ def analyze_dataset_with_openai(client: OpenAI, dataset_sample) -> dict:
     }},
     "schema": {{
         "field_name": {{
-            "type": "precise type as described above",
+            "type": "use the type from the provided type_hints",
             "description": "Description of what this field contains"
         }}
     }},
     "example": {{"key": "value"}}
 }}
 
-For context, here are more sample records to help with the overview and features:
-{sample_json}
+For context, here are more sample records:
+{json.dumps(dataset_sample, indent=2)}
 """
 
     try:
@@ -241,15 +275,12 @@ def analyze_dataset_with_openai(client: OpenAI, dataset_sample) -> dict:
 
         # Get the response content
         response_text = response.choices[0].message.content
-        print("OpenAI Response:", response_text)
 
         # Extract JSON from the response
        json_str = extract_json_from_response(response_text)
-        print("Extracted JSON:", json_str)
 
         # Parse the JSON
         result = json.loads(json_str)
-        print("Parsed Result:", result)
         return result
 
     except Exception as e:
@@ -271,33 +302,33 @@ def analyze_dataset_statistics(df):
         "basic_stats": {
             "total_records": len(df),
             "total_features": len(df.columns),
-            "memory_usage": f"{df.memory_usage(deep=True).sum() / (1024*1024):.2f} MB"
+            "memory_usage": f"{df.memory_usage(deep=True).sum() / (1024*1024):.2f} MB",
         },
-        "token_stats": {
-            "total": 0,
-            "by_column": {}
-        }
+        "token_stats": {"total": 0, "by_column": {}},
     }
-
+
     # Count tokens for each column
     for column in df.columns:
         try:
-            if df[column].dtype == 'object' or isinstance(df[column].iloc[0], list):
+            if df[column].dtype == "object" or isinstance(df[column].iloc[0], list):
                 # For list columns, join items into strings
                 if isinstance(df[column].iloc[0], list):
-                    token_counts = df[column].apply(lambda x: count_tokens(' '.join(str(item) for item in x)))
+                    token_counts = df[column].apply(
+                        lambda x: count_tokens(" ".join(str(item) for item in x))
+                    )
                 else:
                     token_counts = df[column].apply(lambda x: count_tokens(str(x)))
-
+
                 total_tokens = int(token_counts.sum())
                 stats["token_stats"]["total"] += total_tokens
                 stats["token_stats"]["by_column"][column] = total_tokens
         except Exception as e:
             print(f"Error processing column {column}: {str(e)}")
             continue
-
+
     return stats
 
+
 def format_dataset_stats(stats):
     """Format simplified dataset statistics as markdown."""
     md = """## Dataset Overview
@@ -306,7 +337,9 @@ def format_dataset_stats(stats):
 * Total Records: {total_records:,}
 * Total Features: {total_features}
 * Memory Usage: {memory_usage}
-""".format(**stats["basic_stats"])
+""".format(
+        **stats["basic_stats"]
+    )
 
     # Token Statistics
     if stats["token_stats"]["total"] > 0:
@@ -319,6 +352,7 @@ def format_dataset_stats(stats):
 
     return md
 
+
 def generate_dataset_card(
     dataset_info: dict,
     distribution_plots: dict,
@@ -326,56 +360,23 @@ def generate_dataset_card(
     openai_analysis: dict,
     df: pd.DataFrame,
 ) -> str:
-    """Generate the complete dataset card content."""
+    """Generate a beautiful and clean dataset card."""
+
+    # Basic dataset metadata
     yaml_content = {
         "language": ["en"],
         "license": "apache-2.0",
         "multilinguality": "monolingual",
-        "size_categories": ["1K<n<10K"],
+        "size_categories": [get_size_category(len(df))],
         "task_categories": ["other"],
     }
-
     yaml_string = yaml.dump(yaml_content, sort_keys=False)
-    description = openai_analysis["description"]
-
-    # Generate schema table
-    schema_table = generate_schema_table(openai_analysis["schema"])
-
-    # Format example as JSON code block
-    example_block = f"```json\n{json.dumps(openai_analysis['example'], indent=2)}\n```"
 
     # Generate dataset statistics
     stats = analyze_dataset_statistics(df)
-    stats_section = format_dataset_stats(stats)
-
-    # Add distribution plots inline
-    distribution_plots_md = ""
-    if distribution_plots:
-        distribution_plots_md = "\n### Distribution Plots\n\n"
-        distribution_plots_md += '<div style="display: grid; grid-template-columns: repeat(1, 1fr); gap: 20px;">\n'
-        for col, img_str in distribution_plots.items():
-            distribution_plots_md += f"<div>\n"
-            distribution_plots_md += f"<h4>Distribution of {col}</h4>\n"
-            distribution_plots_md += f'<img src="data:image/png;base64,{img_str}" style="width: 100%; height: auto;">\n'
-            distribution_plots_md += "</div>\n"
-        distribution_plots_md += "</div>\n\n"
-
-    # Add word clouds inline in a grid
-    wordcloud_plots_md = ""
-    if wordcloud_plots:
-        wordcloud_plots_md = "\n### Word Clouds\n\n"
-        wordcloud_plots_md += '<div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 20px;">\n'
-        for col, img_str in wordcloud_plots.items():
-            wordcloud_plots_md += f"<div>\n"
-            wordcloud_plots_md += f"<h4>Word Cloud for {col}</h4>\n"
-            wordcloud_plots_md += f'<img src="data:image/png;base64,{img_str}" style="width: 100%; height: auto;">\n'
-            wordcloud_plots_md += "</div>\n"
-        wordcloud_plots_md += "</div>\n\n"
-
-    # Generate clean dataset name for citation
-    clean_dataset_name = dataset_info["dataset_name"].replace("/", "_")
+    description = openai_analysis["description"]
 
-    # Build the markdown content
+    # Build the markdown content with proper spacing
     readme_content = f"""---
 {yaml_string}---
 
@@ -383,39 +384,73 @@
 
 {description['overview']}
 
-The dataset includes:
+### Key Features
 {chr(10).join(f'* {feature}' for feature in description['key_features'])}
 
+### Potential Applications
 {description['ml_applications']}
 
+## Dataset Statistics
+
+* Total Records: {stats['basic_stats']['total_records']:,}
+* Total Features: {stats['basic_stats']['total_features']}
+* Memory Usage: {stats['basic_stats']['memory_usage']}
+
 ## Dataset Schema
 
-{schema_table}
+| Field | Type | Description |
+| --- | --- | --- |
+{chr(10).join(f"| {field} | {info['type']} | {info['description']} |" for field, info in openai_analysis['schema'].items())}
 
 ## Example Record
 
-{example_block}
+```json
+{json.dumps(openai_analysis['example'], indent=2)}
+```
 
 ## Data Distribution Analysis
 
-The following visualizations show key characteristics of the dataset:
+The following visualizations show the distribution patterns and characteristics of key features in the dataset:
+
+"""
+
+    # Add individual distribution plots with clean spacing
+    for col, img_str in distribution_plots.items():
+        readme_content += f"""### Distribution of {col}
+<img src="data:image/png;base64,{img_str}" alt="Distribution of {col}" style="max-width: 800px;">
+
+"""
+
+    # Add word clouds with clean spacing
+    if wordcloud_plots:
+        readme_content += "## Feature Word Clouds\n\n"
+        for col, img_str in wordcloud_plots.items():
+            readme_content += f"""### Word Cloud for {col}
+<img src="data:image/png;base64,{img_str}" alt="Word Cloud for {col}" style="max-width: 800px;">
 
-{distribution_plots_md}
-{wordcloud_plots_md}
+"""
 
-{stats_section}
+    # Add token statistics if available
+    if stats.get("token_stats") and stats["token_stats"]["total"] > 0:
+        readme_content += """## Token Statistics
 
-## Citation and Usage
+"""
+        readme_content += f"* Total Tokens: {stats['token_stats']['total']:,}\n"
+        if stats["token_stats"].get("by_column"):
+            readme_content += "\n**Tokens by Column:**\n"
+            for col, count in stats["token_stats"]["by_column"].items():
+                readme_content += f"* {col}: {count:,}\n"
 
-If you use this dataset in your research or applications, please cite it as:
+    # Add citation section
+    clean_name = dataset_info["dataset_name"].replace("/", "_")
+    readme_content += f"""
+## Citation
 
 ```bibtex
-@dataset{{{clean_dataset_name},
+@dataset{{{clean_name},
     title = {{{dataset_info['dataset_name']}}},
-    author = {{Dataset Authors}},
     year = {{{datetime.datetime.now().year}}},
     publisher = {{Hugging Face}},
-    howpublished = {{Hugging Face Datasets}},
     url = {{https://huggingface.co/datasets/{dataset_info['dataset_name']}}}
 }}
 ```
@@ -427,11 +462,154 @@ This dataset is released under the Apache 2.0 License. When using this dataset:
 * 📚 Cite the dataset using the BibTeX entry above
 * 🤝 Consider contributing improvements or reporting issues
 * 💡 Share derivative works with the community when possible
+"""
+
+    return readme_content
+
+
+def get_size_category(record_count: int) -> str:
+    """Determine the size category based on record count."""
+    if record_count < 1000:
+        return "n<1K"
+    elif record_count < 10000:
+        return "1K<n<10K"
+    elif record_count < 100000:
+        return "10K<n<100K"
+    elif record_count < 1000000:
+        return "100K<n<1M"
+    else:
+        return "n>1M"
+
+
+def format_overview_section(analysis: dict, stats: dict) -> str:
+    """Create a comprehensive overview section."""
+    description = analysis["description"]
+    overview = f"""
+{description['overview']}
+
+### Key Features and Characteristics
+{chr(10).join(f'* {feature}' for feature in description['key_features'])}
+
+### Potential Applications
+{description['ml_applications']}
+
+### Dataset Size
+* Total Records: {stats['basic_stats']['total_records']:,}
+* Total Features: {stats['basic_stats']['total_features']}
+* Memory Usage: {stats['basic_stats']['memory_usage']}
+"""
+    return overview.strip()
+
+
+def format_schema_section(schema: dict, df: pd.DataFrame) -> str:
+    """Generate an enhanced schema section with statistics."""
+    # Table header
+    table = "| Field | Type | Description | Non-Null Count | Unique Values |\n"
+    table += "| --- | --- | --- | --- | --- |\n"
+
+    # Generate rows with additional statistics
+    for field, info in schema.items():
+        try:
+            non_null = df[field].count()
+            unique = df[field].nunique()
+            row = f"| {field} | {info['type']} | {info['description']} | {non_null:,} | {unique:,} |"
+            table += row + "\n"
+        except Exception as e:
+            print(f"Error processing field {field}: {e}")
+            continue
+
+    return table
+
+
+def format_visualization_section(
+    distribution_plots: dict, wordcloud_plots: dict
+) -> str:
+    """Format the visualization section with improved layout."""
+    content = (
+        """The following visualizations show key characteristics of the dataset:\n\n"""
+    )
+
+    # Add distribution plots
+    if distribution_plots:
+        content += "### Distribution Plots\n\n"
+        content += '<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(500px, 1fr)); gap: 20px;">\n'
+        for col, img_str in distribution_plots.items():
+            content += f"""<div>
+<h4>Distribution of {col}</h4>
+<img src="data:image/png;base64,{img_str}" style="width: 100%; height: auto;">
+</div>\n"""
+        content += "</div>\n\n"
+
+    # Add word clouds
+    if wordcloud_plots:
+        content += "### Word Clouds\n\n"
+        content += '<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(400px, 1fr)); gap: 20px;">\n'
+        for col, img_str in wordcloud_plots.items():
+            content += f"""<div>
+<h4>Word Cloud for {col}</h4>
+<img src="data:image/png;base64,{img_str}" style="width: 100%; height: auto;">
+</div>\n"""
+        content += "</div>\n"
+
+    return content
+
+
+def generate_limitations_section(df: pd.DataFrame, analysis: dict) -> str:
+    """Generate a section about dataset limitations and potential biases."""
+    limitations = [
+        "This dataset may not be representative of all possible scenarios or use cases.",
+        f"The dataset contains {len(df):,} records, which may limit its applicability to certain tasks.",
+        "There may be inherent biases in the data collection or annotation process.",
+    ]
+
+    # Add warnings about missing values if present
+    missing_values = df.isnull().sum()
+    if missing_values.any():
+        limitations.append(
+            f"Some fields contain missing values: {', '.join(missing_values[missing_values > 0].index)}"
+        )
+
+    return f"""The following limitations and potential biases should be considered when using this dataset:
+
+{chr(10).join(f'* {limitation}' for limitation in limitations)}
+
+Please consider these limitations when using the dataset and validate results accordingly."""
+
+
+def generate_usage_section(dataset_info: dict, analysis: dict) -> str:
+    """Generate comprehensive usage guidelines."""
+    return f"""This dataset is released under the Apache 2.0 License. When using this dataset:
+
+* 📚 Cite the dataset using the BibTeX entry provided below
+* 🤝 Consider contributing improvements or reporting issues
+* 💡 Share derivative works with the community when possible
+* 🔍 Validate the dataset's suitability for your specific use case
+* ⚠️ Be aware of the limitations and biases discussed above
+* 📊 Consider the dataset size and computational requirements for your application
 
 For questions or additional information, please visit the dataset repository on Hugging Face.
 """
 
-    return readme_content
+
+def get_task_categories(df: pd.DataFrame, analysis: dict) -> list:
+    """Infer potential task categories based on the data and analysis."""
+    categories = ["other"]  # Default category
+
+    # Add more sophisticated task inference logic based on column names and content
+    text_columns = df.select_dtypes(include=["object"]).columns
+    numeric_columns = df.select_dtypes(include=["int64", "float64"]).columns
+
+    if len(text_columns) > 0:
+        categories.append("text-classification")
+    if len(numeric_columns) > 0:
+        categories.append("regression")
+
+    return list(set(categories))  # Remove duplicates
+
+
+def clean_dataset_name(name: str) -> str:
+    """Clean dataset name for citation."""
+    return name.replace("/", "_").replace("-", "_").lower()
 
 
 def generate_schema_table(schema: dict) -> str:
@@ -449,6 +627,29 @@ def generate_schema_table(schema: dict) -> str:
     return table
 
 
+def format_stats_section(stats: dict) -> str:
+    """Format the statistics section of the dataset card."""
+    content = """### Basic Statistics
+"""
+    # Add basic stats
+    for key, value in stats["basic_stats"].items():
+        # Convert key from snake_case to Title Case
+        formatted_key = key.replace("_", " ").title()
+        content += f"* {formatted_key}: {value}\n"
+
+    # Add token statistics if available
+    if stats.get("token_stats") and stats["token_stats"]["total"] > 0:
+        content += "\n### Token Statistics\n"
+        content += f"* Total Tokens: {stats['token_stats']['total']:,}\n"
+
+        if stats["token_stats"].get("by_column"):
+            content += "\n**Tokens by Column:**\n"
+            for col, count in stats["token_stats"]["by_column"].items():
+                content += f"* {col}: {count:,}\n"
+
+    return content
+
+
 def format_schema_item(field_name: str, field_info: dict, prefix: str = "") -> list:
     """Recursively format schema items for nested structures."""
     rows = []
````
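Read together, the reworked `analyze_dataset_with_openai`, the plot helpers, and `generate_dataset_card` form a small card-generation pipeline. The sketch below shows one way it might be driven end to end; it is a minimal example, not code from this commit. It assumes the repo's `src.utils` package is importable, an `OPENAI_API_KEY` environment variable, toy records standing in for a real dataset, and a hypothetical `user/my-dataset` repo id; rendering the plots additionally needs the kaleido, wordcloud, and matplotlib packages.

```python
import os

import pandas as pd
from openai import OpenAI

from src.utils.analysis import analyze_dataset_with_openai, generate_dataset_card
from src.utils.visualization import create_distribution_plot, create_wordcloud

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])  # assumed env var

# Hypothetical records standing in for a real dataset
df = pd.DataFrame(
    {
        "text": ["first example", "second example", "third example", "fourth"],
        "label": ["a", "b", "a", "b"],
    }
)

# The reworked function accepts a dict or a DataFrame, samples the first
# 3 rows itself, and sends pandas-derived type hints to the model
analysis = analyze_dataset_with_openai(client, df)

# Base64-encoded PNGs keyed by column name, as the card generator expects
plots = {"label": create_distribution_plot(df, "label")}
clouds = {"text": create_wordcloud(df, "text")}

card = generate_dataset_card(
    {"dataset_name": "user/my-dataset"},  # hypothetical repo id
    plots,
    clouds,  # wordcloud_plots
    analysis,
    df,
)
print(card[:400])
```

Note that `size_categories` in the card's YAML header is now computed from `len(df)` via the new `get_size_category` helper rather than hard-coded to `"1K<n<10K"`.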
src/utils/visualization.py CHANGED

````diff
@@ -1,25 +1,26 @@
-import plotly.express as px
-import plotly.graph_objects as go
-import pandas as pd
 import base64
 import io
+from collections import Counter
 
-
+import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
-import pandas as pd
-import base64
-import io
-from collections import Counter
+
 
 def flatten_list_column(data, column):
     """Flatten a column containing lists into individual values with counts."""
     # Flatten the lists into individual items
-    flattened = [item for sublist in data[column] if isinstance(sublist, list) for item in sublist]
+    flattened = [
+        item
+        for sublist in data[column]
+        if isinstance(sublist, list)
+        for item in sublist
+    ]
     # Count occurrences
     value_counts = pd.Series(Counter(flattened))
     return value_counts
 
+
 def create_distribution_plot(data, column):
     """Create a beautiful distribution plot using Plotly and convert to image."""
     try:
@@ -29,110 +30,127 @@ def create_distribution_plot(data, column):
             value_counts = flatten_list_column(data, column)
         else:
             # Handle regular columns
-            if data[column].dtype in ['int64', 'float64']:
+            if data[column].dtype in ["int64", "float64"]:
                 # Continuous data - use histogram
                 fig = go.Figure()
-
+
                 # Add histogram
-                fig.add_trace(go.Histogram(
-                    x=data[column],
-                    name='Count',
-                    nbinsx=30,
-                    marker=dict(
-                        color='rgba(110, 68, 255, 0.7)',
-                        line=dict(color='rgba(184, 146, 255, 1)', width=1)
+                fig.add_trace(
+                    go.Histogram(
+                        x=data[column],
+                        name="Count",
+                        nbinsx=30,
+                        marker=dict(
+                            color="rgba(110, 68, 255, 0.7)",
+                            line=dict(color="rgba(184, 146, 255, 1)", width=1),
+                        ),
                     )
-                ))
-
+                )
+
             else:
                 # Categorical data
                 value_counts = data[column].value_counts()
 
         # For both list columns and categorical data
-        if 'value_counts' in locals():
-            fig = go.Figure([go.Bar(
-                x=value_counts.index,
-                y=value_counts.values,
-                marker=dict(
-                    color=value_counts.values,
-                    colorscale=px.colors.sequential.Plotly3,
-                ),
-            )])
-
+        if "value_counts" in locals():
+            fig = go.Figure(
+                [
+                    go.Bar(
+                        x=value_counts.index,
+                        y=value_counts.values,
+                        marker=dict(
+                            color=value_counts.values,
+                            colorscale=px.colors.sequential.Plotly3,
+                        ),
+                    )
+                ]
+            )
+
         # Common layout updates
         fig.update_layout(
-            title=f'Distribution of {column}',
+            title=f"Distribution of {column}",
             xaxis_title=column,
-            yaxis_title='Count',
-            template='plotly_white',
+            yaxis_title="Count",
+            template="plotly_white",
             margin=dict(t=50, l=50, r=50, b=50),
             width=1200,
             height=800,
-            showlegend=False
+            showlegend=False,
         )
-
+
         # Rotate x-axis labels if needed
-        if isinstance(data[column].iloc[0], list) or data[column].dtype not in ['int64', 'float64']:
+        if isinstance(data[column].iloc[0], list) or data[column].dtype not in [
+            "int64",
+            "float64",
+        ]:
             fig.update_layout(xaxis_tickangle=-45)
-
+
         # Convert to PNG
         img_bytes = fig.to_image(format="png", scale=2.0)
-
+
         # Encode to base64
         img_base64 = base64.b64encode(img_bytes).decode()
-
+
         return img_base64
-
+
     except Exception as e:
         print(f"Error creating distribution plot for {column}: {str(e)}")
         raise e
 
+
 def create_wordcloud(data, column):
     """Create a word cloud visualization."""
-    from wordcloud import WordCloud
     import matplotlib.pyplot as plt
-
+    from wordcloud import WordCloud
+
     try:
         # Handle list columns
         if isinstance(data[column].iloc[0], list):
-            text = ' '.join([' '.join(map(str, sublist)) for sublist in data[column] if isinstance(sublist, list)])
+            text = " ".join(
+                [
+                    " ".join(map(str, sublist))
+                    for sublist in data[column]
+                    if isinstance(sublist, list)
+                ]
+            )
         else:
             # Handle regular columns
-            text = ' '.join(data[column].astype(str))
-
+            text = " ".join(data[column].astype(str))
+
         wordcloud = WordCloud(
             width=1200,
             height=800,
-            background_color='white',
-            colormap='plasma',
-            max_words=100
+            background_color="white",
+            colormap="plasma",
+            max_words=100,
         ).generate(text)
-
+
         # Create matplotlib figure
         plt.figure(figsize=(10, 5))
-        plt.imshow(wordcloud, interpolation='bilinear')
-        plt.axis('off')
-        plt.title(f'Word Cloud for {column}')
-
+        plt.imshow(wordcloud, interpolation="bilinear")
+        plt.axis("off")
+        plt.title(f"Word Cloud for {column}")
+
         # Save to bytes
         buf = io.BytesIO()
-        plt.savefig(buf, format='png', bbox_inches='tight', dpi=300)
+        plt.savefig(buf, format="png", bbox_inches="tight", dpi=300)
         plt.close()
         buf.seek(0)
-
+
         # Convert to base64
         img_base64 = base64.b64encode(buf.getvalue()).decode()
-
+
         return img_base64
-
+
     except Exception as e:
         print(f"Error creating word cloud for {column}: {str(e)}")
         raise e
 
+
 def create_wordcloud(data, column):
     """Create a word cloud visualization."""
-    from wordcloud import WordCloud
     import matplotlib.pyplot as plt
+    from wordcloud import WordCloud
 
     # Generate word cloud
     text = " ".join(data[column].astype(str))
````
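As a quick sanity check of the reformatted `flatten_list_column`, the snippet below runs it on a made-up column that mixes list and non-list entries; the `src.utils.visualization` import path is an assumption about how the package is laid out.

```python
import pandas as pd

from src.utils.visualization import flatten_list_column

# Made-up column mixing list and non-list entries
data = pd.DataFrame({"tags": [["a", "b"], ["a"], None]})

# Non-list entries are skipped by the isinstance(sublist, list) guard,
# so the counts come out as a -> 2, b -> 1
print(flatten_list_column(data, "tags"))
```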