KB-VQA-E

Sleeping

App Files Files Community

KB-VQA-E / my_model /tabs /dataset_analysis.py

m7mdal7aj

Create dataset_analysis.py

c996cf4 verified 8 months ago

raw

history blame

8.41 kB

	import csv
	import json
	from collections import Counter
	from typing import Tuple, List, Optional

	import altair as alt
	import contractions
	import pandas as pd
	import streamlit as st

	from my_model.config import dataset_config as config
	from my_model.dataset.dataset_processor import process_okvqa_dataset

	class OKVQADatasetAnalyzer:
	    """
	    Provides tools for analyzing and visualizing the distribution of question types within question datasets.

	    It supports loading questions from JSON files, categorizing them by keyword, plotting the resulting
	    distribution in a Streamlit app, and exporting the categorized data to CSV files.

	    Attributes:
	        train_file_path (str): Path to the training dataset file.
	        test_file_path (str): Path to the testing dataset file.
	        data_choice (str): Choice of dataset(s) to analyze; options include 'train', 'test', or 'train_test'.
	        questions (List[str]): List of questions aggregated based on the dataset choice.
	        question_types (Counter): Counter object tracking the frequency of each question type.
	        Qs (Dict[str, List[str]]): Dictionary mapping question types to lists of corresponding questions.
	    """

	    def __init__(self, train_file_path: str, test_file_path: str, data_choice: str):
	        """
	        Initializes the analyzer with the dataset paths and the choice of which dataset(s) to analyze.

	        Parameters:
	            train_file_path (str): Path to the training dataset JSON file.
	            test_file_path (str): Path to the testing dataset JSON file.
	            data_choice (str): Which dataset(s) to load: 'train', 'test', or 'train_test' (both). Any other
	                value loads nothing and leaves `questions` empty.

	        The constructor stores the paths, prepares one empty bucket per keyword in
	        `config.QUESTION_KEYWORDS`, and loads the questions by calling `load_data`.
	        """

	        self.train_file_path = train_file_path
	        self.test_file_path = test_file_path
	        self.data_choice = data_choice
	        self.questions = []
	        self.question_types = Counter()
	        self.Qs = {keyword: [] for keyword in config.QUESTION_KEYWORDS}
	        self.load_data()

	    @staticmethod
	    def _read_questions(path: str) -> list:
	        """
	        Reads one dataset JSON file and returns the text of every entry under its 'questions' key.

	        The file is decoded as UTF-8 (the standard encoding for JSON) rather than the platform default,
	        which matches the encoding used by `export_to_csv`.
	        """

	        with open(path, 'r', encoding='utf-8') as file:
	            payload = json.load(file)
	        return [entry['question'] for entry in payload['questions']]

	    def load_data(self) -> None:
	        """
	        Loads the dataset(s) from the configured JSON file(s) based on `data_choice`.

	        Training questions are appended first (for 'train' and 'train_test'), then testing questions
	        (for 'test' and 'train_test'). The questions are appended to `self.questions`; the list is not
	        cleared first, so calling this method again appends the same questions a second time.
	        """

	        if self.data_choice in ('train', 'train_test'):
	            self.questions += self._read_questions(self.train_file_path)

	        if self.data_choice in ('test', 'train_test'):
	            self.questions += self._read_questions(self.test_file_path)

	    def categorize_questions(self) -> None:
	        """
	        Categorizes each loaded question into a category based on keywords.

	        Contractions are expanded first, then the question is lower-cased and split on whitespace.
	        A question starting with the words "name the" is filed under 'name the'; otherwise the first word
	        found in `config.QUESTION_KEYWORDS` decides the category, and questions without any keyword go to
	        'others'. Both `self.Qs` and `self.question_types` are updated; results accumulate across calls.
	        """

	        # Build the lookup once instead of scanning the configured container for every word.
	        keyword_lookup = frozenset(config.QUESTION_KEYWORDS)

	        for raw_question in self.questions:
	            question = contractions.fix(raw_question)
	            words = question.lower().split()

	            if words[:2] == ['name', 'the']:
	                category = 'name the'
	            else:
	                category = next((word for word in words if word in keyword_lookup), 'others')

	            self.question_types[category] += 1
	            # setdefault keeps this from raising KeyError when 'others' or 'name the' is not among the
	            # configured keywords (the buckets are created from config.QUESTION_KEYWORDS only).
	            self.Qs.setdefault(category, []).append(question)

	    def plot_question_distribution(self) -> None:
	        """
	        Plots an interactive bar chart of question types using Altair and Streamlit.

	        Each bar shows the count and percentage of one question type. Types are ordered by count in
	        descending order, with the 'others' category always placed last, and tooltips expose the details.
	        If no question has been categorized yet, a Streamlit warning is shown instead of a chart.
	        """

	        total_questions = sum(self.question_types.values())
	        if total_questions == 0:
	            # Nothing to plot; computing percentages would otherwise divide by zero.
	            st.warning('No categorized questions to display.')
	            return

	        # Rank every regular category by count, then append 'others' at the end if present.
	        ranked = sorted(
	            ((key, count) for key, count in self.question_types.items() if key != 'others'),
	            key=lambda pair: pair[1],
	            reverse=True,
	        )
	        if 'others' in self.question_types:
	            ranked.append(('others', self.question_types['others']))

	        rows = [(key, count, (count / total_questions) * 100) for key, count in ranked]
	        df = pd.DataFrame(rows, columns=['Question Keyword', 'Count', 'Percentage'])

	        # Explicitly set the order of the x-axis based on the sorted DataFrame
	        order = df['Question Keyword'].tolist()

	        # Create the bar chart
	        bars = alt.Chart(df).mark_bar().encode(
	            x=alt.X('Question Keyword:N', sort=order, title='Question Keyword', axis=alt.Axis(labelAngle=-45)),
	            y=alt.Y('Count:Q', title='Frequency'),
	            color=alt.Color('Question Keyword:N', scale=alt.Scale(scheme='category20'), legend=None),
	            tooltip=[alt.Tooltip('Question Keyword:N', title='Type'),
	                     alt.Tooltip('Count:Q', title='Count'),
	                     alt.Tooltip('Percentage:Q', title='Percentage', format='.1f')]
	        )

	        # Create text labels for the bars with count and percentage
	        text = bars.mark_text(
	            align='center',
	            baseline='bottom',
	            dy=-5  # Nudges text up so it appears above the bar
	        ).encode(
	            text=alt.Text('PercentageText:N')
	        ).transform_calculate(
	            PercentageText="datum.Count + ' (' + format(datum.Percentage, '.1f') + '%)'"
	        )

	        # Combine the bar and text layers
	        chart = (bars + text).properties(
	            width=700,
	            height=400,
	            title='Distribution of Question Keywords'
	        ).configure_title(fontSize=20).configure_axis(
	            labelFontSize=12,
	            titleFontSize=14
	        )

	        # Display the chart in Streamlit
	        st.altair_chart(chart, use_container_width=True)

	    def export_to_csv(self, qs_filename: str, question_types_filename: str) -> None:
	        """
	        Exports the categorized questions and their counts to two separate CSV files.

	        Parameters:
	            qs_filename (str): The filename or path for exporting the `self.Qs` dictionary data; one row
	                per (question type, question) pair.
	            question_types_filename (str): The filename or path for exporting the `self.question_types`
	                Counter data; one row per (question type, count) pair.

	        Both files are written as UTF-8 and start with a header row. Existing files are overwritten.
	        """

	        # Export self.Qs dictionary
	        with open(qs_filename, mode='w', newline='', encoding='utf-8') as file:
	            writer = csv.writer(file)
	            writer.writerow(['Question Type', 'Questions'])
	            writer.writerows(
	                [q_type, question]
	                for q_type, questions in self.Qs.items()
	                for question in questions
	            )

	        # Export self.question_types Counter
	        with open(question_types_filename, mode='w', newline='', encoding='utf-8') as file:
	            writer = csv.writer(file)
	            writer.writerow(['Question Type', 'Count'])
	            writer.writerows([q_type, count] for q_type, count in self.question_types.items())