import csv
import json
from collections import Counter
from typing import Tuple, List, Optional

import altair as alt
import contractions
import pandas as pd
import streamlit as st

from my_model.config import dataset_config as config
from my_model.dataset.dataset_processor import process_okvqa_dataset


class OKVQADatasetAnalyzer:
    """
    Provides tools for analyzing and visualizing distributions of question types within given question datasets.
    It supports operations such as data loading, categorization of questions based on keywords, visualization of
    question distribution, and exporting data to CSV files.

    Attributes:
        train_file_path (str): Path to the training dataset file.
        test_file_path (str): Path to the testing dataset file.
        data_choice (str): Choice of dataset(s) to analyze; options include 'train', 'test', or 'train_test'.
        questions (List[str]): List of questions aggregated based on the dataset choice.
        question_types (Counter): Counter object tracking the frequency of each question type.
        Qs (Dict[str, List[str]]): Dictionary mapping question types to lists of corresponding questions.
    """

    def __init__(self, train_file_path: str, test_file_path: str, data_choice: str):
        """
        Initializes the OKVQADatasetAnalyzer with paths to dataset files and a choice of which datasets to analyze.

        Parameters:
            train_file_path (str): Path to the training dataset JSON file. The file must hold an object with a
                                   'questions' list whose entries each carry a 'question' field.
            test_file_path (str): Path to the testing dataset JSON file, with the same layout as the training file.
            data_choice (str): Specifies which dataset(s) to load and analyze. Valid options are 'train', 'test', or
                               'train_test', indicating whether to load training data, testing data, or both.
                               Any other value loads no questions.

        The constructor stores the paths and the choice, prepares the (empty) categorization structures, and loads
        the questions by calling the `load_data` method.
        """

        self.train_file_path = train_file_path
        self.test_file_path = test_file_path
        self.data_choice = data_choice
        self.questions = []
        self._reset_categories()
        self.load_data()

    def _reset_categories(self) -> None:
        """Starts the categorization state afresh: an empty Counter and one empty bucket per configured keyword."""

        self.question_types = Counter()
        self.Qs = {keyword: [] for keyword in config.QUESTION_KEYWORDS}

    @staticmethod
    def _read_questions(file_path: str) -> List[str]:
        """Returns the 'question' text of every entry in the 'questions' list of the JSON file at `file_path`."""

        # Explicit UTF-8 keeps decoding independent of the platform locale, matching what `export_to_csv` writes.
        with open(file_path, 'r', encoding='utf-8') as file:
            payload = json.load(file)
        return [entry['question'] for entry in payload['questions']]

    def load_data(self) -> None:
        """
        Loads the dataset(s) from the specified JSON file(s) based on the user's choice of 'train', 'test', or
        'train_test'.

        The loaded questions are appended to `self.questions` (training questions first, then testing questions).
        A `data_choice` outside the three recognized values leaves `self.questions` untouched.
        """

        if self.data_choice in ('train', 'train_test'):
            self.questions += self._read_questions(self.train_file_path)

        if self.data_choice in ('test', 'train_test'):
            self.questions += self._read_questions(self.test_file_path)

    @staticmethod
    def _match_keyword(words, keywords) -> Optional[str]:
        """
        Picks the category keyword for a question given as a list of lower-cased words.

        A question starting with the two words 'name', 'the' maps to 'name the'; otherwise the first word that is
        a member of `keywords` wins. Returns None when nothing matches.
        """

        if words[:2] == ['name', 'the']:
            return 'name the'
        return next((word for word in words if word in keywords), None)

    def categorize_questions(self) -> None:
        """
        Categorizes each question in the loaded data into predefined categories based on keywords.

        Every question is first passed through `contractions.fix`, then lower-cased and split on whitespace to find
        its keyword. Questions without any keyword are filed under "others". The method rebuilds `self.Qs` and
        `self.question_types` from scratch, so calling it repeatedly does not double-count.
        """

        # Start from a clean slate; accumulating into earlier results would inflate every count on a second call.
        self._reset_categories()

        # Membership is tested once per word, so look the keywords up in a set rather than the raw config container.
        keywords = frozenset(config.QUESTION_KEYWORDS)

        for question in self.questions:
            question = contractions.fix(question)
            category = self._match_keyword(question.lower().split(), keywords) or "others"
            self.question_types[category] += 1
            # setdefault: 'name the' and "others" are not guaranteed to be among the configured keywords.
            self.Qs.setdefault(category, []).append(question)

    def _distribution_frame(self) -> "pd.DataFrame":
        """
        Builds the table behind the chart: one row per question type with its count and percentage of the total.

        Rows are ordered by count, highest first, except that 'others' (when present) is always the last row.
        """

        total_questions = sum(self.question_types.values())

        ranked = sorted(
            ((key, count) for key, count in self.question_types.items() if key != 'others'),
            key=lambda pair: pair[1],
            reverse=True,
        )
        if 'others' in self.question_types:
            ranked.append(('others', self.question_types['others']))

        # Guard the division so a Counter holding only zero counts cannot raise ZeroDivisionError.
        rows = [
            (key, count, (count / total_questions) * 100 if total_questions else 0.0)
            for key, count in ranked
        ]
        return pd.DataFrame(rows, columns=['Question Keyword', 'Count', 'Percentage'])

    def plot_question_distribution(self) -> None:
        """
        Plots an interactive bar chart of question types using Altair and Streamlit, displaying the count and
        percentage of each type.

        The chart sorts question types by count in descending order, keeps 'others' as the last bar, and includes
        detailed tooltips for interaction. This method is intended for visualization in a Streamlit application.
        """

        df = self._distribution_frame()

        # Pass the row order explicitly so Altair does not re-sort the categorical axis.
        order = df['Question Keyword'].tolist()

        bars = alt.Chart(df).mark_bar().encode(
            x=alt.X('Question Keyword:N', sort=order, title='Question Keyword', axis=alt.Axis(labelAngle=-45)),
            y=alt.Y('Count:Q', title='Frequency'),
            color=alt.Color('Question Keyword:N', scale=alt.Scale(scheme='category20'), legend=None),
            tooltip=[alt.Tooltip('Question Keyword:N', title='Type'),
                     alt.Tooltip('Count:Q', title='Count'),
                     alt.Tooltip('Percentage:Q', title='Percentage', format='.1f')]
        )

        # Text layer placed just above each bar, labelled "<count> (<percentage>%)".
        text = bars.mark_text(
            align='center',
            baseline='bottom',
            dy=-5
        ).encode(
            text=alt.Text('PercentageText:N')
        ).transform_calculate(
            PercentageText="datum.Count + ' (' + format(datum.Percentage, '.1f') + '%)'"
        )

        chart = (bars + text).properties(
            width=700,
            height=400,
            title='Distribution of Question Keywords'
        ).configure_title(fontSize=20).configure_axis(
            labelFontSize=12,
            titleFontSize=14
        )

        st.altair_chart(chart, use_container_width=True)

    def export_to_csv(self, qs_filename: str, question_types_filename: str) -> None:
        """
        Exports the categorized questions and their counts to two separate CSV files.

        Parameters:
            qs_filename (str): The filename or path for exporting the `self.Qs` dictionary data.
            question_types_filename (str): The filename or path for exporting the `self.question_types` Counter data.

        The first file holds one row per (question type, question) pair; the second holds one row per
        (question type, count) pair. Both files are written as UTF-8 and start with a header row.
        """

        with open(qs_filename, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['Question Type', 'Questions'])
            writer.writerows(
                [q_type, question]
                for q_type, questions in self.Qs.items()
                for question in questions
            )

        with open(question_types_filename, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['Question Type', 'Count'])
            writer.writerows([q_type, count] for q_type, count in self.question_types.items())