m7mdal7aj committed on
Commit
d2441b9
1 Parent(s): e2b7078

Update my_model/utilities/gen_utilities.py

Files changed (1)
  1. my_model/utilities/gen_utilities.py +0 -164
my_model/utilities/gen_utilities.py CHANGED
@@ -1,6 +1,4 @@
  import pandas as pd
- from collections import Counter
- import json
  import os
  from PIL import Image
  import numpy as np
@@ -12,160 +10,6 @@ import gc
  import streamlit as st
 
 
- class VQADataProcessor:
-     """
-     A class to process OKVQA dataset.
-
-     Attributes:
-         questions_file_path (str): The file path for the questions JSON file.
-         annotations_file_path (str): The file path for the annotations JSON file.
-         questions (list): List of questions extracted from the JSON file.
-         annotations (list): List of annotations extracted from the JSON file.
-         df_questions (DataFrame): DataFrame created from the questions list.
-         df_answers (DataFrame): DataFrame created from the annotations list.
-         merged_df (DataFrame): DataFrame resulting from merging questions and answers.
-     """
-
-     def __init__(self, questions_file_path, annotations_file_path):
-         """
-         Initializes the VQADataProcessor with file paths for questions and annotations.
-
-         Parameters:
-             questions_file_path (str): The file path for the questions JSON file.
-             annotations_file_path (str): The file path for the annotations JSON file.
-         """
-         self.questions_file_path = questions_file_path
-         self.annotations_file_path = annotations_file_path
-         self.questions, self.annotations = self.read_json_files()
-         self.df_questions = pd.DataFrame(self.questions)
-         self.df_answers = pd.DataFrame(self.annotations)
-         self.merged_df = None
-
-     def read_json_files(self):
-         """
-         Reads the JSON files for questions and annotations.
-
-         Returns:
-             tuple: A tuple containing two lists: questions and annotations.
-         """
-         with open(self.questions_file_path, 'r') as file:
-             data = json.load(file)
-             questions = data['questions']
-
-         with open(self.annotations_file_path, 'r') as file:
-             data = json.load(file)
-             annotations = data['annotations']
-
-         return questions, annotations
-
-     @staticmethod
-     def find_most_frequent(my_list):
-         """
-         Finds the most frequent item in a list.
-
-         Parameters:
-             my_list (list): A list of items.
-
-         Returns:
-             The most frequent item in the list. Returns None if the list is empty.
-         """
-         if not my_list:
-             return None
-         counter = Counter(my_list)
-         most_common = counter.most_common(1)
-         return most_common[0][0]
-
-     def merge_dataframes(self):
-         """
-         Merges the questions and answers DataFrames on 'question_id' and 'image_id'.
-         """
-         self.merged_df = pd.merge(self.df_questions, self.df_answers, on=['question_id', 'image_id'])
-
-     def join_words_with_hyphen(self, sentence):
-
-         return '-'.join(sentence.split())
-
-     def process_answers(self):
-         """
-         Processes the answers by extracting raw and processed answers and finding the most frequent ones.
-         """
-         if self.merged_df is not None:
-             self.merged_df['raw_answers'] = self.merged_df['answers'].apply(lambda x: [ans['raw_answer'] for ans in x])
-             self.merged_df['processed_answers'] = self.merged_df['answers'].apply(
-                 lambda x: [ans['answer'] for ans in x])
-             self.merged_df['most_frequent_raw_answer'] = self.merged_df['raw_answers'].apply(self.find_most_frequent)
-             self.merged_df['most_frequent_processed_answer'] = self.merged_df['processed_answers'].apply(
-                 self.find_most_frequent)
-             self.merged_df.drop(columns=['answers'], inplace=True)
-         else:
-             print("DataFrames have not been merged yet.")
-
-         # Apply the function to the 'most_frequent_processed_answer' column
-         self.merged_df['single_word_answers'] = self.merged_df['most_frequent_processed_answer'].apply(
-             self.join_words_with_hyphen)
-
-     def get_processed_data(self):
-         """
-         Retrieves the processed DataFrame.
-
-         Returns:
-             DataFrame: The processed DataFrame. Returns None if the DataFrame is empty or not processed.
-         """
-         if self.merged_df is not None:
-             return self.merged_df
-         else:
-             print("DataFrame is empty or not processed yet.")
-             return None
-
-     def save_to_csv(self, df, saved_file_name):
-
-         if saved_file_name is not None:
-             if ".csv" not in saved_file_name:
-                 df.to_csv(os.path.join(saved_file_name, ".csv"), index=None)
-
-             else:
-                 df.to_csv(saved_file_name, index=None)
-
-         else:
-             df.to_csv("data.csv", index=None)
-
-     def display_dataframe(self):
-         """
-         Displays the processed DataFrame.
-         """
-         if self.merged_df is not None:
-             print(self.merged_df)
-         else:
-             print("DataFrame is empty.")
-
-
- def process_okvqa_dataset(questions_file_path, annotations_file_path, save_to_csv=False, saved_file_name=None):
-     """
-     Processes the OK-VQA dataset given the file paths for questions and annotations.
-
-     Parameters:
-         questions_file_path (str): The file path for the questions JSON file.
-         annotations_file_path (str): The file path for the annotations JSON file.
-
-     Returns:
-         DataFrame: The processed DataFrame containing merged and processed VQA data.
-     """
-     # Create an instance of the class
-     processor = VQADataProcessor(questions_file_path, annotations_file_path)
-
-     # Process the data
-     processor.merge_dataframes()
-     processor.process_answers()
-
-     # Retrieve the processed DataFrame
-     processed_data = processor.get_processed_data()
-
-     if save_to_csv:
-         processor.save_to_csv(processed_data, saved_file_name)
-
-     return processed_data
-
-
  def show_image(image):
      """
      Display an image in various environments (Jupyter, PyCharm, Hugging Face Spaces).
@@ -307,11 +151,3 @@ def free_gpu_resources():
      gc.collect()
      gc.collect()
 
-
-
-
-
- if __name__ == "__main__":
-     pass
-     #val_data = process_okvqa_dataset('OpenEnded_mscoco_val2014_questions.json', 'mscoco_val2014_annotations.json', save_to_csv=True, saved_file_name="okvqa_val.csv")
-     #train_data = process_okvqa_dataset('OpenEnded_mscoco_train2014_questions.json', 'mscoco_train2014_annotations.json', save_to_csv=True, saved_file_name="okvqa_train.csv")
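For reference, the deleted __main__ block above shows how the removed entry point was invoked. Below is a minimal usage sketch that simply reproduces those commented-out calls; it assumes the removed process_okvqa_dataset / VQADataProcessor code is still available somewhere and that the OK-VQA question and annotation JSON files exist at these paths.

    # Sketch only: mirrors the commented-out calls from the deleted __main__ block.
    # Assumes the removed definitions are importable and the JSON files are local.
    val_data = process_okvqa_dataset(
        'OpenEnded_mscoco_val2014_questions.json',
        'mscoco_val2014_annotations.json',
        save_to_csv=True, saved_file_name="okvqa_val.csv")
    train_data = process_okvqa_dataset(
        'OpenEnded_mscoco_train2014_questions.json',
        'mscoco_train2014_annotations.json',
        save_to_csv=True, saved_file_name="okvqa_train.csv")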
 
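The removed answer-processing step reduced each question's list of annotator answers to its most frequent entry via collections.Counter, and hyphen-joined multi-word answers into a single token. A small self-contained sketch of that aggregation, using hypothetical example data:

    from collections import Counter

    def find_most_frequent(my_list):
        # Same logic as the removed VQADataProcessor.find_most_frequent:
        # return the most common item, or None for an empty list.
        if not my_list:
            return None
        return Counter(my_list).most_common(1)[0][0]

    # Hypothetical OK-VQA-style annotator answers for one question.
    answers = ['bicycle', 'bike', 'bicycle', 'bicycle', 'cycle']
    print(find_most_frequent(answers))        # -> bicycle
    print('-'.join('mountain bike'.split()))  # join_words_with_hyphen equivalent -> mountain-bike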