"""Ask a Gemini model which dataset columns need special preprocessing.

The module samples a few values from every column of a CSV file, sends them
to the model inside a fixed prompt, and parses the structured reply.
"""

import ast
import os

import google.generativeai as genai
import pandas as pd
from dotenv import load_dotenv

# Populate os.environ from a local .env file before any os.getenv() lookup.
load_dotenv()

# Delimiters the prompt instructs the model to emit. The parser relies on
# them, so they must stay in sync with the prompt text in _build_prompt().
_FENCE_OPEN = "```python"
_FENCE = "```"
_USED_MARKER = "# Array of columns used in the dictionary"
_EMBED_MARKER = "# Array of columns that require the use of word embeddings"


class _ReplyFormatError(ValueError, IndexError):
    """The model reply does not contain the expected delimiters.

    Also derives from IndexError because the previous list-indexing parser
    raised IndexError in the same situation; handlers written against it
    keep working.
    """


def extract_column_samples(df, n=5):
    """Return the first *n* values of every column of *df*.

    Args:
        df: DataFrame to sample.
        n: Number of leading rows taken from each column.

    Returns:
        A dict mapping each column label to a list of its first *n* values
        (fewer when the frame has fewer rows).
    """
    return {col: df[col].head(n).tolist() for col in df.columns}


def _build_prompt(samples):
    """Return the instruction prompt with *samples* embedded at the end."""
    return (
        "You are a data analyst. I will give you a dictionary containing column names with example values from a dataset.\n\n"
        "Your task is to:\n"
        "1. Identify columns where one-hot encoding is *not suitable*.\n"
        "2. For each of these, determine if it requires:\n"
        " - feature extraction (e.g., from datetime or strings), or\n"
        " - use of word embeddings (e.g., for free text or high-cardinality text).\n\n"
        "For feature extraction columns:\n"
        "- Create a **Python dictionary** where:\n"
        " * Each key is a new, meaningful column name.\n"
        " * Each value is a **valid Pandas expression string** that derives the new column from the original `df` DataFrame.\n"
        "- Also return a **Python list** of original column names that were used in this dictionary.\n\n"
        "For columns requiring word embeddings:\n"
        "- Return a separate **Python list** of these column names.\n"
        "- If any column appears in both cases, include it *only* in the word embedding list.\n\n"
        "Your output **must follow this exact format** with no additional explanation or markdown. Only return the following inside a single Python code block:\n"
        "```python\n"
        "# Dictionary of transformations\n"
        "{'new_col1': \"some pandas expression\", 'new_col2': \"some other pandas expression\"}\n\n"
        "# Array of columns used in the dictionary\n"
        "['col1', 'col2']\n"
        "# Array of columns that require the use of word embeddings\n"
        "['col3', 'col4']\n"
        "```\n\n"
        "**DO NOT** include any explanation, reasoning, extra code, or markdown outside of the code block. Only return the exact format shown above. \n"
        "Do not generate or describe functions.\n\n"
        f"Here is the input :\n{samples}\n"
    )


def _parse_model_reply(reply_text):
    """Split the model reply into its three literals and evaluate them.

    Args:
        reply_text: Raw text returned by the model.

    Returns:
        A 3-tuple with the evaluated literal found (1) between the opening
        fence and the "columns used" marker, (2) between the two markers,
        and (3) after the "word embeddings" marker.

    Raises:
        _ReplyFormatError: If the opening fence or a marker is missing.
        ValueError, SyntaxError: If a section is not a valid Python literal
            (raised by ast.literal_eval).
    """
    _, fence, fenced = reply_text.partition(_FENCE_OPEN)
    if not fence:
        raise _ReplyFormatError(
            f"model reply has no {_FENCE_OPEN!r} code fence"
        )

    dict_src, used_marker, remainder = fenced.partition(_USED_MARKER)
    if not used_marker:
        raise _ReplyFormatError(
            f"model reply is missing the marker {_USED_MARKER!r}"
        )

    used_src, embed_marker, embed_src = remainder.partition(_EMBED_MARKER)
    if not embed_marker:
        raise _ReplyFormatError(
            f"model reply is missing the marker {_EMBED_MARKER!r}"
        )

    # Only the last section can still contain the closing fence.
    embed_src = embed_src.replace(_FENCE, "")

    # literal_eval accepts literals only, so model output is never executed.
    return (
        ast.literal_eval(dict_src.strip()),
        ast.literal_eval(used_src.strip()),
        ast.literal_eval(embed_src.strip()),
    )


def getCodes(query):
    """Ask the model how to preprocess the columns of ``final/<query>.csv``.

    Reads the CSV, samples the first values of every column, sends them to
    the "gemini-2.0-flash" model, prints the raw reply to stdout, and parses
    it.

    Args:
        query: Base name (without extension) of the CSV inside ``final/``.

    Returns:
        A 3-tuple ``(preprocessing_code, actual_list, nlp)``. Per the prompt
        these are expected to be a dict of new column name -> pandas
        expression string, a list of the source columns used by that dict,
        and a list of columns that need word embeddings. The shapes are
        whatever the model returned; they are not validated here.

    Raises:
        _ReplyFormatError: If the reply lacks the expected fence or markers.
        ValueError, SyntaxError: If a reply section is not a Python literal.

    NOTE(review): the returned expressions are model-generated text; treat
    them as untrusted if they are evaluated downstream.
    """
    df = pd.read_csv(f"final/{query}.csv")
    prompt = _build_prompt(extract_column_samples(df))

    genai.configure(api_key=os.getenv("gemini_api"))
    model = genai.GenerativeModel("gemini-2.0-flash")
    response = model.generate_content(prompt)

    reply_text = response.text.strip()
    print(reply_text)

    preprocessing_code, actual_list, nlp = _parse_model_reply(reply_text)
    return preprocessing_code, actual_list, nlp