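"""Utility functions for repository impact analysis.

Clones a git repository, reads its source files, splits them into
token-bounded chunks, and asks an OpenAI or Anthropic model to assess
the impact of a proposed code/configuration change.
"""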
import os
import git
from pathlib import Path
from openai import OpenAI
from anthropic import Anthropic
from dotenv import load_dotenv
from pydantic_model import ImpactAnalysis
import tiktoken
import json
from typing import List, Tuple, Dict, Any

# Load environment variables
load_dotenv()

# Initialize API clients
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
anthropic_client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

def clone_repository(repo_url, temp_dir):
    """Clone a git repository to a temporary directory."""
    try:
        git.Repo.clone_from(repo_url, temp_dir)
        return True, None
    except Exception as e:
        return False, str(e)

def read_code_files(directory):
    """Read all code files from the directory."""
    code_files = []
    code_extensions = {'.py', '.js', '.jsx', '.ts', '.tsx', '.java', '.cpp', '.c', '.cs', '.go', '.rb', '.php', '.cls', '.object', '.page'}
    warnings = []
    for root, _, files in os.walk(directory):
        for file in files:
            if Path(file).suffix in code_extensions:
                file_path = os.path.join(root, file)
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        content = f.read()
                    relative_path = os.path.relpath(file_path, directory)
                    code_files.append({
                        'path': relative_path,
                        'content': content
                    })
                except Exception as e:
                    warnings.append(f"Could not read file {file_path}: {str(e)}")
    return code_files, warnings

def count_tokens(text: str, model: str = "gpt-4") -> int:
    """Count the number of tokens in a text string."""
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken only knows OpenAI model names; fall back to a general-purpose
        # encoding for other selectors (e.g. the Claude model key).
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))

def chunk_files(code_files: List[Dict[str, str]], model: str = "gpt-4", max_tokens: int = 120000) -> List[List[Dict[str, str]]]:
    """Split files into chunks that fit within the context window."""
    chunks = []
    current_chunk = []
    current_tokens = 0
    for file in code_files:
        file_content = f"File: {file['path']}\nContent:\n{file['content']}\n"
        file_tokens = count_tokens(file_content, model)
        # If a single file is larger than max_tokens, skip it
        if file_tokens > max_tokens:
            print(f"Warning: File {file['path']} is too large ({file_tokens} tokens) and will be skipped")
            continue
        # If adding this file would exceed max_tokens, start a new chunk
        if current_tokens + file_tokens > max_tokens:
            if current_chunk:  # Only add non-empty chunks
                chunks.append(current_chunk)
            current_chunk = [file]
            current_tokens = file_tokens
        else:
            current_chunk.append(file)
            current_tokens += file_tokens
    # Add the last chunk if it's not empty
    if current_chunk:
        chunks.append(current_chunk)
    return chunks

def analyze_code_chunk(chunk: List[Dict[str, str]], prompt: str, model: str) -> Tuple[str, str]:
    """Analyze a chunk of code files."""
    try:
        # Prepare the context from the chunk
        context = "Here are the relevant code files:\n\n"
        for file in chunk:
            context += f"File: {file['path']}\n```\n{file['content']}\n```\n"
        if model == "gpt-4":
            # The "gpt-4" selector is the UI-facing key; the API call below uses gpt-4o.
            json_schema = ImpactAnalysis.model_json_schema()
            messages = [
                {"role": "system", "content": "You are a code analysis expert. Analyze the provided code based on the user's prompt."},
                {"role": "user", "content": f"Please check the impact of the code/configuration changes below on the provided codebase. Provide only the summary of the impact in a table with aggregate analysis that outputs a JSON object with the following schema: {json_schema}. Please note: Do not add the characters ```json anywhere in the response. Do not respond with messages like 'Here is the response in the required JSON format:'.\n\nCode or configuration changes: {prompt}\n\n{context}"}
            ]
            response = openai_client.chat.completions.create(
                model="gpt-4o",
                messages=messages,
                temperature=0.7,
                max_tokens=2000
            )
            return response.choices[0].message.content, ""
        else:
            # Keep original Claude implementation
            system_message = "You are a code analysis expert. Analyze the provided code based on the user's prompt."
            user_message = f"Please check the impact of the code/configuration changes below on the provided codebase. Provide only the summary of the impact in a table with aggregate analysis that includes 1) List of files impacted. 2) Number of files impacted. 3) Impact detail on each file impacted. Surface a 'Severity Level' at the top of the table with possible values: Low, Medium, High based on the 'Number of impacted files'. E.g. if 'Number of impacted files' is between 1 and 3 then LOW, if between 4 and 8 then MEDIUM, if greater than 8 then HIGH.\n\nCode or configuration changes: {prompt}\n\n{context}"
            response = anthropic_client.messages.create(
                model="claude-3-7-sonnet-20250219",
                max_tokens=2000,
                temperature=0.7,
                system=system_message,
                messages=[{"role": "user", "content": user_message}]
            )
            return response.content[0].text, ""
    except Exception as e:
        return "", str(e)

def analyze_code(code_files: List[Dict[str, str]], prompt: str, model: str) -> Tuple[str, str]:
    """Analyze code files with chunking to handle large codebases."""
    try:
        # Split files into chunks
        chunks = chunk_files(code_files, model)
        if not chunks:
            return "", "No valid files to analyze"
        # Analyze each chunk
        all_analyses = []
        for i, chunk in enumerate(chunks):
            analysis, error = analyze_code_chunk(chunk, prompt, model)
            if error:
                return "", f"Error analyzing chunk {i+1}: {error}"
            if analysis:
                all_analyses.append(analysis)
        if not all_analyses:
            return "", "No analysis results generated"
        # Combine results from all chunks
        combined_analysis = {
            "severity_level": "LOW",  # Default to lowest severity
            "number_of_files_impacted": 0,
            "files_impacted": []
        }
        # Merge results from all chunks
        for analysis in all_analyses:
            try:
                chunk_data = json.loads(analysis)
                combined_analysis["number_of_files_impacted"] += chunk_data.get("number_of_files_impacted", 0)
                combined_analysis["files_impacted"].extend(chunk_data.get("files_impacted", []))
                # Update severity level based on the highest severity found
                severity_map = {"LOW": 1, "MEDIUM": 2, "HIGH": 3}
                current_severity = severity_map.get(combined_analysis["severity_level"], 0)
                chunk_severity = severity_map.get(chunk_data.get("severity_level", "LOW"), 0)
                if chunk_severity > current_severity:
                    combined_analysis["severity_level"] = chunk_data["severity_level"]
            except json.JSONDecodeError:
                continue
        return json.dumps(combined_analysis), ""
    except Exception as e:
        return "", str(e)

def check_api_keys():
    """Check if required API keys are set."""
    openai_key = os.getenv("OPENAI_API_KEY") is not None
    anthropic_key = os.getenv("ANTHROPIC_API_KEY") is not None
    return {
        "gpt-4": openai_key,
        "claude-sonnet": anthropic_key
    }
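

# A minimal usage sketch showing how the helpers above chain together: clone a
# repository into a temporary directory, read its files, then run an impact
# analysis. The repository URL and change description are placeholders, not
# values from this project.
if __name__ == "__main__":
    import tempfile

    with tempfile.TemporaryDirectory() as temp_dir:
        ok, clone_error = clone_repository("https://github.com/example/repo.git", temp_dir)
        if not ok:
            raise SystemExit(f"Clone failed: {clone_error}")
        files, read_warnings = read_code_files(temp_dir)
        for warning in read_warnings:
            print(warning)
        result, analysis_error = analyze_code(files, "Rename the field 'status' to 'state'", "gpt-4")
        print(result or f"Analysis failed: {analysis_error}")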