# code-chunker/CodeParser.py
import os
import subprocess
import logging
from typing import Dict, List, Tuple, Union

from tree_sitter import Language, Parser, Node


def return_simple_line_numbers_with_code(code: str) -> str:
    """Prefix every line of the given code with a human-readable, 1-based line number."""
    code_lines = code.split('\n')
    code_with_line_numbers = [f"Line {i + 1}: {line}" for i, line in enumerate(code_lines)]
    joined_lines = "\n".join(code_with_line_numbers)
    return joined_lines
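# Illustrative example (assumed input, shown only for clarity):
#   return_simple_line_numbers_with_code("a = 1\nb = 2")
#   -> "Line 1: a = 1\nLine 2: b = 2"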


class CodeParser:
    """Build tree-sitter parsers for the configured languages and expose helpers for
    extracting structural points of interest (imports, classes, functions, comments)."""

    # Added a CACHE_DIR class attribute for caching
    CACHE_DIR = os.path.expanduser("~/.code_parser_cache")

    def __init__(self, file_extensions: Union[None, List[str], str] = None):
        if isinstance(file_extensions, str):
            file_extensions = [file_extensions]
        self.language_extension_map = {
            "py": "python",
            "js": "javascript",
            "jsx": "javascript",
            "css": "css",
            "ts": "typescript",
            "tsx": "typescript",
            "php": "php",
            "rb": "ruby"
        }
        if file_extensions is None:
            self.language_names = []
        else:
            self.language_names = [self.language_extension_map.get(ext) for ext in file_extensions
                                   if ext in self.language_extension_map]
        self.languages = {}
        self._install_parsers()

    def _install_parsers(self):
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
        try:
            # Ensure cache directory exists
            if not os.path.exists(self.CACHE_DIR):
                os.makedirs(self.CACHE_DIR)

            for language in self.language_names:
                repo_path = os.path.join(self.CACHE_DIR, f"tree-sitter-{language}")

                # Check if the repository exists and contains necessary files
                if not os.path.exists(repo_path) or not self._is_repo_valid(repo_path, language):
                    try:
                        if os.path.exists(repo_path):
                            logging.info(f"Updating existing repository for {language}")
                            update_command = f"cd {repo_path} && git pull"
                            subprocess.run(update_command, shell=True, check=True)
                        else:
                            logging.info(f"Cloning repository for {language}")
                            clone_command = f"git clone https://github.com/tree-sitter/tree-sitter-{language} {repo_path}"
                            subprocess.run(clone_command, shell=True, check=True)
                    except subprocess.CalledProcessError as e:
                        logging.error(f"Failed to clone/update repository for {language}. Error: {e}")
                        continue
                try:
                    build_path = os.path.join(self.CACHE_DIR, f"build/{language}.so")

                    # Special handling for TypeScript: its repository ships separate
                    # TypeScript and TSX grammars that are built together.
                    if language == 'typescript':
                        ts_dir = os.path.join(repo_path, 'typescript')
                        tsx_dir = os.path.join(repo_path, 'tsx')
                        if os.path.exists(ts_dir) and os.path.exists(tsx_dir):
                            Language.build_library(build_path, [ts_dir, tsx_dir])
                        else:
                            raise FileNotFoundError(f"TypeScript or TSX directory not found in {repo_path}")
                    elif language == 'php':
                        # The PHP grammar lives in the php/ subdirectory of its repository.
                        php_dir = os.path.join(repo_path, 'php')
                        Language.build_library(build_path, [php_dir])
                    else:
                        Language.build_library(build_path, [repo_path])

                    self.languages[language] = Language(build_path, language)
                    logging.info(f"Successfully built and loaded {language} parser")
                except Exception as e:
                    logging.error(f"Failed to build or load language {language}. Error: {str(e)}")
        except Exception as e:
            logging.error(f"An unexpected error occurred during parser installation: {str(e)}")

    def _is_repo_valid(self, repo_path: str, language: str) -> bool:
        """Check if the repository contains necessary files."""
        if language == 'typescript':
            return (os.path.exists(os.path.join(repo_path, 'typescript', 'src', 'parser.c')) and
                    os.path.exists(os.path.join(repo_path, 'tsx', 'src', 'parser.c')))
        elif language == 'php':
            return os.path.exists(os.path.join(repo_path, 'php', 'src', 'parser.c'))
        else:
            return os.path.exists(os.path.join(repo_path, 'src', 'parser.c'))

    def parse_code(self, code: str, file_extension: str) -> Union[None, Node]:
        language_name = self.language_extension_map.get(file_extension)
        if language_name is None:
            print(f"Unsupported file type: {file_extension}")
            return None

        language = self.languages.get(language_name)
        if language is None:
            print("Language parser not found")
            return None

        parser = Parser()
        parser.set_language(language)
        tree = parser.parse(bytes(code, "utf8"))

        if tree is None:
            print("Failed to parse the code")
            return None

        return tree.root_node

    def extract_points_of_interest(self, node: Node, file_extension: str) -> List[Tuple[Node, str]]:
        node_types_of_interest = self._get_node_types_of_interest(file_extension)

        points_of_interest = []
        if node.type in node_types_of_interest.keys():
            points_of_interest.append((node, node_types_of_interest[node.type]))

        for child in node.children:
            points_of_interest.extend(self.extract_points_of_interest(child, file_extension))

        return points_of_interest

    def _get_node_types_of_interest(self, file_extension: str) -> Dict[str, str]:
        node_types = {
            'py': {
                'import_statement': 'Import',
                'export_statement': 'Export',
                'class_definition': 'Class',
                'function_definition': 'Function',
            },
            'css': {
                'tag_name': 'Tag',
                '@media': 'Media Query',
            },
            'js': {
                'import_statement': 'Import',
                'export_statement': 'Export',
                'class_declaration': 'Class',
                'function_declaration': 'Function',
                'arrow_function': 'Arrow Function',
                'statement_block': 'Block',
            },
            'ts': {
                'import_statement': 'Import',
                'export_statement': 'Export',
                'class_declaration': 'Class',
                'function_declaration': 'Function',
                'arrow_function': 'Arrow Function',
                'statement_block': 'Block',
                'interface_declaration': 'Interface',
                'type_alias_declaration': 'Type Alias',
            },
            'php': {
                'namespace_definition': 'Namespace',
                'class_declaration': 'Class',
                'method_declaration': 'Method',
                'function_definition': 'Function',
                'interface_declaration': 'Interface',
                'trait_declaration': 'Trait',
            },
            'rb': {
                'class': 'Class',
                'method': 'Method',
                'module': 'Module',
                'singleton_class': 'Singleton Class',
                'begin': 'Begin Block',
            }
        }

        if file_extension in node_types.keys():
            return node_types[file_extension]
        elif file_extension == "jsx":
            return node_types["js"]
        elif file_extension == "tsx":
            return node_types["ts"]
        else:
            raise ValueError("Unsupported file type")

    def _get_nodes_for_comments(self, file_extension: str) -> Dict[str, str]:
        node_types = {
            'py': {
                'comment': 'Comment',
                'decorator': 'Decorator',  # Broadened category
            },
            'css': {
                'comment': 'Comment'
            },
            'js': {
                'comment': 'Comment',
                'decorator': 'Decorator',  # Broadened category
            },
            'ts': {
                'comment': 'Comment',
                'decorator': 'Decorator',
            },
            'php': {
                'comment': 'Comment',
                'attribute': 'Attribute',
            },
            'rb': {
                'comment': 'Comment',
            }
        }

        if file_extension in node_types.keys():
            return node_types[file_extension]
        elif file_extension == "jsx":
            return node_types["js"]
        elif file_extension == "tsx":
            # Mirror _get_node_types_of_interest: tsx falls back to the ts mapping.
            return node_types["ts"]
        else:
            raise ValueError("Unsupported file type")

    def extract_comments(self, node: Node, file_extension: str) -> List[Tuple[Node, str]]:
        node_types_of_interest = self._get_nodes_for_comments(file_extension)

        comments = []
        if node.type in node_types_of_interest:
            comments.append((node, node_types_of_interest[node.type]))

        for child in node.children:
            comments.extend(self.extract_comments(child, file_extension))

        return comments

    def get_lines_for_points_of_interest(self, code: str, file_extension: str) -> List[int]:
        language_name = self.language_extension_map.get(file_extension)
        if language_name is None:
            raise ValueError("Unsupported file type")

        language = self.languages.get(language_name)
        if language is None:
            raise ValueError("Language parser not found")

        parser = Parser()
        parser.set_language(language)
        tree = parser.parse(bytes(code, "utf8"))
        root_node = tree.root_node

        points_of_interest = self.extract_points_of_interest(root_node, file_extension)

        line_numbers_with_type_of_interest = {}
        for node, type_of_interest in points_of_interest:
            start_line = node.start_point[0]
            if type_of_interest not in line_numbers_with_type_of_interest:
                line_numbers_with_type_of_interest[type_of_interest] = []
            if start_line not in line_numbers_with_type_of_interest[type_of_interest]:
                line_numbers_with_type_of_interest[type_of_interest].append(start_line)

        lines_of_interest = []
        for _, line_numbers in line_numbers_with_type_of_interest.items():
            lines_of_interest.extend(line_numbers)

        return lines_of_interest

    def get_lines_for_comments(self, code: str, file_extension: str) -> List[int]:
        language_name = self.language_extension_map.get(file_extension)
        if language_name is None:
            raise ValueError("Unsupported file type")

        language = self.languages.get(language_name)
        if language is None:
            raise ValueError("Language parser not found")

        parser = Parser()
        parser.set_language(language)
        tree = parser.parse(bytes(code, "utf8"))
        root_node = tree.root_node

        comments = self.extract_comments(root_node, file_extension)

        line_numbers_with_comments = {}
        for node, type_of_interest in comments:
            start_line = node.start_point[0]
            if type_of_interest not in line_numbers_with_comments:
                line_numbers_with_comments[type_of_interest] = []
            if start_line not in line_numbers_with_comments[type_of_interest]:
                line_numbers_with_comments[type_of_interest].append(start_line)

        lines_of_interest = []
        for _, line_numbers in line_numbers_with_comments.items():
            lines_of_interest.extend(line_numbers)

        return lines_of_interest

    def print_all_line_types(self, code: str, file_extension: str):
        language_name = self.language_extension_map.get(file_extension)
        if language_name is None:
            print(f"Unsupported file type: {file_extension}")
            return

        language = self.languages.get(language_name)
        if language is None:
            print("Language parser not found")
            return

        parser = Parser()
        parser.set_language(language)
        tree = parser.parse(bytes(code, "utf8"))
        root_node = tree.root_node

        line_to_node_type = self.map_line_to_node_type(root_node)

        code_lines = code.split('\n')
        for line_num, node_types in line_to_node_type.items():
            line_content = code_lines[line_num - 1]  # Adjusting index for zero-based indexing
            print(f"line {line_num}: {', '.join(node_types)} | Code: {line_content}")

    def map_line_to_node_type(self, node, line_to_node_type=None, depth=0):
        if line_to_node_type is None:
            line_to_node_type = {}

        start_line = node.start_point[0] + 1  # Tree-sitter lines are 0-indexed; converting to 1-indexed

        # Only add the node type if it's the start line of the node
        if start_line not in line_to_node_type:
            line_to_node_type[start_line] = []
        line_to_node_type[start_line].append(node.type)

        for child in node.children:
            self.map_line_to_node_type(child, line_to_node_type, depth + 1)

        return line_to_node_type
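

# Minimal usage sketch (an assumption, not part of the library): it needs network
# access on first run so the tree-sitter-python grammar can be cloned and built
# into ~/.code_parser_cache. The sample snippet below is hypothetical.
if __name__ == "__main__":
    sample_code = (
        "import os\n"
        "\n"
        "def greet(name):\n"
        "    # say hello\n"
        "    return f'Hello, {name}!'\n"
    )
    parser = CodeParser(file_extensions=["py"])
    print(return_simple_line_numbers_with_code(sample_code))
    # Both helpers return 0-indexed line numbers (tree-sitter start_point rows).
    print("Points of interest:", parser.get_lines_for_points_of_interest(sample_code, "py"))
    print("Comment lines:", parser.get_lines_for_comments(sample_code, "py"))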