diff --git a/pyproject.toml b/pyproject.toml index ffc245bf..005269a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,7 @@ dependencies = [ "inquirerpy>=0.3.4", "python-dotenv>=1.0.0", "tree-sitter>=0.21.0", - "tree-sitter-language-pack>=0.6.0", + "tree-sitter-language-pack>=0.13.0", "pyyaml", "pytest", "nbformat", @@ -39,7 +39,7 @@ dependencies = [ [project.optional-dependencies] parsing = [ "tree-sitter>=0.21.0", - "tree-sitter-language-pack>=0.6.0", + "tree-sitter-language-pack>=0.13.0", ] dev = [ "pytest>=7.4.0", diff --git a/src/codegraphcontext/tools/graph_builder.py b/src/codegraphcontext/tools/graph_builder.py index c3e45e81..9acf0141 100644 --- a/src/codegraphcontext/tools/graph_builder.py +++ b/src/codegraphcontext/tools/graph_builder.py @@ -101,40 +101,54 @@ def __init__(self, db_manager: DatabaseManager, job_manager: JobManager, loop: a self.job_manager = job_manager self.loop = loop self.driver = self.db_manager.get_driver() - self.parsers = { - '.py': TreeSitterParser('python'), - '.ipynb': TreeSitterParser('python'), - '.js': TreeSitterParser('javascript'), - '.jsx': TreeSitterParser('javascript'), - '.mjs': TreeSitterParser('javascript'), - '.cjs': TreeSitterParser('javascript'), - '.go': TreeSitterParser('go'), - '.ts': TreeSitterParser('typescript'), - '.tsx': TreeSitterParser('typescript'), - '.cpp': TreeSitterParser('cpp'), - '.h': TreeSitterParser('cpp'), - '.hpp': TreeSitterParser('cpp'), - '.hh': TreeSitterParser('cpp'), - '.rs': TreeSitterParser('rust'), - '.c': TreeSitterParser('c'), - # '.h': TreeSitterParser('c'), # Need to write an algo for distinguishing C vs C++ headers - '.java': TreeSitterParser('java'), - '.rb': TreeSitterParser('ruby'), - '.cs': TreeSitterParser('c_sharp'), - '.php': TreeSitterParser('php'), - '.kt': TreeSitterParser('kotlin'), - '.scala': TreeSitterParser('scala'), - '.sc': TreeSitterParser('scala'), - '.swift': TreeSitterParser('swift'), - '.hs': TreeSitterParser('haskell'), - '.dart': 
@staticmethod
def _make_parser_safe(lang: str) -> Optional['TreeSitterParser']:
    """Build a TreeSitterParser for *lang* without letting a failure propagate.

    Args:
        lang: Language name handed unchanged to ``TreeSitterParser``.

    Returns:
        The constructed parser, or ``None`` when construction raised any
        ``Exception``. In the failure case a warning naming the language
        and the error is emitted through ``warning_logger``.
    """
    try:
        parser = TreeSitterParser(lang)
    except Exception as exc:
        # Best-effort by design: report the failure and signal it with None
        # instead of raising, so the caller can decide to skip this language.
        warning_logger(f"Skipping parser for '{lang}': {exc}")
        return None
    return parser
(delegates to tree-sitter-c-sharp pkg) + # 1.x: uses "c_sharp" (native binary, no separate pkg) + # Try both names so the loader is version-agnostic. + # All other languages use a stable name matching our canonical form. if canonical_name == "c_sharp": - import tree_sitter_c_sharp - # tree_sitter_c_sharp.language() returns a PyCapsule, wrap it in Language - capsule = tree_sitter_c_sharp.language() - language = Language(capsule) + for pack_name in ("c_sharp", "csharp"): + try: + language = get_language(pack_name) + break + except Exception: + continue + else: + raise ValueError( + "Language 'c_sharp' (C#) is not available. " + "Ensure tree-sitter-language-pack>=0.13.0 is installed " + "(earlier versions do not ship the C# grammar)." + ) else: # Load the language from tree-sitter-language-pack language = get_language(canonical_name) - + self._language_cache[canonical_name] = language return language - except (KeyError, ModuleNotFoundError): + except ValueError: + raise # pass through ValueError unchanged (ours or pack's LanguageNotFoundError) + except (KeyError, LookupError, ModuleNotFoundError) as e: raise ValueError( - f"Language '{canonical_name}' is not available in tree-sitter-language-pack. " - f"This may be due to a missing or experimental grammar." + f"Language '{canonical_name}' is not available. " + f"Ensure tree-sitter-language-pack>=0.13.0 is installed. " + f"Error: {e}" ) except Exception as e: raise Exception(