Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ dependencies = [
"inquirerpy>=0.3.4",
"python-dotenv>=1.0.0",
"tree-sitter>=0.21.0",
"tree-sitter-language-pack>=0.6.0",
"tree-sitter-language-pack>=0.13.0",
"pyyaml",
"pytest",
"nbformat",
Expand All @@ -39,7 +39,7 @@ dependencies = [
[project.optional-dependencies]
parsing = [
"tree-sitter>=0.21.0",
"tree-sitter-language-pack>=0.6.0",
"tree-sitter-language-pack>=0.13.0",
]
dev = [
"pytest>=7.4.0",
Expand Down
76 changes: 45 additions & 31 deletions src/codegraphcontext/tools/graph_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,40 +101,54 @@ def __init__(self, db_manager: DatabaseManager, job_manager: JobManager, loop: a
self.job_manager = job_manager
self.loop = loop
self.driver = self.db_manager.get_driver()
self.parsers = {
'.py': TreeSitterParser('python'),
'.ipynb': TreeSitterParser('python'),
'.js': TreeSitterParser('javascript'),
'.jsx': TreeSitterParser('javascript'),
'.mjs': TreeSitterParser('javascript'),
'.cjs': TreeSitterParser('javascript'),
'.go': TreeSitterParser('go'),
'.ts': TreeSitterParser('typescript'),
'.tsx': TreeSitterParser('typescript'),
'.cpp': TreeSitterParser('cpp'),
'.h': TreeSitterParser('cpp'),
'.hpp': TreeSitterParser('cpp'),
'.hh': TreeSitterParser('cpp'),
'.rs': TreeSitterParser('rust'),
'.c': TreeSitterParser('c'),
# '.h': TreeSitterParser('c'), # Need to write an algo for distinguishing C vs C++ headers
'.java': TreeSitterParser('java'),
'.rb': TreeSitterParser('ruby'),
'.cs': TreeSitterParser('c_sharp'),
'.php': TreeSitterParser('php'),
'.kt': TreeSitterParser('kotlin'),
'.scala': TreeSitterParser('scala'),
'.sc': TreeSitterParser('scala'),
'.swift': TreeSitterParser('swift'),
'.hs': TreeSitterParser('haskell'),
'.dart': TreeSitterParser('dart'),
'.pl': TreeSitterParser('perl'),
'.pm': TreeSitterParser('perl'),
'.ex': TreeSitterParser('elixir'),
'.exs': TreeSitterParser('elixir'),
raw_parsers = {
'.py': 'python',
'.ipynb': 'python',
'.js': 'javascript',
'.jsx': 'javascript',
'.mjs': 'javascript',
'.cjs': 'javascript',
'.go': 'go',
'.ts': 'typescript',
'.tsx': 'typescript',
'.cpp': 'cpp',
'.h': 'cpp',
'.hpp': 'cpp',
'.hh': 'cpp',
'.rs': 'rust',
'.c': 'c',
# '.h': 'c', # Need to write an algo for distinguishing C vs C++ headers
'.java': 'java',
'.rb': 'ruby',
'.cs': 'c_sharp',
'.php': 'php',
'.kt': 'kotlin',
'.scala': 'scala',
'.sc': 'scala',
'.swift': 'swift',
'.hs': 'haskell',
'.dart': 'dart',
'.pl': 'perl',
'.pm': 'perl',
'.ex': 'elixir',
'.exs': 'elixir',
}
self.parsers = {}
for ext, lang in raw_parsers.items():
parser = self._make_parser_safe(lang)
if parser is not None:
self.parsers[ext] = parser
self.create_schema()

@staticmethod
def _make_parser_safe(lang: str) -> Optional['TreeSitterParser']:
    """Build a TreeSitterParser for *lang* without letting a failure propagate.

    Args:
        lang: Language name handed directly to ``TreeSitterParser``.

    Returns:
        The constructed parser, or ``None`` when construction raised; in that
        case a warning naming the language and the error is logged.
    """
    parser = None
    try:
        parser = TreeSitterParser(lang)
    except Exception as e:
        # Best-effort: one unavailable grammar must not abort the caller,
        # so the failure is reported and None is returned instead.
        warning_logger(f"Skipping parser for '{lang}': {e}")
    return parser

# A general schema creation based on common features across languages
def create_schema(self):
"""Create constraints and indexes in Neo4j."""
Expand Down
33 changes: 24 additions & 9 deletions src/codegraphcontext/utils/tree_sitter_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,22 +128,37 @@ def get_language_safe(self, lang: str) -> Language:
return self._language_cache[canonical_name]

try:
# Special handling for C# which is available as tree_sitter_c_sharp
# tree-sitter-language-pack changed its C# key across major versions:
# 0.x (>=0.13.0): uses "csharp" (delegates to tree-sitter-c-sharp pkg)
# 1.x: uses "c_sharp" (native binary, no separate pkg)
# Try both names so the loader is version-agnostic.
# All other languages use a stable name matching our canonical form.
if canonical_name == "c_sharp":
import tree_sitter_c_sharp
# tree_sitter_c_sharp.language() returns a PyCapsule, wrap it in Language
capsule = tree_sitter_c_sharp.language()
language = Language(capsule)
for pack_name in ("c_sharp", "csharp"):
try:
language = get_language(pack_name)
break
except Exception:
continue
else:
raise ValueError(
"Language 'c_sharp' (C#) is not available. "
"Ensure tree-sitter-language-pack>=0.13.0 is installed "
"(earlier versions do not ship the C# grammar)."
)
else:
# Load the language from tree-sitter-language-pack
language = get_language(canonical_name)

self._language_cache[canonical_name] = language
return language
except (KeyError, ModuleNotFoundError):
except ValueError:
raise # pass through ValueError unchanged (ours or pack's LanguageNotFoundError)
except (KeyError, LookupError, ModuleNotFoundError) as e:
raise ValueError(
f"Language '{canonical_name}' is not available in tree-sitter-language-pack. "
f"This may be due to a missing or experimental grammar."
f"Language '{canonical_name}' is not available. "
f"Ensure tree-sitter-language-pack>=0.13.0 is installed. "
f"Error: {e}"
)
except Exception as e:
raise Exception(
Expand Down
Loading