From f495bd9130c7bbee53c6495f2dbc3baf3685fe89 Mon Sep 17 00:00:00 2001 From: Yiannis Mertzanis Date: Tue, 29 Apr 2025 11:08:09 +0300 Subject: [PATCH 1/3] fix: Resolve XML validation error in OutputGitRepoXML function - Fixed XML generation to properly handle special characters and CDATA sections - Added protection against premature CDATA termination by escaping "]]>" sequences - Improved XML formatting with consistent indentation and structure - Simplified token placeholder replacement without breaking formatting --- prompt/prompt.go | 108 +++++++++++++++++++++++++---------------------- 1 file changed, 58 insertions(+), 50 deletions(-) diff --git a/prompt/prompt.go b/prompt/prompt.go index 1c0a462..1de2db4 100644 --- a/prompt/prompt.go +++ b/prompt/prompt.go @@ -167,60 +167,68 @@ func OutputGitRepo(repo *GitRepo, preambleFile string, scrubComments bool) (stri } func OutputGitRepoXML(repo *GitRepo, scrubComments bool) (string, error) { - // Prepare XML content - if scrubComments { - for i, file := range repo.Files { - repo.Files[i].Contents = utils.RemoveCodeComments(file.Contents) - } - } - - // Add XML header - var result strings.Builder - result.WriteString("\n") - - // Use custom marshaling with proper CDATA for code contents - result.WriteString("\n") - - // Skip the tokens for now - result.WriteString(" PLACEHOLDER\n") - result.WriteString(fmt.Sprintf(" %d\n", repo.FileCount)) - result.WriteString(" \n") - - for _, file := range repo.Files { - result.WriteString(" \n") - result.WriteString(fmt.Sprintf(" %s\n", escapeXML(file.Path))) - result.WriteString(fmt.Sprintf(" %d\n", file.Tokens)) - result.WriteString(" \n") - result.WriteString(" \n") - } - - result.WriteString(" \n") - result.WriteString("") - - // Get the output string - outputStr := result.String() - - // Calculate tokens - tokenCount := EstimateTokens(outputStr) - repo.TotalTokens = tokenCount - - // Replace the placeholder with the actual token count - outputStr = strings.Replace(outputStr, 
"PLACEHOLDER", - fmt.Sprintf("%d", tokenCount), 1) - - return outputStr, nil + // Prepare XML content + if scrubComments { + for i, file := range repo.Files { + repo.Files[i].Contents = utils.RemoveCodeComments(file.Contents) + } + } + + // Add XML header + var result strings.Builder + result.WriteString("\n") + result.WriteString("\n") + + // Add token placeholder - will replace later + result.WriteString(" PLACEHOLDER\n") + result.WriteString(fmt.Sprintf(" %d\n", repo.FileCount)) + result.WriteString(" \n") + + for _, file := range repo.Files { + result.WriteString(" \n") + result.WriteString(fmt.Sprintf(" %s\n", escapeXML(file.Path))) + result.WriteString(fmt.Sprintf(" %d\n", file.Tokens)) + + // Handle file contents within CDATA, but escape any "]]>" sequences + // which would prematurely terminate the CDATA section + safeContents := strings.ReplaceAll(file.Contents, "]]>", "]]]]>") + + result.WriteString(" \n") + result.WriteString(" \n") + } + + result.WriteString(" \n") + result.WriteString("\n") + + // Get the output string + outputStr := result.String() + + // Calculate tokens + tokenCount := EstimateTokens(outputStr) + repo.TotalTokens = tokenCount + + // Replace the placeholder with the actual token count + outputStr = strings.Replace( + outputStr, + "PLACEHOLDER", + fmt.Sprintf("%d", tokenCount), + 1, + ) + + return outputStr, nil } + // escapeXML escapes XML special characters in a string func escapeXML(s string) string { - s = strings.ReplaceAll(s, "&", "&") - s = strings.ReplaceAll(s, "<", "<") - s = strings.ReplaceAll(s, ">", ">") - s = strings.ReplaceAll(s, "\"", """) - s = strings.ReplaceAll(s, "'", "'") - return s + s = strings.ReplaceAll(s, "&", "&") + s = strings.ReplaceAll(s, "<", "<") + s = strings.ReplaceAll(s, ">", ">") + s = strings.ReplaceAll(s, "\"", """) + s = strings.ReplaceAll(s, "'", "'") + return s } // ValidateXML checks if the given XML string is well-formed From bfac4584cb98300855d5fb3523115fea74a84adc Mon Sep 17 00:00:00 2001 
From: Yiannis Mertzanis Date: Tue, 29 Apr 2025 11:33:24 +0300 Subject: [PATCH 2/3] feat: Add .gptinclude functionality for selective file inclusion This commit adds support for a .gptinclude file, which allows users to explicitly specify which files should be included in the repository export. The feature complements the existing .gptignore functionality: - When both .gptinclude and .gptignore exist, files are first filtered by the include patterns, then any matching ignore patterns are excluded - Added new command-line flag: -I/--include to specify a custom path to the .gptinclude file - Default behavior looks for .gptinclude in repository root - Added comprehensive tests for the new functionality - Updated README.md with documentation and examples With this change, users gain more fine-grained control over which parts of their repositories are processed by git2gpt, making it easier to focus on specific areas when working with AI language models. --- .gptinclude | 1 + README.md | 42 ++++++++++- chat.json | 1 + cmd/root.go | 36 ++-------- prompt/gptinclude_test.go | 140 ++++++++++++++++++++++++++++++++++++ prompt/prompt.go | 147 ++++++++++++++++++++++---------------- 6 files changed, 273 insertions(+), 94 deletions(-) create mode 100644 .gptinclude create mode 100644 chat.json create mode 100644 prompt/gptinclude_test.go diff --git a/.gptinclude b/.gptinclude new file mode 100644 index 0000000..8431972 --- /dev/null +++ b/.gptinclude @@ -0,0 +1 @@ +prompt/ \ No newline at end of file diff --git a/README.md b/README.md index a370b13..a88abf0 100644 --- a/README.md +++ b/README.md @@ -24,11 +24,46 @@ To use the git2gpt utility, run the following command: git2gpt [flags] /path/to/git/repository ``` -### Ignoring Files +### Including and Ignoring Files -By default, your `.git` directory and your `.gitignore` files are ignored. Any files in your `.gitignore` are also skipped. If you want to change this behavior, you should add a `.gptignore` file to your repository. 
The `.gptignore` file should contain a list of files and directories to ignore, one per line. The `.gptignore` file should be in the same directory as your `.gitignore` file. Please note that this overwrites the default ignore list, so you should include the default ignore list in your `.gptignore` file if you want to keep it. +By default, your `.git` directory and your `.gitignore` files are ignored. Any files in your `.gitignore` are also skipped. You can customize the files to include or ignore in several ways: -### Flags +### Including Only Specific Files (.gptinclude) + +Add a `.gptinclude` file to your repository to specify which files should be included in the output. Each line in the file should contain a glob pattern of files or directories to include. If a `.gptinclude` file is present, only files that match these patterns will be included. + +Example `.gptinclude` file: +``` +# Include only these file types +*.go +*.js +*.html +*.css + +# Include specific directories +src/** +docs/api/** +``` + +### Ignoring Specific Files (.gptignore) + +Add a `.gptignore` file to your repository to specify which files should be ignored. This works similarly to `.gitignore`, but is specific to git2gpt. The `.gptignore` file should contain a list of files and directories to ignore, one per line. + +Example `.gptignore` file: +``` +# Ignore these file types +*.log +*.tmp +*.bak + +# Ignore specific directories +node_modules/** +build/** +``` + +**Note**: When both `.gptinclude` and `.gptignore` files exist, git2gpt will first include files matching the `.gptinclude` patterns, and then exclude any of those files that also match `.gptignore` patterns. + +## Command Line Options * `-p`, `--preamble`: Path to a text file containing a preamble to include at the beginning of the output file. * `-o`, `--output`: Path to the output file. If not specified, will print to standard output. @@ -36,6 +71,7 @@ By default, your `.git` directory and your `.gitignore` files are ignored. 
Any f * `-j`, `--json`: Output to JSON rather than plain text. Use with `-o` to specify the output file. * `-x`, `--xml`: Output to XML rather than plain text. Use with `-o` to specify the output file. * `-i`, `--ignore`: Path to the `.gptignore` file. If not specified, will look for a `.gptignore` file in the same directory as the `.gitignore` file. +* `-I`, `--include`: Path to the `.gptinclude` file. If not specified, will look for a `.gptinclude` file in the repository root. * `-g`, `--ignore-gitignore`: Ignore the `.gitignore` file. * `-s`, `--scrub-comments`: Remove comments from the output file to save tokens. diff --git a/chat.json b/chat.json new file mode 100644 index 0000000..f791088 --- /dev/null +++ b/chat.json @@ -0,0 +1 @@ +{"total_tokens":3557,"files":[{"path":"prompt/gptinclude_test.go","tokens":1181,"contents":"package prompt\n\nimport (\n\t\"os\"\n\t\"path/filepath\"\n\t\"testing\"\n)\n\nfunc TestGptIncludeAndIgnore(t *testing.T) {\n\t// Create a temporary directory structure for testing\n\ttempDir, err := os.MkdirTemp(\"\", \"git2gpt-test\")\n\tif err != nil {\n\t\tt.Fatalf(\"Failed to create temp directory: %v\", err)\n\t}\n\tdefer os.RemoveAll(tempDir)\n\n\t// Create test files\n\ttestFiles := []struct {\n\t\tpath string\n\t\tcontents string\n\t}{\n\t\t{\"file1.txt\", \"Content of file1\"},\n\t\t{\"file2.txt\", \"Content of file2\"},\n\t\t{\"file3.txt\", \"Content of file3\"},\n\t\t{\"src/main.go\", \"package main\\nfunc main() {}\"},\n\t\t{\"src/lib/util.go\", \"package lib\\nfunc Util() {}\"},\n\t\t{\"docs/README.md\", \"# Documentation\"},\n\t}\n\n\tfor _, tf := range testFiles {\n\t\tfullPath := filepath.Join(tempDir, tf.path)\n\t\t// Create directory if it doesn't exist\n\t\tdir := filepath.Dir(fullPath)\n\t\tif err := os.MkdirAll(dir, 0755); err != nil {\n\t\t\tt.Fatalf(\"Failed to create directory %s: %v\", dir, err)\n\t\t}\n\t\t// Write the file\n\t\tif err := os.WriteFile(fullPath, []byte(tf.contents), 0644); err != nil 
{\n\t\t\tt.Fatalf(\"Failed to write file %s: %v\", fullPath, err)\n\t\t}\n\t}\n\n\t// Test cases\n\ttestCases := []struct {\n\t\tname string\n\t\tincludeContent string\n\t\tignoreContent string\n\t\texpectedFiles []string\n\t\tunexpectedFiles []string\n\t}{\n\t\t{\n\t\t\tname: \"Only include src directory\",\n\t\t\tincludeContent: \"src/**\",\n\t\t\tignoreContent: \"\",\n\t\t\texpectedFiles: []string{\"src/main.go\", \"src/lib/util.go\"},\n\t\t\tunexpectedFiles: []string{\"file1.txt\", \"file2.txt\", \"file3.txt\", \"docs/README.md\"},\n\t\t},\n\t\t{\n\t\t\tname: \"Include all, but ignore .txt files\",\n\t\t\tincludeContent: \"**\",\n\t\t\tignoreContent: \"*.txt\",\n\t\t\texpectedFiles: []string{\"src/main.go\", \"src/lib/util.go\", \"docs/README.md\"},\n\t\t\tunexpectedFiles: []string{\"file1.txt\", \"file2.txt\", \"file3.txt\"},\n\t\t},\n\t\t{\n\t\t\tname: \"Include src and docs, but ignore lib directory\",\n\t\t\tincludeContent: \"src/**\\ndocs/**\",\n\t\t\tignoreContent: \"src/lib/**\",\n\t\t\texpectedFiles: []string{\"src/main.go\", \"docs/README.md\"},\n\t\t\tunexpectedFiles: []string{\"file1.txt\", \"file2.txt\", \"file3.txt\", \"src/lib/util.go\"},\n\t\t},\n\t\t{\n\t\t\tname: \"No include file (should include all), ignore .txt files\",\n\t\t\tincludeContent: \"\",\n\t\t\tignoreContent: \"*.txt\",\n\t\t\texpectedFiles: []string{\"src/main.go\", \"src/lib/util.go\", \"docs/README.md\"},\n\t\t\tunexpectedFiles: []string{\"file1.txt\", \"file2.txt\", \"file3.txt\"},\n\t\t},\n\t}\n\n\tfor _, tc := range testCases {\n\t\tt.Run(tc.name, func(t *testing.T) {\n\t\t\t// Create .gptinclude file if needed\n\t\t\tincludeFilePath := filepath.Join(tempDir, \".gptinclude\")\n\t\t\tif tc.includeContent != \"\" {\n\t\t\t\tif err := os.WriteFile(includeFilePath, []byte(tc.includeContent), 0644); err != nil {\n\t\t\t\t\tt.Fatalf(\"Failed to write .gptinclude file: %v\", err)\n\t\t\t\t}\n\t\t\t} else {\n\t\t\t\t// Ensure no .gptinclude file 
exists\n\t\t\t\tos.Remove(includeFilePath)\n\t\t\t}\n\n\t\t\t// Create .gptignore file if needed\n\t\t\tignoreFilePath := filepath.Join(tempDir, \".gptignore\")\n\t\t\tif tc.ignoreContent != \"\" {\n\t\t\t\tif err := os.WriteFile(ignoreFilePath, []byte(tc.ignoreContent), 0644); err != nil {\n\t\t\t\t\tt.Fatalf(\"Failed to write .gptignore file: %v\", err)\n\t\t\t\t}\n\t\t\t} else {\n\t\t\t\t// Ensure no .gptignore file exists\n\t\t\t\tos.Remove(ignoreFilePath)\n\t\t\t}\n\n\t\t\t// Generate include and ignore lists\n\t\t\tincludeList := GenerateIncludeList(tempDir, \"\")\n\t\t\tignoreList := GenerateIgnoreList(tempDir, \"\", false)\n\n\t\t\t// Process the repository\n\t\t\trepo, err := ProcessGitRepo(tempDir, includeList, ignoreList)\n\t\t\tif err != nil {\n\t\t\t\tt.Fatalf(\"Failed to process repository: %v\", err)\n\t\t\t}\n\n\t\t\t// Check if expected files are included\n\t\t\tfor _, expectedFile := range tc.expectedFiles {\n\t\t\t\tfound := false\n\t\t\t\tfor _, file := range repo.Files {\n\t\t\t\t\tif file.Path == expectedFile {\n\t\t\t\t\t\tfound = true\n\t\t\t\t\t\tbreak\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\tif !found {\n\t\t\t\t\tt.Errorf(\"Expected file %s to be included, but it wasn't\", expectedFile)\n\t\t\t\t}\n\t\t\t}\n\n\t\t\t// Check if unexpected files are excluded\n\t\t\tfor _, unexpectedFile := range tc.unexpectedFiles {\n\t\t\t\tfor _, file := range repo.Files {\n\t\t\t\t\tif file.Path == unexpectedFile {\n\t\t\t\t\t\tt.Errorf(\"File %s should have been excluded, but it was included\", unexpectedFile)\n\t\t\t\t\t\tbreak\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t})\n\t}\n}"},{"path":"prompt/prompt.go","tokens":2521,"contents":"package prompt\n\nimport (\n\t\"bufio\"\n\t\"encoding/json\"\n\t\"encoding/xml\"\n\t\"fmt\"\n\t\"io\"\n\t\"os\"\n\t\"path/filepath\"\n\t\"strings\"\n\t\"unicode/utf8\"\n\t\"github.com/chand1012/git2gpt/utils\"\n\t\"github.com/gobwas/glob\"\n\t\"github.com/pkoukk/tiktoken-go\"\n)\n\ntype GitFile struct {\n\tPath string `json:\"path\" 
xml:\"path\"` // path to the file relative to the repository root\n\tTokens int64 `json:\"tokens\" xml:\"tokens\"` // number of tokens in the file\n\tContents string `json:\"contents\" xml:\"contents\"` // contents of the file\n}\n\ntype GitRepo struct {\n\tTotalTokens int64 `json:\"total_tokens\" xml:\"total_tokens\"`\n\tFiles []GitFile `json:\"files\" xml:\"files\u003efile\"`\n\tFileCount int `json:\"file_count\" xml:\"file_count\"`\n}\n\nfunc contains(s []string, e string) bool {\n\tfor _, a := range s {\n\t\tif a == e {\n\t\t\treturn true\n\t\t}\n\t}\n\treturn false\n}\n\nfunc getIgnoreList(ignoreFilePath string) ([]string, error) {\n\tvar ignoreList []string\n\tfile, err := os.Open(ignoreFilePath)\n\tif err != nil {\n\t\treturn ignoreList, err\n\t}\n\tdefer file.Close()\n\tscanner := bufio.NewScanner(file)\n\tfor scanner.Scan() {\n\t\tline := strings.TrimSpace(scanner.Text())\n\t\tif line == \"\" || strings.HasPrefix(line, \"#\") {\n\t\t\tcontinue\n\t\t}\n\t\tif strings.HasSuffix(line, \"/\") {\n\t\t\tline = line + \"**\"\n\t\t}\n\t\tline = strings.TrimPrefix(line, \"/\")\n\t\tignoreList = append(ignoreList, line)\n\t}\n\treturn ignoreList, scanner.Err()\n}\n\n// Similar to getIgnoreList, but for .gptinclude files\nfunc getIncludeList(includeFilePath string) ([]string, error) {\n\tvar includeList []string\n\tfile, err := os.Open(includeFilePath)\n\tif err != nil {\n\t\treturn includeList, err\n\t}\n\tdefer file.Close()\n\tscanner := bufio.NewScanner(file)\n\tfor scanner.Scan() {\n\t\tline := strings.TrimSpace(scanner.Text())\n\t\tif line == \"\" || strings.HasPrefix(line, \"#\") {\n\t\t\tcontinue\n\t\t}\n\t\tif strings.HasSuffix(line, \"/\") {\n\t\t\tline = line + \"**\"\n\t\t}\n\t\tline = strings.TrimPrefix(line, \"/\")\n\t\tincludeList = append(includeList, line)\n\t}\n\treturn includeList, scanner.Err()\n}\n\nfunc windowsToUnixPath(windowsPath string) string {\n\tunixPath := strings.ReplaceAll(windowsPath, \"\\\\\", \"/\")\n\treturn unixPath\n}\n\n// This 
function is kept for backward compatibility\nfunc shouldIgnore(filePath string, ignoreList []string) bool {\n\tfor _, pattern := range ignoreList {\n\t\tg := glob.MustCompile(pattern, '/')\n\t\tif g.Match(windowsToUnixPath(filePath)) {\n\t\t\treturn true\n\t\t}\n\t}\n\treturn false\n}\n\n// Determines if a file should be included in the output\n// First checks if the file matches the include list (if provided)\n// Then checks if the file is excluded by the ignore list\nfunc shouldProcess(filePath string, includeList, ignoreList []string) bool {\n\t// If includeList is provided, check if the file is included\n\tif len(includeList) \u003e 0 {\n\t\tincluded := false\n\t\tfor _, pattern := range includeList {\n\t\t\tg := glob.MustCompile(pattern, '/')\n\t\t\tif g.Match(windowsToUnixPath(filePath)) {\n\t\t\t\tincluded = true\n\t\t\t\tbreak\n\t\t\t}\n\t\t}\n\t\tif !included {\n\t\t\treturn false // If not in the include list, skip it\n\t\t}\n\t}\n\t\n\t// Check if the file is excluded by ignoreList\n\tfor _, pattern := range ignoreList {\n\t\tg := glob.MustCompile(pattern, '/')\n\t\tif g.Match(windowsToUnixPath(filePath)) {\n\t\t\treturn false // If in the ignore list, skip it\n\t\t}\n\t}\n\t\n\treturn true // Process this file\n}\n\nfunc GenerateIgnoreList(repoPath, ignoreFilePath string, useGitignore bool) []string {\n\tif ignoreFilePath == \"\" {\n\t\tignoreFilePath = filepath.Join(repoPath, \".gptignore\")\n\t}\n\tvar ignoreList []string\n\tif _, err := os.Stat(ignoreFilePath); err == nil {\n\t\tignoreList, _ = getIgnoreList(ignoreFilePath)\n\t}\n\tignoreList = append(ignoreList, \".git/**\", \".gitignore\", \".gptignore\", \".gptinclude\")\n\tif useGitignore {\n\t\tgitignorePath := filepath.Join(repoPath, \".gitignore\")\n\t\tif _, err := os.Stat(gitignorePath); err == nil {\n\t\t\tgitignoreList, _ := getIgnoreList(gitignorePath)\n\t\t\tignoreList = append(ignoreList, gitignoreList...)\n\t\t}\n\t}\n\tvar finalIgnoreList []string\n\tfor _, pattern := range ignoreList 
{\n\t\tif !contains(finalIgnoreList, pattern) {\n\t\t\tinfo, err := os.Stat(filepath.Join(repoPath, pattern))\n\t\t\tif err == nil \u0026\u0026 info.IsDir() {\n\t\t\t\tpattern = filepath.Join(pattern, \"**\")\n\t\t\t}\n\t\t\tfinalIgnoreList = append(finalIgnoreList, pattern)\n\t\t}\n\t}\n\treturn finalIgnoreList\n}\n\n// Generate include list from .gptinclude file\nfunc GenerateIncludeList(repoPath, includeFilePath string) []string {\n\tif includeFilePath == \"\" {\n\t\tincludeFilePath = filepath.Join(repoPath, \".gptinclude\")\n\t}\n\tvar includeList []string\n\tif _, err := os.Stat(includeFilePath); err == nil {\n\t\tincludeList, _ = getIncludeList(includeFilePath)\n\t}\n\t\n\tvar finalIncludeList []string\n\tfor _, pattern := range includeList {\n\t\tif !contains(finalIncludeList, pattern) {\n\t\t\tinfo, err := os.Stat(filepath.Join(repoPath, pattern))\n\t\t\tif err == nil \u0026\u0026 info.IsDir() {\n\t\t\t\tpattern = filepath.Join(pattern, \"**\")\n\t\t\t}\n\t\t\tfinalIncludeList = append(finalIncludeList, pattern)\n\t\t}\n\t}\n\treturn finalIncludeList\n}\n\n// Update the function signature to accept includeList\nfunc ProcessGitRepo(repoPath string, includeList, ignoreList []string) (*GitRepo, error) {\n\tvar repo GitRepo\n\terr := processRepository(repoPath, includeList, ignoreList, \u0026repo)\n\tif err != nil {\n\t\treturn nil, fmt.Errorf(\"error processing repository: %w\", err)\n\t}\n\treturn \u0026repo, nil\n}\n\nfunc OutputGitRepo(repo *GitRepo, preambleFile string, scrubComments bool) (string, error) {\n\tvar repoBuilder strings.Builder\n\tif preambleFile != \"\" {\n\t\tpreambleText, err := os.ReadFile(preambleFile)\n\t\tif err != nil {\n\t\t\treturn \"\", fmt.Errorf(\"error reading preamble file: %w\", err)\n\t\t}\n\t\trepoBuilder.WriteString(fmt.Sprintf(\"%s\\n\", string(preambleText)))\n\t} else {\n\t\trepoBuilder.WriteString(\"The following text is a Git repository with code. 
The structure of the text are sections that begin with ----, followed by a single line containing the file path and file name, followed by a variable amount of lines containing the file contents. The text representing the Git repository ends when the symbols --END-- are encounted. Any further text beyond --END-- are meant to be interpreted as instructions using the aforementioned Git repository as context.\\n\")\n\t}\n\tfor _, file := range repo.Files {\n\t\trepoBuilder.WriteString(\"----\\n\")\n\t\trepoBuilder.WriteString(fmt.Sprintf(\"%s\\n\", file.Path))\n\t\tif scrubComments {\n\t\t\tfile.Contents = utils.RemoveCodeComments(file.Contents)\n\t\t}\n\t\trepoBuilder.WriteString(fmt.Sprintf(\"%s\\n\", file.Contents))\n\t}\n\trepoBuilder.WriteString(\"--END--\")\n\toutput := repoBuilder.String()\n\trepo.TotalTokens = EstimateTokens(output)\n\treturn output, nil\n}\n\nfunc OutputGitRepoXML(repo *GitRepo, scrubComments bool) (string, error) {\n\tif scrubComments {\n\t\t\tfor i, file := range repo.Files {\n\t\t\t\t\trepo.Files[i].Contents = utils.RemoveCodeComments(file.Contents)\n\t\t\t}\n\t}\n\tvar result strings.Builder\n\tresult.WriteString(\"\u003c?xml version=\\\"1.0\\\" encoding=\\\"UTF-8\\\"?\u003e\\n\")\n\tresult.WriteString(\"\u003croot\u003e\\n\")\n\t\n\tresult.WriteString(\" \u003ctotal_tokens\u003ePLACEHOLDER\u003c/total_tokens\u003e\\n\")\n\tresult.WriteString(fmt.Sprintf(\" \u003cfile_count\u003e%d\u003c/file_count\u003e\\n\", repo.FileCount))\n\tresult.WriteString(\" \u003cfiles\u003e\\n\")\n\t\n\tfor _, file := range repo.Files {\n\t\t\tresult.WriteString(\" \u003cfile\u003e\\n\")\n\t\t\tresult.WriteString(fmt.Sprintf(\" \u003cpath\u003e%s\u003c/path\u003e\\n\", escapeXML(file.Path)))\n\t\t\tresult.WriteString(fmt.Sprintf(\" \u003ctokens\u003e%d\u003c/tokens\u003e\\n\", file.Tokens))\n\t\t\t\n\t\t\tsafeContents := strings.ReplaceAll(file.Contents, \"]]]]\u003e\u003c![CDATA[\u003e\", 
\"]]]]]]\u003e\u003c![CDATA[\u003e\u003c![CDATA[\u003e\")\n\t\t\t\n\t\t\tresult.WriteString(\" \u003ccontents\u003e\u003c![CDATA[\")\n\t\t\tresult.WriteString(safeContents)\n\t\t\tresult.WriteString(\"]]]]\u003e\u003c![CDATA[\u003e\u003c/contents\u003e\\n\")\n\t\t\tresult.WriteString(\" \u003c/file\u003e\\n\")\n\t}\n\t\n\tresult.WriteString(\" \u003c/files\u003e\\n\")\n\tresult.WriteString(\"\u003c/root\u003e\\n\")\n\t\n\toutputStr := result.String()\n\t\n\ttokenCount := EstimateTokens(outputStr)\n\trepo.TotalTokens = tokenCount\n\t\n\toutputStr = strings.Replace(\n\t\t\toutputStr, \n\t\t\t\"\u003ctotal_tokens\u003ePLACEHOLDER\u003c/total_tokens\u003e\", \n\t\t\tfmt.Sprintf(\"\u003ctotal_tokens\u003e%d\u003c/total_tokens\u003e\", tokenCount), \n\t\t\t1,\n\t)\n\t\n\treturn outputStr, nil\n}\n\nfunc escapeXML(s string) string {\n\ts = strings.ReplaceAll(s, \"\u0026\", \"\u0026amp;\")\n\ts = strings.ReplaceAll(s, \"\u003c\", \"\u0026lt;\")\n\ts = strings.ReplaceAll(s, \"\u003e\", \"\u0026gt;\")\n\ts = strings.ReplaceAll(s, \"\\\"\", \"\u0026quot;\")\n\ts = strings.ReplaceAll(s, \"'\", \"\u0026apos;\")\n\treturn s\n}\n\nfunc ValidateXML(xmlString string) error {\n decoder := xml.NewDecoder(strings.NewReader(xmlString))\n for {\n _, err := decoder.Token()\n if err == io.EOF {\n break\n }\n if err != nil {\n return fmt.Errorf(\"XML validation error: %w\", err)\n }\n }\n return nil\n}\n\nfunc MarshalRepo(repo *GitRepo, scrubComments bool) ([]byte, error) {\n\t_, err := OutputGitRepo(repo, \"\", scrubComments)\n\tif err != nil {\n\t\treturn nil, fmt.Errorf(\"error marshalling repo: %w\", err)\n\t}\n\treturn json.Marshal(repo)\n}\n\n// Update the function signature to accept includeList and use shouldProcess\nfunc processRepository(repoPath string, includeList, ignoreList []string, repo *GitRepo) error {\n\terr := filepath.Walk(repoPath, func(path string, info os.FileInfo, err error) error {\n\t\tif err != nil {\n\t\t\treturn err\n\t\t}\n\t\tif !info.IsDir() 
{\n\t\t\trelativeFilePath, _ := filepath.Rel(repoPath, path)\n\t\t\tprocess := shouldProcess(relativeFilePath, includeList, ignoreList)\n\t\t\tif process {\n\t\t\t\tcontents, err := os.ReadFile(path)\n\t\t\t\tif !utf8.Valid(contents) {\n\t\t\t\t\treturn nil\n\t\t\t\t}\n\t\t\t\tif err != nil {\n\t\t\t\t\treturn err\n\t\t\t\t}\n\t\t\t\tvar file GitFile\n\t\t\t\tfile.Path = relativeFilePath\n\t\t\t\tfile.Contents = string(contents)\n\t\t\t\tfile.Tokens = EstimateTokens(file.Contents)\n\t\t\t\trepo.Files = append(repo.Files, file)\n\t\t\t}\n\t\t}\n\t\treturn nil\n\t})\n\trepo.FileCount = len(repo.Files)\n\tif err != nil {\n\t\treturn fmt.Errorf(\"error walking the path %q: %w\", repoPath, err)\n\t}\n\treturn nil\n}\n\nfunc EstimateTokens(output string) int64 {\n\ttke, err := tiktoken.GetEncoding(\"cl100k_base\")\n\tif err != nil {\n\t\tfmt.Println(\"Error getting encoding:\", err)\n\t\treturn 0\n\t}\n\ttokens := tke.Encode(output, nil, nil)\n\treturn int64(len(tokens))\n}"}],"file_count":2} \ No newline at end of file diff --git a/cmd/root.go b/cmd/root.go index 840fd8b..d438984 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -1,49 +1,40 @@ package cmd - import ( "fmt" "os" - "github.com/chand1012/git2gpt/prompt" "github.com/spf13/cobra" ) - var repoPath string var preambleFile string var outputFile string var estimateTokens bool var ignoreFilePath string +var includeFilePath string // New: Add variable for include file path var ignoreGitignore bool var outputJSON bool var outputXML bool var debug bool var scrubComments bool - var rootCmd = &cobra.Command{ Use: "git2gpt [flags] /path/to/git/repository [/path/to/another/repository ...]", Short: "git2gpt is a utility to convert one or more Git repositories to a text file for input into an LLM", Args: cobra.MinimumNArgs(1), Run: func(cmd *cobra.Command, args []string) { - // Create a combined repository to hold all files combinedRepo := &prompt.GitRepo{ Files: []prompt.GitFile{}, } - - // Process each repository path for _, 
path := range args { repoPath = path ignoreList := prompt.GenerateIgnoreList(repoPath, ignoreFilePath, !ignoreGitignore) - repo, err := prompt.ProcessGitRepo(repoPath, ignoreList) + includeList := prompt.GenerateIncludeList(repoPath, includeFilePath) // New: Generate include list + repo, err := prompt.ProcessGitRepo(repoPath, includeList, ignoreList) // Modified: Pass includeList if err != nil { fmt.Printf("Error processing %s: %s\n", repoPath, err) os.Exit(1) } - - // Add files from this repo to the combined repo combinedRepo.Files = append(combinedRepo.Files, repo.Files...) } - - // Update the file count combinedRepo.FileCount = len(combinedRepo.Files) if outputJSON { output, err := prompt.MarshalRepo(combinedRepo, scrubComments) @@ -52,7 +43,6 @@ var rootCmd = &cobra.Command{ os.Exit(1) } if outputFile != "" { - // if output file exists, throw error if _, err := os.Stat(outputFile); err == nil { fmt.Printf("Error: output file %s already exists\n", outputFile) os.Exit(1) @@ -75,15 +65,11 @@ var rootCmd = &cobra.Command{ fmt.Printf("Error: %s\n", err) os.Exit(1) } - - // Validate the XML output if err := prompt.ValidateXML(output); err != nil { fmt.Printf("Error: %s\n", err) os.Exit(1) } - if outputFile != "" { - // if output file exists, throw error if _, err := os.Stat(outputFile); err == nil { fmt.Printf("Error: output file %s already exists\n", outputFile) os.Exit(1) @@ -106,7 +92,6 @@ var rootCmd = &cobra.Command{ os.Exit(1) } if outputFile != "" { - // if output file exists, throw error if _, err := os.Stat(outputFile); err == nil { fmt.Printf("Error: output file %s already exists\n", outputFile) os.Exit(1) @@ -126,33 +111,22 @@ var rootCmd = &cobra.Command{ } }, } - func init() { rootCmd.Flags().StringVarP(&preambleFile, "preamble", "p", "", "path to preamble text file") - // output to file flag. Should be a string rootCmd.Flags().StringVarP(&outputFile, "output", "o", "", "path to output file") - // estimate tokens. 
Should be a bool rootCmd.Flags().BoolVarP(&estimateTokens, "estimate", "e", false, "estimate the number of tokens in the output") - // ignore file path. Should be a string rootCmd.Flags().StringVarP(&ignoreFilePath, "ignore", "i", "", "path to .gptignore file") - // ignore gitignore. Should be a bool + rootCmd.Flags().StringVarP(&includeFilePath, "include", "I", "", "path to .gptinclude file") // New: Add flag for include file rootCmd.Flags().BoolVarP(&ignoreGitignore, "ignore-gitignore", "g", false, "ignore .gitignore file") - // output JSON. Should be a bool rootCmd.Flags().BoolVarP(&outputJSON, "json", "j", false, "output JSON") - // output XML. Should be a bool rootCmd.Flags().BoolVarP(&outputXML, "xml", "x", false, "output XML") - // debug. Should be a bool rootCmd.Flags().BoolVarP(&debug, "debug", "d", false, "debug mode. Do not output to standard output") - // scrub comments. Should be a bool rootCmd.Flags().BoolVarP(&scrubComments, "scrub-comments", "s", false, "scrub comments from the output. 
Decreases token count") - - // Update the example usage to show multiple paths rootCmd.Example = " git2gpt /path/to/repo1 /path/to/repo2\n git2gpt -o output.txt /path/to/repo1 /path/to/repo2" } - func Execute() { if err := rootCmd.Execute(); err != nil { fmt.Println(err) os.Exit(1) } -} +} \ No newline at end of file diff --git a/prompt/gptinclude_test.go b/prompt/gptinclude_test.go new file mode 100644 index 0000000..fe17d68 --- /dev/null +++ b/prompt/gptinclude_test.go @@ -0,0 +1,140 @@ +package prompt + +import ( + "os" + "path/filepath" + "testing" +) + +func TestGptIncludeAndIgnore(t *testing.T) { + // Create a temporary directory structure for testing + tempDir, err := os.MkdirTemp("", "git2gpt-test") + if err != nil { + t.Fatalf("Failed to create temp directory: %v", err) + } + defer os.RemoveAll(tempDir) + + // Create test files + testFiles := []struct { + path string + contents string + }{ + {"file1.txt", "Content of file1"}, + {"file2.txt", "Content of file2"}, + {"file3.txt", "Content of file3"}, + {"src/main.go", "package main\nfunc main() {}"}, + {"src/lib/util.go", "package lib\nfunc Util() {}"}, + {"docs/README.md", "# Documentation"}, + } + + for _, tf := range testFiles { + fullPath := filepath.Join(tempDir, tf.path) + // Create directory if it doesn't exist + dir := filepath.Dir(fullPath) + if err := os.MkdirAll(dir, 0755); err != nil { + t.Fatalf("Failed to create directory %s: %v", dir, err) + } + // Write the file + if err := os.WriteFile(fullPath, []byte(tf.contents), 0644); err != nil { + t.Fatalf("Failed to write file %s: %v", fullPath, err) + } + } + + // Test cases + testCases := []struct { + name string + includeContent string + ignoreContent string + expectedFiles []string + unexpectedFiles []string + }{ + { + name: "Only include src directory", + includeContent: "src/**", + ignoreContent: "", + expectedFiles: []string{"src/main.go", "src/lib/util.go"}, + unexpectedFiles: []string{"file1.txt", "file2.txt", "file3.txt", "docs/README.md"}, 
+ }, + { + name: "Include all, but ignore .txt files", + includeContent: "**", + ignoreContent: "*.txt", + expectedFiles: []string{"src/main.go", "src/lib/util.go", "docs/README.md"}, + unexpectedFiles: []string{"file1.txt", "file2.txt", "file3.txt"}, + }, + { + name: "Include src and docs, but ignore lib directory", + includeContent: "src/**\ndocs/**", + ignoreContent: "src/lib/**", + expectedFiles: []string{"src/main.go", "docs/README.md"}, + unexpectedFiles: []string{"file1.txt", "file2.txt", "file3.txt", "src/lib/util.go"}, + }, + { + name: "No include file (should include all), ignore .txt files", + includeContent: "", + ignoreContent: "*.txt", + expectedFiles: []string{"src/main.go", "src/lib/util.go", "docs/README.md"}, + unexpectedFiles: []string{"file1.txt", "file2.txt", "file3.txt"}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + // Create .gptinclude file if needed + includeFilePath := filepath.Join(tempDir, ".gptinclude") + if tc.includeContent != "" { + if err := os.WriteFile(includeFilePath, []byte(tc.includeContent), 0644); err != nil { + t.Fatalf("Failed to write .gptinclude file: %v", err) + } + } else { + // Ensure no .gptinclude file exists + os.Remove(includeFilePath) + } + + // Create .gptignore file if needed + ignoreFilePath := filepath.Join(tempDir, ".gptignore") + if tc.ignoreContent != "" { + if err := os.WriteFile(ignoreFilePath, []byte(tc.ignoreContent), 0644); err != nil { + t.Fatalf("Failed to write .gptignore file: %v", err) + } + } else { + // Ensure no .gptignore file exists + os.Remove(ignoreFilePath) + } + + // Generate include and ignore lists + includeList := GenerateIncludeList(tempDir, "") + ignoreList := GenerateIgnoreList(tempDir, "", false) + + // Process the repository + repo, err := ProcessGitRepo(tempDir, includeList, ignoreList) + if err != nil { + t.Fatalf("Failed to process repository: %v", err) + } + + // Check if expected files are included + for _, expectedFile := range 
tc.expectedFiles { + found := false + for _, file := range repo.Files { + if file.Path == expectedFile { + found = true + break + } + } + if !found { + t.Errorf("Expected file %s to be included, but it wasn't", expectedFile) + } + } + + // Check if unexpected files are excluded + for _, unexpectedFile := range tc.unexpectedFiles { + for _, file := range repo.Files { + if file.Path == unexpectedFile { + t.Errorf("File %s should have been excluded, but it was included", unexpectedFile) + break + } + } + } + }) + } +} \ No newline at end of file diff --git a/prompt/prompt.go b/prompt/prompt.go index 1de2db4..0f7c025 100644 --- a/prompt/prompt.go +++ b/prompt/prompt.go @@ -10,27 +10,23 @@ import ( "path/filepath" "strings" "unicode/utf8" - "github.com/chand1012/git2gpt/utils" "github.com/gobwas/glob" "github.com/pkoukk/tiktoken-go" ) -// GitFile is a file in a Git repository type GitFile struct { Path string `json:"path" xml:"path"` // path to the file relative to the repository root Tokens int64 `json:"tokens" xml:"tokens"` // number of tokens in the file Contents string `json:"contents" xml:"contents"` // contents of the file } -// GitRepo is a Git repository type GitRepo struct { TotalTokens int64 `json:"total_tokens" xml:"total_tokens"` Files []GitFile `json:"files" xml:"files>file"` FileCount int `json:"file_count" xml:"file_count"` } -// contains checks if a string is in a slice of strings func contains(s []string, e string) bool { for _, a := range s { if a == e { @@ -47,30 +43,50 @@ func getIgnoreList(ignoreFilePath string) ([]string, error) { return ignoreList, err } defer file.Close() - scanner := bufio.NewScanner(file) for scanner.Scan() { line := strings.TrimSpace(scanner.Text()) if line == "" || strings.HasPrefix(line, "#") { continue } - // if the line ends with a slash, add a globstar to the end if strings.HasSuffix(line, "/") { line = line + "**" } - // remove all preceding slashes line = strings.TrimPrefix(line, "/") - // line = 
filepath.FromSlash(line) ignoreList = append(ignoreList, line) } return ignoreList, scanner.Err() } +// Similar to getIgnoreList, but for .gptinclude files +func getIncludeList(includeFilePath string) ([]string, error) { + var includeList []string + file, err := os.Open(includeFilePath) + if err != nil { + return includeList, err + } + defer file.Close() + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if line == "" || strings.HasPrefix(line, "#") { + continue + } + if strings.HasSuffix(line, "/") { + line = line + "**" + } + line = strings.TrimPrefix(line, "/") + includeList = append(includeList, line) + } + return includeList, scanner.Err() +} + func windowsToUnixPath(windowsPath string) string { unixPath := strings.ReplaceAll(windowsPath, "\\", "/") return unixPath } +// This function is kept for backward compatibility func shouldIgnore(filePath string, ignoreList []string) bool { for _, pattern := range ignoreList { g := glob.MustCompile(pattern, '/') @@ -81,34 +97,55 @@ func shouldIgnore(filePath string, ignoreList []string) bool { return false } -// GenerateIgnoreList generates a list of ignore patterns from the .gptignore file and the .gitignore file. Returns a slice of strings. Will return an empty slice if no ignore files exist. 
+// Determines if a file should be included in the output +// First checks if the file matches the include list (if provided) +// Then checks if the file is excluded by the ignore list +func shouldProcess(filePath string, includeList, ignoreList []string) bool { + // If includeList is provided, check if the file is included + if len(includeList) > 0 { + included := false + for _, pattern := range includeList { + g := glob.MustCompile(pattern, '/') + if g.Match(windowsToUnixPath(filePath)) { + included = true + break + } + } + if !included { + return false // If not in the include list, skip it + } + } + + // Check if the file is excluded by ignoreList + for _, pattern := range ignoreList { + g := glob.MustCompile(pattern, '/') + if g.Match(windowsToUnixPath(filePath)) { + return false // If in the ignore list, skip it + } + } + + return true // Process this file +} + func GenerateIgnoreList(repoPath, ignoreFilePath string, useGitignore bool) []string { if ignoreFilePath == "" { ignoreFilePath = filepath.Join(repoPath, ".gptignore") } - var ignoreList []string if _, err := os.Stat(ignoreFilePath); err == nil { - // .gptignore file exists ignoreList, _ = getIgnoreList(ignoreFilePath) } - ignoreList = append(ignoreList, ".git/**", ".gitignore", ".gptignore") - + ignoreList = append(ignoreList, ".git/**", ".gitignore", ".gptignore", ".gptinclude") if useGitignore { gitignorePath := filepath.Join(repoPath, ".gitignore") if _, err := os.Stat(gitignorePath); err == nil { - // .gitignore file exists gitignoreList, _ := getIgnoreList(gitignorePath) ignoreList = append(ignoreList, gitignoreList...) 
} } - var finalIgnoreList []string - // loop through the ignore list and remove any duplicates - // also check if any pattern is a directory and add a globstar to the end for _, pattern := range ignoreList { if !contains(finalIgnoreList, pattern) { - // check if the pattern is a directory info, err := os.Stat(filepath.Join(repoPath, pattern)) if err == nil && info.IsDir() { pattern = filepath.Join(pattern, "**") @@ -116,27 +153,44 @@ func GenerateIgnoreList(repoPath, ignoreFilePath string, useGitignore bool) []st finalIgnoreList = append(finalIgnoreList, pattern) } } - return finalIgnoreList } -// ProcessGitRepo processes a Git repository and returns a GitRepo object -func ProcessGitRepo(repoPath string, ignoreList []string) (*GitRepo, error) { +// Generate include list from .gptinclude file +func GenerateIncludeList(repoPath, includeFilePath string) []string { + if includeFilePath == "" { + includeFilePath = filepath.Join(repoPath, ".gptinclude") + } + var includeList []string + if _, err := os.Stat(includeFilePath); err == nil { + includeList, _ = getIncludeList(includeFilePath) + } + + var finalIncludeList []string + for _, pattern := range includeList { + if !contains(finalIncludeList, pattern) { + info, err := os.Stat(filepath.Join(repoPath, pattern)) + if err == nil && info.IsDir() { + pattern = filepath.Join(pattern, "**") + } + finalIncludeList = append(finalIncludeList, pattern) + } + } + return finalIncludeList +} +// Update the function signature to accept includeList +func ProcessGitRepo(repoPath string, includeList, ignoreList []string) (*GitRepo, error) { var repo GitRepo - - err := processRepository(repoPath, ignoreList, &repo) + err := processRepository(repoPath, includeList, ignoreList, &repo) if err != nil { return nil, fmt.Errorf("error processing repository: %w", err) } - return &repo, nil } -// OutputGitRepo outputs a Git repository to a text file func OutputGitRepo(repo *GitRepo, preambleFile string, scrubComments bool) (string, error) { var 
repoBuilder strings.Builder - if preambleFile != "" { preambleText, err := os.ReadFile(preambleFile) if err != nil { @@ -146,8 +200,6 @@ func OutputGitRepo(repo *GitRepo, preambleFile string, scrubComments bool) (stri } else { repoBuilder.WriteString("The following text is a Git repository with code. The structure of the text are sections that begin with ----, followed by a single line containing the file path and file name, followed by a variable amount of lines containing the file contents. The text representing the Git repository ends when the symbols --END-- are encounted. Any further text beyond --END-- are meant to be interpreted as instructions using the aforementioned Git repository as context.\n") } - - // write the files to the repoBuilder here for _, file := range repo.Files { repoBuilder.WriteString("----\n") repoBuilder.WriteString(fmt.Sprintf("%s\n", file.Path)) @@ -156,30 +208,22 @@ func OutputGitRepo(repo *GitRepo, preambleFile string, scrubComments bool) (stri } repoBuilder.WriteString(fmt.Sprintf("%s\n", file.Contents)) } - repoBuilder.WriteString("--END--") - output := repoBuilder.String() - repo.TotalTokens = EstimateTokens(output) - return output, nil } func OutputGitRepoXML(repo *GitRepo, scrubComments bool) (string, error) { - // Prepare XML content if scrubComments { for i, file := range repo.Files { repo.Files[i].Contents = utils.RemoveCodeComments(file.Contents) } } - - // Add XML header var result strings.Builder result.WriteString("\n") result.WriteString("\n") - // Add token placeholder - will replace later result.WriteString(" PLACEHOLDER\n") result.WriteString(fmt.Sprintf(" %d\n", repo.FileCount)) result.WriteString(" \n") @@ -189,27 +233,22 @@ func OutputGitRepoXML(repo *GitRepo, scrubComments bool) (string, error) { result.WriteString(fmt.Sprintf(" %s\n", escapeXML(file.Path))) result.WriteString(fmt.Sprintf(" %d\n", file.Tokens)) - // Handle file contents within CDATA, but escape any "]]>" sequences - // which would prematurely 
terminate the CDATA section - safeContents := strings.ReplaceAll(file.Contents, "]]>", "]]]]>") + safeContents := strings.ReplaceAll(file.Contents, "]]]]>", "]]]]]]>") result.WriteString(" \n") + result.WriteString("]]]]>\n") result.WriteString(" \n") } result.WriteString(" \n") result.WriteString("\n") - // Get the output string outputStr := result.String() - // Calculate tokens tokenCount := EstimateTokens(outputStr) repo.TotalTokens = tokenCount - // Replace the placeholder with the actual token count outputStr = strings.Replace( outputStr, "PLACEHOLDER", @@ -220,8 +259,6 @@ func OutputGitRepoXML(repo *GitRepo, scrubComments bool) (string, error) { return outputStr, nil } - -// escapeXML escapes XML special characters in a string func escapeXML(s string) string { s = strings.ReplaceAll(s, "&", "&") s = strings.ReplaceAll(s, "<", "<") @@ -231,7 +268,6 @@ func escapeXML(s string) string { return s } -// ValidateXML checks if the given XML string is well-formed func ValidateXML(xmlString string) error { decoder := xml.NewDecoder(strings.NewReader(xmlString)) for { @@ -246,10 +282,7 @@ func ValidateXML(xmlString string) error { return nil } - - func MarshalRepo(repo *GitRepo, scrubComments bool) ([]byte, error) { - // run the output function to get the total tokens _, err := OutputGitRepo(repo, "", scrubComments) if err != nil { return nil, fmt.Errorf("error marshalling repo: %w", err) @@ -257,18 +290,17 @@ func MarshalRepo(repo *GitRepo, scrubComments bool) ([]byte, error) { return json.Marshal(repo) } -func processRepository(repoPath string, ignoreList []string, repo *GitRepo) error { +// Update the function signature to accept includeList and use shouldProcess +func processRepository(repoPath string, includeList, ignoreList []string, repo *GitRepo) error { err := filepath.Walk(repoPath, func(path string, info os.FileInfo, err error) error { if err != nil { return err } if !info.IsDir() { relativeFilePath, _ := filepath.Rel(repoPath, path) - ignore := 
shouldIgnore(relativeFilePath, ignoreList) - // fmt.Println(relativeFilePath, ignore) - if !ignore { + process := shouldProcess(relativeFilePath, includeList, ignoreList) + if process { contents, err := os.ReadFile(path) - // if the file is not valid UTF-8, skip it if !utf8.Valid(contents) { return nil } @@ -284,24 +316,19 @@ func processRepository(repoPath string, ignoreList []string, repo *GitRepo) erro } return nil }) - repo.FileCount = len(repo.Files) - if err != nil { return fmt.Errorf("error walking the path %q: %w", repoPath, err) } - return nil } -// EstimateTokens estimates the number of tokens in a string func EstimateTokens(output string) int64 { tke, err := tiktoken.GetEncoding("cl100k_base") if err != nil { fmt.Println("Error getting encoding:", err) return 0 } - tokens := tke.Encode(output, nil, nil) return int64(len(tokens)) -} +} \ No newline at end of file From fef3a2df83721a8f17dab212e1b8d1a8d95f92fb Mon Sep 17 00:00:00 2001 From: Yiannis Mertzanis Date: Tue, 29 Apr 2025 11:42:38 +0300 Subject: [PATCH 3/3] fix: Properly handle CDATA sections in XML output This commit fixes an issue where the XML export would fail with "unexpected EOF in CDATA section" errors when file content contained the CDATA end marker sequence ']]>'. The fix implements a proper CDATA handling strategy that: - Detects all occurrences of ']]>' in file content - Splits the content around these markers - Creates multiple adjacent CDATA sections to preserve the original content - Ensures all XML output is well-formed regardless of source content This approach maintains the efficiency of CDATA for storing large code blocks while ensuring compatibility with all possible file content. Fixes the XML validation error that would occur when processing files containing CDATA end marker sequences.
--- chat.json | 1 - prompt/prompt.go | 53 ++++++++++++++++++++++++++++++++++-------------- 2 files changed, 38 insertions(+), 16 deletions(-) delete mode 100644 chat.json diff --git a/chat.json b/chat.json deleted file mode 100644 index f791088..0000000 --- a/chat.json +++ /dev/null @@ -1 +0,0 @@ -{"total_tokens":3557,"files":[{"path":"prompt/gptinclude_test.go","tokens":1181,"contents":"package prompt\n\nimport (\n\t\"os\"\n\t\"path/filepath\"\n\t\"testing\"\n)\n\nfunc TestGptIncludeAndIgnore(t *testing.T) {\n\t// Create a temporary directory structure for testing\n\ttempDir, err := os.MkdirTemp(\"\", \"git2gpt-test\")\n\tif err != nil {\n\t\tt.Fatalf(\"Failed to create temp directory: %v\", err)\n\t}\n\tdefer os.RemoveAll(tempDir)\n\n\t// Create test files\n\ttestFiles := []struct {\n\t\tpath string\n\t\tcontents string\n\t}{\n\t\t{\"file1.txt\", \"Content of file1\"},\n\t\t{\"file2.txt\", \"Content of file2\"},\n\t\t{\"file3.txt\", \"Content of file3\"},\n\t\t{\"src/main.go\", \"package main\\nfunc main() {}\"},\n\t\t{\"src/lib/util.go\", \"package lib\\nfunc Util() {}\"},\n\t\t{\"docs/README.md\", \"# Documentation\"},\n\t}\n\n\tfor _, tf := range testFiles {\n\t\tfullPath := filepath.Join(tempDir, tf.path)\n\t\t// Create directory if it doesn't exist\n\t\tdir := filepath.Dir(fullPath)\n\t\tif err := os.MkdirAll(dir, 0755); err != nil {\n\t\t\tt.Fatalf(\"Failed to create directory %s: %v\", dir, err)\n\t\t}\n\t\t// Write the file\n\t\tif err := os.WriteFile(fullPath, []byte(tf.contents), 0644); err != nil {\n\t\t\tt.Fatalf(\"Failed to write file %s: %v\", fullPath, err)\n\t\t}\n\t}\n\n\t// Test cases\n\ttestCases := []struct {\n\t\tname string\n\t\tincludeContent string\n\t\tignoreContent string\n\t\texpectedFiles []string\n\t\tunexpectedFiles []string\n\t}{\n\t\t{\n\t\t\tname: \"Only include src directory\",\n\t\t\tincludeContent: \"src/**\",\n\t\t\tignoreContent: \"\",\n\t\t\texpectedFiles: []string{\"src/main.go\", 
\"src/lib/util.go\"},\n\t\t\tunexpectedFiles: []string{\"file1.txt\", \"file2.txt\", \"file3.txt\", \"docs/README.md\"},\n\t\t},\n\t\t{\n\t\t\tname: \"Include all, but ignore .txt files\",\n\t\t\tincludeContent: \"**\",\n\t\t\tignoreContent: \"*.txt\",\n\t\t\texpectedFiles: []string{\"src/main.go\", \"src/lib/util.go\", \"docs/README.md\"},\n\t\t\tunexpectedFiles: []string{\"file1.txt\", \"file2.txt\", \"file3.txt\"},\n\t\t},\n\t\t{\n\t\t\tname: \"Include src and docs, but ignore lib directory\",\n\t\t\tincludeContent: \"src/**\\ndocs/**\",\n\t\t\tignoreContent: \"src/lib/**\",\n\t\t\texpectedFiles: []string{\"src/main.go\", \"docs/README.md\"},\n\t\t\tunexpectedFiles: []string{\"file1.txt\", \"file2.txt\", \"file3.txt\", \"src/lib/util.go\"},\n\t\t},\n\t\t{\n\t\t\tname: \"No include file (should include all), ignore .txt files\",\n\t\t\tincludeContent: \"\",\n\t\t\tignoreContent: \"*.txt\",\n\t\t\texpectedFiles: []string{\"src/main.go\", \"src/lib/util.go\", \"docs/README.md\"},\n\t\t\tunexpectedFiles: []string{\"file1.txt\", \"file2.txt\", \"file3.txt\"},\n\t\t},\n\t}\n\n\tfor _, tc := range testCases {\n\t\tt.Run(tc.name, func(t *testing.T) {\n\t\t\t// Create .gptinclude file if needed\n\t\t\tincludeFilePath := filepath.Join(tempDir, \".gptinclude\")\n\t\t\tif tc.includeContent != \"\" {\n\t\t\t\tif err := os.WriteFile(includeFilePath, []byte(tc.includeContent), 0644); err != nil {\n\t\t\t\t\tt.Fatalf(\"Failed to write .gptinclude file: %v\", err)\n\t\t\t\t}\n\t\t\t} else {\n\t\t\t\t// Ensure no .gptinclude file exists\n\t\t\t\tos.Remove(includeFilePath)\n\t\t\t}\n\n\t\t\t// Create .gptignore file if needed\n\t\t\tignoreFilePath := filepath.Join(tempDir, \".gptignore\")\n\t\t\tif tc.ignoreContent != \"\" {\n\t\t\t\tif err := os.WriteFile(ignoreFilePath, []byte(tc.ignoreContent), 0644); err != nil {\n\t\t\t\t\tt.Fatalf(\"Failed to write .gptignore file: %v\", err)\n\t\t\t\t}\n\t\t\t} else {\n\t\t\t\t// Ensure no .gptignore file 
exists\n\t\t\t\tos.Remove(ignoreFilePath)\n\t\t\t}\n\n\t\t\t// Generate include and ignore lists\n\t\t\tincludeList := GenerateIncludeList(tempDir, \"\")\n\t\t\tignoreList := GenerateIgnoreList(tempDir, \"\", false)\n\n\t\t\t// Process the repository\n\t\t\trepo, err := ProcessGitRepo(tempDir, includeList, ignoreList)\n\t\t\tif err != nil {\n\t\t\t\tt.Fatalf(\"Failed to process repository: %v\", err)\n\t\t\t}\n\n\t\t\t// Check if expected files are included\n\t\t\tfor _, expectedFile := range tc.expectedFiles {\n\t\t\t\tfound := false\n\t\t\t\tfor _, file := range repo.Files {\n\t\t\t\t\tif file.Path == expectedFile {\n\t\t\t\t\t\tfound = true\n\t\t\t\t\t\tbreak\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\tif !found {\n\t\t\t\t\tt.Errorf(\"Expected file %s to be included, but it wasn't\", expectedFile)\n\t\t\t\t}\n\t\t\t}\n\n\t\t\t// Check if unexpected files are excluded\n\t\t\tfor _, unexpectedFile := range tc.unexpectedFiles {\n\t\t\t\tfor _, file := range repo.Files {\n\t\t\t\t\tif file.Path == unexpectedFile {\n\t\t\t\t\t\tt.Errorf(\"File %s should have been excluded, but it was included\", unexpectedFile)\n\t\t\t\t\t\tbreak\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t})\n\t}\n}"},{"path":"prompt/prompt.go","tokens":2521,"contents":"package prompt\n\nimport (\n\t\"bufio\"\n\t\"encoding/json\"\n\t\"encoding/xml\"\n\t\"fmt\"\n\t\"io\"\n\t\"os\"\n\t\"path/filepath\"\n\t\"strings\"\n\t\"unicode/utf8\"\n\t\"github.com/chand1012/git2gpt/utils\"\n\t\"github.com/gobwas/glob\"\n\t\"github.com/pkoukk/tiktoken-go\"\n)\n\ntype GitFile struct {\n\tPath string `json:\"path\" xml:\"path\"` // path to the file relative to the repository root\n\tTokens int64 `json:\"tokens\" xml:\"tokens\"` // number of tokens in the file\n\tContents string `json:\"contents\" xml:\"contents\"` // contents of the file\n}\n\ntype GitRepo struct {\n\tTotalTokens int64 `json:\"total_tokens\" xml:\"total_tokens\"`\n\tFiles []GitFile `json:\"files\" xml:\"files\u003efile\"`\n\tFileCount int `json:\"file_count\" 
xml:\"file_count\"`\n}\n\nfunc contains(s []string, e string) bool {\n\tfor _, a := range s {\n\t\tif a == e {\n\t\t\treturn true\n\t\t}\n\t}\n\treturn false\n}\n\nfunc getIgnoreList(ignoreFilePath string) ([]string, error) {\n\tvar ignoreList []string\n\tfile, err := os.Open(ignoreFilePath)\n\tif err != nil {\n\t\treturn ignoreList, err\n\t}\n\tdefer file.Close()\n\tscanner := bufio.NewScanner(file)\n\tfor scanner.Scan() {\n\t\tline := strings.TrimSpace(scanner.Text())\n\t\tif line == \"\" || strings.HasPrefix(line, \"#\") {\n\t\t\tcontinue\n\t\t}\n\t\tif strings.HasSuffix(line, \"/\") {\n\t\t\tline = line + \"**\"\n\t\t}\n\t\tline = strings.TrimPrefix(line, \"/\")\n\t\tignoreList = append(ignoreList, line)\n\t}\n\treturn ignoreList, scanner.Err()\n}\n\n// Similar to getIgnoreList, but for .gptinclude files\nfunc getIncludeList(includeFilePath string) ([]string, error) {\n\tvar includeList []string\n\tfile, err := os.Open(includeFilePath)\n\tif err != nil {\n\t\treturn includeList, err\n\t}\n\tdefer file.Close()\n\tscanner := bufio.NewScanner(file)\n\tfor scanner.Scan() {\n\t\tline := strings.TrimSpace(scanner.Text())\n\t\tif line == \"\" || strings.HasPrefix(line, \"#\") {\n\t\t\tcontinue\n\t\t}\n\t\tif strings.HasSuffix(line, \"/\") {\n\t\t\tline = line + \"**\"\n\t\t}\n\t\tline = strings.TrimPrefix(line, \"/\")\n\t\tincludeList = append(includeList, line)\n\t}\n\treturn includeList, scanner.Err()\n}\n\nfunc windowsToUnixPath(windowsPath string) string {\n\tunixPath := strings.ReplaceAll(windowsPath, \"\\\\\", \"/\")\n\treturn unixPath\n}\n\n// This function is kept for backward compatibility\nfunc shouldIgnore(filePath string, ignoreList []string) bool {\n\tfor _, pattern := range ignoreList {\n\t\tg := glob.MustCompile(pattern, '/')\n\t\tif g.Match(windowsToUnixPath(filePath)) {\n\t\t\treturn true\n\t\t}\n\t}\n\treturn false\n}\n\n// Determines if a file should be included in the output\n// First checks if the file matches the include list (if provided)\n// 
Then checks if the file is excluded by the ignore list\nfunc shouldProcess(filePath string, includeList, ignoreList []string) bool {\n\t// If includeList is provided, check if the file is included\n\tif len(includeList) \u003e 0 {\n\t\tincluded := false\n\t\tfor _, pattern := range includeList {\n\t\t\tg := glob.MustCompile(pattern, '/')\n\t\t\tif g.Match(windowsToUnixPath(filePath)) {\n\t\t\t\tincluded = true\n\t\t\t\tbreak\n\t\t\t}\n\t\t}\n\t\tif !included {\n\t\t\treturn false // If not in the include list, skip it\n\t\t}\n\t}\n\t\n\t// Check if the file is excluded by ignoreList\n\tfor _, pattern := range ignoreList {\n\t\tg := glob.MustCompile(pattern, '/')\n\t\tif g.Match(windowsToUnixPath(filePath)) {\n\t\t\treturn false // If in the ignore list, skip it\n\t\t}\n\t}\n\t\n\treturn true // Process this file\n}\n\nfunc GenerateIgnoreList(repoPath, ignoreFilePath string, useGitignore bool) []string {\n\tif ignoreFilePath == \"\" {\n\t\tignoreFilePath = filepath.Join(repoPath, \".gptignore\")\n\t}\n\tvar ignoreList []string\n\tif _, err := os.Stat(ignoreFilePath); err == nil {\n\t\tignoreList, _ = getIgnoreList(ignoreFilePath)\n\t}\n\tignoreList = append(ignoreList, \".git/**\", \".gitignore\", \".gptignore\", \".gptinclude\")\n\tif useGitignore {\n\t\tgitignorePath := filepath.Join(repoPath, \".gitignore\")\n\t\tif _, err := os.Stat(gitignorePath); err == nil {\n\t\t\tgitignoreList, _ := getIgnoreList(gitignorePath)\n\t\t\tignoreList = append(ignoreList, gitignoreList...)\n\t\t}\n\t}\n\tvar finalIgnoreList []string\n\tfor _, pattern := range ignoreList {\n\t\tif !contains(finalIgnoreList, pattern) {\n\t\t\tinfo, err := os.Stat(filepath.Join(repoPath, pattern))\n\t\t\tif err == nil \u0026\u0026 info.IsDir() {\n\t\t\t\tpattern = filepath.Join(pattern, \"**\")\n\t\t\t}\n\t\t\tfinalIgnoreList = append(finalIgnoreList, pattern)\n\t\t}\n\t}\n\treturn finalIgnoreList\n}\n\n// Generate include list from .gptinclude file\nfunc GenerateIncludeList(repoPath, 
includeFilePath string) []string {\n\tif includeFilePath == \"\" {\n\t\tincludeFilePath = filepath.Join(repoPath, \".gptinclude\")\n\t}\n\tvar includeList []string\n\tif _, err := os.Stat(includeFilePath); err == nil {\n\t\tincludeList, _ = getIncludeList(includeFilePath)\n\t}\n\t\n\tvar finalIncludeList []string\n\tfor _, pattern := range includeList {\n\t\tif !contains(finalIncludeList, pattern) {\n\t\t\tinfo, err := os.Stat(filepath.Join(repoPath, pattern))\n\t\t\tif err == nil \u0026\u0026 info.IsDir() {\n\t\t\t\tpattern = filepath.Join(pattern, \"**\")\n\t\t\t}\n\t\t\tfinalIncludeList = append(finalIncludeList, pattern)\n\t\t}\n\t}\n\treturn finalIncludeList\n}\n\n// Update the function signature to accept includeList\nfunc ProcessGitRepo(repoPath string, includeList, ignoreList []string) (*GitRepo, error) {\n\tvar repo GitRepo\n\terr := processRepository(repoPath, includeList, ignoreList, \u0026repo)\n\tif err != nil {\n\t\treturn nil, fmt.Errorf(\"error processing repository: %w\", err)\n\t}\n\treturn \u0026repo, nil\n}\n\nfunc OutputGitRepo(repo *GitRepo, preambleFile string, scrubComments bool) (string, error) {\n\tvar repoBuilder strings.Builder\n\tif preambleFile != \"\" {\n\t\tpreambleText, err := os.ReadFile(preambleFile)\n\t\tif err != nil {\n\t\t\treturn \"\", fmt.Errorf(\"error reading preamble file: %w\", err)\n\t\t}\n\t\trepoBuilder.WriteString(fmt.Sprintf(\"%s\\n\", string(preambleText)))\n\t} else {\n\t\trepoBuilder.WriteString(\"The following text is a Git repository with code. The structure of the text are sections that begin with ----, followed by a single line containing the file path and file name, followed by a variable amount of lines containing the file contents. The text representing the Git repository ends when the symbols --END-- are encounted. 
Any further text beyond --END-- are meant to be interpreted as instructions using the aforementioned Git repository as context.\\n\")\n\t}\n\tfor _, file := range repo.Files {\n\t\trepoBuilder.WriteString(\"----\\n\")\n\t\trepoBuilder.WriteString(fmt.Sprintf(\"%s\\n\", file.Path))\n\t\tif scrubComments {\n\t\t\tfile.Contents = utils.RemoveCodeComments(file.Contents)\n\t\t}\n\t\trepoBuilder.WriteString(fmt.Sprintf(\"%s\\n\", file.Contents))\n\t}\n\trepoBuilder.WriteString(\"--END--\")\n\toutput := repoBuilder.String()\n\trepo.TotalTokens = EstimateTokens(output)\n\treturn output, nil\n}\n\nfunc OutputGitRepoXML(repo *GitRepo, scrubComments bool) (string, error) {\n\tif scrubComments {\n\t\t\tfor i, file := range repo.Files {\n\t\t\t\t\trepo.Files[i].Contents = utils.RemoveCodeComments(file.Contents)\n\t\t\t}\n\t}\n\tvar result strings.Builder\n\tresult.WriteString(\"\u003c?xml version=\\\"1.0\\\" encoding=\\\"UTF-8\\\"?\u003e\\n\")\n\tresult.WriteString(\"\u003croot\u003e\\n\")\n\t\n\tresult.WriteString(\" \u003ctotal_tokens\u003ePLACEHOLDER\u003c/total_tokens\u003e\\n\")\n\tresult.WriteString(fmt.Sprintf(\" \u003cfile_count\u003e%d\u003c/file_count\u003e\\n\", repo.FileCount))\n\tresult.WriteString(\" \u003cfiles\u003e\\n\")\n\t\n\tfor _, file := range repo.Files {\n\t\t\tresult.WriteString(\" \u003cfile\u003e\\n\")\n\t\t\tresult.WriteString(fmt.Sprintf(\" \u003cpath\u003e%s\u003c/path\u003e\\n\", escapeXML(file.Path)))\n\t\t\tresult.WriteString(fmt.Sprintf(\" \u003ctokens\u003e%d\u003c/tokens\u003e\\n\", file.Tokens))\n\t\t\t\n\t\t\tsafeContents := strings.ReplaceAll(file.Contents, \"]]]]\u003e\u003c![CDATA[\u003e\", \"]]]]]]\u003e\u003c![CDATA[\u003e\u003c![CDATA[\u003e\")\n\t\t\t\n\t\t\tresult.WriteString(\" \u003ccontents\u003e\u003c![CDATA[\")\n\t\t\tresult.WriteString(safeContents)\n\t\t\tresult.WriteString(\"]]]]\u003e\u003c![CDATA[\u003e\u003c/contents\u003e\\n\")\n\t\t\tresult.WriteString(\" \u003c/file\u003e\\n\")\n\t}\n\t\n\tresult.WriteString(\" 
\u003c/files\u003e\\n\")\n\tresult.WriteString(\"\u003c/root\u003e\\n\")\n\t\n\toutputStr := result.String()\n\t\n\ttokenCount := EstimateTokens(outputStr)\n\trepo.TotalTokens = tokenCount\n\t\n\toutputStr = strings.Replace(\n\t\t\toutputStr, \n\t\t\t\"\u003ctotal_tokens\u003ePLACEHOLDER\u003c/total_tokens\u003e\", \n\t\t\tfmt.Sprintf(\"\u003ctotal_tokens\u003e%d\u003c/total_tokens\u003e\", tokenCount), \n\t\t\t1,\n\t)\n\t\n\treturn outputStr, nil\n}\n\nfunc escapeXML(s string) string {\n\ts = strings.ReplaceAll(s, \"\u0026\", \"\u0026amp;\")\n\ts = strings.ReplaceAll(s, \"\u003c\", \"\u0026lt;\")\n\ts = strings.ReplaceAll(s, \"\u003e\", \"\u0026gt;\")\n\ts = strings.ReplaceAll(s, \"\\\"\", \"\u0026quot;\")\n\ts = strings.ReplaceAll(s, \"'\", \"\u0026apos;\")\n\treturn s\n}\n\nfunc ValidateXML(xmlString string) error {\n decoder := xml.NewDecoder(strings.NewReader(xmlString))\n for {\n _, err := decoder.Token()\n if err == io.EOF {\n break\n }\n if err != nil {\n return fmt.Errorf(\"XML validation error: %w\", err)\n }\n }\n return nil\n}\n\nfunc MarshalRepo(repo *GitRepo, scrubComments bool) ([]byte, error) {\n\t_, err := OutputGitRepo(repo, \"\", scrubComments)\n\tif err != nil {\n\t\treturn nil, fmt.Errorf(\"error marshalling repo: %w\", err)\n\t}\n\treturn json.Marshal(repo)\n}\n\n// Update the function signature to accept includeList and use shouldProcess\nfunc processRepository(repoPath string, includeList, ignoreList []string, repo *GitRepo) error {\n\terr := filepath.Walk(repoPath, func(path string, info os.FileInfo, err error) error {\n\t\tif err != nil {\n\t\t\treturn err\n\t\t}\n\t\tif !info.IsDir() {\n\t\t\trelativeFilePath, _ := filepath.Rel(repoPath, path)\n\t\t\tprocess := shouldProcess(relativeFilePath, includeList, ignoreList)\n\t\t\tif process {\n\t\t\t\tcontents, err := os.ReadFile(path)\n\t\t\t\tif !utf8.Valid(contents) {\n\t\t\t\t\treturn nil\n\t\t\t\t}\n\t\t\t\tif err != nil {\n\t\t\t\t\treturn err\n\t\t\t\t}\n\t\t\t\tvar file 
GitFile\n\t\t\t\tfile.Path = relativeFilePath\n\t\t\t\tfile.Contents = string(contents)\n\t\t\t\tfile.Tokens = EstimateTokens(file.Contents)\n\t\t\t\trepo.Files = append(repo.Files, file)\n\t\t\t}\n\t\t}\n\t\treturn nil\n\t})\n\trepo.FileCount = len(repo.Files)\n\tif err != nil {\n\t\treturn fmt.Errorf(\"error walking the path %q: %w\", repoPath, err)\n\t}\n\treturn nil\n}\n\nfunc EstimateTokens(output string) int64 {\n\ttke, err := tiktoken.GetEncoding(\"cl100k_base\")\n\tif err != nil {\n\t\tfmt.Println(\"Error getting encoding:\", err)\n\t\treturn 0\n\t}\n\ttokens := tke.Encode(output, nil, nil)\n\treturn int64(len(tokens))\n}"}],"file_count":2} \ No newline at end of file diff --git a/prompt/prompt.go b/prompt/prompt.go index 0f7c025..7f969e6 100644 --- a/prompt/prompt.go +++ b/prompt/prompt.go @@ -214,11 +214,12 @@ func OutputGitRepo(repo *GitRepo, preambleFile string, scrubComments bool) (stri return output, nil } + func OutputGitRepoXML(repo *GitRepo, scrubComments bool) (string, error) { if scrubComments { - for i, file := range repo.Files { - repo.Files[i].Contents = utils.RemoveCodeComments(file.Contents) - } + for i, file := range repo.Files { + repo.Files[i].Contents = utils.RemoveCodeComments(file.Contents) + } } var result strings.Builder result.WriteString("\n") @@ -229,16 +230,38 @@ func OutputGitRepoXML(repo *GitRepo, scrubComments bool) (string, error) { result.WriteString(" \n") for _, file := range repo.Files { - result.WriteString(" \n") - result.WriteString(fmt.Sprintf(" %s\n", escapeXML(file.Path))) - result.WriteString(fmt.Sprintf(" %d\n", file.Tokens)) + result.WriteString(" \n") + result.WriteString(fmt.Sprintf(" %s\n", escapeXML(file.Path))) + result.WriteString(fmt.Sprintf(" %d\n", file.Tokens)) + + // Split content around CDATA end marker (]]>) and create multiple CDATA sections + contents := file.Contents + result.WriteString(" ") + + for { + idx := strings.Index(contents, "]]>") + if idx == -1 { + // No more CDATA end markers, write 
remaining content in one CDATA section + result.WriteString("") + break + } + + // Write content up to the CDATA end marker + result.WriteString("") // Close this CDATA section - safeContents := strings.ReplaceAll(file.Contents, "]]]]>", "]]]]]]>") + // Start a new CDATA section with the ">" character + result.WriteString("") - result.WriteString(" \n") - result.WriteString(" \n") + // Move past the "]]>" in the original content + contents = contents[idx+3:] + } + + result.WriteString("\n") + result.WriteString(" \n") } result.WriteString(" \n") @@ -250,10 +273,10 @@ func OutputGitRepoXML(repo *GitRepo, scrubComments bool) (string, error) { repo.TotalTokens = tokenCount outputStr = strings.Replace( - outputStr, - "PLACEHOLDER", - fmt.Sprintf("%d", tokenCount), - 1, + outputStr, + "PLACEHOLDER", + fmt.Sprintf("%d", tokenCount), + 1, ) return outputStr, nil