diff --git a/pkg/utils/file_filter.go b/pkg/utils/file_filter.go index 96930b650..c1a935811 100644 --- a/pkg/utils/file_filter.go +++ b/pkg/utils/file_filter.go @@ -10,6 +10,7 @@ import ( "strings" "time" + "github.com/go-git/go-git/v5" "github.com/rs/zerolog" gitignore "github.com/sabhiram/go-gitignore" "golang.org/x/sync/semaphore" @@ -112,6 +113,10 @@ func (fw *FileFilter) GetFilteredFiles(filesCh chan string, globs []string) chan // create pattern matcher used to match filesToFilter to glob patterns globPatternMatcher := gitignore.CompileIgnoreLines(globs...) + + // get git-tracked files (relative paths) and repo root to avoid filtering them out + gitTrackedFiles, repoRoot := fw.getGitTrackedFiles() + go func() { ctx := context.Background() availableThreads := semaphore.NewWeighted(fw.max_threads) @@ -126,6 +131,11 @@ func (fw *FileFilter) GetFilteredFiles(filesCh chan string, globs []string) chan } go func(f string) { defer availableThreads.Release(1) + // files tracked in git should not be filtered, even if they match gitignore patterns + if fw.isGitTracked(f, gitTrackedFiles, repoRoot) { + filteredFilesCh <- f + return + } // filesToFilter that do not match the glob pattern are filtered if !globPatternMatcher.MatchesPath(f) { filteredFilesCh <- f @@ -143,6 +153,70 @@ func (fw *FileFilter) GetFilteredFiles(filesCh chan string, globs []string) chan return filteredFilesCh } +// isGitTracked checks if a file path is tracked in git by computing its path relative to the repo root +func (fw *FileFilter) isGitTracked(filePath string, trackedFiles map[string]bool, repoRoot string) bool { + if len(trackedFiles) == 0 || repoRoot == "" { + return false + } + + // convert file path to absolute to ensure consistent comparison + absPath, err := filepath.Abs(filePath) + if err != nil { + return false + } + + // compute path relative to repo root + relPath, err := filepath.Rel(repoRoot, absPath) + if err != nil { + return false + } + + // normalize to forward slashes (git index uses forward slashes) + relPath = filepath.ToSlash(relPath) + + // check if this relative path is in the tracked files + return trackedFiles[relPath] +} + +// getGitTrackedFiles returns a map of relative file paths (using forward slashes) that are tracked in git, +// along with the repository root path. Using relative paths reduces memory usage in large repos. +func (fw *FileFilter) getGitTrackedFiles() (map[string]bool, string) { + trackedFiles := make(map[string]bool) + + // open the git repository + repo, err := git.PlainOpenWithOptions(fw.path, &git.PlainOpenOptions{ + DetectDotGit: true, + }) + if err != nil { + fw.logger.Debug().Msgf("failed to open git repository: %v", err) + return trackedFiles, "" + } + + // get the worktree to find the root path and access the index + worktree, err := repo.Worktree() + if err != nil { + fw.logger.Debug().Msgf("failed to get worktree: %v", err) + return trackedFiles, "" + } + repoRoot := worktree.Filesystem.Root() + + // get the index (staging area) - this contains all tracked files + // A file is tracked in git once it's added to the index, even before commit + idx, err := repo.Storer.Index() + if err != nil { + fw.logger.Debug().Msgf("failed to get git index: %v", err) + return trackedFiles, "" + } + + // store relative paths (as they appear in git index) - uses forward slashes + // this reduces memory usage compared to storing full absolute paths + for _, entry := range idx.Entries { + trackedFiles[entry.Name] = true + } + + return trackedFiles, repoRoot +} + // buildGlobs iterates a list of ignore filesToFilter and returns a list of glob patterns that can be used to test for ignored filesToFilter func (fw *FileFilter) buildGlobs(ignoreFiles []string) ([]string, error) { var globs = make([]string, 0) diff --git a/pkg/utils/file_filter_test.go b/pkg/utils/file_filter_test.go index 3764f8642..57a3899ef 100644 --- a/pkg/utils/file_filter_test.go +++ b/pkg/utils/file_filter_test.go @@ -7,7 +7,10 @@ import ( "runtime" "strings" "testing" + "time" + "github.com/go-git/go-git/v5" + "github.com/go-git/go-git/v5/plumbing/object" "github.com/rs/zerolog/log" "github.com/stretchr/testify/assert" ) @@ -707,3 +710,124 @@ func TestDotSnykExclude_isExpired(t *testing.T) { }) } } + +func TestFileFilter_GitTrackedFilesNotFiltered(t *testing.T) { + t.Run("committed files are not filtered even if they match gitignore", func(t *testing.T) { + tempDir := t.TempDir() + + // Initialize a git repository + repo, err := git.PlainInit(tempDir, false) + assert.NoError(t, err) + + // Create a file that will be tracked and matches gitignore pattern + trackedIgnoredFile := filepath.Join(tempDir, "ignored.log") + createFileInPath(t, trackedIgnoredFile, []byte("tracked but ignored")) + + // Create another file that will NOT be tracked and matches gitignore pattern + untrackedIgnoredFile := filepath.Join(tempDir, "untracked.log") + createFileInPath(t, untrackedIgnoredFile, []byte("untracked and ignored")) + + // Create a regular file that doesn't match gitignore + regularFile := filepath.Join(tempDir, "regular.txt") + createFileInPath(t, regularFile, []byte("regular file")) + + // Add and commit the tracked file BEFORE creating gitignore + worktree, err := repo.Worktree() + assert.NoError(t, err) + + _, err = worktree.Add("ignored.log") + assert.NoError(t, err) + _, err = worktree.Add("regular.txt") + assert.NoError(t, err) + + _, err = worktree.Commit("initial commit", &git.CommitOptions{ + Author: &object.Signature{ + Name: "Test", + Email: "test@test.com", + When: time.Now(), + }, + }) + assert.NoError(t, err) + + // Now create .gitignore that ignores *.log files + gitignorePath := filepath.Join(tempDir, ".gitignore") + createFileInPath(t, gitignorePath, []byte("*.log\n")) + + // Create file filter and get filtered files + fileFilter := NewFileFilter(tempDir, &log.Logger) + rules, err := fileFilter.GetRules([]string{".gitignore"}) + assert.NoError(t, err) + + allFiles := fileFilter.GetAllFiles() + filteredFiles := fileFilter.GetFilteredFiles(allFiles, rules) + + // Collect filtered files + var filteredFilesList []string + for file := range filteredFiles { + filteredFilesList = append(filteredFilesList, file) + } + + // The tracked file (ignored.log) should NOT be filtered out + assert.Contains(t, filteredFilesList, trackedIgnoredFile, "git tracked file should not be filtered even if it matches gitignore") + + // The untracked file (untracked.log) SHOULD be filtered out + assert.NotContains(t, filteredFilesList, untrackedIgnoredFile, "untracked file matching gitignore should be filtered") + + // The regular file should be present + assert.Contains(t, filteredFilesList, regularFile, "regular file should not be filtered") + + // The gitignore file should be present + assert.Contains(t, filteredFilesList, gitignorePath, ".gitignore should not be filtered") + }) + + t.Run("staged but uncommitted files are not filtered even if they match gitignore", func(t *testing.T) { + tempDir := t.TempDir() + + // Initialize a git repository + repo, err := git.PlainInit(tempDir, false) + assert.NoError(t, err) + + // Create .gitignore first + gitignorePath := filepath.Join(tempDir, ".gitignore") + createFileInPath(t, gitignorePath, []byte("*.log\n")) + + // Create a file that matches gitignore pattern but will be staged (git add) + stagedIgnoredFile := filepath.Join(tempDir, "staged.log") + createFileInPath(t, stagedIgnoredFile, []byte("staged but ignored")) + + // Create another file that will NOT be staged and matches gitignore pattern + unstagedIgnoredFile := filepath.Join(tempDir, "unstaged.log") + createFileInPath(t, unstagedIgnoredFile, []byte("unstaged and ignored")) + + // Stage the file (git add) but do NOT commit + worktree, err := repo.Worktree() + assert.NoError(t, err) + + _, err = worktree.Add("staged.log") + assert.NoError(t, err) + // Note: we do NOT commit here - the file is only staged + + // Create file filter and get filtered files + fileFilter := NewFileFilter(tempDir, &log.Logger) + rules, err := fileFilter.GetRules([]string{".gitignore"}) + assert.NoError(t, err) + + allFiles := fileFilter.GetAllFiles() + filteredFiles := fileFilter.GetFilteredFiles(allFiles, rules) + + // Collect filtered files + var filteredFilesList []string + for file := range filteredFiles { + filteredFilesList = append(filteredFilesList, file) + } + + // The staged file (staged.log) should NOT be filtered out - it's in the index + assert.Contains(t, filteredFilesList, stagedIgnoredFile, "staged file should not be filtered even if it matches gitignore") + + // The unstaged file (unstaged.log) SHOULD be filtered out + assert.NotContains(t, filteredFilesList, unstagedIgnoredFile, "unstaged file matching gitignore should be filtered") + + // The gitignore file should be present + assert.Contains(t, filteredFilesList, gitignorePath, ".gitignore should not be filtered") + }) +}