Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions pkg/utils/file_filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"strings"
"time"

"github.com/go-git/go-git/v5"
"github.com/rs/zerolog"
gitignore "github.com/sabhiram/go-gitignore"
"golang.org/x/sync/semaphore"
Expand Down Expand Up @@ -112,6 +113,10 @@ func (fw *FileFilter) GetFilteredFiles(filesCh chan string, globs []string) chan

// create pattern matcher used to match filesToFilter to glob patterns
globPatternMatcher := gitignore.CompileIgnoreLines(globs...)

// get git-tracked files (relative paths) and repo root to avoid filtering them out
gitTrackedFiles, repoRoot := fw.getGitTrackedFiles()

go func() {
ctx := context.Background()
availableThreads := semaphore.NewWeighted(fw.max_threads)
Expand All @@ -126,6 +131,11 @@ func (fw *FileFilter) GetFilteredFiles(filesCh chan string, globs []string) chan
}
go func(f string) {
defer availableThreads.Release(1)
// files tracked in git should not be filtered, even if they match gitignore patterns
if fw.isGitTracked(f, gitTrackedFiles, repoRoot) {
filteredFilesCh <- f
return
}
// filesToFilter that do not match the glob pattern are filtered
if !globPatternMatcher.MatchesPath(f) {
filteredFilesCh <- f
Expand All @@ -143,6 +153,70 @@ func (fw *FileFilter) GetFilteredFiles(filesCh chan string, globs []string) chan
return filteredFilesCh
}

// isGitTracked reports whether filePath appears in trackedFiles, the set of
// repo-relative, slash-separated paths taken from the git index.
//
// filePath is made absolute, re-expressed relative to repoRoot and converted to
// forward slashes before the lookup. The result is false when trackedFiles is
// empty, when repoRoot is empty, or when either path computation fails.
func (fw *FileFilter) isGitTracked(filePath string, trackedFiles map[string]bool, repoRoot string) bool {
	// nothing to compare against: no repository or no tracked entries
	if repoRoot == "" || len(trackedFiles) == 0 {
		return false
	}

	// anchor the path so it can be expressed relative to the repository root
	absolute, absErr := filepath.Abs(filePath)
	if absErr != nil {
		return false
	}

	fromRoot, relErr := filepath.Rel(repoRoot, absolute)
	if relErr != nil {
		return false
	}

	// index keys use forward slashes regardless of the host OS separator
	return trackedFiles[filepath.ToSlash(fromRoot)]
}

// getGitTrackedFiles returns the set of paths recorded in the git index of the
// repository containing fw.path, together with the worktree root directory.
//
// Keys are the index entry names as stored by git (relative to the repository
// root, forward slashes). A file is in the index as soon as it has been added,
// even before it is committed.
//
// If the repository, its worktree or its index cannot be opened, the failure is
// logged at debug level and an empty (non-nil) map with an empty root is returned,
// so callers can treat "no repository" as "nothing is tracked".
func (fw *FileFilter) getGitTrackedFiles() (map[string]bool, string) {
	// DetectDotGit walks up parent directories, so fw.path may be a subdirectory of the repo
	repo, err := git.PlainOpenWithOptions(fw.path, &git.PlainOpenOptions{
		DetectDotGit: true,
	})
	if err != nil {
		fw.logger.Debug().Msgf("failed to open git repository: %v", err)
		return map[string]bool{}, ""
	}

	// the worktree provides the root path that index entry names are relative to
	worktree, err := repo.Worktree()
	if err != nil {
		fw.logger.Debug().Msgf("failed to get worktree: %v", err)
		return map[string]bool{}, ""
	}
	repoRoot := worktree.Filesystem.Root()

	// the index (staging area) lists every tracked file
	idx, err := repo.Storer.Index()
	if err != nil {
		fw.logger.Debug().Msgf("failed to get git index: %v", err)
		return map[string]bool{}, ""
	}

	// size the map once from the entry count to avoid repeated growth in large repos;
	// keys stay repo-relative instead of being expanded to absolute paths
	trackedFiles := make(map[string]bool, len(idx.Entries))
	for _, entry := range idx.Entries {
		trackedFiles[entry.Name] = true
	}

	return trackedFiles, repoRoot
}

// buildGlobs iterates a list of ignore filesToFilter and returns a list of glob patterns that can be used to test for ignored filesToFilter
func (fw *FileFilter) buildGlobs(ignoreFiles []string) ([]string, error) {
var globs = make([]string, 0)
Expand Down
124 changes: 124 additions & 0 deletions pkg/utils/file_filter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@ import (
"runtime"
"strings"
"testing"
"time"

"github.com/go-git/go-git/v5"
"github.com/go-git/go-git/v5/plumbing/object"
"github.com/rs/zerolog/log"
"github.com/stretchr/testify/assert"
)
Expand Down Expand Up @@ -707,3 +710,124 @@ func TestDotSnykExclude_isExpired(t *testing.T) {
})
}
}

func TestFileFilter_GitTrackedFilesNotFiltered(t *testing.T) {
t.Run("committed files are not filtered even if they match gitignore", func(t *testing.T) {
tempDir := t.TempDir()

// Initialize a git repository
repo, err := git.PlainInit(tempDir, false)
assert.NoError(t, err)

// Create a file that will be tracked and matches gitignore pattern
trackedIgnoredFile := filepath.Join(tempDir, "ignored.log")
createFileInPath(t, trackedIgnoredFile, []byte("tracked but ignored"))

// Create another file that will NOT be tracked and matches gitignore pattern
untrackedIgnoredFile := filepath.Join(tempDir, "untracked.log")
createFileInPath(t, untrackedIgnoredFile, []byte("untracked and ignored"))

// Create a regular file that doesn't match gitignore
regularFile := filepath.Join(tempDir, "regular.txt")
createFileInPath(t, regularFile, []byte("regular file"))

// Add and commit the tracked file BEFORE creating gitignore
worktree, err := repo.Worktree()
assert.NoError(t, err)

_, err = worktree.Add("ignored.log")
assert.NoError(t, err)
_, err = worktree.Add("regular.txt")
assert.NoError(t, err)

_, err = worktree.Commit("initial commit", &git.CommitOptions{
Author: &object.Signature{
Name: "Test",
Email: "test@test.com",
When: time.Now(),
},
})
assert.NoError(t, err)

// Now create .gitignore that ignores *.log files
gitignorePath := filepath.Join(tempDir, ".gitignore")
createFileInPath(t, gitignorePath, []byte("*.log\n"))

// Create file filter and get filtered files
fileFilter := NewFileFilter(tempDir, &log.Logger)
rules, err := fileFilter.GetRules([]string{".gitignore"})
assert.NoError(t, err)

allFiles := fileFilter.GetAllFiles()
filteredFiles := fileFilter.GetFilteredFiles(allFiles, rules)

// Collect filtered files
var filteredFilesList []string
for file := range filteredFiles {
filteredFilesList = append(filteredFilesList, file)
}

// The tracked file (ignored.log) should NOT be filtered out
assert.Contains(t, filteredFilesList, trackedIgnoredFile, "git tracked file should not be filtered even if it matches gitignore")

// The untracked file (untracked.log) SHOULD be filtered out
assert.NotContains(t, filteredFilesList, untrackedIgnoredFile, "untracked file matching gitignore should be filtered")

// The regular file should be present
assert.Contains(t, filteredFilesList, regularFile, "regular file should not be filtered")

// The gitignore file should be present
assert.Contains(t, filteredFilesList, gitignorePath, ".gitignore should not be filtered")
})

t.Run("staged but uncommitted files are not filtered even if they match gitignore", func(t *testing.T) {
tempDir := t.TempDir()

// Initialize a git repository
repo, err := git.PlainInit(tempDir, false)
assert.NoError(t, err)

// Create .gitignore first
gitignorePath := filepath.Join(tempDir, ".gitignore")
createFileInPath(t, gitignorePath, []byte("*.log\n"))

// Create a file that matches gitignore pattern but will be staged (git add)
stagedIgnoredFile := filepath.Join(tempDir, "staged.log")
createFileInPath(t, stagedIgnoredFile, []byte("staged but ignored"))

// Create another file that will NOT be staged and matches gitignore pattern
unstagedIgnoredFile := filepath.Join(tempDir, "unstaged.log")
createFileInPath(t, unstagedIgnoredFile, []byte("unstaged and ignored"))

// Stage the file (git add) but do NOT commit
worktree, err := repo.Worktree()
assert.NoError(t, err)

_, err = worktree.Add("staged.log")
assert.NoError(t, err)
// Note: we do NOT commit here - the file is only staged

// Create file filter and get filtered files
fileFilter := NewFileFilter(tempDir, &log.Logger)
rules, err := fileFilter.GetRules([]string{".gitignore"})
assert.NoError(t, err)

allFiles := fileFilter.GetAllFiles()
filteredFiles := fileFilter.GetFilteredFiles(allFiles, rules)

// Collect filtered files
var filteredFilesList []string
for file := range filteredFiles {
filteredFilesList = append(filteredFilesList, file)
}

// The staged file (staged.log) should NOT be filtered out - it's in the index
assert.Contains(t, filteredFilesList, stagedIgnoredFile, "staged file should not be filtered even if it matches gitignore")

// The unstaged file (unstaged.log) SHOULD be filtered out
assert.NotContains(t, filteredFilesList, unstagedIgnoredFile, "unstaged file matching gitignore should be filtered")

// The gitignore file should be present
assert.Contains(t, filteredFilesList, gitignorePath, ".gitignore should not be filtered")
})
}