From 4381f9f1b860eedc8baacedc70ab445ca65e03ae Mon Sep 17 00:00:00 2001 From: maxniu1 Date: Fri, 26 Jun 2026 12:13:10 -0700 Subject: [PATCH] fix: use _wfopen on Windows to support non-ASCII (CJK/U...) file paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On Windows, fopen() interprets its path argument in the active ANSI code page (e.g. GBK on zh-CN, Shift-JIS on ja-JP, CP1251 on ru-RU) rather than as UTF-8. When a repository path contains non-ASCII characters — Chinese, Japanese, Korean, Cyrillic, Arabic, accented Latin (é, ü, ñ), etc. — fopen() cannot open the source files, so the tree-sitter definitions pass fails for every file (defs=0, errors=N for all files), producing a knowledge graph with only File/Folder nodes and zero code intelligence. The codebase already has the infrastructure for UTF-8→wide conversion (win_utf8.h: cbm_utf8_to_wide/cbm_wide_to_utf8) and uses it correctly for directory enumeration (FindFirstFileW), mkdir (_wmkdir), and unlink (_wunlink). But fopen() for reading file content was missed. Fix: - Add cbm_fopen() to compat_fs.h / compat_fs.c - Windows: uses _wfopen() with wide-char path - POSIX: delegates to fopen() - Replace fopen(path, "rb") with cbm_fopen(path, "rb") in the 3 pipeline files that read source files for tree-sitter parsing: pass_parallel.c, pass_calls.c, pass_definitions.c Fixes #636 --- src/foundation/compat_fs.c | 20 ++++++++++++++++++++ src/foundation/compat_fs.h | 4 ++++ src/pipeline/pass_calls.c | 3 ++- src/pipeline/pass_definitions.c | 3 ++- src/pipeline/pass_parallel.c | 3 ++- 5 files changed, 30 insertions(+), 3 deletions(-) diff --git a/src/foundation/compat_fs.c b/src/foundation/compat_fs.c index f77ad9a89..d91e84931 100644 --- a/src/foundation/compat_fs.c +++ b/src/foundation/compat_fs.c @@ -130,6 +130,22 @@ int cbm_pclose(FILE *f) { return _pclose(f); } +/* Open a file with wide-char _wfopen for UTF-8 path support on Windows. 
*/ +FILE *cbm_fopen(const char *path, const char *mode) { + wchar_t *wpath = cbm_utf8_to_wide(path); + if (!wpath) { + return NULL; + } + wchar_t wmode[8] = {0}; + if (mbstowcs(wmode, mode, 7) == (size_t)-1) { + free(wpath); + return NULL; + } + FILE *f = _wfopen(wpath, wmode); + free(wpath); + return f; +} + bool cbm_mkdir_p(const char *path, int mode) { (void)mode; wchar_t *wpath = cbm_utf8_to_wide(path); @@ -262,6 +278,10 @@ int cbm_pclose(FILE *f) { return pclose(f); } +FILE *cbm_fopen(const char *path, const char *mode) { + return fopen(path, mode); +} + bool cbm_mkdir_p(const char *path, int mode) { /* Try direct mkdir first */ if (mkdir(path, (mode_t)mode) == 0) { diff --git a/src/foundation/compat_fs.h b/src/foundation/compat_fs.h index 285ad555b..fefd1805b 100644 --- a/src/foundation/compat_fs.h +++ b/src/foundation/compat_fs.h @@ -56,4 +56,8 @@ int cbm_rmdir(const char *path); * POSIX: fork() + execvp(). Windows: _spawnvp(). */ int cbm_exec_no_shell(const char *const *argv); +/* Open a file with an fopen()-style mode. On Windows, uses wide-char _wfopen to + * support non-ASCII (UTF-8) paths. On POSIX, delegates to fopen(). */ +FILE *cbm_fopen(const char *path, const char *mode); + #endif /* CBM_COMPAT_FS_H */ diff --git a/src/pipeline/pass_calls.c b/src/pipeline/pass_calls.c index 4f4d7b54b..8bd208469 100644 --- a/src/pipeline/pass_calls.c +++ b/src/pipeline/pass_calls.c @@ -19,6 +19,7 @@ enum { PC_RING = 4, PC_RING_MASK = 3, PC_SIG_SCAN = 15, PC_REGEX_GRP = 2 }; #include "graph_buffer/graph_buffer.h" #include "foundation/log.h" #include "foundation/compat.h" +#include "foundation/compat_fs.h" #include "foundation/str_util.h" #include "cbm.h" #include "service_patterns.h" @@ -32,7 +33,7 @@ enum { PC_RING = 4, PC_RING_MASK = 3, PC_SIG_SCAN = 15, PC_REGEX_GRP = 2 }; /* Read entire file into heap-allocated buffer. Caller must free(). 
*/ static char *read_file(const char *path, int *out_len) { - FILE *f = fopen(path, "rb"); + FILE *f = cbm_fopen(path, "rb"); if (!f) { return NULL; } diff --git a/src/pipeline/pass_definitions.c b/src/pipeline/pass_definitions.c index 676f1b169..169570de8 100644 --- a/src/pipeline/pass_definitions.c +++ b/src/pipeline/pass_definitions.c @@ -22,6 +22,7 @@ enum { PD_JSON_FIELD_OVERHEAD = 6 }; #include "graph_buffer/graph_buffer.h" #include "foundation/log.h" #include "foundation/compat.h" +#include "foundation/compat_fs.h" #include "cbm.h" #include "simhash/minhash.h" #include "semantic/ast_profile.h" @@ -33,7 +34,7 @@ enum { PD_JSON_FIELD_OVERHEAD = 6 }; /* Read entire file into heap-allocated buffer. Returns NULL on error. * Caller must free(). Sets *out_len to byte count. */ static char *read_file(const char *path, int *out_len) { - FILE *f = fopen(path, "rb"); + FILE *f = cbm_fopen(path, "rb"); if (!f) { return NULL; } diff --git a/src/pipeline/pass_parallel.c b/src/pipeline/pass_parallel.c index 0471cbe04..f0b47d866 100644 --- a/src/pipeline/pass_parallel.c +++ b/src/pipeline/pass_parallel.c @@ -57,6 +57,7 @@ enum { PP_CSHARP_M_PREFIX_LEN = 2 }; #include "pipeline/worker_pool.h" #include "foundation/compat.h" #include "foundation/compat_thread.h" +#include "foundation/compat_fs.h" #include "graph_buffer/graph_buffer.h" #include "service_patterns.h" #include "foundation/platform.h" @@ -88,7 +89,7 @@ static uint64_t extract_now_ns(void) { /* Read file into a malloc'd buffer (= mimalloc in production). */ static char *read_file(const char *path, int *out_len) { - FILE *f = fopen(path, "rb"); + FILE *f = cbm_fopen(path, "rb"); if (!f) { return NULL; }