From 96d649759eb2598ee0bae2acd0f4d676953ee885 Mon Sep 17 00:00:00 2001 From: Yuji Ueki Date: Mon, 9 Feb 2026 03:33:59 +0900 Subject: [PATCH] fix: use REST API for user search to fix incorrect sort order The original implementation uses GraphQL search with sort:followers-desc in the query string, but GitHub does not support the sort qualifier for user search. This causes non-deterministic results and missing users. This commit replaces the user search with a two-step approach: 1. REST API GET /search/users?sort=followers (officially supported) 2. GraphQL user(login:) batch queries for detailed data Other changes: - Add MinFollowers field to UserSearchQuery and QueryPreset - Set minFollowers=1000 for the worldwide preset to avoid incomplete results - Exclude the "claude" account (Anthropic co-author bot) which causes GitHub GraphQL to return 504 on contributionsCollection - Add exponential backoff retry for GraphQL batch errors - Add rate limit delays between API requests --- github/github.go | 457 +++++++++++++++++++++++++++++++---------------- main.go | 4 +- presets.go | 12 +- top/top.go | 9 +- 4 files changed, 321 insertions(+), 161 deletions(-) diff --git a/github/github.go b/github/github.go index 8ea1c36d69..f924283f73 100644 --- a/github/github.go +++ b/github/github.go @@ -4,7 +4,9 @@ import ( "encoding/json" "fmt" "log" + "math" "net/http" + "net/url" "regexp" "strings" "time" @@ -13,6 +15,14 @@ import ( ) const root string = "https://api.github.com/" +const usersPerGraphQLBatch = 10 + +// excludeLogins lists accounts that are not real human users. +// "claude" is the Anthropic Claude Code co-author account whose +// contributionsCollection causes GitHub GraphQL to return 504. +var excludeLogins = []string{ + "claude", +} type HTTPGithubClient struct { wrappers []net.Wrapper @@ -61,178 +71,308 @@ func (client HTTPGithubClient) User(login string) (User, error) { return user, nil } +// SearchUsers fetches top users by followers using a two-step approach: +// 1. REST API GET /search/users with sort=followers (officially supported) +// 2. GraphQL user(login:) batch queries for detailed data (contributions, orgs) func (client HTTPGithubClient) SearchUsers(query UserSearchQuery) (GithubSearchResults, error) { - users := []User{} - userLogins := map[string]bool{} + logins, totalUsersCount, err := client.searchUserLogins(query) + if err != nil { + return GithubSearchResults{}, err + } + + users, err := client.fetchUserDetails(logins) + if err != nil { + return GithubSearchResults{}, err + } + + return GithubSearchResults{ + Users: users, + MinimumFollowerCount: minFollowers(users), + TotalUserCount: totalUsersCount, + }, nil +} - totalCount := 0 - minFollowerCount := -1 - maxPerQuery := 1000 - perPage := 5 +// searchUserLogins uses REST API to get user logins sorted by followers. +// The REST API officially supports sort=followers for user search. 
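+//
+// Illustrative request for the worldwide preset (minFollowers=1000); the
+// exact query string depends on the preset and CLI flags:
+//
+//	GET https://api.github.com/search/users?q=followers%3A%3E%3D1000&sort=followers&order=desc&per_page=100&page=1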
+func (client HTTPGithubClient) searchUserLogins(query UserSearchQuery) ([]string, int, error) { + var logins []string + seen := map[string]bool{} totalUsersCount := 0 + perPage := 100 + maxPages := (query.MaxUsers + perPage - 1) / perPage retryCount := 0 maxRetryCount := 10 -Pages: - for totalCount < query.MaxUsers { - previousCursor := "" - followerCountQueryStr := "" - if minFollowerCount >= 0 { - followerCountQueryStr = fmt.Sprintf(" followers:<%d", minFollowerCount) + for page := 1; page <= maxPages; page++ { + q := query.Q + if query.MinFollowers > 0 { + q = fmt.Sprintf("%s followers:>=%d", q, query.MinFollowers) } - for currentPage := 1; currentPage <= (maxPerQuery / perPage); currentPage++ { - cursorQueryStr := "" - if previousCursor != "" { - cursorQueryStr = fmt.Sprintf(", after: \\\"%s\\\"", previousCursor) + requestURL := fmt.Sprintf( + "%ssearch/users?q=%s&sort=%s&order=%s&per_page=%d&page=%d", + root, url.QueryEscape(q), query.Sort, query.Order, perPage, page, + ) + + body, err := client.Request(requestURL, "") + if err != nil { + retryCount++ + if retryCount < maxRetryCount { + log.Println("error making REST search request... retrying") + time.Sleep(10 * time.Second) + page-- + continue } - graphQlString := fmt.Sprintf(`{ "query": "query { - search(type: USER, query:\"%s%s sort:%s-%s\", first: %d%s) { - userCount - edges { - node { - __typename - ... on User { - login, - avatarUrl, - name, - company, - organizations(first: 100) { - nodes { - login - } - } - followers { - totalCount - } - contributionsCollection { - contributionCalendar { - totalContributions - }, - totalCommitContributions, - totalPullRequestContributions, - restrictedContributionsCount - } - } - }, - cursor - } - } - }" }`, query.Q, followerCountQueryStr, query.Sort, query.Order, perPage, cursorQueryStr) - - re := regexp.MustCompile(`\r?\n`) - graphQlString = re.ReplaceAllString(graphQlString, " ") - - body, err := client.Request("https://api.github.com/graphql", graphQlString) + return nil, 0, fmt.Errorf("too many REST search errors: %w", err) + } + + var result restSearchResponse + if err := json.Unmarshal(body, &result); err != nil { + retryCount++ + if retryCount < maxRetryCount { + log.Println("error unmarshalling REST search JSON... retrying") + time.Sleep(10 * time.Second) + page-- + continue + } + return nil, 0, fmt.Errorf("too many REST search JSON errors: %w", err) + } + + if result.Message != "" { + retryCount++ + if retryCount < maxRetryCount { + log.Printf("REST search API error (retrying): %s", result.Message) + time.Sleep(10 * time.Second) + page-- + continue + } + return nil, 0, fmt.Errorf("too many REST search API errors: %s", result.Message) + } + + retryCount = 0 + totalUsersCount = result.TotalCount + + for _, item := range result.Items { + if isExcluded(item.Login) { + continue + } + if !seen[item.Login] { + seen[item.Login] = true + logins = append(logins, item.Login) + } + } + + if len(result.Items) < perPage { + break + } + + // Avoid secondary rate limit + time.Sleep(2 * time.Second) + + if len(logins) >= query.MaxUsers { + logins = logins[:query.MaxUsers] + break + } + } + + return logins, totalUsersCount, nil +} + +// fetchUserDetails uses GraphQL user(login:) batch queries to get +// contributions, organizations, and follower counts. 
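+//
+// Logins are grouped into batches of usersPerGraphQLBatch, and each batch is
+// sent as one aliased GraphQL query, roughly of the form ("alice" and "bob"
+// are placeholder logins; buildBatchUserQuery emits the full field selection):
+//
+//	{ u_alice: user(login: "alice") { login followers { totalCount } ... }
+//	  u_bob: user(login: "bob") { login followers { totalCount } ... } }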
+func (client HTTPGithubClient) fetchUserDetails(logins []string) ([]User, error) { + var users []User + maxRetryCount := 5 + + for i := 0; i < len(logins); i += usersPerGraphQLBatch { + end := i + usersPerGraphQLBatch + if end > len(logins) { + end = len(logins) + } + batch := logins[i:end] + batchNum := i/usersPerGraphQLBatch + 1 + + dataNode, err := client.fetchGraphQLBatch(batch, batchNum, maxRetryCount) + if err != nil { + return nil, err + } + + for _, login := range batch { + key := "u_" + sanitizeLogin(login) + userNode, ok := dataNode[key] + if !ok || userNode == nil { + log.Printf("user %s not found in GraphQL response, skipping", login) + continue + } + + user, err := parseUserNode(userNode.(map[string]interface{})) if err != nil { - retryCount++ - if retryCount < maxRetryCount { - log.Println("error making graphql request... retrying") - time.Sleep(10 * time.Second) - continue Pages - } else { - log.Fatalln("Too many errors received. Quitting.") - } + log.Printf("error parsing user %s, skipping: %v", login, err) + continue } + users = append(users, user) + } - var response interface{} - if err := json.Unmarshal(body, &response); err != nil { - retryCount++ - if retryCount < maxRetryCount { - log.Println("error unmarshalling JSON response... retrying") - time.Sleep(10 * time.Second) - continue Pages - } else { - log.Fatalln("Too many errors received. Quitting.") - } + // Avoid secondary rate limit between batches + if i+usersPerGraphQLBatch < len(logins) { + time.Sleep(2 * time.Second) + } + } + + return users, nil +} + +// fetchGraphQLBatch fetches a single batch of users via GraphQL with retries and exponential backoff. +func (client HTTPGithubClient) fetchGraphQLBatch(batch []string, batchNum int, maxRetryCount int) (map[string]interface{}, error) { + graphQlString := buildBatchUserQuery(batch) + + for retryCount := 0; ; retryCount++ { + body, err := client.Request("https://api.github.com/graphql", graphQlString) + if err != nil { + if retryCount >= maxRetryCount { + return nil, fmt.Errorf("batch %d: too many request errors: %w", batchNum, err) } - rootNode := response.(map[string]interface{}) - if val, ok := rootNode["errors"]; ok { - retryCount++ - if retryCount < maxRetryCount { - log.Printf("Received error response (retrying): %+v", val) - time.Sleep(10 * time.Second) - continue Pages - } else { - log.Fatalln("Too many errors received. Quitting.") - } + log.Printf("error making graphql request (batch %d)... retrying", batchNum) + time.Sleep(time.Duration(retryBackoff(retryCount)) * time.Second) + continue + } + + var response map[string]interface{} + if err := json.Unmarshal(body, &response); err != nil { + if retryCount >= maxRetryCount { + return nil, fmt.Errorf("batch %d: too many JSON parse errors: %w", batchNum, err) } - dataNode, ok := rootNode["data"].(map[string]interface{}) - if !ok { - retryCount++ - if retryCount < maxRetryCount { - log.Println("Error accessing data element") - time.Sleep(10 * time.Second) - continue Pages - } else { - log.Fatalln("Too many errors received. Quitting.") - } + log.Printf("error unmarshalling JSON response (batch %d)... 
retrying", batchNum) + time.Sleep(time.Duration(retryBackoff(retryCount)) * time.Second) + continue + } + + if val, ok := response["errors"]; ok { + if retryCount >= maxRetryCount { + return nil, fmt.Errorf("batch %d: too many GraphQL errors: %+v", batchNum, val) + } + log.Printf("Received error response (batch %d, retrying): %+v", batchNum, val) + time.Sleep(time.Duration(retryBackoff(retryCount)) * time.Second) + continue + } + + dataNode, ok := response["data"].(map[string]interface{}) + if !ok { + if retryCount >= maxRetryCount { + return nil, fmt.Errorf("batch %d: too many missing data node errors", batchNum) } + log.Printf("error accessing data element (batch %d)... retrying", batchNum) + time.Sleep(time.Duration(retryBackoff(retryCount)) * time.Second) + continue + } + + return dataNode, nil + } +} - searchNode := dataNode["search"].(map[string]interface{}) - totalUsersCount = int(searchNode["userCount"].(float64)) - edgeNodes := searchNode["edges"].([]interface{}) +func retryBackoff(retryCount int) int { + wait := 10 * (1 << retryCount) // 10, 20, 40, 80, 120... + if wait > 120 { + wait = 120 + } + return wait +} + +func isExcluded(login string) bool { + for _, excluded := range excludeLogins { + if excluded == login { + return true + } + } + return false +} - if len(edgeNodes) == 0 { - break Pages +func buildBatchUserQuery(logins []string) string { + var parts []string + for _, login := range logins { + key := sanitizeLogin(login) + parts = append(parts, fmt.Sprintf(`u_%s: user(login: \"%s\") { + login + avatarUrl + name + company + organizations(first: 100) { nodes { login } } + followers { totalCount } + contributionsCollection { + contributionCalendar { totalContributions } + totalCommitContributions + totalPullRequestContributions + restrictedContributionsCount } - totalCount += len(edgeNodes) - - Edges: - for _, edge := range edgeNodes { - edgeNode := edge.(map[string]interface{}) - userNode := edgeNode["node"].(map[string]interface{}) - typename := userNode["__typename"].(string) - if typename != "User" { - continue Edges - } - login := userNode["login"].(string) - avatarURL := userNode["avatarUrl"].(string) - name := strPropOrEmpty(userNode, "name") - company := strPropOrEmpty(userNode, "company") - organizations := []string{} + }`, key, login)) + } - orgNodes := userNode["organizations"].(map[string]interface{})["nodes"].([]interface{}) - for _, orgNode := range orgNodes { + queryBody := strings.Join(parts, "\n") + graphQlString := fmt.Sprintf(`{ "query": "{ %s }" }`, queryBody) - organizations = append(organizations, orgNode.(map[string]interface{})["login"].(string)) - } + re := regexp.MustCompile(`\r?\n`) + graphQlString = re.ReplaceAllString(graphQlString, " ") + re2 := regexp.MustCompile(`\t+`) + graphQlString = re2.ReplaceAllString(graphQlString, " ") - followerCount := int(userNode["followers"].(map[string]interface{})["totalCount"].(float64)) - contributionsCollection := userNode["contributionsCollection"].(map[string]interface{}) - contributionCount := int(contributionsCollection["contributionCalendar"].(map[string]interface{})["totalContributions"].(float64)) - privateContributionCount := int(contributionsCollection["restrictedContributionsCount"].(float64)) - commitsCount := int(contributionsCollection["totalCommitContributions"].(float64)) - pullRequestsCount := int(contributionsCollection["totalPullRequestContributions"].(float64)) - - user := User{ - Login: login, - AvatarURL: avatarURL, - Name: name, - Company: company, - Organizations: organizations, - 
FollowerCount: followerCount, - ContributionCount: contributionCount, - PublicContributionCount: (contributionCount - privateContributionCount), - PrivateContributionCount: privateContributionCount, - CommitsCount: commitsCount, - PullRequestsCount: pullRequestsCount} - - if !userLogins[login] { - userLogins[login] = true - users = append(users, user) - } + return graphQlString +} - previousCursor = edgeNode["cursor"].(string) - minFollowerCount = int(followerCount) +func sanitizeLogin(login string) string { + re := regexp.MustCompile(`[^a-zA-Z0-9]`) + return re.ReplaceAllString(login, "_") +} + +func parseUserNode(userNode map[string]interface{}) (User, error) { + login := userNode["login"].(string) + avatarURL := strPropOrEmpty(userNode, "avatarUrl") + name := strPropOrEmpty(userNode, "name") + company := strPropOrEmpty(userNode, "company") + + var organizations []string + if orgData, ok := userNode["organizations"].(map[string]interface{}); ok { + if nodes, ok := orgData["nodes"].([]interface{}); ok { + for _, orgNode := range nodes { + if orgMap, ok := orgNode.(map[string]interface{}); ok { + organizations = append(organizations, orgMap["login"].(string)) + } } } } - return GithubSearchResults{ - Users: users, - MinimumFollowerCount: minFollowerCount, - TotalUserCount: totalUsersCount}, nil + followerCount := int(userNode["followers"].(map[string]interface{})["totalCount"].(float64)) + contributionsCollection := userNode["contributionsCollection"].(map[string]interface{}) + contributionCount := int(contributionsCollection["contributionCalendar"].(map[string]interface{})["totalContributions"].(float64)) + privateContributionCount := int(contributionsCollection["restrictedContributionsCount"].(float64)) + commitsCount := int(contributionsCollection["totalCommitContributions"].(float64)) + pullRequestsCount := int(contributionsCollection["totalPullRequestContributions"].(float64)) + + return User{ + Login: login, + AvatarURL: avatarURL, + Name: name, + Company: company, + Organizations: organizations, + FollowerCount: followerCount, + ContributionCount: contributionCount, + PublicContributionCount: contributionCount - privateContributionCount, + PrivateContributionCount: privateContributionCount, + CommitsCount: commitsCount, + PullRequestsCount: pullRequestsCount, + }, nil +} + +func minFollowers(users []User) int { + if len(users) == 0 { + return 0 + } + min := math.MaxInt32 + for _, user := range users { + if user.FollowerCount < min { + min = user.FollowerCount + } + } + return min } func strPropOrEmpty(obj map[string]interface{}, prop string) string { @@ -242,7 +382,6 @@ func strPropOrEmpty(obj map[string]interface{}, prop string) string { default: return "" } - } func (client HTTPGithubClient) Organizations(login string) ([]string, error) { @@ -258,12 +397,10 @@ func (client HTTPGithubClient) Organizations(login string) ([]string, error) { log.Fatalf("error parsing organizations JSON for user %+v", login) return []string{}, err } - orgs := []string{} - + var orgs []string for _, org := range orgResp { orgs = append(orgs, org.Organization) } - return orgs, err } @@ -290,10 +427,11 @@ type User struct { } type UserSearchQuery struct { - Q string - Sort string - Order string - MaxUsers int + Q string + Sort string + Order string + MaxUsers int + MinFollowers int } type GithubSearchResults struct { @@ -301,3 +439,14 @@ type GithubSearchResults struct { MinimumFollowerCount int TotalUserCount int } + +type restSearchResponse struct { + TotalCount int `json:"total_count"` + 
IncompleteResults bool `json:"incomplete_results"` + Items []restSearchItem `json:"items"` + Message string `json:"message"` +} + +type restSearchItem struct { + Login string `json:"login"` +} diff --git a/main.go b/main.go index c156355668..e65da9e2c0 100644 --- a/main.go +++ b/main.go @@ -26,6 +26,7 @@ var locations arrayFlags var excludeLocations arrayFlags var presetTitle string var presetChecksum string +var minFollowers int func main() { token := flag.String("token", LookupEnvOrString("GITHUB_TOKEN", ""), "Github auth token") @@ -51,6 +52,7 @@ func main() { preset := Preset(*presetName) locations = preset.include excludeLocations = preset.exclude + minFollowers = preset.minFollowers presetTitle = PresetTitle(*presetName) presetChecksum = PresetChecksum(*presetName) } @@ -67,7 +69,7 @@ func main() { log.Fatal("Unrecognized output format: ", *outputOpt) } - opts := top.Options{Token: *token, Locations: locations, ExcludeLocations: excludeLocations, Amount: *amount, ConsiderNum: *considerNum, PresetTitle: presetTitle, PresetChecksum: presetChecksum} + opts := top.Options{Token: *token, Locations: locations, ExcludeLocations: excludeLocations, Amount: *amount, ConsiderNum: *considerNum, MinFollowers: minFollowers, PresetTitle: presetTitle, PresetChecksum: presetChecksum} data, err := top.GithubTop(opts) if err != nil { diff --git a/presets.go b/presets.go index 4c8c7823fb..d971e37941 100644 --- a/presets.go +++ b/presets.go @@ -8,9 +8,10 @@ import ( ) type QueryPreset struct { - title string - include []string - exclude []string + title string + include []string + exclude []string + minFollowers int } var PRESETS = map[string]QueryPreset{ @@ -107,8 +108,9 @@ var PRESETS = map[string]QueryPreset{ include: []string{"croatia", "hrvatska", "zagreb", "split", "rijeka", "osijek", "zadar", "pula"}, }, "worldwide": QueryPreset{ - title: "Worldwide", - include: []string{}, + title: "Worldwide", + include: []string{}, + minFollowers: 1000, }, "china": QueryPreset{ title: "China", diff --git a/top/top.go b/top/top.go index 153a0a9e89..f9be8aaa60 100644 --- a/top/top.go +++ b/top/top.go @@ -24,7 +24,13 @@ func GithubTop(options Options) (github.GithubSearchResults, error) { } var client = github.NewGithubClient(net.TokenAuth(token)) - users, err := client.SearchUsers(github.UserSearchQuery{Q: query, Sort: "followers", Order: "desc", MaxUsers: options.ConsiderNum}) + users, err := client.SearchUsers(github.UserSearchQuery{ + Q: query, + Sort: "followers", + Order: "desc", + MaxUsers: options.ConsiderNum, + MinFollowers: options.MinFollowers, + }) if err != nil { return github.GithubSearchResults{}, err } @@ -37,6 +43,7 @@ type Options struct { ExcludeLocations []string Amount int ConsiderNum int + MinFollowers int PresetTitle string PresetChecksum string }