From cd652ff7933151d4c153fcdd65b6a6b5038ad48d Mon Sep 17 00:00:00 2001
From: Sakurapainting
Date: Sun, 10 Aug 2025 17:27:50 +0800
Subject: [PATCH 1/2] fix: better top-lang statistics algorithm

Replace the byte-sum aggregation in fetchTopLanguages with a
per-repository normalization:

* repositories that are hidden or have no language edges are filtered
  out, and repositories whose language sizes sum to 0 are skipped;
* inside each remaining repository every language size is divided by
  that repository's total, so each repository contributes a total
  share of 1 regardless of how many bytes it holds;
* the shares are summed per language and divided by the number of
  repositories that were counted, giving an average share;
* size_weight is applied to the average share and count_weight to the
  number of repositories that contain the language.

Note that `size` in the returned map is now derived from an average
share instead of a byte count.
---
 src/fetchers/top-languages-fetcher.js | 84 ++++++++++++++-------------
 1 file changed, 44 insertions(+), 40 deletions(-)

diff --git a/src/fetchers/top-languages-fetcher.js b/src/fetchers/top-languages-fetcher.js
index 485cc8b75de8a..e32f488d48cd4 100644
--- a/src/fetchers/top-languages-fetcher.js
+++ b/src/fetchers/top-languages-fetcher.js
@@ -111,51 +111,55 @@ const fetchTopLanguages = async (
   // filter out repositories to be hidden
   repoNodes = repoNodes
     .sort((a, b) => b.size - a.size)
-    .filter((name) => !repoToHide[name.name]);
-
-  let repoCount = 0;
-
-  repoNodes = repoNodes
-    .filter((node) => node.languages.edges.length > 0)
-    // flatten the list of language nodes
-    .reduce((acc, curr) => curr.languages.edges.concat(acc), [])
-    .reduce((acc, prev) => {
-      // get the size of the language (bytes)
-      let langSize = prev.size;
-
-      // if we already have the language in the accumulator
-      // & the current language name is same as previous name
-      // add the size to the language size and increase repoCount.
-      if (acc[prev.node.name] && prev.node.name === acc[prev.node.name].name) {
-        langSize = prev.size + acc[prev.node.name].size;
-        repoCount += 1;
-      } else {
-        // reset repoCount to 1
-        // language must exist in at least one repo to be detected
-        repoCount = 1;
+    .filter((name) => !repoToHide[name.name])
+    .filter((node) => node.languages.edges.length > 0);
+
+  // New normalized statistics logic: each repository contributes equal weight
+  const normalizedLanguages = {};
+  let totalRepoCount = 0;
+
+  // Process each repository, normalize its language distribution
+  repoNodes.forEach((repo) => {
+    // Calculate total bytes for this repository
+    const repoTotalSize = repo.languages.edges.reduce((sum, edge) => sum + edge.size, 0);
+
+    if (repoTotalSize === 0) return; // Skip empty repositories
+
+    totalRepoCount += 1;
+
+    // Calculate normalized proportion for each language in this repository
+    repo.languages.edges.forEach((edge) => {
+      const langName = edge.node.name;
+      const langColor = edge.node.color;
+      const normalizedSize = edge.size / repoTotalSize; // Language proportion in current repository
+
+      if (!normalizedLanguages[langName]) {
+        normalizedLanguages[langName] = {
+          name: langName,
+          color: langColor,
+          size: 0,
+          count: 0,
+        };
       }
-      return {
-        ...acc,
-        [prev.node.name]: {
-          name: prev.node.name,
-          color: prev.node.color,
-          size: langSize,
-          count: repoCount,
-        },
-      };
-    }, {});
+
+      // Accumulate normalized proportions
+      normalizedLanguages[langName].size += normalizedSize;
+      normalizedLanguages[langName].count += 1;
+    });
+  });
 
-  Object.keys(repoNodes).forEach((name) => {
-    // comparison index calculation
-    repoNodes[name].size =
-      Math.pow(repoNodes[name].size, size_weight) *
-      Math.pow(repoNodes[name].count, count_weight);
+  // Divide accumulated proportions by total repository count to get average proportions
+  Object.keys(normalizedLanguages).forEach((langName) => {
+    const lang = normalizedLanguages[langName];
+    // Average proportion of this language across all repositories, then apply weights
+    const avgProportion = lang.size / totalRepoCount;
+    lang.size = Math.pow(avgProportion, size_weight) * Math.pow(lang.count, count_weight);
   });
 
-  const topLangs = Object.keys(repoNodes)
-    .sort((a, b) => repoNodes[b].size - repoNodes[a].size)
+  const topLangs = Object.keys(normalizedLanguages)
+    .sort((a, b) => normalizedLanguages[b].size - normalizedLanguages[a].size)
     .reduce((result, key) => {
-      result[key] = repoNodes[key];
+      result[key] = normalizedLanguages[key];
       return result;
     }, {});
 

From 8e6cc12d9c47bfd9b1ac2795f878c33d8b0644c9 Mon Sep 17 00:00:00 2001
From: Sakurapainting
Date: Sun, 10 Aug 2025 17:36:55 +0800
Subject: [PATCH 2/2] test: for this fix

Add a standalone script (run with `node test-normalized-example.js`)
that feeds three mock repositories through a plain byte-sum
aggregation and through the per-repository normalization, then prints
both rankings. The script makes no assertions; it only prints.

The closing summary is derived from the computed results instead of
being hard-coded, so it cannot drift from the mock data. For the data
in this file the byte-sum leader is C with 75.0% and the normalized
leader is Python with 36.7%.
---
 test-normalized-example.js | 143 +++++++++++++++++++++++++++++++++++++
 1 file changed, 143 insertions(+)
 create mode 100644 test-normalized-example.js

diff --git a/test-normalized-example.js b/test-normalized-example.js
new file mode 100644
index 0000000000000..c24f3a8070468
--- /dev/null
+++ b/test-normalized-example.js
@@ -0,0 +1,143 @@
+/**
+ * Test example for normalized language statistics
+ *
+ * This example demonstrates how the new normalized logic works
+ * compared to the old byte-sum approach.
+ */
+
+// Mock data representing 3 repositories with different language distributions
+const mockRepositories = [
+  {
+    name: "large-c-project",
+    languages: {
+      edges: [
+        { size: 9000, node: { name: "C", color: "#555555" } },
+        { size: 1000, node: { name: "JavaScript", color: "#f1e05a" } }
+      ]
+    }
+  },
+  {
+    name: "web-app",
+    languages: {
+      edges: [
+        { size: 500, node: { name: "JavaScript", color: "#f1e05a" } },
+        { size: 300, node: { name: "Python", color: "#3572A5" } },
+        { size: 200, node: { name: "CSS", color: "#563d7c" } }
+      ]
+    }
+  },
+  {
+    name: "python-script",
+    languages: {
+      edges: [
+        { size: 800, node: { name: "Python", color: "#3572A5" } },
+        { size: 200, node: { name: "JavaScript", color: "#f1e05a" } }
+      ]
+    }
+  }
+];
+
+// Old logic: Direct byte sum
+function calculateOldLogic(repos) {
+  const languages = {};
+
+  repos.forEach(repo => {
+    repo.languages.edges.forEach(edge => {
+      const name = edge.node.name;
+      if (!languages[name]) {
+        languages[name] = { name, color: edge.node.color, size: 0 };
+      }
+      languages[name].size += edge.size;
+    });
+  });
+
+  const total = Object.values(languages).reduce((sum, lang) => sum + lang.size, 0);
+
+  return Object.fromEntries(
+    Object.entries(languages)
+      .sort(([,a], [,b]) => b.size - a.size)
+      .map(([name, lang]) => [name, {
+        ...lang,
+        percentage: ((lang.size / total) * 100).toFixed(1)
+      }])
+  );
+}
+
+// New logic: Normalized per repository
+function calculateNewLogic(repos) {
+  const normalizedLanguages = {};
+  let totalRepoCount = 0;
+
+  repos.forEach(repo => {
+    const repoTotalSize = repo.languages.edges.reduce((sum, edge) => sum + edge.size, 0);
+    if (repoTotalSize === 0) return;
+
+    totalRepoCount += 1;
+
+    repo.languages.edges.forEach(edge => {
+      const name = edge.node.name;
+      const normalizedSize = edge.size / repoTotalSize;
+
+      if (!normalizedLanguages[name]) {
+        normalizedLanguages[name] = {
+          name,
+          color: edge.node.color,
+          size: 0,
+          count: 0
+        };
+      }
+
+      normalizedLanguages[name].size += normalizedSize;
+      normalizedLanguages[name].count += 1;
+    });
+  });
+
+  // Calculate average proportions
+  Object.keys(normalizedLanguages).forEach(name => {
+    const lang = normalizedLanguages[name];
+    lang.size = lang.size / totalRepoCount;
+  });
+
+  return Object.fromEntries(
+    Object.entries(normalizedLanguages)
+      .sort(([,a], [,b]) => b.size - a.size)
+      .map(([name, lang]) => [name, {
+        ...lang,
+        percentage: (lang.size * 100).toFixed(1)
+      }])
+  );
+}
+
+// Run comparison
+console.log("=== Language Statistics Comparison ===\n");
+
+console.log("Repository Data:");
+mockRepositories.forEach((repo, i) => {
+  const total = repo.languages.edges.reduce((sum, edge) => sum + edge.size, 0);
+  console.log(`${i + 1}. ${repo.name} (${total} bytes total):`);
+  repo.languages.edges.forEach(edge => {
+    const percent = ((edge.size / total) * 100).toFixed(1);
+    console.log(`   - ${edge.node.name}: ${edge.size} bytes (${percent}%)`);
+  });
+});
+
+console.log("\n--- OLD LOGIC (Direct byte sum) ---");
+const oldResults = calculateOldLogic(mockRepositories);
+Object.entries(oldResults).forEach(([name, lang]) => {
+  console.log(`${name}: ${lang.size} bytes (${lang.percentage}%)`);
+});
+
+console.log("\n--- NEW LOGIC (Normalized per repository) ---");
+const newResults = calculateNewLogic(mockRepositories);
+Object.entries(newResults).forEach(([name, lang]) => {
+  console.log(`${name}: ${lang.percentage}% (appears in ${lang.count} repos)`);
+});
+
+console.log("\n=== Analysis ===");
+// Results are sorted by share, so the first entry of each is the leader.
+// Deriving the summary keeps it in sync with the mock data above.
+const [oldLeader] = Object.values(oldResults);
+const [newLeader] = Object.values(newResults);
+console.log(`Old logic: ${oldLeader.name} leads with ${oldLeader.percentage}% of all bytes`);
+console.log(`New logic: ${newLeader.name} leads with ${newLeader.percentage}% average share per repository`);
+console.log("The new approach better represents overall language diversity!");