1+ #include < iostream>
2+ #include < algorithm>
3+ #include < vector>
4+ #include < numeric>
5+ #include < iterator>
6+ #include < random>
7+ #include < limits>
8+ #include < cmath>
9+ #include < array>
10+ #pragma GCC optimize("Ofast")
11+ #pragma GCC target("avx512bw,avx512vl")
12+ #pragma GCC optimization("unroll-loops")
13+ using namespace std ;
14+ /*
15+ vec:待聚类的向量组
16+ n:向量个数
17+ dimension:向量维度
18+ belong:向量属于哪一个簇
19+ center:算法计算出的每个簇的中心向量
20+ k:簇的个数
21+ iterations:迭代次数
22+ */
23+ template <typename T, int MAXN, int MAXM, int MAXD>
24+ inline void kmeans (const T (&vec)[MAXN][MAXD], int n, int dimension, int(&belong)[MAXN], T(¢er)[MAXM][MAXD], int k, int iterations) {
25+ mt19937 engine;
26+ vector<int > indices (n), choices;
27+ iota (indices.begin (), indices.end (), 0 );
28+ sample (indices.begin (), indices.end (), back_inserter (choices), k, engine);
29+ for (int i = 0 ; i < k; ++i) {
30+ copy (vec[choices[i]], vec[choices[i]] + dimension, center[i]);
31+ }
32+ for (int t = 0 ; t < iterations; ++t) { // 迭代iterations轮
33+ for (int i = 0 ; i < n; ++i) {
34+ T best = numeric_limits<T>::max ();
35+ for (int j = 0 ; j < k; ++j) {
36+ T distance = 0 ;
37+ for (int d = 0 ; d < dimension; ++d) {
38+ distance += (vec[i][d] - center[j][d]) * (vec[i][d] - center[j][d]);
39+ }
40+ if (distance < best) {
41+ best = distance;
42+ belong[i] = j;
43+ }
44+ }
45+ }
46+ if (t + 1 != iterations) {
47+ int sz[MAXM] = {};
48+ for (int j = 0 ; j < k; ++j) {
49+ for (int d = 0 ; d < dimension; ++d) {
50+ center[j][d] = 0 ;
51+ }
52+ }
53+ for (int i = 0 ; i < n; ++i) {
54+ sz[belong[i]] += 1 ;
55+ for (int d = 0 ; d < dimension; ++d) {
56+ center[belong[i]][d] += vec[i][d];
57+ }
58+ }
59+ for (int j = 0 ; j < k; ++j) {
60+ for (int d = 0 ; d < dimension; ++d) {
61+ center[j][d] /= sz[j];
62+ }
63+ }
64+ }
65+ }
66+ }
67+ template <typename T, int MAXN, int MAXM, int MAXD>
68+ inline void kmeans_plusplus (const T (&vec)[MAXN][MAXD], int n, int dimension, int(&belong)[MAXN], T(¢er)[MAXM][MAXD], int k, int iterations) {
69+ mt19937 engine;
70+ vector<T> distance (n, numeric_limits<T>::max ());
71+ vector<bool > mask (n);
72+ mask[0 ] = true ;
73+ copy (vec[0 ], vec[0 ] + dimension, center[0 ]);
74+ for (int j = 1 ; j < k; ++j) {
75+ for (int i = 0 ; i < n; ++i) {
76+ T dist = 0 ;
77+ for (int d = 0 ; d < dimension; ++d) {
78+ dist += (vec[i][d] - center[j - 1 ][d]) * (vec[i][d] - center[j - 1 ][d]);
79+ }
80+ distance[i] = min (distance[i], dist);
81+ }
82+ vector<T> sum (n);
83+ for (int i = 1 ; i < n; ++i) {
84+ sum[i] = sum[i - 1 ] + (mask[i] ? 0.0 : distance[i]);
85+ }
86+ uniform_real_distribution<T> gen (0 , sum[n - 1 ]);
87+ auto index = lower_bound (sum.begin (), sum.end (), gen (engine)) - sum.begin ();
88+ mask[index] = true ;
89+ copy (vec[index], vec[index] + dimension, center[j]);
90+ }
91+ for (int t = 0 ; t < iterations; ++t) { // 迭代iterations轮
92+ for (int i = 0 ; i < n; ++i) {
93+ T best = numeric_limits<T>::max ();
94+ for (int j = 0 ; j < k; ++j) {
95+ T distance = 0 ;
96+ for (int d = 0 ; d < dimension; ++d) {
97+ distance += (vec[i][d] - center[j][d]) * (vec[i][d] - center[j][d]);
98+ }
99+ if (distance < best) {
100+ best = distance;
101+ belong[i] = j;
102+ }
103+ }
104+ }
105+ if (t + 1 != iterations) {
106+ int sz[MAXM] = {};
107+ for (int j = 0 ; j < k; ++j) {
108+ for (int d = 0 ; d < dimension; ++d) {
109+ center[j][d] = 0 ;
110+ }
111+ }
112+ for (int i = 0 ; i < n; ++i) {
113+ sz[belong[i]] += 1 ;
114+ for (int d = 0 ; d < dimension; ++d) {
115+ center[belong[i]][d] += vec[i][d];
116+ }
117+ }
118+ for (int j = 0 ; j < k; ++j) {
119+ for (int d = 0 ; d < dimension; ++d) {
120+ center[j][d] /= sz[j];
121+ }
122+ }
123+ }
124+ }
125+ }
126+
127+ constexpr int MAXN = 50000 , MAXD = 1024 , MAXM = 100 ;
128+ double vec[MAXN][MAXD], center[MAXM][MAXD];
129+ int belong[MAXN];
130+ int main () {
131+ mt19937 engine;
132+ normal_distribution<double > generate (0 , 0.5 ), gen (0 , 0.1 );
133+ vector<array<double , MAXD>> cluster (MAXM);
134+ for (int i = 0 ; i < MAXM; ++i) {
135+ for (int j = 0 ; j < MAXD; ++j) {
136+ cluster[i][j] = generate (engine);
137+ }
138+ }
139+ for (int i = 0 ; i < MAXN; ++i) {
140+ int c = i % MAXM;
141+ for (int j = 0 ; j < MAXD; ++j) {
142+ vec[i][j] = cluster[c][j] + gen (engine);
143+ }
144+ }
145+ kmeans (vec, MAXN, MAXD, belong, center, MAXM, 10 ); // Error: 103956
146+ // kmeans_plusplus(vec, MAXN, MAXD, belong, center, MAXM, 10); // Error: 59772.6
147+ double error = 0 ;
148+ for (int i = 0 ; i < MAXN; ++i) {
149+ double dist = 0 ;
150+ for (int j = 0 ; j < MAXM; ++j) {
151+ dist += (vec[i][j] - center[belong[i]][j]) * (vec[i][j] - center[belong[i]][j]);
152+ }
153+ error += sqrt (dist);
154+ }
155+ int sz[MAXM] = {};
156+ for (int i = 0 ; i < MAXN; ++i) {
157+ sz[belong[i]] += 1 ;
158+ }
159+ for (int i = 0 ; i < MAXM; ++i) {
160+ cout << sz[i] << " " ;
161+ }
162+ cout << endl << " Error: " << error << endl;
163+ return 0 ;
164+ }
0 commit comments