-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbenchmark_spectral_labeling.py
More file actions
107 lines (86 loc) · 3.58 KB
/
benchmark_spectral_labeling.py
File metadata and controls
107 lines (86 loc) · 3.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
"""Benchmarks for labeling in spectral clustering methods."""
import timeit
import csv
import gc
import numpy as np
from scipy import sparse
from memory_profiler import memory_usage
from sklearn.cluster import SpectralClustering
from sklearn.metrics import adjusted_rand_score
def to_numpy(x_file, y_file):
    """Build a symmetric sparse adjacency matrix and a label vector.

    Parameters
    ----------
    x_file : iterable of sequences
        Edge rows. The first two fields of every row are the one-based
        endpoints of an edge; any further fields are ignored. Blank rows
        are skipped.
    y_file : iterable of sequences
        Label rows. The second field of every row is a one-based label.
        Labels are collected in row order (the first field is not read),
        so rows are presumably sorted by node id -- TODO confirm against
        the data files. Blank rows are skipped.

    Returns
    -------
    s : scipy.sparse.coo_matrix
        Square float32 matrix in which every edge is stored in both
        directions with value 1.
    y : numpy.ndarray
        Zero-based integer labels.
    """
    i_list, j_list = [], []
    for row in x_file:
        if not row:
            # csv.reader yields [] for blank lines (e.g. a trailing newline).
            continue
        # Convert from one-based to zero-based indexing.
        i = int(row[0]) - 1
        j = int(row[1]) - 1
        # Store the edge and its mirror image to symmetrize the matrix.
        i_list.extend((i, j))
        j_list.extend((j, i))

    y = [int(row[1]) - 1 for row in y_file if row]

    # Size the matrix from both the edges and the labels: nodes without any
    # edge never show up in the edge list, yet they still need a row/column
    # so that the matrix and the label vector describe the same node set.
    # i_list holds both endpoints of every edge, so its maximum is the
    # largest node index seen; default=-1 covers an empty edge list.
    n = max(max(i_list, default=-1) + 1, len(y))

    rows = np.asarray(i_list, dtype=np.int64)
    cols = np.asarray(j_list, dtype=np.int64)
    vals = np.ones(len(i_list), dtype=np.float32)
    s = sparse.coo_matrix((vals, (rows, cols)), shape=(n, n), dtype=np.float32)
    return s, np.array(y, dtype=int)
def get_data():
    """Yield the benchmark datasets selected in ``cases``.

    For every enabled ``(size, name)`` pair the edge list is read from
    ``{name}_{size}_nodes.tsv`` and the labels from
    ``{name}_{size}_nodes_truePartition.tsv``; both paths are relative to
    the current working directory and both files are tab-separated.

    Yields
    ------
    data : tuple
        The ``(s, y)`` pair returned by ``to_numpy``.
    name : str
        The dataset name from ``cases``.
    """
    # Uncomment entries to include them in the benchmark run.
    cases = [
        # (1000, "static_lowOverlap_lowBlockSizeVar"),
        # (5000, "static_lowOverlap_lowBlockSizeVar"),
        # (20000, "static_lowOverlap_lowBlockSizeVar"),
        # (50000, "static_lowOverlap_lowBlockSizeVar"),
        # (200000, "static_lowOverlap_lowBlockSizeVar"),
        # (1000, "static_lowOverlap_highBlockSizeVar"),
        # (5000, "static_lowOverlap_highBlockSizeVar"),
        # (20000, "static_lowOverlap_highBlockSizeVar"),
        # (50000, "static_lowOverlap_highBlockSizeVar"),
        # (200000, "static_lowOverlap_highBlockSizeVar"),
        # (1000, "static_highOverlap_lowBlockSizeVar"),
        # (5000, "static_highOverlap_lowBlockSizeVar"),
        (20000, "static_highOverlap_lowBlockSizeVar"),
        # (50000, "static_highOverlap_lowBlockSizeVar"),
        # (200000, "static_highOverlap_lowBlockSizeVar"),
        # (1000, "static_highOverlap_highBlockSizeVar"),
        # (5000, "static_highOverlap_highBlockSizeVar"),
        # (20000, "static_highOverlap_highBlockSizeVar"),
        # (50000, "static_highOverlap_highBlockSizeVar"),
        # (200000, "static_highOverlap_highBlockSizeVar"),
        # (1000000, "static_lowOverlap_lowBlockSizeVar"),
        # (1000000, "static_lowOverlap_highBlockSizeVar"),
        # (1000000, "static_highOverlap_lowBlockSizeVar"),
        # (1000000, "static_highOverlap_highBlockSizeVar"),
    ]
    for size, name in cases:
        edges_path = f"{name}_{size}_nodes.tsv"
        labels_path = f"{name}_{size}_nodes_truePartition.tsv"
        # newline="" lets the csv module handle line endings itself.
        with open(edges_path, newline="") as x_handle, open(
            labels_path, newline=""
        ) as y_handle:
            x_file = csv.reader(x_handle, delimiter="\t")
            y_file = csv.reader(y_handle, delimiter="\t")
            data = to_numpy(x_file, y_file)
        # Yield only after both files are closed, so no handle stays open
        # while the consumer runs its (long) benchmark on this dataset.
        yield data, name
def profile_and_score(s, y, assign_labels, n_clusters):
    """Time, memory-profile and score one spectral clustering configuration.

    Parameters
    ----------
    s : sparse matrix
        Precomputed affinity matrix handed to ``SpectralClustering.fit``.
    y : array-like
        Reference labels used for the adjusted Rand score.
    assign_labels : str
        Value forwarded to ``SpectralClustering(assign_labels=...)``.
    n_clusters : int
        Number of clusters to fit.

    Returns
    -------
    elapsed : float
        Mean wall-clock time in seconds over three single-run repetitions.
    memory : float
        Peak value sampled by ``memory_profiler.memory_usage`` during one
        additional fit.
    score : float
        Adjusted Rand score of that fit's labels against ``y``.
    """

    def cluster():
        # random_state is fixed so that every repetition fits the same model.
        return SpectralClustering(
            random_state=0,
            n_clusters=n_clusters,
            affinity="precomputed",
            eigen_solver="lobpcg",
            assign_labels=assign_labels,
        ).fit(s)

    # Drop garbage left by earlier runs before measuring.
    gc.collect()
    elapsed = np.mean(timeit.repeat(cluster, repeat=3, number=1))
    # retval=True also returns the fitted estimator, so the labels of the
    # memory-profiled run are scored directly instead of fitting once more.
    samples, model = memory_usage(cluster, retval=True)
    memory = np.max(samples)
    score = adjusted_rand_score(y, model.labels_)
    return elapsed, memory, score
def run_benchmark():
    """Run every label-assignment strategy on every dataset and print results.

    For each dataset produced by ``get_data`` a header line is printed,
    followed by one line per strategy with its score, time and memory as
    returned by ``profile_and_score``.
    """
    strategies = ("kmeans", "discretize", "cluster_qr")
    for dataset, name in get_data():
        s, y = dataset
        # Labels are zero-based, so the largest label plus one is the count.
        n_clusters = y.max() + 1
        print(f"Test {name} of size {s.shape} with {n_clusters} clusters")
        for assign_labels in strategies:
            result = profile_and_score(
                s, y, n_clusters=n_clusters, assign_labels=assign_labels
            )
            time, memory, score = result
            print(f"{assign_labels:10}: {score:.3f} ({time:.3f} sec., {memory:.3f} MB)")
        print("\n")
# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run_benchmark()