Skip to content

Commit f294fe9

Browse files
committed
update generate_plots.py (and resulting plots) to show uniform and exponential distributions simultaneously, add Makefile
1 parent 01bdccc commit f294fe9

39 files changed

+189
-25
lines changed

docs/asymmetric/Makefile

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Interpreter used to create the virtualenv and to run pip / the plotting script.
PYTHON?=python3
# Directory in which the virtualenv is created.
VENV?=venv
# pip requirements file installed into the virtualenv.
REQUIREMENTS?=requirements.txt

.PHONY: install reinstall

# Create the virtualenv, install the requirements into it and generate the plots.
# The steps of the second recipe line are chained with && (not ;) so that a failure
# to activate the virtualenv or to install the requirements aborts the recipe
# instead of running generate_plots.py in a broken environment.
install:
	$(PYTHON) -m virtualenv $(VENV)
	. $(VENV)/bin/activate && $(PYTHON) -m pip install -r $(REQUIREMENTS) && $(PYTHON) generate_plots.py

# Remove the virtualenv and rebuild everything from scratch.
reinstall:
	rm -rf $(VENV)
	$(MAKE) install

docs/asymmetric/README.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,18 +18,19 @@ The data and summarizing plots can be produced in two steps.
1818
### Generate data
1919

2020
In [TDigestTest](../../core/src/test/java/com/tdunning/math/stats/TDigestTest.java), run `writeUniformResultsWithCompression` with the `AVLTreeDigest` implementation, i.e., run
21-
`AVLTreeDigestTest.writeUniformResultsWithCompression`.
22-
23-
In [MergingDigestTest](../../core/src/test/java/com/tdunning/math/stats/MergingDigestTest.java), run `writeAsymmetricScaleFunctionResults`.
21+
`AVLTreeDigestTest.writeUniformResultsWithCompression`. Similarly for `writeExponentialResultsWithCompression`.
2422

23+
In [MergingDigestTest](../../core/src/test/java/com/tdunning/math/stats/MergingDigestTest.java), run
24+
`writeUniformAsymmetricScaleFunctionResults` and `writeExponentialAsymmetricScaleFunctionResults`.
25+
2526
These will write data files.
2627

27-
2828
### Generate plots
2929

30-
Now run the script [generate_plots.py](./generate_plots.py).
30+
Now run the script [generate_plots.py](./generate_plots.py). For convenience, one can run `make install` (from this directory), which handles the requirements and runs the script.
3131

3232
This script expects the results of running the tests above to be present.
3333
It will write plots (as PNG files).
3434
The figures so generated are already present in this repository, see [here](../asymmetric/plots/merging/t_digest_figs_K_0q.png)
3535
and [here](../asymmetric/plots/tree/t_digest_figs_K_0q.png) for example.
36+

docs/asymmetric/generate_plots.py

Lines changed: 168 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,8 @@ def generate_figures(prefixes=scale_function_prefixes, save=False, outfilename="
7777
ax[prefixes.index(prefix), 1].set_yscale('log')
7878
ax[prefixes.index(prefix), 2].set_title(
7979
clean_string(prefix) + implementation + " " + cc_suffix.replace(".csv", "").lstrip("_"))
80-
ax[prefixes.index(prefix), 2].hist(centroid_count_data[prefix]["centroid_count"], range=[5, 95],
80+
ax[prefixes.index(prefix), 2].hist(centroid_count_data[prefix]["centroid_count"],
81+
range=[5, 95],
8182
bins=30)
8283

8384
fig.subplots_adjust(left=0.08, right=0.98, bottom=0.05, top=0.9,
@@ -90,22 +91,24 @@ def generate_figures(prefixes=scale_function_prefixes, save=False, outfilename="
9091

9192

9293
def generate_size_figures(prefix="K_0_USUAL", save=False, outfilename="", value='0.01',
93-
location="", centroid_index=0):
94+
location="", centroid_index=0):
9495
data = {}
9596
centroid_sizes_data = {}
9697

9798
for impl in implementations:
9899
data[impl] = {}
99100
centroid_sizes_data[impl] = {}
100101
for dist in distributions:
101-
data[impl][dist]= {}
102+
data[impl][dist] = {}
102103
centroid_sizes_data[impl][dist] = {}
103104
filename = "{0}_{1}.csv".format(prefix, value)
104105
with open("{0}/{1}/{2}".format(location, impl, dist) + "/" + filename, 'r') as f:
105106
data[impl][dist][value] = pd.read_csv(f)
106-
with open("{0}/{1}/{2}".format(location, impl, dist) + "/" + prefix + cs_suffix, 'r') as f:
107+
with open("{0}/{1}/{2}".format(location, impl, dist) + "/" + prefix + cs_suffix,
108+
'r') as f:
107109
_d = f.readlines()
108-
centroid_sizes_data[impl][dist][prefix] = [[int(x) for x in y.rstrip(',\n').split(',')] for y in _d]
110+
centroid_sizes_data[impl][dist][prefix] = [
111+
[int(x) for x in y.rstrip(',\n').split(',')] for y in _d]
109112

110113
fig, ax = plt.subplots(len(implementations), len(distributions), squeeze=False)
111114
fig.set_figheight(15)
@@ -120,41 +123,186 @@ def generate_size_figures(prefix="K_0_USUAL", save=False, outfilename="", value=
120123
df = data[impl][dist][v]
121124
error_q_list.append(df['error_q'])
122125
norm_error_q_list.append(df['norm_error_q'])
123-
title = "{0}, {1}, {2}, q={3}, index {4}".format(clean_string(prefix), impl, dist.lower(), value, str(centroid_index))
126+
title = "{0}, {1}, {2}, q={3}, index {4}".format(clean_string(prefix), impl,
127+
dist.lower(), value,
128+
str(centroid_index))
124129
ax[implementations.index(impl), distributions.index(dist)].set_title(title)
125130
_a, b = centroid_sizes_data[impl][dist][prefix], df['norm_error_q']
126131
a = [i[centroid_index] for i in _a]
127132
ax[implementations.index(impl), distributions.index(dist)].scatter(a, b)
128133

129134
fig.subplots_adjust(left=0.08, right=0.98, bottom=0.05, top=0.9,
130-
hspace=0.4, wspace=0.3)
135+
hspace=0.4, wspace=0.3)
131136

132137
if save is True:
133138
plt.savefig(outfilename)
134139
elif save is False:
135140
plt.show()
136141

137142

138-
params = [ ("{0}/{1}/{2}/".format(out_prefix, impl, dist), "{0}/{1}/{2}/".format(in_prefix, impl, dist),
139-
" ({0}, {1})".format(impl, dist.lower())) for impl in implementations for dist in distributions]
143+
def _distribution_label(location):
    """Return the lower-cased name of the last directory in ``location``.

    ``location`` is expected to end with ``/`` (e.g. ``"data/tree/UNIFORM/"``), so the
    directory name is the second-to-last ``/``-separated component.
    """
    return location.split('/')[-2].lower()


def _paired_boxplot(axis, title, samples, positions, locations):
    """Draw the box plots of both locations side by side on ``axis``.

    ``samples`` and ``positions`` are dicts keyed by location.  The first location is
    drawn slightly to the left of each position with a thin orange median line, the
    second slightly to the right with a thick blue median line.
    """
    axis.set_title(title)
    styles = ((-.15, 1.5, 'orange'), (.15, 4.5, 'blue'))
    for location, (offset, linewidth, color) in zip(locations, styles):
        axis.boxplot(samples[location],
                     positions=[x + offset for x in positions[location]],
                     whis=[5, 95],
                     showfliers=False, widths=0.2,
                     medianprops=dict(linestyle='-', linewidth=linewidth,
                                      label=_distribution_label(location), color=color))

    # Every box contributes a median handle carrying its location's label; the first
    # and last handles therefore represent the first and second location.
    handles, labels = axis.get_legend_handles_labels()
    if handles:
        axis.legend([handles[0], handles[-1]], [labels[0], labels[-1]])
    axis.set_xticks(range(-5, 6))
    axis.set_xticklabels(range(-5, 6))
    axis.set_yscale('log')


def generate_figures_both_distr(prefixes=scale_function_prefixes, save=False, outfilename="",
                                locations=("",), implementation=""):
    """Plot error and centroid-count summaries for two data locations in one figure.

    For every prefix one row of three panels is drawn: box plots of the ``error_q``
    column, box plots of the ``norm_error_q`` column, and overlaid histograms of the
    ``centroid_count`` column, each showing both locations.

    :param prefixes: scale function prefixes; one figure row per prefix.
    :param save: if true the figure is written to ``outfilename``, otherwise it is shown.
    :param outfilename: path passed to ``plt.savefig`` when ``save`` is true.
    :param locations: exactly two directory paths, each ending with ``/``, holding the
        CSV files.  The last directory name of each path is used as the legend label.
    :param implementation: text appended to the prefix in every panel title.
    :raises ValueError: if ``locations`` does not contain exactly two entries.
    """
    prefixes = list(prefixes)
    locations = list(locations)
    if len(locations) != 2:
        # Fail before touching the file system rather than at the unpacking below.
        raise ValueError(
            "generate_figures_both_distr needs exactly two locations, got {0}".format(
                len(locations)))
    location_0, location_1 = locations
    skipped_suffixes = (cc_suffix, cs_suffix)

    data = {}
    centroid_count_data = {}
    for prefix in prefixes:
        data[prefix] = {}
        centroid_count_data[prefix] = {}
        for location in locations:
            frames = {}
            for filename in os.listdir(location):
                # Keep only the per-quantile files of this prefix; the centroid count
                # and centroid size files share the prefix but are read separately.
                if not filename.startswith(prefix) or filename.endswith(skipped_suffixes):
                    continue
                value = filename.replace(prefix + "_", "").replace(".csv", "")
                with open(location + filename, 'r') as f:
                    frames[value] = pd.read_csv(f)
            data[prefix][location] = frames
            with open(location + prefix + cc_suffix, 'r') as f:
                centroid_count_data[prefix][location] = pd.read_csv(f)

    fig, ax = plt.subplots(len(prefixes), 3, squeeze=False)
    fig.set_figheight(4 * len(prefixes))
    fig.set_figwidth(15)

    histogram_title_suffix = " " + cc_suffix.replace(".csv", "").lstrip("_")
    for row, prefix in enumerate(prefixes):
        error_q, norm_error_q, positions = {}, {}, {}
        for location in locations:
            frames = data[prefix][location]
            # axis_labels maps the value encoded in the file name to an x position.
            positions[location] = [axis_labels[v] for v in frames]
            error_q[location] = [frames[v]['error_q'] for v in frames]
            norm_error_q[location] = [frames[v]['norm_error_q'] for v in frames]

        title_base = clean_string(prefix) + implementation
        _paired_boxplot(ax[row, 0], title_base + " error", error_q, positions, locations)
        _paired_boxplot(ax[row, 1], title_base + " norm_error", norm_error_q, positions,
                        locations)

        ax[row, 2].set_title(title_base + histogram_title_suffix)
        for location, color in ((location_0, 'orange'), (location_1, 'blue')):
            ax[row, 2].hist(centroid_count_data[prefix][location]["centroid_count"],
                            range=[20, 100], bins=40, color=color, alpha=0.5,
                            label=_distribution_label(location))
        ax[row, 2].legend()

    fig.subplots_adjust(left=0.08, right=0.98, bottom=0.05, top=0.9,
                        hspace=0.4, wspace=0.3)
    if save:
        plt.savefig(outfilename)
        # Release the figure so that batch runs do not accumulate open figures.
        plt.close(fig)
    else:
        plt.show()
250+
251+
252+
# Parameters for the separate per-distribution plots: one
# (output directory, input directory, title suffix) triple for every
# (implementation, distribution) pair.
_params = [
    (
        "{0}/{1}/{2}/".format(out_prefix, impl, dist),
        "{0}/{1}/{2}/".format(in_prefix, impl, dist),
        " ({0}, {1})".format(impl, dist.lower()),
    )
    for impl in implementations
    for dist in distributions
]

# Parameters for the combined plots: per implementation, the "BOTH" output
# directory, the list of input directories (one per distribution) and the
# title suffix.
params = [
    (
        "{0}/{1}/BOTH/".format(out_prefix, impl),
        ["{0}/{1}/{2}/".format(in_prefix, impl, dist) for dist in distributions],
        " ({0})".format(impl),
    )
    for impl in ['tree', 'merging']
]
261+
140262

141263
def main():
    """Generate and save every figure.

    Three groups of figures are written:

    * for each entry of ``params``, the combined plots showing both distributions;
    * the centroid size scatter plots for ``K_0_USUAL`` at q = 0.99 and q = 0.999,
      using the last centroid (index -1);
    * for each entry of ``_params``, the separate per-distribution plots.
    """
    # (output file name suffix, scale function prefixes shown in that figure)
    figure_sets = [
        ("K_0q", ["K_0_USUAL", "K_QUADRATIC"]),
        ("K_1", ["K_1_{}".format(y) for y in ["USUAL", "GLUED"]]),
        ("K_2", ["K_2_{}".format(y) for y in ["USUAL", "GLUED"]]),
        ("K_3", ["K_3_{}".format(y) for y in ["USUAL", "GLUED"]]),
    ]

    for out_dir, in_dirs, title_suffix in params:
        for name, prefixes in figure_sets:
            generate_figures_both_distr(prefixes=prefixes, save=True,
                                        outfilename="{}t_digest_figs_{}".format(out_dir, name),
                                        locations=in_dirs, implementation=title_suffix)

    fcn = 'K_0_USUAL'
    centroid_index = -1
    for v in ['0.99', '0.999']:
        outfile = out_prefix + '/' + 'size/' + fcn + '_' + v + '_' + str(centroid_index) + '.png'
        # Called once per value: a second identical call would only rewrite the
        # same output file.
        generate_size_figures(location=in_prefix + '/', prefix=fcn, value=v,
                              centroid_index=centroid_index,
                              outfilename=outfile, save=True)

    # these plots are no longer used in the paper
    for out_dir, in_dir, title_suffix in _params:
        for name, prefixes in figure_sets:
            generate_figures(prefixes=prefixes, save=True,
                             outfilename="{}t_digest_figs_{}".format(out_dir, name),
                             location=in_dir, implementation=title_suffix)
305+
158306

159307
if __name__ == "__main__":
160308
main()
85.2 KB
Loading
83.1 KB
Loading
82.9 KB
Loading
79 KB
Loading
68.8 KB
Loading
62.7 KB
Loading
62 KB
Loading

0 commit comments

Comments
 (0)