tdunning
diff --git a/‎docs/asymmetric/Makefile‎
Lines changed: 13 additions & 0 deletions b/‎docs/asymmetric/Makefile‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎docs/asymmetric/README.md‎
Lines changed: 6 additions & 5 deletions b/‎docs/asymmetric/README.md‎
Lines changed: 6 additions & 5 deletions
diff --git a/‎docs/asymmetric/generate_plots.py‎
Lines changed: 168 additions & 20 deletions b/‎docs/asymmetric/generate_plots.py‎
Lines changed: 168 additions & 20 deletions
diff --git a/‎docs/asymmetric/plots/merging/BOTH/t_digest_figs_K_0q.png‎
85.2 KB b/‎docs/asymmetric/plots/merging/BOTH/t_digest_figs_K_0q.png‎
85.2 KB
diff --git a/‎docs/asymmetric/plots/merging/BOTH/t_digest_figs_K_1.png‎
83.1 KB b/‎docs/asymmetric/plots/merging/BOTH/t_digest_figs_K_1.png‎
83.1 KB
diff --git a/‎docs/asymmetric/plots/merging/BOTH/t_digest_figs_K_2.png‎
82.9 KB b/‎docs/asymmetric/plots/merging/BOTH/t_digest_figs_K_2.png‎
82.9 KB
diff --git a/‎docs/asymmetric/plots/merging/BOTH/t_digest_figs_K_3.png‎
79 KB b/‎docs/asymmetric/plots/merging/BOTH/t_digest_figs_K_3.png‎
79 KB
diff --git a/‎docs/asymmetric/plots/merging/EXPONENTIAL/t_digest_figs_K_0q.png‎
68.8 KB b/‎docs/asymmetric/plots/merging/EXPONENTIAL/t_digest_figs_K_0q.png‎
68.8 KB
diff --git a/‎docs/asymmetric/plots/merging/EXPONENTIAL/t_digest_figs_K_1.png‎
62.7 KB b/‎docs/asymmetric/plots/merging/EXPONENTIAL/t_digest_figs_K_1.png‎
62.7 KB
diff --git a/‎docs/asymmetric/plots/merging/EXPONENTIAL/t_digest_figs_K_2.png‎
62 KB b/‎docs/asymmetric/plots/merging/EXPONENTIAL/t_digest_figs_K_2.png‎
62 KB
@@ -0,0 +1,13 @@
+PYTHON?=python3
+VENV?=venv
+REQUIREMENTS?=requirements.txt
+
+.PHONY: install reinstall
+# install: create the virtualenv, install the requirements into it, then generate the plots.
+install:
+	$(PYTHON) -m virtualenv $(VENV)
+	. $(VENV)/bin/activate && $(PYTHON) -m pip install -r $(REQUIREMENTS) && $(PYTHON) generate_plots.py
+# reinstall: delete the virtualenv and run install again from scratch.
+reinstall:
+	rm -rf $(VENV)
+	$(MAKE) install
@@ -18,18 +18,19 @@ The data and summarizing plots can be produced in two steps.
 ### Generate data
 
 In [TDigestTest](../../core/src/test/java/com/tdunning/math/stats/TDigestTest.java), run `writeUniformResultsWithCompression` with the `AVLTreeDigest` implementation, i.e., run
-`AVLTreeDigestTest.writeUniformResultsWithCompression`.
-
-In [MergingDigestTest](../../core/src/test/java/com/tdunning/math/stats/MergingDigestTest.java), run `writeAsymmetricScaleFunctionResults`.
+`AVLTreeDigestTest.writeUniformResultsWithCompression`. Do the same for `writeExponentialResultsWithCompression`.
 
+In [MergingDigestTest](../../core/src/test/java/com/tdunning/math/stats/MergingDigestTest.java), run
+ `writeUniformAsymmetricScaleFunctionResults` and `writeExponentialAsymmetricScaleFunctionResults`.
+ 
 These will write data files.
 
-
 ### Generate plots
 
-Now run the script [generate_plots.py](./generate_plots.py).
+Now run the script [generate_plots.py](./generate_plots.py). For convenience, one can run `make install` (from this directory), which handles the requirements and runs the script.
 
 This script expects the results of running the tests above to be present.
 It will write plots (as PNG files).
 The figures so generated are already present in this repository, see [here](../asymmetric/plots/merging/t_digest_figs_K_0q.png)
 and [here](../asymmetric/plots/tree/t_digest_figs_K_0q.png) for example.
+
@@ -77,7 +77,8 @@ def generate_figures(prefixes=scale_function_prefixes, save=False, outfilename="
         ax[prefixes.index(prefix), 1].set_yscale('log')
         ax[prefixes.index(prefix), 2].set_title(
             clean_string(prefix) + implementation + " " + cc_suffix.replace(".csv", "").lstrip("_"))
-        ax[prefixes.index(prefix), 2].hist(centroid_count_data[prefix]["centroid_count"], range=[5, 95],
+        ax[prefixes.index(prefix), 2].hist(centroid_count_data[prefix]["centroid_count"],
+                                           range=[5, 95],
                                            bins=30)
 
     fig.subplots_adjust(left=0.08, right=0.98, bottom=0.05, top=0.9,
@@ -90,22 +91,24 @@ def generate_figures(prefixes=scale_function_prefixes, save=False, outfilename="
 
 
 def generate_size_figures(prefix="K_0_USUAL", save=False, outfilename="", value='0.01',
-                     location="", centroid_index=0):
+                          location="", centroid_index=0):
     data = {}
     centroid_sizes_data = {}
 
     for impl in implementations:
         data[impl] = {}
         centroid_sizes_data[impl] = {}
         for dist in distributions:
-            data[impl][dist]= {}
+            data[impl][dist] = {}
             centroid_sizes_data[impl][dist] = {}
             filename = "{0}_{1}.csv".format(prefix, value)
             with open("{0}/{1}/{2}".format(location, impl, dist) + "/" + filename, 'r') as f:
                 data[impl][dist][value] = pd.read_csv(f)
-            with open("{0}/{1}/{2}".format(location, impl, dist) + "/" + prefix + cs_suffix, 'r') as f:
+            with open("{0}/{1}/{2}".format(location, impl, dist) + "/" + prefix + cs_suffix,
+                      'r') as f:
                 _d = f.readlines()
-                centroid_sizes_data[impl][dist][prefix] = [[int(x) for x in y.rstrip(',\n').split(',')] for y in _d]
+                centroid_sizes_data[impl][dist][prefix] = [
+                    [int(x) for x in y.rstrip(',\n').split(',')] for y in _d]
 
     fig, ax = plt.subplots(len(implementations), len(distributions), squeeze=False)
     fig.set_figheight(15)
@@ -120,41 +123,186 @@ def generate_size_figures(prefix="K_0_USUAL", save=False, outfilename="", value=
                 df = data[impl][dist][v]
                 error_q_list.append(df['error_q'])
                 norm_error_q_list.append(df['norm_error_q'])
-                title = "{0}, {1}, {2}, q={3}, index {4}".format(clean_string(prefix), impl, dist.lower(), value, str(centroid_index))
+                title = "{0}, {1}, {2}, q={3}, index {4}".format(clean_string(prefix), impl,
+                                                                 dist.lower(), value,
+                                                                 str(centroid_index))
                 ax[implementations.index(impl), distributions.index(dist)].set_title(title)
                 _a, b = centroid_sizes_data[impl][dist][prefix], df['norm_error_q']
                 a = [i[centroid_index] for i in _a]
                 ax[implementations.index(impl), distributions.index(dist)].scatter(a, b)
 
     fig.subplots_adjust(left=0.08, right=0.98, bottom=0.05, top=0.9,
-                            hspace=0.4, wspace=0.3)
+                        hspace=0.4, wspace=0.3)
 
     if save is True:
         plt.savefig(outfilename)
     elif save is False:
         plt.show()
 
 
-params = [ ("{0}/{1}/{2}/".format(out_prefix, impl, dist), "{0}/{1}/{2}/".format(in_prefix, impl, dist),
-            " ({0}, {1})".format(impl, dist.lower())) for impl in implementations for dist in distributions]
+def generate_figures_both_distr(prefixes=scale_function_prefixes, save=False, outfilename="",
+                                locations=[""], implementation=""):
+    """Plot two distributions on shared axes, one row of three panels per prefix.
+
+    For every prefix the row holds paired box plots of ``error_q`` and of ``norm_error_q``
+    (positioned via ``axis_labels``; the first location is shifted left, the second right)
+    and overlaid histograms of ``centroid_count``.
+
+    :param prefixes: file-name prefixes of the scale functions to plot, one row each.
+    :param save: ``True`` writes the figure to ``outfilename``; ``False`` shows it.
+    :param outfilename: path passed to ``plt.savefig`` when ``save`` is ``True``.
+    :param locations: exactly two input directories, each ending in ``/``; the lower-cased
+        directory name is used as the legend label. The default is only a placeholder and
+        must be overridden by the caller.
+    :param implementation: text appended to the prefix in each panel title.
+    """
+    # Result tables: data[prefix][location][value] -> DataFrame
+    data = {}
+    for prefix in prefixes:
+        data[prefix] = {}
+        for location in locations:
+            data[prefix][location] = {}
+            for filename in os.listdir(location):
+                # skip other scale functions and the centroid count/size files
+                if not filename.startswith(prefix) or filename.endswith((cc_suffix, cs_suffix)):
+                    continue
+                value = filename.replace(prefix + "_", "").replace(".csv", "")
+                with open(location + filename, 'r') as f:
+                    data[prefix][location][value] = pd.read_csv(f)
+
+    # Centroid counts: centroid_count_data[prefix][location] -> DataFrame
+    centroid_count_data = {}
+    for prefix in prefixes:
+        centroid_count_data[prefix] = {}
+        for location in locations:
+            with open(location + prefix + cc_suffix, 'r') as f:
+                centroid_count_data[prefix][location] = pd.read_csv(f)
+
+    fig, ax = plt.subplots(len(prefixes), 3, squeeze=False)
+    fig.set_figheight(4 * len(prefixes))
+    fig.set_figwidth(15)
+
+    location_0, location_1 = locations
+    # legend label is the last directory component of a path that ends in '/'
+    label_0 = location_0.split('/')[-2].lower()
+    label_1 = location_1.split('/')[-2].lower()
+
+    for row, prefix in enumerate(prefixes):
+        error_q_list, norm_error_q_list, pos = {}, {}, {}
+        for location in locations:
+            error_q_list[location] = []
+            norm_error_q_list[location] = []
+            pos[location] = []
+            for v in data[prefix][location]:
+                pos[location].append(axis_labels[v])
+                df = data[prefix][location][v]
+                error_q_list[location].append(df['error_q'])
+                norm_error_q_list[location].append(df['norm_error_q'])
+
+        # columns 0 and 1 differ only in the plotted series and the title suffix
+        panels = ((0, error_q_list, " error"), (1, norm_error_q_list, " norm_error"))
+        for col, series, title_suffix in panels:
+            axis = ax[row, col]
+            axis.set_title(clean_string(prefix) + implementation + title_suffix)
+            axis.boxplot(series[location_0],
+                         positions=[x - .15 for x in pos[location_0]],
+                         whis=[5, 95], showfliers=False, widths=0.2,
+                         medianprops=dict(linestyle='-', linewidth=1.5, label=label_0,
+                                          color='orange'))
+            axis.boxplot(series[location_1],
+                         positions=[x + .15 for x in pos[location_1]],
+                         whis=[5, 95], showfliers=False, widths=0.2,
+                         medianprops=dict(linestyle='-', linewidth=4.5, label=label_1,
+                                          color='blue'))
+
+            # the label is attached through medianprops, so keep only the first and the
+            # last handle: one legend entry per location
+            handles, labels = axis.get_legend_handles_labels()
+            axis.legend([handles[0], handles[-1]], [labels[0], labels[-1]])
+            axis.set_xticks(range(-5, 6))
+            axis.set_xticklabels(range(-5, 6))
+            axis.set_yscale('log')
+
+        counts_axis = ax[row, 2]
+        counts_axis.set_title(
+            clean_string(prefix) + implementation + " " + cc_suffix.replace(".csv", "").lstrip("_"))
+        for location, label, color in ((location_0, label_0, 'orange'),
+                                       (location_1, label_1, 'blue')):
+            counts_axis.hist(centroid_count_data[prefix][location]["centroid_count"],
+                             range=[20, 100], bins=40, color=color, alpha=0.5, label=label)
+        counts_axis.legend()
+
+    fig.subplots_adjust(left=0.08, right=0.98, bottom=0.05, top=0.9,
+                        hspace=0.4, wspace=0.3)
+    if save is True:
+        plt.savefig(outfilename)
+        # release the figure once it is on disk so repeated calls do not pile up open figures
+        plt.close(fig)
+    elif save is False:
+        plt.show()
+
+
+# One (output directory, input directory, title suffix) tuple per implementation and
+# distribution; used by main() with generate_figures for the separate per-distribution plots.
+_params = [
+    ("{0}/{1}/{2}/".format(out_prefix, impl, dist), "{0}/{1}/{2}/".format(in_prefix, impl, dist),
+     " ({0}, {1})".format(impl, dist.lower())) for impl in implementations for dist in
+    distributions]
+
+# One (output directory, [input directory per distribution], title suffix) tuple per
+# implementation; used by main() with generate_figures_both_distr for the combined plots.
+params = [("{0}/{1}/BOTH/".format(out_prefix, impl),
+           ["{0}/{1}/{2}/".format(in_prefix, impl, dist) for dist in distributions],
+           " ({0})".format(impl)) for impl in ['tree', 'merging']]
+
 
 def main():
+    """Write the combined-distribution, centroid-size and per-distribution figures to disk."""
     for a, b, c in params:
+        generate_figures_both_distr(prefixes=["K_0_USUAL", "K_QUADRATIC"], save=True,
+                                    outfilename="{}t_digest_figs_K_0q".format(a), locations=b,
+                                    implementation=c)
+        generate_figures_both_distr(prefixes=["K_1_{}".format(y) for y in ["USUAL", "GLUED"]],
+                                    save=True,
+                                    outfilename="{}t_digest_figs_K_1".format(a), locations=b,
+                                    implementation=c)
+        generate_figures_both_distr(prefixes=["K_2_{}".format(y) for y in ["USUAL", "GLUED"]],
+                                    save=True,
+                                    outfilename="{}t_digest_figs_K_2".format(a), locations=b,
+                                    implementation=c)
+        generate_figures_both_distr(prefixes=["K_3_{}".format(y) for y in ["USUAL", "GLUED"]],
+                                    save=True,
+                                    outfilename="{}t_digest_figs_K_3".format(a), locations=b,
+                                    implementation=c)
+    for v in ['0.99', '0.999']:
+        fcn = 'K_0_USUAL'
+        centroid_index = -1
+        outfile = out_prefix + '/' + 'size/' + fcn + '_' + v + '_' + str(centroid_index) + '.png'
+        # one call per quantile: a repeat with identical arguments only rewrites the same file
+        generate_size_figures(location=in_prefix + '/', prefix=fcn, value=v,
+                              centroid_index=centroid_index,
+                              outfilename=outfile, save=True)
+
+    # these plots are no longer used in the paper
+    for a, b, c in _params:
         generate_figures(prefixes=["K_0_USUAL", "K_QUADRATIC"], save=True,
-                         outfilename="{}t_digest_figs_K_0q".format(a), location=b, implementation=c)
+                         outfilename="{}t_digest_figs_K_0q".format(a), location=b,
+                         implementation=c)
         generate_figures(prefixes=["K_1_{}".format(y) for y in ["USUAL", "GLUED"]], save=True,
-                         outfilename="{}t_digest_figs_K_1".format(a), location=b, implementation=c)
+                         outfilename="{}t_digest_figs_K_1".format(a), location=b,
+                         implementation=c)
         generate_figures(prefixes=["K_2_{}".format(y) for y in ["USUAL", "GLUED"]], save=True,
-                         outfilename="{}t_digest_figs_K_2".format(a), location=b, implementation=c)
+                         outfilename="{}t_digest_figs_K_2".format(a), location=b,
+                         implementation=c)
         generate_figures(prefixes=["K_3_{}".format(y) for y in ["USUAL", "GLUED"]], save=True,
-                         outfilename="{}t_digest_figs_K_3".format(a), location=b, implementation=c)
-    for centroid_index, v in [(-1, '0.99'), (-1, '0.999'), (0, '0.01')]:
-        fcn = 'K_0_USUAL'
-        outfile = "{0}/size/{1}_{2}_{3}.png".format(out_prefix, fcn, v, str(centroid_index))
-        generate_size_figures(location=in_prefix + '/', prefix=fcn, value=v, centroid_index=centroid_index,
-                             outfilename=outfile, save=True)
-        generate_size_figures(location=in_prefix + '/', prefix=fcn, value=v, centroid_index=centroid_index,
-                             outfilename=outfile, save=True)
+                         outfilename="{}t_digest_figs_K_3".format(a), location=b,
+                         implementation=c)
+
 
 if __name__ == "__main__":
     main()