diff --git a/core/src/main/java/com/tdunning/math/stats/MergingDigest.java b/core/src/main/java/com/tdunning/math/stats/MergingDigest.java
index 6e1047ba..20e1bb83 100644
--- a/core/src/main/java/com/tdunning/math/stats/MergingDigest.java
+++ b/core/src/main/java/com/tdunning/math/stats/MergingDigest.java
@@ -214,6 +214,9 @@ public MergingDigest(double compression, int bufferSize, int size) {
             size = (int) Math.ceil(this.compression + sizeFudge);
         }
 
+        // yet more fudge
+        size += 20;
+
         // ensure enough space in buffer (possibly again)
         if (bufferSize <= 2 * size) {
             bufferSize = 2 * size;
@@ -240,6 +243,10 @@ public TDigest recordAllData() {
         return this;
     }
 
+    public void setUseAlternatingSort(boolean bool) {
+        useAlternatingSort = bool;
+    }
+
     @Override
     void add(double x, int w, Centroid base) {
         add(x, w, base.data());
diff --git a/core/src/main/java/com/tdunning/math/stats/ScaleFunction.java b/core/src/main/java/com/tdunning/math/stats/ScaleFunction.java
index 606eb825..4661b41f 100644
--- a/core/src/main/java/com/tdunning/math/stats/ScaleFunction.java
+++ b/core/src/main/java/com/tdunning/math/stats/ScaleFunction.java
@@ -122,6 +122,87 @@ public double normalizer(double compression, double n) {
         }
     },
 
+    /**
+     * Generates cluster sizes proportional to sqrt(1 - q) for q >= 1/2 and uniform cluster sizes for
+     * q < 1/2 by gluing the graph of the K_1 function to its tangent line at q = 1/2. Changing the
+     * split point is possible.
+     */
+    K_1_GLUED {
+
+        double splitPoint = 0.5;
+
+        @Override
+        public double k(double q, double compression, double n) {
+            if (q <= splitPoint) {
+                return (compression / (2 * Math.PI)) * (Math.asin(2 * splitPoint - 1)
+                        + (q - splitPoint) / (Math.sqrt(splitPoint * (1 - splitPoint))));
+            } else {
+                return compression * Math.asin(2 * q - 1) / (2 * Math.PI);
+            }
+        }
+
+        @Override
+        public double k(double q, double normalizer) {
+            if (q <= splitPoint) {
+                return normalizer * (Math.asin(2 * splitPoint - 1)
+                        + (q - splitPoint) / (Math.sqrt(splitPoint * (1 - splitPoint))));
+            } else {
+                return normalizer * Math.asin(2 * q - 1);
+            }
+        }
+
+        @Override
+        public double q(double k, double compression, double n) {
+            if (k <= compression * Math.asin(2 * splitPoint - 1) / Math.PI / 2) {
+                double ww = k * Math.PI * 2 / compression - Math.asin(2 * splitPoint - 1);
+                return ww * Math.sqrt(splitPoint * (1 - splitPoint)) + splitPoint;
+            } else {
+                return (Math.sin(k * 2 * Math.PI / compression) + 1) / 2;
+            }
+        }
+
+        @Override
+        public double q(double k, double normalizer) {
+            if (k <= normalizer * Math.asin(2 * splitPoint - 1)) {
+                double ww = k / normalizer - Math.asin(2 * splitPoint - 1);
+                return ww * Math.sqrt(splitPoint * (1 - splitPoint)) + splitPoint;
+            } else {
+                return (Math.sin(k / normalizer) + 1) / 2;
+            }
+        }
+
+        @Override
+        public double max(double q, double compression, double n) {
+            if (q <= 0) {
+                return 0;
+            } else if (q >= 1) {
+                return 0;
+            } else if (q <= splitPoint) {
+                return Math.sqrt(splitPoint * (1 - splitPoint)) * 2 * Math.PI / compression;
+            } else {
+                return 2 * Math.sin(Math.PI / compression) * Math.sqrt(q * (1 - q));
+            }
+        }
+
+        @Override
+        public double max(double q, double normalizer) {
+            if (q <= 0) {
+                return 0;
+            } else if (q >= 1) {
+                return 0;
+            } else if (q <= splitPoint) {
+                return Math.sqrt(splitPoint * (1 - splitPoint)) / normalizer;
+            } else {
+                return 2 * Math.sin(0.5 / normalizer) * Math.sqrt(q * (1 - q));
+            }
+        }
+
+        @Override
+        public double normalizer(double compression, double n) {
+            return compression / (2 * Math.PI);
+        }
+    },
+
     /**
      * Generates cluster sizes proportional to sqrt(q*(1-q)) but avoids computation of asin in the critical path by
      * using an approximate version.
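A note on the entry above: the lower branch of K_1_GLUED is exactly the tangent line of K_1 at the split point, since d/dq asin(2q - 1) = 1/sqrt(q(1 - q)), which equals 1/sqrt(splitPoint * (1 - splitPoint)) at the seam, so the two branches agree in value and slope at q = 0.5. A minimal standalone sketch — hypothetical test code, not part of this patch — that exercises this numerically, assuming the patch is applied and assertions are enabled (`java -ea`):

```java
// Hypothetical seam check for K_1_GLUED (not part of this patch).
import com.tdunning.math.stats.ScaleFunction;

public class GlueSeamCheck {
    public static void main(String[] args) {
        double compression = 100;
        double n = 1_000_000;
        ScaleFunction f = ScaleFunction.K_1_GLUED;
        double eps = 1e-9;
        // value continuity at the split point q = 0.5
        double below = f.k(0.5 - eps, compression, n);
        double at = f.k(0.5, compression, n);
        double above = f.k(0.5 + eps, compression, n);
        assert Math.abs(at - below) < 1e-6 && Math.abs(above - at) < 1e-6;
        // q(k(q)) round-trips on both the linear and the asin branch
        assert Math.abs(f.q(f.k(0.25, compression, n), compression, n) - 0.25) < 1e-9;
        assert Math.abs(f.q(f.k(0.75, compression, n), compression, n) - 0.75) < 1e-9;
    }
}
```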
@@ -245,6 +326,100 @@ private double Z(double compression, double n) {
         }
     },
 
+    /**
+     * Generates cluster sizes proportional to 1 - q for q >= 1/2 and uniform cluster sizes for q < 1/2 by
+     * gluing the graph of the K_2 function to its tangent line at q = 1/2. Changing the split point is
+     * possible.
+     */
+    K_2_GLUED {
+
+        double splitPoint = 0.5; // should be between 1e-15 and 1 - 1e-15
+
+        @Override
+        public double k(double q, double compression, double n) {
+            if (n <= 1) {
+                if (q <= 0) {
+                    return -10;
+                } else if (q >= 1) {
+                    return 10;
+                } else {
+                    return 0;
+                }
+            }
+            if (q <= splitPoint) {
+                return (((q - splitPoint) / splitPoint / (1 - splitPoint))
+                        + Math.log(splitPoint / (1 - splitPoint))) * compression / Z(compression, n);
+            } else if (q == 1) {
+                return 2 * k((n - 1) / n, compression, n);
+            } else {
+                return compression * Math.log(q / (1 - q)) / Z(compression, n);
+            }
+        }
+
+        @Override
+        public double k(double q, double normalizer) {
+            if (q <= splitPoint) {
+                return (((q - splitPoint) / splitPoint / (1 - splitPoint))
+                        + Math.log(splitPoint / (1 - splitPoint))) * normalizer;
+            } else if (q > 1 - 1e-15) {
+                // this will return something more extreme than q = (n-1)/n
+                return 2 * k(1 - 1e-15, normalizer);
+            } else {
+                return Math.log(q / (1 - q)) * normalizer;
+            }
+        }
+
+        @Override
+        public double q(double k, double compression, double n) {
+            if (k <= compression * Math.log(splitPoint / (1 - splitPoint)) / Z(compression, n)) {
+                return splitPoint * (1 - splitPoint) * (k * Z(compression, n) / compression
+                        - Math.log(splitPoint / (1 - splitPoint))) + splitPoint;
+            } else {
+                double w = Math.exp(k * Z(compression, n) / compression);
+                return w / (1 + w);
+            }
+        }
+
+        @Override
+        public double q(double k, double normalizer) {
+            if (k <= Math.log(splitPoint / (1 - splitPoint)) * normalizer) {
+                return splitPoint * (1 - splitPoint) * (k / normalizer
+                        - Math.log(splitPoint / (1 - splitPoint))) + splitPoint;
+            } else {
+                double w = Math.exp(k / normalizer);
+                return w / (1 + w);
+            }
+        }
+
+        @Override
+        public double max(double q, double compression, double n) {
+            if (q <= splitPoint) {
+                return Z(compression, n) * splitPoint * (1 - splitPoint) / compression;
+            } else {
+                return Z(compression, n) * q * (1 - q) / compression;
+            }
+        }
+
+        @Override
+        public double max(double q, double normalizer) {
+            if (q <= splitPoint) {
+                return splitPoint * (1 - splitPoint) / normalizer;
+            } else {
+                return q * (1 - q) / normalizer;
+            }
+        }
+
+        @Override
+        public double normalizer(double compression, double n) {
+            return compression / Z(compression, n);
+        }
+
+        private double Z(double compression, double n) {
+            return 4 * Math.log(n / compression) + 24;
+        }
+    },
+
     /**
      * Generates cluster sizes proportional to min(q, 1-q). This makes tail error bounds tighter than for K_1 or K_2.
      * The use of a normalizing function results in a strictly bounded number of clusters no matter how many samples.
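For reference, all three glued variants follow the same recipe: below the split point $p$ the base scale function $k$ is replaced by its tangent line at $p$ (my notation, consistent with the code):

$$
k_{\mathrm{glued}}(q) \;=\; \begin{cases} k(p) + k'(p)\,(q - p), & q \le p,\\[2pt] k(q), & q > p. \end{cases}
$$

For K_2, where $k_2(q) = \frac{\delta}{Z(\delta, n)} \ln\frac{q}{1-q}$ and hence $k_2'(p) = \frac{\delta}{Z(\delta, n)} \cdot \frac{1}{p(1-p)}$ (with $\delta$ the compression), the lower branch works out to

$$
\frac{\delta}{Z(\delta, n)} \left( \frac{q - p}{p(1 - p)} + \ln\frac{p}{1 - p} \right),
$$

which is the expression computed in `k(double q, double compression, double n)` above.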
@@ -318,6 +493,121 @@ private double Z(double compression, double n) {
         }
     },
 
+    /**
+     * Generates cluster sizes proportional to 1 - q for q >= 1/2 and uniform cluster sizes for q < 1/2 by
+     * gluing the graph of the K_3 function to its tangent line at q = 1/2.
+     */
+    K_3_GLUED {
+        @Override
+        public double k(double q, double compression, double n) {
+            if (q <= 0.5) {
+                return compression * (2 * q - 1) / Z(compression, n);
+            } else if (q > 1 - 0.9 / n) {
+                return 10 * k((n - 1) / n, compression, n);
+            } else {
+                return -compression * Math.log(2 * (1 - q)) / Z(compression, n);
+            }
+        }
+
+        @Override
+        public double k(double q, double normalizer) {
+            if (q <= 0.5) {
+                return normalizer * (2 * q - 1);
+            } else if (q > 1 - 1e-15) {
+                return 10 * k(1 - 1e-15, normalizer);
+            } else {
+                return -normalizer * Math.log(2 * (1 - q));
+            }
+        }
+
+        @Override
+        public double q(double k, double compression, double n) {
+            if (k <= 0) {
+                return ((k * Z(compression, n) / compression) + 1) / 2;
+            } else {
+                return 1 - (Math.exp(-k * Z(compression, n) / compression) / 2);
+            }
+        }
+
+        @Override
+        public double q(double k, double normalizer) {
+            if (k <= 0) {
+                return ((k / normalizer) + 1) / 2;
+            } else {
+                return 1 - (Math.exp(-k / normalizer) / 2);
+            }
+        }
+
+        @Override
+        public double max(double q, double compression, double n) {
+            if (q <= 0.5) {
+                return Z(compression, n) / 2d / compression;
+            } else {
+                return Z(compression, n) * (1 - q) / compression;
+            }
+        }
+
+        @Override
+        public double max(double q, double normalizer) {
+            if (q <= 0.5) {
+                return 1d / 2 / normalizer;
+            } else {
+                return (1 - q) / normalizer;
+            }
+        }
+
+        @Override
+        public double normalizer(double compression, double n) {
+            return compression / Z(compression, n);
+        }
+
+        private double Z(double compression, double n) {
+            return 4 * Math.log(n / compression) + 21;
+        }
+    },
+
+    /**
+     * Generates cluster sizes proportional to 1 / (1 + q).
+     */
+    K_QUADRATIC {
+        @Override
+        public double k(double q, double compression, double n) {
+            return compression * (q * q + 2 * q) / 6;
+        }
+
+        @Override
+        public double k(double q, double normalizer) {
+            return normalizer * (q * q + 2 * q) / 3;
+        }
+
+        @Override
+        public double q(double k, double compression, double n) {
+            return Math.sqrt(compression * (compression + 6 * k)) / compression - 1;
+        }
+
+        @Override
+        public double q(double k, double normalizer) {
+            return Math.sqrt(normalizer * (normalizer + 3 * k)) / normalizer - 1;
+        }
+
+        @Override
+        public double max(double q, double compression, double n) {
+            return 3 / compression / (1 + q);
+        }
+
+        @Override
+        public double max(double q, double normalizer) {
+            return 3 / 2 / normalizer / (1 + q);
+        }
+
+        @Override
+        public double normalizer(double compression, double n) {
+            return compression / 2;
+        }
+    },
+
     /**
      * Generates cluster sizes proportional to q*(1-q). This makes the tail error bounds tighter. This version does not
      * use a normalizer function and thus the number of clusters increases roughly proportional to log(n). That is good
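K_QUADRATIC has a closed-form inverse: from $k = \delta (q^2 + 2q)/6$, the quadratic formula gives $q = \sqrt{\delta(\delta + 6k)}/\delta - 1$, which is what the `q` methods above compute. A quick round-trip check — hypothetical code, not part of this patch, run with assertions enabled:

```java
// Hypothetical inverse check for K_QUADRATIC (not part of this patch).
import com.tdunning.math.stats.ScaleFunction;

public class QuadraticInverseCheck {
    public static void main(String[] args) {
        ScaleFunction f = ScaleFunction.K_QUADRATIC;
        double compression = 100;
        double n = 1_000_000; // unused by K_QUADRATIC, but part of the signature
        for (int i = 0; i <= 10; i++) {
            double q = i / 10.0;
            double k = f.k(q, compression, n);
            // sqrt(q^2 + 2q + 1) - 1 == q, so the round trip is exact up to rounding
            assert Math.abs(f.q(k, compression, n) - q) < 1e-9 : "round trip failed at q = " + q;
        }
    }
}
```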
diff --git a/core/src/test/java/com/tdunning/math/stats/MergingDigestTest.java b/core/src/test/java/com/tdunning/math/stats/MergingDigestTest.java
index e373cf39..1bf10651 100644
--- a/core/src/test/java/com/tdunning/math/stats/MergingDigestTest.java
+++ b/core/src/test/java/com/tdunning/math/stats/MergingDigestTest.java
@@ -17,18 +17,26 @@
 package com.tdunning.math.stats;
 
-import com.carrotsearch.randomizedtesting.annotations.Seed;
-import com.google.common.collect.Lists;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+
+import org.apache.commons.math3.util.Pair;
 import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.jet.random.AbstractContinousDistribution;
+import org.apache.mahout.math.jet.random.Exponential;
+import org.apache.mahout.math.jet.random.Uniform;
 import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Random;
+import com.carrotsearch.randomizedtesting.annotations.Seed;
 
 //to freeze the tests with a particular seed, put the seed on the next line
 //@Seed("84527677CF03B566:A6FF596BDDB2D59D")
@@ -57,4 +65,133 @@ public void testSetUp() {
     protected TDigest fromBytes(ByteBuffer bytes) {
         return MergingDigest.fromBytes(bytes);
     }
-}
\ No newline at end of file
+
+    @Test
+    public void writeUniformAsymmetricScaleFunctionResults() {
+        try {
+            writeAsymmetricScaleFunctionResults(Distribution.UNIFORM);
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+
+    @Test
+    public void writeExponentialAsymmetricScaleFunctionResults() {
+        try {
+            writeAsymmetricScaleFunctionResults(Distribution.EXPONENTIAL);
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+
+    private void writeAsymmetricScaleFunctionResults(Distribution distribution) throws Exception {
+
+        List<ScaleFunction> scaleFcns = Arrays.asList(ScaleFunction.K_0, ScaleFunction.K_1,
+                ScaleFunction.K_2, ScaleFunction.K_3, ScaleFunction.K_1_GLUED,
+                ScaleFunction.K_2_GLUED, ScaleFunction.K_3_GLUED, ScaleFunction.K_QUADRATIC);
+
+        int numTrials = 100;
+
+        Map<String, Pair<ScaleFunction, Boolean>> digestParams = new HashMap<>();
+
+        for (ScaleFunction fcn : scaleFcns) {
+            if (fcn.toString().endsWith("GLUED") || fcn.toString().endsWith("QUADRATIC")) {
+                digestParams.put(fcn.toString(), new Pair<>(fcn, false));
+            } else {
+                digestParams.put(fcn.toString() + "_USUAL", new Pair<>(fcn, false));
+            }
+        }
+        writeSeveralDigestUniformResults(digestParams, numTrials, distribution,
+                "../docs/asymmetric/data/merging/" + distribution.name() + "/");
+    }
+
+    private void writeSeveralDigestUniformResults(Map<String, Pair<ScaleFunction, Boolean>> digestParams,
+            int numTrials, Distribution distribution, String writeLocation) throws Exception {
+
+        int trialSize = 1_000_000;
+        double compression = 100;
+        double[] quants = new double[]{0.00001, 0.0001, 0.001, 0.01, 0.1,
+                0.5, 0.9, 0.99, 0.999, 0.9999, 0.99999};
+
+        Map<String, List<Integer>> centroidCounts = new HashMap<>();
+
+        Map<String, List<List<Integer>>> centroidSequences = new HashMap<>();
+
+        for (Map.Entry<String, Pair<ScaleFunction, Boolean>> entry : digestParams.entrySet()) {
+            centroidCounts.put(entry.getKey(), new ArrayList<Integer>());
+            centroidSequences.put(entry.getKey(), new ArrayList<List<Integer>>());
+            try {
+                Map<Double, List<String>> records = new HashMap<>();
+                for (double q : quants) {
+                    records.put(q, new ArrayList<String>());
+                }
+                for (int j = 0; j < numTrials; j++) {
+                    MergingDigest digest = (MergingDigest) factory(compression).create();
+                    digest.setScaleFunction(entry.getValue().getFirst());
+                    digest.setUseAlternatingSort(entry.getValue().getSecond());
+                    Random rand = new Random();
+                    AbstractContinousDistribution gen;
+                    if (distribution.equals(Distribution.UNIFORM)) {
+                        gen = new Uniform(50, 51, rand);
+                    } else if (distribution.equals(Distribution.EXPONENTIAL)) {
+                        gen = new Exponential(5, rand);
+                    } else {
+                        throw new Exception("distribution not specified");
+                    }
+                    double[] data = new double[trialSize];
+                    for (int i = 0; i < trialSize; i++) {
+                        data[i] = gen.nextDouble();
+                        digest.add(data[i]);
+                    }
+                    Arrays.sort(data);
+                    digest.compress();
+                    for (double q : quants) {
+                        double x1 = Dist.quantile(q, data);
+                        double q1 = Dist.cdf(x1, data);
+                        double q2 = digest.cdf(x1);
+                        records.get(q).add(String.valueOf(Math.abs(q1 - q2)) + ","
+                                + String.valueOf(Math.abs(q1 - q2) / Math.min(q, 1 - q)) + "\n");
+                    }
+                    centroidCounts.get(entry.getKey()).add(digest.centroids().size());
+
+                    List<Integer> seq = new ArrayList<>();
+                    for (Centroid c : digest.centroids()) {
+                        seq.add(c.count());
+                    }
+                    centroidSequences.get(entry.getKey()).add(seq);
+                }
+                for (double q : quants) {
+                    FileWriter csvWriter = new FileWriter(writeLocation + entry.getKey() + "_" + String.valueOf(q) + ".csv");
+                    csvWriter.append("error_q,norm_error_q\n");
+                    for (String obs : records.get(q)) {
+                        csvWriter.append(obs);
+                    }
+                    csvWriter.flush();
+                    csvWriter.close();
+                }
+
+                FileWriter csvWriter = new FileWriter(writeLocation + entry.getKey() + "_centroid_counts.csv");
+                csvWriter.append("centroid_count\n");
+                for (Integer ct : centroidCounts.get(entry.getKey())) {
+                    csvWriter.append(ct.toString()).append("\n");
+                }
+                csvWriter.flush();
+                csvWriter.close();
+
+                FileWriter csvWriter2 = new FileWriter(writeLocation + entry.getKey() + "_centroid_sizes.csv");
+                for (List<Integer> ct : centroidSequences.get(entry.getKey())) {
+                    for (Integer c : ct) {
+                        csvWriter2.append(c.toString()).append(",");
+                    }
+                    csvWriter2.append("\n");
+                }
+                csvWriter2.flush();
+                csvWriter2.close();
+
+            } catch (IOException e) {
+                System.out.println(e.toString());
+                return;
+            }
+        }
+    }
+}
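Both test harnesses (the one above for MergingDigest and the one below in the base TDigestTest) record the same two error measures per trial at each probed quantile $q$: with $q_1$ the empirical cdf of the sorted sample at the exact quantile value and $\hat q$ the digest's cdf estimate at the same point,

$$
\mathrm{error}_q = |q_1 - \hat q|, \qquad \mathrm{norm\_error}_q = \frac{|q_1 - \hat q|}{\min(q,\, 1 - q)}.
$$

The normalization makes tail errors (e.g. at $q = 10^{-5}$) comparable to errors near the median, which is what the box plots in `generate_plots.py` display.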
diff --git a/core/src/test/java/com/tdunning/math/stats/TDigestTest.java b/core/src/test/java/com/tdunning/math/stats/TDigestTest.java
index 41dfd4d7..a16978c5 100644
--- a/core/src/test/java/com/tdunning/math/stats/TDigestTest.java
+++ b/core/src/test/java/com/tdunning/math/stats/TDigestTest.java
@@ -18,8 +18,10 @@
 package com.tdunning.math.stats;
 
 import com.google.common.collect.Lists;
+
 import org.apache.mahout.common.RandomUtils;
 import org.apache.mahout.math.jet.random.AbstractContinousDistribution;
+import org.apache.mahout.math.jet.random.Exponential;
 import org.apache.mahout.math.jet.random.Gamma;
 import org.apache.mahout.math.jet.random.Normal;
 import org.apache.mahout.math.jet.random.Uniform;
@@ -31,7 +33,6 @@
 import java.util.concurrent.*;
 import java.util.concurrent.atomic.AtomicInteger;
 
-
 /**
  * Base test case for TDigests, just extend this class and implement the abstract methods.
  */
@@ -44,6 +45,8 @@ public abstract class TDigestTest extends AbstractTest {
 
     private static String digestName;
 
+    protected enum Distribution {UNIFORM, EXPONENTIAL}
+
     @BeforeClass
     public static void freezeSeed() {
         RandomUtils.useTestSeed();
@@ -127,6 +130,125 @@ public void offsetUniform() {
         }
     }
 
+    @Test
+    public void writeUniformResultsWithCompression() {
+        try {
+            writeResultsWithCompression(Distribution.UNIFORM);
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+
+    @Test
+    public void writeExponentialResultsWithCompression() {
+        try {
+            writeResultsWithCompression(Distribution.EXPONENTIAL);
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+
+    private void writeResultsWithCompression(Distribution distribution) throws Exception {
+
+        List<ScaleFunction> scaleFcns = Arrays.asList(ScaleFunction.K_0, ScaleFunction.K_1,
+                ScaleFunction.K_2, ScaleFunction.K_3, ScaleFunction.K_1_GLUED,
+                ScaleFunction.K_2_GLUED, ScaleFunction.K_3_GLUED, ScaleFunction.K_QUADRATIC);
+
+        int trialSize = 1_000_000;
+
+        Map<ScaleFunction, List<Integer>> centroidCounts = new HashMap<>();
+
+        Map<ScaleFunction, List<List<Integer>>> centroidSequences = new HashMap<>();
+
+        for (ScaleFunction scaleFcn : scaleFcns) {
+            centroidCounts.put(scaleFcn, new ArrayList<Integer>());
+            centroidSequences.put(scaleFcn, new ArrayList<List<Integer>>());
+
+            try {
+                Map<Double, List<String>> records = new HashMap<>();
+                double[] quants = new double[]{0.00001, 0.0001, 0.001, 0.01, 0.1,
+                        0.5, 0.9, 0.99, 0.999, 0.9999, 0.99999};
+                for (double q : quants) {
+                    records.put(q, new ArrayList<String>());
+                }
+                for (int j = 0; j < 100; j++) {
+                    for (double compression : new double[]{100}) {
+                        TDigest digest = factory(compression).create();
+                        digest.setScaleFunction(scaleFcn);
+                        Random rand = new Random();
+                        AbstractContinousDistribution gen;
+                        if (distribution.equals(Distribution.UNIFORM)) {
+                            gen = new Uniform(50, 51, rand);
+                        } else if (distribution.equals(Distribution.EXPONENTIAL)) {
+                            gen = new Exponential(5, rand);
+                        } else {
+                            throw new Exception("distribution not specified");
+                        }
+                        double[] data = new double[trialSize];
+                        for (int i = 0; i < trialSize; i++) {
+                            data[i] = gen.nextDouble();
+                            digest.add(data[i]);
+                        }
+                        Arrays.sort(data);
+                        digest.compress();
+                        for (double q : quants) {
+                            double x1 = Dist.quantile(q, data);
+                            double q1 = Dist.cdf(x1, data);
+                            double q2 = digest.cdf(x1);
+                            records.get(q).add(String.valueOf(Math.abs(q1 - q2)) + ","
+                                    + String.valueOf(Math.abs(q1 - q2) / Math.min(q, 1 - q)) + "\n");
+                        }
+                        centroidCounts.get(scaleFcn).add(digest.centroids().size());
+
+                        List<Integer> seq = new ArrayList<>();
+                        for (Centroid c : digest.centroids()) {
+                            seq.add(c.count());
+                        }
+                        centroidSequences.get(scaleFcn).add(seq);
+                    }
+                }
+
+                String fcnName;
+
+                if (scaleFcn.toString().endsWith("GLUED") || scaleFcn.toString().endsWith("QUADRATIC")) {
+                    fcnName = scaleFcn.toString();
+                } else {
+                    fcnName = scaleFcn.toString() + "_USUAL";
+                }
+
+                for (double q : quants) {
+                    FileWriter csvWriter = new FileWriter("../docs/asymmetric/data/tree/" + distribution.name() + "/" + fcnName + "_" + String.valueOf(q) + ".csv");
+                    csvWriter.append("error_q,norm_error_q\n");
+                    for (String obs : records.get(q)) {
+                        csvWriter.append(obs);
+                    }
+                    csvWriter.flush();
+                    csvWriter.close();
+                }
+
+                FileWriter csvWriter = new FileWriter("../docs/asymmetric/data/tree/" + distribution.name() + "/" + fcnName + "_centroid_counts.csv");
+                csvWriter.append("centroid_count\n");
+                for (Integer ct : centroidCounts.get(scaleFcn)) {
+                    csvWriter.append(ct.toString()).append("\n");
+                }
+                csvWriter.flush();
+                csvWriter.close();
+
+                FileWriter csvWriter2 = new FileWriter("../docs/asymmetric/data/tree/" +
+                        distribution.name() + "/" + fcnName + "_centroid_sizes.csv");
+                for (List<Integer> ct : centroidSequences.get(scaleFcn)) {
+                    for (Integer c : ct) {
+                        csvWriter2.append(c.toString()).append(",");
+                    }
+                    csvWriter2.append("\n");
+                }
+                csvWriter2.flush();
+                csvWriter2.close();
+
+            } catch (IOException e) {
+                return;
+            }
+        }
+    }
+
     @Test
     public void bigJump() {
         TDigest digest = factory(100).create();
diff --git a/docs/asymmetric/Makefile b/docs/asymmetric/Makefile
new file mode 100644
index 00000000..7ac3e408
--- /dev/null
+++ b/docs/asymmetric/Makefile
@@ -0,0 +1,13 @@
+PYTHON?=python3
+VENV?=venv
+REQUIREMENTS?=requirements.txt
+
+.PHONY: install reinstall
+
+install:
+	$(PYTHON) -m virtualenv $(VENV)
+	. $(VENV)/bin/activate; $(PYTHON) -m pip install -r $(REQUIREMENTS); $(PYTHON) generate_plots.py
+
+reinstall:
+	rm -rf $(VENV)
+	$(MAKE) install
diff --git a/docs/asymmetric/README.md b/docs/asymmetric/README.md
new file mode 100644
index 00000000..14a389cd
--- /dev/null
+++ b/docs/asymmetric/README.md
@@ -0,0 +1,36 @@
+This directory contains some empirical results of constructing t-digests with asymmetric scale functions,
+which may be of particular interest for distributions with substantial skew. Suitable asymmetric scale functions
+can be obtained by gluing the familiar k1, k2, k3 to their tangent lines at some point p in (0, 1).
+A theoretical justification for this construction (namely, the proofs that the modified scale functions produce
+mergeable data structures that can operate online), together with these results, is contained in the preprint
+"Asymmetric scale functions for t-digests." The preprint is available in this repository [here](../proofs/gluing-construction.pdf)
+and at https://arxiv.org/abs/2005.09599, and has been submitted for publication.
+
+The main goal here is to compare the usual scale functions with their asymmetric counterparts (using the point p=1/2 as the gluing location),
+both in terms of accuracy and memory consumption (via the number of centroids). A quadratic scale function is considered as well.
+
+The gist of the results is that for each ki mentioned above, the asymmetric (glued) variant has the
+error profile of ki for quantiles greater than 1/2 (i.e., above the median) and that of k0
+(a linear function producing equal-sized bins) for quantiles on the other side.
+
+The data and the summarizing plots can be produced in two steps.
+
+### Generate data
+
+In [TDigestTest](../../core/src/test/java/com/tdunning/math/stats/TDigestTest.java), run `writeUniformResultsWithCompression` with the `AVLTreeDigest` implementation, i.e., run
+`AVLTreeDigestTest.writeUniformResultsWithCompression`. Do the same for `writeExponentialResultsWithCompression`.
+
+In [MergingDigestTest](../../core/src/test/java/com/tdunning/math/stats/MergingDigestTest.java), run
+`writeUniformAsymmetricScaleFunctionResults` and `writeExponentialAsymmetricScaleFunctionResults`.
+
+These tests will write the data files.
+
+### Generate plots
+
+Now run the script [generate_plots.py](./generate_plots.py). For convenience, you can run `make install` (from this directory), which installs the requirements and runs the script.
+
+The script expects the results of running the tests above to be present.
+It will write the plots as PNG files.
+The figures so generated are already present in this repository; see [here](../asymmetric/plots/merging/BOTH/t_digest_figs_K_0q.png)
+and [here](../asymmetric/plots/tree/BOTH/t_digest_figs_K_0q.png) for examples.
+
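Outside the test harness, selecting an asymmetric scale function is a one-line change on a digest. A minimal usage sketch — my example, not part of the patch, assuming the patch is applied; the rate-5 exponential input mirrors the experiments:

```java
// Hypothetical usage sketch (not part of this patch).
import com.tdunning.math.stats.MergingDigest;
import com.tdunning.math.stats.ScaleFunction;
import java.util.Random;

public class AsymmetricExample {
    public static void main(String[] args) {
        MergingDigest digest = new MergingDigest(100);    // compression = 100, as in the experiments
        digest.setScaleFunction(ScaleFunction.K_2_GLUED); // asymmetric variant of K_2
        Random rand = new Random();
        for (int i = 0; i < 1_000_000; i++) {
            // exponential samples with rate 5 via inverse-cdf sampling
            digest.add(-Math.log(1 - rand.nextDouble()) / 5);
        }
        // upper tail keeps the K_2 error profile; lower tail gets uniform, K_0-like bins
        System.out.println(digest.quantile(0.99));
        System.out.println(digest.quantile(0.01));
    }
}
```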
diff --git a/docs/asymmetric/generate_plots.py b/docs/asymmetric/generate_plots.py
new file mode 100644
index 00000000..b2058fc2
--- /dev/null
+++ b/docs/asymmetric/generate_plots.py
@@ -0,0 +1,318 @@
+import os
+
+import pandas as pd
+import matplotlib.pyplot as plt
+
+in_prefix = "data"
+out_prefix = "plots"
+
+implementations = ["tree", "merging"]
+distributions = ["UNIFORM", "EXPONENTIAL"]
+
+scale_function_prefixes = ["K_{0}_{1}".format(x, y) for x in ["1", "2", "3"] for y in
+                           ["USUAL", "GLUED"]] + ["K_0_USUAL"] + ["K_QUADRATIC"]
+
+
+def clean_string(s):
+    return s.replace("K", "k").replace("_USUAL", "").replace("GLUED", "glued"). \
+        replace("QUADRATIC", "quadratic")
+
+
+cc_suffix = "_centroid_counts.csv"
+cs_suffix = "_centroid_sizes.csv"
+
+axis_labels = {'.99': 2, '0.99': 2,
+               '1.0E-5': -5, '0.00001': -5,
+               '1.0E-4': -4, '0.0001': -4,
+               '.99999': 5, '0.99999': 5,
+               '.001': -3, '0.001': -3,
+               '.9': 1, '0.9': 1,
+               '.999': 3, '0.999': 3,
+               '.9999': 4, '0.9999': 4,
+               '.1': -1, '0.1': -1,
+               '.01': -2, '0.01': -2,
+               '.5': 0, '0.5': 0}
+
+
+def generate_figures(prefixes=scale_function_prefixes, save=False, outfilename="",
+                     location="", implementation="", dpi=None):
+    data = {}
+
+    for prefix in prefixes:
+        data[prefix] = {}
+        filenames = filter(
+            lambda x: x.startswith(prefix) and not x.endswith(cc_suffix) and not x.endswith(
+                cs_suffix),
+            os.listdir(location))
+        for filename in filenames:
+            value = filename.replace(prefix + "_", "").replace(".csv", "")
+            with open(location + filename, 'r') as f:
+                data[prefix][value] = pd.read_csv(f)
+
+    centroid_count_data = {}
+    centroid_counts = map(lambda x: x + cc_suffix, prefixes)
+    for cc_name in centroid_counts:
+        with open(location + cc_name, 'r') as f:
+            centroid_count_data[cc_name.replace(cc_suffix, "")] = pd.read_csv(f)
+
+    fig, ax = plt.subplots(len(prefixes), 3, squeeze=False)
+    fig.set_figheight(4 * len(prefixes))
+    fig.set_figwidth(15)
+
+    for prefix in prefixes:
+        error_q_list, norm_error_q_list = [], []
+        pos = []
+        for v in data[prefix]:
+            pos.append(axis_labels[v])
+            df = data[prefix][v]
+            error_q_list.append(df['error_q'])
+            norm_error_q_list.append(df['norm_error_q'])
+        ax[prefixes.index(prefix), 0].set_title(clean_string(prefix) + implementation + " error")
+        ax[prefixes.index(prefix), 0].boxplot(error_q_list, positions=pos, whis=[5, 95],
+                                              showfliers=False)
+        ax[prefixes.index(prefix), 0].set_yscale('log')
+        ax[prefixes.index(prefix), 1].set_title(
+            clean_string(prefix) + implementation + " norm_error")
+        ax[prefixes.index(prefix), 1].boxplot(norm_error_q_list, positions=pos, whis=[5, 95],
+                                              showfliers=False)
+        ax[prefixes.index(prefix), 1].set_yscale('log')
+        ax[prefixes.index(prefix), 2].set_title(
+            clean_string(prefix) + implementation + " " + cc_suffix.replace(".csv", "").lstrip("_"))
+        ax[prefixes.index(prefix), 2].hist(centroid_count_data[prefix]["centroid_count"],
+                                           range=[5, 95],
+                                           bins=30)
+
+    fig.subplots_adjust(left=0.08, right=0.98, bottom=0.05, top=0.9,
+                        hspace=0.4, wspace=0.3)
+
+    if save is True:
+        plt.savefig(outfilename, dpi=dpi)
+    elif save is False:
+        plt.show()
+
+
+def generate_size_figures(prefix="K_0_USUAL", save=False, outfilename="", value='0.01',
+                          location="", centroid_index=0, dpi=None):
+    data = {}
+    centroid_sizes_data = {}
+
+    for impl in implementations:
+        data[impl] = {}
+        centroid_sizes_data[impl] = {}
+        for dist in distributions:
+            data[impl][dist] = {}
+            centroid_sizes_data[impl][dist] = {}
+            filename = "{0}_{1}.csv".format(prefix, value)
+            with open("{0}/{1}/{2}".format(location, impl, dist) + "/" + filename, 'r') as f:
impl, dist) + "/" + filename, 'r') as f: + data[impl][dist][value] = pd.read_csv(f) + with open("{0}/{1}/{2}".format(location, impl, dist) + "/" + prefix + cs_suffix, + 'r') as f: + _d = f.readlines() + centroid_sizes_data[impl][dist][prefix] = [ + [int(x) for x in y.rstrip(',\n').split(',')] for y in _d] + + fig, ax = plt.subplots(len(implementations), len(distributions), squeeze=False) + fig.set_figheight(15) + fig.set_figwidth(15) + + for impl in implementations: + for dist in distributions: + error_q_list, norm_error_q_list = [], [] + pos = [] + for v in data[impl][dist]: + pos.append(axis_labels[v]) + df = data[impl][dist][v] + error_q_list.append(df['error_q']) + norm_error_q_list.append(df['norm_error_q']) + title = "{0}, {1}, {2}, q={3}, index {4}".format(clean_string(prefix), impl, + dist.lower(), value, + str(centroid_index)) + ax[implementations.index(impl), distributions.index(dist)].set_title(title) + _a, b = centroid_sizes_data[impl][dist][prefix], df['norm_error_q'] + a = [i[centroid_index] for i in _a] + ax[implementations.index(impl), distributions.index(dist)].scatter(a, b) + + fig.subplots_adjust(left=0.08, right=0.98, bottom=0.05, top=0.9, + hspace=0.4, wspace=0.3) + + if save is True: + plt.savefig(outfilename, dpi=dpi) + elif save is False: + plt.show() + plt.close('all') + + +def generate_figures_both_distr(prefixes=scale_function_prefixes, save=False, outfilename="", + locations=[""], implementation="", dpi=None): + data = {} + + for prefix in prefixes: + data[prefix] = {} + for location in locations: + data[prefix][location] = {} + filenames = filter( + lambda x: x.startswith(prefix) and not x.endswith(cc_suffix) and not x.endswith( + cs_suffix), + os.listdir(location)) + for filename in filenames: + value = filename.replace(prefix + "_", "").replace(".csv", "") + with open(location + filename, 'r') as f: + data[prefix][location][value] = pd.read_csv(f) + + centroid_count_data = {} + centroid_counts = map(lambda x: x + cc_suffix, prefixes) + for cc_name in centroid_counts: + centroid_count_data[cc_name.replace(cc_suffix, "")] = {} + for location in locations: + # centroid_count_data[cc_name.replace(cc_suffix, "")][location] = {} + with open(location + cc_name, 'r') as f: + centroid_count_data[cc_name.replace(cc_suffix, "")][location] = pd.read_csv(f) + + fig, ax = plt.subplots(len(prefixes), 3, squeeze=False) + fig.set_figheight(4 * len(prefixes)) + fig.set_figwidth(15) + + for prefix in prefixes: + error_q_list, norm_error_q_list = {}, {} + pos = {} + for location in locations: + error_q_list[location] = [] + norm_error_q_list[location] = [] + pos[location] = [] + for v in data[prefix][location]: + pos[location].append(axis_labels[v]) + df = data[prefix][location][v] + error_q_list[location].append(df['error_q']) + norm_error_q_list[location].append(df['norm_error_q']) + + location_0, location_1 = locations + + ax[prefixes.index(prefix), 0].set_title(clean_string(prefix) + implementation + " error") + ax[prefixes.index(prefix), 0].boxplot(error_q_list[location_0], + positions=[x - .15 for x in pos[location_0]], + whis=[5, 95], + showfliers=False, widths=0.2, + medianprops=dict(linestyle='-', linewidth=1.5, + label=location_0.split('/')[ + -2].lower(), color='orange')) + ax[prefixes.index(prefix), 0].boxplot(error_q_list[location_1], + positions=[x + .15 for x in pos[location_1]], + whis=[5, 95], + showfliers=False, widths=0.2, + medianprops=dict(linestyle='-', linewidth=4.5, + label=location_1.split('/')[ + -2].lower(), color='blue')) + + handles, labels = 
+        ax[prefixes.index(prefix), 0].legend([handles[0], handles[-1]], [labels[0], labels[-1]])
+        ax[prefixes.index(prefix), 0].set_xticks(range(-5, 6))
+        ax[prefixes.index(prefix), 0].set_xticklabels(range(-5, 6))
+        ax[prefixes.index(prefix), 0].set_yscale('log')
+
+        ax[prefixes.index(prefix), 1].set_title(
+            clean_string(prefix) + implementation + " norm_error")
+        ax[prefixes.index(prefix), 1].boxplot(norm_error_q_list[location_0],
+                                              positions=[x - .15 for x in pos[location_0]],
+                                              whis=[5, 95],
+                                              showfliers=False, widths=0.2,
+                                              medianprops=dict(linewidth=1.5,
+                                                               label=location_0.split('/')[
+                                                                   -2].lower(), color='orange'))
+        ax[prefixes.index(prefix), 1].boxplot(norm_error_q_list[location_1],
+                                              positions=[x + .15 for x in pos[location_1]],
+                                              whis=[5, 95],
+                                              showfliers=False, widths=0.2,
+                                              medianprops=dict(linewidth=4.5,
+                                                               label=location_1.split('/')[
+                                                                   -2].lower(), color='blue'))
+
+        handles, labels = ax[prefixes.index(prefix), 1].get_legend_handles_labels()
+        ax[prefixes.index(prefix), 1].legend([handles[0], handles[-1]], [labels[0], labels[-1]])
+        ax[prefixes.index(prefix), 1].set_xticks(range(-5, 6))
+        ax[prefixes.index(prefix), 1].set_xticklabels(range(-5, 6))
+        ax[prefixes.index(prefix), 1].set_yscale('log')
+
+        ax[prefixes.index(prefix), 2].set_title(
+            clean_string(prefix) + implementation + " " + cc_suffix.replace(".csv", "").lstrip("_"))
+        ax[prefixes.index(prefix), 2].hist(
+            centroid_count_data[prefix][location_0]["centroid_count"], range=[20, 100],
+            bins=40, color='orange', alpha=0.5, label=location_0.split('/')[-2].lower())
+        ax[prefixes.index(prefix), 2].hist(
+            centroid_count_data[prefix][location_1]["centroid_count"], range=[20, 100],
+            bins=40, color='blue', alpha=0.5, label=location_1.split('/')[-2].lower())
+        ax[prefixes.index(prefix), 2].legend()
+
+    fig.subplots_adjust(left=0.08, right=0.98, bottom=0.05, top=0.9,
+                        hspace=0.4, wspace=0.3)
+    if save is True:
+        plt.savefig(outfilename, dpi=dpi)
+    elif save is False:
+        plt.show()
+    plt.close('all')
+
+
+# for separate plots for the two distributions
+_params = [
+    ("{0}/{1}/{2}/".format(out_prefix, impl, dist), "{0}/{1}/{2}/".format(in_prefix, impl, dist),
+     " ({0}, {1})".format(impl, dist.lower())) for impl in implementations for dist in
+    distributions]
+
+params = [("{0}/{1}/BOTH/".format(out_prefix, impl),
+           ["{0}/{1}/{2}/".format(in_prefix, impl, dist) for dist in distributions],
+           " ({0})".format(impl)) for impl in ['tree', 'merging']]
+
+
+def main():
+    for a, b, c in params:
+        generate_figures_both_distr(prefixes=["K_0_USUAL", "K_QUADRATIC"], save=True,
+                                    outfilename="{}t_digest_figs_K_0q".format(a), locations=b,
+                                    implementation=c, dpi=350)
+        generate_figures_both_distr(prefixes=["K_1_{}".format(y) for y in ["USUAL", "GLUED"]],
+                                    save=True,
+                                    outfilename="{}t_digest_figs_K_1".format(a), locations=b,
+                                    implementation=c, dpi=350)
+        generate_figures_both_distr(prefixes=["K_2_{}".format(y) for y in ["USUAL", "GLUED"]],
+                                    save=True,
+                                    outfilename="{}t_digest_figs_K_2".format(a), locations=b,
+                                    implementation=c, dpi=350)
+        generate_figures_both_distr(prefixes=["K_3_{}".format(y) for y in ["USUAL", "GLUED"]],
+                                    save=True,
+                                    outfilename="{}t_digest_figs_K_3".format(a), locations=b,
+                                    implementation=c, dpi=350)
+
+    for v in ['0.99', '0.999']:
+        fcn = 'K_0_USUAL'
+        centroid_index = -1
+        outfile = out_prefix + '/' + 'size/' + fcn + '_' + v + '_' + str(centroid_index) + '.png'
+        generate_size_figures(location=in_prefix + '/', prefix=fcn, value=v,
+                              centroid_index=centroid_index,
+                              outfilename=outfile, save=True, dpi=350)
+
+    _v = '0.01'
+    _fcn = 'K_0_USUAL'
+    centroid_index = 0
+    outfile = out_prefix + '/' + 'size/' + _fcn + '_' + _v + '_' + str(centroid_index) + '.png'
+    generate_size_figures(location=in_prefix + '/', prefix=_fcn, value=_v,
+                          centroid_index=centroid_index,
+                          outfilename=outfile, save=True, dpi=350)
+
+    # these plots are no longer used in the paper
+    for a, b, c in _params:
+        generate_figures(prefixes=["K_0_USUAL", "K_QUADRATIC"], save=True,
+                         outfilename="{}t_digest_figs_K_0q".format(a), location=b,
+                         implementation=c)
+        generate_figures(prefixes=["K_1_{}".format(y) for y in ["USUAL", "GLUED"]], save=True,
+                         outfilename="{}t_digest_figs_K_1".format(a), location=b,
+                         implementation=c)
+        generate_figures(prefixes=["K_2_{}".format(y) for y in ["USUAL", "GLUED"]], save=True,
+                         outfilename="{}t_digest_figs_K_2".format(a), location=b,
+                         implementation=c)
+        generate_figures(prefixes=["K_3_{}".format(y) for y in ["USUAL", "GLUED"]], save=True,
+                         outfilename="{}t_digest_figs_K_3".format(a), location=b,
+                         implementation=c)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/asymmetric/plots/merging/BOTH/t_digest_figs_K_0q.png b/docs/asymmetric/plots/merging/BOTH/t_digest_figs_K_0q.png
new file mode 100644
index 00000000..8bea6268
Binary files /dev/null and b/docs/asymmetric/plots/merging/BOTH/t_digest_figs_K_0q.png differ
diff --git a/docs/asymmetric/plots/merging/BOTH/t_digest_figs_K_1.png b/docs/asymmetric/plots/merging/BOTH/t_digest_figs_K_1.png
new file mode 100644
index 00000000..4ffac575
Binary files /dev/null and b/docs/asymmetric/plots/merging/BOTH/t_digest_figs_K_1.png differ
diff --git a/docs/asymmetric/plots/merging/BOTH/t_digest_figs_K_2.png b/docs/asymmetric/plots/merging/BOTH/t_digest_figs_K_2.png
new file mode 100644
index 00000000..8c6fb038
Binary files /dev/null and b/docs/asymmetric/plots/merging/BOTH/t_digest_figs_K_2.png differ
diff --git a/docs/asymmetric/plots/merging/BOTH/t_digest_figs_K_3.png b/docs/asymmetric/plots/merging/BOTH/t_digest_figs_K_3.png
new file mode 100644
index 00000000..a347dc8f
Binary files /dev/null and b/docs/asymmetric/plots/merging/BOTH/t_digest_figs_K_3.png differ
diff --git a/docs/asymmetric/plots/merging/EXPONENTIAL/t_digest_figs_K_0q.png b/docs/asymmetric/plots/merging/EXPONENTIAL/t_digest_figs_K_0q.png
new file mode 100644
index 00000000..20729a61
Binary files /dev/null and b/docs/asymmetric/plots/merging/EXPONENTIAL/t_digest_figs_K_0q.png differ
diff --git a/docs/asymmetric/plots/merging/EXPONENTIAL/t_digest_figs_K_1.png b/docs/asymmetric/plots/merging/EXPONENTIAL/t_digest_figs_K_1.png
new file mode 100644
index 00000000..9a6f28a3
Binary files /dev/null and b/docs/asymmetric/plots/merging/EXPONENTIAL/t_digest_figs_K_1.png differ
diff --git a/docs/asymmetric/plots/merging/EXPONENTIAL/t_digest_figs_K_2.png b/docs/asymmetric/plots/merging/EXPONENTIAL/t_digest_figs_K_2.png
new file mode 100644
index 00000000..98e49e41
Binary files /dev/null and b/docs/asymmetric/plots/merging/EXPONENTIAL/t_digest_figs_K_2.png differ
diff --git a/docs/asymmetric/plots/merging/EXPONENTIAL/t_digest_figs_K_3.png b/docs/asymmetric/plots/merging/EXPONENTIAL/t_digest_figs_K_3.png
new file mode 100644
index 00000000..8bd8455e
Binary files /dev/null and b/docs/asymmetric/plots/merging/EXPONENTIAL/t_digest_figs_K_3.png differ
diff --git a/docs/asymmetric/plots/merging/UNIFORM/t_digest_figs_K_0q.png b/docs/asymmetric/plots/merging/UNIFORM/t_digest_figs_K_0q.png
new file mode 100644
index 00000000..6badc8b1
Binary files /dev/null and b/docs/asymmetric/plots/merging/UNIFORM/t_digest_figs_K_0q.png differ
diff --git a/docs/asymmetric/plots/merging/UNIFORM/t_digest_figs_K_1.png b/docs/asymmetric/plots/merging/UNIFORM/t_digest_figs_K_1.png
new file mode 100644
index 00000000..e6e5b432
Binary files /dev/null and b/docs/asymmetric/plots/merging/UNIFORM/t_digest_figs_K_1.png differ
diff --git a/docs/asymmetric/plots/merging/UNIFORM/t_digest_figs_K_2.png b/docs/asymmetric/plots/merging/UNIFORM/t_digest_figs_K_2.png
new file mode 100644
index 00000000..bf96f2e5
Binary files /dev/null and b/docs/asymmetric/plots/merging/UNIFORM/t_digest_figs_K_2.png differ
diff --git a/docs/asymmetric/plots/merging/UNIFORM/t_digest_figs_K_3.png b/docs/asymmetric/plots/merging/UNIFORM/t_digest_figs_K_3.png
new file mode 100644
index 00000000..5ca2063d
Binary files /dev/null and b/docs/asymmetric/plots/merging/UNIFORM/t_digest_figs_K_3.png differ
diff --git a/docs/asymmetric/plots/size/K_0_USUAL_0.01_0.png b/docs/asymmetric/plots/size/K_0_USUAL_0.01_0.png
new file mode 100644
index 00000000..7bf530de
Binary files /dev/null and b/docs/asymmetric/plots/size/K_0_USUAL_0.01_0.png differ
diff --git a/docs/asymmetric/plots/size/K_0_USUAL_0.999_-1.png b/docs/asymmetric/plots/size/K_0_USUAL_0.999_-1.png
new file mode 100644
index 00000000..69dfc5e6
Binary files /dev/null and b/docs/asymmetric/plots/size/K_0_USUAL_0.999_-1.png differ
diff --git a/docs/asymmetric/plots/size/K_0_USUAL_0.99_-1.png b/docs/asymmetric/plots/size/K_0_USUAL_0.99_-1.png
new file mode 100644
index 00000000..ef389293
Binary files /dev/null and b/docs/asymmetric/plots/size/K_0_USUAL_0.99_-1.png differ
diff --git a/docs/asymmetric/plots/tree/BOTH/t_digest_figs_K_0q.png b/docs/asymmetric/plots/tree/BOTH/t_digest_figs_K_0q.png
new file mode 100644
index 00000000..db41fd40
Binary files /dev/null and b/docs/asymmetric/plots/tree/BOTH/t_digest_figs_K_0q.png differ
diff --git a/docs/asymmetric/plots/tree/BOTH/t_digest_figs_K_1.png b/docs/asymmetric/plots/tree/BOTH/t_digest_figs_K_1.png
new file mode 100644
index 00000000..811c5d38
Binary files /dev/null and b/docs/asymmetric/plots/tree/BOTH/t_digest_figs_K_1.png differ
diff --git a/docs/asymmetric/plots/tree/BOTH/t_digest_figs_K_2.png b/docs/asymmetric/plots/tree/BOTH/t_digest_figs_K_2.png
new file mode 100644
index 00000000..6a8d18b6
Binary files /dev/null and b/docs/asymmetric/plots/tree/BOTH/t_digest_figs_K_2.png differ
diff --git a/docs/asymmetric/plots/tree/BOTH/t_digest_figs_K_3.png b/docs/asymmetric/plots/tree/BOTH/t_digest_figs_K_3.png
new file mode 100644
index 00000000..f5a90270
Binary files /dev/null and b/docs/asymmetric/plots/tree/BOTH/t_digest_figs_K_3.png differ
diff --git a/docs/asymmetric/plots/tree/EXPONENTIAL/t_digest_figs_K_0q.png b/docs/asymmetric/plots/tree/EXPONENTIAL/t_digest_figs_K_0q.png
new file mode 100644
index 00000000..3632eb0d
Binary files /dev/null and b/docs/asymmetric/plots/tree/EXPONENTIAL/t_digest_figs_K_0q.png differ
diff --git a/docs/asymmetric/plots/tree/EXPONENTIAL/t_digest_figs_K_1.png b/docs/asymmetric/plots/tree/EXPONENTIAL/t_digest_figs_K_1.png
new file mode 100644
index 00000000..a5ae2337
Binary files /dev/null and b/docs/asymmetric/plots/tree/EXPONENTIAL/t_digest_figs_K_1.png differ
diff --git a/docs/asymmetric/plots/tree/EXPONENTIAL/t_digest_figs_K_2.png b/docs/asymmetric/plots/tree/EXPONENTIAL/t_digest_figs_K_2.png
new file mode 100644
index 00000000..bbab2d29
Binary files /dev/null and b/docs/asymmetric/plots/tree/EXPONENTIAL/t_digest_figs_K_2.png differ
diff --git a/docs/asymmetric/plots/tree/EXPONENTIAL/t_digest_figs_K_3.png b/docs/asymmetric/plots/tree/EXPONENTIAL/t_digest_figs_K_3.png
new file mode 100644
index 00000000..acefa33b
Binary files /dev/null and b/docs/asymmetric/plots/tree/EXPONENTIAL/t_digest_figs_K_3.png differ
diff --git a/docs/asymmetric/plots/tree/UNIFORM/t_digest_figs_K_0q.png b/docs/asymmetric/plots/tree/UNIFORM/t_digest_figs_K_0q.png
new file mode 100644
index 00000000..c4acedc2
Binary files /dev/null and b/docs/asymmetric/plots/tree/UNIFORM/t_digest_figs_K_0q.png differ
diff --git a/docs/asymmetric/plots/tree/UNIFORM/t_digest_figs_K_1.png b/docs/asymmetric/plots/tree/UNIFORM/t_digest_figs_K_1.png
new file mode 100644
index 00000000..0a5b0bd5
Binary files /dev/null and b/docs/asymmetric/plots/tree/UNIFORM/t_digest_figs_K_1.png differ
diff --git a/docs/asymmetric/plots/tree/UNIFORM/t_digest_figs_K_2.png b/docs/asymmetric/plots/tree/UNIFORM/t_digest_figs_K_2.png
new file mode 100644
index 00000000..0b9e0b37
Binary files /dev/null and b/docs/asymmetric/plots/tree/UNIFORM/t_digest_figs_K_2.png differ
diff --git a/docs/asymmetric/plots/tree/UNIFORM/t_digest_figs_K_3.png b/docs/asymmetric/plots/tree/UNIFORM/t_digest_figs_K_3.png
new file mode 100644
index 00000000..e0f62cc5
Binary files /dev/null and b/docs/asymmetric/plots/tree/UNIFORM/t_digest_figs_K_3.png differ
diff --git a/docs/asymmetric/requirements.txt b/docs/asymmetric/requirements.txt
new file mode 100644
index 00000000..5d56fdde
--- /dev/null
+++ b/docs/asymmetric/requirements.txt
@@ -0,0 +1,2 @@
+pandas
+matplotlib
diff --git a/docs/proofs/gluing-construction.pdf b/docs/proofs/gluing-construction.pdf
new file mode 100644
index 00000000..6f0d7aa4
Binary files /dev/null and b/docs/proofs/gluing-construction.pdf differ