diff --git a/ciftools/binary/encoding/impl/binary_cif_encoder.py b/ciftools/binary/encoding/impl/binary_cif_encoder.py
index 8612878..8ee3857 100644
--- a/ciftools/binary/encoding/impl/binary_cif_encoder.py
+++ b/ciftools/binary/encoding/impl/binary_cif_encoder.py
@@ -11,15 +11,24 @@ def encode_cif_data(self, data: any) -> EncodedCIFData:
         encodings: list[EncodingBase] = []
 
         for encoder in self.encoders:
+            # get the EncodedCIFData TypedDict with 'data' and 'encoding'
             encoded = encoder.encode(data)
+            # get a ref to the 'encoding' of that TypedDict
             added_encodings = encoded["encoding"]
+            # if 'encoding' is None or empty, raise an error
             if not added_encodings or not len(added_encodings):
                 raise ValueError("Encodings must be non-empty.")
 
+            # get a ref to the 'data' of the TypedDict
             data = encoded["data"]
+
+            # add 'encoding' to the list of encodings
             encodings.extend(added_encodings)
 
+        # on each subsequent iteration, the data already encoded by the first encoder
+        # is encoded by the 2nd, then the 3rd, and each encoding is appended to the list
+
         if not isinstance(data, bytes):
             raise ValueError(
                 f"The encoding must result in bytes but it was {str(type(data))}. Fix your encoding chain."
diff --git a/ciftools/binary/encoding/impl/encoders/integer_packing.py b/ciftools/binary/encoding/impl/encoders/integer_packing.py
index 4bd0a0d..77ff500 100644
--- a/ciftools/binary/encoding/impl/encoders/integer_packing.py
+++ b/ciftools/binary/encoding/impl/encoders/integer_packing.py
@@ -1,4 +1,5 @@
 import math
+from numba import jit, int32, uint32, njit
 
 import numpy as np
 from ciftools.binary.encoding.base.cif_encoder_base import CIFEncoderBase
@@ -7,7 +8,6 @@ from ciftools.binary.encoding.types import EncodedCIFData
 from numpy import int8, int16, uint8, uint16
 
 
-
 class IntegerPackingCIFEncoder(CIFEncoderBase):
     def encode(self, data: np.ndarray) -> EncodedCIFData:
@@ -38,22 +38,61 @@ def encode(self, data: np.ndarray) -> EncodedCIFData:
 
         # TODO: figure out if there is a way to implement this
         # better & faster with numpy methods.
-        packed_index = 0
-        for _v in data:
-            value = _v
-            if value >= 0:
-                while value >= upper_limit:
-                    packed[packed_index] = upper_limit
-                    packed_index += 1
-                    value -= upper_limit
+        packed = _packing_loop(
+            data=data,
+            upper_limit=upper_limit,
+            lower_limit=lower_limit,
+            packed=packed
+        )
+
+        byte_array_result = BYTE_ARRAY_CIF_ENCODER.encode(packed)
+
+        integer_packing_encoding: IntegerPackingEncoding = {
+            "kind": EncodingEnun.IntegerPacking,
+            "isUnsigned": not packing.isSigned,
+            "srcSize": len(data),
+            "byteCount": packing.bytesPerElement,
+        }
+
+        return EncodedCIFData(
+            data=byte_array_result["data"], encoding=[integer_packing_encoding, byte_array_result["encoding"][0]]
+        )
+
+    def encode_optimized(self, data: np.ndarray) -> EncodedCIFData:
+
+        # TODO: must be a 32-bit integer
+
+        packing = _determine_packing_optimized(data)
+        if packing.bytesPerElement == 4:
+            return BYTE_ARRAY_CIF_ENCODER.encode(data)
+
+        # integer packing
+
+        if packing.isSigned:
+            if packing.bytesPerElement == 1:
+                upper_limit = 0x7F
+                packed = np.empty(packing.size, dtype=int8)
             else:
-                while value <= lower_limit:
-                    packed[packed_index] = lower_limit
-                    packed_index += 1
-                    value -= lower_limit
+                upper_limit = 0x7FFF
+                packed = np.empty(packing.size, dtype=int16)
+        else:
+            if packing.bytesPerElement == 1:
+                upper_limit = 0xFF
+                packed = np.empty(packing.size, dtype=uint8)
+            else:
+                upper_limit = 0xFFFF
+                packed = np.empty(packing.size, dtype=uint16)
 
-            packed[packed_index] = value
-            packed_index += 1
+        lower_limit = -upper_limit - 1
+
+        # TODO: figure out if there is a way to implement this
+        # better & faster with numpy methods.
+        packed = _packing_loop_optimized(
+            data=data,
+            upper_limit=upper_limit,
+            lower_limit=lower_limit,
+            packed=packed
+        )
 
         byte_array_result = BYTE_ARRAY_CIF_ENCODER.encode(packed)
 
@@ -74,6 +113,46 @@ class _PackingInfo:
     size: int
     bytesPerElement: int
 
+def _packing_loop(data: np.ndarray, upper_limit, lower_limit, packed: np.ndarray) -> np.ndarray:
+    packed_index = 0
+    for _v in data:
+        value = _v
+        if value >= 0:
+            while value >= upper_limit:
+                packed[packed_index] = upper_limit
+                packed_index += 1
+                value -= upper_limit
+        else:
+            while value <= lower_limit:
+                packed[packed_index] = lower_limit
+                packed_index += 1
+                value -= lower_limit
+
+        packed[packed_index] = value
+        packed_index += 1
+
+    return packed
+
+@jit(nopython=True)
+def _packing_loop_optimized(data: np.ndarray, upper_limit, lower_limit, packed: np.ndarray) -> np.ndarray:
+    packed_index = 0
+    for _v in data:
+        value = _v
+        if value >= 0:
+            while value >= upper_limit:
+                packed[packed_index] = upper_limit
+                packed_index += 1
+                value -= upper_limit
+        else:
+            while value <= lower_limit:
+                packed[packed_index] = lower_limit
+                packed_index += 1
+                value -= lower_limit
+
+        packed[packed_index] = value
+        packed_index += 1
+
+    return packed
 
 def _determine_packing(data: np.ndarray) -> _PackingInfo:
     # determine sign
@@ -102,6 +181,35 @@ def _determine_packing(data: np.ndarray) -> _PackingInfo:
 
     return packing
 
+def _determine_packing_optimized(data: np.ndarray) -> _PackingInfo:
+    # determine sign
+    is_signed = np.any(data < 0)
+
+    # determine packing size
+    size8 = _packing_size_optimized(data, 0x7F) if is_signed else _packing_size_optimized(data, 0xFF)
+    size16 = _packing_size_optimized(data, 0x7FFF) if is_signed else _packing_size_optimized(data, 0xFFFF)
+
+    # size8 = _packing_size_signed_optimized(data, 0x7F) if is_signed else _packing_size_unsigned_optimized(data, 0xFF)
+    # size16 = _packing_size_signed_optimized(data, 0x7FFF) if is_signed else _packing_size_unsigned_optimized(data, 0xFFFF)
+
+    packing = _PackingInfo()
+    packing.isSigned = is_signed
+
+    data_len = len(data)
+
+    if data_len * 4 < size16 * 2:
+        packing.size = data_len
+        packing.bytesPerElement = 4
+
+    elif size16 * 2 < size8:
+        packing.size = size16
+        packing.bytesPerElement = 2
+
+    else:
+        packing.size = size8
+        packing.bytesPerElement = 1
+
+    return packing
 
 def _packing_size(data: np.ndarray, upper_limit: int) -> int:
     lower_limit = -upper_limit - 1
@@ -121,5 +229,73 @@ def _packing_size(data: np.ndarray, upper_limit: int) -> int:
 
     return size
 
+@jit(nopython=True)
+def _packing_size_unsigned_optimized(data: np.ndarray, upper_limit: int) -> int:
+    # lower_limit = -upper_limit - 1
+    size = 0
+
+    for value in data:
+        size = size + math.floor(value / upper_limit) + 1
+
+    return size
+
+@jit(nopython=True)
+def _packing_size_signed_optimized(data: np.ndarray, upper_limit: int) -> int:
+    lower_limit = -upper_limit - 1
+    size = 0
+
+    for value in data:
+        if value >= 0:
+            size = size + math.floor(value / upper_limit) + 1
+        else:
+            size = size + math.floor(value / lower_limit) + 1
+
+    return size
+
+# works, but no difference in time
+# @njit([(int32[:], int32), (uint32[:], int32)])
+@njit
+def _packing_size_optimized(data: np.ndarray, upper_limit: int) -> int:
+    lower_limit = -upper_limit - 1
+    size = 0
+
+    # Fastest
+    for value in data:
+        if value >= 0:
+            size = size + math.floor(value / upper_limit) + 1
+        else:
+            size = size + math.floor(value / lower_limit) + 1
+
+    # Masks - about 3x slower
+    # positives = data[data >= 0]
+    # positive_size = np.sum(np.floor(positives / upper_limit)) + len(positives)
+
+    # negatives = data[data < 0]
+    # negative_size = np.sum(np.floor(negatives / lower_limit)) + len(negatives)
+
+    # size = int(positive_size + negative_size)
+
+    # about 3x slower with //
+    # for value in data:
+    #     if value >= 0:
+    #         size = size + value // upper_limit + 1
+    #     else:
+    #         size = size + value // lower_limit + 1
+
+
+    # Original
+    # for value in data:
+    #     if value == 0:
+    #         size = size + 1
+    #     elif value > 0:
+    #         size = size + math.ceil(value / upper_limit)
+    #         if value % upper_limit == 0:
+    #             size = size + 1
+    #     else:
+    #         size = size + math.ceil(value / lower_limit)
+    #         if value % lower_limit == 0:
+    #             size = size + 1
+
+    return size
 
 INTEGER_PACKING_CIF_ENCODER = IntegerPackingCIFEncoder()
diff --git a/requirements.txt b/requirements.txt
index aade9d2..35ced87 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,12 @@
-numpy >= 1.11.1
-msgpack >= 1.0.3
\ No newline at end of file
+attrs==22.1.0
+iniconfig==1.1.1
+msgpack==1.0.4
+numpy==1.23.3
+packaging==21.3
+pluggy==1.0.0
+py==1.11.0
+py-cpuinfo==8.0.0
+pyparsing==3.0.9
+pytest==7.1.3
+pytest-benchmark==3.4.1
+tomli==2.0.1
diff --git a/tests/integer_packing.py b/tests/integer_packing.py
index e3365b3..395214d 100644
--- a/tests/integer_packing.py
+++ b/tests/integer_packing.py
@@ -26,3 +26,14 @@ def test(self):
             self.assertTrue(np.array_equal(test_arr, decoded))
             self.assertEqual(is_unsigned, encoded["encoding"][0]["isUnsigned"])
             self.assertEqual(byte_count, encoded["encoding"][0]["byteCount"])
+
+        # also test the optimized version
+        for test_arr, is_unsigned, byte_count in test_suite:
+            encoder = INTEGER_PACKING_CIF_ENCODER
+            encoded = encoder.encode_optimized(test_arr)
+            decoded = decode_cif_data(encoded)
+            msgpack.loads(msgpack.dumps(encoded))
+
+            self.assertTrue(np.array_equal(test_arr, decoded))
+            self.assertEqual(is_unsigned, encoded["encoding"][0]["isUnsigned"])
+            self.assertEqual(byte_count, encoded["encoding"][0]["byteCount"])
diff --git a/tests/test_benchmarking_integer_packing.py b/tests/test_benchmarking_integer_packing.py
new file mode 100644
index 0000000..769c697
--- /dev/null
+++ b/tests/test_benchmarking_integer_packing.py
@@ -0,0 +1,65 @@
+from argparse import ArgumentError
+import pytest
+import msgpack
+import numpy as np
+from ciftools.binary.encoding.data_types import DataTypeEnum
+from ciftools.binary.decoder import decode_cif_data
+from ciftools.binary.encoding.impl.binary_cif_encoder import BinaryCIFEncoder
+from ciftools.binary.encoding.impl.encoders.byte_array import BYTE_ARRAY_CIF_ENCODER
+from numba import jit
+
+from ciftools.binary.encoding.impl.encoders.integer_packing import INTEGER_PACKING_CIF_ENCODER
+
+# TODO:
+# Next - next encoder (quantization?)
+
+# NOTE: Later:
+# 2. function that produces inputs for decoding (sizes?) (negatives will be there or not?)
+# 4. Test decoding - decode and decode optimized?
+INPUT_DTYPE = 'i4'
+
+INPUTS_FOR_ENCODING_NO_NEGATIVES = [
+    # 0.8, 8, 80 MB
+    np.random.randint(low=0, high=300, size=(2*10**5), dtype=INPUT_DTYPE),
+    np.random.randint(low=0, high=300, size=(2*10**6), dtype=INPUT_DTYPE),
+    np.random.randint(low=0, high=300, size=(2*10**7), dtype=INPUT_DTYPE)
+]
+
+INPUTS_FOR_ENCODING_WITH_NEGATIVES = [
+    np.random.randint(low=-50, high=50, size=(2*10**5), dtype=INPUT_DTYPE),
+    np.random.randint(low=-50, high=50, size=(2*10**6), dtype=INPUT_DTYPE),
+    # np.random.randint(low=-50, high=50, size=(2*10**7), dtype=INPUT_DTYPE)
+]
+
+def compute_inputs_for_decoding(inputs_for_encoding: list):
+    inputs_for_decoding = []
+    for input_arr in inputs_for_encoding:
+        encoder = BinaryCIFEncoder([BYTE_ARRAY_CIF_ENCODER])
+        encoded = encoder.encode_cif_data(input_arr)
+        inputs_for_decoding.append(encoded)
+
+    print(inputs_for_decoding)
+    return inputs_for_decoding
+
+# INPUTS_FOR_DECODING = compute_inputs_for_decoding()
+
+OPTIMIZED = [True]
+
+def int_packing_encoding(encoding_input, optimization):
+    encoder = INTEGER_PACKING_CIF_ENCODER
+    if not optimization:
+        encoded = encoder.encode(encoding_input)
+    else:
+        encoded = encoder.encode_optimized(encoding_input)
+
+
+@pytest.mark.parametrize("encoding_input", INPUTS_FOR_ENCODING_NO_NEGATIVES)
+@pytest.mark.parametrize("optimization", OPTIMIZED)
+def test_integer_packing_encoding_NO_negatives(benchmark, encoding_input, optimization):
+    result = benchmark(int_packing_encoding, encoding_input=encoding_input, optimization=optimization)
+
+# @pytest.mark.parametrize("encoding_input", INPUTS_FOR_ENCODING_WITH_NEGATIVES)
+# @pytest.mark.parametrize("optimization", OPTIMIZED)
+# def test_integer_packing_encoding_WITH_negatives(benchmark, encoding_input, optimization):
+#     result = benchmark(int_packing_encoding, encoding_input=encoding_input, optimization=optimization)
+
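
Note (illustration only, not part of the patch): the TODO in _packing_size_optimized asks whether the per-element size computation could be expressed with NumPy methods instead of a loop. The sketch below is one hypothetical vectorized formulation of the same arithmetic (each non-negative value contributes floor(value / upper_limit) + 1 packed elements, each negative value contributes floor(value / lower_limit) + 1); the function name is made up here and its speed relative to the numba-compiled loop has not been measured.

import numpy as np

def packing_size_vectorized(data: np.ndarray, upper_limit: int) -> int:
    # hypothetical sketch, equivalent to the floor-based loop in _packing_size_optimized
    lower_limit = -upper_limit - 1
    # pick the limit matching each value's sign, then floor-divide element-wise;
    # floor division matches math.floor(value / limit) for both signs here
    limits = np.where(data >= 0, upper_limit, lower_limit)
    return int(np.sum(data // limits) + data.size)

As a quick check, packing_size_vectorized(np.array([300, -5], dtype='i4'), 0x7F) gives (300 // 127 + 1) + (-5 // -128 + 1) = 3 + 1 = 4, matching the loop-based result.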