diff --git a/.gitignore b/.gitignore
index dbd6338..73ab46b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,10 @@ dist/
 .idea
 *.egg-info/
 *.safetensors
-outputs/
\ No newline at end of file
+outputs/
+outputs_past/
+packed_delta
+.cache
+delta_outputs/
+.io/
+outputs_exp/
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..d99f2f3
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,6 @@
+{
+    "[python]": {
+        "editor.defaultFormatter": "ms-python.black-formatter"
+    },
+    "python.formatting.provider": "none"
+}
\ No newline at end of file
diff --git a/autotune_gptq.py b/autotune_gptq.py
new file mode 100644
index 0000000..34e987d
--- /dev/null
+++ b/autotune_gptq.py
@@ -0,0 +1,175 @@
+import math
+import time
+import torch
+import transformers
+import torch.nn as nn
+from loguru import logger
+from quant import quantize
+
+DEBUG = False
+
+torch.backends.cuda.matmul.allow_tf32 = False
+torch.backends.cudnn.allow_tf32 = False
+
+def hard_threshold(x, fraction_of_zero=0.1):  # Zero the smallest-magnitude entries of x (roughly a `fraction_of_zero` share); returns x unchanged when the fraction is 0.
+    if fraction_of_zero == 0:
+        return x
+    y, _ = torch.sort(x.view(-1).abs())  # magnitudes in ascending order
+    num_params = torch.numel(x)
+    thresh_index = min(int(num_params * fraction_of_zero), num_params - 1)  # clamp so fraction_of_zero == 1 cannot index past the end
+    threshold = y[thresh_index]
+    mask = x.abs().gt(threshold).to(dtype=x.dtype)  # keep x's own dtype/device instead of forcing torch.cuda.HalfTensor (which broke CPU tensors)
+    return mask * x  # entries with |value| <= threshold become zero
+
+class GPTQ:  # Per-layer quantizer: accumulates an input second-moment matrix H, quantizes the weight column by column, and reports output error per sparsity level.
+    def __init__(self, layer):  # layer: module with a 2-D-reducible .weight (nn.Conv2d and transformers.Conv1D are special-cased below)
+        self.layer = layer
+        self.original_weight = layer.weight.data.clone()  # untouched copy of the weight taken at construction
+        self.dev = self.layer.weight.device
+        W = layer.weight.data.clone()
+        if isinstance(self.layer, nn.Conv2d):
+            W = W.flatten(1)
+        if isinstance(self.layer, transformers.Conv1D):
+            W = W.t()
+        self.rows = W.shape[0]
+        self.columns = W.shape[1]
+        self.H = torch.zeros((self.columns, self.columns), device=self.dev)  # (columns x columns) accumulator filled by add_batch
+        self.nsamples = 0
+
+    def add_batch(self, inp, out):  # Fold one batch of layer inputs into the running average self.H.
+        self.inp1 = inp  # only the most recent batch is remembered; fasterquant uses it to measure output error
+        self.out1 = out
+        if len(inp.shape) == 2:
+            inp = inp.unsqueeze(0)
+        tmp = inp.shape[0]  # size of the leading (batch) dimension
+        if isinstance(self.layer, nn.Linear):
+            if len(inp.shape) == 3:
+                inp = inp.reshape((-1, inp.shape[-1]))
+            inp = inp.t()
+        
+        self.H *= self.nsamples / (self.nsamples + tmp)  # down-weight the old sum so H stays an average over all samples seen
+        self.nsamples += tmp
+        # inp = inp.float()
+        inp = math.sqrt(2 / self.nsamples) * inp.float()
+        # self.H += 2 / self.nsamples * inp.matmul(inp.t())
+        self.H += inp.matmul(inp.t())
+
+    def fasterquant(  # Quantize the weight and return {sparsity: squared output error}; requires self.quantizer to be attached by the caller.
+        self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, write=True, sparsity=None
+    ):
+        W = self.layer.weight.data.clone()
+        if isinstance(self.layer, nn.Conv2d):
+            W = W.flatten(1)
+        if isinstance(self.layer, transformers.Conv1D):
+            W = W.t()
+        W = W.float()
+
+        tick = time.time()
+
+        if not self.quantizer.ready():
+            self.quantizer.find_params(W, weight=True)
+
+        H = self.H
+        if write:
+            del self.H  # with write=True the accumulator is consumed; write=False keeps it for another call
+        dead = torch.diag(H) == 0  # columns whose diagonal entry in H is exactly zero
+        H[dead, dead] = 1
+        W[:, dead] = 0
+
+        if actorder:
+            perm = torch.argsort(torch.diag(H), descending=True)  # process columns with the largest diag(H) first
+            W = W[:, perm]
+            H = H[perm][:, perm]
+
+        Losses = torch.zeros_like(W)
+        Q = torch.zeros_like(W)
+
+        damp = percdamp * torch.mean(torch.diag(H))  # diagonal damping added before the Cholesky factorisations
+        diag = torch.arange(self.columns, device=self.dev)
+        H[diag, diag] += damp
+        H = torch.linalg.cholesky(H)
+        H = torch.cholesky_inverse(H)
+        H = torch.linalg.cholesky(H, upper=True)  # upper Cholesky factor of the inverse of the damped H
+        Hinv = H
+
+        for i1 in range(0, self.columns, blocksize):  # walk the columns in blocks of `blocksize`
+            i2 = min(i1 + blocksize, self.columns)
+            count = i2 - i1
+
+            W1 = W[:, i1:i2].clone()
+            Q1 = torch.zeros_like(W1)
+            Err1 = torch.zeros_like(W1)
+            Losses1 = torch.zeros_like(W1)
+            Hinv1 = Hinv[i1:i2, i1:i2]
+
+            for i in range(count):
+                w = W1[:, i]
+                d = Hinv1[i, i]
+
+                if groupsize != -1:
+                    if (i1 + i) % groupsize == 0:  # refresh quantizer parameters at the start of every group of columns
+                        self.quantizer.find_params(W[:, (i1 + i):(i1 + i + groupsize)], weight=True)
+
+                q = quantize(
+                    w.unsqueeze(1), self.quantizer.scale, self.quantizer.zero, self.quantizer.maxq
+                ).flatten()
+                Q1[:, i] = q
+                Losses1[:, i] = (w - q) ** 2 / d ** 2
+
+                err1 = (w - q) / d
+                W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0))  # push this column's error onto the not-yet-quantized columns of the block
+                Err1[:, i] = err1
+
+            Q[:, i1:i2] = Q1
+            Losses[:, i1:i2] = Losses1 / 2
+
+            W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:])  # push the block's accumulated error onto all later columns
+
+            if DEBUG:
+                pass
+                #self.layer.weight.data[:, :i2] = Q[:, :i2]
+                #self.layer.weight.data[:, i2:] = W[:, i2:]
+                #print(torch.sum((self.layer(self.inp1) - self.out1) ** 2))
+                #print(torch.sum(Losses))
+
+        torch.cuda.synchronize()
+        total_time = time.time() - tick
+        # print('time %.2f' % total_time)
+        # error = torch.sum(Losses).item()
+        # print('error', error)
+
+        if actorder:
+            invperm = torch.argsort(perm)  # undo the column permutation applied above
+            Q = Q[:, invperm]
+
+        if isinstance(self.layer, transformers.Conv1D):
+            Q = Q.t()
+        # here report the loss of the quantized layer vs. the original layer
+        new_weight = Q.reshape(self.layer.weight.shape).to(self.layer.weight.dtype)
+        losses = {}
+        sparsed_new_weight = new_weight  # default so an empty `sparsity` iterable no longer leaves this name unbound at write-back
+        if sparsity is None:
+            losses[0] = torch.sum((self.inp1 @ (sparsed_new_weight.T) - self.out1) ** 2)  # NOTE(review): inp1 @ W.T ignores any bias -- confirm intended
+        else:
+            for s_sity in sparsity:
+                if write:
+                    logger.info(f"HT with: sparsity={s_sity}")
+                sparsed_new_weight = hard_threshold(new_weight, fraction_of_zero=s_sity)
+                losses[s_sity] = torch.sum((self.inp1 @ (sparsed_new_weight.T) - self.out1) ** 2)
+                if losses[s_sity] > 100:  # dump diagnostics for unusually large errors
+                    logger.info(f"{sparsed_new_weight}")
+                    logger.info(f"{new_weight}")
+                    logger.info(f"{sparsed_new_weight.shape}")
+                    logger.info(f"{torch.max(torch.abs(self.inp1 @ (sparsed_new_weight.T) - self.out1))}")
+        if write:
+            self.layer.weight.data = sparsed_new_weight  # with several sparsity levels, the weight from the last one is written
+        return losses
+
+    def free(self):  # Drop every tensor this object holds and release cached CUDA memory.
+        # add_batch stores inp1/out1 unconditionally, so release them unconditionally too (was: only when DEBUG).
+        self.inp1 = None
+        self.out1 = None
+        self.H = None
+        self.Losses = None
+        self.Trace = None
+        torch.cuda.empty_cache()
\ No newline at end of file
diff --git a/cli.py b/cli.py
index eb1850d..b0757b0 100644
--- a/cli.py
+++ b/cli.py
@@ -6,11 +6,11 @@
 from modelutils import get_opt
 from evaluation import opt_eval
 from datautils import get_loaders
-from save_and_load import save_lr_tensors, load_lr_tensors
 from core_compression import opt_delta_lr
+from save_and_load import save_lr_tensors, load_lr_tensors
 
 @torch.no_grad()
-def quantize_with_lowrank(base_model, target_model, dataloader, rank, wbits, n_samples):
+def quantize_with_lowrank(base_model, target_model, dataloader, rank, wbits, n_samples, decompose_only=False):
     # first do low rank approximation
     # then quantize
     original_finetuned_model = copy.deepcopy(target_model)
@@ -29,7 +29,8 @@ def quantize_with_lowrank(base_model, target_model, dataloader, rank, wbits, n_s
             'percdamp': 0.01,
             'groupsize': -1,
             'actorder': False,
-        }
+        },
+        decompose_only=decompose_only
     )
     
     target_model.to(base_model.device)
@@ -53,9 +54,11 @@ def quantize_with_lowrank(base_model, target_model, dataloader, rank, wbits, n_s
     argparser.add_argument('--save', type=str, default='', help='Path to save the quantized model')
     argparser.add_argument('--wbits', type=int, default=8, help='Number of bits to use for quantization')
     argparser.add_argument('--sym', action='store_true', default=True, help='Whether to use symmetric quantization')
+    argparser.add_argument('--decompose-only', action='store_true', default=False, help='Whether to use quantization')
     argparser.add_argument('--trits', action='store_true', default=False, help='Whether to use trits')
 
     args = argparser.parse_args()
+    print(args)
     seed = args.seed
     
     base_model = get_opt(args.base_model)
@@ -77,10 +80,10 @@ def quantize_with_lowrank(base_model, target_model, dataloader, rank, wbits, n_s
         trainloader,
         args.rank,
         args.wbits,
-        args.nsamples
+        args.nsamples,
+        args.decompose_only,
     )
     if args.save:
-        save_lr_tensors(lr_tensors, f"{args.save}/{args.target_model.replace('/', '.')}-r{args.rank}-w{args.wbits}-lr.safetensors")
-    
+        save_lr_tensors(lr_tensors, f"{args.save}/{args.target_model.replace('/', '.')}-r{args.rank}-w{args.wbits}-decompose.{args.decompose_only}-lr.safetensors")
     ppl = opt_eval(target_model, loader_enc, args, target_model.device)
     logger.info(f"Perplexity: {ppl}")
\ No newline at end of file
diff --git a/compress_utils.py b/compress_utils.py
new file mode 100644
index 0000000..143f017
--- /dev/null
+++ b/compress_utils.py
@@ -0,0 +1,351 @@
+import cupy
+import math
+import torch
+import numpy as np
+from torch.utils.dlpack import to_dlpack, from_dlpack
+from quant import Quantizer
+
+def cupy_to_tensor(x):  # Hand a CuPy array to torch through the DLPack exchange format.
+    return from_dlpack(x.toDlpack())
+
+def tensor_to_cupy(x):  # Hand a torch tensor to CuPy through the DLPack exchange format.
+    return cupy.fromDlpack(to_dlpack(x))
+
+def pack_uint8_tensor(x):  # Bit-pack x with packbits and return the result as a torch tensor.
+    if x.device != torch.device('cpu'):  # non-CPU tensors are packed with CuPy
+        return cupy_to_tensor(
+            cupy.packbits(tensor_to_cupy(x))
+        )
+    else:  # CPU tensors are packed with NumPy
+        return torch.from_numpy(
+            np.packbits(x.numpy())
+        )
+
+def unpack_uint8_tensor(x):  # Expand x into individual bits with unpackbits and return them as a torch tensor.
+    if x.device != torch.device('cpu'):  # non-CPU tensors are unpacked with CuPy
+        return cupy_to_tensor(
+            cupy.unpackbits(tensor_to_cupy(x))
+        )
+    else:  # CPU tensors are unpacked with NumPy
+        return torch.from_numpy(
+            np.unpackbits(x.numpy())
+        )
+
+def pack_low_bit_tensor(x, bits):  # Keep the last `bits` bits of each byte of x and pack them back-to-back into a flat byte tensor.
+    
+    if x.device != torch.device('cpu'):
+        assert x.dtype == torch.uint8  # NOTE(review): only the GPU branch asserts the dtype
+        y = cupy.packbits(
+            cupy.unpackbits(tensor_to_cupy(x)).reshape(*x.shape, 8)[..., -bits:]  # one row of 8 bits per element, then slice the trailing `bits`
+        )
+        y = cupy_to_tensor(y)
+    else:
+        y = np.packbits(
+            np.unpackbits(x.numpy()).reshape(*x.shape, 8)[..., -bits:]  # same steps as the GPU branch, using NumPy
+        )
+        y = torch.from_numpy(y)
+        
+    return y
+
+def unpack_low_bit_tensor(x, bits, original_shape):  # Inverse of pack_low_bit_tensor: rebuild one byte per element and view it as original_shape.
+    if x.device != torch.device('cpu'):
+        y = cupy.packbits(cupy.pad(
+            cupy.unpackbits(
+                tensor_to_cupy(x)
+            )[:np.prod(original_shape)*bits].reshape(-1, bits),  # drop padding bits at the end, one row of `bits` per element
+            ((0,0), (8-bits, 0))  # left-pad each row with zeros up to 8 bits
+        ))
+        y = cupy_to_tensor(y).view(original_shape)
+    else:
+        y = np.packbits(np.pad(
+            np.unpackbits(
+                x.numpy()
+            )[:np.prod(original_shape)*bits].reshape(-1, bits),  # same steps as the GPU branch, using NumPy
+            ((0,0), (8-bits, 0))
+        ))
+        y = torch.from_numpy(y).view(original_shape)
+    return y
+
+
+def pin_memory(array):  # Copy `array` into a buffer obtained from cupy.cuda.alloc_pinned_memory and return a NumPy view of it.
+    mem = cupy.cuda.alloc_pinned_memory(array.nbytes)
+    ret = np.frombuffer(mem, array.dtype, array.size).reshape(array.shape)  # NumPy view over the new buffer, same dtype and shape as the input
+    ret[...] = array  # copy the data into the new buffer
+    return ret
+
+
+def _rounding(x, stochastic=False, minimum_stochastic_distance=0.2):  # Round x to integers: nearest by default, or randomly up/down by fractional part when stochastic=True.
+    if stochastic:
+        x_floor = x.floor()
+        th = x - x_floor  # fractional part, used as the probability of rounding up
+        if minimum_stochastic_distance > 0:
+            th[th<minimum_stochastic_distance] = 0.  # close enough to the floor: always round down
+            th[th>1-minimum_stochastic_distance] = 1.  # close enough to the ceiling: always round up
+        pr = torch.rand_like(x)
+        x_floor += (pr < th)  # add 1 where the random draw falls below the threshold
+        return x_floor
+    else:
+        return x.round()
+
+
+def _compress_nbits(x, bits, scale_method='max', scale_dims=(0,1), 
+                    stochastic=False, minimum_stochastic_distance=0.2):  # Quantize x to unsigned `bits`-bit codes; returns (uint8 codes, fp16 scale).
+    
+    fbits = bits - 1  # one bit is spent on the sign, so magnitudes get bits-1 bits
+    
+    if scale_method == 'max':
+        # issue: sensitive to outlier points
+        scale = x.abs().amax(scale_dims, keepdims=True)
+    elif scale_method == 'l2':
+        # ~95% confidence interval for normal distribution
+        scale = x.pow(2).mean(scale_dims, keepdims=True).sqrt() * 2 
+    else:
+        raise Exception('unkonwn scale method.')
+    # fp16 should be enough
+    scale = scale.half()
+    x = x / (scale + 1e-6)  # the epsilon guards against division by a zero scale
+    
+    x = x.ldexp(torch.tensor(fbits))  # multiply by 2**fbits
+    clip_min = -(1<<fbits)
+    clip_max = (1<<fbits)-1
+
+    x = _rounding(x, stochastic=stochastic, minimum_stochastic_distance=minimum_stochastic_distance)
+    x = x.clip(clip_min, clip_max)
+    
+    x = x - clip_min  # shift into the non-negative range [0, 2**bits - 1]
+    x = x.type(torch.uint8)
+    
+    return x, scale
+
+
+def _decompress_nbits(x, scale, bits):  # Map unsigned `bits`-bit codes back to floats using `scale`.
+    
+    fbits = bits - 1
+    
+    clip_min = -(1<<fbits)
+    clip_max = (1<<fbits)-1
+    
+    x = x.float() + clip_min  # undo the shift into the non-negative range
+    
+    x = x / (clip_max+1) * scale  # divide by 2**fbits, then multiply by the scale
+    
+    return x
+
+
+def compress_8bit(x, scale_method='max', scale_dims=(0,1)):  # 8-bit quantization: one uint8 code per element, no extra packing.
+
+    x, scale = _compress_nbits(x, bits=8, scale_method=scale_method, scale_dims=scale_dims)
+    
+    return x, scale
+
+
+def decompress_8bit(x, scale):  # Inverse of compress_8bit.
+    
+    x = _decompress_nbits(x, scale, bits=8)
+    
+    return x
+
+def compress_4bit(x, scale_method='max', scale_dims=(0,1)):  # 4-bit quantization with two codes packed per byte; returns (packed uint8, scale).
+
+    x, scale = _compress_nbits(x, bits=4, scale_method=scale_method, scale_dims=scale_dims)
+    
+    x0, x1 = x.chunk(2, -1)  # split the last dimension into two parts
+    x = (x0 << 4) + x1  # first part in the high nibble, second in the low nibble
+    
+    return x, scale
+
+
+def decompress_4bit(x, scale):  # Inverse of compress_4bit: split each byte into two 4-bit codes and dequantize.
+    
+    bitmask = 15  # 0b1111, selects the low nibble
+    
+    x0 = (x >> 4)
+    x1 = (x & bitmask)
+    
+    x = torch.cat([x0, x1], -1)  # restore the layout that compress_4bit's chunk() split
+    
+    x = _decompress_nbits(x, scale, bits=4)
+    
+    return x
+
+
+def compress_2bit(x, scale_method='max', scale_dims=(0,1)):  # 2-bit quantization with four codes packed per byte; returns (packed uint8, scale).
+
+    x, scale = _compress_nbits(x, bits=2, scale_method=scale_method, scale_dims=scale_dims)
+    
+    x0, x1, x2, x3 = x.chunk(4, -1)  # split the last dimension into four parts
+    x = (x0 << 6) + (x1 << 4) + (x2 << 2) + x3  # first part in the top two bits, last part in the bottom two
+    
+    return x, scale
+
+
+def decompress_2bit(x, scale):  # Inverse of compress_2bit: split each byte into four 2-bit codes and dequantize.
+    
+    bitmask = 3  # 0b11, selects a 2-bit field
+    
+    x0 = (x >> 6)
+    x1 = (x >> 4) & bitmask
+    x2 = (x >> 2) & bitmask
+    x3 = x & bitmask
+    x = torch.cat([x0, x1, x2, x3], -1)  # restore the layout that compress_2bit's chunk() split
+    
+    x = _decompress_nbits(x, scale, bits=2)
+    
+    return x
+
+
+
+def compress_flexible_nbits(x, bits, scale_method='max', scale_dims=(0,1)):  # Quantize to `bits` bits, then bit-pack the codes; returns (packed bytes, scale).
+    # support any bits
+    # CUDA only  (NOTE(review): pack_low_bit_tensor also has a CPU/NumPy branch -- confirm whether this still holds)
+    
+    x, scale = _compress_nbits(x, bits=bits, scale_method=scale_method, scale_dims=scale_dims)
+    
+    x = pack_low_bit_tensor(x, bits)
+    
+    return x, scale
+
+
+def decompress_flexible_nbits(x, scale, bits, original_shape):  # Inverse of compress_flexible_nbits.
+    # support any bits, but need to know original_shape
+    # CUDA only  (NOTE(review): unpack_low_bit_tensor also has a CPU/NumPy branch -- confirm whether this still holds)
+    
+    x = unpack_low_bit_tensor(x, bits, original_shape)  # back to one uint8 code per element
+    
+    x = _decompress_nbits(x, scale, bits=bits)
+    
+    return x
+
+
+
+def compress_nbits(x, bits, scale_method='max', scale_dims=(0,1)):  # Dispatch to the 8/4/2-bit compressor; returns (packed uint8 tensor, scale). Raises ValueError for other widths.
+    if bits == 8:
+        return compress_8bit(x, scale_method=scale_method, scale_dims=scale_dims)
+    if bits == 4:
+        return compress_4bit(x, scale_method=scale_method, scale_dims=scale_dims)
+    if bits == 2:
+        return compress_2bit(x, scale_method=scale_method, scale_dims=scale_dims)
+    # An unsupported width used to fall through and die with UnboundLocalError on `scale`.
+    raise ValueError(f"compress_nbits supports bits in (8, 4, 2), got {bits!r}")
+
+
+def decompress_nbits(x, scale, bits):  # Dispatch to the 8/4/2-bit decompressor; returns the dequantized tensor. Raises ValueError for other widths.
+    if bits == 8:
+        return decompress_8bit(x, scale)
+    if bits == 4:
+        return decompress_4bit(x, scale)
+    if bits == 2:
+        return decompress_2bit(x, scale)
+    # An unsupported width used to fall through and die with UnboundLocalError on `y`.
+    raise ValueError(f"decompress_nbits supports bits in (8, 4, 2), got {bits!r}")
+
+def _compress_nbits_by_bucket(x, bits, scale_method='max', bucket_size=512,
+                              stochastic=False, minimum_stochastic_distance=0.2):  # Like _compress_nbits, but with one scale per column of x viewed as (bucket_size, -1).
+    
+    if bits == 1:  # 1-bit case: store only the sign, plus a per-column magnitude
+        
+        x = x.view(bucket_size, -1)
+        
+        scale = (x.norm(dim=0) / (bucket_size**0.5)).half()  # root-mean-square of each column
+        
+        x = (x >= 0)  # sign bit: 1 for non-negative, 0 for negative
+        
+        x = x.type(torch.uint8)
+        
+        return x, scale
+        
+    
+    fbits = bits - 1
+    
+    x = x.view(bucket_size, -1)  # view() needs numel to be a multiple of bucket_size
+    
+    if scale_method == 'max':
+        # issue: sensitive to outlier points
+        scale = x.abs().amax([0], keepdims=True)
+    elif scale_method == 'l2':
+        # ~95% confidence interval for normal distribution
+        scale = x.pow(2).mean([0], keepdims=True).sqrt() * 2 
+    else:
+        raise Exception('unkonwn scale method.')
+    # fp16 should be enough
+    scale = scale.half()
+    x = x / (scale + 1e-6)  # the epsilon guards against division by a zero scale
+    
+    x = x.ldexp(torch.tensor(fbits))  # multiply by 2**fbits
+    clip_min = -(1<<fbits)
+    clip_max = (1<<fbits)-1
+
+    x = _rounding(x, stochastic=stochastic, minimum_stochastic_distance=minimum_stochastic_distance)
+    x = x.clip(clip_min, clip_max)
+    
+    x = x - clip_min  # shift into the non-negative range [0, 2**bits - 1]
+    x = x.type(torch.uint8)
+    
+    return x, scale
+
+def compress_flexible_nbits_by_bucket(x, bits, scale_method='max', bucket_size=512,
+                                      stochastic=False, minimum_stochastic_distance=0.2):  # Bucketed quantization followed by bit-packing; returns (packed bytes, scale).
+    # support any bits
+    # CUDA only  (NOTE(review): pack_low_bit_tensor also has a CPU/NumPy branch -- confirm whether this still holds)
+    
+    if bucket_size > x.numel():
+        bucket_size = x.numel()  # small tensors become a single bucket
+    
+    x, scale = _compress_nbits_by_bucket(
+        x, bits=bits, scale_method=scale_method, bucket_size=bucket_size,
+        stochastic=stochastic, minimum_stochastic_distance=minimum_stochastic_distance)
+    
+    x = pack_low_bit_tensor(x, bits)
+    
+    return x, scale
+
+
+def decompress_flexible_nbits_by_bucket(x, scale, bits, original_shape, bucket_size=512):  # Inverse of compress_flexible_nbits_by_bucket; returns a tensor of original_shape.
+    # support any bits, but need to know original_shape
+    # CUDA only  (NOTE(review): unpack_low_bit_tensor also has a CPU/NumPy branch -- confirm whether this still holds)
+        
+    
+    numel = math.prod(original_shape)
+    if bucket_size > numel:
+        bucket_size = numel  # mirror the clamp applied at compression time
+        
+        
+    if bits == 1:  # 1-bit case: codes are sign bits
+
+        x = unpack_low_bit_tensor(x, bits, original_shape)
+        x = x.view(bucket_size, -1)
+        x = (x.half() - 0.5)*2  # map {0, 1} to {-1, +1}
+        x = x * scale.unsqueeze(0)  # one scale per column
+        x = x.view(original_shape)
+        
+        # print('done')
+        
+        return x
+    
+    x = unpack_low_bit_tensor(x, bits, original_shape)
+
+    x = x.view(bucket_size, -1)  # same (bucket_size, -1) layout used when the scales were computed
+    x = _decompress_nbits(x, scale, bits=bits)
+    x = x.view(original_shape)
+    
+    return x
+
+if __name__=="__main__":  # Manual round-trip demo: quantize a random matrix, pack to 4 bits, unpack, and compare.
+
+    x = torch.randn((512, 512), dtype=torch.float32, device='cuda')  # needs a CUDA device
+
+    print("original")
+    print(x)
+    quantizer = Quantizer()
+    quantizer.configure(
+        4, perchannel=True, sym=False, mse=False
+    )
+    quantizer.find_params(x, weight=True)
+    b_q = quantizer.quantize(x)
+
+    packed_x, scale = compress_flexible_nbits(b_q, 4)
+    unpacked_x = decompress_flexible_nbits(packed_x,scale=scale, bits=4, original_shape=x.shape)
+
+    print("unpacked")
+    print(unpacked_x)
+    print(f"are they equal? {torch.allclose(x, unpacked_x)}")  # compares against the un-quantized x, not b_q
\ No newline at end of file
diff --git a/core_compression.py b/core_compression.py
index 2b445f7..6e79ba4 100644
--- a/core_compression.py
+++ b/core_compression.py
@@ -1,9 +1,10 @@
 import torch
 import torch.nn as nn
-from loguru import logger
-from modelutils import find_layers
+from tqdm import tqdm
 from matq import TensorQ
+from loguru import logger
 from quant import Quantizer
+from modelutils import find_layers
 
 @torch.no_grad()
 def opt_delta_lr(
@@ -15,7 +16,8 @@ def opt_delta_lr(
         sym,
         trits,
         rank,
-        args
+        args,
+        decompose_only=False,
     ):
     device = model.device
     print("Starting LR quantizer initialization...")
@@ -48,6 +50,7 @@ def forward(self, inp, **kwargs):
             cache['i'] += 1
             cache['attention_mask'] = kwargs['attention_mask']
             raise ValueError
+    
     layers[0] = Catcher(layers[0])
     for batch in dataloader:
         try:
@@ -55,6 +58,7 @@ def forward(self, inp, **kwargs):
         except ValueError:
             pass
     layers[0] = layers[0].module
+    
     model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
     model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
     if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
@@ -62,7 +66,6 @@ def forward(self, inp, **kwargs):
     if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
         model.model.decoder.project_in = model.model.decoder.project_in.cpu()
     torch.cuda.empty_cache()
-
     outs = torch.zeros_like(inps)
     original_outs = torch.zeros_like(inps)
     attention_mask = cache['attention_mask']
@@ -71,14 +74,15 @@ def forward(self, inp, **kwargs):
     quantizers = {}
     l_quantizers = {}
     lr_tensors = {}
-    for i in range(len(delta_layers)):
+    # parallelize this to allocate to multiple GPUs?
+    for i in tqdm(range(len(delta_layers))):
         layer = delta_layers[i].to(device)
         original_layer = layers[i].to(device)
 
         subset = find_layers(layer)
         lr_gptq = {}
         for name in subset:
-            lr_gptq[name] = TensorQ(subset[name], rank)
+            lr_gptq[name] = TensorQ(subset[name], rank, sensitive_decompose=True)
             lr_gptq[name].quantizer = Quantizer()
             lr_gptq[name].quantizer.configure(
                 wbits,
@@ -108,16 +112,17 @@ def temp(_, inp, out):
             outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
 
             original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
-        
+
         for h in handles:
             h.remove()
-        
+
         for name in subset:
             logger.info(f"Quantizing {name}...")
             lr_gptq[name].lr_quant(
                 percdamp=args['percdamp'],
                 groupsize=args['groupsize'],
                 actorder=args['actorder'],
+                decompose_only=decompose_only,
             )
             lr_tensors[f'<R>.model.decoder.layers.{i}.{name}'] = lr_gptq[name].R
             lr_tensors[f'<L>.model.decoder.layers.{i}.{name}'] = lr_gptq[name].L
diff --git a/datautils.py b/datautils.py
index 045121a..71a8616 100644
--- a/datautils.py
+++ b/datautils.py
@@ -1,5 +1,9 @@
-import numpy as np
+import json
 import torch
+import random
+import numpy as np
+from datasets import Dataset
+from transformers import AutoTokenizer
 
 def set_seed(seed):
     np.random.seed(seed)
@@ -157,6 +161,57 @@ def __init__(self, input_ids):
 
     return trainloader, valenc
 
+def get_jsonl(train_path, val_path, n_samples, seed, seq_len, model_name, val_size=None, val_seq_len=256, padding=False):
+    """Build n_samples (input, target) calibration pairs and a validation token stream from JSONL files with a "text" field per line.
+    train_path: path to train jsonl file
+    val_path: path to validation jsonl file (only the first val_size rows are used when val_size is given)
+    """
+    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
+    with open(train_path, 'r') as f:
+        traindata = [json.loads(line) for line in f.readlines()]
+    with open(val_path, 'r') as f:
+        valdata = [json.loads(line) for line in f.readlines()]
+    traindata = {"text": [d['text'] for d in traindata]}
+    valdata = {"text": [d['text'] for d in valdata]}
+    traindata = Dataset.from_dict(traindata)
+    valdata = Dataset.from_dict(valdata)
+    set_seed(seed)  # NOTE(review): sampling below uses the stdlib `random` module -- confirm set_seed seeds it
+
+    trainloader = []
+    for _ in range(n_samples):
+        # for all datasets, we take the samples that are longer than seq_len
+        while True:  # NOTE(review): never terminates if no document reaches seq_len tokens
+            i = random.randint(0, len(traindata) - 1)
+            if padding:
+                trainenc = tokenizer(traindata[i]['text'], padding='max_length', truncation=True, max_length=seq_len, return_tensors='pt')
+            else:
+                trainenc = tokenizer(traindata[i]['text'], return_tensors='pt')
+            if trainenc.input_ids.shape[1] >= seq_len:
+                break
+        if not padding:
+            # then clip the samples to seq_len
+            i = random.randint(0, max(trainenc.input_ids.shape[1] - seq_len - 1, 0))  # clamp: a sample of exactly seq_len tokens used to raise ValueError (empty range)
+            j = i + seq_len
+            inp = trainenc.input_ids[:, i:j]
+            tar = inp.clone()
+            tar[:, :-1] = -100  # mask every target position except the last
+            trainloader.append((inp, tar))
+        else:
+            inp = trainenc.input_ids
+            tar = inp.clone()
+            tar[:, :-1] = -100  # mask every target position except the last
+            trainloader.append((inp, tar))
+    if val_size is not None:
+        valenc = tokenizer(' '.join(valdata[:val_size]['text']), return_tensors='pt')
+    else:
+        valenc = tokenizer(' '.join(valdata['text']), return_tensors='pt')
+    valenc = valenc.input_ids[:, :(val_seq_len * seq_len)]  # cap the validation stream at val_seq_len * seq_len tokens
+
+    class TokenizerWrapper:  # minimal holder exposing .input_ids on the returned validation object
+        def __init__(self, input_ids):
+            self.input_ids = input_ids
+    valenc = TokenizerWrapper(valenc)
+    return trainloader, valenc
 
 def get_loaders(
     name, nsamples=128, seed=0, seqlen=2048, model=''
@@ -171,3 +226,41 @@ def get_loaders(
         if 'new' in name:
             return get_c4_new(nsamples, seed, seqlen, model)
         return get_c4(nsamples, seed, seqlen, model)
+    if name == "answer_verification":
+        return get_jsonl(
+            ".cache/ni_calib/train/answer_verification.jsonl", 
+            ".cache/ni_calib/test/answer_verification.jsonl", 
+            nsamples, 
+            seed, 
+            seqlen, 
+            model, 
+            val_size=1000, 
+            padding=True
+        )
+    if name == "coherence_classification":
+        return get_jsonl(".cache/ni_calib/test/coherence_classification.jsonl", ".cache/ni_calib/test/coherence_classification.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True)
+    if name == "commonsense_classification":
+        return get_jsonl(".cache/ni_calib/train/commonsense_classification.jsonl", ".cache/ni_calib/test/commonsense_classification.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True)
+    if name == "dialogue_state_tracking":
+        return get_jsonl(".cache/ni_calib/train/dialogue_state_tracking.jsonl", ".cache/ni_calib/test/dialogue_state_tracking.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True)
+    if name == "fact_verification":
+        return get_jsonl(".cache/ni_calib/train/fact_verification.jsonl", ".cache/ni_calib/test/fact_verification.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True)
+    if name == "gender_classification":
+        return get_jsonl(".cache/ni_calib/train/gender_classification.jsonl", ".cache/ni_calib/test/gender_classification.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True)
+    if name == "irony_detection":
+        return get_jsonl(".cache/ni_calib/train/irony_detection.jsonl", ".cache/ni_calib/test/irony_detection.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True)
+    if name == "stance_detection":
+        return get_jsonl(".cache/ni_calib/train/stance_detection.jsonl", ".cache/ni_calib/test/stance_detection.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True)
+    if name == "toxic_language_detection":
+        return get_jsonl(".cache/ni_calib/train/toxic_language_detection.jsonl", ".cache/ni_calib/test/toxic_language_detection.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True)
+    if name == "word_semantics":
+        return get_jsonl(
+            ".cache/ni_calib/train/word_semantics.jsonl", 
+            ".cache/ni_calib/test/word_semantics.jsonl", 
+            nsamples, 
+            seed, 
+            seqlen, 
+            model, 
+            val_size=1000, 
+            padding=True
+        )
\ No newline at end of file
diff --git a/decomposition.py b/decomposition.py
new file mode 100644
index 0000000..5663594
--- /dev/null
+++ b/decomposition.py
@@ -0,0 +1,92 @@
+import time
+import torch
+from tqdm import tqdm
+from loguru import logger
+from torch.optim.lr_scheduler import ExponentialLR
+import torch.nn.functional as F
+
+def svd_decomposition(matrix, rank):  # Return (L, R) with L @ R ~= matrix, using a randomized rank-`rank` factorisation.
+    U, S, V = torch.pca_lowrank(matrix, q=rank, center=False)  # center=False: the default subtracts column means, so L @ R would approximate the centred matrix; the third output is V, not V^H
+    return U @ torch.diag_embed(S),  V.T
+
+def low_rank_decomposition(W, rank, learning_rate=0.01, max_iterations=500, tolerance=1e-5, X = None):  # Gradient-descent fit of L @ R to W, or of L @ R @ X to W @ X when X is given; returns (L, R).
+    L = torch.rand((W.shape[0], rank), device=W.device)  # uniform random initial factors
+    R = torch.rand((rank, W.shape[1]), device=W.device)
+    tick = time.time()
+    early_stop = False
+    if X is None:  # match the weights directly
+        for i in tqdm(range(max_iterations)):
+            difference = W - L @ R
+            gradient_L = -2 * (difference @ R.T)  # gradient of the summed squared error w.r.t. L
+            gradient_R = -2 * (L.T @ difference)  # computed with the L from before this step's update
+            L -= learning_rate * gradient_L
+            R -= learning_rate * gradient_R
+            if F.mse_loss(W, L@R) < tolerance:
+                early_stop = True
+                break
+        logger.info(f"Low rank decomposition done. Elapsed time: {time.time() - tick}. Early stop: {early_stop}, loss: {F.mse_loss(W, L@R)}")
+    else:  # match the outputs on X instead of the weights
+        for i in tqdm(range(max_iterations)):
+            diff = W @ X - L @ R @ X
+            gradient_L = -2 * (diff @ ((R@X).T))
+            gradient_R = -2 * (L.T @ diff @ X.T)
+            L -= learning_rate * gradient_L
+            R -= learning_rate * gradient_R
+            if F.mse_loss(W @ X, L @ R @ X) < tolerance:
+                early_stop = True
+                break
+            # print(F.mse_loss(W @ X, L @ R @ X))
+        logger.info(f"[With Input] Low rank decomposition done. Elapsed time: {time.time() - tick}. Early stop: {early_stop}, loss: {F.mse_loss(W@X, L@R@X)}")
+    return L, R
+
+def torch_autograd(W, X, rank, lr, steps):  # Fit L @ R @ X to W @ X with SGD (momentum 0.9) on an MSE loss; returns (L, R).
+    L = torch.rand((W.shape[0], rank), device=W.device, requires_grad=True)
+    R = torch.rand((rank, W.shape[1]), device=W.device, requires_grad=True)
+    optimizer = torch.optim.SGD([L, R], lr=lr, momentum=0.9)
+    scheduler = ExponentialLR(optimizer, gamma=0.9)
+    for j in tqdm(range(steps)):
+        optimizer.zero_grad()
+        output = L @ R @ X
+        target = W @ X
+        loss = torch.nn.functional.mse_loss(output, target)
+        loss.backward()
+        optimizer.step()
+        if j % 200 == 0:  # decay the learning rate every 200 steps (this also fires at step 0)
+            scheduler.step()
+    return L, R  # the returned tensors still have requires_grad=True
+
+if __name__=="__main__":  # Manual comparison of the three factorisation routines on random data.
+    FULL_RANK = 2048  # not used below
+    FULL_RANK_H = 1024
+    FULL_RANK_W = 4096
+    LOW_RANK = 32
+    TARGET_SIZE = 2
+
+    W = torch.rand((FULL_RANK_W, FULL_RANK_H))
+    input_matrix = torch.rand((FULL_RANK_H, TARGET_SIZE))
+    output_matrix = W @ input_matrix  # reference output that every reconstruction is scored against
+    
+    L_sensitive, R_sensitive = low_rank_decomposition(
+        W,
+        LOW_RANK,
+        learning_rate=1e-9,
+        max_iterations=2000,
+        X=input_matrix
+    )
+    reconstructed_matrix = L_sensitive @ R_sensitive @ input_matrix
+    print(f"reconstructed mse: gd: {F.mse_loss(output_matrix, reconstructed_matrix)}")
+
+    L_noinput, R_noinput = low_rank_decomposition(
+        W,
+        LOW_RANK,
+        learning_rate=1e-9,
+        max_iterations=2000,
+    )
+    reconstructed_matrix_noinput = L_noinput @ R_noinput @ input_matrix
+    print(f"reconstructed mse: gd. noinput gd: {F.mse_loss(output_matrix, reconstructed_matrix_noinput)}")
+    
+    L_autograd, R_autograd = torch_autograd(W, input_matrix, LOW_RANK, 1e-9, 2000)
+    reconstructed_matrix_pca = L_autograd @ R_autograd @ input_matrix  # despite the name, this holds the autograd result
+    print(f"reconstructed mse: autograd: {F.mse_loss(output_matrix, reconstructed_matrix_pca)}")
+    
+    
diff --git a/docs/number.md b/docs/number.md
index e69de29..ec5bac9 100644
--- a/docs/number.md
+++ b/docs/number.md
@@ -0,0 +1,20 @@
+In theory:
+    Take a 2048 * 2048 matrix in which 10% of the elements are non-zero. At 16 bits per element, the dense matrix takes 2048 * 2048 * 16 = 16 * 4M bits.
+
+    Storing the indices of the non-zero elements takes 2048 * 2048 * 10% * log2(2048 * 2048) ~= 2.2 * 4M bits.
+
+    Considering the indices only, we achieve a 16 / 2.2 ~= 7.3x compression ratio.
+
+In practice:
+    Saving a 2048 * 2048 matrix in which 10% of the elements are non-zero takes 8M bytes on disk (with torch.save).
+
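+    Saving the packed indices takes 1.9M on disk, achieving a 17 / 1.9 ~= 8.9x compression ratio. (The baseline of 17 used here does not match the 8M quoted above; TODO confirm which figure is correct.)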
+
+    With zip, the packed indices take 1.1M on disk, achieving a 17 / 1.1 ~= 15.5x compression ratio.
+
+
+256 x 256 -> 64k fp16 values -> 128k bytes on disk: OK.
+log2(2048*2048) = 22, so each index needs 22 bits, i.e. 3 int8 values.
+
+0.4M indices * 3 bytes each = 1.2M bytes
+"""
\ No newline at end of file
diff --git a/opt_delta_fork.py b/gptj_delta_autotuned.py
similarity index 61%
rename from opt_delta_fork.py
rename to gptj_delta_autotuned.py
index 9a7710c..bd883b9 100644
--- a/opt_delta_fork.py
+++ b/gptj_delta_autotuned.py
@@ -1,52 +1,41 @@
+
 import time
+import math
 
 import torch
 import torch.nn as nn
+import transformers
 
 from gptq import *
 from modelutils import *
 from quant import *
-import json
-import pickle
+import os
 import copy
-#from prettytable import PrettyTable
 
-def get_opt(model):
+def get_gptj(model):
     import torch
     def skip(*args, **kwargs):
         pass
     torch.nn.init.kaiming_uniform_ = skip
     torch.nn.init.uniform_ = skip
     torch.nn.init.normal_ = skip
-    from transformers import OPTForCausalLM
-    # model = OPTForCausalLM.from_pretrained(model, torch_dtype='auto')
-    model = OPTForCausalLM.from_pretrained(model, torch_dtype=torch.float16)
+    from transformers import GPTJForCausalLM
+    model = GPTJForCausalLM.from_pretrained(model, torch_dtype=torch.float16)
     model.seqlen = model.config.max_position_embeddings
+    print("Num params is", sum(p.numel() for p in model.parameters() if p.requires_grad))
     return model
 
-def hard_threshold(x, fraction_of_zero=0.1):
-    y, _ = torch.sort(x.view(-1).abs().clone())
-    num_params = torch.numel(x)
-    thresh_index = int(num_params * fraction_of_zero)
-    threshold = y[thresh_index]
-    mask = x.abs().clone().gt(threshold).type(torch.FloatTensor)
-    return mask * x
-
 @torch.no_grad()
-def opt_sequential_delta(model, delta_model, dataloader, dev):
+def gptj_sequential(model, dataloader, dev, means=None, stds=None):
     print('Starting ...')
 
     use_cache = model.config.use_cache
     model.config.use_cache = False
-    layers = model.model.decoder.layers
-    delta_layers = delta_model.model.decoder.layers
-
-    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) 
-    model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev)
-    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
-        model.model.decoder.project_out = model.model.decoder.project_out.to(dev) 
-    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
-        model.model.decoder.project_in = model.model.decoder.project_in.to(dev) 
+    #print(model.transformer.h)
+    layers = model.transformer.h
+    print(layers)
+    
+    model.transformer.wte = model.transformer.wte.to(dev)
     layers[0] = layers[0].to(dev)
 
     dtype = next(iter(model.parameters())).dtype
@@ -72,25 +61,20 @@ def forward(self, inp, **kwargs):
             pass
     layers[0] = layers[0].module
 
+    layers = model.transformer.h
     layers[0] = layers[0].cpu()
-    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
-    model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
-    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
-        model.model.decoder.project_out = model.model.decoder.project_out.cpu()
-    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
-        model.model.decoder.project_in = model.model.decoder.project_in.cpu()
+    model.transformer.wte = model.transformer.wte.cpu()
+    model.transformer.ln_f = model.transformer.ln_f.cpu()
     torch.cuda.empty_cache()
 
     outs = torch.zeros_like(inps)
-    original_outs = torch.zeros_like(inps)
     attention_mask = cache['attention_mask']
 
     print('Ready.')
 
     quantizers = {}
-    for i in range(len(delta_layers)):
-        layer = delta_layers[i].to(dev)
-        original_layer = layers[i].to(dev)
+    for i in range(len(layers)):
+        layer = layers[i].to(dev)
 
         subset = find_layers(layer)
         gptq = {}
@@ -98,9 +82,9 @@ def forward(self, inp, **kwargs):
             gptq[name] = GPTQ(subset[name])
             gptq[name].quantizer = Quantizer()
             gptq[name].quantizer.configure(
-                args.wbits, perchannel=True, sym=args.sym, mse=False, trits=args.trits
+                args.wbits, perchannel=True, sym=False, mse=False
             )
-
+        
         def add_batch(name):
             def tmp(_, inp, out):
                 gptq[name].add_batch(inp[0].data, out.data)
@@ -108,49 +92,39 @@ def tmp(_, inp, out):
         handles = []
         for name in subset:
             handles.append(subset[name].register_forward_hook(add_batch(name)))
-        
         for j in range(args.nsamples):
             outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
-            
-            original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
         for h in handles:
             h.remove()
 
         for name in subset:
             print(i, name)
             print('Quantizing ...')
-            gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order)
-            quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer
-            gptq[name].free()
+            gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize)
         for j in range(args.nsamples):
             outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
-            original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
 
         layers[i] = layer.cpu()
         del layer
-        del gptq 
+        del gptq
         torch.cuda.empty_cache()
 
-        inps, outs = original_outs, inps
+        inps, outs = outs, inps
 
     model.config.use_cache = use_cache
 
     return quantizers
 
 @torch.no_grad()
-def opt_sequential(model, dataloader, dev):
+def gptj_sequential_delta(model, delta_model, dataloader, dev):
     print('Starting ...')
 
     use_cache = model.config.use_cache
     model.config.use_cache = False
-    layers = model.model.decoder.layers
-
-    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) 
-    model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev)
-    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
-        model.model.decoder.project_out = model.model.decoder.project_out.to(dev) 
-    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
-        model.model.decoder.project_in = model.model.decoder.project_in.to(dev) 
+    layers = model.transformer.h
+    delta_layers = delta_model.transformer.h
+
+    model.transformer.wte = model.transformer.wte.to(dev) 
     layers[0] = layers[0].to(dev)
 
     dtype = next(iter(model.parameters())).dtype
@@ -177,22 +151,19 @@ def forward(self, inp, **kwargs):
     layers[0] = layers[0].module
 
     layers[0] = layers[0].cpu()
-    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
-    model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
-    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
-        model.model.decoder.project_out = model.model.decoder.project_out.cpu()
-    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
-        model.model.decoder.project_in = model.model.decoder.project_in.cpu()
+    model.transformer.wte = model.transformer.wte.cpu()
     torch.cuda.empty_cache()
 
     outs = torch.zeros_like(inps)
+    original_outs = torch.zeros_like(inps)
     attention_mask = cache['attention_mask']
 
     print('Ready.')
 
     quantizers = {}
-    for i in range(len(layers)):
-        layer = layers[i].to(dev)
+    for i in range(len(delta_layers)):
+        layer = delta_layers[i].to(dev)
+        original_layer = layers[i].to(dev)
 
         subset = find_layers(layer)
         gptq = {}
@@ -200,7 +171,7 @@ def forward(self, inp, **kwargs):
             gptq[name] = GPTQ(subset[name])
             gptq[name].quantizer = Quantizer()
             gptq[name].quantizer.configure(
-                args.wbits, perchannel=True, sym=args.sym, mse=False, trits=args.trits
+                args.wbits, perchannel=True, sym=args.sym, mse=False
             )
 
         def add_batch(name):
@@ -212,6 +183,7 @@ def tmp(_, inp, out):
             handles.append(subset[name].register_forward_hook(add_batch(name)))
         for j in range(args.nsamples):
             outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+            original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
         for h in handles:
             h.remove()
 
@@ -221,23 +193,24 @@ def tmp(_, inp, out):
             gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order)
             quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer
             gptq[name].free()
+        
         for j in range(args.nsamples):
             outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+            original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
 
         layers[i] = layer.cpu()
         del layer
         del gptq 
         torch.cuda.empty_cache()
 
-        inps, outs = outs, inps
+        inps, outs = original_outs, inps
 
     model.config.use_cache = use_cache
-    
-    return quantizers
 
+    return quantizers
 
 @torch.no_grad()
-def opt_eval(model, testenc, dev):
+def gptj_eval(model, testenc, dev):
     print('Evaluating ...')
 
     testenc = testenc.input_ids
@@ -245,14 +218,11 @@ def opt_eval(model, testenc, dev):
 
     use_cache = model.config.use_cache
     model.config.use_cache = False
-    layers = model.model.decoder.layers
-
-    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev)
-    model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev)
-    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
-        model.model.decoder.project_out = model.model.decoder.project_out.to(dev) 
-    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
-        model.model.decoder.project_in = model.model.decoder.project_in.to(dev) 
+    # print(model.transformer.h)
+    layers = model.transformer.h
+    print(layers)
+    
+    model.transformer.wte = model.transformer.wte.to(dev)
     layers[0] = layers[0].to(dev)
 
     dtype = next(iter(model.parameters())).dtype
@@ -267,40 +237,38 @@ def __init__(self, module):
             self.module = module
         def forward(self, inp, **kwargs):
             inps[cache['i']] = inp
-            cache['i'] += 1
+            cache ['i'] += 1
             cache['attention_mask'] = kwargs['attention_mask']
             raise ValueError
     layers[0] = Catcher(layers[0])
     for i in range(nsamples):
-        batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev)
+        batch = testenc[:, (i * model.seqlen):((i + 1) *model.seqlen)].to(dev)
         try:
+            # print(batch.shape)
             model(batch)
         except ValueError:
             pass
     layers[0] = layers[0].module
 
+    layers = model.transformer.h
     layers[0] = layers[0].cpu()
-    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
-    model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
-    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
-        model.model.decoder.project_out = model.model.decoder.project_out.cpu()
-    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
-        model.model.decoder.project_in = model.model.decoder.project_in.cpu()
+    model.transformer.wte = model.transformer.wte.cpu()
+    model.transformer.ln_f = model.transformer.ln_f.cpu()
     torch.cuda.empty_cache()
-
+    
     outs = torch.zeros_like(inps)
     attention_mask = cache['attention_mask']
 
     for i in range(len(layers)):
-        # print(i)
+        print(i)
         layer = layers[i].to(dev)
 
         if args.nearest:
-            subset = find_layers(layer)
+            subset  = find_layers(layer)
             for name in subset:
                 quantizer = Quantizer()
                 quantizer.configure(
-                    args.wbits, perchannel=True, sym=args.sym, mse=False
+                    args.wbits, perchannel=True, sym=False, mse=False
                 )
                 W = subset[name].weight.data
                 quantizer.find_params(W, weight=True)
@@ -315,20 +283,14 @@ def forward(self, inp, **kwargs):
         torch.cuda.empty_cache()
         inps, outs = outs, inps
 
-    if model.model.decoder.final_layer_norm is not None:
-        model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(dev)
-    if model.model.decoder.project_out is not None:
-        model.model.decoder.project_out = model.model.decoder.project_out.to(dev)
+    model.transformer.ln_f = model.transformer.ln_f.to(dev)
     model.lm_head = model.lm_head.to(dev)
-
+    
     testenc = testenc.to(dev)
     nlls = []
     for i in range(nsamples):
         hidden_states = inps[i].unsqueeze(0)
-        if model.model.decoder.final_layer_norm is not None:
-            hidden_states = model.model.decoder.final_layer_norm(hidden_states)
-        if model.model.decoder.project_out is not None:
-            hidden_states = model.model.decoder.project_out(hidden_states)
+        hidden_states = model.transformer.ln_f(hidden_states)
         lm_logits = model.lm_head(hidden_states)
         shift_logits = lm_logits[:, :-1, :].contiguous()
         shift_labels = testenc[
@@ -340,61 +302,60 @@ def forward(self, inp, **kwargs):
         nlls.append(neg_log_likelihood)
     ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))
     print(ppl.item())
+    
 
     model.config.use_cache = use_cache
-    return ppl.item()
 
-# TODO: perform packing on GPU
-def opt_pack3(model, quantizers):
+def gptj_pack(model, quantizers, wbits, groupsize):
     layers = find_layers(model)
     layers = {n: layers[n] for n in quantizers}
-    make_quant3(model, quantizers, faster=args.faster_kernel)
-    qlayers = find_layers(model, [Quant3Linear])
+    make_quant(model, quantizers, wbits, groupsize)
+    qlayers = find_layers(model, [QuantLinear])
     print('Packing ...')
     for name in qlayers:
         print(name)
-        quantizers[name] = quantizers[name].cpu()
-        qlayers[name].pack(layers[name], quantizers[name].scale, quantizers[name].zero)
-    print('Done.')
+        quantizers[name],scale,zero = quantizers[name]
+        quantizers[name],scale,zero = quantizers[name].cpu(),scale.cpu(),zero.cpu()
+        qlayers[name].pack(layers[name], scale, zero)
+    print('Done!')
     return model
 
-def load_quant3(model, checkpoint):
-    from transformers import OPTConfig, OPTForCausalLM 
-    config = OPTConfig.from_pretrained(model)
+def load_quant(model, checkpoint, wbits, groupsize):
+    from transformers import GPTJConfig, GPTJForCausalLM
+    config = GPTJConfig.from_pretrained(model)
     def noop(*args, **kwargs):
         pass
-    torch.nn.init.kaiming_uniform_ = noop 
-    torch.nn.init.uniform_ = noop 
-    torch.nn.init.normal_ = noop 
+    torch.nn.init.kaiming_uniform_ = noop
+    torch.nn.init.uniform_ = noop
+    torch.nn.init.normal_ = noop
 
     torch.set_default_dtype(torch.half)
     transformers.modeling_utils._init_weights = False
     torch.set_default_dtype(torch.half)
-    model = OPTForCausalLM(config)
+    model = GPTJForCausalLM(config)
     torch.set_default_dtype(torch.float)
     model = model.eval()
     layers = find_layers(model)
-    for name in ['model.decoder.project_out', 'model.decoder.project_in', 'lm_head']:
+    for name in ['lm_head']:
         if name in layers:
             del layers[name]
-    make_quant3(model, layers, faster=args.faster_kernel)
+    make_quant(model, layers, wbits, groupsize)
 
     print('Loading model ...')
-    model.load_state_dict(torch.load(checkpoint))
-    model.seqlen = model.config.max_position_embeddings
-    print('Done.')
+    if checkpoint.endswith('.safetensors'):
+        from safetensors.torch import load_file as safe_load
+        model.load_state_dict(safe_load(checkpoint))
+    else:
+        model.load_state_dict(torch.load(checkpoint))
+    model.seqlen = 2048
+    print('Done!')
 
     return model
 
-def opt_multigpu(model, gpus):
-    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(gpus[0])
-    model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(gpus[0])
-    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
-        model.model.decoder.project_in = model.model.decoder.project_in.to(gpus[0])
-    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
-        model.model.decoder.project_out = model.model.decoder.project_out.to(gpus[-1])
-    if hasattr(model.model.decoder, 'final_layer_norm') and model.model.decoder.final_layer_norm:
-        model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(gpus[-1])
+def gptj_multigpu(model, gpus):  # token embedding goes to gpus[0]; final layer norm and lm_head go to gpus[-1]
+    model.transformer.wte = model.transformer.wte.to(gpus[0])  # GPT-J keeps its modules under model.transformer
+    if hasattr(model.transformer, 'ln_f') and model.transformer.ln_f is not None:
+        model.transformer.ln_f = model.transformer.ln_f.to(gpus[-1])
     import copy
     model.lm_head = copy.deepcopy(model.lm_head).to(gpus[-1])
 
@@ -403,19 +364,19 @@ def opt_multigpu(model, gpus):
     class MoveModule(nn.Module):
         def __init__(self, module):
             super().__init__()
             self.module = module
             self.dev = next(iter(self.module.parameters())).device
         def forward(self, *inp, **kwargs):
             inp = list(inp)
             if inp[0].device != self.dev:
                 inp[0] = inp[0].to(self.dev)
             if cache['mask'] is None or cache['mask'].device != self.dev:
                 cache['mask'] = kwargs['attention_mask'].to(self.dev)
             kwargs['attention_mask'] = cache['mask']
             tmp = self.module(*inp, **kwargs)
             return tmp
 
-    layers = model.model.decoder.layers
+    layers = model.transformer.h
     pergpu = math.ceil(len(layers) / len(gpus))
     for i in range(len(layers)):
         layers[i] = MoveModule(layers[i].to(gpus[i // pergpu]))
@@ -432,7 +393,7 @@ def tmp(layer, inp, out):
             if cache['past']:
                 cache['past'][i] = None
         return tmp
-    for i, layer in enumerate(model.model.decoder.layers):
+    for i, layer in enumerate(model.transformer.h):  # GPT-J blocks live in transformer.h, as elsewhere in this file
         layer.register_forward_hook(clear_past(i))
 
     print('Benchmarking ...')
@@ -447,30 +408,35 @@ def sync():
                 torch.cuda.synchronize(gpu)
         else:
             torch.cuda.synchronize()
+    max_memory = 0
     with torch.no_grad():
         attention_mask = torch.ones((1, input_ids.numel()), device=DEV)
         times = []
         for i in range(input_ids.numel()):
             tick = time.time()
+        
             out = model(
-                input_ids[:, i].reshape(-1),
+                input_ids[:, i:i+1],
                 past_key_values=cache['past'],
                 attention_mask=attention_mask[:, :(i + 1)].reshape((1, -1))
             )
             sync()
             times.append(time.time() - tick)
             print(i, times[-1])
+            max_memory = max(max_memory, torch.cuda.memory_allocated() / 1024 / 1024)  # largest allocation seen so far, in MiB
             if check and i != input_ids.numel() - 1:
                 tot += loss(out.logits[0].to(DEV), input_ids[:, (i + 1)].to(DEV)).float()
             cache['past'] = list(out.past_key_values)
             del out
         sync()
         import numpy as np
         print('Median:', np.median(times))
         if check:
             print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item())
+            print('max memory(MiB):',max_memory)
 
 
+        
 def main(args):
     print(args)
     num_params_saved_lr = 0
@@ -479,9 +445,9 @@ def main(args):
         model = load_quant3(args.model, args.load)
     else:
         if args.delta and args.wbits<16:
-            model = get_opt(args.model)
+            model = get_gptj(args.model)
             model.eval()
-            base_model = get_opt(args.base_model)
+            base_model = get_gptj(args.base_model)
             base_model.eval()
             dataloader, testloader = get_loaders(
         args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
@@ -490,7 +456,7 @@ def main(args):
             for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()):
                 finetuned_p.data = (finetuned_p.data-base_p.data).clone()
         else:
-            model = get_opt(args.model)
+            model = get_gptj(args.model)
             model.eval()
 
     dataloader, testloader = get_loaders(
@@ -500,31 +466,23 @@ def main(args):
     if args.wbits < 16 and not args.nearest:
         if args.delta:
             tick = time.time()
-            quantizers = opt_sequential_delta(original_finetuned_model, model, dataloader, DEV)
+            quantizers = gptj_sequential_delta(original_finetuned_model, model, dataloader, DEV)
 
             comp_time = time.time()-tick
         else:
-            quantizers = opt_sequential(model, dataloader, DEV)
+            quantizers = gptj_sequential(model, dataloader, DEV)
     
     if args.delta and args.wbits<16:
         for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()):
-            # don't hard threshold for now
-            # if args.sparsify_hard_threshold:
-            #     print('Hard Thresholding...')
-            #     W = finetuned_p.data
-            #     finetuned_p.data = hard_threshold(W, fraction_of_zero=args.fraction_of_zero)
+            if args.sparsify_hard_threshold:
+                print('Hard Thresholding...')
+                sparse = hard_threshold(finetuned_p.data, fraction_of_zero=args.fraction_of_zero)
+                finetuned_p.data = sparse[0] if isinstance(sparse, tuple) else sparse  # gptq.hard_threshold returns (tensor, mask)
             if args.rank>0 and len(finetuned_p.shape) == 2:
                 print('Finding Low Rank Approximation...')
                 A = finetuned_p.data.float()
                 U, S, Vh = torch.pca_lowrank(A, q=args.rank, center=True, niter=5)
-                # let's say L = U
-                # and R = diag(S)*V.T
-                L = U
-                R = torch.diag_embed(S) @ Vh.T
-                # now quantize R
-                
-                A  = L @ R
-            
+                A  = U @ torch.diag_embed(S) @ Vh.T
                 finetuned_p.data =  A.half()
                 num_params_saved_lr += torch.numel(A) -  (torch.numel(U) + torch.numel(S) + torch.numel(Vh))
             num_params += torch.numel(finetuned_p.data)
@@ -533,7 +491,7 @@ def main(args):
     if args.benchmark:
         gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())]
         if len(gpus) > 1:
-            opt_multigpu(model, gpus)
+            gptj_multigpu(model, gpus)
         else:
             model = model.to(DEV)
         if args.benchmark:
@@ -547,34 +505,35 @@ def main(args):
         dataset, seed=args.seed, model=args.model, seqlen=model.seqlen
     )
     
-    ppl = opt_eval(model, testloader, DEV)
+    ppl = gptj_eval(model, testloader, DEV)
     print(ppl)
 
     if args.rank > 0:
-        print("Number of params without low rank ", num_params)
-        print("Number of params with low rank", num_params - num_params_saved_lr)
+        n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+        print("Number of params without low rank ", n_params)
+        print("Number of params with low rank", n_params - num_params_saved_lr)
     if args.save:
-        opt_pack3(model, quantizers)
+        gptj_pack(model, quantizers, args.wbits, args.groupsize)
         torch.save(model.state_dict(), args.save) 
-
+    return ppl
 
 if __name__ == '__main__':
     import argparse
     from datautils import *
 
     parser = argparse.ArgumentParser()
-
+    
     parser.add_argument(
-        '--model', type=str, default='lnair/opt-1.3b-wikitext2',
-        help='OPT model to load; pass `facebook/opt-X`.'
+        '--model', type=str, default='togethercomputer/GPT-JT-6B-v1',
+        help='GPT-J finetuned model to load; pass `togethercomputer/GPT-JT-6B-v1`.'
     )
     parser.add_argument(
-        '--dataset', type=str, choices=['wikitext2', 'ptb', 'c4'], default='wikitext2',
-        help='Where to extract calibration data from.'
+        '--base_model', type=str, default='EleutherAI/gpt-j-6b',
+        help='GPT-J model to load; pass `EleutherAI/gpt-j-6b`.'
     )
     parser.add_argument(
-        '--base-model', type=str, default='facebook/opt-1.3b',
-        help='base OPT model to load'
+        '--dataset', type=str, default='ptb', choices=['wikitext2', 'ptb', 'c4'],
+        help='Where to extract calibration data from.'
     )
     parser.add_argument(
         '--seed',
@@ -591,30 +550,26 @@ def main(args):
     parser.add_argument(
         '--nearest', action='store_true',
         help='Whether to run the RTN baseline.'
-    ) 
-    parser.add_argument(
-        '--wbits', type=int, default=2, choices=[2, 3, 4, 16],
-        help='#bits to use for quantization; use 16 for evaluating base model.'
     )
     parser.add_argument(
-        '--trits', action='store_true',
-        help='Whether to use trits for quantization.'
+        '--wbits', type=int, default=16, choices=[2, 3, 4, 16],
+        help='#bits to use for quantization; use 16 for evaluating base model.'
     )
     parser.add_argument(
         '--groupsize', type=int, default=-1,
         help='Groupsize to use for quantization; default uses full row.'
     )
     parser.add_argument(
-        '--sym', action='store_true',
-        help='Whether to perform symmetric quantization.'
+        '--save', type=str, default='',
+        help='Save the quantized GPT-J model under this name.'
     )
     parser.add_argument(
-        '--save', type=str, default='',
-        help='Save quantized checkpoint under this name.'
+        '--save_safetensors', type=str, default='',
+        help='Save the quantized GPT-J model as a  `.safetensors` ckpt'
     )
     parser.add_argument(
         '--load', type=str, default='',
-        help='Load quantized model.'
+        help='Load the quantized GPT-J model'
     )
     parser.add_argument(
         '--benchmark', type=int, default=0,
@@ -622,19 +577,7 @@ def main(args):
     )
     parser.add_argument(
         '--check', action='store_true',
-        help='Whether to compute perplexity during benchmarking for verification.'
-    )
-    parser.add_argument(
-        '--new-eval', action='store_true',
-        help='Whether to use the new PTB and C4 eval.'
-    )
-    parser.add_argument(
-        '--faster-kernel', action='store_true',
-        help='Whether to use the new faster kernel for benchmarking.'
-    )
-    parser.add_argument(
-        '--act-order', action='store_true',
-        help='Whether to apply the activation order GPTQ heuristic'
+        help='Whether to compute perplexity during benchmarking for verification.'
     )
     parser.add_argument(
         '--delta', action='store_true',
@@ -648,15 +591,34 @@ def main(args):
         '--fraction_of_zero', type=float, default=0.99,
         help='Sparsity ratio'
     )
-
     parser.add_argument(
-        '--rank', type=int, default=0,
-        help='The rank to use for decomposing each matrices'
+        '--benchmark_results', type=str, default='',
+        help='store benchmark results'
     )
-    args = parser.parse_args()
-
-    #results = PrettyTable()
-
-    main(args)
+    parser.add_argument(
+        '--sym', action='store_true', default=True,
+        help='Whether to use symmetric quantization'
+    )
+    parser.add_argument(
+        '--trits', action='store_true', default=False, 
+        help='Whether to use trits'
+    )
+    parser.add_argument('--act_order', type=str, default=False)
     
-    print('finished.')
+    parser.add_argument('--rank', type=int, default=0, help='The rank to use for decomposing each matrix')  # main() reads args.rank
+    args = parser.parse_args()
+    from prettytable import PrettyTable  # local import: the sweep below is its only user
+    results = PrettyTable()
+    results.field_names = ['Bits', 'wikitext2', 'ptb', 'c4']  # one perplexity column per dataset swept below
+    for n_bits in [4, 3, 2]:
+        ppls = []
+        for dataset in ['wikitext2', 'ptb', 'c4']:
+            args.dataset, args.wbits = dataset, n_bits
+            args.save = 'gptj-delta-%s-wbits%d.pt' % (dataset, n_bits)
+            ppls.append(main(args))
+        results.add_row([n_bits, *ppls])  # n_params / comp_time are locals of main() and are not visible here
+        print(results)
+        if args.benchmark_results:  # the default '' would otherwise try to open the working directory
+            with open(os.path.join(os.getcwd(), args.benchmark_results), mode="w") as f:
+                f.write(str(results))
+    print('finished.')
\ No newline at end of file
diff --git a/gptq.py b/gptq.py
index 2477cac..87dd8cf 100644
--- a/gptq.py
+++ b/gptq.py
@@ -3,17 +3,34 @@
 import torch
 import transformers
 import torch.nn as nn
-
+from loguru import logger
 from quant import quantize
 
-DEBUG = False 
+DEBUG = False
 
 torch.backends.cuda.matmul.allow_tf32 = False
 torch.backends.cudnn.allow_tf32 = False
 
+def hard_threshold(x, fraction_of_zero=0.1, random_sparsification=0.5):  # returns (sparsified x, keep-mask), or (x, None) when disabled
+    if fraction_of_zero == 0:
+        return x, None
+    # randomly set random_sparsification of the weights to zero
+    if random_sparsification > 0:
+        logger.info(f"Randomly sparsifying the weights with {random_sparsification}")
+        mask = torch.rand(x.shape, device=x.device) > random_sparsification
+        x = x * mask
+    y, _ = torch.sort(x.view(-1).abs().clone())
+    num_params = torch.numel(x)
+    scale = 1 / random_sparsification if random_sparsification > 0 else 1  # no division by zero when the random pass is off
+    thresh_index = min(int(num_params * fraction_of_zero * scale), num_params - 1)  # the scaled index can exceed numel; clamp it
+    threshold = y[thresh_index]
+    mask = x.abs().clone().gt(threshold).type(torch.cuda.HalfTensor)
+    return mask * x, mask
+
 class GPTQ:
     def __init__(self, layer):
         self.layer = layer
+        self.original_weight = layer.weight.data.clone()
         self.dev = self.layer.weight.device
         W = layer.weight.data.clone()
         if isinstance(self.layer, nn.Conv2d):
@@ -26,26 +43,15 @@ def __init__(self, layer):
         self.nsamples = 0
 
     def add_batch(self, inp, out):
-        if DEBUG:
-            self.inp1 = inp
-            self.out1 = out
+        self.inp1 = inp
+        self.out1 = out
         if len(inp.shape) == 2:
             inp = inp.unsqueeze(0)
         tmp = inp.shape[0]
-        if isinstance(self.layer, nn.Linear) or isinstance(self.layer, transformers.Conv1D):
+        if isinstance(self.layer, nn.Linear):
             if len(inp.shape) == 3:
                 inp = inp.reshape((-1, inp.shape[-1]))
             inp = inp.t()
-        if isinstance(self.layer, nn.Conv2d):
-            unfold = nn.Unfold(
-                self.layer.kernel_size,
-                dilation=self.layer.dilation,
-                padding=self.layer.padding,
-                stride=self.layer.stride
-            )
-            inp = unfold(inp)
-            inp = inp.permute([1, 0, 2])
-            inp = inp.flatten(1)
         self.H *= self.nsamples / (self.nsamples + tmp)
         self.nsamples += tmp
         # inp = inp.float()
@@ -54,7 +60,7 @@ def add_batch(self, inp, out):
         self.H += inp.matmul(inp.t())
 
     def fasterquant(
-        self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False
+        self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, write=True, sparsity=None
     ):
         W = self.layer.weight.data.clone()
         if isinstance(self.layer, nn.Conv2d):
@@ -69,7 +75,8 @@ def fasterquant(
             self.quantizer.find_params(W, weight=True)
 
         H = self.H
-        del self.H
+        if write:
+            del self.H
         dead = torch.diag(H) == 0
         H[dead, dead] = 1
         W[:, dead] = 0
@@ -124,15 +131,16 @@ def fasterquant(
             W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:])
 
             if DEBUG:
-                self.layer.weight.data[:, :i2] = Q[:, :i2]
-                self.layer.weight.data[:, i2:] = W[:, i2:]
-                print(torch.sum((self.layer(self.inp1) - self.out1) ** 2))
-                print(torch.sum(Losses))
+                pass
+                #self.layer.weight.data[:, :i2] = Q[:, :i2]
+                #self.layer.weight.data[:, i2:] = W[:, i2:]
+                #print(torch.sum((self.layer(self.inp1) - self.out1) ** 2))
+                #print(torch.sum(Losses))
 
         torch.cuda.synchronize()
         total_time = time.time() - tick
         # print('time %.2f' % total_time)
-        error = torch.sum(Losses).item()
+        # error = torch.sum(Losses).item()
         # print('error', error)
 
         if actorder:
@@ -141,9 +149,23 @@ def fasterquant(
 
         if isinstance(self.layer, transformers.Conv1D):
             Q = Q.t()
-        self.layer.weight.data = Q.reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype)
-        if DEBUG:
-            print(torch.sum((self.layer(self.inp1) - self.out1) ** 2))
+        # here report the loss of the quantized layer vs. the original layer
+        new_weight = Q.reshape(self.layer.weight.shape).to(self.layer.weight.dtype)
+        losses = {}
+        mask = None
+        if sparsity is None:
+            sparsed_new_weight = new_weight
+            losses[0] = torch.sum((self.inp1 @ (sparsed_new_weight.T) - self.out1) ** 2)
+        else:
+            for s_sity in sparsity:
+                sparsed_new_weight, mask = hard_threshold(new_weight, fraction_of_zero=s_sity)
+                if write:
+                    logger.info(f"HT with: sparsity={s_sity}")
+                losses[s_sity] = torch.sum((self.inp1 @ (sparsed_new_weight.T) - self.out1) ** 2)
+                
+        if write:
+            self.layer.weight.data = sparsed_new_weight
+        return losses, mask
 
     def free(self):
         if DEBUG:
@@ -152,4 +174,4 @@ def free(self):
         self.H = None
         self.Losses = None
         self.Trace = None
-        torch.cuda.empty_cache()
+        torch.cuda.empty_cache()
\ No newline at end of file
diff --git a/lr_only.py b/lr_only.py
new file mode 100644
index 0000000..dc392c2
--- /dev/null
+++ b/lr_only.py
@@ -0,0 +1,42 @@
+import copy
+import torch
+import argparse
+import torch.nn as nn
+from loguru import logger
+from evaluation import opt_eval
+from datautils import get_loaders
+from core_compression import opt_delta_lr
+from modelutils import get_opt, find_layers
+from save_and_load import save_lr_tensors, load_lr_tensors
+
+@torch.no_grad()
+def lowrank_decomposition(model, rank, n_samples, data_loader=None):
+    """Hook each sub-layer found by find_layers to record its (input, output); factorisation is still TODO."""
+    lr_iopairs = {}
+    def add_batch(name):
+        def temp(_, inp, out):
+            lr_iopairs[name] = (inp, out)
+        return temp
+    layers = model.model.decoder.layers
+    # torch has no `fp16` attribute; the half-precision dtype is spelled `torch.float16`.
+    inps = torch.zeros(
+        (n_samples, model.seqlen, model.config.hidden_size), dtype=torch.float16, device=model.device
+    )
+    handles = []
+    for i, decoder_layer in enumerate(layers):
+        subset = find_layers(decoder_layer)
+        for name in subset:
+            layer_id = f"decoder.layers.{i}.{name}"
+            handles.append(subset[name].register_forward_hook(add_batch(layer_id)))
+            decomposing_layer = subset[name].weight
+            # TODO: decompose `decomposing_layer` into rank-`rank` factors (not implemented yet).
+if __name__ == "__main__":
+    # Script entry point: load OPT-1.3b, build the wikitext2 loaders for it,
+    # then hand the model and the training loader to lowrank_decomposition.
+    model_name = 'facebook/opt-1.3b'
+    base_model = get_opt(model_name)
+    loader_kwargs = dict(nsamples=128, seed=42, model=model_name, seqlen=base_model.seqlen)
+    trainloader, loader_enc = get_loaders('wikitext2', **loader_kwargs)
+    # Positional arguments are (model, rank, n_samples, data_loader).
+    rank, n_samples = 32, 128
+    lowrank_decomposition(base_model, rank, n_samples, trainloader)
\ No newline at end of file
diff --git a/matq.py b/matq.py
index 417a2f9..2898db5 100644
--- a/matq.py
+++ b/matq.py
@@ -5,15 +5,15 @@
 import transformers
 from loguru import logger
 from quant import quantize
+from decomposition import low_rank_decomposition
 
-
-DEBUG = False 
+DEBUG = False
 
 torch.backends.cuda.matmul.allow_tf32 = False
 torch.backends.cudnn.allow_tf32 = False
 
 class TensorQ:
-    def __init__(self, layer, rank=32):
+    def __init__(self, layer, rank=32, sensitive_decompose=False):
         self.layer = layer
         self.dev = self.layer.weight.device
         W = layer.weight.data.clone()
@@ -22,47 +22,40 @@ def __init__(self, layer, rank=32):
         if isinstance(self.layer, transformers.Conv1D):
             W = W.t()
         self.rank = rank
-        self.decompose()
+        if not sensitive_decompose:
+            self.decompose()
         self.rows = W.shape[0]
         self.columns = W.shape[1]
-        self.L_columns = self.L.shape[1]
+        self.L_columns = rank
         self.H = torch.zeros((self.columns, self.columns), device=self.dev)
         self.H_R = torch.zeros((self.columns, self.columns), device=self.dev)
         self.H_L = torch.zeros((self.L_columns, self.L_columns), device=self.dev)
         self.nsamples = 0
 
     def add_batch_lr(self, inp, out):
-        if DEBUG:
-            self.inp1 = inp
-            self.out1 = out
+        #if DEBUG:
+        # self.inp1 = inp
+        # self.out1 = out
         if len(inp.shape) == 2:
             inp = inp.unsqueeze(0)
-        tmp = inp.shape[0]
-        if isinstance(self.layer, nn.Linear) or isinstance(self.layer, transformers.Conv1D):
+        self.tmp = inp.shape[0]
+        if isinstance(self.layer, nn.Linear):
             if len(inp.shape) == 3:
                 inp = inp.reshape((-1, inp.shape[-1]))
             inp = inp.t()
-        if isinstance(self.layer, nn.Conv2d):
-            unfold = nn.Unfold(
-                self.layer.kernel_size,
-                dilation=self.layer.dilation,
-                padding=self.layer.padding,
-                stride=self.layer.stride
-            )
-            inp = unfold(inp)
-            inp = inp.permute([1, 0, 2])
-            inp = inp.flatten(1)
-        self.H_R *= self.nsamples / (self.nsamples + tmp)
-        self.nsamples += tmp
+        
+        self.H_R *= self.nsamples / (self.nsamples + self.tmp)
+        self.nsamples += self.tmp
         inp = math.sqrt(2 / self.nsamples) * inp.float()
+        self.inp = inp
 
-        self.H_R += inp.matmul(inp.t())
+    def calculate_hessian(self):
+        self.H_R += self.inp.matmul(self.inp.t())
         # logger.info(f"self.H_R: {self.H_R.shape}")
         # for L, consider the input to be R@X
-        inp = self.R @ inp
-        self.H_L *= self.nsamples / (self.nsamples + tmp)
-        self.H_L += inp.matmul(inp.t())
-        # logger.info(f"self.H_L: {self.H_L.shape}")
+        l_inp = self.R @ self.inp
+        self.H_L *= self.nsamples / (self.nsamples + self.tmp)
+        self.H_L += l_inp.matmul(l_inp.t())
 
     def free(self):
         if DEBUG:
@@ -77,24 +70,19 @@ def free(self):
 
     def decompose(self):
         W = self.layer.weight.data.clone()
-        if isinstance(self.layer, nn.Conv2d):
-            W = W.flatten(1)
-        if isinstance(self.layer, transformers.Conv1D):
-            W = W.t()
         W = W.float()
         logger.info("starting decomposition")
         tick = time.time()
-        U, S, Vh = torch.pca_lowrank(W, q=self.rank, center=True, niter=5)
-        # let's say L = U
-        # and R = diag(S)*V.T
-        self.L = U
-        self.R = torch.diag_embed(S) @ Vh.T
+        self.L, self.R = low_rank_decomposition(W, self.rank, learning_rate=1e-10, max_iterations=5000, X=self.inp)
         logger.info(f"decomposition done. elapsed time: {time.time() - tick}, L: {self.L.shape}, R: {self.R.shape}")
 
-    def lr_quant(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False):
-        self.lr_quant_R(blocksize, percdamp, groupsize, actorder)
-        self.lr_quant_L(blocksize, percdamp, groupsize, actorder)
-        # restored weight is L@R
+    def lr_quant(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, decompose_only=False):
+        self.decompose()
+        if not decompose_only:
+            self.calculate_hessian()
+            self.lr_quant_R(blocksize, percdamp, groupsize, actorder)
+            self.lr_quant_L(blocksize, percdamp, groupsize, actorder)
+        # restored weight is L@R, we overwrite the weight for evaluation if needed
         # but on disk we only save L, R
         self.layer.weight.data = (self.L @ self.R).reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype)
         
diff --git a/modelutils.py b/modelutils.py
index c93410d..f9436c7 100644
--- a/modelutils.py
+++ b/modelutils.py
@@ -1,6 +1,6 @@
 import torch
 import torch.nn as nn
-from transformers import OPTForCausalLM
+from transformers import OPTForCausalLM, AutoModel, AutoTokenizer
 DEV = torch.device('cuda:0')
 
 def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''):
diff --git a/opt_delta.py b/opt_delta.py
index f9f6bbf..10a34d3 100644
--- a/opt_delta.py
+++ b/opt_delta.py
@@ -3,12 +3,13 @@
 import torch
 import pickle
 import torch.nn as nn
-
+from pack_utils import pack_to_bits, unpack_from_bits
 from gptq import *
 from modelutils import *
 from quant import *
-
+from transformers import AutoTokenizer, AutoModel
 import copy
+from tensorio import TensorIO, model_packing
 #from prettytable import PrettyTable
 
 def get_opt(model):
@@ -41,7 +42,7 @@ def opt_sequential_delta(model, delta_model, dataloader, dev):
     layers = model.model.decoder.layers
     delta_layers = delta_model.model.decoder.layers
 
-    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) 
+    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev)
     model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev)
     
     if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
@@ -425,7 +426,6 @@ def forward(self, *inp, **kwargs):
 def benchmark(model, input_ids, check=False):
     input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else DEV)
     torch.cuda.synchronize()
-
     cache = {'past': None}
     def clear_past(i):
         def tmp(layer, inp, out):
@@ -507,20 +507,27 @@ def main(args):
             quantizers = opt_sequential(model, dataloader, DEV)
     
     if args.delta and args.wbits<16:
-        for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()):
-            if args.sparsify_hard_threshold:
-                print('Hard Thresholding...')
-                W = finetuned_p.data
-                finetuned_p.data = hard_threshold(W, fraction_of_zero=args.fraction_of_zero)
-            if args.rank>0 and len(finetuned_p.shape) == 2:
-                print('Finding Low Rank Approximation...')
-                A = finetuned_p.data.float()
-                U, S, Vh = torch.pca_lowrank(A, q=args.rank, center=True, niter=5)
-                A  = U @ torch.diag_embed(S) @ Vh.T
-                finetuned_p.data =  A.half()
-                num_params_saved_lr += torch.numel(A) -  (torch.numel(U) + torch.numel(S) + torch.numel(Vh))
+        for idx, (base_p, finetuned_p) in enumerate(zip(base_model.parameters(), model.parameters())):
+            # if args.sparsify_hard_threshold:
+            #     print('Hard Thresholding...')
+            #     W = finetuned_p.data
+            #     finetuned_p.data = hard_threshold(W, fraction_of_zero=args.fraction_of_zero)
+            # if args.rank>0 and len(finetuned_p.shape) == 2:
+            #     print('Finding Low Rank Approximation...')
+            #     A = finetuned_p.data.float()
+            #     U, S, Vh = torch.pca_lowrank(A, q=args.rank, center=True, niter=5)
+            #     A  = U @ torch.diag_embed(S) @ Vh.T
+            #     finetuned_p.data =  A.half()
+            #     num_params_saved_lr += torch.numel(A) -  (torch.numel(U) + torch.numel(S) + torch.numel(Vh))
             num_params += torch.numel(finetuned_p.data)
             finetuned_p.data = (base_p.data + finetuned_p.data).clone()
+    
+    if args.save_delta:
+        new_weights, scale = model_packing(model, quantizers, bits=args.wbits)
+        torch.save({
+            'weight': new_weights,
+            'scale': scale,
+        }, f"delta_outputs/{args.model.replace('/', '.')}_delta_{args.wbits}bits_sparsify.{args.sparsify_hard_threshold}_{args.fraction_of_zero}foz")
 
     if args.benchmark:
         gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())]
@@ -545,11 +552,18 @@ def main(args):
     if args.rank > 0:
         print("Number of params without low rank ", num_params)
         print("Number of params with low rank", num_params - num_params_saved_lr)
-    if args.save:
+    if args.save_hf:
+        if args.delta:
+            hf_path = f"outputs/{args.model.replace('/', '.')}_delta_{args.wbits}bits_sparsify.{args.sparsify_hard_threshold}_{args.fraction_of_zero}foz"
+        else:
+            hf_path = f"outputs/{args.model.replace('/', '.')}_{args.wbits}bits"
+        model.save_pretrained(hf_path)
+        tokenizer = AutoTokenizer.from_pretrained(args.model)
+        tokenizer.save_pretrained(hf_path)
+    else:
         opt_pack3(model, quantizers)
         torch.save(model.state_dict(), args.save) 
 
-
 if __name__ == '__main__':
     import argparse
     from datautils import *
@@ -636,6 +650,13 @@ def main(args):
         '--sparsify_hard_threshold', action='store_true',
         help='Whether to add sparsity'
     )
+    parser.add_argument(
+        '--save-hf', action='store_true', default=False,
+        help='Whether to save a huggingface model'
+    )
+    parser.add_argument(
+        '--save-delta', action='store_true', default=False,
+    )
     parser.add_argument(
         '--fraction_of_zero', type=float, default=0.99,
         help='Sparsity ratio'
diff --git a/opt_delta_autotune.py b/opt_delta_autotune.py
new file mode 100644
index 0000000..937ab7f
--- /dev/null
+++ b/opt_delta_autotune.py
@@ -0,0 +1,584 @@
+import os
+import copy
+import time
+import json
+import torch
+import pickle
+from gptq import *
+from quant import *
+import torch.nn as nn
+from modelutils import *
+from loguru import logger
+from tensorio import TensorIO, model_packing
+from transformers import AutoTokenizer, AutoModel
+import torchvision.transforms as T
+# from prettytable import PrettyTable
+
+def get_opt(model):
+    """Load an OPT causal LM in float16 with torch's weight-init routines stubbed out."""
+    import torch
+
+    def skip(*args, **kwargs):
+        pass
+
+    # Replace the three initializers with a no-op before the checkpoint is loaded.
+    for init_name in ("kaiming_uniform_", "uniform_", "normal_"):
+        setattr(torch.nn.init, init_name, skip)
+    from transformers import OPTForCausalLM
+
+    opt_model = OPTForCausalLM.from_pretrained(model, torch_dtype=torch.float16)
+    opt_model.seqlen = opt_model.config.max_position_embeddings
+    return opt_model
+
+@torch.no_grad()
+def opt_sequential_delta(model, delta_model, dataloader, dev, tol=0.2):
+    """Per sub-layer, apply to `delta_model` the most compressed (wbit, sparsity) whose loss is <= tol.
+    Layer inputs are propagated through `model`'s own layers; quantization settings come from the
+    module-level `args`. Returns (quantizers, tuned_configs, masks).
+    """
+    search_space = {
+        "wbits": [2,3,4],
+        "sparsities": [0.0, 0.33, 0.5, 0.67, 0.9, 0.95]
+    }
+    base_floats = 16
+    compression_rates = {}
+    masks = {}
+    for wbit in search_space['wbits']:
+        for sparsity in search_space['sparsities']:
+            compression_rates[f'wbit.{wbit}_sparsity.{sparsity}'] = (base_floats / wbit) / (1 - sparsity)
+    compression_rates = sorted(
+        compression_rates.items(),
+        key=lambda x: x[1],
+        reverse=True
+    )
+
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+    layers = model.model.decoder.layers
+    delta_layers = delta_model.model.decoder.layers
+
+    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev)
+    model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev)
+
+    if hasattr(model.model.decoder, "project_out") and model.model.decoder.project_out:
+        model.model.decoder.project_out = model.model.decoder.project_out.to(dev)
+    if hasattr(model.model.decoder, "project_in") and model.model.decoder.project_in:
+        model.model.decoder.project_in = model.model.decoder.project_in.to(dev)
+    layers[0] = layers[0].to(dev)
+
+    dtype = next(iter(model.parameters())).dtype
+    inps = torch.zeros(
+        (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
+    )
+    cache = {"i": 0, "attention_mask": None}
+
+    class Catcher(nn.Module):
+        def __init__(self, module):
+            super().__init__()
+            self.module = module
+
+        def forward(self, inp, **kwargs):
+            inps[cache["i"]] = inp
+            cache["i"] += 1
+            cache["attention_mask"] = kwargs["attention_mask"]
+            raise ValueError  # abort the forward pass once the first layer's input is stored
+
+    layers[0] = Catcher(layers[0])
+    for batch in dataloader:
+        try:
+            model(batch[0].to(dev))
+        except ValueError:
+            pass
+    layers[0] = layers[0].module
+
+    layers[0] = layers[0].cpu()
+    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
+    model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
+    if hasattr(model.model.decoder, "project_out") and model.model.decoder.project_out:
+        model.model.decoder.project_out = model.model.decoder.project_out.cpu()
+    if hasattr(model.model.decoder, "project_in") and model.model.decoder.project_in:
+        model.model.decoder.project_in = model.model.decoder.project_in.cpu()
+    torch.cuda.empty_cache()
+
+    outs = torch.zeros_like(inps)
+    original_outs = torch.zeros_like(inps)
+    attention_mask = cache["attention_mask"]
+
+    print("Ready.")
+    tuned_params = {}
+    tuned_configs = {}
+    quantizers = {}
+    for i in range(len(delta_layers)):
+        layer = delta_layers[i].to(dev)
+        original_layer = layers[i].to(dev)
+        subset = find_layers(layer)
+        for name in subset:
+            tuned_params[f'{i}_{name}'] = {}
+            tuned_configs[f'{i}_{name}'] = {}
+            for wbit in search_space['wbits']:
+                tuned_params[f'{i}_{name}'][f'wbit.{wbit}'] = {
+                    'gptq': GPTQ(subset[name])
+                }
+                tuned_params[f'{i}_{name}'][f'wbit.{wbit}']['gptq'].quantizer = Quantizer()
+                tuned_params[f'{i}_{name}'][f'wbit.{wbit}']['gptq'].quantizer.configure(
+                    wbit, perchannel=True, sym=args.sym, mse=False, trits=args.trits
+                )
+
+        def add_batch(name):
+            def tmp(_, inp, out):
+                for wbit in search_space['wbits']:
+                    tuned_params[f'{i}_{name}'][f'wbit.{wbit}']['gptq'].add_batch(inp[0].data, out.data)
+            return tmp
+
+        handles = []
+        for name in subset:
+            handles.append(subset[name].register_forward_hook(add_batch(name)))
+
+        for j in range(args.nsamples):
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+            original_outs[j] = original_layer(
+                inps[j].unsqueeze(0), attention_mask=attention_mask
+            )[0]
+
+        for h in handles:
+            h.remove()
+
+        for name in subset:
+            logger.info(f"Quantizing {i}.{name} ...")
+            for wbit in search_space['wbits']:
+                losses, _ = tuned_params[f'{i}_{name}'][f'wbit.{wbit}']['gptq'].fasterquant(
+                    percdamp=args.percdamp,
+                    groupsize=args.groupsize,
+                    actorder=args.act_order,
+                    sparsity = search_space['sparsities'],
+                    write=False,
+                )
+                for s_sity in losses.keys():
+                    tuned_configs[f'{i}_{name}'][f'wbit.{wbit}_sparsity.{s_sity}'] = {
+                        'loss': losses[s_sity].item()
+                    }
+                    logger.info(f"wbit: {wbit}; sparsity: {s_sity}; loss: {losses[s_sity].item()}")
+            # within the tol, pick the minimal wbit and maximal sparsity
+            best_wbit = None
+            best_sparsity = None
+            best_loss = None
+            # starting from the maximal compression rate
+            # loop through all compression rates:
+            for cr in compression_rates:
+                config = cr[0]
+                wbit = int(config.split('_')[0].split('.')[1])
+                sparsity = float(config.split('_')[1].replace('sparsity.',''))
+                # find the corresponding loss
+                loss = tuned_configs[f'{i}_{name}'][f'wbit.{wbit}_sparsity.{sparsity}']['loss']
+                # if the loss is within the tolerance
+                if loss <= tol:
+                    best_wbit = wbit
+                    best_sparsity = sparsity
+                    break
+            # if not, pick the lowest compression rate
+            if best_wbit is None:
+                best_wbit = int(compression_rates[-1][0].split('_')[0].split('.')[1])
+                best_sparsity = float(compression_rates[-1][0].split('_')[1].replace('sparsity.',''))
+            best_loss = tuned_configs[f'{i}_{name}'][f'wbit.{best_wbit}_sparsity.{best_sparsity}']['loss']
+            # redo the actual work, and write to the layer
+            logger.info(f"Applying wbit={best_wbit}, sparsity={best_sparsity} ...")
+            loss, mask = tuned_params[f'{i}_{name}'][f'wbit.{best_wbit}']['gptq'].fasterquant(
+                percdamp=args.percdamp,
+                groupsize=args.groupsize,
+                actorder=args.act_order,
+                write=True,
+                sparsity = [best_sparsity],
+            )
+            if mask is not None:
+                masks[f'{i}_{name}'] = mask
+            quantizers["model.decoder.layers.%d.%s" % (i, name)] = tuned_params[f"{i}_{name}"][f'wbit.{best_wbit}']['gptq'].quantizer
+            tuned_params[f"{i}_{name}"][f'wbit.{best_wbit}']['gptq'].free()
+            tuned_configs[f'{i}_{name}']['choice'] = {
+                'best_wbit': best_wbit,
+                'best_sparsity': best_sparsity,
+                'best_loss': best_loss
+            }
+        for j in range(args.nsamples):
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+            original_outs[j] = original_layer(
+                inps[j].unsqueeze(0), attention_mask=attention_mask
+            )[0]
+
+        layers[i] = layer.cpu()
+        del layer
+        for key in tuned_params.keys():
+            if key.startswith(f'{i}_'):
+                for wbit in search_space['wbits']:
+                    del tuned_params[key][f'wbit.{wbit}']['gptq']
+        torch.cuda.empty_cache()
+        # Swap the input buffer with the reference-output buffer. Rebinding only `inps` left it
+        # aliased with `original_outs`, so later layers overwrote their own inputs in place.
+        inps, original_outs = original_outs, inps
+
+    model.config.use_cache = use_cache
+
+    return quantizers, tuned_configs, masks
+
+@torch.no_grad()
+def opt_eval(model, testenc, dev):
+    """Print and return the perplexity of `model` on `testenc`, running decoder layers one at a time on `dev`."""
+    print("Evaluating ...")
+    testenc = testenc.input_ids
+    nsamples = testenc.numel() // model.seqlen
+    # `use_cache` is disabled for the evaluation and restored before returning.
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+    layers = model.model.decoder.layers
+    # Move the embeddings, the optional projections and the first layer to `dev`.
+    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev)
+    model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev)
+    if hasattr(model.model.decoder, "project_out") and model.model.decoder.project_out:
+        model.model.decoder.project_out = model.model.decoder.project_out.to(dev)
+    if hasattr(model.model.decoder, "project_in") and model.model.decoder.project_in:
+        model.model.decoder.project_in = model.model.decoder.project_in.to(dev)
+    layers[0] = layers[0].to(dev)
+
+    dtype = next(iter(model.parameters())).dtype
+    inps = torch.zeros(
+        (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
+    )
+    cache = {"i": 0, "attention_mask": None}
+    # Catcher stores the first layer's input and attention mask, then aborts the forward pass.
+    class Catcher(nn.Module):
+        def __init__(self, module):
+            super().__init__()
+            self.module = module
+
+        def forward(self, inp, **kwargs):
+            inps[cache["i"]] = inp
+            cache["i"] += 1
+            cache["attention_mask"] = kwargs["attention_mask"]
+            raise ValueError
+    # Each seqlen-sized slice of the test ids is pushed through the model up to the Catcher.
+    layers[0] = Catcher(layers[0])
+    for i in range(nsamples):
+        batch = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)].to(dev)
+        try:
+            model(batch)
+        except ValueError:
+            pass
+    layers[0] = layers[0].module
+    # Put the modules moved above back on the CPU.
+    layers[0] = layers[0].cpu()
+    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
+    model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
+    if hasattr(model.model.decoder, "project_out") and model.model.decoder.project_out:
+        model.model.decoder.project_out = model.model.decoder.project_out.cpu()
+    if hasattr(model.model.decoder, "project_in") and model.model.decoder.project_in:
+        model.model.decoder.project_in = model.model.decoder.project_in.cpu()
+    torch.cuda.empty_cache()
+
+    outs = torch.zeros_like(inps)
+    attention_mask = cache["attention_mask"]
+    # Run the layers in order; the output buffer of one layer becomes the input of the next.
+    for i in range(len(layers)):
+        # print(i)
+        layer = layers[i].to(dev)
+        for j in range(nsamples):
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+        layers[i] = layer.cpu()
+        del layer
+        torch.cuda.empty_cache()
+        inps, outs = outs, inps
+    # The final layer norm, output projection (when present) and LM head are applied on `dev`.
+    if model.model.decoder.final_layer_norm is not None:
+        model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(
+            dev
+        )
+    if model.model.decoder.project_out is not None:
+        model.model.decoder.project_out = model.model.decoder.project_out.to(dev)
+    model.lm_head = model.lm_head.to(dev)
+    # Accumulate one negative log-likelihood per slice (mean cross-entropy times seqlen).
+    testenc = testenc.to(dev)
+    nlls = []
+    for i in range(nsamples):
+        hidden_states = inps[i].unsqueeze(0)
+        if model.model.decoder.final_layer_norm is not None:
+            hidden_states = model.model.decoder.final_layer_norm(hidden_states)
+        if model.model.decoder.project_out is not None:
+            hidden_states = model.model.decoder.project_out(hidden_states)
+        lm_logits = model.lm_head(hidden_states)
+        shift_logits = lm_logits[:, :-1, :].contiguous()
+        shift_labels = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)][:, 1:]
+        loss_fct = nn.CrossEntropyLoss()
+        loss = loss_fct(
+            shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
+        )
+        neg_log_likelihood = loss.float() * model.seqlen
+        nlls.append(neg_log_likelihood)
+    ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))
+    print(ppl.item())
+
+    model.config.use_cache = use_cache
+    return ppl.item()
+
+def benchmark(model, input_ids, check=False):
+    """Feed `input_ids` one token at a time, printing per-step and median latency (plus PPL when `check`)."""
+    input_ids = input_ids.to(model.gpus[0] if hasattr(model, "gpus") else DEV)
+    torch.cuda.synchronize()
+    cache = {"past": None}
+    def clear_past(i):
+        def tmp(layer, inp, out):
+            if cache["past"]:
+                cache["past"][i] = None
+
+        return tmp
+    # Every decoder layer gets a forward hook that clears its own entry in the cached past.
+    for i, layer in enumerate(model.model.decoder.layers):
+        layer.register_forward_hook(clear_past(i))
+
+    print("Benchmarking ...")
+
+    if check:
+        loss = nn.CrossEntropyLoss()
+        tot = 0.0
+    # Synchronize every device in `model.gpus`, or the current CUDA device when there is none.
+    def sync():
+        if hasattr(model, "gpus"):
+            for gpu in model.gpus:
+                torch.cuda.synchronize(gpu)
+        else:
+            torch.cuda.synchronize()
+
+    with torch.no_grad():
+        attention_mask = torch.ones((1, input_ids.numel()), device=DEV)
+        times = []
+        for i in range(input_ids.numel()):
+            tick = time.time()
+            out = model(
+                input_ids[:, i].reshape(-1),
+                past_key_values=cache["past"],
+                attention_mask=attention_mask[:, : (i + 1)].reshape((1, -1)),
+            )
+            sync()  # wait for the device before reading the clock
+            times.append(time.time() - tick)
+            print(i, times[-1])
+            if check and i != input_ids.numel() - 1:
+                tot += loss(
+                    out.logits[0].to(DEV), input_ids[:, (i + 1)].to(DEV)
+                ).float()
+            cache["past"] = list(out.past_key_values)
+            del out
+        sync()
+        import numpy as np
+
+        print("Median:", np.median(times))
+        if check:
+            print("PPL:", torch.exp(tot / (input_ids.numel() - 1)).item())
+
+def main(args):
+    """Run the delta auto-tuning pipeline for `args`: compress, optionally save/benchmark, evaluate, export."""
+    print(args)
+    num_params = 0
+    if args.delta and args.wbits < 16:
+        model = get_opt(args.model)
+        model.eval()
+        base_model = get_opt(args.base_model)
+        base_model.eval()
+        dataloader, testloader = get_loaders(
+            args.dataset,
+            nsamples=args.nsamples,
+            seed=args.seed,
+            model=args.model,
+            seqlen=model.seqlen,
+        )
+        original_finetuned_model = copy.deepcopy(model)
+        for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()):
+            finetuned_p.data = (finetuned_p.data - base_p.data).clone()  # `model` now holds the delta
+    else:
+        model = get_opt(args.model)
+        model.eval()
+
+    if args.wbits < 16:
+        if args.delta:
+            quantizers, tuned_params, masks = opt_sequential_delta(
+                original_finetuned_model, model, dataloader, DEV, args.tol
+            )
+            data_dir = os.path.join(".cache", args.model.replace('/', '.'))
+            os.makedirs(data_dir, exist_ok=True)
+            with open(os.path.join(data_dir, f"delta_tol={args.tol}_tuned_params.json"), "w+") as f:
+                json.dump(tuned_params, f)
+            # Each mask is converted to PIL mode "1" and written as a .bmp next to the JSON report.
+            transforms = T.ToPILImage()
+            for key, mask in masks.items():
+                logger.info(f"Saving mask for {key}")
+                binmask = transforms(mask).convert("1")
+                binmask.save(os.path.join(data_dir, f"delta_tol={args.tol}_mask_{key}.bmp"))
+        else:
+            raise NotImplementedError
+
+    if args.delta and args.wbits < 16:
+        # Add the base weights back so `model` holds full weights again.
+        for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()):
+            num_params += torch.numel(finetuned_p.data)
+            finetuned_p.data = (base_p.data + finetuned_p.data).clone()
+
+    if args.save_delta:
+        # NOTE(review): base weights were already added back above, so `model` is no longer the delta here; confirm intended.
+        # torch.save does not create missing directories, so make sure the target exists.
+        os.makedirs("delta_outputs", exist_ok=True)
+        new_weights, scale = model_packing(model, quantizers, bits=args.wbits)
+        torch.save(
+            {
+                "weight": new_weights,
+                "scale": scale,
+            },
+            f"delta_outputs/{args.model.replace('/', '.')}_delta_{args.wbits}bits_sparsify.{args.sparsify_hard_threshold}_{args.fraction_of_zero}foz",
+        )
+
+    if args.benchmark:
+        model = model.to(DEV)
+        input_ids = next(iter(dataloader))[0][:, : args.benchmark]
+        benchmark(model, input_ids, check=args.check)
+    if args.load:
+        exit()
+
+    dataset = args.dataset
+    dataloader, testloader = get_loaders(
+        dataset, seed=args.seed, model=args.model, seqlen=model.seqlen
+    )
+
+    ppl = opt_eval(model, testloader, DEV)
+    print(ppl)
+
+    if args.save_hf:
+        if args.delta:
+            hf_path = f"outputs_exp/{args.model.replace('/', '.')}_delta_autotune_tol={args.tol}"
+        else:
+            hf_path = f"outputs_exp/{args.model.replace('/', '.')}_autotuned_tol={args.tol}"
+        model.save_pretrained(hf_path)
+        tokenizer = AutoTokenizer.from_pretrained(args.model)
+        tokenizer.save_pretrained(hf_path)
+
+if __name__ == "__main__":  # CLI entry point: parse the options below and hand them to main()
+    import argparse
+    from datautils import *  # main() calls get_loaders, which datautils provides
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="lnair/opt-1.3b-wikitext2",
+        help="OPT model to load; pass `facebook/opt-X`.",
+    )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        default="wikitext2",
+        help="Where to extract calibration data from.",
+    )
+    parser.add_argument(
+        "--base-model",
+        type=str,
+        default="facebook/opt-1.3b",
+        help="base OPT model to load",
+    )
+    parser.add_argument(
+        "--tol",
+        type=float,
+        default=0.2,
+        help="Tolerance of the loss per layer",
+    )
+    parser.add_argument(
+        "--seed", type=int, default=0, help="Seed for sampling the calibration data."
+    )
+    parser.add_argument(
+        "--nsamples", type=int, default=128, help="Number of calibration data samples."
+    )
+    parser.add_argument(
+        "--percdamp",
+        type=float,
+        default=0.01,
+        help="Percent of the average Hessian diagonal to use for dampening.",
+    )
+    parser.add_argument(
+        "--wbits",
+        type=int,
+        default=2,
+        choices=[2, 3, 4, 16],  # main() only runs the quantization branch when wbits < 16
+        help="#bits to use for quantization; use 16 for evaluating base model.",
+    )
+    parser.add_argument(
+        "--trits", action="store_true", help="Whether to use trits for quantization."
+    )
+    parser.add_argument(
+        "--groupsize",
+        type=int,
+        default=-1,
+        help="Groupsize to use for quantization; default uses full row.",
+    )
+    parser.add_argument(
+        "--sym", action="store_true", help="Whether to perform symmetric quantization."
+    )
+    parser.add_argument(
+        "--save",
+        type=str,
+        default="",
+        help="Save quantized checkpoint under this name.",
+    )
+    parser.add_argument("--load", type=str, default="", help="Load quantized model.")  # when set, main() calls exit() before the final evaluation
+    parser.add_argument(
+        "--benchmark",
+        type=int,
+        default=0,  # 0 skips benchmarking: main() tests `if args.benchmark`
+        help="Number of tokens to use for benchmarking.",
+    )
+    parser.add_argument(
+        "--check",
+        action="store_true",
+        help="Whether to compute perplexity during benchmarking for verification.",
+    )
+    parser.add_argument(
+        "--new-eval",
+        action="store_true",
+        help="Whether to use the new PTB and C4 eval.",
+    )
+    parser.add_argument(
+        "--faster-kernel",
+        action="store_true",
+        help="Whether to use the new faster kernel for benchmarking.",
+    )
+    parser.add_argument(
+        "--act-order",
+        action="store_true",
+        help="Whether to apply the activation order GPTQ heuristic",
+    )
+    parser.add_argument(  # with wbits < 16 and no --delta, main() raises NotImplementedError
+        "--delta", action="store_true", help="Whether to use delta compression"
+    )
+    parser.add_argument(
+        "--sparsify_hard_threshold", action="store_true", help="Whether to add sparsity"
+    )
+    parser.add_argument(
+        "--save-hf",
+        action="store_true",
+        default=False,
+        help="Whether to save a huggingface model",
+    )
+    parser.add_argument(
+        "--save-delta",  # main() packs the model and torch.save()s it under delta_outputs/ when set
+        action="store_true",
+        default=False,
+    )
+    parser.add_argument(
+        "--fraction_of_zero", type=float, default=0.99, help="Sparsity ratio"
+    )
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=0,
+        help="The rank to use for decomposing each matrices",
+    )
+    args = parser.parse_args()
+
+    # results = PrettyTable()
+
+    main(args)
+
+    print("finished.")
diff --git a/opt_eval_ppl.py b/opt_eval_ppl.py
new file mode 100644
index 0000000..ca290b5
--- /dev/null
+++ b/opt_eval_ppl.py
@@ -0,0 +1,129 @@
+import os
+import json
+import torch
+import torch.nn as nn
+from modelutils import get_opt
+from datautils import get_loaders
+
+BENCHMARK = 2048
+
+nsamples = 128
+
+@torch.no_grad()
+def opt_eval(model, testenc, dev):
+    print('Evaluating ...')
+    # Returns the perplexity of `model` on `testenc`, moving one decoder layer at a time to `dev`.
+    testenc = testenc.input_ids
+    nsamples = testenc.numel() // model.seqlen  # number of full seqlen-token windows
+    # Turn off config.use_cache for the evaluation; the saved value is restored before returning.
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+    layers = model.model.decoder.layers
+    # Only the embeddings, the optional projections and the first layer go to dev for the capture.
+    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev)
+    model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev)
+    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
+        model.model.decoder.project_out = model.model.decoder.project_out.to(dev) 
+    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
+        model.model.decoder.project_in = model.model.decoder.project_in.to(dev) 
+    layers[0] = layers[0].to(dev)
+    # Buffer for the hidden states that reach the first decoder layer, one row per window.
+    dtype = next(iter(model.parameters())).dtype
+    inps = torch.zeros(
+        (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
+    )
+    cache = {'i': 0, 'attention_mask': None}
+    # Stand-in for layer 0: records its input and attention mask, then aborts the forward pass.
+    class Catcher(nn.Module):
+        def __init__(self, module):
+            super().__init__()
+            self.module = module
+        def forward(self, inp, **kwargs):
+            inps[cache['i']] = inp
+            cache['i'] += 1
+            cache['attention_mask'] = kwargs['attention_mask']
+            raise ValueError  # deliberate: stops model(batch) right after the capture
+    layers[0] = Catcher(layers[0])
+    for i in range(nsamples):
+        batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev)
+        try:
+            model(batch)
+        except ValueError:  # raised by Catcher.forward above
+            pass
+    layers[0] = layers[0].module  # unwrap the original first layer
+    # Capture done: move the modules back to the CPU and release cached CUDA memory.
+    layers[0] = layers[0].cpu()
+    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
+    model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
+    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
+        model.model.decoder.project_out = model.model.decoder.project_out.cpu()
+    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
+        model.model.decoder.project_in = model.model.decoder.project_in.cpu()
+    torch.cuda.empty_cache()
+    # Run the decoder one layer at a time; inps and outs swap roles after every layer.
+    outs = torch.zeros_like(inps)
+    attention_mask = cache['attention_mask']
+
+    for i in range(len(layers)):
+        print(i)  # progress: index of the layer being evaluated
+        layer = layers[i].to(dev)
+
+        for j in range(nsamples):
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+        layers[i] = layer.cpu()
+        del layer
+        torch.cuda.empty_cache()
+        inps, outs = outs, inps  # this layer's outputs become the next layer's inputs
+    # The final norm, the optional output projection and the LM head are applied on dev.
+    if model.model.decoder.final_layer_norm is not None:
+        model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(dev)
+    if model.model.decoder.project_out is not None:
+        model.model.decoder.project_out = model.model.decoder.project_out.to(dev)
+    model.lm_head = model.lm_head.to(dev)
+    # Per-window cross-entropy against the next token, scaled by seqlen before summing.
+    testenc = testenc.to(dev)
+    nlls = []
+    for i in range(nsamples):
+        hidden_states = inps[i].unsqueeze(0)
+        if model.model.decoder.final_layer_norm is not None:
+            hidden_states = model.model.decoder.final_layer_norm(hidden_states)
+        if model.model.decoder.project_out is not None:
+            hidden_states = model.model.decoder.project_out(hidden_states)
+        lm_logits = model.lm_head(hidden_states)
+        shift_logits = lm_logits[:, :-1, :].contiguous()  # logits at position t are scored against token t+1
+        shift_labels = testenc[
+            :, (i * model.seqlen):((i + 1) * model.seqlen)
+        ][:, 1:]
+        loss_fct = nn.CrossEntropyLoss()
+        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+        neg_log_likelihood = loss.float() * model.seqlen
+        nlls.append(neg_log_likelihood)
+    ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))  # exp(mean of the per-window losses)
+    model.config.use_cache = use_cache
+    return ppl.item()
+
+models = os.listdir(".cache/models")  # every entry under .cache/models is evaluated as a model
+res = {}  # model name -> perplexity
+# models = [
+#     # 'facebook/opt-1.3b',
+#     # 'facebook/opt-350m', 
+#     'facebook/opt-2.7b', 
+#     # 'lnair/opt-350m-wikitext2',
+#     # 'lnair/opt-1.3b-wikitext2',
+#     'lnair/opt-2.7b-wikitext2'
+# ]
+for model_name in models:
+    dataset = model_name  # the directory name doubles as the dataset name given to get_loaders
+    model_path = os.path.join(".cache", "models", model_name)
+    model = get_opt(model_path)
+    model.to("cuda")
+    print("model loaded")
+    _, testloader = get_loaders(  # only the second returned value (the test data) is used
+        dataset, nsamples=128, seed=0, model=model_path, seqlen=model.seqlen
+    )
+    print("data loaded")
+    ppl = opt_eval(model, testloader, model.device)
+    res[model_name] = ppl
+    print(res)
+    with open("ppl_res.json", "w") as f:  # rewritten after every model with the results so far
+        json.dump(res, f)
\ No newline at end of file
diff --git a/pack_utils.py b/pack_utils.py
new file mode 100644
index 0000000..5c6ace9
--- /dev/null
+++ b/pack_utils.py
@@ -0,0 +1,167 @@
+import math
+import torch
+import numpy as np
+from typing import Any
+from quant import Quantizer
+from safetensors import safe_open
+from safetensors.torch import save_file
+
+def pack_to_bits(weight: torch.Tensor, quantizer: Quantizer, bits: int, groupsize=1024):
+    """Quantize `weight` with `quantizer`'s scale/zero and bit-pack the codes into int32 words.
+
+    Column `idx` of `weight` uses quantization group `idx // groupsize` (`-1`: a single group).
+    Raises ValueError unless `bits` is 2, 3, 4 or 8. Returns the packed codes as an int32 tensor.
+    """
+    if groupsize == -1:
+        groupsize = weight.shape[1]  # one group spanning every column walked by the loop below
+    if bits not in [2,3,4,8]:
+        raise ValueError("bits must be one of [2,3,4,8]")
+    scales = quantizer.scale.t().contiguous()
+    zeros = quantizer.zero.t().contiguous()
+    scale_zeros = zeros * scales
+    intweight = []
+    # The loop reads weight[:, idx], so it must be bounded by the column count (shape[1]).
+    for idx in range(weight.shape[1]):
+        g_idx = idx // groupsize
+        intweight.append(torch.round((weight[:,idx] + scale_zeros[g_idx]) / scales[g_idx]).to(torch.int)[:,None])
+    intweight = torch.cat(intweight, dim=1)
+    intweight = intweight.t().contiguous()
+    intweight = intweight.numpy().astype(np.uint32)
+    qweight = np.zeros(
+            (intweight.shape[0] // 256 * (bits * 8), intweight.shape[1]), dtype=np.uint32
+    )
+    i = 0
+    row = 0
+    while row < qweight.shape[0]:
+        if bits in [2,4,8]:
+            for j in range(i, i + (32//bits)):
+                qweight[row] |= intweight[j] << (bits * (j - i))
+            i += 32//bits
+            row += 1
+        # 3 does not divide 32: 32 codes fill 3 words and two codes straddle a word boundary.
+        elif bits == 3:
+            for j in range(i, i + 10):
+                qweight[row] |= intweight[j] << (3 * (j - i))
+            i += 10
+            qweight[row] |= intweight[i] << 30  # low two bits of this code end the word
+            row += 1
+            qweight[row] |= (intweight[i] >> 2) & 1  # its top bit opens the next word
+            i += 1
+            for j in range(i, i + 10):
+                qweight[row] |= intweight[j] << (3 * (j - i) + 1)
+            i += 10
+            qweight[row] |= intweight[i] << 31  # lowest bit of this code ends the word
+            row += 1
+            qweight[row] |= (intweight[i] >> 1) & 0x3  # its upper two bits open the next word
+            i += 1
+            for j in range(i, i + 10):
+                qweight[row] |= intweight[j] << (3 * (j - i) + 2)
+            i += 10
+            row += 1
+
+    qweight = qweight.astype(np.int32)
+    qweight = torch.from_numpy(qweight)
+    return qweight
+
+def unpack_from_bits(qweight: torch.Tensor, quantizer: Quantizer, bits: int, groupsize=1024):
+    """Unpack int32 words into integer codes and dequantize them with `quantizer`'s scale/zero.
+
+    Dequantized column `idx` uses group `idx // groupsize` (`-1`: every column in group 0).
+    Raises ValueError unless `bits` is 2, 3, 4 or 8. Returns a float32 tensor.
+    """
+    if bits not in [2,3,4,8]:
+        raise ValueError("bits must be one of [2,3,4,8]")
+
+    scales = quantizer.scale.t().contiguous()
+    zeros = quantizer.zero.t().contiguous()
+    scale_zeros = zeros * scales
+    qweight = qweight.numpy().astype(np.uint32)
+
+    intweight = np.zeros(
+            (qweight.shape[0] // (bits * 8) * 256, qweight.shape[1]), dtype=np.uint32
+    )
+    i = 0
+    row = 0
+    while row < qweight.shape[0]:
+        if bits in [2,4,8]:
+            for j in range(i, i+ 32 // bits):
+                intweight[j] = (qweight[row] >> (bits * (j - i))) & ((1 << bits) - 1)
+            i += 32 // bits
+            row += 1
+        elif bits == 3:
+            for j in range(i, i+10):
+                intweight[j] = (qweight[row] >> (3 * (j - i))) & 7
+            i += 10
+            intweight[i] = (qweight[row] >> 30) & 3  # bits 30-31 hold TWO bits of this code
+            row += 1
+            intweight[i] |= (qweight[row] & 1) << 2  # its third bit opens the next word
+            i += 1
+            for j in range(i, i+10):
+                intweight[j] = (qweight[row] >> (3 * (j - i) + 1)) & 7
+            i += 10
+            intweight[i] = (qweight[row] >> 31) & 1  # bit 31 holds the lowest bit of this code
+            row += 1
+            intweight[i] |= (qweight[row] & 3) << 1  # its upper two bits open the next word
+            i += 1
+            for j in range(i, i+10):
+                intweight[j] = (qweight[row] >> (3 * (j - i) + 2)) & 7
+            i += 10
+            row += 1
+
+    intweight = intweight.astype(np.int32)
+    intweight = torch.from_numpy(intweight).t().contiguous()
+
+    weight = []
+    for idx in range(intweight.shape[1]):  # columns: the loop reads intweight[:, idx]
+        g_idx = 0 if groupsize == -1 else idx // groupsize
+        weight.append((intweight[:,idx] * scales[g_idx] - scale_zeros[g_idx]).to(torch.float32)[:,None])
+
+    weight = torch.cat(weight, dim=1)
+    return weight
+
+class SparseTensor():
+    """Dense tensor that can be saved as the flat indices/values of its non-zero entries."""
+
+    def __init__(self, m: torch.Tensor, format: str, minifloats: int=-1) -> None:
+        self.m = m
+        self.size = m.size()
+        self.minifloats = minifloats
+        self.format = format
+
+    def _convert(self):
+        """Build `self.payload` from a local flattened tensor, leaving `self.m` untouched."""
+        flat = self.m.flatten()
+        indices = torch.nonzero(flat)
+        values = flat[indices]
+        self.payload = {
+            'indices': indices,
+            'values': values,
+            'size': torch.tensor(self.size),
+        }
+
+    def restore(self):
+        """Rebuild `self.m` from `self.payload`, which `_convert` must have set first."""
+        self.m = torch.zeros(math.prod(self.size), dtype=self.payload['values'].dtype)
+        self.m[self.payload['indices']] = self.payload['values']
+        self.m = self.m.reshape(self.size)
+
+    def to_disk(self, path):
+        """Convert to the sparse payload and write it to `path` with `save_file`."""
+        self._convert()
+        save_file(self.payload, path)
+
+    @classmethod
+    def from_disk(cls, path):
+        """Read the indices/values/size entries from `path` and return the dense tensor in `cls`."""
+        tensors = {}
+        with safe_open(path, framework='pt', device='cpu') as f:
+            for key in f.keys():
+                tensors[key] = f.get_tensor(key)
+        size = tensors['size'].tolist()  # plain ints for math.prod and reshape
+        m = torch.zeros(math.prod(size), dtype=tensors['values'].dtype)
+        m[tensors['indices']] = tensors['values']
+        return cls(m.reshape(size), 'sparse', minifloats=-1)
+
+    @property
+    def tensor(self):
+        return self.m
\ No newline at end of file
diff --git a/pack_utils_test.py b/pack_utils_test.py
new file mode 100644
index 0000000..563f31d
--- /dev/null
+++ b/pack_utils_test.py
@@ -0,0 +1,20 @@
+import torch
+from quant import Quantizer
+from opt_delta import hard_threshold
+from safetensors.torch import save_file
+
+QUANTIZED_BITS = 4
+
+if __name__ == "__main__":
+    # Persist one random 1x1 weight, then print what the quantizer turns it into.
+    torch.set_printoptions(precision=4)
+    sample = torch.rand((1, 1), dtype=torch.float32)
+    save_file({'wb1': sample}, '.cache/original_b.safetensor')
+    # Quantizer configured for QUANTIZED_BITS with perchannel=True, sym=False, mse=False.
+    weight_quantizer = Quantizer()
+    weight_quantizer.configure(QUANTIZED_BITS, perchannel=True, sym=False, mse=False)
+    weight_quantizer.find_params(sample, weight=True)
+    # Pass the same tensor through the quantizer and show the result.
+    quantized = weight_quantizer.quantize(sample)
+    print(quantized)
+    
\ No newline at end of file
diff --git a/playground.py b/playground.py
new file mode 100644
index 0000000..3b15c6d
--- /dev/null
+++ b/playground.py
@@ -0,0 +1,74 @@
+import math
+import torch
+import numpy as np
+import torchvision.transforms as T
+
+def bin_array(num, m):
+    """Return the binary digits of a non-negative integer, zero-filled on the left to m bits, as int8."""
+    return np.array([int(bit) for bit in np.binary_repr(num).zfill(m)], dtype=np.int8)
+
+def hard_threshold(x, fraction_of_zero=0.1):
+    """Zero the entries of x whose magnitude does not exceed the one at sorted rank
+    int(numel * fraction_of_zero); x itself is returned when fraction_of_zero is 0.
+
+    Side effect: the keep-mask is also saved to '.io/binmask.bmp' as a mode-'1' image."""
+    if fraction_of_zero == 0:
+        return x
+    sorted_magnitudes, _ = torch.sort(x.view(-1).abs().clone())
+    cutoff = sorted_magnitudes[int(torch.numel(x) * fraction_of_zero)]
+    keep = x.abs().clone().gt(cutoff).type(torch.FloatTensor)
+    # Route the mask through a PIL image so it can be written out as a bitmap.
+    mask_image = T.ToPILImage()(keep)
+    mask_image = mask_image.convert('1')
+    mask_image.save('.io/binmask.bmp')
+    return keep * x
+
+def packing_indices(x):
+    """Pack the flat indices of x's non-zero entries into one row of uint8 bytes per index.
+
+    Every index is zero-filled on the left to a whole number of bytes, so np.packbits adds no
+    trailing pad bits and unpacking_indices decodes the exact index values.
+    """
+    matrix_size = x.shape[0] * x.shape[1]
+    indices = torch.nonzero(x.clone().flatten())
+    # Bits needed for the largest flat index (matrix_size - 1), rounded up to a multiple of 8.
+    bit_width = 8 * math.ceil(max(1, (matrix_size - 1).bit_length()) / 8)
+    indices_binary = torch.tensor(np.array([bin_array(i, bit_width) for i in indices]))
+    packed_indices = torch.tensor(np.packbits(indices_binary, axis=1), dtype=torch.uint8)
+    return packed_indices
+
+def unpacking_indices(packed_indices):
+    """Expand each row of packed bytes into bits and decode the row as one base-2 integer."""
+    bit_rows = np.unpackbits(packed_indices, axis=1)
+    decoded = [int("".join(str(bit) for bit in row), 2) for row in bit_rows]
+    # Built through np.array first: one integer per packed row.
+    return torch.tensor(np.array(decoded))
+
+def compression_rate_calc(msize, wbit, sparsity):
+    """Print and return the bits used by 16-bit dense storage and by sparse value/index storage.
+
+    `sparsity` multiplies msize to give the number of stored entries."""
+    kept = msize * sparsity
+    original_bit_used, to_store_value, to_store_index = msize * 16, kept * wbit, kept * math.log2(msize)
+    for label, bits in (("original_bit_used", original_bit_used), ("to_store_value", to_store_value), ("to_store_index", to_store_index)):
+        print(label + ": ", bits)
+    print("compression rate: ", original_bit_used / (to_store_value + to_store_index))
+    return original_bit_used, to_store_value, to_store_index
+
+if __name__=="__main__":
+    base_floats = 16  # NOTE(review): not used below
+    wbits = 3  # NOTE(review): not used below; the last call hard-codes 3
+    m_size = 2048
+    nonsparsity = 0.9  # passed to hard_threshold as fraction_of_zero
+    x = torch.randn((m_size, m_size), dtype=torch.float16)
+    torch.save(x, ".io/x.pt")
+    x = hard_threshold(x, nonsparsity)
+    # 10% x 4M indices -> 800k on disk
+    packed_indices = packing_indices(x)
+    print(packed_indices.shape)
+    print(packed_indices.shape)  # NOTE(review): duplicate of the previous line
+    print(packed_indices.dtype)
+    torch.save(packed_indices, ".io/packed_indices.pt")
+    unpacked_indices = unpacking_indices(packed_indices)  # result is not used afterwards
+    
+    compression_rate_calc(2048*2048, 3, 0.1)  # msize=2048*2048, wbit=3, 0.1 of the entries stored
\ No newline at end of file
diff --git a/ppl_res.json b/ppl_res.json
new file mode 100644
index 0000000..f86c4f0
--- /dev/null
+++ b/ppl_res.json
@@ -0,0 +1 @@
+{"fact_verification": 7.487515449523926}
\ No newline at end of file
diff --git a/quant.py b/quant.py
index f8cc1b7..386845c 100644
--- a/quant.py
+++ b/quant.py
@@ -1,6 +1,6 @@
 import math
-import numpy as np
 import torch
+import numpy as np
 import torch.nn as nn
 
 def quantize(x, scale, zero, maxq):
@@ -83,7 +83,12 @@ def find_params(self, x, weight=False):
                 xmax1 = p * xmax
                 scale1 = (xmax1 - xmin1) / self.maxq
                 zero1 = torch.round(-xmin1 / scale1) if not self.sym else self.zero
-                q = quantize(x, scale1.unsqueeze(1), zero1.unsqueeze(1), self.maxq)
+                q = quantize(
+                    x,
+                    scale1.unsqueeze(1),
+                    zero1.unsqueeze(1),
+                    self.maxq
+                )
                 q -= x
                 q.abs_()
                 q.pow_(self.norm)
@@ -287,7 +292,7 @@ def pack(self, linear, scales, zeros):
                 raise NotImplementedError("Only 2,3,4,8 bits are supported.")
                 
         qweight = qweight.astype(np.int32)
-        self.qweight = torch.from_numpy(qweight) 
+        self.qweight = torch.from_numpy(qweight)
         
         zeros -= 1;
         zeros = zeros.numpy().astype(np.uint32)
diff --git a/quant_cuda.cpp b/quant_cuda.cpp
index 1bf0894..ff97571 100644
--- a/quant_cuda.cpp
+++ b/quant_cuda.cpp
@@ -10,7 +10,7 @@ void vecquant3matmul_cuda(
 void vecquant3matmul_faster_cuda(
   torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
   torch::Tensor scales, torch::Tensor zeros
-); 
+);
 
 void vecquant3matmul(
   torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
diff --git a/replay.py b/replay.py
new file mode 100644
index 0000000..e69de29
diff --git a/requirements.txt b/requirements.txt
index 7417000..79d456c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 transformers
 loguru
-datasets
\ No newline at end of file
+datasets
+safetensors
\ No newline at end of file
diff --git a/scripts/gptq_delta.sh b/scripts/gptq_delta.sh
new file mode 100644
index 0000000..ec3d1f2
--- /dev/null
+++ b/scripts/gptq_delta.sh
@@ -0,0 +1,10 @@
+python opt_delta_autotune.py \
+    --dataset answer_verification \
+    --base-model facebook/opt-1.3b \
+    --model .cache/models/answer_verification \
+    --delta \
+    --wbits 2 \
+    --tol 2 \
+    --save-delta \
+    --save-hf \
+    --groupsize 1024
diff --git a/scripts/lr_quant.sh b/scripts/lr_quant.sh
index e462a64..2665e7b 100644
--- a/scripts/lr_quant.sh
+++ b/scripts/lr_quant.sh
@@ -3,27 +3,7 @@ python cli.py \
     --target-model lnair/opt-1.3b-wikitext2 \
     --base-model facebook/opt-1.3b \
     --delta \
-    --rank 32 \
-    --save outputs/ \
-    --nsamples 128 \
-    --wbits 8
-
-python cli.py \
-    --dataset wikitext2 \
-    --target-model lnair/opt-1.3b-wikitext2 \
-    --base-model facebook/opt-1.3b \
-    --delta \
-    --rank 32 \
-    --save outputs/ \
-    --nsamples 128 \
-    --wbits 4
-
-python cli.py \
-    --dataset wikitext2 \
-    --target-model lnair/opt-1.3b-wikitext2 \
-    --base-model facebook/opt-1.3b \
-    --delta \
-    --rank 64 \
+    --rank 16 \
     --save outputs/ \
     --nsamples 128 \
     --wbits 8
@@ -36,4 +16,5 @@ python cli.py \
     --rank 16 \
     --save outputs/ \
     --nsamples 128 \
+    --decompose-only \
     --wbits 8
\ No newline at end of file
diff --git a/scripts/lr_quant_2.sh b/scripts/lr_quant_2.sh
new file mode 100644
index 0000000..2250e30
--- /dev/null
+++ b/scripts/lr_quant_2.sh
@@ -0,0 +1,20 @@
+python cli.py \
+    --dataset wikitext2 \
+    --target-model lnair/opt-1.3b-wikitext2 \
+    --base-model facebook/opt-1.3b \
+    --delta \
+    --rank 32 \
+    --save outputs/ \
+    --nsamples 128 \
+    --decompose-only \
+    --wbits 8
+
+python cli.py \
+    --dataset wikitext2 \
+    --target-model lnair/opt-1.3b-wikitext2 \
+    --base-model facebook/opt-1.3b \
+    --delta \
+    --rank 32 \
+    --save outputs/ \
+    --nsamples 128 \
+    --wbits 8
\ No newline at end of file
diff --git a/scripts/lr_quant_350m.sh b/scripts/lr_quant_350m.sh
new file mode 100644
index 0000000..4710196
--- /dev/null
+++ b/scripts/lr_quant_350m.sh
@@ -0,0 +1,10 @@
+python cli.py \
+    --dataset wikitext2 \
+    --target-model lnair/opt-350m-wikitext2 \
+    --base-model facebook/opt-350m \
+    --delta \
+    --rank 32 \
+    --save outputs/ \
+    --nsamples 128 \
+    --decompose-only \
+    --wbits 8
diff --git a/scripts/opt_delta_exp.sh b/scripts/opt_delta_exp.sh
new file mode 100644
index 0000000..2b41218
--- /dev/null
+++ b/scripts/opt_delta_exp.sh
@@ -0,0 +1,10 @@
+python opt_delta_autotune.py \
+    --dataset wikitext2 \
+    --base-model facebook/opt-1.3b \
+    --model lnair/opt-1.3b-wikitext2 \
+    --delta \
+    --wbits 2 \
+    --tol 2 \
+    --save-delta \
+    --save-hf \
+    --groupsize 1024
\ No newline at end of file
diff --git a/scripts/playground.ipynb b/scripts/playground.ipynb
index 88e2175..86c66d6 100644
--- a/scripts/playground.ipynb
+++ b/scripts/playground.ipynb
@@ -1,151 +1,11 @@
 {
  "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "seed=42\n",
-    "target_model_name = \"lnair/opt-1.3b-wikitext2\"\n",
-    "base_model_name = \"facebook/opt-1.3b\"\n",
-    "n_samples = 128\n",
-    "dataset = 'wikitext2'\n",
-    "import sys\n",
-    "sys.path.append(\"..\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/xiayao/miniconda3/envs/fmzip/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      "  from .autonotebook import tqdm as notebook_tqdm\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "OPTForCausalLM(\n",
-       "  (model): OPTModel(\n",
-       "    (decoder): OPTDecoder(\n",
-       "      (embed_tokens): Embedding(50272, 2048, padding_idx=1)\n",
-       "      (embed_positions): OPTLearnedPositionalEmbedding(2050, 2048)\n",
-       "      (final_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n",
-       "      (layers): ModuleList(\n",
-       "        (0-23): 24 x OPTDecoderLayer(\n",
-       "          (self_attn): OPTAttention(\n",
-       "            (k_proj): Linear(in_features=2048, out_features=2048, bias=True)\n",
-       "            (v_proj): Linear(in_features=2048, out_features=2048, bias=True)\n",
-       "            (q_proj): Linear(in_features=2048, out_features=2048, bias=True)\n",
-       "            (out_proj): Linear(in_features=2048, out_features=2048, bias=True)\n",
-       "          )\n",
-       "          (activation_fn): ReLU()\n",
-       "          (self_attn_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n",
-       "          (fc1): Linear(in_features=2048, out_features=8192, bias=True)\n",
-       "          (fc2): Linear(in_features=8192, out_features=2048, bias=True)\n",
-       "          (final_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n",
-       "        )\n",
-       "      )\n",
-       "    )\n",
-       "  )\n",
-       "  (lm_head): Linear(in_features=2048, out_features=50272, bias=False)\n",
-       ")"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from modelutils import get_opt\n",
-    "base_model = get_opt(base_model_name)\n",
-    "target_model = get_opt(target_model_name)\n",
-    "base_model.to('cuda')\n",
-    "target_model.to('cuda')\n",
-    "base_model.eval()\n",
-    "target_model.eval()"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Found cached dataset wikitext (/home/xiayao/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)\n",
-      "Found cached dataset wikitext (/home/xiayao/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)\n"
-     ]
-    }
-   ],
-   "source": [
-    "from datautils import get_loaders\n",
-    "trainloader, loader_enc = get_loaders(\n",
-    "    dataset,\n",
-    "    nsamples = n_samples,\n",
-    "    seed=seed,\n",
-    "    model=target_model_name,\n",
-    "    seqlen=base_model.seqlen,\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from cli import quantize_with_lowrank\n",
-    "r_quantizer, l_quantizer, lr_tensors = quantize_with_lowrank(\n",
-    "    base_model,\n",
-    "    target_model,\n",
-    "    trainloader,\n",
-    "    32\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "",
-     "evalue": "",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
-     ]
-    }
-   ],
-   "source": [
-    "import torch\n",
-    "from safetensors import safe_open\n",
-    "from safetensors.torch import save_file\n",
-    "\n",
-    "# iterate over all keys in lr_tensors\n",
-    "for k in lr_tensors.keys():\n",
-    "    lr_tensors[k] = lr_tensors[k].contiguous() # make sure they are contiguous\n",
-    "# save them to a file\n",
-    "\n",
-    "save_file(lr_tensors, \"model.safetensors\")"
-   ]
   }
  ],
  "metadata": {
diff --git a/submit.py b/submit.py
new file mode 100644
index 0000000..1830099
--- /dev/null
+++ b/submit.py
@@ -0,0 +1,18 @@
+import os
+model_relations = {
+    #'facebook/opt-350m': ['lnair/opt-350m-wikitext2'],
+    # 'facebook/opt-1.3b': ['lnair/opt-1.3b-wikitext2'],
+    # 'facebook/opt-2.7b': ['lnair/opt-2.7b-wikitext2'],
+    'facebook/opt-6.7b': ['mit-han-lab/opt-6.7b-smoothquant'],
+    'facebook/opt-1.3b': ['facebook/opt-iml-1.3b', 'facebook/opt-iml-max-1.3b', 'mit-han-lab/opt-1.3b-smoothquant', 'pszemraj/opt-peter-1.3B', 'opentensor/bt-opt-1.3b']
+}
+# Values swept for --tol; each key above is passed as --base-model, each list entry as --model.
+tols = [4.5, 5, 6.5, 8.0]
+
+os.system("ts -S 7")
+
+# Issue one `ts` command per (base model, target model, tolerance) combination.
+for model, target_models in model_relations.items():
+    for target_model in target_models:
+        for tol in tols:
+            os.system(f"TS_VISIBLE_DEVICES=0,2,3,4,5,6,7 ts --gpus 1 python opt_delta_autotune.py --dataset wikitext2 --delta --tol {tol} --model {target_model} --base-model {model} --save-hf --groupsize 1024")
\ No newline at end of file
diff --git a/tensorio.py b/tensorio.py
new file mode 100644
index 0000000..c5526f8
--- /dev/null
+++ b/tensorio.py
@@ -0,0 +1,55 @@
+import math
+import torch
+from safetensors import safe_open
+from safetensors.torch import save_model
+from modelutils import find_layers
+from pack_utils import pack_to_bits
+from compress_utils import compress_flexible_nbits, decompress_flexible_nbits
+
+class TensorIO():
+    """Collect tensors as sparse (indices, values, size) triples and move them to/from disk."""
+
+    def __init__(self, format: str, tensors=None) -> None:
+        self.format = format
+        self.tensors = {} if tensors is None else tensors
+
+    def add_tensor(self, idx, tensor):
+        """Store the non-zero entries of `tensor` under the keys `{idx}_indices/_values/_size`."""
+        size = torch.tensor(tensor.size())  # taken before flattening so the shape can be restored
+        tensor = tensor.flatten()
+        indices = torch.nonzero(tensor)
+        self.tensors[f"{idx}_indices"] = indices
+        self.tensors[f"{idx}_values"] = tensor[indices]
+        self.tensors[f"{idx}_size"] = size
+
+    def to_disk(self, path):
+        """Write the stored tensors as a safetensors file, the format `from_disk` opens."""
+        from safetensors.torch import save_file  # local: the module itself only imports save_model
+        save_file(self.tensors, path)
+
+    @classmethod
+    def from_disk(cls, path):
+        """Read a safetensors file and rebuild one dense tensor per stored idx."""
+        tensors = {}
+        with safe_open(path, framework='pt', device='cpu') as f:
+            for key in f.keys():
+                tensors[key] = f.get_tensor(key)
+        # Each tensor was saved as three entries; recover its idx from the "<idx>_size" key.
+        # The raw entries stay in the dict next to the rebuilt tensors.
+        for idx in [key[:-len("_size")] for key in tensors if key.endswith("_size")]:
+            size = tensors[f"{idx}_size"].tolist()
+            m = torch.zeros(math.prod(size), dtype=tensors[f"{idx}_values"].dtype)
+            m[tensors[f"{idx}_indices"]] = tensors[f"{idx}_values"]
+            tensors[idx] = m.reshape(size)  # dense result, keyed by the bare idx
+        return cls('sparse', tensors=tensors)
+
+def model_packing(model, quantizers, bits, reformat='none'):
+    """Compress each quantized layer's weight with compress_flexible_nbits; return the last (x, scale)."""
+    found = find_layers(model)
+    targets = {n: found[n] for n in quantizers}
+    candidates = find_layers(model, )
+    print('Packing ...')
+    for name in (n for n in candidates if n in quantizers):
+        quantizers[name] = quantizers[name].cpu()  # NOTE(review): x/scale from earlier layers are overwritten below
+        x, scale = compress_flexible_nbits(targets[name].weight.data.cuda(), bits)
+    return x, scale
\ No newline at end of file
diff --git a/to_hf.py b/to_hf.py
index efd25d2..033eb79 100644
--- a/to_hf.py
+++ b/to_hf.py
@@ -10,8 +10,8 @@
 base_model = get_opt(base_model_name)
 
 target_model = deepcopy(base_model)
-
-tensors = load_lr_tensors("outputs/model.safetensors")
+MODEL_ID = "lnair.opt-1.3b-wikitext2-r32-w8-decompose.True-lr"
+tensors = load_lr_tensors(f"outputs/{MODEL_ID}.safetensors")
 
 target_layers = target_model.model.decoder.layers
 
@@ -26,6 +26,6 @@
         layer[layer_id].weight.data = new_weight
 
 # save target model as HF
-target_model.save_pretrained("outputs/lnair-opt-1.3b-wikitext2-r32-w8")
+target_model.save_pretrained(f"outputs/{MODEL_ID}")
 tokenizer = AutoTokenizer.from_pretrained(base_model_name)
-tokenizer.save_pretrained("outputs/lnair-opt-1.3b-wikitext2-r32-w8")
\ No newline at end of file
+tokenizer.save_pretrained(f"outputs/{MODEL_ID}")
\ No newline at end of file
diff --git a/utilities/analyze.py b/utilities/analyze.py
new file mode 100644
index 0000000..57c69d1
--- /dev/null
+++ b/utilities/analyze.py
@@ -0,0 +1,16 @@
+import json
+import matplotlib.pyplot as plt
+
+with open(".cache/lnair.opt-350m-wikitext2_delta_tol=2.0.json", "r") as fp:  # hard-coded input; presumably keyed by layer name
+    data = json.load(fp)
+
+all_best_losses = []  # one 'best_loss' value per top-level entry of the JSON
+for layer_name in data.keys():
+    best_loss = data[layer_name]['choice']['best_loss']
+    all_best_losses.append(best_loss)
+    if (best_loss > 100):  # report entries whose best loss exceeds 100
+        print(f"{layer_name} large loss!")
+print(all_best_losses)
+# plot a 100-bin histogram of the best losses and save it as a PNG under .cache/
+plt.hist(all_best_losses, bins=100)
+plt.savefig('.cache/lnair.opt-350m-wikitext2_delta_tol=2.0.png')
\ No newline at end of file
diff --git a/utilities/compression_rate_estimator.py b/utilities/compression_rate_estimator.py
new file mode 100644
index 0000000..2b8a564
--- /dev/null
+++ b/utilities/compression_rate_estimator.py
@@ -0,0 +1,49 @@
+import json
+import math
+from modelutils import get_opt, find_layers
+from compression_scripts.model_utils import get_opt, find_layers
+
+base_floats = 16
+
+def calc_compression(path: str, base_model: str):
+    """Estimate the compression rate and mean sparsity of a tuned model.
+
+    `path` is a JSON file with one "<layer index>_<sublayer name>" entry per
+    sublayer, each holding "choice" -> "best_sparsity" / "best_wbit";
+    `base_model` is handed to get_opt. Raises ValueError on a missing entry.
+    """
+    model = get_opt(base_model)
+    with open(path, "r") as f:
+        data = json.load(f)
+
+    dense_bits = 0
+    packed_bits = 0
+    sparsities = []
+    for i, layer in enumerate(model.model.decoder.layers):
+        subset = find_layers(layer)
+        for name in subset:
+            numel = subset[name].weight.data.numel()
+            dense_bits += numel * base_floats
+            key = f"{i}_{name}"
+            if key not in data:
+                raise ValueError(f"Layer {i}_{name} not found in {path}")
+            choice = data[key]["choice"]
+            # save them as indices + values pair
+            nonzeros = (1 - choice["best_sparsity"]) * numel
+            # bits spent on the values
+            layer_bits = nonzeros * choice["best_wbit"]
+            # bits spent on the indices
+            layer_bits += nonzeros * 2 * math.log2(numel) * 8
+            sparsities.append(choice["best_sparsity"])
+            packed_bits += layer_bits
+
+    return {
+        'compresion_rate': dense_bits / packed_bits,
+        'sparsity': sum(sparsities) / len(sparsities),
+    }
+
+if __name__=="__main__":  # script entry: print the stats for one hard-coded tuning result
+    path = ".cache/lnair.opt-1.3b-wikitext2_delta_tol=1.0.json"  # JSON file read by calc_compression
+    base_model = "facebook/opt-1.3b"  # passed to get_opt inside calc_compression
+    stats = calc_compression(path, base_model)
+    print(stats)
\ No newline at end of file
diff --git a/utilities/convert_to_hf.py b/utilities/convert_to_hf.py
index 8111506..36f7262 100644
--- a/utilities/convert_to_hf.py
+++ b/utilities/convert_to_hf.py
@@ -1,16 +1,12 @@
+import os
 import torch
 import torch.nn as nn
 
 from transformers import GPTJForCausalLM
-
 from transformers import AutoConfig, AutoTokenizer
-
 from transformers.modeling_utils import no_init_weights
-import os
-
 
 def create_emtpy_gptj(config):
-
     import torch
     import torch.nn as nn
 
diff --git a/utilities/cr_cal.py b/utilities/cr_cal.py
new file mode 100644
index 0000000..e69de29
diff --git a/utilities/to_csv.py b/utilities/to_csv.py
new file mode 100644
index 0000000..db809a2
--- /dev/null
+++ b/utilities/to_csv.py
@@ -0,0 +1,19 @@
+import json
+import pandas as pd
+# Load the mapping from model name to perplexity.
+with open('ppl_res.json') as f:
+    res = json.load(f)
+# convert to csv
+sizes_group = ['350m', '1.3b', '2.7b']
+results = [
+    {'model': key, 'perplexity': ppl}
+    for key, ppl in res.items()
+]
+df = pd.DataFrame(results)
+
+# pivot table such that columns is different models, rows is different perplexity
+for size in sizes_group:
+    # regex=False: the size tags contain '.', which as a regex matches any character
+    subdf = df[df['model'].str.contains(size, regex=False)]
+    subdf = subdf.pivot_table(values='perplexity', columns='model')
+    subdf.to_csv(f'ppl_res_{size}.csv', index=False)
\ No newline at end of file
diff --git a/utilities/tuning_analyser.py b/utilities/tuning_analyser.py
new file mode 100644
index 0000000..a7a658b
--- /dev/null
+++ b/utilities/tuning_analyser.py
@@ -0,0 +1,6 @@
+import json
+path = ".cache/lnair.opt-350m-wikitext2_delta_tol=0.2.json"  # hard-coded JSON file to load
+
+with open(path, "r") as fp:
+    data = json.load(fp)  # parsed content is kept in `data`; the lines shown here do nothing further with it
+