def hard_threshold(x, fraction_of_zero=0.1):
    """Zero out the smallest-magnitude entries of ``x``.

    The threshold is the element at index ``int(numel * fraction_of_zero)``
    of the ascending-sorted absolute values; every entry whose magnitude is
    less than or equal to that threshold is set to zero.

    Args:
        x: tensor to sparsify (any device, any shape).
        fraction_of_zero: fraction of entries used to locate the threshold.
            ``0`` returns ``x`` itself, untouched.

    Returns:
        A tensor shaped like ``x`` with small entries zeroed.
    """
    if fraction_of_zero == 0 or x.numel() == 0:
        return x
    magnitudes = x.abs()
    sorted_magnitudes, _ = torch.sort(magnitudes.reshape(-1))
    num_params = sorted_magnitudes.numel()
    # Clamp so that fraction_of_zero == 1.0 picks the largest magnitude
    # instead of indexing one past the end of the sorted values.
    thresh_index = min(int(num_params * fraction_of_zero), num_params - 1)
    threshold = sorted_magnitudes[thresh_index]
    # Build the mask in x's own dtype and on x's own device; the previous
    # hard-coded torch.cuda.HalfTensor cast broke on CPU tensors.
    mask = magnitudes.gt(threshold).to(x.dtype)
    return mask * x


class GPTQ:
    """GPTQ-style quantizer for one layer that also reports output error.

    ``self.quantizer`` is not created here; callers attach it after
    construction and before calling :meth:`fasterquant`.
    """

    def __init__(self, layer):
        """Record the layer and allocate an empty Hessian accumulator.

        Args:
            layer: module exposing a ``weight`` tensor (``nn.Linear``,
                ``nn.Conv2d`` or ``transformers.Conv1D`` get special
                handling of the weight layout).
        """
        self.layer = layer
        self.original_weight = layer.weight.data.clone()
        self.dev = self.layer.weight.device
        W = layer.weight.data.clone()
        if isinstance(self.layer, nn.Conv2d):
            W = W.flatten(1)
        if isinstance(self.layer, transformers.Conv1D):
            W = W.t()
        self.rows = W.shape[0]
        self.columns = W.shape[1]
        self.H = torch.zeros((self.columns, self.columns), device=self.dev)
        self.nsamples = 0

    def add_batch(self, inp, out):
        """Fold one batch of layer inputs into the running Hessian ``H``.

        The most recent ``inp``/``out`` pair is also kept on the instance
        because :meth:`fasterquant` uses it to measure the output error.
        """
        self.inp1 = inp
        self.out1 = out
        if len(inp.shape) == 2:
            inp = inp.unsqueeze(0)
        tmp = inp.shape[0]
        if isinstance(self.layer, nn.Linear):
            if len(inp.shape) == 3:
                inp = inp.reshape((-1, inp.shape[-1]))
            inp = inp.t()

        # Running average: rescale the old sum, then add this batch's
        # (2 / nsamples) * inp @ inp.T contribution.
        self.H *= self.nsamples / (self.nsamples + tmp)
        self.nsamples += tmp
        inp = math.sqrt(2 / self.nsamples) * inp.float()
        self.H += inp.matmul(inp.t())

    def fasterquant(
        self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, write=True, sparsity=None
    ):
        """Quantize the layer weight and report the resulting output error.

        Args:
            blocksize: number of columns processed per block.
            percdamp: damping added to the Hessian diagonal, as a fraction
                of its mean.
            groupsize: when not ``-1``, quantizer parameters are re-fitted
                every ``groupsize`` columns.
            actorder: process columns in order of decreasing Hessian diagonal.
            write: when true, the accumulated Hessian is consumed and the
                new weight is written into the layer. When false this is a
                dry run and the instance can be reused.
            sparsity: optional iterable of ``fraction_of_zero`` values; each
                is applied to the quantized weight with
                :func:`hard_threshold`.

        Returns:
            dict mapping each sparsity value (or ``0`` when ``sparsity`` is
            ``None``) to the summed squared difference between the layer
            output recomputed from the stored input and the stored output.
        """
        W = self.layer.weight.data.clone()
        if isinstance(self.layer, nn.Conv2d):
            W = W.flatten(1)
        if isinstance(self.layer, transformers.Conv1D):
            W = W.t()
        W = W.float()

        if not self.quantizer.ready():
            self.quantizer.find_params(W, weight=True)

        if write:
            H = self.H
            del self.H
        else:
            # Dry run: work on a copy. The in-place edits below (dead
            # columns, damping) would otherwise accumulate in self.H and
            # change the result of every later call.
            H = self.H.clone()
        dead = torch.diag(H) == 0
        H[dead, dead] = 1
        W[:, dead] = 0

        if actorder:
            perm = torch.argsort(torch.diag(H), descending=True)
            W = W[:, perm]
            H = H[perm][:, perm]

        Losses = torch.zeros_like(W)
        Q = torch.zeros_like(W)

        damp = percdamp * torch.mean(torch.diag(H))
        diag = torch.arange(self.columns, device=self.dev)
        H[diag, diag] += damp
        # Upper Cholesky factor of the inverse of the damped Hessian.
        H = torch.linalg.cholesky(H)
        H = torch.cholesky_inverse(H)
        H = torch.linalg.cholesky(H, upper=True)
        Hinv = H

        for i1 in range(0, self.columns, blocksize):
            i2 = min(i1 + blocksize, self.columns)
            count = i2 - i1

            W1 = W[:, i1:i2].clone()
            Q1 = torch.zeros_like(W1)
            Err1 = torch.zeros_like(W1)
            Losses1 = torch.zeros_like(W1)
            Hinv1 = Hinv[i1:i2, i1:i2]

            for i in range(count):
                w = W1[:, i]
                d = Hinv1[i, i]

                if groupsize != -1:
                    if (i1 + i) % groupsize == 0:
                        self.quantizer.find_params(W[:, (i1 + i):(i1 + i + groupsize)], weight=True)

                q = quantize(
                    w.unsqueeze(1), self.quantizer.scale, self.quantizer.zero, self.quantizer.maxq
                ).flatten()
                Q1[:, i] = q
                Losses1[:, i] = (w - q) ** 2 / d ** 2

                # Push this column's quantization error onto the columns of
                # the block that have not been quantized yet.
                err1 = (w - q) / d
                W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0))
                Err1[:, i] = err1

            Q[:, i1:i2] = Q1
            Losses[:, i1:i2] = Losses1 / 2

            # Propagate the whole block's error to all later columns.
            W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:])

        # Only synchronize when the work actually ran on a CUDA device; the
        # unconditional call failed on CPU-only hosts.
        if W.is_cuda:
            torch.cuda.synchronize()

        if actorder:
            invperm = torch.argsort(perm)
            Q = Q[:, invperm]

        if isinstance(self.layer, transformers.Conv1D):
            Q = Q.t()
        # Report the loss of the quantized layer vs. the original layer.
        new_weight = Q.reshape(self.layer.weight.shape).to(self.layer.weight.dtype)
        losses = {}
        # Default to the dense quantized weight so that an empty `sparsity`
        # iterable still leaves something well-defined to write back.
        sparsed_new_weight = new_weight
        if sparsity is None:
            losses[0] = torch.sum((self.inp1 @ (sparsed_new_weight.T) - self.out1) ** 2)
        else:
            for s_sity in sparsity:
                if write:
                    logger.info(f"HT with: sparsity={s_sity}")
                sparsed_new_weight = hard_threshold(new_weight, fraction_of_zero=s_sity)
                losses[s_sity] = torch.sum((self.inp1 @ (sparsed_new_weight.T) - self.out1) ** 2)
                if losses[s_sity] > 100:
                    logger.info(f"{sparsed_new_weight}")
                    logger.info(f"{new_weight}")
                    logger.info(f"{sparsed_new_weight.shape}")
                    logger.info(f"{torch.max(torch.abs(self.inp1 @ (sparsed_new_weight.T) - self.out1))}")
        if write:
            # With several sparsity values, the last one is what gets written.
            self.layer.weight.data = sparsed_new_weight
        return losses

    def free(self):
        """Drop cached tensors and return unused CUDA memory to the driver."""
        # add_batch always stores these, so always release them; clearing
        # them only in DEBUG mode kept activations alive.
        self.inp1 = None
        self.out1 = None
        self.H = None
        self.Losses = None
        self.Trace = None
        torch.cuda.empty_cache()
target_model, dataloader, rank, wbits, n_samples): +def quantize_with_lowrank(base_model, target_model, dataloader, rank, wbits, n_samples, decompose_only=False): # first do low rank approximation # then quantize original_finetuned_model = copy.deepcopy(target_model) @@ -29,7 +29,8 @@ def quantize_with_lowrank(base_model, target_model, dataloader, rank, wbits, n_s 'percdamp': 0.01, 'groupsize': -1, 'actorder': False, - } + }, + decompose_only=decompose_only ) target_model.to(base_model.device) @@ -53,9 +54,11 @@ def quantize_with_lowrank(base_model, target_model, dataloader, rank, wbits, n_s argparser.add_argument('--save', type=str, default='', help='Path to save the quantized model') argparser.add_argument('--wbits', type=int, default=8, help='Number of bits to use for quantization') argparser.add_argument('--sym', action='store_true', default=True, help='Whether to use symmetric quantization') + argparser.add_argument('--decompose-only', action='store_true', default=False, help='Whether to use quantization') argparser.add_argument('--trits', action='store_true', default=False, help='Whether to use trits') args = argparser.parse_args() + print(args) seed = args.seed base_model = get_opt(args.base_model) @@ -77,10 +80,10 @@ def quantize_with_lowrank(base_model, target_model, dataloader, rank, wbits, n_s trainloader, args.rank, args.wbits, - args.nsamples + args.nsamples, + args.decompose_only, ) if args.save: - save_lr_tensors(lr_tensors, f"{args.save}/{args.target_model.replace('/', '.')}-r{args.rank}-w{args.wbits}-lr.safetensors") - + save_lr_tensors(lr_tensors, f"{args.save}/{args.target_model.replace('/', '.')}-r{args.rank}-w{args.wbits}-decompose.{args.decompose_only}-lr.safetensors") ppl = opt_eval(target_model, loader_enc, args, target_model.device) logger.info(f"Perplexity: {ppl}") \ No newline at end of file diff --git a/compress_utils.py b/compress_utils.py new file mode 100644 index 0000000..143f017 --- /dev/null +++ b/compress_utils.py @@ -0,0 +1,351 
def cupy_to_tensor(x):
    """Convert a CuPy array to a torch tensor through DLPack."""
    return from_dlpack(x.toDlpack())


def tensor_to_cupy(x):
    """Convert a torch tensor to a CuPy array through DLPack."""
    return cupy.fromDlpack(to_dlpack(x))


def pack_uint8_tensor(x):
    """Pack a tensor of 0/1 bytes into bytes, 8 input elements per output byte.

    Uses CuPy for non-CPU tensors and NumPy for CPU tensors.
    """
    if x.device != torch.device('cpu'):
        return cupy_to_tensor(cupy.packbits(tensor_to_cupy(x)))
    return torch.from_numpy(np.packbits(x.numpy()))


def unpack_uint8_tensor(x):
    """Expand every byte of ``x`` into 8 separate 0/1 bytes (inverse of packing)."""
    if x.device != torch.device('cpu'):
        return cupy_to_tensor(cupy.unpackbits(tensor_to_cupy(x)))
    return torch.from_numpy(np.unpackbits(x.numpy()))


def _check_bits(bits):
    """Reject bit widths that cannot be sliced out of a single byte."""
    if not 1 <= bits <= 8:
        raise ValueError(f"bits must be between 1 and 8, got {bits}")


def pack_low_bit_tensor(x, bits):
    """Keep the low ``bits`` bits of every uint8 element and pack them densely.

    Args:
        x: ``torch.uint8`` tensor; each value should fit in ``bits`` bits.
        bits: number of low-order bits to keep per element (1..8).

    Returns:
        1-D uint8 tensor holding the concatenated bit fields, zero-padded to
        a whole number of bytes.

    Raises:
        TypeError: if ``x`` is not ``torch.uint8``.
        ValueError: if ``bits`` is outside 1..8.
    """
    # Validate on both device paths. Previously only the CUDA branch checked
    # the dtype (with an assert), and bits=0 silently kept all 8 bits because
    # the slice [-0:] is the whole axis.
    if x.dtype != torch.uint8:
        raise TypeError(f"expected a torch.uint8 tensor, got {x.dtype}")
    _check_bits(bits)

    if x.device != torch.device('cpu'):
        y = cupy.packbits(
            cupy.unpackbits(tensor_to_cupy(x)).reshape(*x.shape, 8)[..., -bits:]
        )
        return cupy_to_tensor(y)
    y = np.packbits(
        np.unpackbits(x.numpy()).reshape(*x.shape, 8)[..., -bits:]
    )
    return torch.from_numpy(y)


def unpack_low_bit_tensor(x, bits, original_shape):
    """Invert :func:`pack_low_bit_tensor`.

    Args:
        x: packed uint8 tensor.
        bits: bit width used when packing (1..8).
        original_shape: shape of the tensor before packing.

    Returns:
        uint8 tensor of shape ``original_shape``.

    Raises:
        ValueError: if ``bits`` is outside 1..8.
    """
    _check_bits(bits)
    # math.prod returns an int even for an empty shape; np.prod(()) is a
    # float and cannot be used as a slice bound.
    n_bits = math.prod(original_shape) * bits
    # Left-pad every `bits`-wide field with zeros to a full byte.
    pad_spec = ((0, 0), (8 - bits, 0))

    if x.device != torch.device('cpu'):
        fields = cupy.unpackbits(tensor_to_cupy(x))[:n_bits].reshape(-1, bits)
        y = cupy.packbits(cupy.pad(fields, pad_spec))
        return cupy_to_tensor(y).view(original_shape)
    fields = np.unpackbits(x.numpy())[:n_bits].reshape(-1, bits)
    y = np.packbits(np.pad(fields, pad_spec))
    return torch.from_numpy(y).view(original_shape)


def pin_memory(array):
    """Copy a NumPy array into a buffer allocated with CuPy's pinned-memory allocator."""
    mem = cupy.cuda.alloc_pinned_memory(array.nbytes)
    ret = np.frombuffer(mem, array.dtype, array.size).reshape(array.shape)
    ret[...] = array
    return ret
def compress_2bit(x, scale_method='max', scale_dims=(0,1)):
    """Quantize ``x`` to 2 bits and pack four codes into each byte.

    The last dimension is split into four equal chunks which end up in bit
    positions 7-6, 5-4, 3-2 and 1-0 of the output.

    Returns:
        ``(packed, scale)`` where ``scale`` comes from ``_compress_nbits``.
    """
    codes, scale = _compress_nbits(x, bits=2, scale_method=scale_method, scale_dims=scale_dims)

    c0, c1, c2, c3 = codes.chunk(4, -1)
    packed = (c0 << 6) + (c1 << 4) + (c2 << 2) + c3

    return packed, scale


def decompress_2bit(x, scale):
    """Invert :func:`compress_2bit`: unpack four 2-bit codes per byte and dequantize."""
    bitmask = 3

    # Masking the top field too is a no-op for unsigned bytes, and guards
    # against sign extension if a signed dtype is ever passed in.
    fields = [(x >> shift) & bitmask for shift in (6, 4, 2, 0)]
    codes = torch.cat(fields, -1)

    return _decompress_nbits(codes, scale, bits=2)


def compress_flexible_nbits(x, bits, scale_method='max', scale_dims=(0,1)):
    """Quantize ``x`` to an arbitrary bit width and bit-pack the codes.

    Original author's note: supports any bit width; CUDA only.
    """
    codes, scale = _compress_nbits(x, bits=bits, scale_method=scale_method, scale_dims=scale_dims)
    return pack_low_bit_tensor(codes, bits), scale


def decompress_flexible_nbits(x, scale, bits, original_shape):
    """Invert :func:`compress_flexible_nbits`.

    Original author's note: supports any bit width but needs
    ``original_shape``; CUDA only.
    """
    codes = unpack_low_bit_tensor(x, bits, original_shape)
    return _decompress_nbits(codes, scale, bits=bits)


def compress_nbits(x, bits, scale_method='max', scale_dims=(0,1)):
    """Dispatch to the fixed-width compressor for ``bits`` in {8, 4, 2}.

    Returns:
        ``(compressed, scale)`` from the selected compressor.

    Raises:
        ValueError: if ``bits`` is not 8, 4 or 2. This used to surface as an
            UnboundLocalError on the return statement.
    """
    if bits == 8:
        return compress_8bit(x, scale_method=scale_method, scale_dims=scale_dims)
    if bits == 4:
        return compress_4bit(x, scale_method=scale_method, scale_dims=scale_dims)
    if bits == 2:
        return compress_2bit(x, scale_method=scale_method, scale_dims=scale_dims)
    raise ValueError(f"unsupported bit width {bits!r}; expected 8, 4 or 2")


def decompress_nbits(x, scale, bits):
    """Dispatch to the fixed-width decompressor for ``bits`` in {8, 4, 2}.

    Raises:
        ValueError: if ``bits`` is not 8, 4 or 2. This used to surface as an
            UnboundLocalError on the return statement.
    """
    if bits == 8:
        return decompress_8bit(x, scale)
    if bits == 4:
        return decompress_4bit(x, scale)
    if bits == 2:
        return decompress_2bit(x, scale)
    raise ValueError(f"unsupported bit width {bits!r}; expected 8, 4 or 2")
original_shape) + + x = x.view(bucket_size, -1) + x = _decompress_nbits(x, scale, bits=bits) + x = x.view(original_shape) + + return x + +if __name__=="__main__": + + x = torch.randn((512, 512), dtype=torch.float32, device='cuda') + + print("original") + print(x) + quantizer = Quantizer() + quantizer.configure( + 4, perchannel=True, sym=False, mse=False + ) + quantizer.find_params(x, weight=True) + b_q = quantizer.quantize(x) + + packed_x, scale = compress_flexible_nbits(b_q, 4) + unpacked_x = decompress_flexible_nbits(packed_x,scale=scale, bits=4, original_shape=x.shape) + + print("unpacked") + print(unpacked_x) + print(f"are they equal? {torch.allclose(x, unpacked_x)}") \ No newline at end of file diff --git a/core_compression.py b/core_compression.py index 2b445f7..6e79ba4 100644 --- a/core_compression.py +++ b/core_compression.py @@ -1,9 +1,10 @@ import torch import torch.nn as nn -from loguru import logger -from modelutils import find_layers +from tqdm import tqdm from matq import TensorQ +from loguru import logger from quant import Quantizer +from modelutils import find_layers @torch.no_grad() def opt_delta_lr( @@ -15,7 +16,8 @@ def opt_delta_lr( sym, trits, rank, - args + args, + decompose_only=False, ): device = model.device print("Starting LR quantizer initialization...") @@ -48,6 +50,7 @@ def forward(self, inp, **kwargs): cache['i'] += 1 cache['attention_mask'] = kwargs['attention_mask'] raise ValueError + layers[0] = Catcher(layers[0]) for batch in dataloader: try: @@ -55,6 +58,7 @@ def forward(self, inp, **kwargs): except ValueError: pass layers[0] = layers[0].module + model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu() model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu() if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: @@ -62,7 +66,6 @@ def forward(self, inp, **kwargs): if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: 
def _read_jsonl_texts(path):
    """Return the ``"text"`` field of every non-blank line of a JSONL file."""
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line)['text'] for line in f if line.strip()]


def get_jsonl(train_path, val_path, n_samples, seed, seq_len, model_name, val_size=None, val_seq_len=256, padding=False):
    """Build calibration samples and a validation token stream from JSONL files.

    Every line of both files must be a JSON object with a ``"text"`` field.

    Args:
        train_path: path to the train jsonl file.
        val_path: path to the validation jsonl file.
        n_samples: number of calibration samples to draw.
        seed: seed applied through ``set_seed`` and to the ``random`` module.
        seq_len: token length of every calibration sample.
        model_name: name or path handed to ``AutoTokenizer.from_pretrained``.
        val_size: when given, only the first ``val_size`` validation texts
            are used.
        val_seq_len: the validation stream is truncated to
            ``val_seq_len * seq_len`` tokens.
        padding: when true, every sampled text is padded or truncated to
            exactly ``seq_len`` tokens. When false, texts shorter than
            ``seq_len`` tokens are rejected and a random window of
            ``seq_len`` tokens is cut from the accepted text. With no text
            long enough, the sampling loop never terminates.

    Returns:
        ``(trainloader, valenc)``: a list of ``(input_ids, targets)`` pairs
        whose targets are ``-100`` everywhere except the last position, and
        an object whose ``input_ids`` attribute holds the validation tokens.

    Raises:
        ValueError: if the train file contains no samples.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    train_texts = _read_jsonl_texts(train_path)
    val_texts = _read_jsonl_texts(val_path)
    if not train_texts:
        raise ValueError(f"no samples found in {train_path}")

    set_seed(seed)
    # Sampling below uses the `random` module, which set_seed does not seed;
    # seed it here so the same `seed` yields the same samples.
    random.seed(seed)

    trainloader = []
    for _ in range(n_samples):
        # Keep drawing until the encoded sample is at least seq_len long.
        while True:
            idx = random.randint(0, len(train_texts) - 1)
            if padding:
                trainenc = tokenizer(train_texts[idx], padding='max_length', truncation=True, max_length=seq_len, return_tensors='pt')
            else:
                trainenc = tokenizer(train_texts[idx], return_tensors='pt')
            if trainenc.input_ids.shape[1] >= seq_len:
                break

        inp = trainenc.input_ids
        if not padding:
            # Cut a random seq_len window. randint is inclusive on both ends
            # and raises for an empty range, so clamp the upper bound for a
            # sample that is exactly seq_len tokens long.
            start = random.randint(0, max(0, inp.shape[1] - seq_len - 1))
            inp = inp[:, start:start + seq_len]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))

    # val_texts[:None] is the whole list, matching the val_size=None case.
    valenc = tokenizer(' '.join(val_texts[:val_size]), return_tensors='pt')
    valenc = valenc.input_ids[:, :(val_seq_len * seq_len)]

    class TokenizerWrapper:
        """Minimal stand-in exposing ``input_ids`` like a tokenizer output."""

        def __init__(self, input_ids):
            self.input_ids = input_ids

    valenc = TokenizerWrapper(valenc)
    return trainloader, valenc
".cache/ni_calib/test/answer_verification.jsonl", + nsamples, + seed, + seqlen, + model, + val_size=1000, + padding=True + ) + if name == "coherence_classification": + return get_jsonl(".cache/ni_calib/test/coherence_classification.jsonl", ".cache/ni_calib/test/coherence_classification.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True) + if name == "commonsense_classification": + return get_jsonl(".cache/ni_calib/train/commonsense_classification.jsonl", ".cache/ni_calib/test/commonsense_classification.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True) + if name == "dialogue_state_tracking": + return get_jsonl(".cache/ni_calib/train/dialogue_state_tracking.jsonl", ".cache/ni_calib/test/dialogue_state_tracking.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True) + if name == "fact_verification": + return get_jsonl(".cache/ni_calib/train/fact_verification.jsonl", ".cache/ni_calib/test/fact_verification.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True) + if name == "gender_classification": + return get_jsonl(".cache/ni_calib/train/gender_classification.jsonl", ".cache/ni_calib/test/gender_classification.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True) + if name == "irony_detection": + return get_jsonl(".cache/ni_calib/train/irony_detection.jsonl", ".cache/ni_calib/test/irony_detection.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True) + if name == "stance_detection": + return get_jsonl(".cache/ni_calib/train/stance_detection.jsonl", ".cache/ni_calib/test/stance_detection.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True) + if name == "toxic_language_detection": + return get_jsonl(".cache/ni_calib/train/toxic_language_detection.jsonl", ".cache/ni_calib/test/toxic_language_detection.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True) + if name == "word_semantics": + return get_jsonl( + ".cache/ni_calib/train/word_semantics.jsonl", + 
def svd_decomposition(matrix, rank):
    """Return factors ``(L, R)`` with ``L @ R`` approximating ``matrix``.

    Uses the randomized low-rank SVD behind ``torch.pca_lowrank``.

    Args:
        matrix: 2-D tensor of shape ``(m, n)``.
        rank: number of singular components to keep.

    Returns:
        ``L`` of shape ``(m, rank)`` (left vectors scaled by the singular
        values) and ``R`` of shape ``(rank, n)``.
    """
    # center=False: pca_lowrank centres its input by default, in which case
    # the factors reconstruct `matrix - column_mean` rather than `matrix`.
    # The third result is V (not V^H), hence the transpose below.
    U, S, V = torch.pca_lowrank(matrix, q=rank, center=False)
    return U @ torch.diag_embed(S), V.T


def low_rank_decomposition(W, rank, learning_rate=0.01, max_iterations=500, tolerance=1e-5, X = None):
    """Fit ``W ~= L @ R`` by plain gradient descent from a random start.

    Args:
        W: matrix to factorize.
        rank: inner dimension of the factorization.
        learning_rate: gradient step size.
        max_iterations: maximum number of gradient steps.
        tolerance: stop early once the MSE falls below this value.
        X: optional input matrix. When given, the objective is the output
            error ``W @ X - L @ R @ X`` instead of the weight error.

    Returns:
        ``(L, R)`` with shapes ``(W.shape[0], rank)`` and ``(rank, W.shape[1])``.
    """
    L = torch.rand((W.shape[0], rank), device=W.device)
    R = torch.rand((rank, W.shape[1]), device=W.device)
    tick = time.time()
    early_stop = False
    if X is None:
        for _ in tqdm(range(max_iterations)):
            difference = W - L @ R
            # Both gradients are taken at the same (L, R) before either
            # factor is updated.
            gradient_L = -2 * (difference @ R.T)
            gradient_R = -2 * (L.T @ difference)
            L -= learning_rate * gradient_L
            R -= learning_rate * gradient_R
            if F.mse_loss(W, L@R) < tolerance:
                early_stop = True
                break
        logger.info(f"Low rank decomposition done. Elapsed time: {time.time() - tick}. Early stop: {early_stop}, loss: {F.mse_loss(W, L@R)}")
    else:
        # The target does not change across iterations; compute it once.
        target = W @ X
        for _ in tqdm(range(max_iterations)):
            diff = target - L @ R @ X
            gradient_L = -2 * (diff @ ((R@X).T))
            gradient_R = -2 * (L.T @ diff @ X.T)
            L -= learning_rate * gradient_L
            R -= learning_rate * gradient_R
            if F.mse_loss(target, L @ R @ X) < tolerance:
                early_stop = True
                break
        logger.info(f"[With Input] Low rank decomposition done. Elapsed time: {time.time() - tick}. Early stop: {early_stop}, loss: {F.mse_loss(target, L@R@X)}")
    return L, R


def torch_autograd(W, X, rank, lr, steps):
    """Fit ``L @ R @ X ~= W @ X`` with SGD (momentum 0.9) and autograd.

    The learning rate is multiplied by 0.9 at step 0 and every 200 steps
    thereafter.

    Returns:
        ``(L, R)``; both tensors still have ``requires_grad=True``.
    """
    L = torch.rand((W.shape[0], rank), device=W.device, requires_grad=True)
    R = torch.rand((rank, W.shape[1]), device=W.device, requires_grad=True)
    optimizer = torch.optim.SGD([L, R], lr=lr, momentum=0.9)
    scheduler = ExponentialLR(optimizer, gamma=0.9)
    for j in tqdm(range(steps)):
        optimizer.zero_grad()
        output = L @ R @ X
        target = W @ X
        loss = torch.nn.functional.mse_loss(output, target)
        loss.backward()
        optimizer.step()
        if j % 200 == 0:
            scheduler.step()
    return L, R
def get_gptj(model):
    """Load a GPT-J causal language model in fp16.

    Before loading, the ``kaiming_uniform_``, ``uniform_`` and ``normal_``
    initializers in ``torch.nn.init`` are replaced with a no-op; the patch
    is process-wide and is not undone afterwards.

    Args:
        model: identifier or path handed to
            ``GPTJForCausalLM.from_pretrained``.

    Returns:
        The loaded model, with ``seqlen`` set to the configuration's
        ``max_position_embeddings``.
    """
    import torch

    def _noop(*args, **kwargs):
        pass

    for init_name in ("kaiming_uniform_", "uniform_", "normal_"):
        setattr(torch.nn.init, init_name, _noop)

    from transformers import GPTJForCausalLM

    loaded = GPTJForCausalLM.from_pretrained(model, torch_dtype=torch.float16)
    loaded.seqlen = loaded.config.max_position_embeddings
    trainable = sum(p.numel() for p in loaded.parameters() if p.requires_grad)
    print("Num params is", trainable)
    return loaded
threshold = y[thresh_index] - mask = x.abs().clone().gt(threshold).type(torch.FloatTensor) - return mask * x - @torch.no_grad() -def opt_sequential_delta(model, delta_model, dataloader, dev): +def gptj_sequential(model, dataloader, dev, means=None, stds=None): print('Starting ...') use_cache = model.config.use_cache model.config.use_cache = False - layers = model.model.decoder.layers - delta_layers = delta_model.model.decoder.layers - - model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) - model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev) - if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: - model.model.decoder.project_out = model.model.decoder.project_out.to(dev) - if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: - model.model.decoder.project_in = model.model.decoder.project_in.to(dev) + #print(model.transformer.h) + layers = model.transformer.h + print(layers) + + model.transformer.wte = model.transformer.wte.to(dev) layers[0] = layers[0].to(dev) dtype = next(iter(model.parameters())).dtype @@ -72,25 +61,20 @@ def forward(self, inp, **kwargs): pass layers[0] = layers[0].module + layers = model.transformer.h layers[0] = layers[0].cpu() - model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu() - model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu() - if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: - model.model.decoder.project_out = model.model.decoder.project_out.cpu() - if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: - model.model.decoder.project_in = model.model.decoder.project_in.cpu() + model.transformer.wte = model.transformer.wte.cpu() + model.transformer.ln_f = model.transformer.ln_f.cpu() torch.cuda.empty_cache() outs = torch.zeros_like(inps) - original_outs = torch.zeros_like(inps) attention_mask = cache['attention_mask'] 
print('Ready.') quantizers = {} - for i in range(len(delta_layers)): - layer = delta_layers[i].to(dev) - original_layer = layers[i].to(dev) + for i in range(len(layers)): + layer = layers[i].to(dev) subset = find_layers(layer) gptq = {} @@ -98,9 +82,9 @@ def forward(self, inp, **kwargs): gptq[name] = GPTQ(subset[name]) gptq[name].quantizer = Quantizer() gptq[name].quantizer.configure( - args.wbits, perchannel=True, sym=args.sym, mse=False, trits=args.trits + args.wbits, perchannel=True, sym=False, mse=False ) - + def add_batch(name): def tmp(_, inp, out): gptq[name].add_batch(inp[0].data, out.data) @@ -108,49 +92,39 @@ def tmp(_, inp, out): handles = [] for name in subset: handles.append(subset[name].register_forward_hook(add_batch(name))) - for j in range(args.nsamples): outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] - - original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] for h in handles: h.remove() for name in subset: print(i, name) print('Quantizing ...') - gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) - quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer - gptq[name].free() + gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize) for j in range(args.nsamples): outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] - original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] layers[i] = layer.cpu() del layer - del gptq + del gptq torch.cuda.empty_cache() - inps, outs = original_outs, inps + inps, outs = outs, inps model.config.use_cache = use_cache return quantizers @torch.no_grad() -def opt_sequential(model, dataloader, dev): +def gptj_sequential_delta(model, delta_model, dataloader, dev): print('Starting ...') use_cache = model.config.use_cache model.config.use_cache = False - layers = model.model.decoder.layers - - model.model.decoder.embed_tokens = 
model.model.decoder.embed_tokens.to(dev) - model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev) - if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: - model.model.decoder.project_out = model.model.decoder.project_out.to(dev) - if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: - model.model.decoder.project_in = model.model.decoder.project_in.to(dev) + layers = model.transformer.h + delta_layers = delta_model.transformer.h + + model.transformer.wte = model.transformer.wte.to(dev) layers[0] = layers[0].to(dev) dtype = next(iter(model.parameters())).dtype @@ -177,22 +151,19 @@ def forward(self, inp, **kwargs): layers[0] = layers[0].module layers[0] = layers[0].cpu() - model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu() - model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu() - if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: - model.model.decoder.project_out = model.model.decoder.project_out.cpu() - if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: - model.model.decoder.project_in = model.model.decoder.project_in.cpu() + model.transformer.wte = model.transformer.wte.cpu() torch.cuda.empty_cache() outs = torch.zeros_like(inps) + original_outs = torch.zeros_like(inps) attention_mask = cache['attention_mask'] print('Ready.') quantizers = {} - for i in range(len(layers)): - layer = layers[i].to(dev) + for i in range(len(delta_layers)): + layer = delta_layers[i].to(dev) + original_layer = layers[i].to(dev) subset = find_layers(layer) gptq = {} @@ -200,7 +171,7 @@ def forward(self, inp, **kwargs): gptq[name] = GPTQ(subset[name]) gptq[name].quantizer = Quantizer() gptq[name].quantizer.configure( - args.wbits, perchannel=True, sym=args.sym, mse=False, trits=args.trits + args.wbits, perchannel=True, sym=args.sym, mse=False ) def add_batch(name): @@ -212,6 +183,7 @@ def 
tmp(_, inp, out): handles.append(subset[name].register_forward_hook(add_batch(name))) for j in range(args.nsamples): outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] for h in handles: h.remove() @@ -221,23 +193,24 @@ def tmp(_, inp, out): gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer gptq[name].free() + for j in range(args.nsamples): outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] layers[i] = layer.cpu() del layer del gptq torch.cuda.empty_cache() - inps, outs = outs, inps + inps, outs = original_outs, inps model.config.use_cache = use_cache - - return quantizers + return quantizers @torch.no_grad() -def opt_eval(model, testenc, dev): +def gptj_eval(model, testenc, dev): print('Evaluating ...') testenc = testenc.input_ids @@ -245,14 +218,11 @@ def opt_eval(model, testenc, dev): use_cache = model.config.use_cache model.config.use_cache = False - layers = model.model.decoder.layers - - model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) - model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev) - if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: - model.model.decoder.project_out = model.model.decoder.project_out.to(dev) - if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: - model.model.decoder.project_in = model.model.decoder.project_in.to(dev) + # print(model.transformer.h) + layers = model.transformer.h + print(layers) + + model.transformer.wte = model.transformer.wte.to(dev) layers[0] = layers[0].to(dev) dtype = next(iter(model.parameters())).dtype @@ -267,40 +237,38 @@ def __init__(self, module): 
self.module = module def forward(self, inp, **kwargs): inps[cache['i']] = inp - cache['i'] += 1 + cache ['i'] += 1 cache['attention_mask'] = kwargs['attention_mask'] raise ValueError layers[0] = Catcher(layers[0]) for i in range(nsamples): - batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev) + batch = testenc[:, (i * model.seqlen):((i + 1) *model.seqlen)].to(dev) try: + # print(batch.shape) model(batch) except ValueError: pass layers[0] = layers[0].module + layers = model.transformer.h layers[0] = layers[0].cpu() - model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu() - model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu() - if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: - model.model.decoder.project_out = model.model.decoder.project_out.cpu() - if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: - model.model.decoder.project_in = model.model.decoder.project_in.cpu() + model.transformer.wte = model.transformer.wte.cpu() + model.transformer.ln_f = model.transformer.ln_f.cpu() torch.cuda.empty_cache() - + outs = torch.zeros_like(inps) attention_mask = cache['attention_mask'] for i in range(len(layers)): - # print(i) + print(i) layer = layers[i].to(dev) if args.nearest: - subset = find_layers(layer) + subset = find_layers(layer) for name in subset: quantizer = Quantizer() quantizer.configure( - args.wbits, perchannel=True, sym=args.sym, mse=False + args.wbits, perchannel=True, sym=False, mse=False ) W = subset[name].weight.data quantizer.find_params(W, weight=True) @@ -315,20 +283,14 @@ def forward(self, inp, **kwargs): torch.cuda.empty_cache() inps, outs = outs, inps - if model.model.decoder.final_layer_norm is not None: - model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(dev) - if model.model.decoder.project_out is not None: - model.model.decoder.project_out = model.model.decoder.project_out.to(dev) + 
model.transformer.ln_f = model.transformer.ln_f.to(dev) model.lm_head = model.lm_head.to(dev) - + testenc = testenc.to(dev) nlls = [] for i in range(nsamples): hidden_states = inps[i].unsqueeze(0) - if model.model.decoder.final_layer_norm is not None: - hidden_states = model.model.decoder.final_layer_norm(hidden_states) - if model.model.decoder.project_out is not None: - hidden_states = model.model.decoder.project_out(hidden_states) + hidden_states = model.transformer.ln_f(hidden_states) lm_logits = model.lm_head(hidden_states) shift_logits = lm_logits[:, :-1, :].contiguous() shift_labels = testenc[ @@ -340,61 +302,60 @@ def forward(self, inp, **kwargs): nlls.append(neg_log_likelihood) ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) print(ppl.item()) + model.config.use_cache = use_cache - return ppl.item() -# TODO: perform packing on GPU -def opt_pack3(model, quantizers): +def gptj_pack(model, quantizers, wbits, groupsize): layers = find_layers(model) layers = {n: layers[n] for n in quantizers} - make_quant3(model, quantizers, faster=args.faster_kernel) - qlayers = find_layers(model, [Quant3Linear]) + make_quant(model, quantizers, wbits, groupsize) + qlayers = find_layers(model, [QuantLinear]) print('Packing ...') for name in qlayers: print(name) - quantizers[name] = quantizers[name].cpu() - qlayers[name].pack(layers[name], quantizers[name].scale, quantizers[name].zero) - print('Done.') + quantizers[name],scale,zero = quantizers[name] + quantizers[name],scale,zero = quantizers[name].cpu(),scale.cpu(),zero.cpu() + qlayers[name].pack(layers[name], scale, zero) + print('Done!') return model -def load_quant3(model, checkpoint): - from transformers import OPTConfig, OPTForCausalLM - config = OPTConfig.from_pretrained(model) +def load_quant(model, checkpoint, wbits, groupsize): + from transformers import GPTJConfig, GPTJForCausalLM + config = GPTJConfig.from_pretrained(model) def noop(*args, **kwargs): pass - torch.nn.init.kaiming_uniform_ = noop - 
torch.nn.init.uniform_ = noop - torch.nn.init.normal_ = noop + torch.nn.init.kaiming_uniform_ = noop + torch.nn.init.uniform_ = noop + torch.nn.init.normal_ = noop torch.set_default_dtype(torch.half) transformers.modeling_utils._init_weights = False torch.set_default_dtype(torch.half) - model = OPTForCausalLM(config) + model = GPTJForCausalLM(config) torch.set_default_dtype(torch.float) model = model.eval() layers = find_layers(model) - for name in ['model.decoder.project_out', 'model.decoder.project_in', 'lm_head']: + for name in ['lm_head']: if name in layers: del layers[name] - make_quant3(model, layers, faster=args.faster_kernel) + make_quant(model, layers, wbits, groupsize) print('Loading model ...') - model.load_state_dict(torch.load(checkpoint)) - model.seqlen = model.config.max_position_embeddings - print('Done.') + if checkpoint.endswith('.safetensors'): + from safetensors.torch import load_file as safe_load + model.load_state_dict(safe_load(checkpoint)) + else: + model.load_state_dict(torch.load(checkpoint)) + model.seqlen = 2048 + print('Done!') return model -def opt_multigpu(model, gpus): - model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(gpus[0]) - model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(gpus[0]) - if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: - model.model.decoder.project_in = model.model.decoder.project_in.to(gpus[0]) - if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: - model.model.decoder.project_out = model.model.decoder.project_out.to(gpus[-1]) - if hasattr(model.model.decoder, 'final_layer_norm') and model.model.decoder.final_layer_norm: - model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(gpus[-1]) +def gptj_multigpu(model, gpus): + model.model.embed_tokens = model.model.embed_tokens.to(gpus[0]) + if hasattr(model.model, 'norm') and model.model.norm: + model.model.norm = 
model.model.norm.to(gpus[-1]) import copy model.lm_head = copy.deepcopy(model.lm_head).to(gpus[-1]) @@ -403,19 +364,19 @@ def opt_multigpu(model, gpus): class MoveModule(nn.Module): def __init__(self, module): super().__init__() - self.module = module + self_module = module self.dev = next(iter(self.module.parameters())).device def forward(self, *inp, **kwargs): inp = list(inp) if inp[0].device != self.dev: inp[0] = inp[0].to(self.dev) - if cache['mask'] is None or cache['mask'].device != self.dev: + if cache['mask'] is None or cache ['mask'].device != self.dev: cache['mask'] = kwargs['attention_mask'].to(self.dev) kwargs['attention_mask'] = cache['mask'] tmp = self.module(*inp, **kwargs) return tmp - layers = model.model.decoder.layers + layers = model.model.layers pergpu = math.ceil(len(layers) / len(gpus)) for i in range(len(layers)): layers[i] = MoveModule(layers[i].to(gpus[i // pergpu])) @@ -432,7 +393,7 @@ def tmp(layer, inp, out): if cache['past']: cache['past'][i] = None return tmp - for i, layer in enumerate(model.model.decoder.layers): + for i, layer in enumerate(model.model.layers): layer.register_forward_hook(clear_past(i)) print('Benchmarking ...') @@ -447,30 +408,35 @@ def sync(): torch.cuda.synchronize(gpu) else: torch.cuda.synchronize() + max_memory = 0 with torch.no_grad(): attention_mask = torch.ones((1, input_ids.numel()), device=DEV) times = [] for i in range(input_ids.numel()): tick = time.time() + out = model( - input_ids[:, i].reshape(-1), + input_ids[:, i:i+1], past_key_values=cache['past'], attention_mask=attention_mask[:, :(i + 1)].reshape((1, -1)) ) sync() times.append(time.time() - tick) print(i, times[-1]) + max_memory = max(max_memory, torch, torch.cuda.memory_allocated() / 1024 /1024) if check and i != input_ids.numel() - 1: tot += loss(out.logits[0].to(DEV), input_ids[:, (i + 1)].to(DEV)).float() - cache['past'] = list(out.past_key_values) + cache['past'] = list(out.past_keys_values) del out sync() import numpy as np print('Median:', 
np.median(times)) if check: print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item()) + print('max memory(MiB):',max_memory) + def main(args): print(args) num_params_saved_lr = 0 @@ -479,9 +445,9 @@ def main(args): model = load_quant3(args.model, args.load) else: if args.delta and args.wbits<16: - model = get_opt(args.model) + model = get_gptj(args.model) model.eval() - base_model = get_opt(args.base_model) + base_model = get_gptj(args.base_model) base_model.eval() dataloader, testloader = get_loaders( args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen @@ -490,7 +456,7 @@ def main(args): for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()): finetuned_p.data = (finetuned_p.data-base_p.data).clone() else: - model = get_opt(args.model) + model = get_gptj(args.model) model.eval() dataloader, testloader = get_loaders( @@ -500,31 +466,23 @@ def main(args): if args.wbits < 16 and not args.nearest: if args.delta: tick = time.time() - quantizers = opt_sequential_delta(original_finetuned_model, model, dataloader, DEV) + quantizers = gptj_sequential_delta(original_finetuned_model, model, dataloader, DEV) comp_time = time.time()-tick else: - quantizers = opt_sequential(model, dataloader, DEV) + quantizers = gptj_sequential(model, dataloader, DEV) if args.delta and args.wbits<16: for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()): - # don't hard threshold for now - # if args.sparsify_hard_threshold: - # print('Hard Thresholding...') - # W = finetuned_p.data - # finetuned_p.data = hard_threshold(W, fraction_of_zero=args.fraction_of_zero) + if args.sparsify_hard_threshold: + print('Hard Thresholding...') + W = finetuned_p.data + finetuned_p.data = hard_threshold(W, fraction_of_zero=args.fraction_of_zero) if args.rank>0 and len(finetuned_p.shape) == 2: print('Finding Low Rank Approximation...') A = finetuned_p.data.float() U, S, Vh = torch.pca_lowrank(A, q=args.rank, center=True, niter=5) 
- # let's say L = U - # and R = diag(S)*V.T - L = U - R = torch.diag_embed(S) @ Vh.T - # now quantize R - - A = L @ R - + A = U @ torch.diag_embed(S) @ Vh.T finetuned_p.data = A.half() num_params_saved_lr += torch.numel(A) - (torch.numel(U) + torch.numel(S) + torch.numel(Vh)) num_params += torch.numel(finetuned_p.data) @@ -533,7 +491,7 @@ def main(args): if args.benchmark: gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())] if len(gpus) > 1: - opt_multigpu(model, gpus) + gptj_multigpu(model, gpus) else: model = model.to(DEV) if args.benchmark: @@ -547,34 +505,35 @@ def main(args): dataset, seed=args.seed, model=args.model, seqlen=model.seqlen ) - ppl = opt_eval(model, testloader, DEV) + ppl = gptj_eval(model, testloader, DEV) print(ppl) if args.rank > 0: - print("Number of params without low rank ", num_params) - print("Number of params with low rank", num_params - num_params_saved_lr) + n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + print("Number of params without low rank ", n_params) + print("Number of params with low rank", n_params - num_params_saved_lr) if args.save: - opt_pack3(model, quantizers) + gptj_pack(model, quantizers, args.wbits, args.groupsize) torch.save(model.state_dict(), args.save) - + return ppl if __name__ == '__main__': import argparse from datautils import * parser = argparse.ArgumentParser() - + parser.add_argument( - '--model', type=str, default='lnair/opt-1.3b-wikitext2', - help='OPT model to load; pass `facebook/opt-X`.' + '--model', type=str, default='togethercomputer/GPT-JT-6B-v1', + help='GPT-J finetuned model to load; pass `togethercomputer/GPT-JT-6B-v1`.' ) parser.add_argument( - '--dataset', type=str, choices=['wikitext2', 'ptb', 'c4'], default='wikitext2', - help='Where to extract calibration data from.' + '--base_model', type=str, default='EleutherAI/gpt-j-6b', + help='GPT-J model to load; pass `EleutherAI/gpt-j-6b`.' 
) parser.add_argument( - '--base-model', type=str, default='facebook/opt-1.3b', - help='base OPT model to load' + '--dataset', type=str, default='ptb', choices=['wikitext2', 'ptb', 'c4'], + help='Where to extract calibration data from.' ) parser.add_argument( '--seed', @@ -591,30 +550,26 @@ def main(args): parser.add_argument( '--nearest', action='store_true', help='Whether to run the RTN baseline.' - ) - parser.add_argument( - '--wbits', type=int, default=2, choices=[2, 3, 4, 16], - help='#bits to use for quantization; use 16 for evaluating base model.' ) parser.add_argument( - '--trits', action='store_true', - help='Whether to use trits for quantization.' + '--wbits', type=int, default=16, choices=[2, 3, 4, 16], + help='#bits to use for quantization; use 16 for evaluating base model.' ) parser.add_argument( '--groupsize', type=int, default=-1, help='Groupsize to use for quantization; default uses full row.' ) parser.add_argument( - '--sym', action='store_true', - help='Whether to perform symmetric quantization.' + '--save', type=str, default='', + help='Save the quantized GPT-J model under this name.' ) parser.add_argument( - '--save', type=str, default='', - help='Save quantized checkpoint under this name.' + '--save_safetensors', type=str, default='', + help='Save the quantized GPT-J model as a `.safetensors` ckpt' ) parser.add_argument( '--load', type=str, default='', - help='Load quantized model.' + help='Load the quantized GPT-J model' ) parser.add_argument( '--benchmark', type=int, default=0, @@ -622,19 +577,7 @@ def main(args): ) parser.add_argument( '--check', action='store_true', - help='Whether to compute perplexity during benchmarking for verification.' - ) - parser.add_argument( - '--new-eval', action='store_true', - help='Whether to use the new PTB and C4 eval.' - ) - parser.add_argument( - '--faster-kernel', action='store_true', - help='Whether to use the new faster kernel for benchmarking.' 
- ) - parser.add_argument( - '--act-order', action='store_true', - help='Whether to apply the activation order GPTQ heuristic' + help='Whether to compute perpexity during benchmarking for verification.' ) parser.add_argument( '--delta', action='store_true', @@ -648,15 +591,34 @@ def main(args): '--fraction_of_zero', type=float, default=0.99, help='Sparsity ratio' ) - parser.add_argument( - '--rank', type=int, default=0, - help='The rank to use for decomposing each matrices' + '--benchmark_results', type=str, default='', + help='store benchmark results' ) - args = parser.parse_args() - - #results = PrettyTable() - - main(args) + parser.add_argument( + '--sym', action='store_true', default=True, + help='Whether to use symmetric quantization' + ) + parser.add_argument( + '--trits', action='store_true', default=False, + help='Whether to use trits' + ) + parser.add_argument('--act_order', type=str, default=False) - print('finished.') + args = parser.parse_args() + + results = PrettyTable() + results.field_names = ['Bits', 'n_params', 'Time', 'wiki'] + for n_bits in [4, 3, 2]: + ppls = [] + for dataset in ['wikitext2', 'ptb', 'c4']: + args.dataset = dataset + args.wbits = n_bits + args.save = 'gptj-delta-%s-wbits%d.pt' % (dataset, n_bits) + ppl = main(args) + ppls.append(ppl) + results.add_row([n_bits, n_params, comp_time, ppls[0], ppls[1], ppls[2]]) + print(results) + with open(os.path.join(os.getcwd(), args.benchmark_results), mode="w") as f: + f.write(str(results)) + print('finished.') \ No newline at end of file diff --git a/gptq.py b/gptq.py index 2477cac..87dd8cf 100644 --- a/gptq.py +++ b/gptq.py @@ -3,17 +3,34 @@ import torch import transformers import torch.nn as nn - +from loguru import logger from quant import quantize -DEBUG = False +DEBUG = False torch.backends.cuda.matmul.allow_tf32 = False torch.backends.cudnn.allow_tf32 = False +def hard_threshold(x, fraction_of_zero=0.1, random_sparsification=0.5): + if fraction_of_zero == 0: + return x, None + # 
randomly set random_sparsification of the weights to zero + if random_sparsification > 0: + logger.info(f"Randomly sparsifying the weights with {random_sparsification}") + mask = torch.rand(x.shape, device=x.device) > random_sparsification + x = x * mask + y, _ = torch.sort(x.view(-1).abs().clone()) + num_params = torch.numel(x) + + thresh_index = int(num_params * fraction_of_zero * (1/random_sparsification)) + threshold = y[thresh_index] + mask = x.abs().clone().gt(threshold).type(torch.cuda.HalfTensor) + return mask * x, mask + class GPTQ: def __init__(self, layer): self.layer = layer + self.original_weight = layer.weight.data.clone() self.dev = self.layer.weight.device W = layer.weight.data.clone() if isinstance(self.layer, nn.Conv2d): @@ -26,26 +43,15 @@ def __init__(self, layer): self.nsamples = 0 def add_batch(self, inp, out): - if DEBUG: - self.inp1 = inp - self.out1 = out + self.inp1 = inp + self.out1 = out if len(inp.shape) == 2: inp = inp.unsqueeze(0) tmp = inp.shape[0] - if isinstance(self.layer, nn.Linear) or isinstance(self.layer, transformers.Conv1D): + if isinstance(self.layer, nn.Linear): if len(inp.shape) == 3: inp = inp.reshape((-1, inp.shape[-1])) inp = inp.t() - if isinstance(self.layer, nn.Conv2d): - unfold = nn.Unfold( - self.layer.kernel_size, - dilation=self.layer.dilation, - padding=self.layer.padding, - stride=self.layer.stride - ) - inp = unfold(inp) - inp = inp.permute([1, 0, 2]) - inp = inp.flatten(1) self.H *= self.nsamples / (self.nsamples + tmp) self.nsamples += tmp # inp = inp.float() @@ -54,7 +60,7 @@ def add_batch(self, inp, out): self.H += inp.matmul(inp.t()) def fasterquant( - self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False + self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, write=True, sparsity=None ): W = self.layer.weight.data.clone() if isinstance(self.layer, nn.Conv2d): @@ -69,7 +75,8 @@ def fasterquant( self.quantizer.find_params(W, weight=True) H = self.H - del self.H + if write: + del 
self.H dead = torch.diag(H) == 0 H[dead, dead] = 1 W[:, dead] = 0 @@ -124,15 +131,16 @@ def fasterquant( W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:]) if DEBUG: - self.layer.weight.data[:, :i2] = Q[:, :i2] - self.layer.weight.data[:, i2:] = W[:, i2:] - print(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) - print(torch.sum(Losses)) + pass + #self.layer.weight.data[:, :i2] = Q[:, :i2] + #self.layer.weight.data[:, i2:] = W[:, i2:] + #print(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) + #print(torch.sum(Losses)) torch.cuda.synchronize() total_time = time.time() - tick # print('time %.2f' % total_time) - error = torch.sum(Losses).item() + # error = torch.sum(Losses).item() # print('error', error) if actorder: @@ -141,9 +149,23 @@ def fasterquant( if isinstance(self.layer, transformers.Conv1D): Q = Q.t() - self.layer.weight.data = Q.reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype) - if DEBUG: - print(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) + # here report the loss of the quantized layer vs. 
the original layer + new_weight = Q.reshape(self.layer.weight.shape).to(self.layer.weight.dtype) + losses = {} + mask = None + if sparsity is None: + sparsed_new_weight = new_weight + losses[0] = torch.sum((self.inp1 @ (sparsed_new_weight.T) - self.out1) ** 2) + else: + for s_sity in sparsity: + sparsed_new_weight, mask = hard_threshold(new_weight, fraction_of_zero=s_sity) + if write: + logger.info(f"HT with: sparsity={s_sity}") + losses[s_sity] = torch.sum((self.inp1 @ (sparsed_new_weight.T) - self.out1) ** 2) + + if write: + self.layer.weight.data = sparsed_new_weight + return losses, mask def free(self): if DEBUG: @@ -152,4 +174,4 @@ def free(self): self.H = None self.Losses = None self.Trace = None - torch.cuda.empty_cache() + torch.cuda.empty_cache() \ No newline at end of file diff --git a/lr_only.py b/lr_only.py new file mode 100644 index 0000000..dc392c2 --- /dev/null +++ b/lr_only.py @@ -0,0 +1,42 @@ +import copy +import torch +import argparse +import torch.nn as nn +from loguru import logger +from evaluation import opt_eval +from datautils import get_loaders +from core_compression import opt_delta_lr +from modelutils import get_opt, find_layers +from save_and_load import save_lr_tensors, load_lr_tensors + +@torch.no_grad() +def lowrank_decomposition(model, rank, n_samples, data_loader=None): + lr_iopairs = {} + + def add_batch(name): + def temp(_, inp, out): + lr_iopairs[name] = (inp, out) + return temp + layers = model.model.decoder.layers + inps = torch.zeros( + (n_samples, model.seqlen, model.config.hidden_size), dtype=torch.fp16, device=model.device + ) + handles = [] + for i in range(len(layers)): + subset = find_layers(layers[i]) + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(f"decoder.layers.{i}.{name}"))) + layer_id = f"decoder.layers.{i}.{name}" + decomposing_layer = subset[name].weight + # decompose this into low rank matrices + +if __name__=="__main__": + base_model = get_opt('facebook/opt-1.3b') + 
trainloader, loader_enc = get_loaders( + 'wikitext2', + nsamples = 128, + seed=42, + model='facebook/opt-1.3b', + seqlen=base_model.seqlen, + ) + lowrank_decomposition(base_model, 32, 128, trainloader) \ No newline at end of file diff --git a/matq.py b/matq.py index 417a2f9..2898db5 100644 --- a/matq.py +++ b/matq.py @@ -5,15 +5,15 @@ import transformers from loguru import logger from quant import quantize +from decomposition import low_rank_decomposition - -DEBUG = False +DEBUG = False torch.backends.cuda.matmul.allow_tf32 = False torch.backends.cudnn.allow_tf32 = False class TensorQ: - def __init__(self, layer, rank=32): + def __init__(self, layer, rank=32, sensitive_decompose=False): self.layer = layer self.dev = self.layer.weight.device W = layer.weight.data.clone() @@ -22,47 +22,40 @@ def __init__(self, layer, rank=32): if isinstance(self.layer, transformers.Conv1D): W = W.t() self.rank = rank - self.decompose() + if not sensitive_decompose: + self.decompose() self.rows = W.shape[0] self.columns = W.shape[1] - self.L_columns = self.L.shape[1] + self.L_columns = rank self.H = torch.zeros((self.columns, self.columns), device=self.dev) self.H_R = torch.zeros((self.columns, self.columns), device=self.dev) self.H_L = torch.zeros((self.L_columns, self.L_columns), device=self.dev) self.nsamples = 0 def add_batch_lr(self, inp, out): - if DEBUG: - self.inp1 = inp - self.out1 = out + #if DEBUG: + # self.inp1 = inp + # self.out1 = out if len(inp.shape) == 2: inp = inp.unsqueeze(0) - tmp = inp.shape[0] - if isinstance(self.layer, nn.Linear) or isinstance(self.layer, transformers.Conv1D): + self.tmp = inp.shape[0] + if isinstance(self.layer, nn.Linear): if len(inp.shape) == 3: inp = inp.reshape((-1, inp.shape[-1])) inp = inp.t() - if isinstance(self.layer, nn.Conv2d): - unfold = nn.Unfold( - self.layer.kernel_size, - dilation=self.layer.dilation, - padding=self.layer.padding, - stride=self.layer.stride - ) - inp = unfold(inp) - inp = inp.permute([1, 0, 2]) - inp = 
inp.flatten(1) - self.H_R *= self.nsamples / (self.nsamples + tmp) - self.nsamples += tmp + + self.H_R *= self.nsamples / (self.nsamples + self.tmp) + self.nsamples += self.tmp inp = math.sqrt(2 / self.nsamples) * inp.float() + self.inp = inp - self.H_R += inp.matmul(inp.t()) + def calculate_hessian(self): + self.H_R += self.inp.matmul(self.inp.t()) # logger.info(f"self.H_R: {self.H_R.shape}") # for L, consider the input to be R@X - inp = self.R @ inp - self.H_L *= self.nsamples / (self.nsamples + tmp) - self.H_L += inp.matmul(inp.t()) - # logger.info(f"self.H_L: {self.H_L.shape}") + l_inp = self.R @ self.inp + self.H_L *= self.nsamples / (self.nsamples + self.tmp) + self.H_L += l_inp.matmul(l_inp.t()) def free(self): if DEBUG: @@ -77,24 +70,19 @@ def free(self): def decompose(self): W = self.layer.weight.data.clone() - if isinstance(self.layer, nn.Conv2d): - W = W.flatten(1) - if isinstance(self.layer, transformers.Conv1D): - W = W.t() W = W.float() logger.info("starting decomposition") tick = time.time() - U, S, Vh = torch.pca_lowrank(W, q=self.rank, center=True, niter=5) - # let's say L = U - # and R = diag(S)*V.T - self.L = U - self.R = torch.diag_embed(S) @ Vh.T + self.L, self.R = low_rank_decomposition(W, self.rank, learning_rate=1e-10, max_iterations=5000, X=self.inp) logger.info(f"decomposition done. 
elapsed time: {time.time() - tick}, L: {self.L.shape}, R: {self.R.shape}") - def lr_quant(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False): - self.lr_quant_R(blocksize, percdamp, groupsize, actorder) - self.lr_quant_L(blocksize, percdamp, groupsize, actorder) - # restored weight is L@R + def lr_quant(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, decompose_only=False): + self.decompose() + if not decompose_only: + self.calculate_hessian() + self.lr_quant_R(blocksize, percdamp, groupsize, actorder) + self.lr_quant_L(blocksize, percdamp, groupsize, actorder) + # restored weight is L@R, we overwrite the weight for evaluation if needed # but on disk we only save L, R self.layer.weight.data = (self.L @ self.R).reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype) diff --git a/modelutils.py b/modelutils.py index c93410d..f9436c7 100644 --- a/modelutils.py +++ b/modelutils.py @@ -1,6 +1,6 @@ import torch import torch.nn as nn -from transformers import OPTForCausalLM +from transformers import OPTForCausalLM, AutoModel, AutoTokenizer DEV = torch.device('cuda:0') def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''): diff --git a/opt_delta.py b/opt_delta.py index f9f6bbf..10a34d3 100644 --- a/opt_delta.py +++ b/opt_delta.py @@ -3,12 +3,13 @@ import torch import pickle import torch.nn as nn - +from pack_utils import pack_to_bits, unpack_from_bits from gptq import * from modelutils import * from quant import * - +from transformers import AutoTokenizer, AutoModel import copy +from tensorio import TensorIO, model_packing #from prettytable import PrettyTable def get_opt(model): @@ -41,7 +42,7 @@ def opt_sequential_delta(model, delta_model, dataloader, dev): layers = model.model.decoder.layers delta_layers = delta_model.model.decoder.layers - model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) + model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) 
model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev) if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: @@ -425,7 +426,6 @@ def forward(self, *inp, **kwargs): def benchmark(model, input_ids, check=False): input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else DEV) torch.cuda.synchronize() - cache = {'past': None} def clear_past(i): def tmp(layer, inp, out): @@ -507,20 +507,27 @@ def main(args): quantizers = opt_sequential(model, dataloader, DEV) if args.delta and args.wbits<16: - for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()): - if args.sparsify_hard_threshold: - print('Hard Thresholding...') - W = finetuned_p.data - finetuned_p.data = hard_threshold(W, fraction_of_zero=args.fraction_of_zero) - if args.rank>0 and len(finetuned_p.shape) == 2: - print('Finding Low Rank Approximation...') - A = finetuned_p.data.float() - U, S, Vh = torch.pca_lowrank(A, q=args.rank, center=True, niter=5) - A = U @ torch.diag_embed(S) @ Vh.T - finetuned_p.data = A.half() - num_params_saved_lr += torch.numel(A) - (torch.numel(U) + torch.numel(S) + torch.numel(Vh)) + for idx, (base_p, finetuned_p) in enumerate(zip(base_model.parameters(), model.parameters())): + # if args.sparsify_hard_threshold: + # print('Hard Thresholding...') + # W = finetuned_p.data + # finetuned_p.data = hard_threshold(W, fraction_of_zero=args.fraction_of_zero) + # if args.rank>0 and len(finetuned_p.shape) == 2: + # print('Finding Low Rank Approximation...') + # A = finetuned_p.data.float() + # U, S, Vh = torch.pca_lowrank(A, q=args.rank, center=True, niter=5) + # A = U @ torch.diag_embed(S) @ Vh.T + # finetuned_p.data = A.half() + # num_params_saved_lr += torch.numel(A) - (torch.numel(U) + torch.numel(S) + torch.numel(Vh)) num_params += torch.numel(finetuned_p.data) finetuned_p.data = (base_p.data + finetuned_p.data).clone() + + if args.save_delta: + new_weights, scale = model_packing(model, quantizers, 
bits=args.wbits) + torch.save({ + 'weight': new_weights, + 'scale': scale, + }, f"delta_outputs/{args.model.replace('/', '.')}_delta_{args.wbits}bits_sparsify.{args.sparsify_hard_threshold}_{args.fraction_of_zero}foz") if args.benchmark: gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())] @@ -545,11 +552,18 @@ def main(args): if args.rank > 0: print("Number of params without low rank ", num_params) print("Number of params with low rank", num_params - num_params_saved_lr) - if args.save: + if args.save_hf: + if args.delta: + hf_path = f"outputs/{args.model.replace('/', '.')}_delta_{args.wbits}bits_sparsify.{args.sparsify_hard_threshold}_{args.fraction_of_zero}foz" + else: + hf_path = f"outputs/{args.model.replace('/', '.')}_{args.wbits}bits" + model.save_pretrained(hf_path) + tokenizer = AutoTokenizer.from_pretrained(args.model) + tokenizer.save_pretrained(hf_path) + else: opt_pack3(model, quantizers) torch.save(model.state_dict(), args.save) - if __name__ == '__main__': import argparse from datautils import * @@ -636,6 +650,13 @@ def main(args): '--sparsify_hard_threshold', action='store_true', help='Whether to add sparsity' ) + parser.add_argument( + '--save-hf', action='store_true', default=False, + help='Whether to save a huggingface model' + ) + parser.add_argument( + '--save-delta', action='store_true', default=False, + ) parser.add_argument( '--fraction_of_zero', type=float, default=0.99, help='Sparsity ratio' diff --git a/opt_delta_autotune.py b/opt_delta_autotune.py new file mode 100644 index 0000000..937ab7f --- /dev/null +++ b/opt_delta_autotune.py @@ -0,0 +1,584 @@ +import os +import copy +import time +import json +import torch +import pickle +from gptq import * +from quant import * +import torch.nn as nn +from modelutils import * +from loguru import logger +from tensorio import TensorIO, model_packing +from transformers import AutoTokenizer, AutoModel +import torchvision.transforms as T +# from prettytable import PrettyTable + 
@torch.no_grad()
def opt_sequential_delta(model, delta_model, dataloader, dev, tol=0.2):
    """Pick a bit width and sparsity per linear layer of ``delta_model``.

    Decoder layers are processed one at a time on ``dev``.  For each layer
    the calibration activations are pushed through both the delta layer
    (to feed the GPTQ statistics through forward hooks) and the matching
    layer of the reference ``model`` (to produce the inputs of the next
    layer).  Every sub-layer is then trial-quantized for each candidate bit
    width and sparsity, and the candidate with the highest compression rate
    whose loss is ``<= tol`` is written back into the delta layer.  When no
    candidate meets the tolerance, the candidate with the lowest
    compression rate is used.

    Reads the module-level ``args`` namespace for ``nsamples``, ``sym``,
    ``trits``, ``percdamp``, ``groupsize`` and ``act_order``.

    Args:
        model: Reference model whose activations drive the calibration.
        delta_model: Model holding the delta weights; modified in place.
        dataloader: Iterable of calibration batches; ``batch[0]`` is fed to
            ``model``.
        dev: Device used for the per-layer computation.
        tol: Maximum per-layer loss accepted for a candidate.

    Returns:
        ``(quantizers, tuned_configs, masks)``: the chosen quantizer per
        layer name, the recorded losses plus the final ``'choice'`` per
        sub-layer, and the masks returned by the final ``fasterquant`` call
        (only entries that were not ``None``).
    """
    search_space = {
        "wbits": [2, 3, 4],
        "sparsities": [0.0, 0.33, 0.5, 0.67, 0.9, 0.95],
    }
    base_floats = 16
    masks = {}

    def config_key(wbit, sparsity):
        return f"wbit.{wbit}_sparsity.{sparsity}"

    # (wbit, sparsity, compression rate), highest compression first.  The
    # sort is stable, so ties keep the search-space order.
    candidates = [
        (wbit, sparsity, (base_floats / wbit) / (1 - sparsity))
        for wbit in search_space["wbits"]
        for sparsity in search_space["sparsities"]
    ]
    candidates.sort(key=lambda cand: cand[2], reverse=True)

    use_cache = model.config.use_cache
    model.config.use_cache = False
    decoder = model.model.decoder
    layers = decoder.layers
    delta_layers = delta_model.model.decoder.layers

    def move_io_modules(device):
        # Embeddings and optional projections are only needed while the
        # first-layer inputs are being captured.
        decoder.embed_tokens = decoder.embed_tokens.to(device)
        decoder.embed_positions = decoder.embed_positions.to(device)
        if hasattr(decoder, "project_out") and decoder.project_out:
            decoder.project_out = decoder.project_out.to(device)
        if hasattr(decoder, "project_in") and decoder.project_in:
            decoder.project_in = decoder.project_in.to(device)

    move_io_modules(dev)
    layers[0] = layers[0].to(dev)

    dtype = next(iter(model.parameters())).dtype
    inps = torch.zeros(
        (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
    )
    cache = {"i": 0, "attention_mask": None}

    class Catcher(nn.Module):
        """Records the first decoder layer's inputs, then aborts the forward."""

        def __init__(self, module):
            super().__init__()
            self.module = module

        def forward(self, inp, **kwargs):
            inps[cache["i"]] = inp
            cache["i"] += 1
            cache["attention_mask"] = kwargs["attention_mask"]
            raise ValueError

    layers[0] = Catcher(layers[0])
    for batch in dataloader:
        try:
            model(batch[0].to(dev))
        except ValueError:
            # Raised on purpose by Catcher once the inputs are stored.
            pass
    layers[0] = layers[0].module

    layers[0] = layers[0].cpu()
    move_io_modules("cpu")
    torch.cuda.empty_cache()

    outs = torch.zeros_like(inps)
    original_outs = torch.zeros_like(inps)
    attention_mask = cache["attention_mask"]

    print("Ready.")
    tuned_params = {}
    tuned_configs = {}
    quantizers = {}
    for i in range(len(delta_layers)):
        layer = delta_layers[i].to(dev)
        original_layer = layers[i].to(dev)
        subset = find_layers(layer)
        for name in subset:
            key = f"{i}_{name}"
            tuned_params[key] = {}
            tuned_configs[key] = {}
            for wbit in search_space["wbits"]:
                gptq = GPTQ(subset[name])
                gptq.quantizer = Quantizer()
                gptq.quantizer.configure(
                    wbit, perchannel=True, sym=args.sym, mse=False, trits=args.trits
                )
                tuned_params[key][f"wbit.{wbit}"] = {"gptq": gptq}

        def add_batch(name, layer_idx=i):
            # Feed the same activations to the GPTQ instance of every bit width.
            def tmp(_, inp, out):
                for wbit in search_space["wbits"]:
                    tuned_params[f"{layer_idx}_{name}"][f"wbit.{wbit}"]["gptq"].add_batch(
                        inp[0].data, out.data
                    )

            return tmp

        handles = [
            subset[name].register_forward_hook(add_batch(name)) for name in subset
        ]

        for j in range(args.nsamples):
            sample = inps[j].unsqueeze(0)
            outs[j] = layer(sample, attention_mask=attention_mask)[0]
            original_outs[j] = original_layer(sample, attention_mask=attention_mask)[0]

        for h in handles:
            h.remove()

        for name in subset:
            key = f"{i}_{name}"
            logger.info(f"Quantizing {i}.{name} ...")
            for wbit in search_space["wbits"]:
                losses, _ = tuned_params[key][f"wbit.{wbit}"]["gptq"].fasterquant(
                    percdamp=args.percdamp,
                    groupsize=args.groupsize,
                    actorder=args.act_order,
                    sparsity=search_space["sparsities"],
                    write=False,
                )
                for s_sity in losses.keys():
                    tuned_configs[key][config_key(wbit, s_sity)] = {
                        "loss": losses[s_sity].item()
                    }
                    logger.info(
                        f"wbit: {wbit}; sparsity: {s_sity}; loss: {losses[s_sity].item()}"
                    )

            # Within the tolerance, take the most aggressive compression;
            # otherwise fall back to the least aggressive candidate.
            best_wbit, best_sparsity = candidates[-1][0], candidates[-1][1]
            for wbit, sparsity, _rate in candidates:
                if tuned_configs[key][config_key(wbit, sparsity)]["loss"] <= tol:
                    best_wbit, best_sparsity = wbit, sparsity
                    break
            best_loss = tuned_configs[key][config_key(best_wbit, best_sparsity)]["loss"]

            # Redo the quantization with the chosen setting and write it
            # into the layer.
            logger.info(f"Applying wbit={best_wbit}, sparsity={best_sparsity} ...")
            best_gptq = tuned_params[key][f"wbit.{best_wbit}"]["gptq"]
            _, mask = best_gptq.fasterquant(
                percdamp=args.percdamp,
                groupsize=args.groupsize,
                actorder=args.act_order,
                write=True,
                sparsity=[best_sparsity],
            )
            if mask is not None:
                masks[key] = mask

            quantizers["model.decoder.layers.%d.%s" % (i, name)] = best_gptq.quantizer
            best_gptq.free()
            tuned_configs[key]["choice"] = {
                "best_wbit": best_wbit,
                "best_sparsity": best_sparsity,
                "best_loss": best_loss,
            }

        # Outputs of the now-quantized delta layer.  The reference outputs
        # computed before quantization are still valid and are not redone.
        for j in range(args.nsamples):
            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]

        # Put each layer back into its own model and off the device.
        delta_layers[i] = layer.cpu()
        layers[i] = original_layer.cpu()
        del layer, original_layer
        for name in subset:
            for wbit in search_space["wbits"]:
                del tuned_params[f"{i}_{name}"][f"wbit.{wbit}"]["gptq"]
        torch.cuda.empty_cache()

        # The reference outputs become the next layer's inputs.  Swapping
        # the two buffers keeps inps, outs and original_outs distinct, so
        # the next layer never overwrites the inputs it is still reading.
        inps, original_outs = original_outs, inps

    model.config.use_cache = use_cache

    return quantizers, tuned_configs, masks
inps[cache["i"]] = inp + cache["i"] += 1 + cache["attention_mask"] = kwargs["attention_mask"] + raise ValueError + + layers[0] = Catcher(layers[0]) + for i in range(nsamples): + batch = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)].to(dev) + try: + model(batch) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu() + model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu() + if hasattr(model.model.decoder, "project_out") and model.model.decoder.project_out: + model.model.decoder.project_out = model.model.decoder.project_out.cpu() + if hasattr(model.model.decoder, "project_in") and model.model.decoder.project_in: + model.model.decoder.project_in = model.model.decoder.project_in.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache["attention_mask"] + + for i in range(len(layers)): + # print(i) + layer = layers[i].to(dev) + for j in range(nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + layers[i] = layer.cpu() + del layer + torch.cuda.empty_cache() + inps, outs = outs, inps + + if model.model.decoder.final_layer_norm is not None: + model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to( + dev + ) + if model.model.decoder.project_out is not None: + model.model.decoder.project_out = model.model.decoder.project_out.to(dev) + model.lm_head = model.lm_head.to(dev) + + testenc = testenc.to(dev) + nlls = [] + for i in range(nsamples): + hidden_states = inps[i].unsqueeze(0) + if model.model.decoder.final_layer_norm is not None: + hidden_states = model.model.decoder.final_layer_norm(hidden_states) + if model.model.decoder.project_out is not None: + hidden_states = model.model.decoder.project_out(hidden_states) + lm_logits = model.lm_head(hidden_states) + shift_logits = lm_logits[:, :-1, :].contiguous() + shift_labels = testenc[:, (i 
def main(args):
    """Run delta auto-tuning, optional benchmarking, evaluation and saving.

    With ``--delta`` and ``--wbits < 16`` the fine-tuned model is turned
    into a delta against the base model, auto-tuned layer by layer with
    ``opt_sequential_delta``, and the base weights are added back before
    evaluation.  With ``--wbits 16`` the model is evaluated unchanged.

    Args:
        args: Parsed command-line namespace of this script.

    Raises:
        NotImplementedError: ``--wbits < 16`` was given without ``--delta``.
        ValueError: ``--save-delta`` was given but nothing is quantized.
        SystemExit: ``--load`` was given (the run stops before evaluation).
    """
    print(args)
    num_params = 0
    compress_delta = args.delta and args.wbits < 16

    # Reject unsupported flag combinations before loading any model.
    if args.wbits < 16 and not args.delta:
        raise NotImplementedError(
            "Quantization without --delta is not implemented in this script."
        )
    if args.save_delta and not compress_delta:
        raise ValueError("--save-delta requires --delta together with --wbits < 16.")

    model = get_opt(args.model)
    model.eval()
    dataloader = None

    if compress_delta:
        base_model = get_opt(args.base_model)
        base_model.eval()
        dataloader, testloader = get_loaders(
            args.dataset,
            nsamples=args.nsamples,
            seed=args.seed,
            model=args.model,
            seqlen=model.seqlen,
        )
        original_finetuned_model = copy.deepcopy(model)
        # Turn the fine-tuned weights into deltas against the base model.
        for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()):
            finetuned_p.data = (finetuned_p.data - base_p.data).clone()

        quantizers, tuned_params, masks = opt_sequential_delta(
            original_finetuned_model, model, dataloader, DEV, args.tol
        )
        data_dir = os.path.join(".cache", args.model.replace('/', '.'))
        os.makedirs(data_dir, exist_ok=True)
        params_path = os.path.join(data_dir, f"delta_tol={args.tol}_tuned_params.json")
        with open(params_path, "w") as f:
            json.dump(tuned_params, f)

        # Store every sparsity mask as a 1-bit bitmap next to the parameters.
        to_pil = T.ToPILImage()
        for key, mask in masks.items():
            logger.info(f"Saving mask for {key}")
            binmask = to_pil(mask).convert("1")
            binmask.save(os.path.join(data_dir, f"delta_tol={args.tol}_mask_{key}.bmp"))

        # Add the base weights back so the model can be evaluated.
        for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()):
            num_params += torch.numel(finetuned_p.data)
            finetuned_p.data = (base_p.data + finetuned_p.data).clone()

    if args.save_delta:
        # NOTE(review): packing runs after the base weights were added
        # back, so `model` holds full weights here, not deltas -- confirm
        # that this is what model_packing expects.
        new_weights, scale = model_packing(model, quantizers, bits=args.wbits)
        os.makedirs("delta_outputs", exist_ok=True)
        torch.save(
            {
                "weight": new_weights,
                "scale": scale,
            },
            f"delta_outputs/{args.model.replace('/', '.')}_delta_{args.wbits}bits_sparsify.{args.sparsify_hard_threshold}_{args.fraction_of_zero}foz",
        )

    if args.benchmark:
        model = model.to(DEV)
        if dataloader is None:
            # No calibration data was loaded on the non-delta path.
            dataloader, _ = get_loaders(
                args.dataset,
                nsamples=args.nsamples,
                seed=args.seed,
                model=args.model,
                seqlen=model.seqlen,
            )
        input_ids = next(iter(dataloader))[0][:, : args.benchmark]
        benchmark(model, input_ids, check=args.check)
    if args.load:
        raise SystemExit

    dataset = args.dataset
    dataloader, testloader = get_loaders(
        dataset, seed=args.seed, model=args.model, seqlen=model.seqlen
    )

    ppl = opt_eval(model, testloader, DEV)
    print(ppl)

    if args.save_hf:
        if args.delta:
            hf_path = f"outputs_exp/{args.model.replace('/', '.')}_delta_autotune_tol={args.tol}"
        else:
            hf_path = f"outputs_exp/{args.model.replace('/', '.')}_autotuned_tol={args.tol}"
        model.save_pretrained(hf_path)
        tokenizer = AutoTokenizer.from_pretrained(args.model)
        tokenizer.save_pretrained(hf_path)
@torch.no_grad()
def opt_eval(model, testenc, dev):
    """Return the perplexity of ``model`` on the token stream ``testenc``.

    The stream is cut into ``testenc.input_ids.numel() // model.seqlen``
    windows.  Decoder layers are moved to ``dev`` one at a time, run over
    every window and moved back to the CPU, so the whole model is never on
    the device at once.

    Args:
        model: Causal LM exposing ``model.model.decoder``, ``lm_head``,
            ``seqlen`` and ``config``.
        testenc: Object whose ``input_ids`` attribute is the token tensor.
        dev: Device used for the computation.

    Returns:
        The perplexity as a Python float.
    """
    print('Evaluating ...')

    token_ids = testenc.input_ids
    seqlen = model.seqlen
    nsamples = token_ids.numel() // seqlen
    decoder = model.model.decoder
    blocks = decoder.layers

    saved_use_cache = model.config.use_cache
    model.config.use_cache = False

    def relocate_io(target):
        # Embeddings and the optional projections surround the layer stack.
        decoder.embed_tokens = decoder.embed_tokens.to(target)
        decoder.embed_positions = decoder.embed_positions.to(target)
        for attr in ('project_out', 'project_in'):
            if hasattr(decoder, attr) and getattr(decoder, attr):
                setattr(decoder, attr, getattr(decoder, attr).to(target))

    relocate_io(dev)
    blocks[0] = blocks[0].to(dev)

    param_dtype = next(iter(model.parameters())).dtype
    hidden = torch.zeros(
        (nsamples, seqlen, model.config.hidden_size), dtype=param_dtype, device=dev
    )
    captured = {'i': 0, 'attention_mask': None}

    class Catcher(nn.Module):
        """Stores the first layer's input and stops the forward pass."""

        def __init__(self, module):
            super().__init__()
            self.module = module

        def forward(self, inp, **kwargs):
            hidden[captured['i']] = inp
            captured['i'] += 1
            captured['attention_mask'] = kwargs['attention_mask']
            raise ValueError

    blocks[0] = Catcher(blocks[0])
    for idx in range(nsamples):
        window = token_ids[:, idx * seqlen:(idx + 1) * seqlen].to(dev)
        try:
            model(window)
        except ValueError:
            # Raised on purpose by Catcher after the input was recorded.
            pass
    blocks[0] = blocks[0].module

    blocks[0] = blocks[0].cpu()
    relocate_io('cpu')
    torch.cuda.empty_cache()

    scratch = torch.zeros_like(hidden)
    attention_mask = captured['attention_mask']

    for idx in range(len(blocks)):
        print(idx)
        block = blocks[idx].to(dev)

        for j in range(nsamples):
            scratch[j] = block(hidden[j].unsqueeze(0), attention_mask=attention_mask)[0]
        blocks[idx] = block.cpu()
        del block
        torch.cuda.empty_cache()
        # This layer's outputs are the next layer's inputs.
        hidden, scratch = scratch, hidden

    final_norm_present = decoder.final_layer_norm is not None
    project_out_present = decoder.project_out is not None
    if final_norm_present:
        decoder.final_layer_norm = decoder.final_layer_norm.to(dev)
    if project_out_present:
        decoder.project_out = decoder.project_out.to(dev)
    model.lm_head = model.lm_head.to(dev)

    token_ids = token_ids.to(dev)
    nlls = []
    for idx in range(nsamples):
        states = hidden[idx].unsqueeze(0)
        if final_norm_present:
            states = decoder.final_layer_norm(states)
        if project_out_present:
            states = decoder.project_out(states)
        logits = model.lm_head(states)
        # Predict token t+1 from position t.
        shift_logits = logits[:, :-1, :].contiguous()
        shift_labels = token_ids[:, idx * seqlen:(idx + 1) * seqlen][:, 1:]
        criterion = nn.CrossEntropyLoss()
        loss = criterion(
            shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
        )
        nlls.append(loss.float() * seqlen)
    ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * seqlen))
    model.config.use_cache = saved_use_cache
    return ppl.item()
model.device) + res[model_name] = ppl + print(res) + with open("ppl_res.json", "w") as f: + json.dump(res, f) \ No newline at end of file diff --git a/pack_utils.py b/pack_utils.py new file mode 100644 index 0000000..5c6ace9 --- /dev/null +++ b/pack_utils.py @@ -0,0 +1,167 @@ +import math +import torch +import numpy as np +from typing import Any +from quant import Quantizer +from safetensors import safe_open +from safetensors.torch import save_file + +def pack_to_bits( + weight: torch.Tensor, + quantizer:Quantizer, + bits: int, + groupsize = 1024 + ): + if groupsize == -1: + groupsize = weight.shape[0] + if bits not in [2,3,4,8]: + raise ValueError("bits must be one of [2,3,4,8]") + scales = quantizer.scale.t().contiguous() + zeros = quantizer.zero.t().contiguous() + scale_zeros = zeros * scales + intweight = [] + for idx in range(weight.shape[0]): + g_idx = idx // groupsize + intweight.append(torch.round((weight[:,idx] + scale_zeros[g_idx]) / scales[g_idx]).to(torch.int)[:,None]) + + intweight = torch.cat(intweight, dim=1) + intweight = intweight.t().contiguous() + intweight = intweight.numpy().astype(np.uint32) + qweight = np.zeros( + (intweight.shape[0] // 256 * (bits * 8), intweight.shape[1]), dtype=np.uint32 + ) + i = 0 + row = 0 + while row < qweight.shape[0]: + if bits in [2,4,8]: + for j in range(i, i + (32//bits)): + qweight[row] |= intweight[j] << (bits * (j - i)) + i += 32//bits + row += 1 + + elif bits == 3: + for j in range(i, i + 10): + qweight[row] |= intweight[j] << (3 * (j - i)) + i += 10 + qweight[row] |= intweight[i] << 30 + row += 1 + qweight[row] |= (intweight[i] >> 2) & 1 + i += 1 + for j in range(i, i + 10): + qweight[row] |= intweight[j] << (3 * (j - i) + 1) + i += 10 + qweight[row] |= intweight[i] << 31 + row += 1 + qweight[row] |= (intweight[i] >> 1) & 0x3 + i += 1 + for j in range(i, i + 10): + qweight[row] |= intweight[j] << (3 * (j - i) + 2) + i += 10 + row += 1 + + qweight = qweight.astype(np.int32) + qweight = 
class SparseTensor():
    """A dense tensor that is stored on disk as (indices, values, size).

    Only the non-zero entries of the flattened tensor are written; the
    original shape is stored next to them so the dense tensor can be
    rebuilt.
    """

    def __init__(self, m: torch.Tensor, format: str, minifloats: int=-1) -> None:
        self.m = m
        self.size = m.size()
        # Stored but not used by any method of this class.
        self.minifloats = minifloats
        self.format = format

    @staticmethod
    def _densify(indices: torch.Tensor, values: torch.Tensor, size) -> torch.Tensor:
        """Scatter ``values`` at flat ``indices`` into a zero tensor of ``size``."""
        dense = torch.zeros(math.prod(size), dtype=values.dtype, device=values.device)
        dense[indices] = values
        return dense.reshape(size)

    def _convert(self):
        """Build ``self.payload`` from the non-zero entries of ``self.m``.

        ``self.m`` itself is left untouched, so ``tensor`` keeps its shape
        after ``to_disk``.
        """
        flat = self.m.flatten()
        # Positions of the non-zero elements, one row per element.
        indices = torch.nonzero(flat)
        values = flat[indices]
        self.payload = {
            'indices': indices,
            'values': values,
            'size': torch.tensor(self.size),
        }

    def restore(self):
        """Rebuild ``self.m`` from ``self.payload``.

        Raises:
            AttributeError: ``_convert`` (or ``to_disk``) has not been
                called yet, so there is no payload.
        """
        self.m = self._densify(
            self.payload['indices'], self.payload['values'], self.size
        )

    def to_disk(self, path):
        """Write the sparse payload to ``path`` with ``save_file``."""
        self._convert()
        save_file(self.payload, path)

    @classmethod
    def from_disk(cls, path):
        """Load a payload written by ``to_disk`` and return the dense tensor wrapped."""
        with safe_open(path, framework='pt', device='cpu') as f:
            tensors = {key: f.get_tensor(key) for key in f.keys()}
        # The shape is stored as a tensor; turn it into plain ints first.
        size = tensors['size'].tolist()
        m = cls._densify(tensors['indices'], tensors['values'], size)
        return cls(m, 'sparse', minifloats=-1)

    @property
    def tensor(self):
        """The dense tensor currently held by this object."""
        return self.m
def unpacking_indices(packed_indices, bit_width=None):
    """Decode byte-packed, big-endian bit rows back into integer indices.

    ``np.packbits`` pads every row with zero bits up to a whole number of
    bytes.  Decoding all unpacked bits therefore yields the index shifted
    left by the number of padding bits whenever the original width is not a
    multiple of 8.  Pass ``bit_width`` (the width used when packing) to
    drop the padding and recover the original indices.

    Args:
        packed_indices: 2-D uint8 array or tensor, one packed index per row.
        bit_width: Number of leading bits per row that carry the index.
            ``None`` decodes every unpacked bit, which matches the previous
            behaviour.

    Returns:
        1-D int64 tensor with one decoded index per row.

    Raises:
        ValueError: ``bit_width`` is not between 1 and the unpacked width.
    """
    bits = np.unpackbits(np.asarray(packed_indices), axis=1)
    if bit_width is not None:
        if not 0 < bit_width <= bits.shape[1]:
            raise ValueError(
                f"bit_width must be in [1, {bits.shape[1]}], got {bit_width}"
            )
        bits = bits[:, :bit_width]
    # Most significant bit first: weight of column k is 2 ** (width - 1 - k).
    weights = 1 << np.arange(bits.shape[1] - 1, -1, -1, dtype=np.int64)
    return torch.from_numpy(bits.astype(np.int64) @ weights)
dtype=torch.float16) + torch.save(x, ".io/x.pt") + x = hard_threshold(x, nonsparsity) + # 10% x 4M indices -> 800k on disk + packed_indices = packing_indices(x) + print(packed_indices.shape) + print(packed_indices.shape) + print(packed_indices.dtype) + torch.save(packed_indices, ".io/packed_indices.pt") + unpacked_indices = unpacking_indices(packed_indices) + + compression_rate_calc(2048*2048, 3, 0.1) \ No newline at end of file diff --git a/ppl_res.json b/ppl_res.json new file mode 100644 index 0000000..f86c4f0 --- /dev/null +++ b/ppl_res.json @@ -0,0 +1 @@ +{"fact_verification": 7.487515449523926} \ No newline at end of file diff --git a/quant.py b/quant.py index f8cc1b7..386845c 100644 --- a/quant.py +++ b/quant.py @@ -1,6 +1,6 @@ import math -import numpy as np import torch +import numpy as np import torch.nn as nn def quantize(x, scale, zero, maxq): @@ -83,7 +83,12 @@ def find_params(self, x, weight=False): xmax1 = p * xmax scale1 = (xmax1 - xmin1) / self.maxq zero1 = torch.round(-xmin1 / scale1) if not self.sym else self.zero - q = quantize(x, scale1.unsqueeze(1), zero1.unsqueeze(1), self.maxq) + q = quantize( + x, + scale1.unsqueeze(1), + zero1.unsqueeze(1), + self.maxq + ) q -= x q.abs_() q.pow_(self.norm) @@ -287,7 +292,7 @@ def pack(self, linear, scales, zeros): raise NotImplementedError("Only 2,3,4,8 bits are supported.") qweight = qweight.astype(np.int32) - self.qweight = torch.from_numpy(qweight) + self.qweight = torch.from_numpy(qweight) zeros -= 1; zeros = zeros.numpy().astype(np.uint32) diff --git a/quant_cuda.cpp b/quant_cuda.cpp index 1bf0894..ff97571 100644 --- a/quant_cuda.cpp +++ b/quant_cuda.cpp @@ -10,7 +10,7 @@ void vecquant3matmul_cuda( void vecquant3matmul_faster_cuda( torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor scales, torch::Tensor zeros -); +); void vecquant3matmul( torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, diff --git a/replay.py b/replay.py new file mode 100644 index 0000000..e69de29 diff --git 
a/requirements.txt b/requirements.txt index 7417000..79d456c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ transformers loguru -datasets \ No newline at end of file +datasets +safetensors \ No newline at end of file diff --git a/scripts/gptq_delta.sh b/scripts/gptq_delta.sh new file mode 100644 index 0000000..ec3d1f2 --- /dev/null +++ b/scripts/gptq_delta.sh @@ -0,0 +1,10 @@ +python opt_delta_autotune.py \ + --dataset answer_verification \ + --base-model facebook/opt-1.3b \ + --model .cache/models/answer_verification \ + --delta \ + --wbits 2 \ + --tol 2 \ + --save-delta \ + --save-hf \ + --groupsize 1024 diff --git a/scripts/lr_quant.sh b/scripts/lr_quant.sh index e462a64..2665e7b 100644 --- a/scripts/lr_quant.sh +++ b/scripts/lr_quant.sh @@ -3,27 +3,7 @@ python cli.py \ --target-model lnair/opt-1.3b-wikitext2 \ --base-model facebook/opt-1.3b \ --delta \ - --rank 32 \ - --save outputs/ \ - --nsamples 128 \ - --wbits 8 - -python cli.py \ - --dataset wikitext2 \ - --target-model lnair/opt-1.3b-wikitext2 \ - --base-model facebook/opt-1.3b \ - --delta \ - --rank 32 \ - --save outputs/ \ - --nsamples 128 \ - --wbits 4 - -python cli.py \ - --dataset wikitext2 \ - --target-model lnair/opt-1.3b-wikitext2 \ - --base-model facebook/opt-1.3b \ - --delta \ - --rank 64 \ + --rank 16 \ --save outputs/ \ --nsamples 128 \ --wbits 8 @@ -36,4 +16,5 @@ python cli.py \ --rank 16 \ --save outputs/ \ --nsamples 128 \ + --decompose-only \ --wbits 8 \ No newline at end of file diff --git a/scripts/lr_quant_2.sh b/scripts/lr_quant_2.sh new file mode 100644 index 0000000..2250e30 --- /dev/null +++ b/scripts/lr_quant_2.sh @@ -0,0 +1,20 @@ +python cli.py \ + --dataset wikitext2 \ + --target-model lnair/opt-1.3b-wikitext2 \ + --base-model facebook/opt-1.3b \ + --delta \ + --rank 32 \ + --save outputs/ \ + --nsamples 128 \ + --decompose-only \ + --wbits 8 + +python cli.py \ + --dataset wikitext2 \ + --target-model lnair/opt-1.3b-wikitext2 \ + --base-model 
facebook/opt-1.3b \ + --delta \ + --rank 32 \ + --save outputs/ \ + --nsamples 128 \ + --wbits 8 \ No newline at end of file diff --git a/scripts/lr_quant_350m.sh b/scripts/lr_quant_350m.sh new file mode 100644 index 0000000..4710196 --- /dev/null +++ b/scripts/lr_quant_350m.sh @@ -0,0 +1,10 @@ +python cli.py \ + --dataset wikitext2 \ + --target-model lnair/opt-350m-wikitext2 \ + --base-model facebook/opt-350m \ + --delta \ + --rank 32 \ + --save outputs/ \ + --nsamples 128 \ + --decompose-only \ + --wbits 8 diff --git a/scripts/opt_delta_exp.sh b/scripts/opt_delta_exp.sh new file mode 100644 index 0000000..2b41218 --- /dev/null +++ b/scripts/opt_delta_exp.sh @@ -0,0 +1,10 @@ +python opt_delta_autotune.py \ + --dataset wikitext2 \ + --base-model facebook/opt-1.3b \ + --model lnair/opt-1.3b-wikitext2 \ + --delta \ + --wbits 2 \ + --tol 2 \ + --save-delta \ + --save-hf \ + --groupsize 1024 \ No newline at end of file diff --git a/scripts/playground.ipynb b/scripts/playground.ipynb index 88e2175..86c66d6 100644 --- a/scripts/playground.ipynb +++ b/scripts/playground.ipynb @@ -1,151 +1,11 @@ { "cells": [ - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "seed=42\n", - "target_model_name = \"lnair/opt-1.3b-wikitext2\"\n", - "base_model_name = \"facebook/opt-1.3b\"\n", - "n_samples = 128\n", - "dataset = 'wikitext2'\n", - "import sys\n", - "sys.path.append(\"..\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/xiayao/miniconda3/envs/fmzip/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - }, - { - "data": { - "text/plain": [ - "OPTForCausalLM(\n", - " (model): OPTModel(\n", - " (decoder): OPTDecoder(\n", - " (embed_tokens): Embedding(50272, 2048, padding_idx=1)\n", - " (embed_positions): OPTLearnedPositionalEmbedding(2050, 2048)\n", - " (final_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n", - " (layers): ModuleList(\n", - " (0-23): 24 x OPTDecoderLayer(\n", - " (self_attn): OPTAttention(\n", - " (k_proj): Linear(in_features=2048, out_features=2048, bias=True)\n", - " (v_proj): Linear(in_features=2048, out_features=2048, bias=True)\n", - " (q_proj): Linear(in_features=2048, out_features=2048, bias=True)\n", - " (out_proj): Linear(in_features=2048, out_features=2048, bias=True)\n", - " )\n", - " (activation_fn): ReLU()\n", - " (self_attn_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n", - " (fc1): Linear(in_features=2048, out_features=8192, bias=True)\n", - " (fc2): Linear(in_features=8192, out_features=2048, bias=True)\n", - " (final_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n", - " )\n", - " )\n", - " )\n", - " )\n", - " (lm_head): Linear(in_features=2048, out_features=50272, bias=False)\n", - ")" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from modelutils import get_opt\n", - "base_model = get_opt(base_model_name)\n", - "target_model = get_opt(target_model_name)\n", - "base_model.to('cuda')\n", - "target_model.to('cuda')\n", - "base_model.eval()\n", - "target_model.eval()" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Found cached dataset wikitext 
(/home/xiayao/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)\n", - "Found cached dataset wikitext (/home/xiayao/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)\n" - ] - } - ], - "source": [ - "from datautils import get_loaders\n", - "trainloader, loader_enc = get_loaders(\n", - " dataset,\n", - " nsamples = n_samples,\n", - " seed=seed,\n", - " model=target_model_name,\n", - " seqlen=base_model.seqlen,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from cli import quantize_with_lowrank\n", - "r_quantizer, l_quantizer, lr_tensors = quantize_with_lowrank(\n", - " base_model,\n", - " target_model,\n", - " trainloader,\n", - " 32\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click here for more info. View Jupyter log for further details." 
- ] - } - ], - "source": [ - "import torch\n", - "from safetensors import safe_open\n", - "from safetensors.torch import save_file\n", - "\n", - "# iterate over all keys in lr_tensors\n", - "for k in lr_tensors.keys():\n", - " lr_tensors[k] = lr_tensors[k].contiguous() # make sure they are contiguous\n", - "# save them to a file\n", - "\n", - "save_file(lr_tensors, \"model.safetensors\")" - ] } ], "metadata": { diff --git a/submit.py b/submit.py new file mode 100644 index 0000000..1830099 --- /dev/null +++ b/submit.py @@ -0,0 +1,18 @@ +import os +model_relations = { + #'facebook/opt-350m': ['lnair/opt-350m-wikitext2'], + # 'facebook/opt-1.3b': ['lnair/opt-1.3b-wikitext2'], + # 'facebook/opt-2.7b': ['lnair/opt-2.7b-wikitext2'], + 'facebook/opt-6.7b': ['mit-han-lab/opt-6.7b-smoothquant'], + 'facebook/opt-1.3b': ['facebook/opt-iml-1.3b', 'facebook/opt-iml-max-1.3b', 'mit-han-lab/opt-1.3b-smoothquant', 'pszemraj/opt-peter-1.3B', 'opentensor/bt-opt-1.3b'] +} + +tols = [4.5, 5, 6.5, 8.0] + +os.system("ts -S 7") + +for model in model_relations.keys(): + for target_model in model_relations[model]: + for tol in tols: + cmd = f"TS_VISIBLE_DEVICES=0,2,3,4,5,6,7 ts --gpus 1 python opt_delta_autotune.py --dataset wikitext2 --delta --tol {tol} --model {target_model} --base-model {model} --save-hf --groupsize 1024" + os.system(cmd) \ No newline at end of file diff --git a/tensorio.py b/tensorio.py new file mode 100644 index 0000000..c5526f8 --- /dev/null +++ b/tensorio.py @@ -0,0 +1,55 @@ +import math +import torch +from safetensors import safe_open +from safetensors.torch import save_model +from modelutils import find_layers +from pack_utils import pack_to_bits +from compress_utils import compress_flexible_nbits, decompress_flexible_nbits + +class TensorIO(): + def __init__(self, format: str, tensors=None) -> None: + self.format = format + if tensors is None: + self.tensors = {} + else: + self.tensors = tensors + + def add_tensor(self, idx, tensor): + tensor = tensor.flatten() + 
# assume that the tensor is sparse + indices = torch.nonzero(tensor) + values = tensor[indices] + self.tensors[f"{idx}_indices"] = indices + self.tensors[f"{idx}_values"] = values + self.tensors[f"{idx}_size"] = torch.tensor(tensor.size()) + + def to_disk(self, path): + torch.save(self.tensors, path) + + @classmethod + def from_disk(cls, path): + tensors = {} + with safe_open(path, framework='pt', device='cpu') as f: + for key in f.keys(): + tensors[key] = f.get_tensor(key) + # restore the tensors + for key in tensors.keys(): + m = torch.zeros(math.prod(tensors[f"{key}_size"]), dtype=tensors[f'{key}_values'].dtype) + + m[tensors[f"{key}_indices"]] = tensors[f"{key}_values"] + tensors[f"{key}_size"] = tensors[f"{key}_size"].tolist() + m = m.reshape(tensors[f"{key}_size"]) + + tensors[key] = m + return cls('sparse', tensors=tensors) + +def model_packing(model, quantizers, bits, reformat='none'): + layers = find_layers(model) + layers = {n: layers[n] for n in quantizers} + qlayers = find_layers(model, ) + print('Packing ...') + for name in qlayers: + if name in quantizers: + quantizers[name] = quantizers[name].cpu() + x, scale = compress_flexible_nbits(layers[name].weight.data.cuda(), bits) + return x, scale \ No newline at end of file diff --git a/to_hf.py b/to_hf.py index efd25d2..033eb79 100644 --- a/to_hf.py +++ b/to_hf.py @@ -10,8 +10,8 @@ base_model = get_opt(base_model_name) target_model = deepcopy(base_model) - -tensors = load_lr_tensors("outputs/model.safetensors") +MODEL_ID = "lnair.opt-1.3b-wikitext2-r32-w8-decompose.True-lr" +tensors = load_lr_tensors(f"outputs/{MODEL_ID}.safetensors") target_layers = target_model.model.decoder.layers @@ -26,6 +26,6 @@ layer[layer_id].weight.data = new_weight # save target model as HF -target_model.save_pretrained("outputs/lnair-opt-1.3b-wikitext2-r32-w8") +target_model.save_pretrained(f"outputs/{MODEL_ID}") tokenizer = AutoTokenizer.from_pretrained(base_model_name) 
-tokenizer.save_pretrained("outputs/lnair-opt-1.3b-wikitext2-r32-w8") \ No newline at end of file +tokenizer.save_pretrained(f"outputs/{MODEL_ID}") \ No newline at end of file diff --git a/utilities/analyze.py b/utilities/analyze.py new file mode 100644 index 0000000..57c69d1 --- /dev/null +++ b/utilities/analyze.py @@ -0,0 +1,16 @@ +import json +import matplotlib.pyplot as plt + +with open(".cache/lnair.opt-350m-wikitext2_delta_tol=2.0.json", "r") as fp: + data = json.load(fp) + +all_best_losses = [] +for layer_name in data.keys(): + best_loss = data[layer_name]['choice']['best_loss'] + all_best_losses.append(best_loss) + if (best_loss > 100): + print(f"{layer_name} large loss!") +print(all_best_losses) +# plot a histogram of the best losses +plt.hist(all_best_losses, bins=100) +plt.savefig('.cache/lnair.opt-350m-wikitext2_delta_tol=2.0.png') \ No newline at end of file diff --git a/utilities/compression_rate_estimator.py b/utilities/compression_rate_estimator.py new file mode 100644 index 0000000..2b8a564 --- /dev/null +++ b/utilities/compression_rate_estimator.py @@ -0,0 +1,49 @@ +import json +import math +from modelutils import get_opt, find_layers +from compression_scripts.model_utils import get_opt, find_layers + +base_floats = 16 + +def calc_compression(path: str, base_model: str): + base_model = get_opt(base_model) + with open(path, "r") as f: + data = json.load(f) + + base_layers = base_model.model.decoder.layers + + total_original_bits = 0 + total_used_bits = 0 + sparsity_lists = [] + total_stats = {} + for i in range(len(base_layers)): + layer = base_layers[i] + subset = find_layers(layer) + for name in subset: + original_weight = subset[name].weight.data + original_weight_count = original_weight.numel() + total_original_bits += original_weight_count * base_floats + if f"{i}_{name}" in data: + config = data[f"{i}_{name}"]["choice"] + # save them as indices + values pair + nonzeros = (1-config["best_sparsity"]) * original_weight_count + # to store values 
+ used_bits = nonzeros * config["best_wbit"] + # to store indices + used_bits += nonzeros * 2 * math.log2(original_weight_count) * 8 + + sparsity_lists.append(config["best_sparsity"]) + + total_used_bits += used_bits + else: + raise ValueError(f"Layer {i}_{name} not found in {path}") + + total_stats['compresion_rate'] = total_original_bits / total_used_bits + total_stats['sparsity'] = sum(sparsity_lists) / len(sparsity_lists) + return total_stats + +if __name__=="__main__": + path = ".cache/lnair.opt-1.3b-wikitext2_delta_tol=1.0.json" + base_model = "facebook/opt-1.3b" + stats = calc_compression(path, base_model) + print(stats) \ No newline at end of file diff --git a/utilities/convert_to_hf.py b/utilities/convert_to_hf.py index 8111506..36f7262 100644 --- a/utilities/convert_to_hf.py +++ b/utilities/convert_to_hf.py @@ -1,16 +1,12 @@ +import os import torch import torch.nn as nn from transformers import GPTJForCausalLM - from transformers import AutoConfig, AutoTokenizer - from transformers.modeling_utils import no_init_weights -import os - def create_emtpy_gptj(config): - import torch import torch.nn as nn diff --git a/utilities/cr_cal.py b/utilities/cr_cal.py new file mode 100644 index 0000000..e69de29 diff --git a/utilities/to_csv.py b/utilities/to_csv.py new file mode 100644 index 0000000..db809a2 --- /dev/null +++ b/utilities/to_csv.py @@ -0,0 +1,19 @@ +import json +import pandas as pd +with open('ppl_res.json') as f: + res = json.load(f) +# convert to csv +sizes_group = ['350m', '1.3b', '2.7b'] +results = [] +for key in res.keys(): + results.append({ + 'model': key, + 'perplexity': res[key], + }) +df = pd.DataFrame(results) + +# pivot table such that columns is different models, rows is different perplexity +for size in sizes_group: + subdf = df[df['model'].str.contains(size)] + subdf = subdf.pivot_table(values='perplexity', columns='model') + subdf.to_csv(f'ppl_res_{size}.csv', index=False) \ No newline at end of file diff --git 
a/utilities/tuning_analyser.py b/utilities/tuning_analyser.py new file mode 100644 index 0000000..a7a658b --- /dev/null +++ b/utilities/tuning_analyser.py @@ -0,0 +1,6 @@ +import json +path = ".cache/lnair.opt-350m-wikitext2_delta_tol=0.2.json" + +with open(path, "r") as fp: + data = json.load(fp) +