// neural_speed/models/llama/llama_utils.cpp

//  Copyright (c) 2023 Intel Corporation
//
//  Licensed under the Apache License, Version 2.0 (the "License");
//  you may not use this file except in compliance with the License.
//  You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
//  Unless required by applicable law or agreed to in writing, software
//  distributed under the License is distributed on an "AS IS" BASIS,
//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//  See the License for the specific language governing permissions and
//  limitations under the License.
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <cassert>
#include <cinttypes>
#include <cstring>
#include <exception>
#include <fstream>
#include <iterator>
#include <memory>
#include <random>
#include <string>
#include <unordered_map>
#include <vector>

#include "core/data_types.h"
#include "core/ne.h"
#include "core/ne_layers.h"
#include "models/llama/llama.h"
#include "models/model_utils/model_utils.h"
#include "models/model_utils/model_config.h"
#include "models/model_utils/model_files.h"
#include "models/model_utils/model_types.h"
#include "models/model_utils/quant_utils.h"
#include "models/model_utils/util.h"
#include "models/models.h"

void model_load_internal(const std::string& fname, model_archs arch, model_context* ctx, int n_gpu_layers,
                         bool use_mmap, bool use_mlock, bool vocab_only, model_progress_callback progress_callback,
                         void* progress_callback_user_data) {
  std::unique_ptr<Llama> ms(new Llama());
  ms->init(fname.c_str(), ctx, n_gpu_layers, use_mmap, use_mlock, vocab_only);
  ms->load(ctx, progress_callback, progress_callback_user_data);

  model_context& lctx = *ctx;
  lctx.support_bestla_kv = true;
}

void Llama::init(const char* path_model, model_context* ctx, int n_gpu_layer_, bool use_mmap_, bool use_mlock_,
                 bool vocab_only_) {
  model_context& lctx = *ctx;
  n_gpu_layer = n_gpu_layer_;
  use_mmap = use_mmap_;
  use_mlock = use_mlock_;
  vocab_only = vocab_only_;
  auto& model = lctx.model;
  ml.reset(new model_model_loader(path_model, use_mmap, vocab_only));
  lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
  model.hparams = ml->file_loaders.at(0)->hparams;
  model_file_version file_version = ml->file_loaders.at(0)->file_version;
  auto& hparams = model.hparams;
  n_ff = hparams.ffn_hidden_size;
  fprintf(stderr, "%s: n_vocab    = %u\n", __func__, hparams.n_vocab);
  fprintf(stderr, "%s: n_ctx      = %u\n", __func__, lctx.n_ctx);
  fprintf(stderr, "%s: n_embd     = %u\n", __func__, hparams.n_embd);
  fprintf(stderr, "%s: n_mult     = %u\n", __func__, hparams.n_mult);
  fprintf(stderr, "%s: n_head     = %u\n", __func__, hparams.n_head);
  fprintf(stderr, "%s: n_head_kv  = %u\n", __func__, hparams.n_head_kv);
  fprintf(stderr, "%s: n_layer    = %u\n", __func__, hparams.n_layer);
  fprintf(stderr, "%s: n_rot      = %u\n", __func__, hparams.n_rot);
  fprintf(stderr, "%s: n_ff       = %u\n", __func__, n_ff);
  fprintf(stderr, "%s: n_parts    = %zu\n", __func__, ml->file_loaders.size());
  n_embd = hparams.n_embd;
  n_vocab = hparams.n_vocab;
  n_layer = hparams.n_layer;
  n_head_kv = hparams.n_head_kv;
  n_head = hparams.n_head;
  n_expert = hparams.n_experts;
  n_expert_used = hparams.n_experts_used;
  scratch = llama_mem_req(n_layer, lctx.scratch_size_ratio);
  model.scratchs = scratch;
}

#define MODEL_BACKEND_OFFLOAD NE_BACKEND_CPU
// Creates the ne context for the model, binds every LLaMA weight tensor to it and reads the tensor data.
//
// Steps:
//   1. size the host and device buffers from the loader's reported context size and the CPU/GPU layer split;
//   2. initialise the ne context on the host buffer;
//   3. look up each tensor by name - GGUF names when "token_embd.weight" is present, NE names otherwise - handling
//      both dense and mixture-of-experts feed-forward layouts;
//   4. assert the accumulated byte totals against the reserved sizes;
//   5. load all tensor data and take over the loader's mapping.
//
// ctx                          model context whose `model` member is populated
// progress_callback            optional callback; forwarded to the loader and invoked with 1.0f once loading is done
// progress_callback_user_data  opaque pointer passed back to progress_callback
//
// Throws the value produced by format() when ne_init() fails.
void Llama::load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) {
  model_context& lctx = *ctx;
  auto& model = lctx.model;
  auto& ne_ctx = model.ctx;  // reference: observes the context assigned to model.ctx further down
  size_t ctx_size;
  size_t mmapped_size;
  ml->calc_sizes(&ctx_size, &mmapped_size);
  int n_cpu_layer = n_layer - n_gpu_layer;
  n_cpu_layer = n_cpu_layer < 0 ? 0 : n_cpu_layer;
  fprintf(stderr, "%s: ctx size   = %7.2f MB\n", __func__, ctx_size / 1024.0 / 1024.0);
  // sizeof(float) leads the embedding term so the whole product is evaluated in size_t instead of in the
  // (possibly narrower) type of n_embd / n_vocab.
  auto host_size =
      (ctx_size + (50 << 20)) * n_cpu_layer / n_layer + sizeof(float) * n_embd * n_vocab;  // embedding on CPU
  auto device_size = (ctx_size + (50 << 20)) * n_gpu_layer / n_layer + (50 << 20);
  fprintf(stderr, "%s: host ctx size   = %7.2f MB\n", __func__, host_size / 1024.0 / 1024.0);
#ifdef NS_SYCL
  fprintf(stderr, "%s: device ctx size   = %7.2f MB\n", __func__, device_size / 1024.0 / 1024.0);
#endif
  // create the ne context
  lctx.model.buf.resize(host_size);
  if (use_mlock) {
    lctx.model.mlock_buf.init(lctx.model.buf.addr);
    lctx.model.mlock_buf.grow_to(lctx.model.buf.size);
  }
  model_alloc_sycl_mem(lctx.dev_ctx, device_size);

  struct ne_init_params params = {
      /*.mem_size   =*/lctx.model.buf.size,
      /*.mem_buffer =*/lctx.model.buf.addr,
      /*.no_alloc   =*/ml->use_mmap,
  };

  model.ctx = ne_init(params);
  if (!model.ctx) {
    throw format("ne_init() failed");
  }
  ne_ctx->dev_ctx = ctx->dev_ctx;
  ml->ne_ctx = ne_ctx;

  const int i_gpu_start = n_layer - n_gpu_layer;  // index of the first layer placed on the offload backend
  model.layers.resize(n_layer);
  size_t vram_total = 0;
  size_t device_total = 0;

  // Total bytes of one layer's weights. Mixture-of-experts layers never assign layer.ffn[], so their feed-forward
  // share is the router tensor plus every expert's gate/down/up tensors rather than layer.ffn[0..2].
  const auto layer_nbytes = [this](const auto& layer, bool dense_ffn) -> size_t {
    size_t total = ne_nbytes(layer.norm[0]) + ne_nbytes(layer.attn[0]) + ne_nbytes(layer.attn[1]) +
                   ne_nbytes(layer.attn[2]) + ne_nbytes(layer.attn[3]) + ne_nbytes(layer.norm[1]);
    if (dense_ffn) {
      total += ne_nbytes(layer.ffn[0]) + ne_nbytes(layer.ffn[1]) + ne_nbytes(layer.ffn[2]);
    } else {
      total += ne_nbytes(layer.ffn_gate_inp);
      for (uint32_t x = 0; x < n_expert; ++x) {
        total += ne_nbytes(layer.ffn_gate_exp[x]) + ne_nbytes(layer.ffn_down_exp[x]) + ne_nbytes(layer.ffn_up_exp[x]);
      }
    }
    return total;
  };

  if (ml->verify_tensor("token_embd.weight")) {  // GGUF
    model.others[0] = ml->get_tensor("token_embd.weight", {n_embd, n_vocab}, NE_BACKEND_CPU);
    model.others[1] = ml->get_tensor("output_norm.weight", {n_embd}, NE_BACKEND_CPU);
    model.others[2] = ml->get_tensor("output.weight", {n_embd, n_vocab},
                                     n_gpu_layer > static_cast<int>(n_layer) ? MODEL_BACKEND_OFFLOAD : NE_BACKEND_CPU);

    for (uint32_t i = 0; i < n_layer; ++i) {
      const ne_backend backend = static_cast<int>(i) < i_gpu_start ? NE_BACKEND_CPU : MODEL_BACKEND_OFFLOAD;
      auto& layer = model.layers[i];
      std::string layers_i = "blk." + std::to_string(i);

      // attention norm
      layer.norm[0] = ml->get_tensor(layers_i + ".attn_norm.weight", {n_embd}, backend);

      // qkv GEMM
      layer.attn[0] = ml->get_tensor(layers_i + ".attn_q.weight", {n_embd, n_embd}, backend);
      layer.attn[1] = ml->get_tensor(layers_i + ".attn_k.weight", {n_embd, n_embd / (n_head / n_head_kv)}, backend);
      layer.attn[2] = ml->get_tensor(layers_i + ".attn_v.weight", {n_embd, n_embd / (n_head / n_head_kv)}, backend);
      layer.attn[3] = ml->get_tensor(layers_i + ".attn_output.weight", {n_embd, n_embd}, backend);

      // ffn norm
      layer.norm[1] = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);

      // ffn GEMM: a plain gate tensor marks a dense layer, otherwise per-expert tensors are expected
      const bool dense_ffn = ml->verify_tensor(layers_i + ".ffn_gate.weight");
      if (dense_ffn) {
        NE_ASSERT(n_expert == 0);
        NE_ASSERT(n_expert_used == 0);
        layer.ffn[0] = ml->get_tensor(layers_i + ".ffn_gate.weight", {n_embd, n_ff}, backend);
        layer.ffn[1] = ml->get_tensor(layers_i + ".ffn_down.weight", {n_ff, n_embd}, backend);
        layer.ffn[2] = ml->get_tensor(layers_i + ".ffn_up.weight", {n_embd, n_ff}, backend);
      } else {
        NE_ASSERT(n_expert > 0);
        NE_ASSERT(n_expert_used > 0);
        layer.ffn_gate_inp = ml->get_tensor(layers_i + ".ffn_gate_inp.weight", {n_embd, n_expert}, backend);
        for (uint32_t x = 0; x < n_expert; ++x) {
          layer.ffn_gate_exp[x] =
              ml->get_tensor(layers_i + ".ffn_gate." + std::to_string(x) + ".weight", {n_embd, n_ff}, backend);
          layer.ffn_down_exp[x] =
              ml->get_tensor(layers_i + ".ffn_down." + std::to_string(x) + ".weight", {n_ff, n_embd}, backend);
          layer.ffn_up_exp[x] =
              ml->get_tensor(layers_i + ".ffn_up." + std::to_string(x) + ".weight", {n_embd, n_ff}, backend);
        }
      }

      // Only reached if MODEL_BACKEND_OFFLOAD is redefined to something other than NE_BACKEND_CPU.
      if (backend != NE_BACKEND_CPU) {
        vram_total += layer_nbytes(layer, dense_ffn);
      }
    }
  } else {  // NE format
    model.others[0] = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, NE_BACKEND_CPU);
    // Same `> 0` test as output.weight so a negative n_gpu_layer keeps both tensors on the CPU.
    model.others[1] = ml->get_tensor("norm.weight", {n_embd}, n_gpu_layer > 0 ? NE_BACKEND_SYCL : NE_BACKEND_CPU);
    model.others[2] =
        ml->get_tensor("output.weight", {n_embd, n_vocab}, n_gpu_layer > 0 ? NE_BACKEND_SYCL : NE_BACKEND_CPU);

    for (uint32_t i = 0; i < n_layer; ++i) {
      const ne_backend backend = static_cast<int>(i) < i_gpu_start ? NE_BACKEND_CPU : NE_BACKEND_SYCL;
      auto& layer = model.layers[i];
      std::string layers_i = "layers." + std::to_string(i);

      // attention norm
      layer.norm[0] = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);

      // qkv GEMM
      layer.attn[0] = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
      layer.attn[1] =
          ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd / (n_head / n_head_kv)}, backend);
      layer.attn[2] =
          ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd / (n_head / n_head_kv)}, backend);
      layer.attn[3] = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);

      // ffn norm
      layer.norm[1] = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);

      // ffn GEMM: a plain w1 tensor marks a dense layer, otherwise per-expert tensors are expected
      const bool dense_ffn = ml->verify_tensor(layers_i + ".feed_forward.w1.weight");
      if (dense_ffn) {
        NE_ASSERT(n_expert == 0);
        NE_ASSERT(n_expert_used == 0);
        layer.ffn[0] = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
        layer.ffn[1] = ml->get_tensor(layers_i + ".feed_forward.w2.weight", {n_ff, n_embd}, backend);
        layer.ffn[2] = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
      } else {
        NE_ASSERT(n_expert > 0);
        NE_ASSERT(n_expert_used > 0);
        layer.ffn_gate_inp = ml->get_tensor(layers_i + ".ffn_gate_inp.weight", {n_embd, n_expert}, backend);
        for (uint32_t x = 0; x < n_expert; ++x) {
          layer.ffn_gate_exp[x] =
              ml->get_tensor(layers_i + ".ffn_gate." + std::to_string(x) + ".weight", {n_embd, n_ff}, backend);
          layer.ffn_down_exp[x] =
              ml->get_tensor(layers_i + ".ffn_down." + std::to_string(x) + ".weight", {n_ff, n_embd}, backend);
          layer.ffn_up_exp[x] =
              ml->get_tensor(layers_i + ".ffn_up." + std::to_string(x) + ".weight", {n_embd, n_ff}, backend);
        }
      }

      // In this branch vram_total tracks host-resident layers and device_total the offloaded ones.
      const size_t layer_total = layer_nbytes(layer, dense_ffn);
      if (backend == NE_BACKEND_CPU) {
        vram_total += layer_total;
      } else {
        device_total += layer_total;
      }
    }
  }
  NE_ASSERT(vram_total <= host_size);
  NE_ASSERT(device_total <= device_size);

  // populate `tensors_by_name`
  for (model_load_tensor& lt : ml->tensors_map.tensors) {
    model.tensors_by_name.emplace_back(lt.name, lt.ne_tensor);
  }

  ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : nullptr);
  if (progress_callback) {
    progress_callback(1.0f, progress_callback_user_data);
  }

  model.mapping = std::move(ml->mapping);
}

#undef MODEL_BACKEND_OFFLOAD

// Per-tensor quantization policy for LLaMA models.
class llama_quant_layer : public quant_layer_base {
 public:
  // Chooses the quantization config for one tensor.
  //
  //   layername  tensor name
  //   ne         tensor dimensions; only 2-D tensors are quantized
  //   type       not consulted by this policy
  //
  // Returns
  //   - a default-constructed quant_params_internal for embedding tensors;
  //   - for 2-D tensors whose name ends in "weight": the global config mGCfg, except that under a q2 global config
  //     the "attention.wv" and "feed_forward.w2" tensors get a symmetric q4 copy of it;
  //   - quant_params_internal{quant_bits::count} (non-quant) for everything else.
  quant_params_internal get_layer_config(std::string layername, std::vector<int64_t> ne, ne_type type) override {
    if ((layername.find("embedding") != std::string::npos) ||
        (layername == "token_embd.weight" || layername == "tok_embeddings.weight")) {
      // special layer process, can be loaded by config file
      return quant_params_internal();  // return q4_0 to cover the usage of getrow
    }

    const bool quantize = ends_with_weight(layername) && ne.size() == 2;
    if (!quantize) {
      return quant_params_internal{quant_bits::count};  // non-quant
    }

    if (mGCfg.bits == quant_bits::q2) {
      // Only these two tensors are promoted to 4 bits; attention.wq, attention.wk and output.weight were tried as
      // candidates in earlier revisions and deliberately stay on the global config.
      // NOTE(review): only the NE tensor names are matched here - confirm whether GGUF names need the same rule.
      const bool promote_to_q4 = layername.find("attention.wv") != std::string::npos ||
                                 layername.find("feed_forward.w2") != std::string::npos;
      if (promote_to_q4) {
        auto q4cfg = mGCfg;
        q4cfg.bits = quant_bits::q4;
        q4cfg.alg = quant_alg::sym;
        return q4cfg;
      }
    }

    return mGCfg;  // use global quant config
  }

 private:
  // True when `name` ends with "weight". The length is checked first: comparing rfind() against size() - 6 wraps
  // around for names shorter than the suffix and wrongly matches 5-character names.
  static bool ends_with_weight(const std::string& name) {
    static const char kSuffix[] = "weight";
    const size_t suffix_len = sizeof(kSuffix) - 1;
    return name.size() >= suffix_len && name.compare(name.size() - suffix_len, suffix_len, kSuffix) == 0;
  }
};
REGISTER_QUANT_LAYER_CLASS(llama);