#include "llm.h"

// https://github.com/cmp-nct/ggllm.cpp/blob/master/examples/falcon/falcon_main.cpp

#include "ggml/libfalcon.cpp"

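// Adapter that exposes the ggllm.cpp Falcon implementation through the
// generic LLM interface declared in llm.h. libfalcon.cpp is included
// directly above so the falcon_* functions are compiled into this
// translation unit.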
class falcon_llm : public LLM {
 public:
  virtual ~falcon_llm() {
    if (ctx_ != nullptr) {
      falcon_free(ctx_);
    }
  }

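  // Tokenizes `text` without prepending a BOS token.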
  std::vector<gpt_vocab::id> Tokenize(const std::string &text) const override {
    return falcon_tokenize(ctx_->vocab, text, /*bos=*/false);
  }

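  // Maps a token id back to its text; out-of-range ids yield an empty string.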
  const std::string &Detokenize(const gpt_vocab::id id) const override {
    if (id >= falcon_n_vocab(ctx_)) {
      return kEmptyString;
    }
    return ctx_->vocab.id_to_token[id].tok;
  }

  bool IsEosToken(const gpt_vocab::id token) const override {
    return token == EosToken();
  }

  gpt_vocab::id EosToken() const override { return falcon_token_eos(); }

  int VocabSize() const override { return falcon_n_vocab(ctx_); }

  std::vector<float> &Logits() override { return ctx_->logits; }

  const std::vector<float> &Embeddings() const override {
    return ctx_->embedding;
  }

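  // Samples the next token: seeds the RNG, penalizes recently generated
  // tokens, filters candidates with top-k and top-p, applies temperature,
  // and draws a token from the remaining distribution. A negative
  // `last_n_tokens` means "penalize over the full context"; a negative
  // `seed` means "seed from the current time".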
  gpt_vocab::id Sample(const int top_k, const float top_p,
                       const float temperature, const float repetition_penalty,
                       int last_n_tokens, int seed) const override {
    if (last_n_tokens < 0) {
      last_n_tokens = ContextLength();
    }
    if (seed < 0) {
      seed = time(nullptr);
    }
    ctx_->rng.seed(seed);

    const float *logits = falcon_get_logits(ctx_);
    const int n_vocab = falcon_n_vocab(ctx_);

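    // Build one candidate entry per vocabulary token from the raw logits.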
    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
      candidates.emplace_back(
          llama_token_data{token_id, logits[token_id], 0.0f});
    }

    llama_token_data_array candidates_p = {
        candidates.data(),
        candidates.size(),
        false,
    };

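    // Apply the repetition penalty over the recent-token window. The lookup
    // is skipped entirely when the penalty is a no-op (1.0).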
    {
      std::unordered_set<gpt_vocab::id> recent_tokens_set;
      if (repetition_penalty != 1.0f) {
        recent_tokens_set = previous_tokens_.GetRecent(last_n_tokens);
      }
      std::vector<gpt_vocab::id> recent_tokens(recent_tokens_set.begin(),
                                               recent_tokens_set.end());
      falcon_sample_repetition_penalty(
          ctx_, &candidates_p, recent_tokens.data(), recent_tokens.size(),
          repetition_penalty);
    }

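    // Standard filtering chain: top-k, then top-p (the trailing 1 keeps at
    // least one candidate), then temperature scaling, then sample.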
    falcon_sample_top_k(ctx_, &candidates_p, top_k, 1);
    falcon_sample_top_p(ctx_, &candidates_p, top_p, 1);
    falcon_sample_temperature(ctx_, &candidates_p, temperature);
    return falcon_sample_token(ctx_, &candidates_p);
  }

 protected:
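  // Loads the model file with embeddings enabled, applying the requested
  // context length (when positive) and number of GPU-offloaded layers.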
  bool Load(const std::string &filename, const int context_length,
            const int gpu_layers) override {
    falcon_context_params params = falcon_context_default_params();
    params.embedding = true;
    if (context_length > 0) {
      params.n_ctx = context_length;
    }
    params.n_gpu_layers = gpu_layers;

    ctx_ = falcon_init_from_file(filename.c_str(), params);
    if (ctx_ == nullptr) {
      return false;
    }
    n_ctx_ = falcon_n_ctx(ctx_);
    return true;
  }

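  // Runs a forward pass over `tokens` starting at position `n_past`;
  // returns true on success.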
  bool Eval(const std::vector<gpt_vocab::id> &tokens, const int threads,
            const int n_past) override {
    const int status = falcon_eval(ctx_, tokens.data(), tokens.size(), n_past,
                                   threads, /*debug_timings=*/0);
    return status == 0;
  }

 private:
  falcon_context *ctx_ = nullptr;
};