// llama.cpp/examples/main/main.cpp

#include "common.h"
#include "llama.h"
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#include <signal.h>
#endif
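
// console state and interaction flag shared with the SIGINT handler below:
// the first Ctrl+C flips is_interacting so control returns to the user at the next
// opportunity; a further Ctrl+C while already interacting exits with status 130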
static console_state con_st;
static bool is_interacting = false;
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
void sigint_handler(int signo) {
    set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
    printf("\n"); // this also force flushes stdout.
    if (signo == SIGINT) {
        if (!is_interacting) {
            is_interacting = true;
        } else {
            _exit(130);
        }
    }
}
#endif
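
// Example invocation (a sketch -- flag names follow the common.cpp of this vintage and
// should be checked against ./main --help for your build):
//   ./main -m models/llama-7B/ggml-model.bin -p "Once upon a time" -n 128
//   ./main -m models/llama-7B/ggml-model.bin -i -r "User:" --color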
int main(int argc, char ** argv) {
    gpt_params params;
    params.model = "models/llama-7B/ggml-model.bin";

    if (gpt_params_parse(argc, argv, params) == false) {
        return 1;
    }

    // save choice to use color for later
    // (note for later: this is a slightly awkward choice)
    con_st.use_color = params.use_color;

#if defined (_WIN32)
    win32_console_init(params.use_color);
#endif

    if (params.perplexity) {
        printf("\n************\n");
        printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
        printf("************\n\n");

        return 0;
    }

    if (params.embedding) {
        printf("\n************\n");
        printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
        printf("************\n\n");

        return 0;
    }

    if (params.n_ctx > 2048) {
        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
                " expect poor results\n", __func__, params.n_ctx);
    }

    if (params.seed <= 0) {
        params.seed = time(NULL);
    }

    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);

    std::mt19937 rng(params.seed);
    if (params.random_prompt) {
        params.prompt = gpt_random_prompt(rng);
    }

    // params.prompt = R"(// this function checks if the number n is prime
    //bool is_prime(int n) {)";

    llama_context * ctx;

    // load the model
    {
        auto lparams = llama_context_default_params();

        lparams.n_ctx     = params.n_ctx;
        lparams.n_parts   = params.n_parts;
        lparams.seed      = params.seed;
        lparams.f16_kv    = params.memory_f16;
        lparams.use_mlock = params.use_mlock;

        ctx = llama_init_from_file(params.model.c_str(), lparams);

        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
            return 1;
        }
    }
    // print system information
    {
        fprintf(stderr, "\n");
        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
    }

    // determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters
    // uncomment the "used_mem" line in llama.cpp to see the results
    if (params.mem_test) {
        {
            const std::vector<llama_token> tmp(params.n_batch, 0);
            llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
        }

        {
            const std::vector<llama_token> tmp = { 0, };
            llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads);
        }

        llama_print_timings(ctx);
        llama_free(ctx);

        return 0;
    }

    // Add a space in front of the first character to match OG llama tokenizer behavior
    params.prompt.insert(0, 1, ' ');

    // tokenize the prompt
    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);

    const int n_ctx = llama_n_ctx(ctx);

    if ((int) embd_inp.size() > n_ctx - 4) {
        fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
        return 1;
    }

    // number of tokens to keep when resetting context
    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) {
        params.n_keep = (int) embd_inp.size();
    }
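    // (n_keep is the number of prompt tokens pinned at the start of the context when it
    //  overflows; a negative or oversized value, and instruct mode, keep the whole prompt)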
    // prefix & suffix for instruct mode
    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);
    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);

    // in instruct mode, we inject a prefix and a suffix to each input by the user
    if (params.instruct) {
        params.interactive_start = true;
        params.antiprompt.push_back("### Instruction:\n\n");
    }

    // enable interactive mode if reverse prompt or interactive start is specified
    if (params.antiprompt.size() != 0 || params.interactive_start) {
        params.interactive = true;
    }
    // determine newline token
    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
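    // (used below to swap the end-of-text token for a newline in interactive, non-instruct
    //  mode, so the session can continue instead of generation stopping)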
    if (params.verbose_prompt) {
        fprintf(stderr, "\n");
        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
        for (int i = 0; i < (int) embd_inp.size(); i++) {
            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
        }
        if (params.n_keep > 0) {
            fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
            for (int i = 0; i < params.n_keep; i++) {
                fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]));
            }
            fprintf(stderr, "'\n");
        }
        fprintf(stderr, "\n");
    }

    if (params.interactive) {
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
        struct sigaction sigint_action;
        sigint_action.sa_handler = sigint_handler;
        sigemptyset (&sigint_action.sa_mask);
        sigint_action.sa_flags = 0;
        sigaction(SIGINT, &sigint_action, NULL);
#elif defined (_WIN32)
        signal(SIGINT, sigint_handler);
#endif

        fprintf(stderr, "%s: interactive mode on.\n", __func__);

        if (params.antiprompt.size()) {
            for (auto antiprompt : params.antiprompt) {
                fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str());
            }
        }

        if (!params.input_prefix.empty()) {
            fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
        }
    }
    fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n",
            params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
    fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
    fprintf(stderr, "\n\n");

    // TODO: replace with ring-buffer
    std::vector<llama_token> last_n_tokens(n_ctx);
    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
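    // last_n_tokens is a sliding window of the most recent n_ctx tokens; it feeds the
    // repetition penalty and is also used to rebuild recent output for the reverse-prompt check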
    if (params.interactive) {
        fprintf(stderr, "== Running in interactive mode. ==\n"
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
               " - Press Ctrl+C to interject at any time.\n"
#endif
               " - Press Return to return control to LLaMa.\n"
               " - If you want to submit another line, end your input in '\\'.\n\n");
        is_interacting = params.interactive_start;
    }

    bool is_antiprompt = false;
    bool input_noecho  = false;

    int n_past     = 0;
    int n_remain   = params.n_predict;
    int n_consumed = 0;

    // the first thing we will do is to output the prompt, so set color accordingly
    set_console_color(con_st, CONSOLE_COLOR_PROMPT);

    std::vector<llama_token> embd;
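    // embd holds the tokens to be evaluated this iteration: either the next slice of the
    // prompt / user input (up to n_batch tokens) or the single token that was just sampled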
    while (n_remain != 0 || params.interactive) {
        // predict
        if (embd.size() > 0) {
            // infinite text generation via context swapping
            // if we run out of context:
            // - take the n_keep first tokens from the original prompt (via n_past)
            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch
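            // (illustrative numbers only: with n_ctx = 2048, n_keep = 64 and a full context,
            //  n_left = 1984, so roughly the last 992 tokens are re-evaluated after the kept prefix)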
            if (n_past + (int) embd.size() > n_ctx) {
                const int n_left = n_past - params.n_keep;

                n_past = params.n_keep;

                // insert n_left/2 tokens at the start of embd from last_n_tokens
                embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());

                //printf("\n---\n");
                //printf("resetting: '");
                //for (int i = 0; i < (int) embd.size(); i++) {
                //    printf("%s", llama_token_to_str(ctx, embd[i]));
                //}
                //printf("'\n");
                //printf("\n---\n");
            }

            if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
                fprintf(stderr, "%s : failed to eval\n", __func__);
                return 1;
            }
        }

        n_past += embd.size();
        embd.clear();

        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
            // out of user input, sample next token
            const int32_t top_k          = params.top_k;
            const float   top_p          = params.top_p;
            const float   temp           = params.temp;
            const float   repeat_penalty = params.repeat_penalty;

            llama_token id = 0;

            {
                auto logits = llama_get_logits(ctx);

                if (params.ignore_eos) {
                    logits[llama_token_eos()] = 0;
                }

                id = llama_sample_top_p_top_k(ctx,
                        last_n_tokens.data() + n_ctx - params.repeat_last_n,
                        params.repeat_last_n, top_k, top_p, temp, repeat_penalty);

                last_n_tokens.erase(last_n_tokens.begin());
                last_n_tokens.push_back(id);
            }
            // replace end of text token with newline token when in interactive mode
            if (id == llama_token_eos() && params.interactive && !params.instruct) {
                id = llama_token_newline.front();
                if (params.antiprompt.size() != 0) {
                    // tokenize and inject first reverse prompt
                    const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
                    embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
                }
            }

            // add it to the context
            embd.push_back(id);

            // echo this to console
            input_noecho = false;

            // decrement remaining sampling budget
            --n_remain;
        } else {
            // some user input remains from prompt or interaction, forward it to processing
            while ((int) embd_inp.size() > n_consumed) {
                embd.push_back(embd_inp[n_consumed]);
                last_n_tokens.erase(last_n_tokens.begin());
                last_n_tokens.push_back(embd_inp[n_consumed]);
                ++n_consumed;
                if ((int) embd.size() >= params.n_batch) {
                    break;
                }
            }
        }

        // display text
        if (!input_noecho) {
            for (auto id : embd) {
                printf("%s", llama_token_to_str(ctx, id));
            }
            fflush(stdout);
        }

        // reset color to default if there is no pending user input
        if (!input_noecho && (int) embd_inp.size() == n_consumed) {
            set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
        }

        // in interactive mode, and not currently processing queued inputs;
        // check if we should prompt the user for more
        if (params.interactive && (int) embd_inp.size() <= n_consumed) {
            // check for reverse prompt
            if (params.antiprompt.size()) {
                std::string last_output;
                for (auto id : last_n_tokens) {
                    last_output += llama_token_to_str(ctx, id);
                }

                is_antiprompt = false;
                // Check if each of the reverse prompts appears at the end of the output.
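                // (find(s, pos, n) with pos = last_output.length() - antiprompt.length() can only
                //  match when the reverse prompt sits at the very end of the recent output)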
                for (std::string & antiprompt : params.antiprompt) {
                    if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
                        is_interacting = true;
                        is_antiprompt = true;
                        set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
                        fflush(stdout);
                        break;
                    }
                }
            }
            if (n_past > 0 && is_interacting) {
                // potentially set color to indicate we are taking user input
                set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);

#if defined (_WIN32)
                // Windows: must reactivate sigint handler after each signal
                signal(SIGINT, sigint_handler);
#endif

                if (params.instruct) {
                    printf("\n> ");
                }

                std::string buffer;
                if (!params.input_prefix.empty()) {
                    buffer += params.input_prefix;
                    printf("%s", buffer.c_str());
                }

                std::string line;
                bool another_line = true;
                do {
                    if (!std::getline(std::cin, line)) {
                        // input stream is bad or EOF received
                        return 0;
                    }
                    if (line.empty() || line.back() != '\\') {
                        another_line = false;
                    } else {
                        line.pop_back(); // Remove the continuation character
                    }
                    buffer += line + '\n'; // Append the line to the result
                } while (another_line);

                // done taking input, reset color
                set_console_color(con_st, CONSOLE_COLOR_DEFAULT);

                // Add tokens to embd only if the input buffer is non-empty
                // Entering an empty line lets the user pass control back
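                // (every line gets '\n' appended, so an empty input without an input prefix has length 1)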
                if (buffer.length() > 1) {
                    // instruct mode: insert instruction prefix
                    if (params.instruct && !is_antiprompt) {
                        n_consumed = embd_inp.size();
                        embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
                    }

                    auto line_inp = ::llama_tokenize(ctx, buffer, false);
                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());

                    // instruct mode: insert response suffix
                    if (params.instruct) {
                        embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
                    }

                    n_remain -= line_inp.size();
                }

                input_noecho = true; // do not echo this again
            }

            if (n_past > 0) {
                is_interacting = false;
            }
        }
        // end of text token
        if (embd.back() == llama_token_eos()) {
            if (params.instruct) {
                is_interacting = true;
            } else {
                fprintf(stderr, " [end of text]\n");
                break;
            }
        }

        // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
        if (params.interactive && n_remain <= 0 && params.n_predict != -1) {
            n_remain = params.n_predict;
            is_interacting = true;
        }
    }
#if defined (_WIN32)
    signal(SIGINT, SIG_DFL);
#endif

    llama_print_timings(ctx);
    llama_free(ctx);

    set_console_color(con_st, CONSOLE_COLOR_DEFAULT);

    return 0;
}