Don't tell users to use a bad number of threads (#243)
The README tells people to use the command-line option "-t 8", which starts 8 threads. On systems with fewer than 8 cores this causes a significant slowdown. Remove the option from the example command lines and instead use /proc/cpuinfo on Linux to determine a sensible default.
parent 6b0df5ccf3
commit 367946c668
5 changed files with 19 additions and 11 deletions
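In short, the patch replaces a hard-coded `-t 8` with a detected default. Below is a minimal standalone sketch of that detection logic; the `default_n_threads` wrapper and the `main` driver are illustrative additions, but the counting idiom is lifted directly from the utils.cpp hunk further down.

```cpp
// Standalone sketch of the patch's thread-count detection (see the
// utils.cpp hunk below). Counts the whitespace-delimited tokens in
// /proc/cpuinfo that equal "processor" -- one per logical CPU on Linux --
// and falls back to std::thread::hardware_concurrency() elsewhere or if
// the file cannot be read.
#include <algorithm>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <thread>

static int32_t default_n_threads() {
    int32_t n_threads = 0;
#ifdef __linux__
    std::ifstream cpuinfo("/proc/cpuinfo");
    n_threads = (int32_t) std::count(std::istream_iterator<std::string>(cpuinfo),
                                     std::istream_iterator<std::string>(),
                                     std::string("processor"));
#endif
    if (n_threads == 0) {
        // hardware_concurrency() may return 0 when the count is unknown.
        n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
    }
    return n_threads;
}

int main() {
    std::cout << "default -t: " << default_n_threads() << "\n";
}
```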
@@ -34,7 +34,7 @@ else
     echo "Unknown command: $arg1"
     echo "Available commands: "
     echo " --run (-r): Run a model previously converted into ggml"
-    echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -t 8 -n 512"
+    echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
     echo " --convert (-c): Convert a llama model into ggml"
     echo " ex: \"/models/7B/\" 1"
     echo " --quantize (-q): Optimize with quantization process ggml"
README.md (+5 -5)
@@ -39,7 +39,7 @@ Supported platforms:
 Here is a typical run using LLaMA-7B:
 
 ```java
-make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
+make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
 I llama.cpp build info:
 I UNAME_S: Darwin
 I UNAME_P: arm
@@ -150,7 +150,7 @@ python3 convert-pth-to-ggml.py models/7B/ 1
 ./quantize.sh 7B
 
 # run the inference
-./main -m ./models/7B/ggml-model-q4_0.bin -t 8 -n 128
+./main -m ./models/7B/ggml-model-q4_0.bin -n 128
 ```
 
 When running the larger models, make sure you have enough disk space to store all the intermediate files.
@@ -164,7 +164,7 @@ In this mode, you can always interrupt generation by pressing Ctrl+C and enter o
 
 Here is an example few-shot interaction, invoked with the command
 ```
-./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 256 --repeat_penalty 1.0 --color -i -r "User:" \
+./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" \
 -p \
 "Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
 
@@ -218,13 +218,13 @@ docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-on
 On complete, you are ready to play!
 
 ```bash
-docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
 ```
 
 or with light image:
 
 ```bash
-docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
 ```
 
 ## Limitations
ggml.c (+0 -4)
@@ -9318,10 +9318,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 }
 
 void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
-    if (cgraph->n_threads <= 0) {
-        cgraph->n_threads = 8;
-    }
-
     const int n_threads = cgraph->n_threads;
 
     struct ggml_compute_state_shared state_shared = {
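This deletion pairs with the utils.cpp change below: gpt_params_parse now always leaves n_threads at a positive value, so the hard-coded fallback to 8 threads inside ggml_graph_compute would only mask callers that never set the field, and it is removed rather than updated.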
utils.cpp (+12 -0)
@@ -16,6 +16,18 @@
 #endif
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+    // determine sensible default number of threads.
+    // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
+#ifdef __linux__
+    std::ifstream cpuinfo("/proc/cpuinfo");
+    params.n_threads = std::count(std::istream_iterator<std::string>(cpuinfo),
+                                  std::istream_iterator<std::string>(),
+                                  std::string("processor"));
+#endif
+    if (params.n_threads == 0) {
+        params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
+    }
+
     for (int i = 1; i < argc; i++) {
         std::string arg = argv[i];
 
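The std::count idiom treats /proc/cpuinfo as a stream of whitespace-delimited tokens and counts those equal to "processor". Each logical CPU contributes a line of the form `processor : N`, so the bare `processor` token appears once per logical CPU. If the file cannot be opened the count is 0, and the guarded fallback to std::thread::hardware_concurrency() (clamped to at least 1, since it may legitimately return 0) takes over.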
utils.h (+1 -1)
@@ -14,7 +14,7 @@
 
 struct gpt_params {
     int32_t seed = -1; // RNG seed
-    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    int32_t n_threads;
     int32_t n_predict = 128; // new tokens to predict
     int32_t repeat_last_n = 64; // last n tokens to penalize
     int32_t n_ctx = 512; // context size
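With the in-class initializer gone, `n_threads` is meaningful only after `gpt_params_parse` runs. A hypothetical caller is sketched below (the surrounding code is illustrative, not part of the patch); zero-initializing the struct keeps the parser's `params.n_threads == 0` check well-defined on non-Linux builds, where the `#ifdef __linux__` block never assigns the field.

```cpp
// Hypothetical caller (illustrative). Zero-initialization gives the
// parser's `params.n_threads == 0` fallback a defined value to test on
// platforms where the /proc/cpuinfo branch is compiled out.
gpt_params params = {};
if (!gpt_params_parse(argc, argv, params)) {
    return 1;
}
fprintf(stderr, "using %d threads\n", params.n_threads);
```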