mirror of https://git.adityakumar.xyz/llama.cpp.git
synced 2024-11-09 23:29:44 +00:00
ggml : various fixes (#1450)
- `ggml_rope()`
- `ggml_diag_mask_inf()` multi-threaded
- compatibility with scratch buffers
This commit is contained in:
parent 60f8c361ca
commit 13c351ad72
2 changed files with 263 additions and 118 deletions
ggml.c (303)
@@ -3923,6 +3923,20 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
     return result;
 }
 
+// IMPORTANT:
+// when creating "opt" tensors, always save and load the scratch buffer
+// this is an error prone process, but it is necessary to support inplace
+// operators when using scratch buffers
+// TODO: implement a better way
+void ggml_scratch_save(struct ggml_context * ctx) {
+    ctx->scratch_save = ctx->scratch;
+    ctx->scratch.data = NULL;
+}
+
+void ggml_scratch_load(struct ggml_context * ctx) {
+    ctx->scratch = ctx->scratch_save;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 struct ggml_tensor * ggml_new_tensor_impl(
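Note (not part of the diff): a minimal sketch of the pattern the IMPORTANT comment above describes. An op constructor saves the scratch buffer, allocates its small parameter ("opt") tensor from the context's main buffer (scratch.data is NULL while saved), then restores the scratch buffer before building the result tensor's metadata. The ggml_example_op name and its single n_past parameter are hypothetical; the calls themselves follow this commit.

    // hypothetical constructor, sketch only
    struct ggml_tensor * ggml_example_op(struct ggml_context * ctx, struct ggml_tensor * a, int n_past) {
        struct ggml_tensor * result = ggml_dup_tensor(ctx, a);

        ggml_scratch_save(ctx);   // the parameter tensor must not live in scratch memory

        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
        ((int32_t *) b->data)[0] = n_past;

        ggml_scratch_load(ctx);   // restore the caller's scratch buffer

        result->op   = GGML_OP_NONE; // a real op sets its own GGML_OP_* here
        result->src0 = a;
        result->src1 = b;

        return result;
    }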
@@ -4094,12 +4108,11 @@ struct ggml_tensor * ggml_new_tensor_4d(
 }
 
 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
-    ctx->scratch_save = ctx->scratch;
-    ctx->scratch.data = NULL;
+    ggml_scratch_save(ctx);
 
     struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
 
-    ctx->scratch = ctx->scratch_save;
+    ggml_scratch_load(ctx);
 
     ggml_set_i32(result, value);
 
@@ -4107,12 +4120,11 @@ struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
 }
 
 struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
-    ctx->scratch_save = ctx->scratch;
-    ctx->scratch.data = NULL;
+    ggml_scratch_save(ctx);
 
     struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
 
-    ctx->scratch = ctx->scratch_save;
+    ggml_scratch_load(ctx);
 
     ggml_set_f32(result, value);
 
@@ -4541,13 +4553,19 @@ struct ggml_tensor * ggml_acc_impl(
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
+    ggml_scratch_save(ctx);
+
     struct ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 5);
 
     ((int32_t *) c->data)[0] = nb1;
     ((int32_t *) c->data)[1] = nb2;
     ((int32_t *) c->data)[2] = nb3;
     ((int32_t *) c->data)[3] = offset;
     ((int32_t *) c->data)[4] = inplace ? 1 : 0;
 
+    ggml_scratch_load(ctx);
+
     result->op   = GGML_OP_ACC;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
@@ -5344,13 +5362,19 @@ struct ggml_tensor * ggml_set_impl(
 
     // make a view of the destination
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
+    ggml_scratch_save(ctx);
+
     struct ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 5);
 
     (( int32_t * ) c->data)[0] = nb1;
     (( int32_t * ) c->data)[1] = nb2;
     (( int32_t * ) c->data)[2] = nb3;
     (( int32_t * ) c->data)[3] = offset;
     (( int32_t * ) c->data)[4] = inplace ? 1 : 0;
 
+    ggml_scratch_load(ctx);
+
     result->op   = GGML_OP_SET;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
@@ -5954,10 +5978,16 @@ struct ggml_tensor * ggml_diag_mask_inf_impl(
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
+    ggml_scratch_save(ctx);
+
     struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
 
     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = inplace ? 1 : 0;
 
+    ggml_scratch_load(ctx);
+
     result->op   = GGML_OP_DIAG_MASK_INF;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
@@ -5995,11 +6025,17 @@ struct ggml_tensor * ggml_diag_mask_zero_impl(
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
+    ggml_scratch_save(ctx);
+
     struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
     ggml_set_name(b, "n_past, inplace");
 
     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = inplace ? 1 : 0;
 
+    ggml_scratch_load(ctx);
+
     result->op   = GGML_OP_DIAG_MASK_ZERO;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
@@ -6074,11 +6110,16 @@ struct ggml_tensor * ggml_rope_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
+    ggml_scratch_save(ctx);
+
     struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
 
     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = n_dims;
     ((int32_t *) b->data)[2] = mode;
 
+    ggml_scratch_load(ctx);
+
     result->op   = GGML_OP_ROPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
@@ -6123,11 +6164,16 @@ struct ggml_tensor * ggml_rope_back(
 
     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
 
+    ggml_scratch_save(ctx);
+
     struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
+    ggml_set_name(b, "n_past, n_dims, mode");
 
     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = n_dims;
     ((int32_t *) b->data)[2] = mode;
-    ggml_set_name(b, "n_past, n_dims, mode");
 
+    ggml_scratch_load(ctx);
+
     result->op   = GGML_OP_ROPE_BACK;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6156,10 +6202,15 @@ struct ggml_tensor * ggml_alibi(
 
     //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
 
+    ggml_scratch_save(ctx);
+
     struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
 
     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = n_head;
 
+    ggml_scratch_load(ctx);
+
     result->op   = GGML_OP_ALIBI;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
@@ -10450,19 +10501,33 @@ static void ggml_compute_forward_diag_mask_f32(
     assert(src1->type == GGML_TYPE_I32);
     assert(ggml_nelements(src1) == 2);
 
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+    const int  n_past  = ((int32_t *) src1->data)[0];
+    const bool inplace = (bool)((int32_t *) src1->data)[1];
+
+    if (params->type == GGML_TASK_INIT) {
+        // TODO: this hack is not good, need a better way to handle this
+        if (!inplace) {
+            // use the init task to copy src -> dst
+            struct ggml_compute_params params_cpy = *params;
+
+            params_cpy.ith  = 0;
+            params_cpy.nth  = 1;
+            params_cpy.type = GGML_TASK_COMPUTE;
+
+            ggml_compute_forward_dup_same_cont(&params_cpy, src0, dst);
+        }
+
+        return;
+    }
+
+    if (params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
     const int ith = params->ith;
     const int nth = params->nth;
 
-    const int n_past = ((int32_t *) src1->data)[0];
-    const bool inplace = (bool)((int32_t *) src1->data)[1];
-
-    if (!inplace) {
-        ggml_compute_forward_dup_same_cont(params, src0, dst);
-    }
+    assert(n_past >= 0);
 
     // TODO: handle transposed/permuted matrices
 
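Note (not part of the diff): with the src -> dst copy moved into the INIT task above, the COMPUTE task only touches the rows owned by each thread. A minimal sketch of the row-partitioning convention used here and in the rope kernels below, assuming nr rows split across nth threads (thread index ith); the function name is hypothetical:

    // sketch only: each thread processes rows [ir0, ir1)
    static void process_rows(const int nr, const int ith, const int nth) {
        const int dr  = (nr + nth - 1)/nth;               // rows per thread, rounded up
        const int ir0 = dr*ith;                           // first row owned by this thread
        const int ir1 = (ir0 + dr < nr) ? ir0 + dr : nr;  // one past the last owned row

        for (int ir = ir0; ir < ir1; ++ir) {
            // mask (or rotate) row ir only; other rows belong to other threads
        }
    }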
@@ -10626,6 +10691,8 @@ static void ggml_compute_forward_alibi_f32(
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_head = ((int32_t *) src1->data)[1];
 
+    assert(n_past >= 0);
+
     const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
     const int ne1 = src0->ne[1]; // seq_len_without_past
     //const int ne2 = src0->ne[2]; // n_head -> this is k
@@ -10687,6 +10754,8 @@ static void ggml_compute_forward_alibi_f16(
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_head = ((int32_t *) src1->data)[1];
 
+    assert(n_past >= 0);
+
     const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
     const int ne1 = src0->ne[1]; // seq_len_without_past
     //const int ne2 = src0->ne[2]; // n_head -> this is k
@@ -10780,28 +10849,34 @@ static void ggml_compute_forward_rope_f32(
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode   = ((int32_t *) src1->data)[2];
 
-    //const int64_t ne0 = src0->ne[0];
-    const int64_t ne1 = src0->ne[1];
-    const int64_t ne2 = src0->ne[2];
-    const int64_t ne3 = src0->ne[3];
+    assert(n_past >= 0);
 
-    const int nb0 = src0->nb[0];
-    const int nb1 = src0->nb[1];
-    const int nb2 = src0->nb[2];
-    const int nb3 = src0->nb[3];
+    const size_t nb00 = src0->nb[0];
+    const size_t nb01 = src0->nb[1];
+    const size_t nb02 = src0->nb[2];
+    const size_t nb03 = src0->nb[3];
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
+
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+    const size_t nb2 = dst->nb[2];
+    const size_t nb3 = dst->nb[3];
 
     //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
     //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
 
-    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
 
     const int ith = params->ith;
     const int nth = params->nth;
 
-    const int nr = ggml_nrows(src0);
-    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(dst);
 
-    GGML_ASSERT(n_dims <= nc);
+    GGML_ASSERT(n_dims <= ne0);
     GGML_ASSERT(n_dims % 2 == 0);
 
     // rows per thread
@@ -10820,21 +10895,21 @@ static void ggml_compute_forward_rope_f32(
 
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
-            const int p = ((mode & 1) == 0 ? n_past + i2 : i2);
+            const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
             for (int64_t i1 = 0; i1 < ne1; i1++) {
                 if (ir++ < ir0) continue;
                 if (ir > ir1) break;
 
                 float theta = (float)p;
 
-                for (int i0 = 0; i0 < n_dims; i0 += 2) {
+                if (!is_neox) {
+                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                         const float cos_theta = cosf(theta);
                         const float sin_theta = sinf(theta);
 
                         theta *= theta_scale;
 
-                    if (!is_neox) {
-                        const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                         float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
                         const float x0 = src[0];
@@ -10842,9 +10917,21 @@ static void ggml_compute_forward_rope_f32(
 
                         dst_data[0] = x0*cos_theta - x1*sin_theta;
                         dst_data[1] = x0*sin_theta + x1*cos_theta;
+                    }
                 } else {
-                        const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0);
-                        float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0);
+                    // TODO: this is probably wrong, but I can't figure it out ..
+                    // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
+                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
+                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
+                            const float cos_theta = cosf(theta);
+                            const float sin_theta = sinf(theta);
+
+                            theta *= theta_scale;
+
+                            const int64_t i0 = ib*n_dims + ic/2;
+
+                            const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                            float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
                         const float x0 = src[0];
                         const float x1 = src[n_dims/2];
@@ -10856,6 +10943,7 @@ static void ggml_compute_forward_rope_f32(
                 }
             }
         }
+        }
     }
 
 static void ggml_compute_forward_rope_f16(
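Note (not part of the diff): the forward rope kernels above rotate each pair of values by an angle theta that starts at the token position p and is multiplied by theta_scale after every pair. A compact sketch of the per-pair math for the non-neox layout (adjacent pairs); x, y and rope_rotate_row are hypothetical stand-ins for the src0/dst row pointers, and theta_scale is assumed to be powf(10000.0, -2.0/n_dims) as computed earlier in the real functions:

    #include <math.h>

    // sketch only: rotate adjacent pairs of one row
    static void rope_rotate_row(const float * x, float * y, const int n_dims, const float p) {
        const float theta_scale = powf(10000.0f, -2.0f/n_dims); // assumed, matches the kernels' setup
        float theta = p;

        for (int i0 = 0; i0 < n_dims; i0 += 2) {
            const float cos_theta = cosf(theta);
            const float sin_theta = sinf(theta);

            theta *= theta_scale;

            const float x0 = x[i0 + 0];
            const float x1 = x[i0 + 1];

            y[i0 + 0] = x0*cos_theta - x1*sin_theta;
            y[i0 + 1] = x0*sin_theta + x1*cos_theta;
        }
    }

In the neox branch added above, the rotated pair is instead (i0, i0 + n_dims/2) within each block of n_dims columns, which is why the column index is rebuilt as i0 = ib*n_dims + ic/2.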
@@ -10874,15 +10962,22 @@ static void ggml_compute_forward_rope_f16(
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode   = ((int32_t *) src1->data)[2];
 
-    //const int64_t ne0 = src0->ne[0];
-    const int64_t ne1 = src0->ne[1];
-    const int64_t ne2 = src0->ne[2];
-    const int64_t ne3 = src0->ne[3];
+    assert(n_past >= 0);
 
-    const int nb0 = src0->nb[0];
-    const int nb1 = src0->nb[1];
-    const int nb2 = src0->nb[2];
-    const int nb3 = src0->nb[3];
+    const size_t nb00 = src0->nb[0];
+    const size_t nb01 = src0->nb[1];
+    const size_t nb02 = src0->nb[2];
+    const size_t nb03 = src0->nb[3];
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
+
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+    const size_t nb2 = dst->nb[2];
+    const size_t nb3 = dst->nb[3];
 
     //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
     //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
@@ -10892,10 +10987,9 @@ static void ggml_compute_forward_rope_f16(
     const int ith = params->ith;
     const int nth = params->nth;
 
-    const int nr = ggml_nrows(src0);
-    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(dst);
 
-    GGML_ASSERT(n_dims <= nc);
+    GGML_ASSERT(n_dims <= ne0);
     GGML_ASSERT(n_dims % 2 == 0);
 
     // rows per thread
@@ -10914,21 +11008,21 @@ static void ggml_compute_forward_rope_f16(
 
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
-            const int p = ((mode & 1) == 0 ? n_past + i2 : i2);
+            const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
             for (int64_t i1 = 0; i1 < ne1; i1++) {
                 if (ir++ < ir0) continue;
                 if (ir > ir1) break;
 
                 float theta = (float)p;
 
-                for (int i0 = 0; i0 < n_dims; i0 += 2) {
+                if (!is_neox) {
+                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                         const float cos_theta = cosf(theta);
                         const float sin_theta = sinf(theta);
 
                         theta *= theta_scale;
 
-                    if (!is_neox) {
-                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                         ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
                         const float x0 = GGML_FP16_TO_FP32(src[0]);
@@ -10936,9 +11030,21 @@ static void ggml_compute_forward_rope_f16(
 
                         dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
                         dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+                    }
                 } else {
-                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0);
-                        ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0);
+                    // TODO: this is probably wrong, but I can't figure it out ..
+                    // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
+                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
+                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
+                            const float cos_theta = cosf(theta);
+                            const float sin_theta = sinf(theta);
+
+                            theta *= theta_scale;
+
+                            const int64_t i0 = ib*n_dims + ic/2;
+
+                            const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                            ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
                         const float x0 = GGML_FP16_TO_FP32(src[0]);
                         const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
@@ -10950,6 +11056,7 @@ static void ggml_compute_forward_rope_f16(
                 }
             }
         }
+        }
     }
 
 static void ggml_compute_forward_rope(
@@ -10995,15 +11102,23 @@ static void ggml_compute_forward_rope_back_f32(
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode   = ((int32_t *) src1->data)[2];
 
-    //const int64_t ne0 = src0->ne[0];
-    const int64_t ne1 = src0->ne[1];
-    const int64_t ne2 = src0->ne[2];
-    const int64_t ne3 = src0->ne[3];
+    assert(n_past >= 0);
 
-    const int nb0 = src0->nb[0];
-    const int nb1 = src0->nb[1];
-    const int nb2 = src0->nb[2];
-    const int nb3 = src0->nb[3];
+    const size_t nb00 = src0->nb[0];
+    const size_t nb01 = src0->nb[1];
+    const size_t nb02 = src0->nb[2];
+    const size_t nb03 = src0->nb[3];
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
+
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+    const size_t nb2 = dst->nb[2];
+    const size_t nb3 = dst->nb[3];
 
     //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
     //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
@@ -11013,7 +11128,7 @@ static void ggml_compute_forward_rope_back_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
-    const int nr = ggml_nrows(src0);
+    const int nr = ggml_nrows(dst);
 
     // rows per thread
     const int dr = (nr + nth - 1)/nth;
@@ -11031,21 +11146,21 @@ static void ggml_compute_forward_rope_back_f32(
 
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
-            const int p = ((mode & 1) == 0 ? n_past + i2 : i2);
+            const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
             for (int64_t i1 = 0; i1 < ne1; i1++) {
                 if (ir++ < ir0) continue;
                 if (ir > ir1) break;
 
                 float theta = (float)p;
 
-                for (int i0 = 0; i0 < n_dims; i0 += 2) {
+                if (!is_neox) {
+                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                         const float cos_theta = cosf(theta);
                         const float sin_theta = sinf(theta);
 
                         theta *= theta_scale;
 
-                    if (!is_neox) {
-                        const float * const dy = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+                        const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                         float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
                         const float dy0 = dy[0];
@@ -11053,9 +11168,19 @@ static void ggml_compute_forward_rope_back_f32(
 
                         dx[0] = dy0*cos_theta + dy1*sin_theta;
                         dx[1] = - dy0*sin_theta + dy1*cos_theta;
+                    }
                 } else {
-                        const float * const dy = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0);
-                        float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0);
+                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
+                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
+                            const float cos_theta = cosf(theta);
+                            const float sin_theta = sinf(theta);
+
+                            theta *= theta_scale;
+
+                            const int64_t i0 = ib*n_dims + ic/2;
+
+                            const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                            float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
                         const float dy0 = dy[0];
                         const float dy1 = dy[n_dims/2];
@@ -11067,6 +11192,7 @@ static void ggml_compute_forward_rope_back_f32(
                 }
             }
         }
+        }
     }
 
 static void ggml_compute_forward_rope_back_f16(
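Note (not part of the diff): the backward rope kernels invert the forward rotation by rotating the incoming gradient by -theta, which is why the signs on sin_theta are flipped relative to the forward kernels. A minimal sketch for a single pair; the helper name is hypothetical:

    #include <math.h>

    // sketch only: forward rotates (x0, x1) by theta, backward rotates (dy0, dy1) by -theta
    static void rope_pair_backward(const float theta, const float dy0, const float dy1,
                                   float * dx0, float * dx1) {
        const float cos_theta = cosf(theta);
        const float sin_theta = sinf(theta);

        // forward:  y0 = x0*cos_theta - x1*sin_theta
        //           y1 = x0*sin_theta + x1*cos_theta
        *dx0 =  dy0*cos_theta + dy1*sin_theta;
        *dx1 = -dy0*sin_theta + dy1*cos_theta;
    }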
@@ -11089,15 +11215,23 @@ static void ggml_compute_forward_rope_back_f16(
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode   = ((int32_t *) src1->data)[2];
 
-    //const int64_t ne0 = src0->ne[0];
-    const int64_t ne1 = src0->ne[1];
-    const int64_t ne2 = src0->ne[2];
-    const int64_t ne3 = src0->ne[3];
+    assert(n_past >= 0);
 
-    const int nb0 = src0->nb[0];
-    const int nb1 = src0->nb[1];
-    const int nb2 = src0->nb[2];
-    const int nb3 = src0->nb[3];
+    const size_t nb00 = src0->nb[0];
+    const size_t nb01 = src0->nb[1];
+    const size_t nb02 = src0->nb[2];
+    const size_t nb03 = src0->nb[3];
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
+
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+    const size_t nb2 = dst->nb[2];
+    const size_t nb3 = dst->nb[3];
 
     //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
     //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
@@ -11107,7 +11241,7 @@ static void ggml_compute_forward_rope_back_f16(
     const int ith = params->ith;
     const int nth = params->nth;
 
-    const int nr = ggml_nrows(src0);
+    const int nr = ggml_nrows(dst);
 
     // rows per thread
     const int dr = (nr + nth - 1)/nth;
@@ -11125,21 +11259,21 @@ static void ggml_compute_forward_rope_back_f16(
 
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
-            const int p = ((mode & 1) == 0 ? n_past + i2 : i2);
+            const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
             for (int64_t i1 = 0; i1 < ne1; i1++) {
                 if (ir++ < ir0) continue;
                 if (ir > ir1) break;
 
                 float theta = (float)p;
 
-                for (int i0 = 0; i0 < n_dims; i0 += 2) {
+                if (!is_neox) {
+                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                         const float cos_theta = cosf(theta);
                         const float sin_theta = sinf(theta);
 
                         theta *= theta_scale;
 
-                    if (!is_neox) {
-                        const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+                        const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                         ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
                         const float dy0 = GGML_FP16_TO_FP32(dy[0]);
@@ -11147,9 +11281,19 @@ static void ggml_compute_forward_rope_back_f16(
 
                         dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
                         dx[1] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
+                    }
                 } else {
-                        const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0);
-                        ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0);
+                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
+                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
+                            const float cos_theta = cosf(theta);
+                            const float sin_theta = sinf(theta);
+
+                            theta *= theta_scale;
+
+                            const int64_t i0 = ib*n_dims + ic/2;
+
+                            const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                            ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
                         const float dy0 = GGML_FP16_TO_FP32(dy[0]);
                         const float dy1 = GGML_FP16_TO_FP32(dy[n_dims/2]);
@@ -11161,6 +11305,7 @@ static void ggml_compute_forward_rope_back_f16(
                 }
             }
         }
+        }
     }
 
 static void ggml_compute_forward_rope_back(
ggml.h (2)
@@ -372,7 +372,7 @@ extern "C" {
 
         char name[32];
 
-        char padding[9]; // TODO: remove and add padding to name?
+        char padding[16];
     };
 
     // computation graph