@@ -330,7 +330,7 @@ static ggml_fp16_t table_exp_f16[1 << 16];
330330// precomputed f32 table for f16 (256 KB)
331331static float table_f32_f16 [1 <<16 ];
332332
333- #if defined(__ARM_NEON )
333+ #if defined(__ARM_NEON )|| defined( __wasm_simd128__ )
334334#define B1 (c ,s ,n ) 0x ## n ## c , 0x ## n ## s
335335#define B2 (c ,s ,n ) B1(c,s,n ## c), B1(c,s,n ## s)
336336#define B3 (c ,s ,n ) B2(c,s,n ## c), B2(c,s,n ## s)
@@ -1087,7 +1087,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
10871087const v128_t v = wasm_f32x4_mul (srcv [l ],wasm_f32x4_splat (id ));
10881088const v128_t vf = wasm_f32x4_add (v ,wasm_f32x4_splat (8.5f ));
10891089const v128_t vi = wasm_i32x4_trunc_sat_f32x4 (vf );
1090- const v128_t vc = wasm_i32x4_min_u (vi ,wasm_i32x4_splat (15 ));
1090+ const v128_t vc = wasm_i32x4_min (vi ,wasm_i32x4_splat (15 ));
10911091
10921092y [i ].qs [2 * l + 0 ]= wasm_i32x4_extract_lane (vc ,0 ) | (wasm_i32x4_extract_lane (vc ,1 ) <<4 );
10931093y [i ].qs [2 * l + 1 ]= wasm_i32x4_extract_lane (vc ,2 ) | (wasm_i32x4_extract_lane (vc ,3 ) <<4 );
@@ -3180,6 +3180,72 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
31803180 }
31813181
31823182* s = vaddvq_f32 (sumv );
3183+ #elif defined(__wasm_simd128__ )
3184+ v128_t sumv = wasm_f32x4_splat (0.0f );
3185+
3186+ uint64_t tmp [4 ];
3187+
3188+ for (int i = 0 ;i < nb ;++ i ) {
3189+ const block_q5_0 * restrictx0 = & x [i ];
3190+ const block_q8_0 * restricty0 = & y [i ];
3191+
3192+ const v128_t m4b = wasm_i8x16_splat (0x0F );
3193+ const v128_t s16b = wasm_i8x16_splat (0x10 );
3194+
3195+ // extract the 5th bit
3196+ uint32_t qh ;
3197+ memcpy (& qh ,x0 -> qh ,sizeof (qh ));
3198+
3199+ tmp [0 ]= table_b2b_u [(qh >>0 )& 0xFF ];
3200+ tmp [1 ]= table_b2b_u [(qh >>8 )& 0xFF ];
3201+ tmp [2 ]= table_b2b_u [(qh >>16 )& 0xFF ];
3202+ tmp [3 ]= table_b2b_u [(qh >>24 ) ];
3203+
3204+ const v128_t qhl = wasm_v128_load (tmp + 0 );
3205+ const v128_t qhh = wasm_v128_load (tmp + 2 );
3206+
3207+ const v128_t v0 = wasm_v128_load (x0 -> qs );
3208+
3209+ // 4-bit -> 8-bit
3210+ const v128_t v0l = wasm_v128_and (v0 ,m4b );
3211+ const v128_t v0h = wasm_u8x16_shr (v0 ,4 );
3212+
3213+ // interleave
3214+ const v128_t v0lz = wasm_v8x16_shuffle (v0l ,v0h ,0 ,16 ,1 ,17 ,2 ,18 ,3 ,19 ,4 ,20 ,5 ,21 ,6 ,22 ,7 ,23 );
3215+ const v128_t v0hz = wasm_v8x16_shuffle (v0l ,v0h ,8 ,24 ,9 ,25 ,10 ,26 ,11 ,27 ,12 ,28 ,13 ,29 ,14 ,30 ,15 ,31 );
3216+
3217+ // add high bit and sub 16
3218+ const v128_t v0lf = wasm_i8x16_sub (wasm_v128_or (v0lz ,qhl ),s16b );
3219+ const v128_t v0hf = wasm_i8x16_sub (wasm_v128_or (v0hz ,qhh ),s16b );
3220+
3221+ // load y
3222+ const v128_t v1l = wasm_v128_load (y0 -> qs );
3223+ const v128_t v1h = wasm_v128_load (y0 -> qs + 16 );
3224+
3225+ // int8x16 -> int16x8
3226+ const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf );
3227+ const v128_t v0lfh = wasm_i16x8_extend_high_i8x16 (v0lf );
3228+ const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf );
3229+ const v128_t v0hfh = wasm_i16x8_extend_high_i8x16 (v0hf );
3230+
3231+ const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l );
3232+ const v128_t v1lh = wasm_i16x8_extend_high_i8x16 (v1l );
3233+ const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h );
3234+ const v128_t v1hh = wasm_i16x8_extend_high_i8x16 (v1h );
3235+
3236+ const float x0d = GGML_FP16_TO_FP32 (x0 -> d );
3237+
3238+ // dot product
3239+ sumv = wasm_f32x4_add (sumv ,wasm_f32x4_mul (wasm_f32x4_convert_i32x4 (
3240+ wasm_i32x4_add (
3241+ wasm_i32x4_add (wasm_i32x4_dot_i16x8 (v0lfl ,v1ll ),
3242+ wasm_i32x4_dot_i16x8 (v0lfh ,v1lh )),
3243+ wasm_i32x4_add (wasm_i32x4_dot_i16x8 (v0hfl ,v1hl ),
3244+ wasm_i32x4_dot_i16x8 (v0hfh ,v1hh )))),wasm_f32x4_splat (x0d * y0 -> d )));
3245+ }
3246+
3247+ * s = wasm_f32x4_extract_lane (sumv ,0 )+ wasm_f32x4_extract_lane (sumv ,1 )+
3248+ wasm_f32x4_extract_lane (sumv ,2 )+ wasm_f32x4_extract_lane (sumv ,3 );
31833249#elif defined(__AVX2__ )
31843250// Initialize accumulator with zeros
31853251__m256 acc = _mm256_setzero_ps ();
@@ -3311,6 +3377,77 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
33113377 }
33123378
33133379* s = vaddvq_f32 (sumv )+ summs ;
3380+ #elif defined(__wasm_simd128__ )
3381+ v128_t sumv = wasm_f32x4_splat (0.0f );
3382+
3383+ float summs = 0.0f ;
3384+
3385+ uint64_t tmp [4 ];
3386+
3387+ for (int i = 0 ;i < nb ;++ i ) {
3388+ const block_q5_1 * restrictx0 = & x [i ];
3389+ const block_q8_1 * restricty0 = & y [i ];
3390+
3391+ summs += GGML_FP16_TO_FP32 (x0 -> m )* (y0 -> s0 + y0 -> s1 );
3392+
3393+ const v128_t m4b = wasm_i8x16_splat (0x0F );
3394+
3395+ // extract the 5th bit
3396+ uint32_t qh ;
3397+ memcpy (& qh ,x0 -> qh ,sizeof (qh ));
3398+
3399+ tmp [0 ]= table_b2b_u [(qh >>0 )& 0xFF ];
3400+ tmp [1 ]= table_b2b_u [(qh >>8 )& 0xFF ];
3401+ tmp [2 ]= table_b2b_u [(qh >>16 )& 0xFF ];
3402+ tmp [3 ]= table_b2b_u [(qh >>24 ) ];
3403+
3404+ const v128_t qhl = wasm_v128_load (tmp + 0 );
3405+ const v128_t qhh = wasm_v128_load (tmp + 2 );
3406+
3407+ const v128_t v0 = wasm_v128_load (x0 -> qs );
3408+
3409+ // 4-bit -> 8-bit
3410+ const v128_t v0l = wasm_v128_and (v0 ,m4b );
3411+ const v128_t v0h = wasm_u8x16_shr (v0 ,4 );
3412+
3413+ static bool x = true;
3414+
3415+ // interleave
3416+ const v128_t v0lz = wasm_v8x16_shuffle (v0l ,v0h ,0 ,16 ,1 ,17 ,2 ,18 ,3 ,19 ,4 ,20 ,5 ,21 ,6 ,22 ,7 ,23 );
3417+ const v128_t v0hz = wasm_v8x16_shuffle (v0l ,v0h ,8 ,24 ,9 ,25 ,10 ,26 ,11 ,27 ,12 ,28 ,13 ,29 ,14 ,30 ,15 ,31 );
3418+
3419+ // add high bit
3420+ const v128_t v0lf = wasm_v128_or (v0lz ,qhl );
3421+ const v128_t v0hf = wasm_v128_or (v0hz ,qhh );
3422+
3423+ // load y
3424+ const v128_t v1l = wasm_v128_load (y0 -> qs );
3425+ const v128_t v1h = wasm_v128_load (y0 -> qs + 16 );
3426+
3427+ // int8x16 -> int16x8
3428+ const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf );
3429+ const v128_t v0lfh = wasm_i16x8_extend_high_i8x16 (v0lf );
3430+ const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf );
3431+ const v128_t v0hfh = wasm_i16x8_extend_high_i8x16 (v0hf );
3432+
3433+ const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l );
3434+ const v128_t v1lh = wasm_i16x8_extend_high_i8x16 (v1l );
3435+ const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h );
3436+ const v128_t v1hh = wasm_i16x8_extend_high_i8x16 (v1h );
3437+
3438+ const float x0d = GGML_FP16_TO_FP32 (x0 -> d );
3439+
3440+ // dot product
3441+ sumv = wasm_f32x4_add (sumv ,wasm_f32x4_mul (wasm_f32x4_convert_i32x4 (
3442+ wasm_i32x4_add (
3443+ wasm_i32x4_add (wasm_i32x4_dot_i16x8 (v0lfl ,v1ll ),
3444+ wasm_i32x4_dot_i16x8 (v0lfh ,v1lh )),
3445+ wasm_i32x4_add (wasm_i32x4_dot_i16x8 (v0hfl ,v1hl ),
3446+ wasm_i32x4_dot_i16x8 (v0hfh ,v1hh )))),wasm_f32x4_splat (x0d * y0 -> d )));
3447+ }
3448+
3449+ * s = wasm_f32x4_extract_lane (sumv ,0 )+ wasm_f32x4_extract_lane (sumv ,1 )+
3450+ wasm_f32x4_extract_lane (sumv ,2 )+ wasm_f32x4_extract_lane (sumv ,3 )+ summs ;
33143451#elif defined(__AVX2__ )
33153452// Initialize accumulator with zeros
33163453__m256 acc = _mm256_setzero_ps ();
@@ -4057,6 +4194,27 @@ bool ggml_is_quantized(enum ggml_type type) {
40574194return GGML_IS_QUANTIZED [type ];
40584195}
40594196
4197+ enum ggml_type ggml_ftype_to_ggml_type (enum ggml_ftype ftype ) {
4198+ enum ggml_type wtype = GGML_TYPE_COUNT ;
4199+
4200+ switch (ftype ) {
4201+ case GGML_FTYPE_ALL_F32 :wtype = GGML_TYPE_F32 ;break ;
4202+ case GGML_FTYPE_MOSTLY_F16 :wtype = GGML_TYPE_F16 ;break ;
4203+ case GGML_FTYPE_MOSTLY_Q4_0 :wtype = GGML_TYPE_Q4_0 ;break ;
4204+ case GGML_FTYPE_MOSTLY_Q4_1 :wtype = GGML_TYPE_Q4_1 ;break ;
4205+ case GGML_FTYPE_MOSTLY_Q4_2 :wtype = GGML_TYPE_Q4_2 ;break ;
4206+ case GGML_FTYPE_MOSTLY_Q5_0 :wtype = GGML_TYPE_Q5_0 ;break ;
4207+ case GGML_FTYPE_MOSTLY_Q5_1 :wtype = GGML_TYPE_Q5_1 ;break ;
4208+ case GGML_FTYPE_MOSTLY_Q8_0 :wtype = GGML_TYPE_Q8_0 ;break ;
4209+ case GGML_FTYPE_UNKNOWN :wtype = GGML_TYPE_COUNT ;break ;
4210+ case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 :wtype = GGML_TYPE_COUNT ;break ;
4211+ }
4212+
4213+ GGML_ASSERT (wtype != GGML_TYPE_COUNT );
4214+
4215+ return wtype ;
4216+ }
4217+
40604218static inline bool ggml_is_transposed (const struct ggml_tensor * tensor ) {
40614219return tensor -> nb [0 ]> tensor -> nb [1 ];
40624220}