62 const line_buf* aug,
ui32 repeat,
bool synthesis) = NULL;
81 const line_buf* aug,
ui32 repeat,
bool synthesis) = NULL;
100 static std::once_flag wavelet_transform_functions_init_flag;
101 std::call_once(wavelet_transform_functions_init_flag, [](){
102#if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN)
113 #ifndef OJPH_DISABLE_SIMD
115 #if (defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386))
117 #ifndef OJPH_DISABLE_SSE
127 #ifndef OJPH_DISABLE_SSE2
136 #ifndef OJPH_DISABLE_AVX
146 #ifndef OJPH_DISABLE_AVX2
155 #if (defined(OJPH_ARCH_X86_64) && !defined(OJPH_DISABLE_AVX512))
169 #elif defined(OJPH_ARCH_ARM)
190#if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN)
196 ui32 repeat,
bool synthesis)
203 const si32* src1 = sig->
i32, * src2 = other->
i32;
210 for (
ui32 i = repeat; i > 0; --i)
211 *dst++ -= (b + *src1++ + *src2++) >> e;
213 for (
ui32 i = repeat; i > 0; --i)
214 *dst++ += (b + *src1++ + *src2++) >> e;
216 else if (a == -1 && b == 1 && e == 1)
219 for (
ui32 i = repeat; i > 0; --i)
220 *dst++ += (*src1++ + *src2++) >> e;
222 for (
ui32 i = repeat; i > 0; --i)
223 *dst++ -= (*src1++ + *src2++) >> e;
228 for (
ui32 i = repeat; i > 0; --i)
229 *dst++ -= (b - (*src1++ + *src2++)) >> e;
231 for (
ui32 i = repeat; i > 0; --i)
232 *dst++ += (b - (*src1++ + *src2++)) >> e;
236 for (
ui32 i = repeat; i > 0; --i)
237 *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
239 for (
ui32 i = repeat; i > 0; --i)
240 *dst++ += (b + a * (*src1++ + *src2++)) >> e;
248 ui32 repeat,
bool synthesis)
255 const si64* src1 = sig->
i64, * src2 = other->
i64;
262 for (
ui32 i = repeat; i > 0; --i)
263 *dst++ -= (b + *src1++ + *src2++) >> e;
265 for (
ui32 i = repeat; i > 0; --i)
266 *dst++ += (b + *src1++ + *src2++) >> e;
268 else if (a == -1 && b == 1 && e == 1)
271 for (
ui32 i = repeat; i > 0; --i)
272 *dst++ += (*src1++ + *src2++) >> e;
274 for (
ui32 i = repeat; i > 0; --i)
275 *dst++ -= (*src1++ + *src2++) >> e;
280 for (
ui32 i = repeat; i > 0; --i)
281 *dst++ -= (b - (*src1++ + *src2++)) >> e;
283 for (
ui32 i = repeat; i > 0; --i)
284 *dst++ += (b - (*src1++ + *src2++)) >> e;
288 for (
ui32 i = repeat; i > 0; --i)
289 *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
291 for (
ui32 i = repeat; i > 0; --i)
292 *dst++ += (b + a * (*src1++ + *src2++)) >> e;
299 ui32 repeat,
bool synthesis)
323 ui32 width,
bool even)
336 for (; w > 1; w -= 2)
338 *dpl++ = *sp++; *dph++ = *sp++;
346 ui32 l_width = (width + (even ? 1 : 0)) >> 1;
347 ui32 h_width = (width + (even ? 0 : 1)) >> 1;
349 for (
ui32 j = num_steps; j > 0; --j)
359 lp[l_width] = lp[l_width - 1];
361 const si32* sp = lp + (even ? 1 : 0);
365 for (
ui32 i = h_width; i > 0; --i, sp++, dp++)
366 *dp += (b + (sp[-1] + sp[0])) >> e;
368 else if (a == -1 && b == 1 && e == 1)
370 for (
ui32 i = h_width; i > 0; --i, sp++, dp++)
371 *dp -= (sp[-1] + sp[0]) >> e;
375 for (
ui32 i = h_width; i > 0; --i, sp++, dp++)
376 *dp += (b - (sp[-1] + sp[0])) >> e;
380 for (
ui32 i = h_width; i > 0; --i, sp++, dp++)
381 *dp += (b + a * (sp[-1] + sp[0])) >> e;
385 si32* t = lp; lp = hp; hp = t;
387 ui32 w = l_width; l_width = h_width; h_width = w;
392 ldst->
i32[0] = src->
i32[0];
394 hdst->
i32[0] = src->
i32[0] << 1;
402 ui32 width,
bool even)
415 for (; w > 1; w -= 2)
417 *dpl++ = *sp++; *dph++ = *sp++;
425 ui32 l_width = (width + (even ? 1 : 0)) >> 1;
426 ui32 h_width = (width + (even ? 0 : 1)) >> 1;
428 for (
ui32 j = num_steps; j > 0; --j)
438 lp[l_width] = lp[l_width - 1];
440 const si64* sp = lp + (even ? 1 : 0);
444 for (
ui32 i = h_width; i > 0; --i, sp++, dp++)
445 *dp += (b + (sp[-1] + sp[0])) >> e;
447 else if (a == -1 && b == 1 && e == 1)
449 for (
ui32 i = h_width; i > 0; --i, sp++, dp++)
450 *dp -= (sp[-1] + sp[0]) >> e;
454 for (
ui32 i = h_width; i > 0; --i, sp++, dp++)
455 *dp += (b - (sp[-1] + sp[0])) >> e;
459 for (
ui32 i = h_width; i > 0; --i, sp++, dp++)
460 *dp += (b + a * (sp[-1] + sp[0])) >> e;
464 si64* t = lp; lp = hp; hp = t;
466 ui32 w = l_width; l_width = h_width; h_width = w;
471 ldst->
i64[0] = src->
i64[0];
473 hdst->
i64[0] = src->
i64[0] << 1;
480 ui32 width,
bool even)
501 ui32 width,
bool even)
507 ui32 aug_width = (width + (even ? 1 : 0)) >> 1;
508 ui32 oth_width = (width + (even ? 0 : 1)) >> 1;
510 for (
ui32 j = 0; j < num_steps; ++j)
519 oth[oth_width] = oth[oth_width - 1];
521 const si32* sp = oth + (ev ? 0 : 1);
525 for (
ui32 i = aug_width; i > 0; --i, sp++, dp++)
526 *dp -= (b + (sp[-1] + sp[0])) >> e;
528 else if (a == -1 && b == 1 && e == 1)
530 for (
ui32 i = aug_width; i > 0; --i, sp++, dp++)
531 *dp += (sp[-1] + sp[0]) >> e;
535 for (
ui32 i = aug_width; i > 0; --i, sp++, dp++)
536 *dp -= (b - (sp[-1] + sp[0])) >> e;
540 for (
ui32 i = aug_width; i > 0; --i, sp++, dp++)
541 *dp -= (b + a * (sp[-1] + sp[0])) >> e;
545 si32* t = aug; aug = oth; oth = t;
547 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
559 for (; w > 1; w -= 2)
561 *dp++ = *spl++; *dp++ = *sph++;
570 dst->
i32[0] = lsrc->
i32[0];
572 dst->
i32[0] = hsrc->
i32[0] >> 1;
580 ui32 width,
bool even)
586 ui32 aug_width = (width + (even ? 1 : 0)) >> 1;
587 ui32 oth_width = (width + (even ? 0 : 1)) >> 1;
589 for (
ui32 j = 0; j < num_steps; ++j)
598 oth[oth_width] = oth[oth_width - 1];
600 const si64* sp = oth + (ev ? 0 : 1);
604 for (
ui32 i = aug_width; i > 0; --i, sp++, dp++)
605 *dp -= (b + (sp[-1] + sp[0])) >> e;
607 else if (a == -1 && b == 1 && e == 1)
609 for (
ui32 i = aug_width; i > 0; --i, sp++, dp++)
610 *dp += (sp[-1] + sp[0]) >> e;
614 for (
ui32 i = aug_width; i > 0; --i, sp++, dp++)
615 *dp -= (b - (sp[-1] + sp[0])) >> e;
619 for (
ui32 i = aug_width; i > 0; --i, sp++, dp++)
620 *dp -= (b + a * (sp[-1] + sp[0])) >> e;
624 si64* t = aug; aug = oth; oth = t;
626 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
638 for (; w > 1; w -= 2)
640 *dp++ = *spl++; *dp++ = *sph++;
649 dst->
i64[0] = lsrc->
i64[0];
651 dst->
i64[0] = hsrc->
i64[0] >> 1;
658 ui32 width,
bool even)
678 ui32 repeat,
bool synthesis)
685 float* dst = aug->
f32;
686 const float* src1 = sig->
f32, * src2 = other->
f32;
687 for (
ui32 i = repeat; i > 0; --i)
688 *dst++ += a * (*src1++ + *src2++);
694 float* dst = aug->
f32;
695 for (
ui32 i = repeat; i > 0; --i)
702 ui32 width,
bool even)
707 float* dph = hdst->
f32;
708 float* dpl = ldst->
f32;
709 float* sp = src->
f32;
715 for (; w > 1; w -= 2)
717 *dpl++ = *sp++; *dph++ = *sp++;
724 float* hp = hdst->
f32, * lp = ldst->
f32;
725 ui32 l_width = (width + (even ? 1 : 0)) >> 1;
726 ui32 h_width = (width + (even ? 0 : 1)) >> 1;
728 for (
ui32 j = num_steps; j > 0; --j)
735 lp[l_width] = lp[l_width - 1];
737 const float* sp = lp + (even ? 1 : 0);
739 for (
ui32 i = h_width; i > 0; --i, sp++, dp++)
740 *dp += a * (sp[-1] + sp[0]);
743 float* t = lp; lp = hp; hp = t;
745 ui32 w = l_width; l_width = h_width; h_width = w;
749 float K = atk->
get_K();
750 float K_inv = 1.0f / K;
754 for (
ui32 i = l_width; i > 0; --i)
758 for (
ui32 i = h_width; i > 0; --i)
764 ldst->
f32[0] = src->
f32[0];
766 hdst->
f32[0] = src->
f32[0] * 2.0f;
773 ui32 width,
bool even)
778 float* oth = hsrc->
f32, * aug = lsrc->
f32;
779 ui32 aug_width = (width + (even ? 1 : 0)) >> 1;
780 ui32 oth_width = (width + (even ? 0 : 1)) >> 1;
783 float K = atk->
get_K();
784 float K_inv = 1.0f / K;
788 for (
ui32 i = aug_width; i > 0; --i)
792 for (
ui32 i = oth_width; i > 0; --i)
797 for (
ui32 j = 0; j < num_steps; ++j)
804 oth[oth_width] = oth[oth_width - 1];
806 const float* sp = oth + (ev ? 0 : 1);
808 for (
ui32 i = aug_width; i > 0; --i, sp++, dp++)
809 *dp -= a * (sp[-1] + sp[0]);
812 float* t = aug; aug = oth; oth = t;
814 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
818 float* sph = hsrc->
f32;
819 float* spl = lsrc->
f32;
820 float* dp = dst->
f32;
823 { *dp++ = *sph++; --w; }
824 for (; w > 1; w -= 2)
825 { *dp++ = *spl++; *dp++ = *sph++; }
827 { *dp++ = *spl++; --w; }
831 dst->
f32[0] = lsrc->
f32[0];
833 dst->
f32[0] = hsrc->
f32[0] * 0.5f;
void(* rev_horz_ana)(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void gen_irv_vert_times_K(float K, const line_buf *aug, ui32 repeat)
void gen_rev_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void gen_rev_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
static void gen_rev_horz_syn32(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void sse2_rev_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
static void gen_rev_vert_step64(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void avx512_irv_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void wasm_rev_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void gen_rev_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void gen_irv_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void sse2_rev_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
static void gen_rev_vert_step32(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
static void gen_rev_horz_ana64(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void(* irv_vert_times_K)(float K, const line_buf *aug, ui32 repeat)
void gen_irv_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void(* irv_vert_step)(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void avx_irv_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void sse_irv_vert_times_K(float K, const line_buf *aug, ui32 repeat)
void avx2_rev_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void init_wavelet_transform_functions()
void wasm_rev_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void wasm_irv_vert_times_K(float K, const line_buf *aug, ui32 repeat)
static void gen_rev_horz_syn64(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void avx512_irv_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void avx2_rev_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void sse_irv_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void wasm_irv_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void sse_irv_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void wasm_irv_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void avx_irv_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void sse_irv_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void avx512_irv_vert_times_K(float K, const line_buf *aug, ui32 repeat)
void avx512_irv_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void avx_irv_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void avx2_rev_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void(* rev_horz_syn)(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void sse2_rev_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void wasm_rev_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void(* irv_horz_ana)(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void(* rev_vert_step)(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void gen_irv_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void avx_irv_vert_times_K(float K, const line_buf *aug, ui32 repeat)
void(* irv_horz_syn)(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void wasm_irv_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
static void gen_rev_horz_ana32(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
OJPH_EXPORT int get_cpu_ext_level()
@ X86_CPU_EXT_LEVEL_AVX512
ui32 get_num_steps() const
const lifting_step * get_step(ui32 s) const