@@ -1536,6 +1536,8 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1
15361536#endif // CV_SIMD
15371537
15381538
1539+
1540+
15391541struct RGB2Lab_b
15401542{
15411543typedef uchar channel_type;
@@ -1571,6 +1573,69 @@ struct RGB2Lab_b
15711573 }
15721574 }
15731575
#if CV_NEON
    // Vectorized RGB -> Lab conversion for one batch of 4 pixels.
    //
    // tab          : linearization LUT mapping an 8-bit channel value to a
    //                fixed-point linear intensity (same table the scalar path uses).
    // vRi/vGi/vBi  : deinterleaved 8-bit R, G, B vectors; batch <n> reads
    //                byte lanes 4*n .. 4*n+3.
    // vL/va/vb     : outputs, one 32-bit L/a/b value per pixel.
    //
    // Vector form of the scalar code:
    //   fX = LabCbrtTab_b[CV_DESCALE(R*C0 + G*C1 + B*C2, lab_shift)]   (fY, fZ alike)
    //   L  = CV_DESCALE(Lscale*fY + Lshift, lab_shift2)
    //   a  = CV_DESCALE(500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2)
    //   b  = CV_DESCALE(200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2)
    template <int n>
    inline void rgb2lab_batch(const ushort* tab,
                              const v_uint8 vRi, const v_uint8 vGi, const v_uint8 vBi,
                              v_int32& vL, v_int32& va, v_int32& vb) const
    {
        // Scalar constants shared with the non-vectorized implementation below.
        const int Lscale = (116*255 + 50)/100;
        const int Lshift = -((16*255*(1 << lab_shift2) + 50)/100);
        const int xyzDescaleShift = (1 << (lab_shift - 1));   // rounding bias for lab_shift descale
        const int labDescaleShift = (1 << (lab_shift2 - 1));  // rounding bias for lab_shift2 descale
        const int abShift = 128*(1 << lab_shift2);

        const int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
                  C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
                  C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];

        // Gather LUT-corrected channel values for this batch of 4 pixels,
        // i.e. the vector form of: R = tab[src[0]], G = tab[src[1]], B = tab[src[2]].
        v_int32 vR(tab[v_extract_n<4*n+0>(vRi)], tab[v_extract_n<4*n+1>(vRi)],
                   tab[v_extract_n<4*n+2>(vRi)], tab[v_extract_n<4*n+3>(vRi)]);
        v_int32 vG(tab[v_extract_n<4*n+0>(vGi)], tab[v_extract_n<4*n+1>(vGi)],
                   tab[v_extract_n<4*n+2>(vGi)], tab[v_extract_n<4*n+3>(vGi)]);
        v_int32 vB(tab[v_extract_n<4*n+0>(vBi)], tab[v_extract_n<4*n+1>(vBi)],
                   tab[v_extract_n<4*n+2>(vBi)], tab[v_extract_n<4*n+3>(vBi)]);

        // One row of the RGB->XYZ matrix, descaled with rounding:
        // CV_DESCALE(R*c0 + G*c1 + B*c2, lab_shift)
        const auto xyzRow = [&](int c0, int c1, int c2)
        {
            v_int32 acc = v_fma(vR, v_setall_s32(c0), v_setall_s32(xyzDescaleShift));
            acc = v_fma(vG, v_setall_s32(c1), acc);
            acc = v_fma(vB, v_setall_s32(c2), acc);
            return v_shr<lab_shift>(acc);
        };
        // Cube-root LUT lookup, performed lane by lane (indices are runtime values,
        // so there is no vector gather here).
        const auto cbrtLUT = [](const v_int32& idx)
        {
            return v_int32(LabCbrtTab_b[v_extract_n<0>(idx)], LabCbrtTab_b[v_extract_n<1>(idx)],
                           LabCbrtTab_b[v_extract_n<2>(idx)], LabCbrtTab_b[v_extract_n<3>(idx)]);
        };

        const v_int32 vfX = cbrtLUT(xyzRow(C0, C1, C2));
        const v_int32 vfY = cbrtLUT(xyzRow(C3, C4, C5));
        const v_int32 vfZ = cbrtLUT(xyzRow(C6, C7, C8));

        // L = CV_DESCALE(Lscale*fY + Lshift, lab_shift2)
        vL = v_shr<lab_shift2>(v_fma(vfY, v_setall_s32(Lscale), v_setall_s32(Lshift + labDescaleShift)));
        // a = CV_DESCALE(500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2)
        va = v_shr<lab_shift2>(v_fma(vfX - vfY, v_setall_s32(500), v_setall_s32(abShift + labDescaleShift)));
        // b = CV_DESCALE(200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2)
        vb = v_shr<lab_shift2>(v_fma(vfY - vfZ, v_setall_s32(200), v_setall_s32(abShift + labDescaleShift)));
    }
#endif // CV_NEON
1638+
15741639void operator ()(const uchar* src, uchar* dst,int n)const
15751640 {
15761641CV_INSTRUMENT_REGION ();
@@ -1585,6 +1650,45 @@ struct RGB2Lab_b
15851650
15861651 i =0 ;
15871652
#if CV_NEON
        // Main NEON loop: each iteration consumes v_uint8::nlanes pixels
        // (scn channels each) and emits nlanes interleaved Lab triplets.
        for ( ; i <= n - v_uint8::nlanes;
              i += v_uint8::nlanes, src += scn*v_uint8::nlanes, dst += 3*v_uint8::nlanes)
        {
            // Deinterleave the source into per-channel byte vectors; the
            // alpha channel (scn == 4) is loaded and discarded.
            v_uint8 vRi, vGi, vBi;
            if (scn == 4)
            {
                v_uint8 vAi;
                v_load_deinterleave(src, vRi, vGi, vBi, vAi);
            }
            else // scn == 3
            {
                v_load_deinterleave(src, vRi, vGi, vBi);
            }

            // Process the lanes as four batches of 4 pixels each
            // (the batch index is a template parameter, so this stays unrolled).
            v_int32 vL0, va0, vb0, vL1, va1, vb1, vL2, va2, vb2, vL3, va3, vb3;
            rgb2lab_batch<0>(tab, vRi, vGi, vBi, vL0, va0, vb0);
            rgb2lab_batch<1>(tab, vRi, vGi, vBi, vL1, va1, vb1);
            rgb2lab_batch<2>(tab, vRi, vGi, vBi, vL2, va2, vb2);
            rgb2lab_batch<3>(tab, vRi, vGi, vBi, vL3, va3, vb3);

            // Narrow s32 -> u8 with unsigned saturation (the vector form of
            // dst[k] = saturate_cast<uchar>(...)) and store interleaved L,a,b.
            v_store_interleave(dst,
                               v_pack(v_pack_u(vL0, vL1), v_pack_u(vL2, vL3)),
                               v_pack(v_pack_u(va0, va1), v_pack_u(va2, va3)),
                               v_pack(v_pack_u(vb0, vb1), v_pack_u(vb2, vb3)));
        }
#endif // CV_NEON
1691+
15881692#if CV_SIMD
15891693const int vsize = v_uint8::nlanes;
15901694const int xyzDescaleShift =1 << (lab_shift -1 );