Commit 438283c12d8378c449b78e011cb4f9c0ff33dcc3

Authored by animetosho
1 parent f373b138
Exists in neon_fixes

Use a strategy similar to SPLIT(32,4) for the SPLIT(16,4) ALTMAP NEON implementation
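Both implementations hinge on one trick: the 4-bit table lookups are written against AArch64's 16-byte vqtbl1q_u8, and 32-bit NEON gets the same source line via a macro that splits the lookup into two vtbl2_u8 calls. A minimal standalone sketch of that idea (it tests __aarch64__ directly where the patch uses gf-complete's own ARCH_AARCH64 macro, and lookup16 is a hypothetical name for illustration):

#include <arm_neon.h>

#ifdef __aarch64__
typedef uint8x16_t tbl16_t;   /* table is one 16-byte vector */
#else
typedef uint8x8x2_t tbl16_t;  /* table is a pair of 8-byte halves */
/* Emulate the AArch64 16-byte lookup: vtbl2_u8 indexes a 2x8-byte
   table with an 8-byte index vector, so look up each half of the
   index and rejoin the results. */
#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
                                       vtbl2_u8(tbl, vget_high_u8(v)))
#endif

/* lookup16: a single-source call site that compiles on both ISAs. */
static inline uint8x16_t lookup16(tbl16_t tbl, uint8x16_t idx)
{
  return vqtbl1q_u8(tbl, idx);
}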

Showing 1 changed file with 41 additions and 95 deletions
src/neon/gf_w16_neon.c
@@ -105,58 +105,6 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
   }
 }
 
-static
-inline
-void
-neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
-                                        uint8_t *dst, uint8_t *d_end,
-                                        uint8_t *tbl, gf_val_32_t val,
-                                        int xor)
-{
-  unsigned i;
-  uint8_t *high = tbl + 4 * 16;
-  uint8x16_t vh, vl, rh, rl;
-  uint8x16_t loset;
-
-  uint8x16_t tbl_h[4], tbl_l[4];
-  for (i = 0; i < 4; i++) {
-    tbl_l[i] = vld1q_u8(tbl + i*16);
-    tbl_h[i] = vld1q_u8(high + i*16);
-  }
-
-  loset = vdupq_n_u8(0xf);
-
-  while (dst < d_end) {
-    vh = vld1q_u8(src);
-    vl = vld1q_u8(src + 16);
-
-    rl = vqtbl1q_u8(tbl_l[0], vandq_u8(vl, loset));
-    rh = vqtbl1q_u8(tbl_h[0], vandq_u8(vl, loset));
-    rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(vh, loset)));
-    rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(vh, loset)));
-
-    vl = vshrq_n_u8(vl, 4);
-    vh = vshrq_n_u8(vh, 4);
-
-    rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], vl));
-    rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], vl));
-    rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], vh));
-    rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], vh));
-
-    if (xor) {
-      vh = vld1q_u8(dst);
-      vl = vld1q_u8(dst + 16);
-      rh = veorq_u8(rh, vh);
-      rl = veorq_u8(rl, vl);
-    }
-    vst1q_u8(dst, rh);
-    vst1q_u8(dst + 16, rl);
-
-    src += 32;
-    dst += 32;
-  }
-}
-
 #else /* ARCH_AARCH64 */
 
 static
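This first hunk only removes code: the AArch64-specific copy of neon_w16_split_4_altmap_multiply_region is deleted, since after the two hunks below the formerly 32-bit-only body handles both architectures.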
@@ -211,6 +159,12 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
     dst += 8;
   }
 }
+#endif /* ARCH_AARCH64 */
+
+#ifndef ARCH_AARCH64
+#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
+                                       vtbl2_u8(tbl, vget_high_u8(v)))
+#endif
 
 static
 inline
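The six added lines above carry the change: outside the AArch64 build, vqtbl1q_u8 is defined in terms of vtbl2_u8, which looks up a 2x8-byte table (a uint8x8x2_t) with an 8-byte index vector. Doing that once per half of a 16-byte index and rejoining the halves with vcombine_u8 matches the behavior of AArch64's native vqtbl1q_u8 (both zero lanes whose index is 16 or more, and this kernel only produces indices in 0..15 anyway), so the lookup code below can be written once for both ISAs.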
@@ -222,68 +176,60 @@ neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
 {
   unsigned i;
   uint8_t *high = tbl + 4 * 16;
-  uint8x8_t vh0, vh1, vl0, vl1, rh0, rh1, rl0, rl1;
-  uint8x8_t loset;
+  uint8x16_t vh, vl, rh, rl;
+  uint8x16_t loset;
 
+#ifdef ARCH_AARCH64
+  uint8x16_t tbl_h[4], tbl_l[4];
+#else
   uint8x8x2_t tbl_h[4], tbl_l[4];
+#endif
   for (i = 0; i < 4; i++) {
+#ifdef ARCH_AARCH64
+    tbl_l[i] = vld1q_u8(tbl + i*16);
+    tbl_h[i] = vld1q_u8(high + i*16);
+#else
     tbl_l[i].val[0] = vld1_u8(tbl + i*16);
     tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
     tbl_h[i].val[0] = vld1_u8(high + i*16);
     tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
+#endif
   }
 
-  loset = vdup_n_u8(0xf);
+  loset = vdupq_n_u8(0xf);
 
   while (dst < d_end) {
-    vh0 = vld1_u8(src);
-    vh1 = vld1_u8(src + 8);
-    vl0 = vld1_u8(src + 16);
-    vl1 = vld1_u8(src + 24);
-
-    rl0 = vtbl2_u8(tbl_l[0], vand_u8(vl0, loset));
-    rl1 = vtbl2_u8(tbl_l[0], vand_u8(vl1, loset));
-    rh0 = vtbl2_u8(tbl_h[0], vand_u8(vl0, loset));
-    rh1 = vtbl2_u8(tbl_h[0], vand_u8(vl1, loset));
-    rl0 = veor_u8(rl0, vtbl2_u8(tbl_l[2], vand_u8(vh0, loset)));
-    rl1 = veor_u8(rl1, vtbl2_u8(tbl_l[2], vand_u8(vh1, loset)));
-    rh0 = veor_u8(rh0, vtbl2_u8(tbl_h[2], vand_u8(vh0, loset)));
-    rh1 = veor_u8(rh1, vtbl2_u8(tbl_h[2], vand_u8(vh1, loset)));
-
-    vh0 = vshr_n_u8(vh0, 4);
-    vh1 = vshr_n_u8(vh1, 4);
-    vl0 = vshr_n_u8(vl0, 4);
-    vl1 = vshr_n_u8(vl1, 4);
-
-    rl0 = veor_u8(rl0, vtbl2_u8(tbl_l[1], vl0));
-    rl1 = veor_u8(rl1, vtbl2_u8(tbl_l[1], vl1));
-    rh0 = veor_u8(rh0, vtbl2_u8(tbl_h[1], vl0));
-    rh1 = veor_u8(rh1, vtbl2_u8(tbl_h[1], vl1));
-    rl0 = veor_u8(rl0, vtbl2_u8(tbl_l[3], vh0));
-    rl1 = veor_u8(rl1, vtbl2_u8(tbl_l[3], vh1));
-    rh0 = veor_u8(rh0, vtbl2_u8(tbl_h[3], vh0));
-    rh1 = veor_u8(rh1, vtbl2_u8(tbl_h[3], vh1));
+    vh = vld1q_u8(src);
+    vl = vld1q_u8(src + 16);
+
+    rl = vqtbl1q_u8(tbl_l[0], vandq_u8(vl, loset));
+    rh = vqtbl1q_u8(tbl_h[0], vandq_u8(vl, loset));
+    rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(vh, loset)));
+    rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(vh, loset)));
+
+    vl = vshrq_n_u8(vl, 4);
+    vh = vshrq_n_u8(vh, 4);
+
+    rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], vl));
+    rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], vl));
+    rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], vh));
+    rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], vh));
 
     if (xor) {
-      vh0 = vld1_u8(dst);
-      vh1 = vld1_u8(dst + 8);
-      vl0 = vld1_u8(dst + 16);
-      vl1 = vld1_u8(dst + 24);
-      rh0 = veor_u8(rh0, vh0);
-      rh1 = veor_u8(rh1, vh1);
-      rl0 = veor_u8(rl0, vl0);
-      rl1 = veor_u8(rl1, vl1);
+      vh = vld1q_u8(dst);
+      vl = vld1q_u8(dst + 16);
+      rh = veorq_u8(rh, vh);
+      rl = veorq_u8(rl, vl);
     }
-    vst1_u8(dst, rh0);
-    vst1_u8(dst + 8, rh1);
-    vst1_u8(dst + 16, rl0);
-    vst1_u8(dst + 24, rl1);
+    vst1q_u8(dst, rh);
+    vst1q_u8(dst + 16, rl);
 
     src += 32;
     dst += 32;
   }
 }
-#endif /* ARCH_AARCH64 */
+
+
 
 static
 inline
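For reference, the unified kernel assumes the ALTMAP layout: each 32-byte block holds sixteen 16-bit words split by byte, high bytes at src and low bytes at src + 16, with eight 16-entry nibble tables precomputed from val by the caller, low-byte outputs at tbl and high-byte outputs at tbl + 4*16. A scalar sketch of the per-word product the vector loop computes sixteen at a time (split16_4_mul_one is a hypothetical name, not part of the patch):

#include <stdint.h>

/* One GF(2^16) product w * val via four 4-bit lookups.  Table i,
   indexed by nibble i of w, holds the low (tbl) and high (tbl + 64)
   bytes of the partial product for that nibble; XORing the four
   partial products gives the full product. */
static uint16_t split16_4_mul_one(const uint8_t *tbl, uint16_t w)
{
  const uint8_t *high = tbl + 4 * 16;
  uint8_t lo = 0, hi = 0;
  unsigned i;

  for (i = 0; i < 4; i++) {
    unsigned nib = (w >> (4 * i)) & 0xf;  /* nibble i of the word */
    lo ^= tbl[i * 16 + nib];              /* low byte of partial  */
    hi ^= high[i * 16 + nib];             /* high byte of partial */
  }
  return (uint16_t)(lo | (hi << 8));
}

The NEON loop evaluates this for 16 words at once: vl and vh hold the 16 low and high source bytes, so vl & 0xf is nibble 0, vl >> 4 nibble 1, vh & 0xf nibble 2, and vh >> 4 nibble 3, matching the tbl_l[0..3]/tbl_h[0..3] indexing above.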