Subversion Repository

Parent Directory | Revision Log


Revision 2058 - (show annotations)
Wed Nov 3 06:06:54 2010 UTC (2 years, 7 months ago) by acrux
File size: 167710 byte(s)
lame: initial commit, power architecture optimizations
1 --- libmp3lame/machine.h.orig 2008-04-23 10:50:31.000000000 +0900
2 +++ libmp3lame/machine.h 2010-03-01 14:05:00.000000000 +0900
3 @@ -174,6 +174,24 @@
4
5 #define NEQ(a,b) (!EQ(a,b))
6
7 +#ifdef ALTIVEC /* vector-literal helpers: VINITn takes n elements, VINITnALL repeats one value n times */
8 +#ifdef __APPLE_CC__ /* this branch expands to a parenthesised element list */
9 +#define VINIT4(a,b,c,d) (a,b,c,d)
10 +#define VINIT8(a,b,c,d,e,f,g,h) (a,b,c,d,e,f,g,h)
11 +#define VINIT16(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) (a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p)
12 +#define VINIT4ALL(a) (a,a,a,a)
13 +#define VINIT8ALL(a) (a,a,a,a,a,a,a,a)
14 +#define VINIT16ALL(a) (a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a)
15 +#else /* every other compiler gets a brace-enclosed element list */
16 +#define VINIT4(a,b,c,d) {a,b,c,d}
17 +#define VINIT8(a,b,c,d,e,f,g,h) {a,b,c,d,e,f,g,h}
18 +#define VINIT16(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) {a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p}
19 +#define VINIT4ALL(a) {a,a,a,a}
20 +#define VINIT8ALL(a) {a,a,a,a,a,a,a,a}
21 +#define VINIT16ALL(a) {a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a}
22 +#endif /* __APPLE_CC__ */
23 +#endif /* ALTIVEC */
24 +
25 #endif
26
27 /* end of machine.h */
28 --- libmp3lame/fft.c.orig 2008-04-13 03:18:06.000000000 +0900
29 +++ libmp3lame/fft.c 2010-03-01 14:05:00.000000000 +0900
30 @@ -38,6 +38,12 @@
31 # include <config.h>
32 #endif
33
34 +#ifdef ALTIVEC
35 +#ifndef __APPLE_CC__
36 +#include <altivec.h>
37 +#endif
38 +#endif
39 +
40 #include "lame.h"
41 #include "machine.h"
42 #include "encoder.h"
43 @@ -67,6 +73,17 @@
44 int k4;
45 FLOAT *fi, *gi;
46 FLOAT const *fn;
47 +#ifdef ALTIVEC
48 + float csvec[16] __attribute__ ((aligned (16))) = {0}; /* staging area for four c1, c2, s1 and s2 values; zeroed because slots 0, 4, 8 and 12 are loaded before they are first written */
49 + vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16;
50 + vector float vfi0,vfi1,vfi2,vfi3,vgi0,vgi1,vgi2,vgi3,vf0,vf1,vf2,vf3,vg0,vg1,vg2,vg3;
51 + vector float vprev1,vprev2,vprev3,vprev4,vc1,vc2,vs1,vs2,vzero;
52 + vector unsigned char vperm1,vperm2;
53 +
54 + vperm1 = (vector unsigned char)VINIT16(16,17,18,19,12,13,14,15,8,9,10,11,4,5,6,7); /* float 0 of the second operand, then floats 3, 2, 1 of the first */
55 + vperm2 = (vector unsigned char)VINIT16(16,17,18,19,4,5,6,7,8,9,10,11,12,13,14,15); /* float 0 of the second operand, then floats 1, 2, 3 of the first */
56 + vzero = (vector float)vec_splat_s32(0); /* explicit zero; vec_xor(vzero,vzero) would read an uninitialised vector */
57 +#endif
58
59 n <<= 1; /* to get BLKSIZE, because of 3DNow! ASM routine */
60 fn = fz + n;
61 @@ -104,6 +121,238 @@
62 } while (fi < fn);
63 c1 = tri[0];
64 s1 = tri[1];
65 +#ifdef ALTIVEC
66 + if(kx < 4) {
67 + for (i = 1; i < kx; i++) {
68 + FLOAT c2, s2;
69 + c2 = 1 - (2 * s1) * s1;
70 + s2 = (2 * s1) * c1;
71 + fi = fz + i;
72 + gi = fz + k1 - i;
73 + do {
74 + FLOAT a, b, g0, f0, f1, g1, f2, g2, f3, g3;
75 + b = s2 * fi[k1] - c2 * gi[k1];
76 + a = c2 * fi[k1] + s2 * gi[k1];
77 + f1 = fi[0] - a;
78 + f0 = fi[0] + a;
79 + g1 = gi[0] - b;
80 + g0 = gi[0] + b;
81 + b = s2 * fi[k3] - c2 * gi[k3];
82 + a = c2 * fi[k3] + s2 * gi[k3];
83 + f3 = fi[k2] - a;
84 + f2 = fi[k2] + a;
85 + g3 = gi[k2] - b;
86 + g2 = gi[k2] + b;
87 + b = s1 * f2 - c1 * g3;
88 + a = c1 * f2 + s1 * g3;
89 + fi[k2] = f0 - a;
90 + fi[0] = f0 + a;
91 + gi[k3] = g1 - b;
92 + gi[k1] = g1 + b;
93 + b = c1 * g2 - s1 * f3;
94 + a = s1 * g2 + c1 * f3;
95 + gi[k2] = g0 - a;
96 + gi[0] = g0 + a;
97 + fi[k3] = f1 - b;
98 + fi[k1] = f1 + b;
99 + gi += k4;
100 + fi += k4;
101 + } while (fi < fn);
102 + c2 = c1;
103 + c1 = c2 * tri[0] - s1 * tri[1];
104 + s1 = c2 * tri[1] + s1 * tri[0];
105 + }
106 + }
107 + else {
108 + FLOAT c2, s2;
109 + for(i = 1; i < 4; i++) {
110 + c2 = 1 - (2*s1)*s1;
111 + s2 = (2*s1)*c1;
112 + csvec[i] = c1;
113 + csvec[i+4] = c2;
114 + csvec[i+8] = s1;
115 + csvec[i+12] = s2;
116 + c2 = c1;
117 + c1 = c2 * tri[0] - s1 * tri[1];
118 + s1 = c2 * tri[1] + s1 * tri[0];
119 + }
120 + vc1 = vec_ld(0,csvec);
121 + vc2 = vec_ld(16,csvec);
122 + vs1 = vec_ld(32,csvec);
123 + vs2 = vec_ld(48,csvec);
124 + fi = fz;
125 + gi = fz + k1;
126 + do {
127 + vfi0 = vec_ld(0,fi);
128 + vfi1 = vec_ld(0,fi+k1);
129 + vfi2 = vec_ld(0,fi+k2);
130 + vfi3 = vec_ld(0,fi+k3);
131 + vprev1 = vec_ld(0,gi-4);
132 + vprev2 = vec_ld(0,gi+k1-4);
133 + vprev3 = vec_ld(0,gi+k2-4);
134 + vprev4 = vec_ld(0,gi+k3-4);
135 + vgi0 = vec_perm(vprev1,vprev1,vperm1);
136 + vgi1 = vec_perm(vprev2,vprev2,vperm1);
137 + vgi2 = vec_perm(vprev3,vprev3,vperm1);
138 + vgi3 = vec_perm(vprev4,vprev4,vperm1);
139 +
140 + v1 = vec_madd(vfi1,vc2,vzero);
141 + v2 = vec_madd(vfi1,vs2,vzero);
142 + v3 = vec_madd(vfi3,vc2,vzero);
143 + v4 = vec_madd(vfi3,vs2,vzero);
144 + v5 = vec_madd(vgi1,vs2,v1);
145 + v6 = vec_nmsub(vgi1,vc2,v2);
146 + v7 = vec_madd(vgi3,vs2,v3);
147 + v8 = vec_nmsub(vgi3,vc2,v4);
148 +
149 + vf0 = vec_add(vfi0,v5);
150 + vf1 = vec_sub(vfi0,v5);
151 + vg0 = vec_add(vgi0,v6);
152 + vg1 = vec_sub(vgi0,v6);
153 + vf2 = vec_add(vfi2,v7);
154 + vf3 = vec_sub(vfi2,v7);
155 + vg2 = vec_add(vgi2,v8);
156 + vg3 = vec_sub(vgi2,v8);
157 +
158 + v1 = vec_madd(vf2,vc1,vzero);
159 + v2 = vec_madd(vf2,vs1,vzero);
160 + v3 = vec_madd(vg2,vs1,vzero);
161 + v4 = vec_madd(vg2,vc1,vzero);
162 + v5 = vec_madd(vg3,vs1,v1);
163 + v6 = vec_nmsub(vg3,vc1,v2);
164 + v7 = vec_madd(vf3,vc1,v3);
165 + v8 = vec_nmsub(vf3,vs1,v4);
166 +
167 + v9 = vec_add(vf0,v5);
168 + v10 = vec_sub(vf0,v5);
169 + v11 = vec_add(vg1,v6);
170 + v12 = vec_sub(vg1,v6);
171 + v13 = vec_add(vg0,v7);
172 + v14 = vec_sub(vg0,v7);
173 + v15 = vec_add(vf1,v8);
174 + v16 = vec_sub(vf1,v8);
175 +
176 + v1 = vec_perm(v9,vfi0,vperm2);
177 + v2 = vec_perm(v10,vfi2,vperm2);
178 + v3 = vec_perm(v15,vfi1,vperm2);
179 + v4 = vec_perm(v16,vfi3,vperm2);
180 + vec_st(v1,0,fi);
181 + vec_st(v2,0,fi+k2);
182 + vec_st(v3,0,fi+k1);
183 + vec_st(v4,0,fi+k3);
184 +
185 + v1 = vec_perm(v11,vprev2,vperm1);
186 + v2 = vec_perm(v12,vprev4,vperm1);
187 + v3 = vec_perm(v13,vprev1,vperm1);
188 + v4 = vec_perm(v14,vprev3,vperm1);
189 + vec_st(v1,0,gi+k1-4);
190 + vec_st(v2,0,gi+k3-4);
191 + vec_st(v3,0,gi-4);
192 + vec_st(v4,0,gi+k2-4);
193 +
194 + gi += k4;
195 + fi += k4;
196 + } while (fi<fn);
197 +
198 + /* rest loop */
199 +
200 + for (i = 4; i < kx; i+=4) {
201 + int j;
202 + for(j = 0; j < 4; j++) {
203 + c2 = 1 - (2*s1)*s1;
204 + s2 = (2*s1)*c1;
205 + csvec[j] = c1;
206 + csvec[j+4] = c2;
207 + csvec[j+8] = s1;
208 + csvec[j+12] = s2;
209 + c2 = c1;
210 + c1 = c2 * tri[0] - s1 * tri[1];
211 + s1 = c2 * tri[1] + s1 * tri[0];
212 + }
213 + vc1 = vec_ld(0,csvec);
214 + vc2 = vec_ld(16,csvec);
215 + vs1 = vec_ld(32,csvec);
216 + vs2 = vec_ld(48,csvec);
217 + fi = fz + i;
218 + gi = fz + k1 - i;
219 + do {
220 + vfi0 = vec_ld(0,fi);
221 + vfi1 = vec_ld(0,fi+k1);
222 + vfi2 = vec_ld(0,fi+k2);
223 + vfi3 = vec_ld(0,fi+k3);
224 + vprev1 = vec_ld(0,gi-4);
225 + v1 = vec_ld(0,gi);
226 + vprev2 = vec_ld(0,gi+k1-4);
227 + v2 = vec_ld(0,gi+k1);
228 + vprev3 = vec_ld(0,gi+k2-4);
229 + v3 = vec_ld(0,gi+k2);
230 + vprev4 = vec_ld(0,gi+k3-4);
231 + v4 = vec_ld(0,gi+k3);
232 + vgi0 = vec_perm(vprev1,v1,vperm1);
233 + vgi1 = vec_perm(vprev2,v2,vperm1);
234 + vgi2 = vec_perm(vprev3,v3,vperm1);
235 + vgi3 = vec_perm(vprev4,v4,vperm1);
236 +
237 + v1 = vec_madd(vfi1,vc2,vzero);
238 + v2 = vec_madd(vfi1,vs2,vzero);
239 + v3 = vec_madd(vfi3,vc2,vzero);
240 + v4 = vec_madd(vfi3,vs2,vzero);
241 + v5 = vec_madd(vgi1,vs2,v1);
242 + v6 = vec_nmsub(vgi1,vc2,v2);
243 + v7 = vec_madd(vgi3,vs2,v3);
244 + v8 = vec_nmsub(vgi3,vc2,v4);
245 +
246 + vf0 = vec_add(vfi0,v5);
247 + vf1 = vec_sub(vfi0,v5);
248 + vg0 = vec_add(vgi0,v6);
249 + vg1 = vec_sub(vgi0,v6);
250 + vf2 = vec_add(vfi2,v7);
251 + vf3 = vec_sub(vfi2,v7);
252 + vg2 = vec_add(vgi2,v8);
253 + vg3 = vec_sub(vgi2,v8);
254 +
255 + v1 = vec_madd(vf2,vc1,vzero);
256 + v2 = vec_madd(vf2,vs1,vzero);
257 + v3 = vec_madd(vg2,vs1,vzero);
258 + v4 = vec_madd(vg2,vc1,vzero);
259 + v5 = vec_madd(vg3,vs1,v1);
260 + v6 = vec_nmsub(vg3,vc1,v2);
261 + v7 = vec_madd(vf3,vc1,v3);
262 + v8 = vec_nmsub(vf3,vs1,v4);
263 +
264 + v9 = vec_add(vf0,v5);
265 + v10 = vec_sub(vf0,v5);
266 + v11 = vec_add(vg1,v6);
267 + v12 = vec_sub(vg1,v6);
268 + v13 = vec_add(vg0,v7);
269 + v14 = vec_sub(vg0,v7);
270 + v15 = vec_add(vf1,v8);
271 + v16 = vec_sub(vf1,v8);
272 +
273 + vec_st(v9,0,fi);
274 + vec_st(v10,0,fi+k2);
275 + vec_st(v15,0,fi+k1);
276 + vec_st(v16,0,fi+k3);
277 +
278 + v1 = vec_perm(v11,vprev2,vperm1);
279 + v2 = vec_perm(v12,vprev4,vperm1);
280 + v3 = vec_perm(v13,vprev1,vperm1);
281 + v4 = vec_perm(v14,vprev3,vperm1);
282 + vec_st(v1,0,gi+k1-4);
283 + vec_ste(v11,0,gi+k1);
284 + vec_st(v2,0,gi+k3-4);
285 + vec_ste(v12,0,gi+k3);
286 + vec_st(v3,0,gi-4);
287 + vec_ste(v13,0,gi);
288 + vec_st(v4,0,gi+k2-4);
289 + vec_ste(v14,0,gi+k2);
290 +
291 + gi += k4;
292 + fi += k4;
293 + } while (fi<fn);
294 + }
295 + }
296 +#else
297 for (i = 1; i < kx; i++) {
298 FLOAT c2, s2;
299 c2 = 1 - (2 * s1) * s1;
300 @@ -143,6 +392,7 @@
301 c1 = c2 * tri[0] - s1 * tri[1];
302 s1 = c2 * tri[1] + s1 * tri[0];
303 }
304 +#endif
305 tri += 2;
306 } while (k4 < n);
307 }
308 --- libmp3lame/gain_analysis.c.orig 2008-08-05 23:16:06.000000000 +0900
309 +++ libmp3lame/gain_analysis.c 2010-03-01 14:05:00.000000000 +0900
310 @@ -92,6 +92,12 @@
311 #include <config.h>
312 #endif
313
314 +#ifdef ALTIVEC
315 +#ifndef __APPLE_CC__
316 +#include <altivec.h>
317 +#endif
318 +#endif
319 +
320 #include <stdio.h>
321 #include <stdlib.h>
322 #include <string.h>
323 @@ -108,64 +114,64 @@
324 #endif
325
326 /*lint -save -e736 loss of precision */
327 -static const Float_t ABYule[9][2 * YULE_ORDER + 1] = {
328 +static const Float_t ABYule[9][2 * YULE_ORDER + 1 + 3] __attribute__ ((aligned (16))) = {
329 {0.03857599435200, -3.84664617118067, -0.02160367184185, 7.81501653005538, -0.00123395316851,
330 -11.34170355132042, -0.00009291677959, 13.05504219327545, -0.01655260341619,
331 -12.28759895145294, 0.02161526843274, 9.48293806319790, -0.02074045215285, -5.87257861775999,
332 0.00594298065125, 2.75465861874613, 0.00306428023191, -0.86984376593551, 0.00012025322027,
333 - 0.13919314567432, 0.00288463683916},
334 + 0.13919314567432, 0.00288463683916, 0.0, 0.0, 0.0},
335 {0.05418656406430, -3.47845948550071, -0.02911007808948, 6.36317777566148, -0.00848709379851,
336 -8.54751527471874, -0.00851165645469, 9.47693607801280, -0.00834990904936, -8.81498681370155,
337 0.02245293253339, 6.85401540936998, -0.02596338512915, -4.39470996079559, 0.01624864962975,
338 2.19611684890774, -0.00240879051584, -0.75104302451432, 0.00674613682247, 0.13149317958808,
339 - -0.00187763777362},
340 + -0.00187763777362, 0.0, 0.0, 0.0},
341 {0.15457299681924, -2.37898834973084, -0.09331049056315, 2.84868151156327, -0.06247880153653,
342 -2.64577170229825, 0.02163541888798, 2.23697657451713, -0.05588393329856, -1.67148153367602,
343 0.04781476674921, 1.00595954808547, 0.00222312597743, -0.45953458054983, 0.03174092540049,
344 0.16378164858596, -0.01390589421898, -0.05032077717131, 0.00651420667831, 0.02347897407020,
345 - -0.00881362733839},
346 + -0.00881362733839, 0.0, 0.0, 0.0},
347 {0.30296907319327, -1.61273165137247, -0.22613988682123, 1.07977492259970, -0.08587323730772,
348 -0.25656257754070, 0.03282930172664, -0.16276719120440, -0.00915702933434, -0.22638893773906,
349 -0.02364141202522, 0.39120800788284, -0.00584456039913, -0.22138138954925, 0.06276101321749,
350 0.04500235387352, -0.00000828086748, 0.02005851806501, 0.00205861885564, 0.00302439095741,
351 - -0.02950134983287},
352 + -0.02950134983287, 0.0, 0.0, 0.0},
353 {0.33642304856132, -1.49858979367799, -0.25572241425570, 0.87350271418188, -0.11828570177555,
354 0.12205022308084, 0.11921148675203, -0.80774944671438, -0.07834489609479, 0.47854794562326,
355 -0.00469977914380, -0.12453458140019, -0.00589500224440, -0.04067510197014, 0.05724228140351,
356 0.08333755284107, 0.00832043980773, -0.04237348025746, -0.01635381384540, 0.02977207319925,
357 - -0.01760176568150},
358 + -0.01760176568150, 0.0, 0.0, 0.0},
359 {0.44915256608450, -0.62820619233671, -0.14351757464547, 0.29661783706366, -0.22784394429749,
360 -0.37256372942400, -0.01419140100551, 0.00213767857124, 0.04078262797139, -0.42029820170918,
361 -0.12398163381748, 0.22199650564824, 0.04097565135648, 0.00613424350682, 0.10478503600251,
362 0.06747620744683, -0.01863887810927, 0.05784820375801, -0.03193428438915, 0.03222754072173,
363 - 0.00541907748707},
364 + 0.00541907748707, 0.0, 0.0, 0.0},
365 {0.56619470757641, -1.04800335126349, -0.75464456939302, 0.29156311971249, 0.16242137742230,
366 -0.26806001042947, 0.16744243493672, 0.00819999645858, -0.18901604199609, 0.45054734505008,
367 0.30931782841830, -0.33032403314006, -0.27562961986224, 0.06739368333110, 0.00647310677246,
368 -0.04784254229033, 0.08647503780351, 0.01639907836189, -0.03788984554840, 0.01807364323573,
369 - -0.00588215443421},
370 + -0.00588215443421, 0.0, 0.0, 0.0},
371 {0.58100494960553, -0.51035327095184, -0.53174909058578, -0.31863563325245, -0.14289799034253,
372 -0.20256413484477, 0.17520704835522, 0.14728154134330, 0.02377945217615, 0.38952639978999,
373 0.15558449135573, -0.23313271880868, -0.25344790059353, -0.05246019024463, 0.01628462406333,
374 -0.02505961724053, 0.06920467763959, 0.02442357316099, -0.03721611395801, 0.01818801111503,
375 - -0.00749618797172},
376 + -0.00749618797172, 0.0, 0.0, 0.0},
377 {0.53648789255105, -0.25049871956020, -0.42163034350696, -0.43193942311114, -0.00275953611929,
378 -0.03424681017675, 0.04267842219415, -0.04678328784242, -0.10214864179676, 0.26408300200955,
379 0.14590772289388, 0.15113130533216, -0.02459864859345, -0.17556493366449, -0.11202315195388,
380 -0.18823009262115, -0.04060034127000, 0.05477720428674, 0.04788665548180, 0.04704409688120,
381 - -0.02217936801134}
382 + -0.02217936801134, 0.0, 0.0, 0.0}
383 };
384
385 -static const Float_t ABButter[9][2 * BUTTER_ORDER + 1] = {
386 - {0.98621192462708, -1.97223372919527, -1.97242384925416, 0.97261396931306, 0.98621192462708},
387 - {0.98500175787242, -1.96977855582618, -1.97000351574484, 0.97022847566350, 0.98500175787242},
388 - {0.97938932735214, -1.95835380975398, -1.95877865470428, 0.95920349965459, 0.97938932735214},
389 - {0.97531843204928, -1.95002759149878, -1.95063686409857, 0.95124613669835, 0.97531843204928},
390 - {0.97316523498161, -1.94561023566527, -1.94633046996323, 0.94705070426118, 0.97316523498161},
391 - {0.96454515552826, -1.92783286977036, -1.92909031105652, 0.93034775234268, 0.96454515552826},
392 - {0.96009142950541, -1.91858953033784, -1.92018285901082, 0.92177618768381, 0.96009142950541},
393 - {0.95856916599601, -1.91542108074780, -1.91713833199203, 0.91885558323625, 0.95856916599601},
394 - {0.94597685600279, -1.88903307939452, -1.89195371200558, 0.89487434461664, 0.94597685600279}
395 +static const Float_t ABButter[9][2 * BUTTER_ORDER + 1 + 3] __attribute__ ((aligned (16))) = {
396 + {0.98621192462708, -1.97223372919527, -1.97242384925416, 0.97261396931306, 0.98621192462708, 0.0, 0.0, 0.0},
397 + {0.98500175787242, -1.96977855582618, -1.97000351574484, 0.97022847566350, 0.98500175787242, 0.0, 0.0, 0.0},
398 + {0.97938932735214, -1.95835380975398, -1.95877865470428, 0.95920349965459, 0.97938932735214, 0.0, 0.0, 0.0},
399 + {0.97531843204928, -1.95002759149878, -1.95063686409857, 0.95124613669835, 0.97531843204928, 0.0, 0.0, 0.0},
400 + {0.97316523498161, -1.94561023566527, -1.94633046996323, 0.94705070426118, 0.97316523498161, 0.0, 0.0, 0.0},
401 + {0.96454515552826, -1.92783286977036, -1.92909031105652, 0.93034775234268, 0.96454515552826, 0.0, 0.0, 0.0},
402 + {0.96009142950541, -1.91858953033784, -1.92018285901082, 0.92177618768381, 0.96009142950541, 0.0, 0.0, 0.0},
403 + {0.95856916599601, -1.91542108074780, -1.91713833199203, 0.91885558323625, 0.95856916599601, 0.0, 0.0, 0.0},
404 + {0.94597685600279, -1.88903307939452, -1.89195371200558, 0.89487434461664, 0.94597685600279, 0.0, 0.0, 0.0}
405 };
406
407 /*lint -restore */
408 @@ -176,6 +182,189 @@
409
410 /* When calling this procedure, make sure that ip[-order] and op[-order] point to real data! */
411
412 +#ifdef ALTIVEC
413 +/* AltiVec path: applies kernel (the Yule row) to input -> output and kernel2 (the Butterworth row) to output -> output2 in a single pass. */
414 +static void
415 +filterIntegrated (const Float_t* input, Float_t* output, Float_t* output2, size_t nSamples, const Float_t* kernel, const Float_t* kernel2)
416 +{ /* reads history before the pointers: input[-11..-1], output[-12..-1], output2[-4..-1]; kernel rows are fetched with vec_ld and so must be 16-byte aligned */
417 + vector float v1,v2,v3,v4,v5,v6,vbase;
418 + vector float vmask1,vmask2,vout1,vout2,vout3,vout4,vzero,vkernel1,vkernel2,vkernel3,vkernel4,vkernel5,vkernel6,vkernel7,vkernel8;
419 + vector float vo1, vo2, vo3, vo4, vi2, vi3;
420 + vector unsigned char vc1,vc2,vc3,vc4,vc5,vperm1,vperm2,vperm4,vperm5,vperm6;
421 + if (nSamples == 0) return; /* the code below always emits one sample and then does --nSamples, which would wrap around for 0 */
422 + vbase = (vector float)VINIT4ALL(1e-10f);
423 + vperm1 = (vector unsigned char)VINIT16(24,25,26,27,16,17,18,19,8,9,10,11,0,1,2,3); /* even-indexed floats of a register pair, in reverse order */
424 + vperm2 = (vector unsigned char)VINIT16(28,29,30,31,20,21,22,23,12,13,14,15,4,5,6,7); /* odd-indexed floats of a register pair, in reverse order */
425 + vc1 = vec_splat_u8(1);
426 + vc2 = vec_splat_u8(5);
427 + vc3 = vec_sl(vc1,vc2); /* 32 in every byte: a 4-byte shift count for vec_slo/vec_sro */
428 + vc4 = vec_sl(vc3,vc1); /* 64: an 8-byte shift count */
429 + vc5 = vec_or(vc3,vc4); /* 96: a 12-byte shift count */
430 + v1 = (vector float)vec_splat_s32(-1);
431 + vmask1 = vec_sro(v1,vc3); /* all ones except the first float */
432 + vmask2 = vec_sro(v1,vc4); /* all ones except the first two floats */
433 + vzero = (vector float)vec_splat_s32(0); /* explicit zero; vec_xor(vzero,vzero) would read an uninitialised vector */
434 + /* Load six vectors of kernel and split them into reversed even-index / odd-index coefficient vectors. */
435 + v1 = vec_ld(0,kernel);
436 + v2 = vec_ld(16,kernel);
437 + v3 = vec_ld(32,kernel);
438 + v4 = vec_ld(48,kernel);
439 + v5 = vec_ld(64,kernel);
440 + v6 = vec_ld(80,kernel);
441 + vkernel1 = vec_perm(v1,v2,vperm1); /* kernel[6], [4], [2], [0] */
442 + vkernel2 = vec_perm(v1,v2,vperm2); /* kernel[7], [5], [3], [1] */
443 + vkernel3 = vec_perm(v3,v4,vperm1); /* kernel[14], [12], [10], [8] */
444 + vkernel4 = vec_perm(v3,v4,vperm2); /* kernel[15], [13], [11], [9] */
445 + vkernel5 = vec_perm(v5,v6,vperm1);
446 + vkernel6 = vec_perm(v5,v6,vperm2);
447 + vkernel5 = vec_and(vkernel5,vmask1); /* 0, kernel[20], [18], [16] */
448 + vkernel6 = vec_and(vkernel6,vmask2); /* 0, 0, kernel[19], [17] */
449 + /* Same split for the two vectors of kernel2. */
450 + v1 = vec_ld(0,kernel2);
451 + v2 = vec_ld(16,kernel2);
452 + vkernel7 = vec_perm(v1,v2,vperm1);
453 + vkernel8 = vec_perm(v1,v2,vperm2);
454 + vkernel7 = vec_and(vkernel7,vmask1); /* 0, kernel2[4], [2], [0] */
455 + vkernel8 = vec_and(vkernel8,vmask2); /* 0, 0, kernel2[3], [1] */
456 + /* Gather the history: vi2/vi3 hold earlier inputs, vo1..vo3 earlier outputs, vo4 earlier output2 values. */
457 + vperm4 = vec_lvsl(0,input-11);
458 + vperm5 = vec_lvsl(0,output-4);
459 + vperm6 = vec_lvsl(0,output2-4);
460 +
461 + v1 = vec_ld(15,input-7);
462 + v2 = vec_ld(0,input-7);
463 + v3 = vec_ld(0,input-11);
464 + vi2 = vec_perm(v2,v1,vperm4); /* input[-7..-4] */
465 + vi3 = vec_perm(v3,v2,vperm4); /* input[-11..-8] */
466 + vi3 = vec_and(vi3,vmask1); /* clear the input[-11] lane */
467 +
468 + v1 = vec_ld(15,output-4);
469 + v2 = vec_ld(0,output-4);
470 + v3 = vec_ld(0,output-8);
471 + v4 = vec_ld(0,output-12);
472 + vo1 = vec_perm(v2,v1,vperm5); /* output[-4..-1] */
473 + vo2 = vec_perm(v3,v2,vperm5); /* output[-8..-5] */
474 + vo3 = vec_perm(v4,v3,vperm5); /* output[-12..-9] */
475 + vo3 = vec_and(vo3,vmask2); /* clear the output[-12] and output[-11] lanes */
476 +
477 + v1 = vec_ld(15,output2-4);
478 + v2 = vec_ld(0,output2-4);
479 + vo4 = vec_perm(v2,v1,vperm6); /* output2[-4..-1] */
480 +
481 + vperm4 = vec_lvsl(0,input-3);
482 + vperm5 = vec_lvsr(0,output);
483 +
484 + /* 1st loop: peeled first sample; only output[0] is produced here, output2 trails by one sample */
485 + v1 = vec_ld(15,input-3);
486 + v3 = vec_ld(0,input-3);
487 + v5 = vec_perm(v3,v1,vperm4); /* input[-3..0] */
488 + /* vout1 sums input * even-index coefficients, vout2 sums output * odd-index coefficients. NOTE(review): vbase is added to both, so it cancels in the vec_sub below - confirm intent. */
489 + vout1 = vec_madd(v5,vkernel1,vbase);
490 + vout2 = vec_madd(vo1,vkernel2,vbase);
491 +
492 + vout1 = vec_madd(vi2,vkernel3,vout1);
493 + vout2 = vec_madd(vo2,vkernel4,vout2);
494 +
495 + vout1 = vec_madd(vi3,vkernel5,vout1);
496 + vout2 = vec_madd(vo3,vkernel6,vout2);
497 +
498 + vi3 = vec_sld(vi3,vi2,4); /* slide the input history forward by one sample */
499 + vi2 = vec_sld(vi2,v5,4);
500 +
501 + vout1 = vec_sub(vout1,vout2);
502 +
503 + v1 = vec_slo(vout1,vc3); /* horizontal add: the first float of vout1 ends up holding the sum of all four */
504 + v2 = vec_slo(vout1,vc4);
505 + v3 = vec_slo(vout1,vc5);
506 + vout1 = vec_add(vout1,v1);
507 + vout2 = vec_add(v2,v3);
508 + vout1 = vec_add(vout1,vout2);
509 +
510 + vo3 = vec_sld(vo3,vo2,4); /* push the new sample into the output history */
511 + vo2 = vec_sld(vo2,vo1,4);
512 + vo1 = vec_sld(vo1,vout1,4);
513 +
514 + vout2 = vec_perm(vout1,vout1,vperm5); /* rotate the result into the lane that vec_ste writes for this address */
515 + vec_ste(vout2,0,output);
516 +
517 + ++output;
518 + ++input;
519 + --nSamples;
520 + /* Steady state: each pass produces the next output sample and the output2 sample for the previous one. */
521 + while(nSamples--) {
522 + vperm4 = vec_lvsl(0,input-3);
523 + vperm5 = vec_lvsr(0,output);
524 + vperm6 = vec_lvsr(0,output2);
525 +
526 + v1 = vec_ld(15,input-3);
527 + v3 = vec_ld(0,input-3);
528 + v5 = vec_perm(v3,v1,vperm4);
529 +
530 + vout1 = vec_madd(v5,vkernel1,vbase);
531 + vout2 = vec_madd(vo1,vkernel2,vbase);
532 +
533 + vout1 = vec_madd(vi2,vkernel3,vout1);
534 + vout2 = vec_madd(vo2,vkernel4,vout2);
535 +
536 + vout1 = vec_madd(vi3,vkernel5,vout1);
537 + vout2 = vec_madd(vo3,vkernel6,vout2);
538 + /* second stage: vo1 still ends at the previous output sample, vo4 holds the last output2 values */
539 + vout3 = vec_nmsub(vo4,vkernel8,vzero);
540 + vout4 = vec_madd(vo1,vkernel7,vout3);
541 +
542 + vi3 = vec_sld(vi3,vi2,4);
543 + vi2 = vec_sld(vi2,v5,4);
544 +
545 + vout1 = vec_sub(vout1,vout2);
546 +
547 + v1 = vec_slo(vout1,vc3);
548 + v2 = vec_slo(vout1,vc4);
549 + v3 = vec_slo(vout1,vc5);
550 + vout1 = vec_add(vout1,v1);
551 + vout2 = vec_add(v2,v3);
552 + vout1 = vec_add(vout1,vout2);
553 +
554 + vo3 = vec_sld(vo3,vo2,4);
555 + vo2 = vec_sld(vo2,vo1,4);
556 + vo1 = vec_sld(vo1,vout1,4);
557 +
558 + v4 = vec_slo(vout4,vc3); /* horizontal add of the second-stage products into the first float of vout3 */
559 + v5 = vec_slo(vout4,vc4);
560 + v6 = vec_slo(vout4,vc5);
561 + vout4 = vec_add(vout4,v4);
562 + vout3 = vec_add(v5,v6);
563 + vout3 = vec_add(vout3,vout4);
564 +
565 + vo4 = vec_sld(vo4,vout3,4);
566 +
567 + vout2 = vec_perm(vout1,vout1,vperm5);
568 + vout4 = vec_perm(vout3,vout3,vperm6);
569 + vec_ste(vout2,0,output);
570 + vec_ste(vout4,0,output2);
571 +
572 + ++output;
573 + ++output2;
574 + ++input;
575 + }
576 + /* Drain: the second stage is one sample behind, so emit its final value here. */
577 + vperm6 = vec_lvsr(0,output2);
578 +
579 + vout3 = vec_nmsub(vo4,vkernel8,vzero);
580 + vout4 = vec_madd(vo1,vkernel7,vout3);
581 +
582 + v1 = vec_slo(vout4,vc3);
583 + v2 = vec_slo(vout4,vc4);
584 + v3 = vec_slo(vout4,vc5);
585 + vout4 = vec_add(vout4,v1);
586 + vout3 = vec_add(v2,v3);
587 + vout3 = vec_add(vout3,vout4);
588 +
589 + vout4 = vec_perm(vout3,vout3,vperm6);
590 + vec_ste(vout4,0,output2);
591 +}
592 +
593 +#else
594 +
595 static void
596 filterYule(const Float_t * input, Float_t * output, size_t nSamples, const Float_t * const kernel)
597 {
598 @@ -226,6 +415,7 @@
599 }
600 }
601
602 +#endif
603
604 /* returns a INIT_GAIN_ANALYSIS_OK if successful, INIT_GAIN_ANALYSIS_ERROR if not */
605
606 @@ -364,6 +554,10 @@
607 curright = right_samples + cursamplepos;
608 }
609
610 +#ifdef ALTIVEC
611 + filterIntegrated(curleft, rgData->lstep + rgData->totsamp, rgData->lout + rgData->totsamp, cursamples, ABYule[rgData->freqindex], ABButter[rgData->freqindex]);
612 + filterIntegrated(curright, rgData->rstep + rgData->totsamp, rgData->rout + rgData->totsamp, cursamples, ABYule[rgData->freqindex], ABButter[rgData->freqindex]);
613 +#else
614 YULE_FILTER(curleft, rgData->lstep + rgData->totsamp, cursamples,
615 ABYule[rgData->freqindex]);
616 YULE_FILTER(curright, rgData->rstep + rgData->totsamp, cursamples,
617 @@ -373,6 +567,7 @@
618 ABButter[rgData->freqindex]);
619 BUTTER_FILTER(rgData->rstep + rgData->totsamp, rgData->rout + rgData->totsamp, cursamples,
620 ABButter[rgData->freqindex]);
621 +#endif
622
623 curleft = rgData->lout + rgData->totsamp; /* Get the squared values */
624 curright = rgData->rout + rgData->totsamp;
625 --- libmp3lame/newmdct.c.orig 2008-04-23 08:01:22.000000000 +0900
626 +++ libmp3lame/newmdct.c 2010-03-01 14:05:00.000000000 +0900
627 @@ -30,6 +30,12 @@
628 # include <config.h>
629 #endif
630
631 +#ifdef ALTIVEC
632 +#ifndef __APPLE_CC__
633 +#include <altivec.h>
634 +#endif
635 +#endif
636 +
637 #include "lame.h"
638 #include "machine.h"
639 #include "encoder.h"
640 @@ -39,7 +45,7 @@
641
642
643 #ifndef USE_GOGO_SUBBAND
644 -static const FLOAT enwindow[] = {
645 +static const FLOAT enwindow[] __attribute__ ((aligned (16))) = {
646 -4.77e-07 * 0.740951125354959 / 2.384e-06, 1.03951e-04 * 0.740951125354959 / 2.384e-06,
647 9.53674e-04 * 0.740951125354959 / 2.384e-06, 2.841473e-03 * 0.740951125354959 / 2.384e-06,
648 3.5758972e-02 * 0.740951125354959 / 2.384e-06, 3.401756e-03 * 0.740951125354959 / 2.384e-06, 9.83715e-04 * 0.740951125354959 / 2.384e-06, 9.9182e-05 * 0.740951125354959 / 2.384e-06, /* 15 */
649 @@ -230,7 +236,7 @@
650 #define NS 12
651 #define NL 36
652
653 -static const FLOAT win[4][NL] = {
654 +static const FLOAT win[4][NL] __attribute__ ((aligned (16))) = {
655 {
656 2.382191739347913e-13,
657 6.423305872147834e-13,
658 @@ -434,7 +440,444 @@
659 FLOAT const *wp = enwindow + 10;
660
661 const sample_t *x2 = &x1[238 - 14 - 286];
662 -
663 +
664 +#ifdef ALTIVEC
665 + vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16;
666 + vector float vw1,vw2,vw3,vw4,vw5,vw6,vw7,vw8,vs,vt,vzero;
667 + vector unsigned char vperm2,vperm3,vperm4,vperm5;
668 + vzero = (vector float)vec_splat_s32(0); /* explicit zero; vec_xor(vzero,vzero) would read an uninitialised vector */
669 + vperm5 = (vector unsigned char)VINIT16(12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3); /* reverses the order of the four floats */
670 + vperm2 = vec_lvsl(0,wp+8); /* realignment pattern for loads at the alignment of wp+8 */
671 + vperm3 = (vector unsigned char)VINIT16(0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31);
672 + vperm4 = vec_lvsl(0,x1+1);
673 + vperm4 = vec_perm(vperm4,vperm4,vperm5); /* realignment pattern for x1+1 with its four float groups reversed */
674 +
675 + for(i=0;i<3;i++) {
676 + v1 = vec_ld(0,wp-10);
677 + v2 = vec_ld(16,wp-10);
678 + v5 = vec_ld(0,wp+8);
679 + v6 = vec_ld(16,wp+8);
680 + v7 = vec_ld(32,wp+8);
681 + v3 = vec_ld(0,wp+26);
682 + v4 = vec_ld(16,wp+26);
683 + v8 = vec_ld(0,wp+44);
684 + v9 = vec_ld(16,wp+44);
685 + v10 = vec_ld(32,wp+44);
686 +
687 + v5 = vec_perm(v5,v6,vperm2);
688 + v6 = vec_perm(v6,v7,vperm2);
689 + v7 = vec_perm(v8,v9,vperm2);
690 + v8 = vec_perm(v9,v10,vperm2);
691 + v9 = vec_mergeh(v1,v3);
692 + v10 = vec_mergeh(v2,v4);
693 + v11 = vec_mergeh(v5,v7);
694 + v12 = vec_mergeh(v6,v8);
695 + v13 = vec_mergel(v1,v3);
696 + v14 = vec_mergel(v2,v4);
697 + v15 = vec_mergel(v5,v7);
698 + v16 = vec_mergel(v6,v8);
699 + vw1 = vec_mergeh(v9,v11);
700 + vw5 = vec_mergeh(v10,v12);
701 + vw2 = vec_mergel(v9,v11);
702 + vw6 = vec_mergel(v10,v12);
703 + vw3 = vec_mergeh(v13,v15);
704 + vw7 = vec_mergeh(v14,v16);
705 + vw4 = vec_mergel(v13,v15);
706 + vw8 = vec_mergel(v14,v16);
707 +
708 + v3 = vec_ld(0,x2-224);
709 + vs = vec_madd(vw1,v3,vzero);
710 + v4 = vec_ld(16,x1+221);
711 + v5 = vec_ld(0,x1+221);
712 + v6 = vec_perm(v5,v4,vperm4);
713 + vt = vec_madd(vw1,v6,vzero);
714 +
715 + v3 = vec_ld(0,x2-160);
716 + vs = vec_madd(vw2,v3,vs);
717 + v4 = vec_ld(16,x1+157);
718 + v5 = vec_ld(0,x1+157);
719 + v6 = vec_perm(v5,v4,vperm4);
720 + vt = vec_madd(vw2,v6,vt);
721 +
722 + v3 = vec_ld(0,x2-96);
723 + vs = vec_madd(vw3,v3,vs);
724 + v4 = vec_ld(16,x1+93);
725 + v5 = vec_ld(0,x1+93);
726 + v6 = vec_perm(v5,v4,vperm4);
727 + vt = vec_madd(vw3,v6,vt);
728 +
729 + v3 = vec_ld(0,x2-32);
730 + vs = vec_madd(vw4,v3,vs);
731 + v4 = vec_ld(16,x1+29);
732 + v5 = vec_ld(0,x1+29);
733 + v6 = vec_perm(v5,v4,vperm4);
734 + vt = vec_madd(vw4,v6,vt);
735 +
736 +
737 + v3 = vec_ld(0,x2+32);
738 + vs = vec_madd(vw5,v3,vs);
739 + v4 = vec_ld(16,x1-35);
740 + v5 = vec_ld(0,x1-35);
741 + v6 = vec_perm(v5,v4,vperm4);
742 + vt = vec_madd(vw5,v6,vt);
743 +
744 + v3 = vec_ld(0,x2+96);
745 + vs = vec_madd(vw6,v3,vs);
746 + v4 = vec_ld(16,x1-99);
747 + v5 = vec_ld(0,x1-99);
748 + v6 = vec_perm(v5,v4,vperm4);
749 + vt = vec_madd(vw6,v6,vt);
750 +
751 + v3 = vec_ld(0,x2+160);
752 + vs = vec_madd(vw7,v3,vs);
753 + v4 = vec_ld(16,x1-163);
754 + v5 = vec_ld(0,x1-163);
755 + v6 = vec_perm(v5,v4,vperm4);
756 + vt = vec_madd(vw7,v6,vt);
757 +
758 + v3 = vec_ld(0,x2+224);
759 + vs = vec_madd(vw8,v3,vs);
760 + v4 = vec_ld(16,x1-227);
761 + v5 = vec_ld(0,x1-227);
762 + v6 = vec_perm(v5,v4,vperm4);
763 + vt = vec_madd(vw8,v6,vt);
764 +
765 +
766 + v1 = vec_ld(0,wp-2);
767 + v2 = vec_ld(16,wp-2);
768 + v5 = vec_ld(0,wp+16);
769 + v6 = vec_ld(16,wp+16);
770 + v7 = vec_ld(32,wp+16);
771 + v3 = vec_ld(0,wp+34);
772 + v4 = vec_ld(16,wp+34);
773 + v8 = vec_ld(0,wp+52);
774 + v9 = vec_ld(16,wp+52);
775 + v10 = vec_ld(32,wp+52);
776 +
777 + v5 = vec_perm(v5,v6,vperm2);
778 + v6 = vec_perm(v6,v7,vperm2);
779 + v7 = vec_perm(v8,v9,vperm2);
780 + v8 = vec_perm(v9,v10,vperm2);
781 + v9 = vec_mergeh(v1,v3);
782 + v10 = vec_mergeh(v2,v4);
783 + v11 = vec_mergeh(v5,v7);
784 + v12 = vec_mergeh(v6,v8);
785 + v13 = vec_mergel(v1,v3);
786 + v14 = vec_mergel(v2,v4);
787 + v15 = vec_mergel(v5,v7);
788 + v16 = vec_mergel(v6,v8);
789 + vw1 = vec_mergeh(v9,v11);
790 + vw5 = vec_mergeh(v10,v12);
791 + vw2 = vec_mergel(v9,v11);
792 + vw6 = vec_mergel(v10,v12);
793 + vw3 = vec_mergeh(v13,v15);
794 + vw7 = vec_mergeh(v14,v16);
795 + vw4 = vec_mergel(v13,v15);
796 + vw8 = vec_mergel(v14,v16);
797 +
798 + v3 = vec_ld(0,x2+256);
799 + vt = vec_nmsub(vw1,v3,vt);
800 + v4 = vec_ld(16,x1-259);
801 + v5 = vec_ld(0,x1-259);
802 + v6 = vec_perm(v5,v4,vperm4);
803 + vs = vec_madd(vw1,v6,vs);
804 +
805 + v3 = vec_ld(0,x2+192);
806 + vt = vec_nmsub(vw2,v3,vt);
807 + v4 = vec_ld(16,x1-195);
808 + v5 = vec_ld(0,x1-195);
809 + v6 = vec_perm(v5,v4,vperm4);
810 + vs = vec_madd(vw2,v6,vs);
811 +
812 + v3 = vec_ld(0,x2+128);
813 + vt = vec_nmsub(vw3,v3,vt);
814 + v4 = vec_ld(16,x1-131);
815 + v5 = vec_ld(0,x1-131);
816 + v6 = vec_perm(v5,v4,vperm4);
817 + vs = vec_madd(vw3,v6,vs);
818 +
819 + v3 = vec_ld(0,x2+64);
820 + vt = vec_nmsub(vw4,v3,vt);
821 + v4 = vec_ld(16,x1-67);
822 + v5 = vec_ld(0,x1-67);
823 + v6 = vec_perm(v5,v4,vperm4);
824 + vs = vec_madd(vw4,v6,vs);
825 +
826 +
827 + v3 = vec_ld(0,x2);
828 + vt = vec_nmsub(vw5,v3,vt);
829 + v4 = vec_ld(16,x1-3);
830 + v5 = vec_ld(0,x1-3);
831 + v6 = vec_perm(v5,v4,vperm4);
832 + vs = vec_madd(vw5,v6,vs);
833 +
834 + v3 = vec_ld(0,x2-64);
835 + vt = vec_nmsub(vw6,v3,vt);
836 + v4 = vec_ld(16,x1+61);
837 + v5 = vec_ld(0,x1+61);
838 + v6 = vec_perm(v5,v4,vperm4);
839 + vs = vec_madd(vw6,v6,vs);
840 +
841 + v3 = vec_ld(0,x2-128);
842 + vt = vec_nmsub(vw7,v3,vt);
843 + v4 = vec_ld(16,x1+125);
844 + v5 = vec_ld(0,x1+125);
845 + v6 = vec_perm(v5,v4,vperm4);
846 + vs = vec_madd(vw7,v6,vs);
847 +
848 + v3 = vec_ld(0,x2-192);
849 + vt = vec_nmsub(vw8,v3,vt);
850 + v4 = vec_ld(16,x1+189);
851 + v5 = vec_ld(0,x1+189);
852 + v6 = vec_perm(v5,v4,vperm4);
853 + vs = vec_madd(vw8,v6,vs);
854 +
855 + /*end*/
856 +
857 + v3 = vec_ld(0,wp+6);
858 +
859 + v4 = vec_ld(0,wp+24);
860 + v5 = vec_ld(16,wp+24);
861 + v6 = vec_perm(v4,v5,vperm2);
862 +
863 + v9 = vec_ld(0,wp+42);
864 +
865 + v10 = vec_ld(0,wp+60);
866 + v11 = vec_ld(16,wp+60);
867 + v12 = vec_perm(v10,v11,vperm2);
868 +
869 + v13 = vec_mergeh(v3,v9);
870 + v14 = vec_mergeh(v6,v12);;
871 + vw1 = vec_mergeh(v13,v14);
872 + vw2 = vec_mergel(v13,v14);
873 +
874 + vs = vec_madd(vs,vw1,vzero);
875 + v1 = vec_sub(vt,vs);
876 + v2 = vec_add(vt,vs);
877 + v3 = vec_madd(vw2,v1,vzero);
878 + v4 = vec_mergeh(v2,v3);
879 + v5 = vec_mergel(v2,v3);
880 + vec_st(v4,0,a+i*8);
881 + vec_st(v5,16,a+i*8);
882 +
883 + wp += 72;
884 + x1-=4;
885 + x2+=4;
886 + }
887 +
888 + v1 = vec_ld(0,wp-10);
889 + v2 = vec_ld(16,wp-10);
890 + v5 = vec_ld(0,wp+8);
891 + v6 = vec_ld(16,wp+8);
892 + v7 = vec_ld(32,wp+8);
893 + v3 = vec_ld(0,wp+26);
894 + v4 = vec_ld(16,wp+26);
895 + v8 = vec_ld(0,wp+44);
896 + v9 = vec_ld(16,wp+44);
897 + v10 = vec_ld(32,wp+44);
898 +
899 + v5 = vec_perm(v5,v6,vperm2);
900 + v6 = vec_perm(v6,v7,vperm2);
901 + v7 = vec_perm(v8,v9,vperm2);
902 + v8 = vec_perm(v9,v10,vperm2);
903 + v9 = vec_mergeh(v1,v3);
904 + v10 = vec_mergeh(v2,v4);
905 + v11 = vec_mergeh(v5,v7);
906 + v12 = vec_mergeh(v6,v8);
907 + v13 = vec_mergel(v1,v3);
908 + v14 = vec_mergel(v2,v4);
909 + v15 = vec_mergel(v5,v7);
910 + v16 = vec_mergel(v6,v8);
911 + vw1 = vec_mergeh(v9,v11);
912 + vw5 = vec_mergeh(v10,v12);
913 + vw2 = vec_mergel(v9,v11);
914 + vw6 = vec_mergel(v10,v12);
915 + vw3 = vec_mergeh(v13,v15);
916 + vw7 = vec_mergeh(v14,v16);
917 + vw4 = vec_mergel(v13,v15);
918 + vw8 = vec_mergel(v14,v16);
919 +
920 + v3 = vec_ld(0,x2-224);
921 + vs = vec_madd(vw1,v3,vzero);
922 + v4 = vec_ld(16,x1+221);
923 + v5 = vec_ld(0,x1+221);
924 + v6 = vec_perm(v5,v4,vperm4);
925 + vt = vec_madd(vw1,v6,vzero);
926 +
927 + v3 = vec_ld(0,x2-160);
928 + vs = vec_madd(vw2,v3,vs);
929 + v4 = vec_ld(16,x1+157);
930 + v5 = vec_ld(0,x1+157);
931 + v6 = vec_perm(v5,v4,vperm4);
932 + vt = vec_madd(vw2,v6,vt);
933 +
934 + v3 = vec_ld(0,x2-96);
935 + vs = vec_madd(vw3,v3,vs);
936 + v4 = vec_ld(16,x1+93);
937 + v5 = vec_ld(0,x1+93);
938 + v6 = vec_perm(v5,v4,vperm4);
939 + vt = vec_madd(vw3,v6,vt);
940 +
941 + v3 = vec_ld(0,x2-32);
942 + vs = vec_madd(vw4,v3,vs);
943 + v4 = vec_ld(16,x1+29);
944 + v5 = vec_ld(0,x1+29);
945 + v6 = vec_perm(v5,v4,vperm4);
946 + vt = vec_madd(vw4,v6,vt);
947 +
948 +
949 + v3 = vec_ld(0,x2+32);
950 + vs = vec_madd(vw5,v3,vs);
951 + v4 = vec_ld(16,x1-35);
952 + v5 = vec_ld(0,x1-35);
953 + v6 = vec_perm(v5,v4,vperm4);
954 + vt = vec_madd(vw5,v6,vt);
955 +
956 + v3 = vec_ld(0,x2+96);
957 + vs = vec_madd(vw6,v3,vs);
958 + v4 = vec_ld(16,x1-99);
959 + v5 = vec_ld(0,x1-99);
960 + v6 = vec_perm(v5,v4,vperm4);
961 + vt = vec_madd(vw6,v6,vt);
962 +
963 + v3 = vec_ld(0,x2+160);
964 + vs = vec_madd(vw7,v3,vs);
965 + v4 = vec_ld(16,x1-163);
966 + v5 = vec_ld(0,x1-163);
967 + v6 = vec_perm(v5,v4,vperm4);
968 + vt = vec_madd(vw7,v6,vt);
969 +
970 + v3 = vec_ld(0,x2+224);
971 + vs = vec_madd(vw8,v3,vs);
972 + v4 = vec_ld(16,x1-227);
973 + v5 = vec_ld(0,x1-227);
974 + v6 = vec_perm(v5,v4,vperm4);
975 + vt = vec_madd(vw8,v6,vt);
976 +
977 +
978 + v1 = vec_ld(0,wp-2);
979 + v2 = vec_ld(16,wp-2);
980 + v5 = vec_ld(0,wp+16);
981 + v6 = vec_ld(16,wp+16);
982 + v7 = vec_ld(32,wp+16);
983 + v3 = vec_ld(0,wp+34);
984 + v4 = vec_ld(16,wp+34);
985 + v8 = vec_ld(0,wp+52);
986 + v9 = vec_ld(16,wp+52);
987 + v10 = vec_ld(32,wp+52);
988 +
989 + v5 = vec_perm(v5,v6,vperm2);
990 + v6 = vec_perm(v6,v7,vperm2);
991 + v7 = vec_perm(v8,v9,vperm2);
992 + v8 = vec_perm(v9,v10,vperm2);
993 + v9 = vec_mergeh(v1,v3);
994 + v10 = vec_mergeh(v2,v4);
995 + v11 = vec_mergeh(v5,v7);
996 + v12 = vec_mergeh(v6,v8);
997 + v13 = vec_mergel(v1,v3);
998 + v14 = vec_mergel(v2,v4);
999 + v15 = vec_mergel(v5,v7);
1000 + v16 = vec_mergel(v6,v8);
1001 + vw1 = vec_mergeh(v9,v11);
1002 + vw5 = vec_mergeh(v10,v12);
1003 + vw2 = vec_mergel(v9,v11);
1004 + vw6 = vec_mergel(v10,v12);
1005 + vw3 = vec_mergeh(v13,v15);
1006 + vw7 = vec_mergeh(v14,v16);
1007 + vw4 = vec_mergel(v13,v15);
1008 + vw8 = vec_mergel(v14,v16);
1009 +
1010 + v3 = vec_ld(0,x2+256);
1011 + vt = vec_nmsub(vw1,v3,vt);
1012 + v4 = vec_ld(16,x1-259);
1013 + v5 = vec_ld(0,x1-259);
1014 + v6 = vec_perm(v5,v4,vperm4);
1015 + vs = vec_madd(vw1,v6,vs);
1016 +
1017 + v3 = vec_ld(0,x2+192);
1018 + vt = vec_nmsub(vw2,v3,vt);
1019 + v4 = vec_ld(16,x1-195);
1020 + v5 = vec_ld(0,x1-195);
1021 + v6 = vec_perm(v5,v4,vperm4);
1022 + vs = vec_madd(vw2,v6,vs);
1023 +
1024 + v3 = vec_ld(0,x2+128);
1025 + vt = vec_nmsub(vw3,v3,vt);
1026 + v4 = vec_ld(16,x1-131);
1027 + v5 = vec_ld(0,x1-131);
1028 + v6 = vec_perm(v5,v4,vperm4);
1029 + vs = vec_madd(vw3,v6,vs);
1030 +
1031 + v3 = vec_ld(0,x2+64);
1032 + vt = vec_nmsub(vw4,v3,vt);
1033 + v4 = vec_ld(16,x1-67);
1034 + v5 = vec_ld(0,x1-67);
1035 + v6 = vec_perm(v5,v4,vperm4);
1036 + vs = vec_madd(vw4,v6,vs);
1037 +
1038 +
1039 + v3 = vec_ld(0,x2);
1040 + vt = vec_nmsub(vw5,v3,vt);
1041 + v4 = vec_ld(16,x1-3);
1042 + v5 = vec_ld(0,x1-3);
1043 + v6 = vec_perm(v5,v4,vperm4);
1044 + vs = vec_madd(vw5,v6,vs);
1045 +
1046 + v3 = vec_ld(0,x2-64);
1047 + vt = vec_nmsub(vw6,v3,vt);
1048 + v4 = vec_ld(16,x1+61);
1049 + v5 = vec_ld(0,x1+61);
1050 + v6 = vec_perm(v5,v4,vperm4);
1051 + vs = vec_madd(vw6,v6,vs);
1052 +
1053 + v3 = vec_ld(0,x2-128);
1054 + vt = vec_nmsub(vw7,v3,vt);
1055 + v4 = vec_ld(16,x1+125);
1056 + v5 = vec_ld(0,x1+125);
1057 + v6 = vec_perm(v5,v4,vperm4);
1058 + vs = vec_madd(vw7,v6,vs);
1059 +
1060 + v3 = vec_ld(0,x2-192);
1061 + vt = vec_nmsub(vw8,v3,vt);
1062 + v4 = vec_ld(16,x1+189);
1063 + v5 = vec_ld(0,x1+189);
1064 + v6 = vec_perm(v5,v4,vperm4);
1065 + vs = vec_madd(vw8,v6,vs);
1066 +
1067 + /*end*/
1068 +
1069 + v3 = vec_ld(0,wp+6);
1070 +
1071 + v4 = vec_ld(0,wp+24);
1072 + v5 = vec_ld(16,wp+24);
1073 + v6 = vec_perm(v4,v5,vperm2);
1074 +
1075 + v9 = vec_ld(0,wp+42);
1076 +
1077 + v10 = vec_ld(0,wp+60);
1078 + v11 = vec_ld(16,wp+60);
1079 + v12 = vec_perm(v10,v11,vperm2);
1080 +
1081 + v13 = vec_mergeh(v3,v9);
1082 + v14 = vec_mergeh(v6,v12); /* interleave lanes 0-1 of v6 and v12 (stray second ';' removed) */
1083 + vw1 = vec_mergeh(v13,v14);
1084 + vw2 = vec_mergel(v13,v14);
1085 +
1086 + vs = vec_madd(vs,vw1,vzero);
1087 + v1 = vec_sub(vt,vs);
1088 + v2 = vec_add(vt,vs);
1089 + v3 = vec_madd(vw2,v1,vzero);
1090 + v4 = vec_ld(16,a+24);
1091 + v5 = vec_mergeh(v2,v3);
1092 + v6 = vec_mergel(v2,v3);
1093 + v7 = vec_perm(v6,v4,vperm3);
1094 + vec_st(v5,0,a+24);
1095 + vec_st(v7,16,a+24);
1096 +
1097 + wp += 54;
1098 + x1-=3;
1099 + x2+=3;
1100 +#else
1101 for (i = -15; i < 0; i++) {
1102 FLOAT w, s, t;
1103
1104 @@ -501,6 +944,7 @@
1105 x1--;
1106 x2++;
1107 }
1108 +#endif
1109 {
1110 FLOAT s, t, u, v;
1111 t = x1[-16] * wp[-10];
1112 --- libmp3lame/lame.c.orig 2010-02-21 06:08:55.000000000 +0900
1113 +++ libmp3lame/lame.c 2010-03-01 14:05:00.000000000 +0900
1114 @@ -30,6 +30,12 @@
1115 # include <config.h>
1116 #endif
1117
1118 +#ifdef ALTIVEC
1119 +#ifndef __APPLE_CC__
1120 +#include <altivec.h>
1121 +#endif
1122 +#endif
1123 +
1124
1125 #include "lame.h"
1126 #include "machine.h"
1127 @@ -546,7 +552,12 @@
1128 gfc->CPU_features.SSE = 0;
1129 gfc->CPU_features.SSE2 = 0;
1130 }
1131 -
1132 +#ifdef ALTIVEC
1133 + /* Turn off Java mode explicitly: read the VSCR, OR a 1 into halfword 6 and write it back. NOTE(review): halfword 6 == 0x0001 is presumably the NJ (non-Java) bit, 0x00010000 of the 32-bit VSCR word -- confirm against the AltiVec PEM. */
1134 + vector unsigned short vscr = vec_mfvscr(); /* current VSCR contents */
1135 + vscr = vec_or(vscr,(vector unsigned short)VINIT8(0,0,0,0,0,0,1,0)); /* set the bit, leave every other bit as read */
1136 + vec_mtvscr(vscr); /* install the updated VSCR */
1137 +#endif
1138
1139 if (NULL == gfc->ATH)
1140 gfc->ATH = calloc(1, sizeof(ATH_t));
1141 @@ -1735,6 +1746,11 @@
1142 const int buffer_r[],
1143 const int nsamples, unsigned char *mp3buf, const int mp3buf_size)
1144 {
1145 +#ifdef ALTIVEC /* temporaries for the vectorised int-to-sample_t conversion further down */
1146 + vector signed int v1,v2,v3,v4,v5,v6; /* raw integer samples: v1/v3/v5 from buffer_l, v2/v4/v6 from buffer_r */
1147 + vector float v7,v8; /* converted samples for in_buffer[0] and in_buffer[1] */
1148 + vector unsigned char vperm1,vperm2; /* vec_lvsl alignment permutes for buffer_l and buffer_r */
1149 +#endif
1150 lame_internal_flags *const gfc = gfp->internal_flags;
1151 int i;
1152 sample_t *in_buffer[2];
1153 @@ -1751,7 +1767,63 @@
1154
1155 in_buffer[0] = gfc->in_buffer_0;
1156 in_buffer[1] = gfc->in_buffer_1;
1157 -
1158 +#ifdef ALTIVEC /* copy the int input into in_buffer[] as sample_t, four samples per vector; the input pointers may be unaligned */
1159 + if (gfc->channels_in>1) { /* two channels: buffer_l and buffer_r are converted in lock step */
1160 + v1 = vec_ld(0,buffer_l); /* aligned 16-byte block containing buffer_l[0] */
1161 + v2 = vec_ld(0,buffer_r);
1162 + vperm1 = vec_lvsl(0,buffer_l); /* permute control that realigns data starting at buffer_l's offset inside its block */
1163 + vperm2 = vec_lvsl(0,buffer_r);
1164 + for(i=0;i<nsamples-7;i+=4) { /* main loop: runs while at least 8 samples remain */
1165 + v3 = vec_ld(16,buffer_l+i); /* the following aligned block */
1166 + v4 = vec_ld(16,buffer_r+i);
1167 + v5 = vec_perm(v1,v3,vperm1); /* buffer_l[i..i+3] assembled from the two blocks */
1168 + v6 = vec_perm(v2,v4,vperm2);
1169 + v1 = v3; /* the block just loaded is the leading block of the next pass */
1170 + v2 = v4;
1171 + v7 = vec_ctf(v5,16); /* int to float, scaled by 2^-16 (same factor as the scalar code when int is 32 bits) */
1172 + v8 = vec_ctf(v6,16);
1173 + vec_st(v7,0,in_buffer[0]+i); /* NOTE(review): vec_st ignores the low 4 address bits, so in_buffer[0]/[1] are assumed 16-byte aligned -- confirm at the allocation site */
1174 + vec_st(v8,0,in_buffer[1]+i);
1175 + }
1176 + for(;i<nsamples-3;i+=4) { /* at most one further group of four */
1177 + v3 = vec_ld(15,buffer_l+i); /* offset 15 = last byte needed, presumably so no block beyond the final sample is touched */
1178 + v4 = vec_ld(15,buffer_r+i);
1179 + v5 = vec_perm(v1,v3,vperm1);
1180 + v6 = vec_perm(v2,v4,vperm2);
1181 + v1 = v3;
1182 + v2 = v4;
1183 + v7 = vec_ctf(v5,16);
1184 + v8 = vec_ctf(v6,16);
1185 + vec_st(v7,0,in_buffer[0]+i);
1186 + vec_st(v8,0,in_buffer[1]+i);
1187 + }
1188 + for(;i<nsamples;i++) { /* scalar tail: the last 0-3 samples */
1189 + in_buffer[0][i] = buffer_l[i] * (1.0 / ( 1L << (8 * sizeof(int) - 16)));
1190 + in_buffer[1][i] = buffer_r[i] * (1.0 / ( 1L << (8 * sizeof(int) - 16)));
1191 + }
1192 + }
1193 + else { /* single channel: same scheme, buffer_r is never read */
1194 + v1 = vec_ld(0,buffer_l);
1195 + vperm1 = vec_lvsl(0,buffer_l);
1196 + for(i=0;i<nsamples-7;i+=4) { /* main loop */
1197 + v3 = vec_ld(16,buffer_l+i);
1198 + v5 = vec_perm(v1,v3,vperm1);
1199 + v1 = v3;
1200 + v7 = vec_ctf(v5,16);
1201 + vec_st(v7,0,in_buffer[0]+i);
1202 + }
1203 + for(;i<nsamples-3;i+=4) { /* at most one further group of four, loaded with offset 15 as above */
1204 + v3 = vec_ld(15,buffer_l+i);
1205 + v5 = vec_perm(v1,v3,vperm1);
1206 + v1 = v3;
1207 + v7 = vec_ctf(v5,16);
1208 + vec_st(v7,0,in_buffer[0]+i);
1209 + }
1210 + for(;i<nsamples;i++) { /* scalar tail */
1211 + in_buffer[0][i] = buffer_l[i] * (1.0 / ( 1L << (8 * sizeof(int) - 16)));
1212 + }
1213 + }
1214 +#else
1215 /* make a copy of input buffer, changing type to sample_t */
1216 for (i = 0; i < nsamples; i++) {
1217 /* internal code expects +/- 32768.0 */
1218 @@ -1759,6 +1831,7 @@
1219 if (gfc->channels_in > 1)
1220 in_buffer[1][i] = buffer_r[i] * (1.0 / (1L << (8 * sizeof(int) - 16)));
1221 }
1222 +#endif
1223
1224 return lame_encode_buffer_sample_t(gfp, in_buffer[0], in_buffer[1],
1225 nsamples, mp3buf, mp3buf_size);
1226 --- libmp3lame/psymodel.c.orig 2009-01-19 00:44:27.000000000 +0900
1227 +++ libmp3lame/psymodel.c 2010-03-01 14:05:00.000000000 +0900
1228 @@ -144,6 +144,12 @@
1229 # include <config.h>
1230 #endif
1231
1232 +#ifdef ALTIVEC
1233 +#ifndef __APPLE_CC__
1234 +#include <altivec.h>
1235 +#endif
1236 +#endif
1237 +
1238 #include "lame.h"
1239 #include "machine.h"
1240 #include "encoder.h"
1241 @@ -231,6 +237,49 @@
1242 simply bend the existing ATH curve to achieve the desired shape.
1243 However, the potential gain may not be enough to justify an effort.
1244 */
1245 +/* fast_log10_altivec_2: approximate log10 of four floats at once. Each lane is split into its IEEE-754 exponent e and a mantissa m in [1,2); with z = (m - sqrt2)/(m + sqrt2) the result is e*log10(2) + log10(sqrt2) + z*(va + z^2*(vb + z^2*vc)). The sign bit is not masked before the exponent is extracted, so inputs are assumed positive and normalised. */
1246 +#ifdef ALTIVEC
1247 +static inline vector float fast_log10_altivec_2(vector float v3)
1248 +{
1249 + vector float va,vb,vc,vhalf,vzero,vsqrt2,vconst4;
1250 + vector float v1,v2,v4,v5,v6,v7,v8,vz,vz2,vlog;
1251 + vector unsigned int vconst1,vconst2,vshamt;
1252 + vector signed int vconst3;
1253 + /* va, vb, vc: coefficients of the odd polynomial in z */
1254 + va = (vector float)VINIT4ALL(0.8685890659);
1255 + vb = (vector float)VINIT4ALL(0.2894672153);
1256 + vc = (vector float)VINIT4ALL(0.1793365895);
1257 + vhalf = (vector float)VINIT4ALL(0.15051499783); /* log10(sqrt2), i.e. half of log10(2) */
1258 + vsqrt2 = (vector float)VINIT4ALL(1.4142135623731);
1259 + vconst4 = (vector float)VINIT4ALL(0.301029995664); /* log10(2) */
1260 + vzero = (vector float)vec_splat_u32(0); /* all-zero bits = 0.0f; the former vec_xor(vzero,vzero) read vzero before it had a value */
1261 + vconst1 = (vector unsigned int)vec_sr(vec_splat_s32(-1),vec_splat_u32(9)); /* 0x007fffff: mantissa mask */
1262 + vconst2 = (vector unsigned int)vec_sr(vec_splat_s32(-1),vec_splat_u32(7)); /* 0x01ffffff */
1263 + vconst2 = vec_nor(vconst2,vconst2); /* 0xfe000000 */
1264 + vconst3 = (vector signed int)vec_rl(vconst2,vec_splat_u32(7)); /* 0x0000007f = 127, the exponent bias */
1265 + vshamt = vec_add(vec_splat_u32(9),vec_splat_u32(7)); /* 16 */
1266 + vshamt = vec_add(vshamt,vec_splat_u32(7)); /* 23 = width of the mantissa field */
1267 + vconst2 = vec_sl((vector unsigned int)vconst3,vshamt); /* 127 shifted left by 23 = 0x3f800000, the bit pattern of 1.0f */
1268 + /* m = mantissa bits of the input under the exponent of 1.0f */
1269 + v4 = (vector float)vec_sel(vconst2,(vector unsigned int)v3,vconst1);
1270 + v5 = vec_add(v4,vsqrt2);
1271 + v6 = vec_sub(v4,vsqrt2);
1272 + v7 = vec_re(v5); /* reciprocal estimate of (m + sqrt2) */
1273 + vz = vec_madd(v6, vec_madd(vec_nmsub(v7,v5,(vector float)vconst2),v7,v7), vzero); /* z = (m - sqrt2) * (v7 + v7*(1 - v7*v5)): one Newton-Raphson step on the estimate */
1274 + v8 = (vector float)vec_sr((vector unsigned int)v3,vshamt); /* biased exponent (sign bit would land in bit 8) */
1275 + vlog = vec_ctf(vec_sub((vector signed int)v8,vconst3),0); /* e = exponent - 127, as float */
1276 + /* constant part of the result: e*log10(2) + log10(sqrt2) */
1277 + vz2 = vec_madd(vz,vz,vzero);
1278 + vlog = vec_madd(vlog,vconst4,vhalf);
1279 + /* Horner evaluation of z*(va + z^2*(vb + z^2*vc)) added on top */
1280 + v1 = vec_madd(vz2,vc,vb);
1281 + v2 = vec_madd(vz2,v1,va);
1282 + vlog = vec_madd(vz,v2,vlog);
1283 +
1284 + return vlog;
1285 +}
1286 +#endif
1287 +
1288 static FLOAT
1289 psycho_loudness_approx(FLOAT const *energy, lame_internal_flags const *gfc)
1290 {
1291 @@ -249,11 +298,19 @@
1292 static void
1293 compute_ffts(lame_global_flags const *gfp,
1294 FLOAT fftenergy[HBLKSIZE],
1295 - FLOAT(*fftenergy_s)[HBLKSIZE_s],
1296 + FLOAT(*fftenergy_s)[HBLKSIZE_s+3],
1297 FLOAT(*wsamp_l)[BLKSIZE],
1298 FLOAT(*wsamp_s)[3][BLKSIZE_s], int gr_out, int chn, const sample_t * buffer[2]
1299 )
1300 {
1301 +#ifdef ALTIVEC /* vector temporaries and constants for the AltiVec paths of this function */
1302 + vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,vhalf,vprev,vzero,vsqrt2;
1303 + vector unsigned char vperm;
1304 + vhalf = vec_ctf(vec_splat_s32(1),1); /* 1 * 2^-1 = 0.5 in every lane */
1305 + vsqrt2 = (vector float)VINIT4ALL(0.7071067811865001); /* sqrt(2)/2, the mid/side scale factor */
1306 + vzero = (vector float)vec_splat_u32(0); /* 0.0f in every lane; the former vec_xor(vzero,vzero) read vzero before it had a value */
1307 + vperm = (vector unsigned char)VINIT16(0,1,2,3,28,29,30,31,24,25,26,27,20,21,22,23); /* lane 0 of the first operand, then lanes 3,2,1 of the second */
1308 +#endif
1309 int b, j;
1310 lame_internal_flags *const gfc = gfp->internal_flags;
1311 if (chn < 2) {
1312 @@ -262,6 +319,50 @@
1313 }
1314 /* FFT data for mid and side channel is derived from L & R */
1315 else if (chn == 2) {
1316 +#ifdef ALTIVEC /* in-place mid/side transform: row 0 becomes (l+r)*vsqrt2, row 1 becomes (l-r)*vsqrt2; vec_ld/vec_st assume 16-byte aligned rows */
1317 + for(j = 0; j < BLKSIZE; j += 8) { /* long block, eight samples per pass */
1318 + v1 = vec_ld(0,wsamp_l[0]+j); /* l[j..j+3] */
1319 + v2 = vec_ld(0,wsamp_l[1]+j); /* r[j..j+3] */
1320 + v3 = vec_ld(16,wsamp_l[0]+j); /* l[j+4..j+7] */
1321 + v4 = vec_ld(16,wsamp_l[1]+j); /* r[j+4..j+7] */
1322 +
1323 + v5 = vec_add(v1,v2);
1324 + v6 = vec_sub(v1,v2);
1325 + v7 = vec_add(v3,v4);
1326 + v8 = vec_sub(v3,v4);
1327 + v9 = vec_madd(v5,vsqrt2,vzero); /* plain multiply: madd with a zero addend */
1328 + v10 = vec_madd(v6,vsqrt2,vzero);
1329 + v11 = vec_madd(v7,vsqrt2,vzero);
1330 + v12 = vec_madd(v8,vsqrt2,vzero);
1331 +
1332 + vec_st(v9,0,wsamp_l[0]+j);
1333 + vec_st(v10,0,wsamp_l[1]+j);
1334 + vec_st(v11,16,wsamp_l[0]+j);
1335 + vec_st(v12,16,wsamp_l[1]+j);
1336 + }
1337 + for (b = 2; b >= 0; --b) { /* the three short blocks get the same treatment */
1338 + for(j = 0; j < BLKSIZE_s; j += 8) {
1339 + v1 = vec_ld(0,wsamp_s[0][b]+j);
1340 + v2 = vec_ld(0,wsamp_s[1][b]+j);
1341 + v3 = vec_ld(16,wsamp_s[0][b]+j);
1342 + v4 = vec_ld(16,wsamp_s[1][b]+j);
1343 +
1344 + v5 = vec_add(v1,v2);
1345 + v6 = vec_sub(v1,v2);
1346 + v7 = vec_add(v3,v4);
1347 + v8 = vec_sub(v3,v4);
1348 + v9 = vec_madd(v5,vsqrt2,vzero);
1349 + v10 = vec_madd(v6,vsqrt2,vzero);
1350 + v11 = vec_madd(v7,vsqrt2,vzero);
1351 + v12 = vec_madd(v8,vsqrt2,vzero);
1352 +
1353 + vec_st(v9,0,wsamp_s[0][b]+j);
1354 + vec_st(v10,0,wsamp_s[1][b]+j);
1355 + vec_st(v11,16,wsamp_s[0][b]+j);
1356 + vec_st(v12,16,wsamp_s[1][b]+j);
1357 + }
1358 + }
1359 +#else
1360 for (j = BLKSIZE - 1; j >= 0; --j) {
1361 FLOAT const l = wsamp_l[0][j];
1362 FLOAT const r = wsamp_l[1][j];
1363 @@ -276,11 +377,88 @@
1364 wsamp_s[1][b][j] = (l - r) * (FLOAT) (SQRT2 * 0.5);
1365 }
1366 }
1367 +#endif
1368 }
1369
1370 /*********************************************************************
1371 * compute energies
1372 *********************************************************************/
1373 +#ifdef ALTIVEC /* energy[k] = 0.5 * (w[k]^2 + w[N-k]^2), 16 bins per pass. NOTE(review): the scalar fallback wraps these values in NON_LINEAR_SCALE_ENERGY, this path does not -- confirm the macro is an identity */
1374 + vprev = vec_ld(0,(*wsamp_l)); /* lane 0 = w[0]; it is paired with itself, so bin 0 comes out as w[0]^2 */
1375 + for(j = 0; j < BLKSIZE/2; j += 16) {
1376 + v1 = vec_ld(0,(*wsamp_l)+j); /* forward half: w[j..j+15] in v1..v4 */
1377 + v2 = vec_ld(16,(*wsamp_l)+j);
1378 + v3 = vec_ld(32,(*wsamp_l)+j);
1379 + v4 = vec_ld(48,(*wsamp_l)+j);
1380 + v5 = vec_ld(48,(*wsamp_l)+1008-j); /* mirrored half, highest block first; 1008 is presumably BLKSIZE-16 -- confirm */
1381 + v6 = vec_ld(32,(*wsamp_l)+1008-j);
1382 + v7 = vec_ld(16,(*wsamp_l)+1008-j);
1383 + v8 = vec_ld(0,(*wsamp_l)+1008-j);
1384 + v9 = vec_perm(vprev,v5,vperm); /* lane 0 from the previous block, lanes 1-3 = lanes 3,2,1 of this block: mirrored samples in reverse order */
1385 + v10 = vec_perm(v5,v6,vperm);
1386 + v11 = vec_perm(v6,v7,vperm);
1387 + v12 = vec_perm(v7,v8,vperm);
1388 + vprev = v8; /* its lane 0 is the mirror partner of the next pass's first bin */
1389 + v1 = vec_madd(v1,v1,vzero); /* squares of the forward samples */
1390 + v2 = vec_madd(v2,v2,vzero);
1391 + v3 = vec_madd(v3,v3,vzero);
1392 + v4 = vec_madd(v4,v4,vzero);
1393 + v5 = vec_madd(v9,v9,v1); /* plus squares of the mirrored samples */
1394 + v6 = vec_madd(v10,v10,v2);
1395 + v7 = vec_madd(v11,v11,v3);
1396 + v8 = vec_madd(v12,v12,v4);
1397 + v9 = vec_madd(v5,vhalf,vzero); /* times 0.5 */
1398 + v10 = vec_madd(v6,vhalf,vzero);
1399 + v11 = vec_madd(v7,vhalf,vzero);
1400 + v12 = vec_madd(v8,vhalf,vzero);
1401 +
1402 + vec_st(v9,0,fftenergy+j);
1403 + vec_st(v10,16,fftenergy+j);
1404 + vec_st(v11,32,fftenergy+j);
1405 + vec_st(v12,48,fftenergy+j);
1406 + }
1407 + /* last bin (j == BLKSIZE/2 here): the square of the sample in lane 0 of vprev */
1408 + v1 = vec_madd(vprev,vprev,vzero);
1409 + vec_ste(v1,0,fftenergy+j); /* stores the single lane selected by the address; lane 0 if fftenergy is 16-byte aligned */
1410 + for (b = 2; b >= 0; --b) { /* same computation for each of the three short blocks */
1411 + vprev = vec_ld(0,(*wsamp_s)[b]);
1412 + for(j=0;j<BLKSIZE_s/2;j+=16) {
1413 + v1 = vec_ld(0,(*wsamp_s)[b]+j);
1414 + v2 = vec_ld(16,(*wsamp_s)[b]+j);
1415 + v3 = vec_ld(32,(*wsamp_s)[b]+j);
1416 + v4 = vec_ld(48,(*wsamp_s)[b]+j);
1417 + v5 = vec_ld(48,(*wsamp_s)[b]+240-j); /* 240 is presumably BLKSIZE_s-16 -- confirm */
1418 + v6 = vec_ld(32,(*wsamp_s)[b]+240-j);
1419 + v7 = vec_ld(16,(*wsamp_s)[b]+240-j);
1420 + v8 = vec_ld(0,(*wsamp_s)[b]+240-j);
1421 + v9 = vec_perm(vprev,v5,vperm);
1422 + v10 = vec_perm(v5,v6,vperm);
1423 + v11 = vec_perm(v6,v7,vperm);
1424 + v12 = vec_perm(v7,v8,vperm);
1425 + vprev = v8;
1426 + v1 = vec_madd(v1,v1,vzero);
1427 + v2 = vec_madd(v2,v2,vzero);
1428 + v3 = vec_madd(v3,v3,vzero);
1429 + v4 = vec_madd(v4,v4,vzero);
1430 + v5 = vec_madd(v9,v9,v1);
1431 + v6 = vec_madd(v10,v10,v2);
1432 + v7 = vec_madd(v11,v11,v3);
1433 + v8 = vec_madd(v12,v12,v4);
1434 + v9 = vec_madd(v5,vhalf,vzero);
1435 + v10 = vec_madd(v6,vhalf,vzero);
1436 + v11 = vec_madd(v7,vhalf,vzero);
1437 + v12 = vec_madd(v8,vhalf,vzero);
1438 +
1439 + vec_st(v9,0,fftenergy_s[b]+j); /* aligned stores: each fftenergy_s row must start on a 16-byte boundary */
1440 + vec_st(v10,16,fftenergy_s[b]+j);
1441 + vec_st(v11,32,fftenergy_s[b]+j);
1442 + vec_st(v12,48,fftenergy_s[b]+j);
1443 + }
1444 + /* last bin of this short block */
1445 + v1 = vec_madd(vprev,vprev,vzero);
1446 + vec_ste(v1,0,fftenergy_s[b]+j);
1447 + }
1448 +#else
1449 fftenergy[0] = NON_LINEAR_SCALE_ENERGY(wsamp_l[0][0]);
1450 fftenergy[0] *= fftenergy[0];
1451
1452 @@ -298,13 +476,51 @@
1453 fftenergy_s[b][BLKSIZE_s / 2 - j] = NON_LINEAR_SCALE_ENERGY((re * re + im * im) * 0.5f);
1454 }
1455 }
1456 +#endif
1457 /* total energy */
1458 {
1459 +#ifdef ALTIVEC /* vector sum of fftenergy[11..512]; the scalar fallback sums j = 11 .. HBLKSIZE-1, so the literal indices presumably assume HBLKSIZE == 513 -- confirm */
1460 +#ifdef ALTIVEC_970
1461 + v5 = vec_ld(0,fftenergy+8); /* fftenergy[8..11] */
1462 + v6 = vec_ld(0,fftenergy+508); /* fftenergy[508..511] seeds the second accumulator */
1463 + v7 = vec_ld(0,fftenergy+512); /* block beginning at fftenergy[512] */
1464 + v8 = vec_xor(v8,v8); /* zero the fourth accumulator (v8 already holds a value from the energy loop) */
1465 + v5 = vec_sld(v5,v8,12); /* keep only fftenergy[11] (now lane 0), other lanes zero */
1466 + v7 = vec_sld(v8,v7,4); /* keep only fftenergy[512] (now lane 3), other lanes zero */
1467 +#else
1468 + v5 = vec_lde(0,fftenergy+11); /* NOTE(review): vec_lde defines only the addressed lane; the other lanes of v5/v7 are architecturally undefined yet are summed below -- confirm the targeted CPUs zero them */
1469 + v6 = vec_ld(0,fftenergy+508);
1470 + v7 = vec_lde(0,fftenergy+512);
1471 + v8 = vec_xor(v8,v8);
1472 +#endif
1473 + for(j=12;j<508;j+=16) { /* accumulate fftenergy[12..507] into four independent partial sums */
1474 + v1 = vec_ld(0,fftenergy+j);
1475 + v2 = vec_ld(16,fftenergy+j);
1476 + v3 = vec_ld(32,fftenergy+j);
1477 + v4 = vec_ld(48,fftenergy+j);
1478 + v5 = vec_add(v1,v5);
1479 + v6 = vec_add(v2,v6);
1480 + v7 = vec_add(v3,v7);
1481 + v8 = vec_add(v4,v8);
1482 + }
1483 + v5 = vec_add(v5,v6); /* fold the four accumulators into one vector */
1484 + v7 = vec_add(v7,v8);
1485 + v5 = vec_add(v5,v7);
1486 + v6 = vec_sld(v5,v5,4); /* rotations by 1, 2 and 3 lanes */
1487 + v7 = vec_sld(v5,v5,8);
1488 + v8 = vec_sld(v5,v5,12);
1489 + v5 = vec_add(v5,v6); /* after these adds every lane holds the full sum */
1490 + v7 = vec_add(v7,v8);
1491 + v5 = vec_add(v5,v7);
1492 + v5 = vec_perm(v5,v5,vec_lvsr(0, gfc->tot_ener+chn)); /* rotate according to the destination's offset inside its 16-byte block */
1493 + vec_ste(v5,0,gfc->tot_ener+chn); /* single-element store into tot_ener[chn] */
1494 +#else
1495 FLOAT totalenergy = 0.0;
1496 for (j = 11; j < HBLKSIZE; j++)
1497 totalenergy += fftenergy[j];
1498
1499 gfc->tot_ener[chn] = totalenergy;
1500 +#endif
1501 }
1502
1503 if (gfp->analysis) {
1504 @@ -345,9 +561,13 @@
1505 #define I2LIMIT 23 /* as in if(i>24) -> changed 23 */
1506 #define MLIMIT 15 /* as in if(m<15) */
1507
1508 -static FLOAT ma_max_i1;
1509 -static FLOAT ma_max_i2;
1510 +static FLOAT ma_max_i1 __attribute__ ((aligned (16)));
1511 +static FLOAT ma_max_i2 __attribute__ ((aligned (16)));
1512 static FLOAT ma_max_m;
1513 +#ifdef ALTIVEC /* four-lane constants; NOTE(review): presumably the vector counterparts of ma_max_i1 / ma_max_i2 -- confirm against the code that initialises those scalars */
1514 +static vector float vmamax1 = (vector float)VINIT4ALL(3.651741); /* approx. 10^(9/16) */
1515 +static vector float vmamax2 = (vector float)VINIT4ALL(31.622777); /* approx. 10^1.5, which equals 10^((I2LIMIT+1)/16) for I2LIMIT == 23 */
1516 +#endif
1517
1518 /*This is the masking table:
1519 According to tonality, values are going from 0dB (TMN)
1520 @@ -762,7 +982,7 @@
1521
1522 static void
1523 compute_masking_s(lame_global_flags const *gfp,
1524 - FLOAT(*fftenergy_s)[HBLKSIZE_s], FLOAT * eb, FLOAT * thr, int chn, int sblock)
1525 + FLOAT(*fftenergy_s)[HBLKSIZE_s+3], FLOAT * eb, FLOAT * thr, int chn, int sblock)
1526 {
1527 lame_internal_flags *const gfc = gfp->internal_flags;
1528 int i, j, b;
1529 @@ -1080,11 +1300,51 @@
1530 * (Note: these static variables have been moved to the gfc-> struct,
1531 * and their order in memory is layed out in util.h)
1532 */
1533 + static const FLOAT table1[] = { /* 25 squared factors; read as table1[i] with i = (int) FAST_LOG10_X(ratio, 16.0) by the mask-addition code later in this function */
1534 + 3.3246 * 3.3246, 3.23837 * 3.23837, 3.15437 * 3.15437, 3.00412 * 3.00412, 2.86103 * 2.86103,
1535 + 2.65407 * 2.65407, 2.46209 * 2.46209, 2.284 * 2.284,
1536 + 2.11879 * 2.11879, 1.96552 * 1.96552, 1.82335 * 1.82335, 1.69146 * 1.69146,
1537 + 1.56911 * 1.56911, 1.46658 * 1.46658, 1.37074 * 1.37074, 1.31036 * 1.31036,
1538 + 1.25264 * 1.25264, 1.20648 * 1.20648, 1.16203 * 1.16203, 1.12765 * 1.12765,
1539 + 1.09428 * 1.09428, 1.0659 * 1.0659, 1.03826 * 1.03826, 1.01895 * 1.01895,
1540 + 1
1541 + };
1542 + /* table2: 10 squared factors, used when the two partitions are at most 3 apart and ratio is below ma_max_i1 */
1543 + static const FLOAT table2[] = {
1544 + 1.33352 * 1.33352, 1.35879 * 1.35879, 1.38454 * 1.38454, 1.39497 * 1.39497,
1545 + 1.40548 * 1.40548, 1.3537 * 1.3537, 1.30382 * 1.30382, 1.22321 * 1.22321,
1546 + 1.14758 * 1.14758,
1547 + 1
1548 + };
1549 + /* table3: 14 squared factors; callers only index it after checking i against 13 */
1550 + static const FLOAT table3[] = {
1551 + 2.35364 * 2.35364, 2.29259 * 2.29259, 2.23313 * 2.23313, 2.12675 * 2.12675,
1552 + 2.02545 * 2.02545, 1.87894 * 1.87894, 1.74303 * 1.74303, 1.61695 * 1.61695,
1553 + 1.49999 * 1.49999, 1.39148 * 1.39148, 1.29083 * 1.29083, 1.19746 * 1.19746,
1554 + 1.11084 * 1.11084, 1.03826 * 1.03826
1555 + };
1556 +#ifdef ALTIVEC /* vector temporaries, byte-shift counts and single-lane masks for the AltiVec code in this function */
1557 + vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16;
1558 + vector float vsum,vsum1,vsum2,vsuma,vsumb,vsumc,vsumd,vmaska,vmaskb,vmaskc,vmaskd;
1559 + vector unsigned char vmask1,vmask2,vmask3,vmask4,vmask1inv,vmask2inv,vmask3inv,vmask4inv,vperm,vs4,vs8,vs12;
1560 + /* vperm reverses the order of the four 32-bit lanes */
1561 + vperm = (vector unsigned char)VINIT16(12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3);
1562 + v1 = (vector float)vec_splat_u8(1); /* every byte = 1 (used as a bit pattern only) */
1563 + v2 = (vector float)vec_splat_u8(5); /* every byte = 5 */
1564 + vs4 = vec_sl((vector unsigned char)v1,(vector unsigned char)v2); /* every byte = 32: octet-shift count 4 for vec_slo / vec_sro */
1565 + vs8 = vec_sl(vs4,(vector unsigned char)v1); /* every byte = 64: octet-shift count 8 */
1566 + vs12 = vec_or(vs4,vs8); /* every byte = 96: octet-shift count 12 */
1567 + v3 = (vector float)vec_splat_s32(-1); /* all bits set */
1568 + vmaska = vec_slo(v3,vs12); /* ones in lane 0 only */
1569 + vmaskb = vec_sro(vmaska,vs4); /* ones in lane 1 only */
1570 + vmaskc = vec_sro(vmaska,vs8); /* ones in lane 2 only */
1571 + vmaskd = vec_sro(vmaska,vs12); /* ones in lane 3 only */
1572 +#endif
1573 lame_internal_flags *const gfc = gfp->internal_flags;
1574
1575 /* fft and energy calculation */
1576 - FLOAT wsamp_L[2][BLKSIZE];
1577 - FLOAT wsamp_S[2][3][BLKSIZE_s];
1578 + FLOAT wsamp_L[2][BLKSIZE] __attribute__ ((aligned (16)));
1579 + FLOAT wsamp_S[2][3][BLKSIZE_s] __attribute__ ((aligned (16)));
1580
1581 /* convolution */
1582 FLOAT eb_l[CBANDS + 1], eb_s[CBANDS + 1];
1583 @@ -1099,7 +1359,7 @@
1584 int sb, sblock;
1585
1586 /* variables used for --nspsytune */
1587 - FLOAT ns_hpfsmpl[2][576];
1588 + FLOAT ns_hpfsmpl[2][576] __attribute__ ((aligned (16)));
1589 FLOAT pcfact;
1590
1591 unsigned char mask_idx_l[CBANDS + 2], mask_idx_s[CBANDS + 2];
1592 @@ -1128,14 +1388,142 @@
1593 /* Don't copy the input buffer into a temporary buffer */
1594 /* unroll the loop 2 times */
1595 for (chn = 0; chn < gfc->channels_out; chn++) {
1596 - static const FLOAT fircoef[] = {
1597 + static const FLOAT fircoef[] __attribute__ ((aligned (16))) = {
1598 -8.65163e-18 * 2, -0.00851586 * 2, -6.74764e-18 * 2, 0.0209036 * 2,
1599 -3.36639e-17 * 2, -0.0438162 * 2, -1.54175e-17 * 2, 0.0931738 * 2,
1600 - -5.52212e-17 * 2, -0.313819 * 2
1601 + -5.52212e-17 * 2, -0.313819 * 2, 0.0, 0.0
1602 };
1603 /* apply high pass filter of fs/4 */
1604 const sample_t *const firbuf = &buffer[chn][576 - 350 - NSFIRLEN + 192];
1605 - assert(sizeof(fircoef) / sizeof(fircoef[0]) == ((NSFIRLEN - 1) / 2));
1606 + assert(sizeof(fircoef) / sizeof(fircoef[0]) == ((NSFIRLEN - 1) / 2) + 2); /* the (NSFIRLEN-1)/2 taps plus the two zero entries padding the table to whole vectors */
1607 +#ifdef ALTIVEC /* FIR high pass: out[i] = firbuf[i+10] + sum over j of fircoef[j]*(firbuf[i+j] + firbuf[i+NSFIRLEN-j]); four outputs are produced per outer pass, one per unrolled section */
1608 + v1 = vec_ld(0, firbuf+10); /* first aligned block for the centre-tap stream */
1609 + vmask1 = vec_lvsl(0, firbuf); /* alignment permutes for firbuf+0 .. firbuf+3 */
1610 + vmask2 = vec_lvsl(0, firbuf+1);
1611 + vmask3 = vec_lvsl(0, firbuf+2);
1612 + vmask4 = vec_lvsl(0, firbuf+3);
1613 + vmask1inv = vec_perm(vmask1,vmask1,vperm); /* same alignment, lanes delivered in reverse order */
1614 + vmask2inv = vec_perm(vmask2,vmask2,vperm);
1615 + vmask3inv = vec_perm(vmask3,vmask3,vperm);
1616 + vmask4inv = vec_perm(vmask4,vmask4,vperm);
1617 + for(i=0;i<576;) { /* i is advanced by the four i++ statements inside the body */
1618 + v2 = vec_ld(16,firbuf+i+10);
1619 + vsum1 = vec_perm(v1, v2, vmask3); /* firbuf[i+10 .. i+13]: the centre taps of the four outputs of this pass */
1620 + v1 = v2;
1621 + /* ---- section 1: output i ---- */
1622 + vsum2 = vec_splat(vsum1, 0);
1623 + vsum = vec_and(vsum2, vmaska); /* accumulator starts as (centre tap, 0, 0, 0) */
1624 + v3 = vec_ld(0, firbuf+i);
1625 + v4 = vec_ld(16,firbuf+i+NSFIRLEN-3);
1626 + for(j=0;j<(NSFIRLEN-1)/2;j+=4) { /* four taps per pass; the zero padding of fircoef neutralises the surplus lanes of the last pass */
1627 + v5 = vec_ld(16, firbuf+i+j);
1628 + v6 = vec_ld(0, firbuf+i+NSFIRLEN-3-j);
1629 + v7 = vec_perm(v3,v5,vmask1); /* firbuf[i+j .. i+j+3] */
1630 + v8 = vec_perm(v6,v4,vmask3inv); /* firbuf[i+NSFIRLEN-j] down to firbuf[i+NSFIRLEN-j-3] */
1631 + v3 = v5;
1632 + v4 = v6;
1633 + v10 = vec_ld(0,fircoef+j);
1634 + v11 = vec_add(v7,v8); /* symmetric pairs */
1635 + vsum = vec_madd(v10,v11,vsum);
1636 + }
1637 + /* horizontal add of the four lanes into lane 0 */
1638 + v12 = vec_slo(vsum,vs4);
1639 + v13 = vec_slo(vsum,vs8);
1640 + v14 = vec_slo(vsum,vs12);
1641 + v15 = vec_add(vsum,v12);
1642 + v16 = vec_add(v13,v14);
1643 + vsuma = vec_add(v15,v16);
1644 + vsuma = vec_and(vsuma,vmaska); /* keep lane 0 only */
1645 +
1646 + i++;
1647 + /* ---- section 2: output i (second of four), result collected in lane 1 ---- */
1648 + vsum2 = vec_splat(vsum1, 1);
1649 + vsum = vec_and(vsum2, vmaska);
1650 + v3 = vec_ld(0, firbuf+i);
1651 + v4 = vec_ld(16,firbuf+i+NSFIRLEN-3);
1652 + vmask2 = vec_lvsl(0, firbuf+i); /* alignment permute for the current start address */
1653 + for(j=0;j<(NSFIRLEN-1)/2;j+=4) {
1654 + v5 = vec_ld(16, firbuf+i+j);
1655 + v6 = vec_ld(0, firbuf+i+NSFIRLEN-3-j);
1656 + v7 = vec_perm(v3,v5,vmask2);
1657 + v8 = vec_perm(v6,v4,vmask4inv);
1658 + v3 = v5;
1659 + v4 = v6;
1660 + v10 = vec_ld(0,fircoef+j);
1661 + v11 = vec_add(v7,v8);
1662 + vsum = vec_madd(v10,v11,vsum);
1663 + }
1664 + /* horizontal add into lane 1 */
1665 + v12 = vec_sro(vsum,vs4);
1666 + v13 = vec_slo(vsum,vs4);
1667 + v14 = vec_slo(vsum,vs8);
1668 + v15 = vec_add(vsum,v12);
1669 + v16 = vec_add(v13,v14);
1670 + vsumb = vec_add(v15,v16);
1671 + vsumb = vec_and(vsumb,vmaskb); /* keep lane 1 only */
1672 +
1673 + i++;
1674 + /* ---- section 3: output i (third of four), result collected in lane 2 ---- */
1675 + vsum2 = vec_splat(vsum1, 2);
1676 + vsum = vec_and(vsum2, vmaska);
1677 + v3 = vec_ld(0, firbuf+i);
1678 + v4 = vec_ld(16,firbuf+i+NSFIRLEN-3);
1679 + vmask2 = vec_lvsl(0, firbuf+i); /* not read in this section (vmask3 is used); vmask2 is reassigned before its next use */
1680 + for(j=0;j<(NSFIRLEN-1)/2;j+=4) {
1681 + v5 = vec_ld(16, firbuf+i+j);
1682 + v6 = vec_ld(0, firbuf+i+NSFIRLEN-3-j);
1683 + v7 = vec_perm(v3,v5,vmask3);
1684 + v8 = vec_perm(v6,v4,vmask1inv);
1685 + v3 = v5;
1686 + v4 = v6;
1687 + v10 = vec_ld(0,fircoef+j);
1688 + v11 = vec_add(v7,v8);
1689 + vsum = vec_madd(v10,v11,vsum);
1690 + }
1691 + /* horizontal add into lane 2 */
1692 + v12 = vec_sro(vsum,vs4);
1693 + v13 = vec_sro(vsum,vs8);
1694 + v14 = vec_slo(vsum,vs4);
1695 + v15 = vec_add(vsum,v12);
1696 + v16 = vec_add(v13,v14);
1697 + vsumc = vec_add(v15,v16);
1698 + vsumc = vec_and(vsumc,vmaskc); /* keep lane 2 only */
1699 +
1700 + i++;
1701 + /* ---- section 4: output i (fourth of four), result collected in lane 3 ---- */
1702 + vsum2 = vec_splat(vsum1, 3);
1703 + vsum = vec_and(vsum2, vmaska);
1704 + v3 = vec_ld(0, firbuf+i);
1705 + v4 = vec_ld(16,firbuf+i+NSFIRLEN-3);
1706 + vmask2 = vec_lvsl(0, firbuf+i); /* not read in this section (vmask4 is used); vmask2 is reassigned before its next use */
1707 + for(j=0;j<(NSFIRLEN-1)/2;j+=4) {
1708 + v5 = vec_ld(16, firbuf+i+j);
1709 + v6 = vec_ld(0, firbuf+i+NSFIRLEN-3-j);
1710 + v7 = vec_perm(v3,v5,vmask4);
1711 + v8 = vec_perm(v6,v4,vmask2inv);
1712 + v3 = v5;
1713 + v4 = v6;
1714 + v10 = vec_ld(0,fircoef+j);
1715 + v11 = vec_add(v7,v8);
1716 + vsum = vec_madd(v10,v11,vsum);
1717 + }
1718 + /* horizontal add into lane 3 */
1719 + v12 = vec_sro(vsum,vs4);
1720 + v13 = vec_sro(vsum,vs8);
1721 + v14 = vec_sro(vsum,vs12);
1722 + v15 = vec_add(vsum,v12);
1723 + v16 = vec_add(v13,v14);
1724 + vsumd = vec_add(v15,v16);
1725 + vsumd = vec_and(vsumd,vmaskd); /* keep lane 3 only */
1726 + /* the four masked results occupy disjoint lanes, so OR merges them into one vector */
1727 + vsum1 = vec_or(vsuma,vsumb);
1728 + vsum2 = vec_or(vsumc,vsumd);
1729 + vsum = vec_or(vsum1,vsum2);
1730 +
1731 + i++;
1732 + vec_st(vsum,0,ns_hpfsmpl[chn]+i-4); /* aligned store of the four outputs of this pass */
1733 + }
1734 +#else
1735 for (i = 0; i < 576; i++) {
1736 FLOAT sum1, sum2;
1737 sum1 = firbuf[i + 10];
1738 @@ -1146,6 +1534,7 @@
1739 }
1740 ns_hpfsmpl[chn][i] = sum1 + sum2;
1741 }
1742 +#endif
1743 masking_ratio[gr_out][chn].en = gfc->en[chn];
1744 masking_ratio[gr_out][chn].thm = gfc->thm[chn];
1745 if (numchn > 2) {
1746 @@ -1166,8 +1555,8 @@
1747 FLOAT attackThreshold;
1748 FLOAT max[CBANDS], avg[CBANDS];
1749 int ns_attacks[4] = { 0 };
1750 - FLOAT fftenergy[HBLKSIZE];
1751 - FLOAT fftenergy_s[3][HBLKSIZE_s];
1752 + FLOAT fftenergy[HBLKSIZE] __attribute__ ((aligned (16)));
1753 + FLOAT fftenergy_s[3][HBLKSIZE_s+3] __attribute__ ((aligned (16)));
1754
1755
1756 /* rh 20040301: the following loops do access one off the limits
1757 @@ -1203,9 +1592,28 @@
1758 for (i = 0; i < 9; i++) {
1759 FLOAT const *const pfe = pf + 576 / 9;
1760 FLOAT p = 1.;
1761 +#ifdef ALTIVEC /* p = max(1.0, max of |*pf| over [pf, pfe)), four samples at a time; NOTE(review): vec_ld assumes pf is 16-byte aligned -- confirm */
1762 + FLOAT vmax[4] __attribute__ ((aligned (16))); /* aligned scratch used to move the result back to a scalar */
1763 + v1 = (vector float)vec_splat_s32(1); /* integer 1 in every lane (bit pattern) */
1764 + v2 = vec_ctf((vector signed int)v1,0); /* 1.0f in every lane: running maximum, same start value as the scalar p = 1. */
1765 + for (; pf < pfe; pf+=4) {
1766 + v3 = vec_ld(0,pf);
1767 + v4 = vec_abs(v3);
1768 + v2 = vec_max(v2,v4);
1769 + }
1770 + v5 = vec_slo(v2,vs4); /* bring lanes 1, 2 and 3 down to lane 0 */
1771 + v6 = vec_slo(v2,vs8);
1772 + v7 = vec_slo(v2,vs12);
1773 + v8 = vec_max(v2,v5);
1774 + v9 = vec_max(v6,v7);
1775 + v10 =vec_max(v8,v9); /* lane 0 now holds the maximum of all four lanes */
1776 + vec_st(v10,0,vmax);
1777 + p = vmax[0];
1778 +#else
1779 for (; pf < pfe; pf++)
1780 if (p < fabs(*pf))
1781 p = fabs(*pf);
1782 +#endif
1783
1784 gfc->nsPsy.last_en_subshort[chn][i] = en_subshort[i + 3] = p;
1785 en_short[1 + i / 3] += p;
1786 @@ -1361,7 +1769,82 @@
1787 ecb = gfc->s3_ll[k++] * eb2;
1788 while (++kk <= gfc->s3ind[b][1]) {
1789 eb2 = eb_l[kk] * tab[mask_idx_l[kk]];
1790 - ecb = mask_add(ecb, gfc->s3_ll[k++] * eb2, kk, kk - b, gfc, 0);
1791 + { /* mask addition written out inline (replaces the mask_add call kept as a comment after this block); each "continue" stores the result in ecb and moves on to the next kk of the enclosing while loop */
1792 + int i;
1793 + FLOAT ratio;
1794 + FLOAT m1 = ecb; /* accumulated value so far */
1795 + FLOAT m2 = gfc->s3_ll[k++] * eb2; /* contribution of partition kk */
1796 + int kkk = kk;
1797 + int bb = kk-b; /* distance between the contributing partition and partition b */
1798 +
1799 + /* ratio = larger / smaller; if it reaches ma_max_i2 the result is the plain sum */
1800 + if (m2 > m1) {
1801 + if (m2 < (m1 * ma_max_i2))
1802 + ratio = m2 / m1;
1803 + else
1804 + {ecb = (m1 + m2); continue;}
1805 + }
1806 + else {
1807 + if (m1 >= (m2 * ma_max_i2))
1808 + {ecb = (m1 + m2); continue;}
1809 + ratio = m1 / m2;
1810 + }
1811 + /*i = abs(10*log10(m2 / m1)/10*16);
1812 + m = 10*log10((m1+m2)/gfc->ATH->cb[k]); */
1813 +
1814 +
1815 + /* Should always be true, just checking */
1816 + assert(m1 >= 0);
1817 + assert(m2 >= 0);
1818 +
1819 + /* from here on m1 is the plain sum of both values */
1820 + m1 += m2;
1821 +
1822 + if ((unsigned int) (bb + 3) <= 3 + 3) { /* approximately, 1 bark = 3 partitions */
1823 + /* 65% of the cases */
1824 + /* originally 'if(i > 8)' */
1825 + if (ratio >= ma_max_i1) {
1826 + /* 43% of the total */
1827 + ecb = m1; continue;
1828 + }
1829 +
1830 + /* 22% of the total */
1831 + i = (int) (FAST_LOG10_X(ratio, 16.0));
1832 + ecb = m1 * table2[i]; continue;
1833 + }
1834 +
1835 + /* m<15 equ log10((m1+m2)/gfc->ATH->cb[k])<1.5
1836 + * equ (m1+m2)/gfc->ATH->cb[k]<10^1.5
1837 + * equ (m1+m2)<10^1.5 * gfc->ATH->cb[k]
1838 + */
1839 +
1840 + i = (int) FAST_LOG10_X(ratio, 16.0);
1841 + m2 = gfc->ATH->cb_l[kkk] * gfc->ATH->adjust; /* m2 is reused: from here it is the adjusted ATH value of partition kk */
1842 + assert(m2 >= 0);
1843 + if (m1 < ma_max_m * m2) {
1844 + /* 3% of the total */
1845 + /* Originally if (m > 0) { */
1846 + if (m1 > m2) {
1847 + FLOAT f, r;
1848 +
1849 + f = 1.0;
1850 + if (i <= 13) /* table3 has 14 entries */
1851 + f = table3[i];
1852 +
1853 + r = FAST_LOG10_X(m1 / m2, 10.0 / 15.0);
1854 + ecb = m1 * ((table1[i] - f) * r + f); continue; /* blend between f and table1[i] with weight r */
1855 + }
1856 +
1857 + if (i > 13) {ecb = m1; continue;};
1858 +
1859 + ecb = m1*table3[i]; continue;
1860 + }
1861 +
1862 +
1863 + /* 10% of total */
1864 + ecb = m1 * table1[i];
1865 + }
1866 + //ecb = mask_add(ecb, gfc->s3_ll[k++] * eb2, kk, kk - b, gfc, 0);
1867 }
1868 ecb *= 0.158489319246111; /* pow(10,-0.8) */
1869
1870 @@ -1458,6 +1941,14 @@
1871 vbrpsy_compute_fft_l(lame_global_flags const *gfp, const sample_t * buffer[2], int chn, int gr_out,
1872 FLOAT fftenergy[HBLKSIZE], FLOAT(*wsamp_l)[BLKSIZE])
1873 {
1874 +#ifdef ALTIVEC /* vector temporaries and constants for the AltiVec paths of this function */
1875 + vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,vhalf,vprev,vzero,vsqrt2;
1876 + vector unsigned char vperm;
1877 + vhalf = vec_ctf(vec_splat_s32(1),1); /* 1 * 2^-1 = 0.5 in every lane */
1878 + vsqrt2 = (vector float)VINIT4ALL(0.7071067811865001); /* sqrt(2)/2, the mid/side scale factor */
1879 + vzero = (vector float)vec_splat_u32(0); /* 0.0f in every lane; the former vec_xor(vzero,vzero) read vzero before it had a value */
1880 + vperm = (vector unsigned char)VINIT16(0,1,2,3,28,29,30,31,24,25,26,27,20,21,22,23); /* lane 0 of the first operand, then lanes 3,2,1 of the second */
1881 +#endif
1882 lame_internal_flags *const gfc = gfp->internal_flags;
1883 int j;
1884
1885 @@ -1466,17 +1957,78 @@
1886 }
1887 else if (chn == 2) {
1888 /* FFT data for mid and side channel is derived from L & R */
1889 +#ifdef ALTIVEC /* in-place mid/side transform: row 0 becomes (l+r)*vsqrt2, row 1 becomes (l-r)*vsqrt2; vec_ld/vec_st assume 16-byte aligned rows */
1890 + for(j = 0; j < BLKSIZE; j += 8) { /* eight samples per pass */
1891 + v1 = vec_ld(0,wsamp_l[0]+j); /* l[j..j+3] */
1892 + v2 = vec_ld(0,wsamp_l[1]+j); /* r[j..j+3] */
1893 + v3 = vec_ld(16,wsamp_l[0]+j); /* l[j+4..j+7] */
1894 + v4 = vec_ld(16,wsamp_l[1]+j); /* r[j+4..j+7] */
1895 +
1896 + v5 = vec_add(v1,v2);
1897 + v6 = vec_sub(v1,v2);
1898 + v7 = vec_add(v3,v4);
1899 + v8 = vec_sub(v3,v4);
1900 + v9 = vec_madd(v5,vsqrt2,vzero); /* plain multiply: madd with a zero addend */
1901 + v10 = vec_madd(v6,vsqrt2,vzero);
1902 + v11 = vec_madd(v7,vsqrt2,vzero);
1903 + v12 = vec_madd(v8,vsqrt2,vzero);
1904 +
1905 + vec_st(v9,0,wsamp_l[0]+j);
1906 + vec_st(v10,0,wsamp_l[1]+j);
1907 + vec_st(v11,16,wsamp_l[0]+j);
1908 + vec_st(v12,16,wsamp_l[1]+j);
1909 + }
1910 +#else
1911 for (j = BLKSIZE - 1; j >= 0; --j) {
1912 FLOAT const l = wsamp_l[0][j];
1913 FLOAT const r = wsamp_l[1][j];
1914 wsamp_l[0][j] = (l + r) * (FLOAT) (SQRT2 * 0.5);
1915 wsamp_l[1][j] = (l - r) * (FLOAT) (SQRT2 * 0.5);
1916 }
1917 +#endif
1918 }
1919
1920 /*********************************************************************
1921 * compute energies
1922 *********************************************************************/
1923 +#ifdef ALTIVEC /* energy[k] = 0.5 * (w[k]^2 + w[N-k]^2), 16 bins per pass. NOTE(review): the scalar fallback wraps these values in NON_LINEAR_SCALE_ENERGY, this path does not -- confirm the macro is an identity */
1924 + vprev = vec_ld(0,(*wsamp_l)); /* lane 0 = w[0]; it is paired with itself, so bin 0 comes out as w[0]^2 */
1925 + for(j = 0; j < BLKSIZE/2; j += 16) {
1926 + v1 = vec_ld(0,(*wsamp_l)+j); /* forward half: w[j..j+15] in v1..v4 */
1927 + v2 = vec_ld(16,(*wsamp_l)+j);
1928 + v3 = vec_ld(32,(*wsamp_l)+j);
1929 + v4 = vec_ld(48,(*wsamp_l)+j);
1930 + v5 = vec_ld(48,(*wsamp_l)+1008-j); /* mirrored half, highest block first; 1008 is presumably BLKSIZE-16 -- confirm */
1931 + v6 = vec_ld(32,(*wsamp_l)+1008-j);
1932 + v7 = vec_ld(16,(*wsamp_l)+1008-j);
1933 + v8 = vec_ld(0,(*wsamp_l)+1008-j);
1934 + v9 = vec_perm(vprev,v5,vperm); /* lane 0 from the previous block, lanes 1-3 = lanes 3,2,1 of this block: mirrored samples in reverse order */
1935 + v10 = vec_perm(v5,v6,vperm);
1936 + v11 = vec_perm(v6,v7,vperm);
1937 + v12 = vec_perm(v7,v8,vperm);
1938 + vprev = v8; /* its lane 0 is the mirror partner of the next pass's first bin */
1939 + v1 = vec_madd(v1,v1,vzero); /* squares of the forward samples */
1940 + v2 = vec_madd(v2,v2,vzero);
1941 + v3 = vec_madd(v3,v3,vzero);
1942 + v4 = vec_madd(v4,v4,vzero);
1943 + v5 = vec_madd(v9,v9,v1); /* plus squares of the mirrored samples */
1944 + v6 = vec_madd(v10,v10,v2);
1945 + v7 = vec_madd(v11,v11,v3);
1946 + v8 = vec_madd(v12,v12,v4);
1947 + v9 = vec_madd(v5,vhalf,vzero); /* times 0.5 */
1948 + v10 = vec_madd(v6,vhalf,vzero);
1949 + v11 = vec_madd(v7,vhalf,vzero);
1950 + v12 = vec_madd(v8,vhalf,vzero);
1951 +
1952 + vec_st(v9,0,fftenergy+j);
1953 + vec_st(v10,16,fftenergy+j);
1954 + vec_st(v11,32,fftenergy+j);
1955 + vec_st(v12,48,fftenergy+j);
1956 + }
1957 + /* last bin (j == BLKSIZE/2 here): the square of the sample in lane 0 of vprev */
1958 + v1 = vec_madd(vprev,vprev,vzero);
1959 + vec_ste(v1,0,fftenergy+j); /* stores the single lane selected by the address; lane 0 if fftenergy is 16-byte aligned */
1960 +#else
1961 fftenergy[0] = NON_LINEAR_SCALE_ENERGY(wsamp_l[0][0]);
1962 fftenergy[0] *= fftenergy[0];
1963
1964 @@ -1485,13 +2037,51 @@
1965 FLOAT const im = (*wsamp_l)[BLKSIZE / 2 + j];
1966 fftenergy[BLKSIZE / 2 - j] = NON_LINEAR_SCALE_ENERGY((re * re + im * im) * 0.5f);
1967 }
1968 +#endif
1969 /* total energy */
1970 {
1971 +#ifdef ALTIVEC
1972 +#ifdef ALTIVEC_970
1973 + v5 = vec_ld(0,fftenergy+8);
1974 + v6 = vec_ld(0,fftenergy+508);
1975 + v7 = vec_ld(0,fftenergy+512);
1976 + v8 = vec_xor(v8,v8);
1977 + v5 = vec_sld(v5,v8,12);
1978 + v7 = vec_sld(v8,v7,4);
1979 +#else
1980 + v5 = vec_lde(0,fftenergy+11);
1981 + v6 = vec_ld(0,fftenergy+508);
1982 + v7 = vec_lde(0,fftenergy+512);
1983 + v8 = vec_xor(v8,v8);
1984 +#endif
1985 + for(j=12;j<508;j+=16) {
1986 + v1 = vec_ld(0,fftenergy+j);
1987 + v2 = vec_ld(16,fftenergy+j);
1988 + v3 = vec_ld(32,fftenergy+j);
1989 + v4 = vec_ld(48,fftenergy+j);
1990 + v5 = vec_add(v1,v5);
1991 + v6 = vec_add(v2,v6);
1992 + v7 = vec_add(v3,v7);
1993 + v8 = vec_add(v4,v8);
1994 + }
1995 + v5 = vec_add(v5,v6);
1996 + v7 = vec_add(v7,v8);
1997 + v5 = vec_add(v5,v7);
1998 + v6 = vec_sld(v5,v5,4);
1999 + v7 = vec_sld(v5,v5,8);
2000 + v8 = vec_sld(v5,v5,12);
2001 + v5 = vec_add(v5,v6);
2002 + v7 = vec_add(v7,v8);
2003 + v5 = vec_add(v5,v7);
2004 + v5 = vec_perm(v5,v5,vec_lvsr(0, gfc->tot_ener+chn));
2005 + vec_ste(v5,0,gfc->tot_ener+chn);
2006 +#else
2007 FLOAT totalenergy = 0.0;
2008 for (j = 11; j < HBLKSIZE; j++)
2009 totalenergy += fftenergy[j];
2010
2011 gfc->tot_ener[chn] = totalenergy;
2012 +#endif
2013 }
2014
2015 if (gfp->analysis) {
2016 @@ -1506,8 +2096,16 @@
2017
2018 static void
2019 vbrpsy_compute_fft_s(lame_global_flags const *gfp, const sample_t * buffer[2], int chn, int sblock,
2020 - FLOAT(*fftenergy_s)[HBLKSIZE_s], FLOAT(*wsamp_s)[3][BLKSIZE_s])
2021 + FLOAT(*fftenergy_s)[HBLKSIZE_s+3], FLOAT(*wsamp_s)[3][BLKSIZE_s])
2022 {
2023 +#ifdef ALTIVEC
2024 + vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,vhalf,vprev,vzero,vsqrt2;
2025 + vector unsigned char vperm;
2026 + vhalf = vec_ctf(vec_splat_s32(1),1); /* integer 1 scaled by 2^-1: every lane is 0.5f */
2027 + vsqrt2 = (vector float)VINIT4ALL(0.7071067811865001); /* stands in for the scalar path's (SQRT2 * 0.5) factor */
2028 + vzero = (vector float)vec_splat_s32(0); /* all-zero lanes; built from a literal so vzero is never read before it is set */
2029 + vperm = (vector unsigned char)VINIT16(0,1,2,3,28,29,30,31,24,25,26,27,20,21,22,23); /* picks word 0 of operand A, then words 3,2,1 of operand B */
2030 +#endif
2031 lame_internal_flags *const gfc = gfp->internal_flags;
2032 int j;
2033
2034 @@ -1516,17 +2114,78 @@
2035 }
2036 if (chn == 2) {
2037 /* FFT data for mid and side channel is derived from L & R */
2038 +#ifdef ALTIVEC
2039 + for(j = 0; j < BLKSIZE_s; j += 8) {
2040 + v1 = vec_ld(0,wsamp_s[0][sblock]+j);
2041 + v2 = vec_ld(0,wsamp_s[1][sblock]+j);
2042 + v3 = vec_ld(16,wsamp_s[0][sblock]+j);
2043 + v4 = vec_ld(16,wsamp_s[1][sblock]+j);
2044 +
2045 + v5 = vec_add(v1,v2);
2046 + v6 = vec_sub(v1,v2);
2047 + v7 = vec_add(v3,v4);
2048 + v8 = vec_sub(v3,v4);
2049 + v9 = vec_madd(v5,vsqrt2,vzero);
2050 + v10 = vec_madd(v6,vsqrt2,vzero);
2051 + v11 = vec_madd(v7,vsqrt2,vzero);
2052 + v12 = vec_madd(v8,vsqrt2,vzero);
2053 +
2054 + vec_st(v9,0,wsamp_s[0][sblock]+j);
2055 + vec_st(v10,0,wsamp_s[1][sblock]+j);
2056 + vec_st(v11,16,wsamp_s[0][sblock]+j);
2057 + vec_st(v12,16,wsamp_s[1][sblock]+j);
2058 + }
2059 +#else
2060 for (j = BLKSIZE_s - 1; j >= 0; --j) {
2061 FLOAT const l = wsamp_s[0][sblock][j];
2062 FLOAT const r = wsamp_s[1][sblock][j];
2063 wsamp_s[0][sblock][j] = (l + r) * (FLOAT) (SQRT2 * 0.5);
2064 wsamp_s[1][sblock][j] = (l - r) * (FLOAT) (SQRT2 * 0.5);
2065 }
2066 +#endif
2067 }
2068
2069 /*********************************************************************
2070 * compute energies
2071 *********************************************************************/
2072 +#ifdef ALTIVEC
2073 + vprev = vec_ld(0,(*wsamp_s)[sblock]);
2074 + for(j = 0; j < BLKSIZE_s/2; j += 16) {
2075 + v1 = vec_ld(0,(*wsamp_s)[sblock]+j);
2076 + v2 = vec_ld(16,(*wsamp_s)[sblock]+j);
2077 + v3 = vec_ld(32,(*wsamp_s)[sblock]+j);
2078 + v4 = vec_ld(48,(*wsamp_s)[sblock]+j);
2079 + v5 = vec_ld(48,(*wsamp_s)[sblock]+240-j);
2080 + v6 = vec_ld(32,(*wsamp_s)[sblock]+240-j);
2081 + v7 = vec_ld(16,(*wsamp_s)[sblock]+240-j);
2082 + v8 = vec_ld(0,(*wsamp_s)[sblock]+240-j);
2083 + v9 = vec_perm(vprev,v5,vperm);
2084 + v10 = vec_perm(v5,v6,vperm);
2085 + v11 = vec_perm(v6,v7,vperm);
2086 + v12 = vec_perm(v7,v8,vperm);
2087 + vprev = v8;
2088 + v1 = vec_madd(v1,v1,vzero);
2089 + v2 = vec_madd(v2,v2,vzero);
2090 + v3 = vec_madd(v3,v3,vzero);
2091 + v4 = vec_madd(v4,v4,vzero);
2092 + v5 = vec_madd(v9,v9,v1);
2093 + v6 = vec_madd(v10,v10,v2);
2094 + v7 = vec_madd(v11,v11,v3);
2095 + v8 = vec_madd(v12,v12,v4);
2096 + v9 = vec_madd(v5,vhalf,vzero);
2097 + v10 = vec_madd(v6,vhalf,vzero);
2098 + v11 = vec_madd(v7,vhalf,vzero);
2099 + v12 = vec_madd(v8,vhalf,vzero);
2100 +
2101 + vec_st(v9,0,fftenergy_s[sblock]+j);
2102 + vec_st(v10,16,fftenergy_s[sblock]+j);
2103 + vec_st(v11,32,fftenergy_s[sblock]+j);
2104 + vec_st(v12,48,fftenergy_s[sblock]+j);
2105 + }
2106 +
2107 + v1 = vec_madd(vprev,vprev,vzero);
2108 + vec_ste(v1,0,fftenergy_s[sblock]+j);
2109 +#else
2110 fftenergy_s[sblock][0] = (*wsamp_s)[sblock][0];
2111 fftenergy_s[sblock][0] *= fftenergy_s[sblock][0];
2112 for (j = BLKSIZE_s / 2 - 1; j >= 0; --j) {
2113 @@ -1535,6 +2194,7 @@
2114 fftenergy_s[sblock][BLKSIZE_s / 2 - j] =
2115 NON_LINEAR_SCALE_ENERGY((re * re + im * im) * 0.5f);
2116 }
2117 +#endif
2118 }
2119
2120
2121 @@ -1563,7 +2223,24 @@
2122 FLOAT energy[4], FLOAT sub_short_factor[4][3], int ns_attacks[4][4],
2123 int uselongblock[2])
2124 {
2125 - FLOAT ns_hpfsmpl[2][576];
2126 +#ifdef ALTIVEC
2127 + vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16;
2128 + vector float vsum,vsum1,vsum2,vsuma,vsumb,vsumc,vsumd,vmaska,vmaskb,vmaskc,vmaskd;
2129 + vector unsigned char vmask1,vmask2,vmask3,vmask4,vmask1inv,vmask2inv,vmask3inv,vmask4inv,vperm,vs4,vs8,vs12;
2130 +
2131 + vperm = (vector unsigned char)VINIT16(12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3);
2132 + v1 = (vector float)vec_splat_u8(1);
2133 + v2 = (vector float)vec_splat_u8(5);
2134 + vs4 = vec_sl((vector unsigned char)v1,(vector unsigned char)v2);
2135 + vs8 = vec_sl(vs4,(vector unsigned char)v1);
2136 + vs12 = vec_or(vs4,vs8);
2137 + v3 = (vector float)vec_splat_s32(-1);
2138 + vmaska = vec_slo(v3,vs12);
2139 + vmaskb = vec_sro(vmaska,vs4);
2140 + vmaskc = vec_sro(vmaska,vs8);
2141 + vmaskd = vec_sro(vmaska,vs12);
2142 +#endif
2143 + FLOAT ns_hpfsmpl[2][576] __attribute__ ((aligned (16)));
2144 lame_internal_flags *const gfc = gfp->internal_flags;
2145 int const n_chn_out = gfc->channels_out;
2146 /* chn=2 and 3 = Mid and Side channels */
2147 @@ -1572,14 +2249,142 @@
2148 /* Don't copy the input buffer into a temporary buffer */
2149 /* unroll the loop 2 times */
2150 for (chn = 0; chn < n_chn_out; chn++) {
2151 - static const FLOAT fircoef[] = {
2152 + static const FLOAT fircoef[] __attribute__ ((aligned (16))) = {
2153 -8.65163e-18 * 2, -0.00851586 * 2, -6.74764e-18 * 2, 0.0209036 * 2,
2154 -3.36639e-17 * 2, -0.0438162 * 2, -1.54175e-17 * 2, 0.0931738 * 2,
2155 - -5.52212e-17 * 2, -0.313819 * 2
2156 + -5.52212e-17 * 2, -0.313819 * 2, 0.0, 0.0
2157 };
2158 /* apply high pass filter of fs/4 */
2159 const sample_t *const firbuf = &buffer[chn][576 - 350 - NSFIRLEN + 192];
2160 - assert(dimension_of(fircoef) == ((NSFIRLEN - 1) / 2));
2161 + //assert(dimension_of(fircoef) == ((NSFIRLEN - 1) / 2));
2162 +#ifdef ALTIVEC
2163 + v1 = vec_ld(0, firbuf+10);
2164 + vmask1 = vec_lvsl(0, firbuf);
2165 + vmask2 = vec_lvsl(0, firbuf+1);
2166 + vmask3 = vec_lvsl(0, firbuf+2);
2167 + vmask4 = vec_lvsl(0, firbuf+3);
2168 + vmask1inv = vec_perm(vmask1,vmask1,vperm);
2169 + vmask2inv = vec_perm(vmask2,vmask2,vperm);
2170 + vmask3inv = vec_perm(vmask3,vmask3,vperm);
2171 + vmask4inv = vec_perm(vmask4,vmask4,vperm);
2172 + for(i=0;i<576;) {
2173 + v2 = vec_ld(16,firbuf+i+10);
2174 + vsum1 = vec_perm(v1, v2, vmask3);
2175 + v1 = v2;
2176 +
2177 + vsum2 = vec_splat(vsum1, 0);
2178 + vsum = vec_and(vsum2, vmaska);
2179 + v3 = vec_ld(0, firbuf+i);
2180 + v4 = vec_ld(16,firbuf+i+NSFIRLEN-3);
2181 + for(j=0;j<(NSFIRLEN-1)/2;j+=4) {
2182 + v5 = vec_ld(16, firbuf+i+j);
2183 + v6 = vec_ld(0, firbuf+i+NSFIRLEN-3-j);
2184 + v7 = vec_perm(v3,v5,vmask1);
2185 + v8 = vec_perm(v6,v4,vmask3inv);
2186 + v3 = v5;
2187 + v4 = v6;
2188 + v10 = vec_ld(0,fircoef+j);
2189 + v11 = vec_add(v7,v8);
2190 + vsum = vec_madd(v10,v11,vsum);
2191 + }
2192 +
2193 + v12 = vec_slo(vsum,vs4);
2194 + v13 = vec_slo(vsum,vs8);
2195 + v14 = vec_slo(vsum,vs12);
2196 + v15 = vec_add(vsum,v12);
2197 + v16 = vec_add(v13,v14);
2198 + vsuma = vec_add(v15,v16);
2199 + vsuma = vec_and(vsuma,vmaska);
2200 +
2201 + i++;
2202 +
2203 + vsum2 = vec_splat(vsum1, 1);
2204 + vsum = vec_and(vsum2, vmaska);
2205 + v3 = vec_ld(0, firbuf+i);
2206 + v4 = vec_ld(16,firbuf+i+NSFIRLEN-3);
2207 + vmask2 = vec_lvsl(0, firbuf+i);
2208 + for(j=0;j<(NSFIRLEN-1)/2;j+=4) {
2209 + v5 = vec_ld(16, firbuf+i+j);
2210 + v6 = vec_ld(0, firbuf+i+NSFIRLEN-3-j);
2211 + v7 = vec_perm(v3,v5,vmask2);
2212 + v8 = vec_perm(v6,v4,vmask4inv);
2213 + v3 = v5;
2214 + v4 = v6;
2215 + v10 = vec_ld(0,fircoef+j);
2216 + v11 = vec_add(v7,v8);
2217 + vsum = vec_madd(v10,v11,vsum);
2218 + }
2219 +
2220 + v12 = vec_sro(vsum,vs4);
2221 + v13 = vec_slo(vsum,vs4);
2222 + v14 = vec_slo(vsum,vs8);
2223 + v15 = vec_add(vsum,v12);
2224 + v16 = vec_add(v13,v14);
2225 + vsumb = vec_add(v15,v16);
2226 + vsumb = vec_and(vsumb,vmaskb);
2227 +
2228 + i++;
2229 +
2230 + vsum2 = vec_splat(vsum1, 2);
2231 + vsum = vec_and(vsum2, vmaska);
2232 + v3 = vec_ld(0, firbuf+i);
2233 + v4 = vec_ld(16,firbuf+i+NSFIRLEN-3);
2234 + vmask2 = vec_lvsl(0, firbuf+i);
2235 + for(j=0;j<(NSFIRLEN-1)/2;j+=4) {
2236 + v5 = vec_ld(16, firbuf+i+j);
2237 + v6 = vec_ld(0, firbuf+i+NSFIRLEN-3-j);
2238 + v7 = vec_perm(v3,v5,vmask3);
2239 + v8 = vec_perm(v6,v4,vmask1inv);
2240 + v3 = v5;
2241 + v4 = v6;
2242 + v10 = vec_ld(0,fircoef+j);
2243 + v11 = vec_add(v7,v8);
2244 + vsum = vec_madd(v10,v11,vsum);
2245 + }
2246 +
2247 + v12 = vec_sro(vsum,vs4);
2248 + v13 = vec_sro(vsum,vs8);
2249 + v14 = vec_slo(vsum,vs4);
2250 + v15 = vec_add(vsum,v12);
2251 + v16 = vec_add(v13,v14);
2252 + vsumc = vec_add(v15,v16);
2253 + vsumc = vec_and(vsumc,vmaskc);
2254 +
2255 + i++;
2256 +
2257 + vsum2 = vec_splat(vsum1, 3);
2258 + vsum = vec_and(vsum2, vmaska);
2259 + v3 = vec_ld(0, firbuf+i);
2260 + v4 = vec_ld(16,firbuf+i+NSFIRLEN-3);
2261 + vmask2 = vec_lvsl(0, firbuf+i);
2262 + for(j=0;j<(NSFIRLEN-1)/2;j+=4) {
2263 + v5 = vec_ld(16, firbuf+i+j);
2264 + v6 = vec_ld(0, firbuf+i+NSFIRLEN-3-j);
2265 + v7 = vec_perm(v3,v5,vmask4);
2266 + v8 = vec_perm(v6,v4,vmask2inv);
2267 + v3 = v5;
2268 + v4 = v6;
2269 + v10 = vec_ld(0,fircoef+j);
2270 + v11 = vec_add(v7,v8);
2271 + vsum = vec_madd(v10,v11,vsum);
2272 + }
2273 +
2274 + v12 = vec_sro(vsum,vs4);
2275 + v13 = vec_sro(vsum,vs8);
2276 + v14 = vec_sro(vsum,vs12);
2277 + v15 = vec_add(vsum,v12);
2278 + v16 = vec_add(v13,v14);
2279 + vsumd = vec_add(v15,v16);
2280 + vsumd = vec_and(vsumd,vmaskd);
2281 +
2282 + vsum1 = vec_or(vsuma,vsumb);
2283 + vsum2 = vec_or(vsumc,vsumd);
2284 + vsum = vec_or(vsum1,vsum2);
2285 +
2286 + i++;
2287 + vec_st(vsum,0,ns_hpfsmpl[chn]+i-4);
2288 + }
2289 +#else
2290 for (i = 0; i < 576; i++) {
2291 FLOAT sum1, sum2;
2292 sum1 = firbuf[i + 10];
2293 @@ -1590,6 +2395,7 @@
2294 }
2295 ns_hpfsmpl[chn][i] = sum1 + sum2;
2296 }
2297 +#endif
2298 masking_ratio[gr_out][chn].en = gfc->en[chn];
2299 masking_ratio[gr_out][chn].thm = gfc->thm[chn];
2300 if (n_chn_psy > 2) {
2301 @@ -1630,9 +2436,28 @@
2302 for (i = 0; i < 9; i++) {
2303 FLOAT const *const pfe = pf + 576 / 9;
2304 FLOAT p = 1.;
2305 +#ifdef ALTIVEC
2306 + FLOAT vmax[4] __attribute__ ((aligned (16)));
2307 + v1 = (vector float)vec_splat_s32(1);
2308 + v2 = vec_ctf((vector signed int)v1,0);
2309 + for (; pf < pfe; pf+=4) {
2310 + v3 = vec_ld(0,pf);
2311 + v4 = vec_abs(v3);
2312 + v2 = vec_max(v2,v4);
2313 + }
2314 + v5 = vec_slo(v2,vs4);
2315 + v6 = vec_slo(v2,vs8);
2316 + v7 = vec_slo(v2,vs12);
2317 + v8 = vec_max(v2,v5);
2318 + v9 = vec_max(v6,v7);
2319 + v10 =vec_max(v8,v9);
2320 + vec_st(v10,0,vmax);
2321 + p = vmax[0];
2322 +#else
2323 for (; pf < pfe; pf++)
2324 if (p < fabs(*pf))
2325 p = fabs(*pf);
2326 +#endif
2327
2328 gfc->nsPsy.last_en_subshort[chn][i] = en_subshort[i + 3] = p;
2329 en_short[1 + i / 3] += p;
2330 @@ -1832,7 +2657,7 @@
2331
2332
2333 static void
2334 -vbrpsy_compute_masking_s(lame_global_flags const *gfp, FLOAT(*fftenergy_s)[HBLKSIZE_s], FLOAT * eb,
2335 +vbrpsy_compute_masking_s(lame_global_flags const *gfp, FLOAT(*fftenergy_s)[HBLKSIZE_s+3], FLOAT * eb,
2336 FLOAT * thr, int chn, int sblock)
2337 {
2338 lame_internal_flags *const gfc = gfp->internal_flags;
2339 @@ -1935,21 +2760,375 @@
2340 vbrpsy_compute_masking_l(lame_internal_flags * gfc, FLOAT fftenergy[HBLKSIZE], FLOAT eb_l[CBANDS],
2341 FLOAT thr[CBANDS], int chn)
2342 {
2343 - FLOAT max[CBANDS], avg[CBANDS];
2344 - unsigned char mask_idx_l[CBANDS + 2];
2345 + FLOAT max[CBANDS] __attribute__ ((aligned (16))), avg[CBANDS];
2346 + unsigned char mask_idx_l[CBANDS + 2] __attribute__ ((aligned (16)));
2347 int k, b;
2348 +#ifdef ALTIVEC
2349 + float tmp[4] __attribute__ ((aligned (16))); /* aligned scratch used to move scalars into a vector via vec_ld */
2350 + const vector unsigned char v31 = (vector unsigned char)VINIT16ALL(31);
2351 + const vector unsigned int vmask1 = (vector unsigned int)VINIT4ALL(0xff);
2352 + const vector signed int vone = (vector signed int)VINIT4ALL(1);
2353 + const vector unsigned int vtab1 = (vector unsigned int)VINIT4(0x3f800000,0x3f4b5936,0x3f218698,0x3f218698); /* raw 32-bit patterns (0x3f800000 is 1.0f); reinterpreted as floats after the vec_perm lookups */
2354 + const vector unsigned int vtab2 = (vector unsigned int)VINIT4(0x3f218698,0x3f218698,0x3f218698,0x3e809bfa);
2355 + const vector unsigned int vtab3 = (vector unsigned int)VINIT4(0x3df09e99,0,0,0);
2356 + const vector unsigned int vtable1 = (vector unsigned int)VINIT4(0x3fe39e89,0x3fec53e5,0x3ff55ea7,0x3ff9149b);
2357 + const vector unsigned int vtable2 = (vector unsigned int)VINIT4(0x3ffcd90e,0x3fea8f7b,0x3fd997da,0x3fbf84e2);
2358 + const vector unsigned int vtable3 = (vector unsigned int)VINIT4(0x3fa8917c,0x3f800000,0,0);
2359 + const vector float vzero = (vector float)vec_splat_s32(0); /* all-zero lanes from a literal; the initializer no longer reads vzero itself */
2360 +#endif
2361
2362 /*********************************************************************
2363 * Calculate the energy and the tonality of each partition.
2364 *********************************************************************/
2365 calc_energy(gfc, fftenergy, eb_l, max, avg);
2366 calc_mask_index_l(gfc, max, avg, mask_idx_l);
2367 +#ifdef ALTIVEC
2368 + const vector unsigned char vmaskidx1 = vec_ld(0,mask_idx_l); //needs to be aligned
2369 + const vector unsigned char vmaskidx2 = vec_ld(16,mask_idx_l);
2370 + const vector unsigned char vmaskidx3 = vec_ld(32,mask_idx_l);
2371 + const vector unsigned char vmaskidx4 = vec_ld(48,mask_idx_l);
2372 + tmp[0] = gfc->masking_lower;
2373 + vector float vmasking_lower = vec_ld(0,tmp);
2374 + vmasking_lower = vec_splat(vmasking_lower,0);
2375 +#endif
2376
2377 /*********************************************************************
2378 * convolve the partitioned energy and unpredictability
2379 * with the spreading function, s3_l[b][k]
2380 ********************************************************************/
2381 k = 0;
2382 +#ifdef ALTIVEC
2383 + for (b = 0; b < gfc->npart_l-3; b+=4) {
2384 + vector signed int v1,v2,v3,v4,v5,vkk,vkk2,vlast,vdd,vdd_n,vk,vk2;
2385 + vector float vf1,vf2,vf3,vf4,vecb,vx,veb,vavgmask;
2386 + int tmp2[4] __attribute__ ((aligned (16)));
2387 + int tmp3[4] __attribute__ ((aligned (16)));
2388 +
2389 + v1 = vec_ld(0,gfc->s3ind[b]); //needs to be aligned
2390 + v2 = vec_ld(0,gfc->s3ind[b+1]);
2391 + v3 = vec_ld(0,gfc->s3ind[b+2]);
2392 + v4 = vec_ld(0,gfc->s3ind[b+3]);
2393 + v1 = vec_mergeh(v1,v3);
2394 + v2 = vec_mergeh(v2,v4);
2395 + vkk = vec_mergeh(v1,v2);
2396 + vlast = vec_mergel(v1,v2);
2397 +
2398 + v1 = vec_sub(vlast,vkk);
2399 + v1 = vec_sel(v1,(vector signed int)vzero,vec_cmpgt((vector signed int)vzero,v1));
2400 + vec_st(v1,0,tmp2);
2401 +
2402 + tmp3[0] = k;
2403 + tmp3[1] = k+tmp2[0]+1;
2404 + tmp3[2] = k+tmp2[0]+tmp2[1]+2;
2405 + tmp3[3] = k+tmp2[0]+tmp2[1]+tmp2[2]+3;
2406 + k = k+tmp2[0]+tmp2[1]+tmp2[2]+tmp2[3]+4;
2407 + vk = vec_ld(0,tmp3);
2408 +
2409 + v1 = (vector signed int)vec_perm(vmaskidx1,vmaskidx2,(vector unsigned char)vkk);
2410 + v2 = (vector signed int)vec_perm(vmaskidx3,vmaskidx4,(vector unsigned char)vkk);
2411 + vdd = vec_sel(v1,v2,vec_cmpgt(vkk,(vector signed int)VINIT4ALL(31)));
2412 + vdd = vec_and(vdd,(vector signed int)vmask1);
2413 + vdd_n = vone;
2414 +
2415 + tmp[0] = gfc->s3_ll[tmp3[0]];
2416 + tmp[1] = gfc->s3_ll[tmp3[1]];
2417 + tmp[2] = gfc->s3_ll[tmp3[2]];
2418 + tmp[3] = gfc->s3_ll[tmp3[3]];
2419 + vf1 = vec_ld(0,tmp);
2420 +
2421 + vec_st(vkk,0,tmp2);
2422 + tmp[0] = eb_l[tmp2[0]];
2423 + tmp[1] = eb_l[tmp2[1]];
2424 + tmp[2] = eb_l[tmp2[2]];
2425 + tmp[3] = eb_l[tmp2[3]];
2426 + veb = vec_ld(0,tmp);
2427 +
2428 + vecb = vec_madd(vf1,veb,vzero);
2429 +
2430 + v1 = vec_sl(vdd,vec_splat_u32(2));
2431 + v2 = vec_add(v1,vec_splat_s32(1));
2432 + v3 = vec_add(v1,vec_splat_s32(2));
2433 + v4 = vec_add(v2,vec_splat_s32(2));
2434 + v1 = vec_sl(v1,vec_splat_u32(-8));
2435 + v2 = vec_sl(v2,vec_splat_u32(-16));
2436 + v3 = vec_sl(v3,vec_splat_u32(8));
2437 + v1 = vec_or(v1,v2);
2438 + v3 = vec_or(v3,v4);
2439 + v1 = vec_or(v1,v3);
2440 +
2441 + vf1 = (vector float)vec_perm(vtab1,vtab2,(vector unsigned char)v1);
2442 + vf2 = (vector float)vec_perm(vtab3,vtab2,(vector unsigned char)v1);
2443 + vf2 = vec_sel(vf1,vf2,(vector unsigned int)vec_cmpgt((vector unsigned char)v1,v31));
2444 + vecb = vec_madd(vecb,vf2,vzero);
2445 +
2446 + vkk = vec_add(vkk,vone);
2447 + vk = vec_add(vk,vone);
2448 + while(vec_any_le(vkk,vlast)) {
2449 + vkk2 = vec_sel(vkk,vlast,vec_cmpgt(vkk,vlast));
2450 + vk2 = vec_sel(vk,(vector signed int)vzero,vec_cmpgt(vkk,vlast));
2451 + v1 = (vector signed int)vec_perm(vmaskidx1,vmaskidx2,(vector unsigned char)vkk2);
2452 + v2 = (vector signed int)vec_perm(vmaskidx3,vmaskidx4,(vector unsigned char)vkk2);
2453 + v1 = vec_sel(v1,v2,vec_cmpgt(vkk2,(vector signed int)VINIT4ALL(31)));
2454 + v1 = vec_and(v1,(vector signed int)vmask1);
2455 + v2 = (vector signed int)vec_cmpgt(vkk,vlast);
2456 + v2 = vec_nor(v2,v2);
2457 + v5 = vec_and(v1,v2);
2458 + v2 = vec_and(vone,v2);
2459 + vdd = vec_add(vdd,v5);
2460 + vdd_n = vec_add(vdd_n,v2);
2461 +
2462 + vec_st(vk2,0,tmp2);
2463 + tmp[0] = gfc->s3_ll[tmp2[0]];
2464 + tmp[1] = gfc->s3_ll[tmp2[1]];
2465 + tmp[2] = gfc->s3_ll[tmp2[2]];
2466 + tmp[3] = gfc->s3_ll[tmp2[3]];
2467 + vf1 = vec_ld(0,tmp);
2468 +
2469 + vec_st(vkk,0,tmp2);
2470 + tmp[0] = eb_l[tmp2[0]];
2471 + tmp[1] = eb_l[tmp2[1]];
2472 + tmp[2] = eb_l[tmp2[2]];
2473 + tmp[3] = eb_l[tmp2[3]];
2474 + veb = vec_ld(0,tmp);
2475 +
2476 + vx = vec_madd(vf1,veb,vzero);
2477 +
2478 + v1 = vec_sl(v5,vec_splat_u32(2));
2479 + v2 = vec_add(v1,vec_splat_s32(1));
2480 + v3 = vec_add(v1,vec_splat_s32(2));
2481 + v4 = vec_add(v2,vec_splat_s32(2));
2482 + v1 = vec_sl(v1,vec_splat_u32(-8));
2483 + v2 = vec_sl(v2,vec_splat_u32(-16));
2484 + v3 = vec_sl(v3,vec_splat_u32(8));
2485 + v1 = vec_or(v1,v2);
2486 + v3 = vec_or(v3,v4);
2487 + v1 = vec_or(v1,v3);
2488 +
2489 + vf1 = (vector float)vec_perm(vtab1,vtab2,(vector unsigned char)v1);
2490 + vf2 = (vector float)vec_perm(vtab3,vtab2,(vector unsigned char)v1);
2491 + vf2 = vec_sel(vf1,vf2,(vector unsigned int)vec_cmpgt((vector unsigned char)v1,v31));
2492 + vx = vec_madd(vx,vf2,vzero);
2493 + {
2494 + vector float vratio,vout,vf5;
2495 + vf1 = vec_sel(vecb,vzero,vec_cmplt(vecb,vzero));
2496 + vf2 = vec_sel(vx,vzero,vec_cmplt(vx,vzero));
2497 + vf3 = vec_sel(vf1,vf2,vec_cmpgt(vf2,vf1));
2498 + vf4 = vec_sel(vf2,vf1,vec_cmpgt(vf2,vf1));
2499 + vf5 = vec_re(vf4);
2500 + vratio = vec_madd(vf3,vec_madd(vec_nmsub(vf4,vf5,(vector float)VINIT4ALL(1.0)),vf5,vf5),vzero);
2501 +
2502 + tmp2[0] = b;
2503 + tmp2[1] = b+1;
2504 + tmp2[2] = b+2;
2505 + tmp2[3] = b+3;
2506 + v1 = vec_ld(0,tmp2);
2507 + v1 = vec_sub(vkk2,v1);
2508 + v2 = (vector signed int)vec_cmplt(v1,(vector signed int)VINIT4ALL(-2));
2509 + v3 = (vector signed int)vec_cmpgt(v1,(vector signed int)VINIT4ALL(2));
2510 + v5 = vec_or(v2,v3);
2511 + v3 = (vector signed int)vec_cmpge(vratio,vmamax1);
2512 +
2513 + vf4 = vec_add(vf1,vf2);
2514 + if(vec_any_eq(vec_or(v5,v3),(vector signed int)vzero)) {
2515 + vf3 = fast_log10_altivec_2(vratio);
2516 + v1 = vec_cts(vf3,4);
2517 + v1 = vec_sl(v1,vec_splat_u32(2));
2518 + v2 = vec_add(v1,vec_splat_s32(1));
2519 + v3 = vec_add(v1,vec_splat_s32(2));
2520 + v4 = vec_add(v2,vec_splat_s32(2));
2521 + v1 = vec_sl(v1,vec_splat_u32(-8));
2522 + v2 = vec_sl(v2,vec_splat_u32(-16));
2523 + v3 = vec_sl(v3,vec_splat_u32(8));
2524 + v1 = vec_or(v1,v2);
2525 + v3 = vec_or(v3,v4);
2526 + v1 = vec_or(v1,v3);
2527 + vf3 = (vector float)vec_perm(vtable1,vtable2,(vector unsigned char)v1);
2528 + vf5 = (vector float)vec_perm(vtable3,vtable2,(vector unsigned char)v1);
2529 + vf5 = vec_sel(vf3,vf5,(vector unsigned int)vec_cmpgt((vector unsigned char)v1,v31));
2530 + vf5 = vec_madd(vf4,vf5,vzero);
2531 + vf5 = vec_sel(vf5,vf4,vec_cmpge(vratio,vmamax1));
2532 + }
2533 + else vf5 = vf4;
2534 +
2535 + vout = vec_sel(vf1,vf2,vec_cmpgt(vf2,vf1));
2536 + vout = vec_sel(vout,vf4,vec_cmpgt(vmamax2,vratio));
2537 + vout = vec_sel(vf5,vout,(vector unsigned int)v5);
2538 + vout = vec_sel(vout,vecb,(vector unsigned int)vec_cmple(vx,vzero));
2539 + vout = vec_sel(vout,vx,(vector unsigned int)vec_cmple(vecb,vzero));
2540 + vecb = vec_sel(vout,vecb,vec_cmpgt(vkk,vlast));
2541 + }
2542 + vkk = vec_add(vkk,vone);
2543 + vk = vec_add(vk,vone);
2544 + }
2545 + vdd = vec_sl(vdd,(vector unsigned int)vone);
2546 + vdd_n = vec_sl(vdd_n,(vector unsigned int)vone);
2547 + vdd = vec_add(vdd,vone);
2548 + vf1 = vec_ctf(vdd,0);
2549 + vf2 = vec_ctf(vdd_n,0);
2550 + vf2 = vec_re(vf2);
2551 + vf1 = vec_madd(vf1,vf2,vzero);
2552 + vdd = vec_cts(vf1,0);
2553 +
2554 + v1 = vec_sl(vdd,vec_splat_u32(2));
2555 + v2 = vec_add(v1,vec_splat_s32(1));
2556 + v3 = vec_add(v1,vec_splat_s32(2));
2557 + v4 = vec_add(v2,vec_splat_s32(2));
2558 + v1 = vec_sl(v1,vec_splat_u32(-8));
2559 + v2 = vec_sl(v2,vec_splat_u32(-16));
2560 + v3 = vec_sl(v3,vec_splat_u32(8));
2561 + v1 = vec_or(v1,v2);
2562 + v3 = vec_or(v3,v4);
2563 + v1 = vec_or(v1,v3);
2564 +
2565 + vf1 = (vector float)vec_perm(vtab1,vtab2,(vector unsigned char)v1);
2566 + vf2 = (vector float)vec_perm(vtab3,vtab2,(vector unsigned char)v1);
2567 + vf1 = vec_sel(vf1,vf2,(vector unsigned int)vec_cmpgt((vector unsigned char)v1,v31));
2568 + vf2 = vec_ctf(vone,1);
2569 + vavgmask = vec_madd(vf1,vf2,vzero);
2570 + vecb = vec_madd(vecb,vavgmask,vzero);
2571 +
2572 + vf4 = vec_ld(0,eb_l+b); //needs to be aligned
2573 + if (gfc->blocktype_old[chn & 0x01] == SHORT_TYPE) {
2574 + vf1 = vec_ld(0,gfc->nb_1[chn]+b); //needs to be aligned
2575 + vf3 = vec_madd(vf1,(vector float)VINIT4ALL(rpelev),vzero); /* limit = rpelev * nb_1 */
2576 + vf2 = vec_madd(vf4,(vector float)VINIT4ALL(NS_PREECHO_ATT2),vzero); /* fallback = eb_l * NS_PREECHO_ATT2 */
2577 + vf3 = vec_sel(vf2,vf3,vec_cmpgt(vf3,vzero)); /* keep the limit where it is > 0, else take the fallback */
2578 + vf3 = vec_min(vecb,vf3);
2579 + /* vf3 now holds min(ecb, limit); it is stored to thr+b only after the later clamps */
2580 + }
2581 + else {
2582 + vf1 = vec_ld(0,gfc->nb_1[chn]+b); //needs to be aligned
2583 + vf2 = vec_ld(0,gfc->nb_2[chn]+b); //needs to be aligned
2584 + vf3 = vec_madd(vf1,(vector float)VINIT4ALL(rpelev),vzero); /* limit_1 = rpelev * nb_1 */
2585 + vf2 = vec_madd(vf2,(vector float)VINIT4ALL(rpelev2),vzero); /* limit_2 = rpelev2 * nb_2 */
2586 + vf3 = vec_sel(vecb,vf3,vec_cmpgt(vf3,vzero)); /* limit_1 <= 0: fall back to ecb (not 0), matching the scalar tail loop */
2587 + vf2 = vec_sel(vecb,vf2,vec_cmpgt(vf2,vzero)); /* limit_2 <= 0: fall back to ecb (not 0), matching the scalar tail loop */
2588 + if (gfc->blocktype_old[chn & 0x01] == NORM_TYPE) {
2589 + vf3 = vec_min(vf3,vf2);
2590 + }
2591 + vf3 = vec_min(vecb,vf3);
2592 + /* vf3 now holds min(ecb, limit); it is stored to thr+b only after the later clamps */
2593 + }
2594 + vec_st(vf1,0,gfc->nb_2[chn]+b); //needs to be aligned
2595 + vec_st(vecb,0,gfc->nb_1[chn]+b); //needs to be aligned
2596 + {
2597 + vx = vec_ld(0,max+b); //needs to be aligned
2598 + vf1 = vec_ld(0,gfc->minval_l+b);
2599 + vx = vec_madd(vx,vf1,vzero);
2600 + vx = vec_madd(vx,vavgmask,vzero);
2601 + vf3 = vec_sel(vf3,vx,vec_cmpgt(vf3,vx));
2602 + //vec_st(vf3,0,thr+b); //needs to be aligned
2603 + }
2604 + if (gfc->masking_lower > 1) {
2605 + vf3 = vec_madd(vf3,vmasking_lower,vzero);
2606 + }
2607 + vf3 = vec_sel(vf3,vf4,vec_cmpgt(vf3,vf4));
2608 + if (gfc->masking_lower < 1) {
2609 + vf3 = vec_madd(vf3,vmasking_lower,vzero);
2610 + }
2611 + vec_st(vf3,0,thr+b); //needs to be aligned
2612 + }
2613 + for (; b < gfc->npart_l; b++) {
2614 + FLOAT x, ecb, avg_mask, t;
2615 + /* convolve the partitioned energy with the spreading function */
2616 + int kk = gfc->s3ind[b][0];
2617 + int const last = gfc->s3ind[b][1];
2618 + int dd = 0, dd_n = 0;
2619 + dd = mask_idx_l[kk];
2620 + dd_n += 1;
2621 + ecb = gfc->s3_ll[k] * eb_l[kk] * tab[mask_idx_l[kk]];
2622 + ++k, ++kk;
2623 + while (kk <= last) {
2624 + dd += mask_idx_l[kk];
2625 + dd_n += 1;
2626 + x = gfc->s3_ll[k] * eb_l[kk] * tab[mask_idx_l[kk]];
2627 + t = vbrpsy_mask_add(ecb, x, kk - b);
2628 +#if 0
2629 + ecb += eb_l[kk];
2630 + if (ecb > t) {
2631 + ecb = t;
2632 + }
2633 +#else
2634 + ecb = t;
2635 +#endif
2636 + ++k, ++kk;
2637 + }
2638 + dd = (1 + 2 * dd) / (2 * dd_n);
2639 + avg_mask = tab[dd] * 0.5;
2640 + ecb *= avg_mask;
2641 +
2642 + /**** long block pre-echo control ****/
2643 + /* dont use long block pre-echo control if previous granule was
2644 + * a short block. This is to avoid the situation:
2645 + * frame0: quiet (very low masking)
2646 + * frame1: surge (triggers short blocks)
2647 + * frame2: regular frame. looks like pre-echo when compared to
2648 + * frame0, but all pre-echo was in frame1.
2649 + */
2650 + /* chn=0,1 L and R channels
2651 + chn=2,3 S and M channels.
2652 + */
2653 + if (gfc->blocktype_old[chn & 0x01] == SHORT_TYPE) {
2654 + FLOAT const ecb_limit = rpelev * gfc->nb_1[chn][b];
2655 + if (ecb_limit > 0) {
2656 + thr[b] = Min(ecb, ecb_limit);
2657 + }
2658 + else {
2659 + /* Robert 071209:
2660 + Because we don't calculate long block psy when we know a granule
2661 + should be of short blocks, we don't have any clue how the granule
2662 + before would have looked like as a long block. So we have to guess
2663 + a little bit for this END_TYPE block.
2664 + Most of the time we get away with this sloppyness. (fingers crossed :)
2665 + The speed increase is worth it.
2666 + */
2667 + thr[b] = Min(ecb, eb_l[b] * NS_PREECHO_ATT2);
2668 + }
2669 + }
2670 + else {
2671 + FLOAT ecb_limit_2 = rpelev2 * gfc->nb_2[chn][b];
2672 + FLOAT ecb_limit_1 = rpelev * gfc->nb_1[chn][b];
2673 + FLOAT ecb_limit;
2674 + if (ecb_limit_2 <= 0) {
2675 + ecb_limit_2 = ecb;
2676 + }
2677 + if (ecb_limit_1 <= 0) {
2678 + ecb_limit_1 = ecb;
2679 + }
2680 + if (gfc->blocktype_old[chn & 0x01] == NORM_TYPE) {
2681 + ecb_limit = Min(ecb_limit_1, ecb_limit_2);
2682 + }
2683 + else {
2684 + ecb_limit = ecb_limit_1;
2685 + }
2686 + thr[b] = Min(ecb, ecb_limit);
2687 + }
2688 + gfc->nb_2[chn][b] = gfc->nb_1[chn][b];
2689 + gfc->nb_1[chn][b] = ecb;
2690 + {
2691 + /* if THR exceeds EB, the quantization routines will take the difference
2692 + * from other bands. in case of strong tonal samples (tonaltest.wav)
2693 + * this leads to heavy distortions. that's why we limit THR here.
2694 + */
2695 + x = max[b];
2696 + x *= gfc->minval_l[b];
2697 + x *= avg_mask;
2698 + if (thr[b] > x) {
2699 + thr[b] = x;
2700 + }
2701 + }
2702 + if (gfc->masking_lower > 1) {
2703 + thr[b] *= gfc->masking_lower;
2704 + }
2705 + if (thr[b] > eb_l[b]) {
2706 + thr[b] = eb_l[b];
2707 + }
2708 + if (gfc->masking_lower < 1) {
2709 + thr[b] *= gfc->masking_lower;
2710 + }
2711 + assert(thr[b] >= 0);
2712 + }
2713 +#else
2714 for (b = 0; b < gfc->npart_l; b++) {
2715 FLOAT x, ecb, avg_mask, t;
2716 /* convolve the partitioned energy with the spreading function */
2717 @@ -2050,6 +3229,7 @@
2718 }
2719 assert(thr[b] >= 0);
2720 }
2721 +#endif
2722 for (; b < CBANDS; ++b) {
2723 eb_l[b] = 0;
2724 thr[b] = 0;
2725 @@ -2201,11 +3381,11 @@
2726 /* fft and energy calculation */
2727 FLOAT(*wsamp_l)[BLKSIZE];
2728 FLOAT(*wsamp_s)[3][BLKSIZE_s];
2729 - FLOAT fftenergy[HBLKSIZE];
2730 - FLOAT fftenergy_s[3][HBLKSIZE_s];
2731 - FLOAT wsamp_L[2][BLKSIZE];
2732 - FLOAT wsamp_S[2][3][BLKSIZE_s];
2733 - FLOAT eb[4][CBANDS], thr[4][CBANDS];
2734 + FLOAT fftenergy[HBLKSIZE] __attribute__ ((aligned (16)));
2735 + FLOAT fftenergy_s[3][HBLKSIZE_s+3] __attribute__ ((aligned (16)));
2736 + FLOAT wsamp_L[2][BLKSIZE] __attribute__ ((aligned (16)));
2737 + FLOAT wsamp_S[2][3][BLKSIZE_s] __attribute__ ((aligned (16)));
2738 + FLOAT eb[4][CBANDS] __attribute__ ((aligned (16))), thr[4][CBANDS] __attribute__ ((aligned (16)));
2739
2740 FLOAT sub_short_factor[4][3];
2741 FLOAT thmm;
2742 @@ -2653,7 +3833,7 @@
2743
2744 static int
2745 init_s3_values(FLOAT ** p,
2746 - int (*s3ind)[2], int npart, FLOAT const *bval, FLOAT const *bval_width,
2747 + int (*s3ind)[4], int npart, FLOAT const *bval, FLOAT const *bval_width,
2748 FLOAT const *norm, int use_old_s3)
2749 {
2750 FLOAT s3[CBANDS][CBANDS];
2751 --- libmp3lame/quantize.c.orig 2008-08-05 23:16:07.000000000 +0900
2752 +++ libmp3lame/quantize.c 2010-03-01 14:05:00.000000000 +0900
2753 @@ -28,6 +28,12 @@
2754 # include <config.h>
2755 #endif
2756
2757 +#ifdef ALTIVEC
2758 +#ifndef __APPLE_CC__
2759 +#include <altivec.h>
2760 +#endif
2761 +#endif
2762 +
2763 #include "lame.h"
2764 #include "machine.h"
2765 #include "encoder.h"
2766 @@ -45,6 +51,29 @@
2767
2768
2769
2770 +#ifdef PPC_FRSQRTE
2771 +static inline double __frsqrte(double number) /* thin wrapper around the PowerPC frsqrte instruction */
2772 +{
2773 + double y; /* receives the instruction's result */
2774 + asm("frsqrte %0,%1" : "=f" (y) : "f" (number)); /* input and output are both constrained to floating-point registers ("f") */
2775 + return y; /* NOTE(review): frsqrte yields a reciprocal-square-root estimate; its accuracy depends on the CPU -- confirm for the target */
2776 +}
2777 +
2778 +/* Square root computed as x * (1/sqrt(x)): a hardware estimate refined by three Newton-Raphson steps. */
2779 +static inline double ppc_sqrt(double x) {
2780 + double y;
2781 + const double halfx = 0.5 * x; /* 0.5*x is reused by every refinement step */
2782 + y = __frsqrte(x); /* starting estimate of 1/sqrt(x) */
2783 + y *= 1.5 - halfx * y * y; /* Newton-Raphson step for 1/sqrt(x): y = y * (1.5 - 0.5*x*y*y) */
2784 + y *= 1.5 - halfx * y * y;
2785 + y *= 1.5 - halfx * y * y;
2786 + /* a fourth refinement step (y *= 1.5 - halfx * y * y) is left disabled here */
2787 + y *= x; /* x * (1/sqrt(x)) gives sqrt(x) */
2788 + return (x == 0.0) ? 0 : y; /* x == 0 is special-cased: 0 is returned instead of the computed y */
2789 +}
2790 +#endif
2791 +
2792 +
2793 /* convert from L/R <-> Mid/Side */
2794 static void
2795 ms_convert(III_side_info_t * l3_side, int gr)
2796 @@ -73,9 +102,162 @@
2797 static void
2798 init_xrpow_core_c(gr_info * const cod_info, FLOAT xrpow[576], int upper, FLOAT * sum)
2799 {
2800 +#ifdef ALTIVEC /* AltiVec path: 16 floats per vector iteration, then a scalar tail for the rest of [0, upper] */
2801 + vector float v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16,v17,v18,v19,v20;
2802 + vector float vsum,vsum2,vsum3,vsum4,vmax,vmax2,vmax3,vmax4,vzero;
2803 + vector unsigned char vc1,vc2,vc3,vc4,vc5,vperm;
2804 + vector float vconst1 = (vector float)VINIT4ALL(0.25); /* coefficients of the refinement y*(1.25 - 0.25*x*y^4) applied in the loop */
2805 + vector float vconst2 = (vector float)VINIT4ALL(1.25);
2806 +#endif
2807 int i;
2808 FLOAT tmp;
2809 *sum = 0;
2810 +#ifdef ALTIVEC
2811 + vc1 = vec_splat_u8(1);
2812 + vc2 = vec_splat_u8(5);
2813 + vc3 = vec_sl(vc1,vc2); /* every byte = 1 << 5 = 32: a vec_slo count of 4 bytes */
2814 + vc4 = vec_sl(vc3,vc1); /* every byte = 64: a vec_slo count of 8 bytes */
2815 + vc5 = vec_or(vc3,vc4); /* not read again in this function */
2816 + vsum = vec_xor(vsum,vsum); /* x ^ x zeroes the accumulators. NOTE(review): each operand is read before it is ever assigned; splatting 0 would avoid the uninitialized read */
2817 + vzero = vec_xor(vzero,vzero);
2818 + vmax = vec_xor(vmax,vmax);
2819 + vsum2 = vec_xor(vsum2,vsum2);
2820 + vmax2 = vec_xor(vmax2,vmax2);
2821 + vsum3 = vec_xor(vsum3,vsum3);
2822 + vmax3 = vec_xor(vmax3,vmax3);
2823 + vsum4 = vec_xor(vsum4,vsum4);
2824 + vmax4 = vec_xor(vmax4,vmax4);
2825 +
2826 + v0 = vec_ld(0,(cod_info->xr)); /* aligned 16-byte block containing xr[0] */
2827 + vperm = vec_lvsl(0,(cod_info->xr)); /* permute that realigns two adjacent blocks when xr is not 16-byte aligned */
2828 + for (i = 0; i <= upper-15; i+=16) { /* runs while 16 more elements fit in [0, upper] */
2829 + v1 = vec_ld(16,(cod_info->xr)+i);
2830 + v2 = vec_ld(32,(cod_info->xr)+i);
2831 + v3 = vec_ld(48,(cod_info->xr)+i);
2832 + v4 = vec_ld(64,(cod_info->xr)+i);
2833 + v5 = vec_perm(v0,v1,vperm);
2834 + v6 = vec_perm(v1,v2,vperm);
2835 + v7 = vec_perm(v2,v3,vperm);
2836 + v8 = vec_perm(v3,v4,vperm);
2837 + v0 = v4; /* carry the last loaded block into the next iteration */
2838 + v9 = vec_abs(v5); /* v9..v12 = |xr| for 16 consecutive elements */
2839 + v10 = vec_abs(v6);
2840 + v11 = vec_abs(v7);
2841 + v12 = vec_abs(v8);
2842 + vsum = vec_add(vsum,v9); /* four independent partial sums of |xr| */
2843 + vsum2 = vec_add(vsum2,v10);
2844 + vsum3 = vec_add(vsum3,v11);
2845 + vsum4 = vec_add(vsum4,v12);
2846 + v1 = vec_re(vec_rsqrte(vec_rsqrte(v9))); /* v1..v4 = estimate y of |xr|^(-1/4) */
2847 + v2 = vec_re(vec_rsqrte(vec_rsqrte(v10)));
2848 + v3 = vec_re(vec_rsqrte(vec_rsqrte(v11)));
2849 + v4 = vec_re(vec_rsqrte(vec_rsqrte(v12)));
2850 + v5 = (vector float)vec_cmpeq(vzero,v9); /* v5..v8 = mask of lanes where |xr| == 0 */
2851 + v6 = (vector float)vec_cmpeq(vzero,v10);
2852 + v7 = (vector float)vec_cmpeq(vzero,v11);
2853 + v8 = (vector float)vec_cmpeq(vzero,v12);
2854 + v13 = vec_madd(v1,v1,vzero); /* y^2 */
2855 + v14 = vec_madd(v2,v2,vzero);
2856 + v15 = vec_madd(v3,v3,vzero);
2857 + v16 = vec_madd(v4,v4,vzero);
2858 + v13 = vec_madd(v13,v13,vzero); /* y^4 */
2859 + v14 = vec_madd(v14,v14,vzero);
2860 + v15 = vec_madd(v15,v15,vzero);
2861 + v16 = vec_madd(v16,v16,vzero);
2862 + v17 = vec_madd(v9,vconst1,vzero); /* 0.25 * |xr| */
2863 + v18 = vec_madd(v10,vconst1,vzero);
2864 + v19 = vec_madd(v11,vconst1,vzero);
2865 + v20 = vec_madd(v12,vconst1,vzero);
2866 + v13 = vec_nmsub(v13,v17,vconst2); /* 1.25 - 0.25*|xr|*y^4 */
2867 + v14 = vec_nmsub(v14,v18,vconst2);
2868 + v15 = vec_nmsub(v15,v19,vconst2);
2869 + v16 = vec_nmsub(v16,v20,vconst2);
2870 + v1 = vec_madd(v13,v1,vzero); /* refined y */
2871 + v2 = vec_madd(v14,v2,vzero);
2872 + v3 = vec_madd(v15,v3,vzero);
2873 + v4 = vec_madd(v16,v4,vzero);
2874 + v1 = vec_sel(v1,vzero,(vector unsigned int)v5); /* force y to 0 in lanes where |xr| == 0, overriding the estimate for a zero input */
2875 + v2 = vec_sel(v2,vzero,(vector unsigned int)v6);
2876 + v3 = vec_sel(v3,vzero,(vector unsigned int)v7);
2877 + v4 = vec_sel(v4,vzero,(vector unsigned int)v8);
2878 + v17 = vec_madd(v1,v9,vzero); /* |xr| * y, i.e. |xr|^(3/4), the quantity the scalar loops compute as sqrt(tmp * sqrt(tmp)) */
2879 + v18 = vec_madd(v2,v10,vzero);
2880 + v19 = vec_madd(v3,v11,vzero);
2881 + v20 = vec_madd(v4,v12,vzero);
2882 + vec_st(v17,0,xrpow+i); /* vec_st ignores the low four address bits, so xrpow is expected to be 16-byte aligned */
2883 + vec_st(v18,16,xrpow+i);
2884 + vec_st(v19,32,xrpow+i);
2885 + vec_st(v20,48,xrpow+i);
2886 + vmax = vec_max(v17,vmax); /* four independent running maxima of the stored values */
2887 + vmax2 = vec_max(v18,vmax2);
2888 + vmax3 = vec_max(v19,vmax3);
2889 + vmax4 = vec_max(v20,vmax4);
2890 + }
2891 + vmax = vec_max(vmax,vmax2); /* merge the four partial maxima and the four partial sums */
2892 + vmax3 = vec_max(vmax3,vmax4);
2893 + vmax = vec_max(vmax,vmax3);
2894 + vsum = vec_add(vsum,vsum2);
2895 + vsum3 = vec_add(vsum3,vsum4);
2896 + vsum = vec_add(vsum,vsum3);
2897 + v1 = vec_slo(vmax,vc3); /* fold across lanes: shift left 4 bytes and combine, then 8 bytes and combine; lane 0 ends up with the total */
2898 + v2 = vec_slo(vsum,vc3);
2899 + v3 = vec_max(v1,vmax);
2900 + v4 = vec_add(v2,vsum);
2901 + v5 = vec_slo(v3,vc4);
2902 + v6 = vec_slo(v4,vc4);
2903 + vmax = vec_max(v3,v5);
2904 + vsum = vec_add(v4,v6);
2905 + vmax = vec_perm(vmax,vmax,vec_lvsr(0,&(cod_info->xrpow_max))); /* rotate lane 0 to the lane selected by the destination address, which is the one vec_ste stores */
2906 + vsum = vec_perm(vsum,vsum,vec_lvsr(0,sum));
2907 + vec_ste(vmax,0,&(cod_info->xrpow_max)); /* overwrites xrpow_max and *sum with the vector results; the scalar tail below then updates both */
2908 + vec_ste(vsum,0,sum);
2909 +
2910 + for (; i <= upper; i++) { /* scalar tail for the elements the vector loop did not cover */
2911 + tmp = fabs(cod_info->xr[i]);
2912 + *sum += tmp;
2913 + xrpow[i] = sqrt(tmp * sqrt(tmp));
2914 +
2915 + if (xrpow[i] > cod_info->xrpow_max)
2916 + cod_info->xrpow_max = xrpow[i];
2917 + }
2918 +#else
2919 +#ifdef PPC_FRSQRTE /* scalar path unrolled by four, using ppc_sqrt (defined outside this hunk) */
2920 + FLOAT tmp2,tmp3,tmp4;
2921 +
2922 + for (i = 0; i <= upper-3; i+=4) {
2923 + tmp = fabs (cod_info->xr[i]);
2924 + tmp2 = fabs (cod_info->xr[i+1]);
2925 + tmp3 = fabs (cod_info->xr[i+2]);
2926 + tmp4 = fabs (cod_info->xr[i+3]);
2927 + *sum += tmp;
2928 + *sum += tmp2;
2929 + *sum += tmp3;
2930 + *sum += tmp4;
2931 +
2932 + xrpow[i] = ppc_sqrt (tmp * ppc_sqrt(tmp));
2933 + xrpow[i+1] = ppc_sqrt (tmp2 * ppc_sqrt(tmp2));
2934 + xrpow[i+2] = ppc_sqrt (tmp3 * ppc_sqrt(tmp3));
2935 + xrpow[i+3] = ppc_sqrt (tmp4 * ppc_sqrt(tmp4));
2936 +
2937 + if (xrpow[i] > cod_info->xrpow_max)
2938 + cod_info->xrpow_max = xrpow[i];
2939 + if (xrpow[i+1] > cod_info->xrpow_max)
2940 + cod_info->xrpow_max = xrpow[i+1];
2941 + if (xrpow[i+2] > cod_info->xrpow_max)
2942 + cod_info->xrpow_max = xrpow[i+2];
2943 + if (xrpow[i+3] > cod_info->xrpow_max)
2944 + cod_info->xrpow_max = xrpow[i+3];
2945 + }
2946 +
2947 + for (; i <= upper; i++) { /* leftover elements, one at a time */
2948 + tmp = fabs(cod_info->xr[i]);
2949 + *sum += tmp;
2950 + xrpow[i] = ppc_sqrt (tmp * ppc_sqrt(tmp));
2951 +
2952 + if (xrpow[i] > cod_info->xrpow_max)
2953 + cod_info->xrpow_max = xrpow[i];
2954 + }
2955 +#else
2956 for (i = 0; i <= upper; ++i) {
2957 tmp = fabs(cod_info->xr[i]);
2958 *sum += tmp;
2959 @@ -84,6 +266,8 @@
2960 if (xrpow[i] > cod_info->xrpow_max)
2961 cod_info->xrpow_max = xrpow[i];
2962 }
2963 +#endif
2964 +#endif
2965 }
2966
2967
2968 @@ -1480,7 +1664,7 @@
2969 lame_internal_flags *const gfc = gfp->internal_flags;
2970 FLOAT l3_xmin[2][2][SFBMAX];
2971
2972 - FLOAT xrpow[576];
2973 + FLOAT xrpow[576] __attribute__ ((aligned (16)));
2974 int bands[2][2];
2975 int frameBits[15];
2976 int used_bits;
2977 @@ -1649,7 +1833,7 @@
2978 lame_internal_flags *const gfc = gfp->internal_flags;
2979 FLOAT l3_xmin[2][2][SFBMAX];
2980
2981 - FLOAT xrpow[2][2][576];
2982 + FLOAT xrpow[2][2][576] __attribute__ ((aligned (16)));
2983 int frameBits[15];
2984 int used_bits;
2985 int max_bits[2][2];
2986 @@ -1881,7 +2065,7 @@
2987 {
2988 lame_internal_flags *const gfc = gfp->internal_flags;
2989 FLOAT l3_xmin[SFBMAX];
2990 - FLOAT xrpow[576];
2991 + FLOAT xrpow[576] __attribute__ ((aligned (16)));
2992 int targ_bits[2][2];
2993 int mean_bits, max_frame_bits;
2994 int ch, gr, ath_over;
2995 @@ -1968,7 +2152,7 @@
2996 {
2997 lame_internal_flags *const gfc = gfp->internal_flags;
2998 FLOAT l3_xmin[SFBMAX];
2999 - FLOAT xrpow[576];
3000 + FLOAT xrpow[576] __attribute__ ((aligned (16)));
3001 int targ_bits[2];
3002 int mean_bits, max_bits;
3003 int gr, ch;
3004 --- libmp3lame/quantize_pvt.c.orig 2009-01-19 00:44:28.000000000 +0900
3005 +++ libmp3lame/quantize_pvt.c 2010-03-01 14:05:00.000000000 +0900
3006 @@ -27,6 +27,13 @@
3007 # include <config.h>
3008 #endif
3009
3010 +#ifdef ALTIVEC
3011 +#undef TAKEHIRO_IEEE754_HACK
3012 +#ifndef __APPLE_CC__
3013 +#include <altivec.h>
3014 +#endif
3015 +#endif
3016 +
3017
3018 #include "lame.h"
3019 #include "machine.h"
3020 @@ -744,6 +751,39 @@
3021 static FLOAT
3022 calc_noise_core_c(const gr_info * const cod_info, int *startline, int l, FLOAT step)
3023 {
3024 +#ifdef ALTIVEC
3025 + vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,va,vb,vstep,vzero,vnoise1,vnoise2,vix01;
3026 + vector unsigned char vperm1,vperm2,vperm5,vperm6;
3027 + vector signed int vx1,vx2,vx3,vx4,vx5,vx6,vx7,vshamt,vone;
3028 +#ifdef ALTIVEC_970
3029 + vector unsigned int vmask1,vmask2,vmask3;
3030 + vector float v10,v11,v12,v13,v14,v15,v16,v17;
3031 +#else
3032 + vector unsigned char vc1,vc2,vc3,vc4,vc5,vc6,vperm3,vperm4,vmask;
3033 +#endif
3034 + float temp[4] __attribute__ ((aligned (16)));
3035 +
3036 + temp[0] = step;
3037 + vstep = vec_ld(0,temp);
3038 + vzero = vec_xor(vzero,vzero);
3039 + vperm6 = (vector unsigned char)VINIT16(0,0,3,19,0,0,7,23,0,0,11,27,0,0,15,31);
3040 + vperm5 = vec_sld(vperm6,vperm6,2);
3041 +#ifdef ALTIVEC_970
3042 + vmask1 = vec_splat_u32(-1);
3043 + vmask2 = vec_sld((vector unsigned int)vzero,vmask1,8);
3044 + vmask3 = vec_sld((vector unsigned int)vzero,vmask1,4);
3045 + vmask1 = vec_sld((vector unsigned int)vzero,vmask1,12);
3046 +#else
3047 + vperm3 = (vector unsigned char)VINIT16(0,0,0,0,0,0,0,0,0,1,2,3,16,17,18,19);
3048 + vperm4 = vec_sld(vperm3,(vector unsigned char)vzero,8);
3049 + vmask = (vector unsigned char)VINIT16ALL(16);
3050 +#endif
3051 + vstep = vec_splat(vstep,0);
3052 + vnoise1 = vec_xor(vnoise1,vnoise1);
3053 + vnoise2 = vec_xor(vnoise2,vnoise2);
3054 + vone = vec_splat_s32(1);
3055 + vshamt = vec_splat_s32(2);
3056 +#endif
3057 FLOAT noise = 0;
3058 int j = *startline;
3059 const int *const ix = cod_info->l3_enc;
3060 @@ -760,9 +800,55 @@
3061 }
3062 }
3063 else if (j > cod_info->big_values) {
3064 - FLOAT ix01[2];
3065 + FLOAT ix01[4] __attribute__ ((aligned (16)));
3066 ix01[0] = 0;
3067 ix01[1] = step;
3068 +#ifdef ALTIVEC
3069 + vix01 = vec_ld(0,ix01);
3070 + v1 = vec_ld(0,cod_info->xr+j);
3071 + vperm1 = vec_lvsl(0,cod_info->xr+j);
3072 + vx1 = vec_ld(0,ix+j);
3073 + vperm2 = vec_lvsl(0,ix+j);
3074 + for(;l>1;l-=2) {
3075 + v2 = vec_ld(16,cod_info->xr+j);
3076 + vx2 = vec_ld(16,ix+j);
3077 + v3 = vec_perm(v1,v2,vperm1);
3078 + vx3 = vec_perm(vx1,vx2,vperm2);
3079 + va = vec_abs(v3);
3080 + v1 = v2;
3081 + vx1 = vx2;
3082 +
3083 + vx4 = vec_sl(vx3,(vector unsigned int)vshamt);
3084 + vx5 = vec_add(vx4,vone);
3085 + vx6 = vec_add(vx4,vshamt);
3086 + vx7 = vec_add(vx5,vshamt);
3087 + vx2 = vec_perm(vx4,vx5,vperm5);
3088 + vx3 = vec_perm(vx6,vx7,vperm6);
3089 + vx4 = vec_or(vx2,vx3);
3090 +
3091 + v2 = vec_perm(vix01,vix01,(vector unsigned char)vx4);
3092 + va = vec_sub(va,v2);
3093 +
3094 + vnoise1 = vec_madd(va,va,vnoise1);
3095 +
3096 + j += 4;
3097 + }
3098 + v1 = vec_sld(vnoise1,vnoise1,8);
3099 + v2 = vec_add(vnoise1,v1);
3100 + v3 = vec_sld(v2,v2,4);
3101 + v4 = vec_add(v2,v3);
3102 + v5 = vec_perm(v4,v4,vec_lvsr(0,&noise));
3103 + vec_ste(v5,0,&noise);
3104 + if(l) {
3105 + FLOAT temp;
3106 + temp = fabs(cod_info->xr[j]) - ix01[ix[j]];
3107 + j++;
3108 + noise += temp * temp;
3109 + temp = fabs(cod_info->xr[j]) - ix01[ix[j]];
3110 + j++;
3111 + noise += temp * temp;
3112 + }
3113 +#else
3114 while (l--) {
3115 FLOAT temp;
3116 temp = fabs(cod_info->xr[j]) - ix01[ix[j]];
3117 @@ -772,8 +858,138 @@
3118 j++;
3119 noise += temp * temp;
3120 }
3121 +#endif
3122 }
3123 else {
3124 +#ifdef ALTIVEC
3125 + vperm1 = vec_lvsl(0,cod_info->xr+j);
3126 + v1 = vec_ld(0,cod_info->xr+j);
3127 + for(;l>3;l-=4) {
3128 + v2 = vec_ld(16,cod_info->xr+j);
3129 + v3 = vec_ld(32,cod_info->xr+j);
3130 + v4 = vec_perm(v1,v2,vperm1);
3131 + v5 = vec_perm(v2,v3,vperm1);
3132 + va = vec_abs(v4);
3133 + vb = vec_abs(v5);
3134 + v1 = v3;
3135 +
3136 +#ifdef ALTIVEC_970
3137 + v2 = vec_lde(0,pow43+ix[j]);
3138 + v6 = vec_lde(0,pow43+ix[j+1]);
3139 + v10 = vec_lde(0,pow43+ix[j+2]);
3140 + v14 = vec_lde(0,pow43+ix[j+3]);
3141 + v4 = vec_perm(v2,v2,vec_lvsl(0,pow43+ix[j]));
3142 + v8 = vec_perm(v6,v6,vec_lvsl(-4,pow43+ix[j+1]));
3143 + v12 = vec_perm(v10,v10,vec_lvsl(-8,pow43+ix[j+2]));
3144 + v16 = vec_perm(v14,v14,vec_lvsl(-12,pow43+ix[j+3]));
3145 + v4 = vec_sel(v4,v8,vmask1);
3146 + v4 = vec_sel(v4,v12,vmask2);
3147 + v4 = vec_sel(v4,v16,vmask3);
3148 + va = vec_nmsub(v4,vstep,va);
3149 +#else
3150 + vc1 = vec_lvsl(0,pow43+ix[j]);
3151 + vc2 = vec_lvsl(0,pow43+ix[j+1]);
3152 + vc3 = vec_lvsl(0,pow43+ix[j+2]);
3153 + vc4 = vec_lvsl(0,pow43+ix[j+3]);
3154 + vc2 = vec_or(vc2,vmask);
3155 + vc4 = vec_or(vc4,vmask);
3156 + v2 = vec_lde(0,pow43+ix[j]);
3157 + v3 = vec_lde(0,pow43+ix[j+1]);
3158 + v4 = vec_lde(0,pow43+ix[j+2]);
3159 + v5 = vec_lde(0,pow43+ix[j+3]);
3160 + vc5 = vec_perm(vc1,vc2,vperm3);
3161 + vc6 = vec_perm(vc3,vc4,vperm4);
3162 + v6 = vec_perm(v2,v3,vc5);
3163 + v7 = vec_perm(v4,v5,vc6);
3164 + v8 = vec_sld(v6,v7,8);
3165 + va = vec_nmsub(v8,vstep,va);
3166 +#endif
3167 + j+=4;
3168 +
3169 +#ifdef ALTIVEC_970
3170 + v3 = vec_lde(0,pow43+ix[j]);
3171 + v7 = vec_lde(0,pow43+ix[j+1]);
3172 + v11 = vec_lde(0,pow43+ix[j+2]);
3173 + v15 = vec_lde(0,pow43+ix[j+3]);
3174 + v5 = vec_perm(v3,v3,vec_lvsl(0,pow43+ix[j]));
3175 + v9 = vec_perm(v7,v7,vec_lvsl(-4,pow43+ix[j+1]));
3176 + v13 = vec_perm(v11,v11,vec_lvsl(-8,pow43+ix[j+2]));
3177 + v17 = vec_perm(v15,v15,vec_lvsl(-12,pow43+ix[j+3]));
3178 + v5 = vec_sel(v5,v9,vmask1);
3179 + v5 = vec_sel(v5,v13,vmask2);
3180 + v5 = vec_sel(v5,v17,vmask3);
3181 + vb = vec_nmsub(v5,vstep,vb);
3182 +#else
3183 + vc1 = vec_lvsl(0,pow43+ix[j]);
3184 + vc2 = vec_lvsl(0,pow43+ix[j+1]);
3185 + vc3 = vec_lvsl(0,pow43+ix[j+2]);
3186 + vc4 = vec_lvsl(0,pow43+ix[j+3]);
3187 + vc2 = vec_or(vc2,vmask);
3188 + vc4 = vec_or(vc4,vmask);
3189 + v2 = vec_lde(0,pow43+ix[j]);
3190 + v3 = vec_lde(0,pow43+ix[j+1]);
3191 + v4 = vec_lde(0,pow43+ix[j+2]);
3192 + v5 = vec_lde(0,pow43+ix[j+3]);
3193 + vc5 = vec_perm(vc1,vc2,vperm3);
3194 + vc6 = vec_perm(vc3,vc4,vperm4);
3195 + v6 = vec_perm(v2,v3,vc5);
3196 + v7 = vec_perm(v4,v5,vc6);
3197 + v8 = vec_sld(v6,v7,8);
3198 + vb = vec_nmsub(v8,vstep,vb);
3199 +#endif
3200 +
3201 + vnoise1 = vec_madd(va,va,vnoise1);
3202 + vnoise2 = vec_madd(vb,vb,vnoise2);
3203 +
3204 + j+=4;
3205 + }
3206 + vnoise1 = vec_add(vnoise1,vnoise2);
3207 +
3208 + for(;l>1;l-=2) {
3209 + v2 = vec_ld(16,cod_info->xr+j);
3210 + v4 = vec_perm(v1,v2,vperm1);
3211 + va = vec_abs(v4);
3212 + v1 = v2;
3213 +
3214 + v2 = vec_lde(0,pow43+ix[j]);
3215 + v3 = vec_lde(0,pow43+ix[j+1]);
3216 + v4 = vec_lde(0,pow43+ix[j+2]);
3217 + v5 = vec_lde(0,pow43+ix[j+3]);
3218 + v6 = vec_perm(v2,v2,vec_lvsl(0,pow43+ix[j]));
3219 + v7 = vec_perm(v3,v3,vec_lvsl(-4,pow43+ix[j+1]));
3220 + v8 = vec_perm(v4,v4,vec_lvsl(-8,pow43+ix[j+2]));
3221 + v9 = vec_perm(v5,v5,vec_lvsl(-12,pow43+ix[j+3]));
3222 +#ifdef ALTIVEC_970
3223 + v6 = vec_sel(v6,v7,vmask1);
3224 + v6 = vec_sel(v6,v8,vmask2);
3225 + v6 = vec_sel(v6,v9,vmask3);
3226 +#else
3227 + v6 = vec_or(v6,v7);
3228 + v6 = vec_or(v6,v8);
3229 + v6 = vec_or(v6,v9);
3230 +#endif
3231 + va = vec_nmsub(v6,vstep,va);
3232 +
3233 + vnoise1 = vec_madd(va,va,vnoise1);
3234 +
3235 + j += 4;
3236 + }
3237 + v1 = vec_sld(vnoise1,vnoise1,8);
3238 + v2 = vec_add(vnoise1,v1);
3239 + v3 = vec_sld(v2,v2,4);
3240 + v4 = vec_add(v2,v3);
3241 + v5 = vec_perm(v4,v4,vec_lvsr(0,&noise));
3242 + vec_ste(v5,0,&noise);
3243 + if(l) {
3244 + FLOAT temp;
3245 + temp = fabs(cod_info->xr[j]) - pow43[ix[j]] * step;
3246 + j++;
3247 + noise += temp * temp;
3248 + temp = fabs(cod_info->xr[j]) - pow43[ix[j]] * step;
3249 + j++;
3250 + noise += temp * temp;
3251 + }
3252 +#else
3253 while (l--) {
3254 FLOAT temp;
3255 temp = fabs(cod_info->xr[j]) - pow43[ix[j]] * step;
3256 @@ -783,6 +999,7 @@
3257 j++;
3258 noise += temp * temp;
3259 }
3260 +#endif
3261 }
3262
3263 *startline = j;
3264 --- libmp3lame/tables.c.orig 2008-04-13 03:18:07.000000000 +0900
3265 +++ libmp3lame/tables.c 2010-03-01 14:05:00.000000000 +0900
3266 @@ -406,7 +406,7 @@
3267 };
3268
3269
3270 -const struct huffcodetab ht[HTN] = {
3271 +const struct huffcodetab ht[HTN] __attribute__ ((aligned (16))) = {
3272 /* xlen, linmax, table, hlen */
3273 {0, 0, NULL, NULL},
3274 {2, 0, t1HB, t1l},
3275 --- libmp3lame/takehiro.c.orig 2008-09-23 05:21:39.000000000 +0900
3276 +++ libmp3lame/takehiro.c 2010-03-01 14:05:00.000000000 +0900
3277 @@ -27,6 +27,13 @@
3278 #endif
3279
3280
3281 +#ifdef ALTIVEC
3282 +#undef TAKEHIRO_IEEE754_HACK
3283 +#ifndef __APPLE_CC__
3284 +#include <altivec.h>
3285 +#endif
3286 +#endif
3287 +
3288 #include "lame.h"
3289 #include "machine.h"
3290 #include "encoder.h"
3291 @@ -217,6 +224,150 @@
3292 static void
3293 quantize_lines_xrpow(int l, FLOAT istep, const FLOAT * xr, int *ix)
3294 {
3295 +#ifdef ALTIVEC
3296 + vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,va,vb,vistep,vzero;
3297 + vector signed int vx1,vx2,vx3,vx4,vprev;
3298 + vector unsigned char vperm1,vperm2;
3299 + const vector float const1 = (vector float)VINIT4(0.4053964553387788,3.404263724373839,5.465086767819913,1.0);
3300 + const vector float const2 = (vector float)VINIT4(7.719205369637751,10.93017829043677,0,0);
3301 +#ifndef ALTIVEC_970
3302 + unsigned int temp[4] __attribute__ ((aligned (16)));
3303 +#endif
3304 + float temp2[4] __attribute__ ((aligned (16)));
3305 + temp2[0] = istep;
3306 + vistep = vec_ld(0,temp2);
3307 + vzero = vec_xor(vzero,vzero);
3308 + vistep = vec_splat(vistep,0);
3309 +
3310 + l = l >> 1;
3311 +
3312 + vperm1 = vec_lvsl(0,xr);
3313 + vperm2 = vec_lvsr(0,ix);
3314 + v1 = vec_ld(0,xr);
3315 + vx1 = vec_ld(-16,ix);
3316 + vx2 = vec_ld(0,ix);
3317 + vprev = vec_perm(vx1,vx2,vec_lvsl(0,ix));
3318 + for(;l>3;l-=4) {
3319 + v2 = vec_ld(16,xr);
3320 + v3 = vec_ld(32,xr);
3321 + v4 = vec_perm(v1,v2,vperm1);
3322 + v5 = vec_perm(v2,v3,vperm1);
3323 + va = vec_madd(v4,vistep,vzero);
3324 + vb = vec_madd(v5,vistep,vzero);
3325 + v1 = v3;
3326 +
3327 + v2 = vec_floor(va);
3328 + v3 = vec_floor(vb);
3329 + v4 = vec_splat(const1,2);
3330 + v5 = vec_splat(const1,1);
3331 + v6 = vec_splat(const2,1);
3332 + v7 = vec_splat(const2,0);
3333 + v8 = vec_madd(v2,v4,v5);
3334 + v9 = vec_madd(v3,v4,v5);
3335 + v10 = vec_madd(v2,v6,v7);
3336 + v11 = vec_madd(v3,v6,v7);
3337 + v4 = vec_splat(const1,0);
3338 + v5 = vec_splat(const1,3);
3339 + v8 = vec_madd(v8,v2,v4);
3340 + v9 = vec_madd(v9,v3,v4);
3341 + v10 = vec_madd(v10,v2,v5);
3342 + v11 = vec_madd(v11,v3,v5);
3343 + v6 = vec_re(v10);
3344 + v7 = vec_re(v11);
3345 + v10 = vec_nmsub(v10,v6,v5);
3346 + v11 = vec_nmsub(v11,v7,v5);
3347 + v10 = vec_madd(v10,v6,v6);
3348 + v11 = vec_madd(v11,v7,v7);
3349 + va = vec_madd(v8,v10,va);
3350 + vb = vec_madd(v9,v11,vb);
3351 +
3352 + vx1 = vec_cts(va,0);
3353 + vx2 = vec_cts(vb,0);
3354 +
3355 + vx3 = vec_perm(vprev,vx1,vperm2);
3356 + vx4 = vec_perm(vx1,vx2,vperm2);
3357 + vec_st(vx3,0,ix);
3358 + vec_st(vx4,16,ix);
3359 + vprev = vx2;
3360 + xr += 8;
3361 + ix += 8;
3362 + }
3363 + vx1 = vec_ld(0,ix);
3364 + vx2 = vec_ld(16,ix);
3365 + vx3 = vec_perm(vx1,vx2,vec_lvsl(0,ix));
3366 + vx4 = vec_perm(vprev,vx3,vperm2);
3367 + vec_st(vx4,0,ix);
3368 +
3369 +#ifdef ALTIVEC_970
3370 + for(;l>1;l-=2) {
3371 + FLOAT x0, x1, x2, x3;
3372 + int rx0, rx1, rx2, rx3;
3373 +
3374 + x0 = *xr++ * istep;
3375 + x1 = *xr++ * istep;
3376 + XRPOW_FTOI(x0, rx0);
3377 + x2 = *xr++ * istep;
3378 + XRPOW_FTOI(x1, rx1);
3379 + x3 = *xr++ * istep;
3380 + XRPOW_FTOI(x2, rx2);
3381 + x0 += QUANTFAC(rx0);
3382 + XRPOW_FTOI(x3, rx3);
3383 + x1 += QUANTFAC(rx1);
3384 + XRPOW_FTOI(x0, *ix++);
3385 + x2 += QUANTFAC(rx2);
3386 + XRPOW_FTOI(x1, *ix++);
3387 + x3 += QUANTFAC(rx3);
3388 + XRPOW_FTOI(x2, *ix++);
3389 + XRPOW_FTOI(x3, *ix++);
3390 + }
3391 +#else
3392 + for(;l>1;l-=2) {
3393 + v2 = vec_ld(16,xr);
3394 + v4 = vec_perm(v1,v2,vperm1);
3395 + v1 = v2;
3396 + va = vec_madd(v4,vistep,vzero);
3397 + vx1 = vec_cts(va,0);
3398 + vec_st((vector unsigned int)vx1,0,temp);
3399 + v2 = vec_lde(0,adj43+temp[0]);
3400 + v3 = vec_lde(0,adj43+temp[1]);
3401 + v4 = vec_lde(0,adj43+temp[2]);
3402 + v5 = vec_lde(0,adj43+temp[3]);
3403 + v6 = vec_perm(v2,v2,vec_lvsl(0,adj43+temp[0]));
3404 + v7 = vec_perm(v3,v3,vec_lvsl(-4,adj43+temp[1]));
3405 + v8 = vec_perm(v4,v4,vec_lvsl(-8,adj43+temp[2]));
3406 + v9 = vec_perm(v5,v5,vec_lvsl(-12,adj43+temp[3]));
3407 + v6 = vec_or(v6,v7);
3408 + v6 = vec_or(v6,v8);
3409 + v6 = vec_or(v6,v9);
3410 + va = vec_add(va,v6);
3411 + vx1 = vec_cts(va,0);
3412 + vx3 = vec_perm(vprev,vx1,vperm2);
3413 + vec_st(vx3,0,ix);
3414 + vprev = vx1;
3415 + xr += 4;
3416 + ix += 4;
3417 + }
3418 + vx1 = vec_ld(0,ix);
3419 + vx2 = vec_ld(16,ix);
3420 + vx3 = vec_perm(vx1,vx2,vec_lvsl(0,ix));
3421 + vx4 = vec_perm(vprev,vx3,vperm2);
3422 + vec_st(vx4,0,ix);
3423 +#endif
3424 +
3425 + if (l) {
3426 + FLOAT x0, x1;
3427 + int rx0, rx1;
3428 +
3429 + x0 = *xr++ * istep;
3430 + x1 = *xr++ * istep;
3431 + XRPOW_FTOI(x0, rx0);
3432 + XRPOW_FTOI(x1, rx1);
3433 + x0 += QUANTFAC(rx0);
3434 + x1 += QUANTFAC(rx1);
3435 + XRPOW_FTOI(x0, *ix++);
3436 + XRPOW_FTOI(x1, *ix++);
3437 + }
3438 +#else
3439 int remaining;
3440
3441 assert(l > 0);
3442 @@ -258,7 +409,7 @@
3443 XRPOW_FTOI(x0, *ix++);
3444 XRPOW_FTOI(x1, *ix++);
3445 }
3446 -
3447 +#endif
3448 }
3449
3450
3451 @@ -415,6 +566,60 @@
3452 /* ix_max */
3453 /*************************************************************************/
3454
3455 +#ifdef ALTIVEC
3456 +int /* Returns the maximum of the ints in [ix, end); both accumulators start at 0, so the result is never negative */
3457 +ix_max_vec(const int *ix, const int *end)
3458 +{
3459 + int vresult[4] __attribute__ ((aligned (16))); /* aligned buffer the reduced vector is stored to */
3460 + int max1=0, max2=0;
3461 + vector signed int v1, v2, v3, v4, v5, v6, v7, vmax;
3462 + vector unsigned char vmask,vc1,vc2,vc3,vc4;
3463 +
3464 + if(end - ix < 8) goto normal; /* fewer than 8 ints: scalar loop only */
3465 + int i = (end-ix)/4; /* number of full 4-int vectors */
3466 + int remain = (end-ix)%4; /* ints left over for the scalar loop */
3467 + vc1 = vec_splat_u8(1);
3468 + vc2 = vec_splat_u8(5);
3469 + vc3 = vec_sl(vc1,vc2); /* every byte = 1 << 5 = 32: a vec_slo count of 4 bytes */
3470 + vc4 = vec_sl(vc3,vc1); /* every byte = 64: a vec_slo count of 8 bytes */
3471 +
3472 + v1 = vec_ld(0, ix); /* aligned 16-byte block containing ix[0] */
3473 + vmask = vec_lvsl(0, ix); /* permute that realigns adjacent blocks when ix is not 16-byte aligned */
3474 + vmax = vec_splat_s32(0); /* running maximum starts at 0 in every lane, set without reading vmax first */
3475 +
3476 + while(i--) {
3477 + v2 = vec_ld(16, ix); /* next aligned block; on the last pass this block may lie beyond the ints consumed */
3478 + v3 = vec_perm(v1, v2, vmask);
3479 + v1 = v2;
3480 + vmax = vec_max(vmax,v3);
3481 + ix += 4;
3482 + }
3483 +
3484 + v4 = vec_slo(vmax,vc3); /* fold across lanes: shift left 4 bytes and max, then 8 bytes and max; lane 0 holds the overall maximum */
3485 + v5 = vec_max(vmax,v4);
3486 + v6 = vec_slo(v5,vc4);
3487 + v7 = vec_max(v5,v6);
3488 + vec_st(v7,0,vresult);
3489 +
3490 + max1 = vresult[0];
3491 + if(!remain) return max1; /* everything was covered by the vector loop */
3492 + /* Otherwise fall through: the scalar loop folds the leftover */
3493 + /* ints into max1/max2. max1 already carries the vector */
3494 + /* result and max2 is still 0 at this point. */
3495 +
3496 + normal:
3497 +
3498 + do{ /* reads two ints before testing ix < end, so at least two must be readable at ix */
3499 + int x1 = *ix++;
3500 + int x2 = *ix++;
3501 + if (max1 < x1) max1 = x1;
3502 + if (max2 < x2) max2 = x2;
3503 + } while (ix < end);
3504 + if(max1 < max2) max1 = max2;
3505 +
3506 + return max1;
3507 +}
3508 +#else
3509 static int
3510 ix_max(const int *ix, const int *end)
3511 {
3512 @@ -433,6 +638,7 @@
3513 max1 = max2;
3514 return max1;
3515 }
3516 +#endif
3517
3518
3519
3520 @@ -440,7 +646,7 @@
3521
3522
3523
3524 -
3525 +#if !defined(ALTIVEC) || (defined(ALTIVEC) && !defined(ALTIVEC_970))
3526 static int
3527 count_bit_ESC(const int *ix, const int *const end, int t1, const int t2, int *const s)
3528 {
3529 @@ -482,6 +688,7 @@
3530 *s += sum;
3531 return t1;
3532 }
3533 +#endif
3534
3535
3536 inline static int
3537 @@ -502,7 +709,7 @@
3538 }
3539
3540
3541 -
3542 +#if !defined(ALTIVEC)
3543 inline static int
3544 count_bit_noESC_from2(const int *ix, const int *const end, int t1, int *const s)
3545 {
3546 @@ -532,6 +739,7 @@
3547 *s += sum;
3548 return t1;
3549 }
3550 +#endif
3551
3552
3553 inline static int
3554 @@ -569,6 +777,670 @@
3555 return t;
3556 }
3557
3558 +#ifdef ALTIVEC
3559 +#ifdef ALTIVEC_970
3560 +static int /* Adds to *s the smaller of the two bit counts for [ix, end) under tables t1 and t2, and returns the table that gave it */
3561 +count_bit_ESC_altivec(const int *ix, const int *const end, int t1, const int t2, int *const s)
3562 +{
3563 + /* ESC-table is used */
3564 + int const linbits = ht[t1].xlen * 65536 + ht[t2].xlen; /* t1 cost in the high 16 bits, t2 cost in the low 16 bits */
3565 + int sum = 0, sum2;
3566 + vector signed int v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16;
3567 + vector unsigned int vsum;
3568 + vector unsigned char vmask,vperm1,vperm2,vshamt;
3569 + vector unsigned char vzero,vs1,vs2,vs3,vs4,vs5,vs6,vlimit1,vlimit2,vone;
3570 + unsigned char tmp[16] __attribute__ ((aligned (16))); /* 16 table indices stored from the vector unit */
3571 + unsigned int tmp2[4] __attribute__ ((aligned (16))); /* receives the summed escape count */
3572 +
3573 + vperm1 = (vector unsigned char)VINIT16(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); /* picks words 0 and 2 of each input: the x of every (x, y) pair */
3574 + vperm2 = (vector unsigned char)VINIT16(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); /* picks words 1 and 3 of each input: the y of every pair */
3575 + vlimit1 = vec_splat_u8(14);
3576 + vlimit2 = vec_splat_u8(15);
3577 + vone = vec_splat_u8(1);
3578 + vshamt = vec_splat_u8(4);
3579 + vzero = vec_splat_u8(0); /* explicit zero, set without reading vzero first */
3580 + vsum = vec_splat_u32(0); /* escape counter starts at 0, set without reading vsum first */
3581 +
3582 + if((int)(end - ix) < 32) goto normal; /* fewer than 32 ints: scalar loop only */
3583 + v0 = vec_ld(0,ix);
3584 + vmask = vec_lvsl(0,ix); /* permute that realigns adjacent blocks when ix is not 16-byte aligned */
3585 + do { /* 32 ints, i.e. 16 (x, y) pairs, per iteration */
3586 + v1 = vec_ld(16,ix);
3587 + v2 = vec_ld(32,ix);
3588 + v3 = vec_ld(48,ix);
3589 + v4 = vec_ld(64,ix);
3590 + v5 = vec_ld(80,ix);
3591 + v6 = vec_ld(96,ix);
3592 + v7 = vec_ld(112,ix);
3593 + v8 = vec_ld(128,ix);
3594 + v9 = vec_perm(v0,v1,vmask);
3595 + v10 = vec_perm(v1,v2,vmask);
3596 + v11 = vec_perm(v2,v3,vmask);
3597 + v12 = vec_perm(v3,v4,vmask);
3598 + v13 = vec_perm(v4,v5,vmask);
3599 + v14 = vec_perm(v5,v6,vmask);
3600 + v15 = vec_perm(v6,v7,vmask);
3601 + v16 = vec_perm(v7,v8,vmask);
3602 + v0 = v8; /* carry the last loaded block into the next iteration */
3603 + v1 = vec_perm(v9,v10,vperm1); /* odd-numbered results hold x values, even-numbered hold the matching y values */
3604 + v2 = vec_perm(v9,v10,vperm2);
3605 + v3 = vec_perm(v11,v12,vperm1);
3606 + v4 = vec_perm(v11,v12,vperm2);
3607 + v5 = vec_perm(v13,v14,vperm1);
3608 + v6 = vec_perm(v13,v14,vperm2);
3609 + v7 = vec_perm(v15,v16,vperm1);
3610 + v8 = vec_perm(v15,v16,vperm2);
3611 +
3612 + v1 = (vector signed int)vec_packs(v1,v3); /* saturating narrow, 32-bit to 16-bit */
3613 + v2 = (vector signed int)vec_packs(v2,v4);
3614 + v3 = (vector signed int)vec_packs(v5,v7);
3615 + v4 = (vector signed int)vec_packs(v6,v8);
3616 + vs1 = vec_packs((vector unsigned short)v1,(vector unsigned short)v3); /* saturating narrow, 16-bit to 8-bit: vs1 = 16 x values, vs2 = 16 y values */
3617 + vs2 = vec_packs((vector unsigned short)v2,(vector unsigned short)v4);
3618 + vs3 = vec_sel(vs1,vlimit2,vec_cmpgt(vs1,vlimit1)); /* values above 14 become 15, as in the scalar loop */
3619 + vs4 = vec_sel(vs2,vlimit2,vec_cmpgt(vs2,vlimit1));
3620 + vs5 = vec_sel(vzero,vone,vec_cmpgt(vs1,vlimit1)); /* 1 in each lane that was above 14; each such value costs linbits */
3621 + vs6 = vec_sel(vzero,vone,vec_cmpgt(vs2,vlimit1));
3622 + vs5 = vec_add(vs5,vs6);
3623 + vsum = vec_sum4s(vs5,vsum); /* accumulate the escape count */
3624 + vs3 = vec_sl(vs3,vshamt); /* table index = x * 16 + y */
3625 + vs3 = vec_add(vs3,vs4);
3626 + vec_st(vs3,0,tmp);
3627 +
3628 + sum += largetbl[tmp[0]]; /* table lookups are done one index at a time */
3629 + sum += largetbl[tmp[1]];
3630 + sum += largetbl[tmp[2]];
3631 + sum += largetbl[tmp[3]];
3632 + sum += largetbl[tmp[4]];
3633 + sum += largetbl[tmp[5]];
3634 + sum += largetbl[tmp[6]];
3635 + sum += largetbl[tmp[7]];
3636 + sum += largetbl[tmp[8]];
3637 + sum += largetbl[tmp[9]];
3638 + sum += largetbl[tmp[10]];
3639 + sum += largetbl[tmp[11]];
3640 + sum += largetbl[tmp[12]];
3641 + sum += largetbl[tmp[13]];
3642 + sum += largetbl[tmp[14]];
3643 + sum += largetbl[tmp[15]];
3644 +
3645 + ix += 32;
3646 + } while(ix < end-31); /* continue while at least 32 ints remain */
3647 +
3648 + vsum = (vector unsigned int)vec_sums((vector signed int)vsum,(vector signed int)vzero); /* total escape count lands in the last lane */
3649 + vec_st(vsum,0,tmp2);
3650 + sum += tmp2[3] * linbits;
3651 +
3652 + while (ix < end) { /* scalar tail for the pairs left after the vector loop */
3653 + int x = *ix++;
3654 + int y = *ix++;
3655 +
3656 + if (x != 0) {
3657 + if (x > 14) {
3658 + x = 15;
3659 + sum += linbits;
3660 + }
3661 + x *= 16;
3662 + }
3663 +
3664 + if (y != 0) {
3665 + if (y > 14) {
3666 + y = 15;
3667 + sum += linbits;
3668 + }
3669 + x += y;
3670 + }
3671 +
3672 + sum += largetbl[x];
3673 + }
3674 + goto end;
3675 +
3676 + normal: /* all-scalar path for inputs shorter than 32 ints */
3677 +
3678 + do {
3679 + int x = *ix++;
3680 + int y = *ix++;
3681 +
3682 + if (x != 0) {
3683 + if (x > 14) {
3684 + x = 15;
3685 + sum += linbits;
3686 + }
3687 + x *= 16;
3688 + }
3689 +
3690 + if (y != 0) {
3691 + if (y > 14) {
3692 + y = 15;
3693 + sum += linbits;
3694 + }
3695 + x += y;
3696 + }
3697 +
3698 + sum += largetbl[x];
3699 + } while (ix < end);
3700 +
3701 + end:
3702 +
3703 + sum2 = sum & 0xffff; /* low half: count accumulated for t2 */
3704 + sum >>= 16; /* high half: count accumulated for t1 */
3705 +
3706 + if (sum > sum2) { /* t2 is cheaper */
3707 + sum = sum2;
3708 + t1 = t2;
3709 + }
3710 +
3711 + *s += sum;
3712 + return t1;
3713 +}
3714 +#endif
3715 +
3716 +inline static int /* Adds to *s the smaller of two candidate bit counts for [ix, end) and returns t1, or t1 + 1 when the second candidate is smaller */
3717 +count_bit_noESC_from2_altivec1(const int *ix, const int *const end, int t1, int *const s)
3718 +{
3719 + /* No ESC-words */
3720 + unsigned int sum = 0;
3721 + int sum1, sum2;
3722 + const int xlen = 3;
3723 + const unsigned int *hlen = table23;
3724 + vector signed int v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16;
3725 + vector signed int vx1,vx2,vx3,vx4,vx5,vx6,vxlen,vzero,vsum1,vsum2;
3726 + vector unsigned char vmask,vperm1,vperm2,vx;
3727 + vector unsigned char vhlen1,vhlen2;
3728 + vector signed char vs1,vs2;
3729 +
3730 + vhlen1 = (vector unsigned char)VINIT16(1,4,7,4,5,7,6,7,8,0,0,0,0,0,0,0); /* per-index lengths used by the vector path in place of hlen[]. NOTE(review): expected to match the high 16 bits of table23 entries; confirm */
3731 + vhlen2 = (vector unsigned char)VINIT16(2,3,7,4,4,7,6,7,8,0,0,0,0,0,0,0); /* NOTE(review): expected to match the low 16 bits of table23 entries; confirm */
3732 + vperm1 = (vector unsigned char)VINIT16(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); /* picks words 0 and 2 of each input: the ix[0] of every pair */
3733 + vperm2 = (vector unsigned char)VINIT16(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); /* picks words 1 and 3 of each input: the ix[1] of every pair */
3734 + vxlen = vec_splat_s32(3); /* same multiplier as xlen */
3735 + vzero = vec_splat_s32(0); /* explicit zero, set without reading vzero first */
3736 + vsum1 = vec_splat_s32(0); /* accumulators start at 0, set without reading them first */
3737 + vsum2 = vec_splat_s32(0);
3738 +
3739 + if((int)(end - ix) < 32) goto normal; /* fewer than 32 ints: scalar loop only */
3740 + v0 = vec_ld(0,ix);
3741 + vmask = vec_lvsl(0,ix); /* permute that realigns adjacent blocks when ix is not 16-byte aligned */
3742 + do { /* 32 ints, i.e. 16 pairs, per iteration */
3743 + v1 = vec_ld(16,ix);
3744 + v2 = vec_ld(32,ix);
3745 + v3 = vec_ld(48,ix);
3746 + v4 = vec_ld(64,ix);
3747 + v5 = vec_ld(80,ix);
3748 + v6 = vec_ld(96,ix);
3749 + v7 = vec_ld(112,ix);
3750 + v8 = vec_ld(128,ix);
3751 + v9 = vec_perm(v0,v1,vmask);
3752 + v10 = vec_perm(v1,v2,vmask);
3753 + v11 = vec_perm(v2,v3,vmask);
3754 + v12 = vec_perm(v3,v4,vmask);
3755 + v13 = vec_perm(v4,v5,vmask);
3756 + v14 = vec_perm(v5,v6,vmask);
3757 + v15 = vec_perm(v6,v7,vmask);
3758 + v16 = vec_perm(v7,v8,vmask);
3759 + v0 = v8; /* carry the last loaded block into the next iteration */
3760 + v1 = vec_perm(v9,v10,vperm1); /* odd-numbered results hold first-of-pair values, even-numbered hold second-of-pair values */
3761 + v2 = vec_perm(v9,v10,vperm2);
3762 + v3 = vec_perm(v11,v12,vperm1);
3763 + v4 = vec_perm(v11,v12,vperm2);
3764 + v5 = vec_perm(v13,v14,vperm1);
3765 + v6 = vec_perm(v13,v14,vperm2);
3766 + v7 = vec_perm(v15,v16,vperm1);
3767 + v8 = vec_perm(v15,v16,vperm2);
3768 + vx1 = (vector signed int)vec_mladd((vector unsigned short)v1,(vector unsigned short)vxlen,(vector unsigned short)v2); /* vector form of ix[0] * xlen + ix[1], computed on 16-bit halves */
3769 + vx2 = (vector signed int)vec_mladd((vector unsigned short)v3,(vector unsigned short)vxlen,(vector unsigned short)v4);
3770 + vx3 = (vector signed int)vec_pack(vx1,vx2);
3771 + vx4 = (vector signed int)vec_mladd((vector unsigned short)v5,(vector unsigned short)vxlen,(vector unsigned short)v6);
3772 + vx5 = (vector signed int)vec_mladd((vector unsigned short)v7,(vector unsigned short)vxlen,(vector unsigned short)v8);
3773 + vx6 = (vector signed int)vec_pack(vx4,vx5);
3774 + vx = (vector unsigned char)vec_pack((vector unsigned short)vx3,(vector unsigned short)vx6); /* 16 byte-sized indices */
3775 +
3776 + vs1 = (vector signed char)vec_perm(vhlen1,vhlen1,vx); /* 16 lookups at once, using the indices as a permute control */
3777 + vs2 = (vector signed char)vec_perm(vhlen2,vhlen2,vx);
3778 +
3779 + vsum1 = vec_sum4s(vs1,vsum1);
3780 + vsum2 = vec_sum4s(vs2,vsum2);
3781 +
3782 + ix += 32;
3783 + } while(ix < end-31); /* continue while at least 32 ints remain */
3784 +
3785 + vsum1 = vec_sums(vsum1,vzero); /* each total lands in the last lane */
3786 + vsum2 = vec_sums(vsum2,vzero);
3787 +
3788 + vsum1 = vec_perm(vsum1,vsum1,vec_lvsr(4,&sum1)); /* rotate the last lane to the lane selected by the destination address, which is the one vec_ste stores */
3789 + vsum2 = vec_perm(vsum2,vsum2,vec_lvsr(4,&sum2));
3790 + vec_ste(vsum1,0,&sum1);
3791 + vec_ste(vsum2,0,&sum2);
3792 +
3793 + while (ix < end) { /* scalar tail for the pairs left after the vector loop */
3794 + int const x = ix[0] * xlen + ix[1];
3795 + ix += 2;
3796 + sum += hlen[x];
3797 + }
3798 +
3799 + sum2 += sum & 0xffff; /* low half of the scalar sum joins the vhlen2 total */
3800 + sum = (sum>>16) + sum1; /* high half of the scalar sum joins the vhlen1 total */
3801 +
3802 + goto end;
3803 +
3804 + normal: /* all-scalar path; sum1 is not used here */
3805 +
3806 + do {
3807 + int const x = ix[0] * xlen + ix[1];
3808 + ix += 2;
3809 + sum += hlen[x];
3810 + } while (ix < end);
3811 +
3812 + sum2 = sum & 0xffff;
3813 + sum >>= 16;
3814 +
3815 + end:
3816 +
3817 + if (sum > sum2) { /* second candidate is cheaper: report table t1 + 1 */
3818 + sum = sum2;
3819 + t1++;
3820 + }
3821 +
3822 + *s += sum;
3823 + return t1;
3824 +}
3825 +
3826 +inline static int /* Adds to *s the smaller of two candidate bit counts for [ix, end) and returns t1, or t1 + 1 when the second candidate is smaller */
3827 +count_bit_noESC_from2_altivec2(const int *ix, const int *const end, int t1, int *const s)
3828 +{
3829 + /* No ESC-words */
3830 + unsigned int sum = 0;
3831 + int sum1, sum2;
3832 + const int xlen = 4;
3833 + const unsigned int *hlen = table56;
3834 + vector signed int v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16;
3835 + vector signed int vx1,vx2,vx3,vx4,vx5,vx6,vxlen,vzero,vsum1,vsum2;
3836 + vector unsigned char vmask,vperm1,vperm2,vx;
3837 + vector unsigned char vhlen1,vhlen2;
3838 + vector signed char vs1,vs2;
3839 +
3840 + vhlen1 = (vector unsigned char)VINIT16(1,4,7,8,4,5,8,9,7,8,9,10,8,8,9,10); /* per-index lengths used by the vector path in place of hlen[]. NOTE(review): expected to match the high 16 bits of table56 entries; confirm */
3841 + vhlen2 = (vector unsigned char)VINIT16(3,4,6,8,4,4,6,7,5,6,7,8,7,7,8,9); /* NOTE(review): expected to match the low 16 bits of table56 entries; confirm */
3842 + vperm1 = (vector unsigned char)VINIT16(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); /* picks words 0 and 2 of each input: the ix[0] of every pair */
3843 + vperm2 = (vector unsigned char)VINIT16(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); /* picks words 1 and 3 of each input: the ix[1] of every pair */
3844 + vxlen = vec_splat_s32(4); /* same multiplier as xlen */
3845 + vzero = vec_splat_s32(0); /* explicit zero, set without reading vzero first */
3846 + vsum1 = vec_splat_s32(0); /* accumulators start at 0, set without reading them first */
3847 + vsum2 = vec_splat_s32(0);
3848 +
3849 + if((int)(end - ix) < 32) goto normal; /* fewer than 32 ints: scalar loop only */
3850 + v0 = vec_ld(0,ix);
3851 + vmask = vec_lvsl(0,ix); /* permute that realigns adjacent blocks when ix is not 16-byte aligned */
3852 + do { /* 32 ints, i.e. 16 pairs, per iteration */
3853 + v1 = vec_ld(16,ix);
3854 + v2 = vec_ld(32,ix);
3855 + v3 = vec_ld(48,ix);
3856 + v4 = vec_ld(64,ix);
3857 + v5 = vec_ld(80,ix);
3858 + v6 = vec_ld(96,ix);
3859 + v7 = vec_ld(112,ix);
3860 + v8 = vec_ld(128,ix);
3861 + v9 = vec_perm(v0,v1,vmask);
3862 + v10 = vec_perm(v1,v2,vmask);
3863 + v11 = vec_perm(v2,v3,vmask);
3864 + v12 = vec_perm(v3,v4,vmask);
3865 + v13 = vec_perm(v4,v5,vmask);
3866 + v14 = vec_perm(v5,v6,vmask);
3867 + v15 = vec_perm(v6,v7,vmask);
3868 + v16 = vec_perm(v7,v8,vmask);
3869 + v0 = v8; /* carry the last loaded block into the next iteration */
3870 + v1 = vec_perm(v9,v10,vperm1); /* odd-numbered results hold first-of-pair values, even-numbered hold second-of-pair values */
3871 + v2 = vec_perm(v9,v10,vperm2);
3872 + v3 = vec_perm(v11,v12,vperm1);
3873 + v4 = vec_perm(v11,v12,vperm2);
3874 + v5 = vec_perm(v13,v14,vperm1);
3875 + v6 = vec_perm(v13,v14,vperm2);
3876 + v7 = vec_perm(v15,v16,vperm1);
3877 + v8 = vec_perm(v15,v16,vperm2);
3878 +
3879 + vx1 = (vector signed int)vec_mladd((vector unsigned short)v1,(vector unsigned short)vxlen,(vector unsigned short)v2); /* vector form of ix[0] * xlen + ix[1], computed on 16-bit halves */
3880 + vx2 = (vector signed int)vec_mladd((vector unsigned short)v3,(vector unsigned short)vxlen,(vector unsigned short)v4);
3881 + vx3 = (vector signed int)vec_pack(vx1,vx2);
3882 + vx4 = (vector signed int)vec_mladd((vector unsigned short)v5,(vector unsigned short)vxlen,(vector unsigned short)v6);
3883 + vx5 = (vector signed int)vec_mladd((vector unsigned short)v7,(vector unsigned short)vxlen,(vector unsigned short)v8);
3884 + vx6 = (vector signed int)vec_pack(vx4,vx5);
3885 + vx = (vector unsigned char)vec_pack((vector unsigned short)vx3,(vector unsigned short)vx6); /* 16 byte-sized indices */
3886 +
3887 + vs1 = (vector signed char)vec_perm(vhlen1,vhlen1,vx); /* 16 lookups at once, using the indices as a permute control */
3888 + vs2 = (vector signed char)vec_perm(vhlen2,vhlen2,vx);
3889 +
3890 + vsum1 = vec_sum4s(vs1,vsum1);
3891 + vsum2 = vec_sum4s(vs2,vsum2);
3892 +
3893 + ix += 32;
3894 + } while(ix < end-31); /* continue while at least 32 ints remain */
3895 +
3896 + vsum1 = vec_sums(vsum1,vzero); /* each total lands in the last lane */
3897 + vsum2 = vec_sums(vsum2,vzero);
3898 +
3899 + vsum1 = vec_perm(vsum1,vsum1,vec_lvsr(4,&sum1)); /* rotate the last lane to the lane selected by the destination address, which is the one vec_ste stores */
3900 + vsum2 = vec_perm(vsum2,vsum2,vec_lvsr(4,&sum2));
3901 + vec_ste(vsum1,0,&sum1);
3902 + vec_ste(vsum2,0,&sum2);
3903 +
3904 + while (ix < end) { /* scalar tail for the pairs left after the vector loop */
3905 + int const x = ix[0] * xlen + ix[1];
3906 + ix += 2;
3907 + sum += hlen[x];
3908 + }
3909 +
3910 + sum2 += sum & 0xffff; /* low half of the scalar sum joins the vhlen2 total */
3911 + sum = (sum>>16) + sum1; /* high half of the scalar sum joins the vhlen1 total */
3912 +
3913 + goto end;
3914 +
3915 + normal: /* all-scalar path; sum1 is not used here */
3916 +
3917 + do {
3918 + int const x = ix[0] * xlen + ix[1];
3919 + ix += 2;
3920 + sum += hlen[x];
3921 + } while (ix < end);
3922 +
3923 + sum2 = sum & 0xffff;
3924 + sum >>= 16;
3925 +
3926 + end:
3927 +
3928 + if (sum > sum2) { /* second candidate is cheaper: report table t1 + 1 */
3929 + sum = sum2;
3930 + t1++;
3931 + }
3932 +
3933 + *s += sum;
3934 + return t1;
3935 +}
3936 +
3937 +inline static int
3938 +count_bit_noESC_from3_altivec1(const int *ix, const int *const end, int t1, int *const s)
3939 +{ /* For every pair (ix[0],ix[1]) in [ix,end) sums the code length hlen[ix[0]*xlen+ix[1]] under ht[7], ht[8] and ht[9]; adds the smallest total to *s and returns t1, t1+1 or t1+2 for the table that produced it (ties keep the lower index). */
3940 + /* No ESC-words */
3941 + int sum1 = 0;
3942 + int sum2 = 0;
3943 + int sum3 = 0;
3944 + const int xlen = 6;
3945 + const char *hlen1 = ht[7].hlen;
3946 + const char *hlen2 = ht[8].hlen;
3947 + const char *hlen3 = ht[9].hlen;
3948 + int t;
3949 + vector signed int v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16;
3950 + vector signed int vx1,vx2,vx3,vx4,vx5,vx6,vxlen,vzero,vsum1,vsum2,vsum3;
3951 + vector unsigned char vmask,vperm1,vperm2,vx,v31;
3952 + vector unsigned char vhlen11,vhlen12,vhlen13,vhlen21,vhlen22,vhlen23,vhlen31,vhlen32,vhlen33;
3953 + vector signed char vs1,vs2,vs3;
3954 +
3955 + vhlen11 = (vector unsigned char)VINIT16(1,4,7,9,9,10,4,6,8,9,9,10,7,7,9,10); /* vhlenNM = 16-byte slice M of the length table N used by the vector path; NOTE(review): expected to mirror ht[7..9].hlen - confirm against the ht[] definition */
3956 + vhlen12 = (vector unsigned char)VINIT16(10,11,8,9,10,11,11,11,8,9,10,11,11,12,9,10);
3957 + vhlen13 = (vector unsigned char)VINIT16(11,12,12,12,0,0,0,0,0,0,0,0,0,0,0,0); /* only the first 4 bytes are reachable: indices 32..35 */
3958 + vhlen21 = (vector unsigned char)VINIT16(2,4,7,9,9,10,4,4,6,10,10,10,7,6,8,10);
3959 + vhlen22 = (vector unsigned char)VINIT16(10,11,9,10,10,11,11,12,9,9,10,11,12,12,10,10);
3960 + vhlen23 = (vector unsigned char)VINIT16(11,11,13,13,0,0,0,0,0,0,0,0,0,0,0,0);
3961 + vhlen31 = (vector unsigned char)VINIT16(3,4,6,7,9,10,4,5,6,7,8,10,5,6,7,8);
3962 + vhlen32 = (vector unsigned char)VINIT16(9,10,7,7,8,9,9,10,8,8,9,9,10,11,9,9);
3963 + vhlen33 = (vector unsigned char)VINIT16(10,10,11,11,0,0,0,0,0,0,0,0,0,0,0,0);
3964 + vperm1 = (vector unsigned char)VINIT16(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); /* gathers words 0 and 2 of both operands: the first value of each pair */
3965 + vperm2 = (vector unsigned char)VINIT16(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); /* gathers words 1 and 3 of both operands: the second value of each pair */
3966 + v31 = (vector unsigned char)VINIT16ALL(31);
3967 + vxlen = vec_splat_s32(6); /* same value as xlen, one copy per 32-bit lane */
3968 + vzero = vec_splat_s32(0); /* explicit zero instead of xor-ing an uninitialised vector with itself */
3969 + vsum1 = vec_splat_s32(0);
3970 + vsum2 = vec_splat_s32(0);
3971 + vsum3 = vec_splat_s32(0);
3972 +
3973 + if((int)(end - ix) < 32) goto normal; /* vector loop consumes 32 ints (16 pairs) per pass */
3974 + /* v0 holds the previously loaded quadword; vmask realigns data when ix is not 16-byte aligned */
3975 + v0 = vec_ld(0,ix);
3976 + vmask = vec_lvsl(0,ix);
3977 + do {
3978 + v1 = vec_ld(16,ix);
3979 + v2 = vec_ld(32,ix);
3980 + v3 = vec_ld(48,ix);
3981 + v4 = vec_ld(64,ix);
3982 + v5 = vec_ld(80,ix);
3983 + v6 = vec_ld(96,ix);
3984 + v7 = vec_ld(112,ix);
3985 + v8 = vec_ld(128,ix); /* NOTE(review): this quadword starts at ix[32], past the 32 ints consumed in this pass; on the last pass it may lie beyond end - confirm the buffer is padded */
3986 + v9 = vec_perm(v0,v1,vmask);
3987 + v10 = vec_perm(v1,v2,vmask);
3988 + v11 = vec_perm(v2,v3,vmask);
3989 + v12 = vec_perm(v3,v4,vmask);
3990 + v13 = vec_perm(v4,v5,vmask);
3991 + v14 = vec_perm(v5,v6,vmask);
3992 + v15 = vec_perm(v6,v7,vmask);
3993 + v16 = vec_perm(v7,v8,vmask);
3994 + v0 = v8; /* carry the last quadword into the next pass */
3995 + v1 = vec_perm(v9,v10,vperm1);
3996 + v2 = vec_perm(v9,v10,vperm2);
3997 + v3 = vec_perm(v11,v12,vperm1);
3998 + v4 = vec_perm(v11,v12,vperm2);
3999 + v5 = vec_perm(v13,v14,vperm1);
4000 + v6 = vec_perm(v13,v14,vperm2);
4001 + v7 = vec_perm(v15,v16,vperm1);
4002 + v8 = vec_perm(v15,v16,vperm2);
4003 + vx1 = (vector signed int)vec_mladd((vector unsigned short)v1,(vector unsigned short)vxlen,(vector unsigned short)v2); /* first*xlen + second, computed on 16-bit lanes; NOTE(review): relies on each value sitting in the low halfword of its int (big-endian lane order) - confirm */
4004 + vx2 = (vector signed int)vec_mladd((vector unsigned short)v3,(vector unsigned short)vxlen,(vector unsigned short)v4);
4005 + vx3 = (vector signed int)vec_pack(vx1,vx2);
4006 + vx4 = (vector signed int)vec_mladd((vector unsigned short)v5,(vector unsigned short)vxlen,(vector unsigned short)v6);
4007 + vx5 = (vector signed int)vec_mladd((vector unsigned short)v7,(vector unsigned short)vxlen,(vector unsigned short)v8);
4008 + vx6 = (vector signed int)vec_pack(vx4,vx5);
4009 + vx = (vector unsigned char)vec_pack((vector unsigned short)vx3,(vector unsigned short)vx6); /* 16 table indices, one per byte */
4010 +
4011 + v1 = (vector signed int)vec_perm(vhlen11,vhlen12,vx); /* lookup valid for indices 0..31 */
4012 + v2 = (vector signed int)vec_perm(vhlen13,vhlen13,vx); /* lookup valid for indices above 31 */
4013 + v3 = (vector signed int)vec_perm(vhlen21,vhlen22,vx);
4014 + v4 = (vector signed int)vec_perm(vhlen23,vhlen23,vx);
4015 + v5 = (vector signed int)vec_perm(vhlen31,vhlen32,vx);
4016 + v6 = (vector signed int)vec_perm(vhlen33,vhlen33,vx);
4017 + v7 = (vector signed int)vec_cmpgt(vx,v31); /* all-ones bytes where index > 31 */
4018 + vs1 = (vector signed char)vec_sel(v1,v2,(vector unsigned int)v7);
4019 + vs2 = (vector signed char)vec_sel(v3,v4,(vector unsigned int)v7);
4020 + vs3 = (vector signed char)vec_sel(v5,v6,(vector unsigned int)v7);
4021 +
4022 + vsum1 = vec_sum4s(vs1,vsum1); /* accumulate the 16 byte lengths into four 32-bit partial sums */
4023 + vsum2 = vec_sum4s(vs2,vsum2);
4024 + vsum3 = vec_sum4s(vs3,vsum3);
4025 +
4026 + ix += 32;
4027 + } while(ix < end-31); /* repeat while at least 32 ints remain */
4028 +
4029 + vsum1 = vec_sums(vsum1,vzero); /* horizontal add: total ends up in the last 32-bit element */
4030 + vsum2 = vec_sums(vsum2,vzero);
4031 + vsum3 = vec_sums(vsum3,vzero);
4032 +
4033 + vsum1 = vec_perm(vsum1,vsum1,vec_lvsr(4,&sum1)); /* rotate the total into the lane that vec_ste writes for this address */
4034 + vsum2 = vec_perm(vsum2,vsum2,vec_lvsr(4,&sum2));
4035 + vsum3 = vec_perm(vsum3,vsum3,vec_lvsr(4,&sum3));
4036 + vec_ste(vsum1,0,&sum1);
4037 + vec_ste(vsum2,0,&sum2);
4038 + vec_ste(vsum3,0,&sum3);
4039 +
4040 + while (ix < end) { /* scalar tail: fewer than 32 ints left */
4041 + int x = ix[0] * xlen + ix[1];
4042 + ix += 2;
4043 + sum1 += hlen1[x];
4044 + sum2 += hlen2[x];
4045 + sum3 += hlen3[x];
4046 + }
4047 + goto end;
4048 +
4049 + normal:
4050 +
4051 + do { /* scalar-only path; the body runs before the test, so ix < end is assumed on entry */
4052 + int x = ix[0] * xlen + ix[1];
4053 + ix += 2;
4054 + sum1 += hlen1[x];
4055 + sum2 += hlen2[x];
4056 + sum3 += hlen3[x];
4057 + } while (ix < end);
4058 +
4059 + end:
4060 +
4061 + t = t1;
4062 + if (sum1 > sum2) {
4063 + sum1 = sum2;
4064 + t++;
4065 + }
4066 + if (sum1 > sum3) {
4067 + sum1 = sum3;
4068 + t = t1+2;
4069 + }
4070 + *s += sum1;
4071 +
4072 + return t;
4073 +}
4074 +
4075 +inline static int
4076 +count_bit_noESC_from3_altivec2(
4077 + const int * ix,
4078 + const int * const end,
4079 + int t1,
4080 + int * const s )
4081 +{ /* For every pair (ix[0],ix[1]) in [ix,end) sums the code length hlen[ix[0]*xlen+ix[1]] under ht[10], ht[11] and ht[12]; adds the smallest total to *s and returns t1, t1+1 or t1+2 for the table that produced it (ties keep the lower index). */
4082 + /* No ESC-words */
4083 + int sum1 = 0;
4084 + int sum2 = 0;
4085 + int sum3 = 0;
4086 + const int xlen = 8;
4087 + const char *hlen1 = ht[10].hlen;
4088 + const char *hlen2 = ht[11].hlen;
4089 + const char *hlen3 = ht[12].hlen;
4090 + int t;
4091 + vector signed int v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16;
4092 + vector signed int vx1,vx2,vx3,vx4,vx5,vx6,vxlen,vzero,vsum1,vsum2,vsum3;
4093 + vector unsigned char vmask,vperm1,vperm2,vx,v31;
4094 + vector unsigned char vhlen11,vhlen12,vhlen13,vhlen14,vhlen21,vhlen22,vhlen23,vhlen24,vhlen31,vhlen32,vhlen33,vhlen34;
4095 + vector signed char vs1,vs2,vs3;
4096 +
4097 + vhlen11 = (vector unsigned char)VINIT16( 1, 4, 7, 9, 10, 10, 10, 11, 4, 6, 8, 9, 10, 11, 10, 10); /* vhlenNM = 16-byte slice M of the length table N used by the vector path; NOTE(review): expected to mirror ht[10..12].hlen - confirm against the ht[] definition */
4098 + vhlen12 = (vector unsigned char)VINIT16( 7, 8, 9, 10, 11, 12, 11, 11, 8, 9, 10, 11, 12, 12, 11, 12);
4099 + vhlen13 = (vector unsigned char)VINIT16( 9, 10, 11, 12, 12, 12, 12, 12,10, 11, 12, 12, 13, 13, 12, 13);
4100 + vhlen14 = (vector unsigned char)VINIT16( 9, 10, 11, 12, 12, 12, 13, 13,10, 10, 11, 12, 12, 13, 13, 13);
4101 + vhlen21 = (vector unsigned char)VINIT16( 2, 4, 6, 8, 9, 10, 9, 10, 4, 5, 6, 8, 10, 10, 9, 10);
4102 + vhlen22 = (vector unsigned char)VINIT16( 6, 7, 8, 9, 10, 11, 10, 10, 8, 8, 9, 11, 10, 12, 10, 11);
4103 + vhlen23 = (vector unsigned char)VINIT16( 9, 10, 10, 11, 11, 12, 11, 12, 9, 10, 11, 12, 12, 13, 12, 13);
4104 + vhlen24 = (vector unsigned char)VINIT16( 9, 9, 9, 10, 11, 12, 12, 12, 9, 9, 10, 11, 12, 12, 12, 12);
4105 + vhlen31 = (vector unsigned char)VINIT16( 4, 4, 6, 8, 9, 10, 10, 10, 4, 5, 6, 7, 9, 9, 10, 10);
4106 + vhlen32 = (vector unsigned char)VINIT16( 6, 6, 7, 8, 9, 10, 9, 10, 7, 7, 8, 8, 9, 10, 10, 10);
4107 + vhlen33 = (vector unsigned char)VINIT16( 8, 8, 9, 9, 10, 10, 10, 11, 9, 9, 10, 10, 10, 11, 10, 11);
4108 + vhlen34 = (vector unsigned char)VINIT16( 9, 9, 9, 10, 10, 11, 11, 12,10, 10, 10, 11, 11, 11, 11, 12);
4109 + vperm1 = (vector unsigned char)VINIT16(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); /* gathers words 0 and 2 of both operands: the first value of each pair */
4110 + vperm2 = (vector unsigned char)VINIT16(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); /* gathers words 1 and 3 of both operands: the second value of each pair */
4111 + v31 = (vector unsigned char)VINIT16ALL(31);
4112 + vxlen = vec_splat_s32(8); /* same value as xlen, one copy per 32-bit lane */
4113 + vzero = vec_splat_s32(0); /* explicit zero instead of xor-ing an uninitialised vector with itself */
4114 + vsum1 = vec_splat_s32(0);
4115 + vsum2 = vec_splat_s32(0);
4116 + vsum3 = vec_splat_s32(0);
4117 +
4118 + if((int)(end - ix) < 32) goto normal; /* vector loop consumes 32 ints (16 pairs) per pass */
4119 + /* v0 holds the previously loaded quadword; vmask realigns data when ix is not 16-byte aligned */
4120 + v0 = vec_ld(0,ix);
4121 + vmask = vec_lvsl(0,ix);
4122 + do {
4123 + v1 = vec_ld(16,ix);
4124 + v2 = vec_ld(32,ix);
4125 + v3 = vec_ld(48,ix);
4126 + v4 = vec_ld(64,ix);
4127 + v5 = vec_ld(80,ix);
4128 + v6 = vec_ld(96,ix);
4129 + v7 = vec_ld(112,ix);
4130 + v8 = vec_ld(128,ix); /* NOTE(review): this quadword starts at ix[32], past the 32 ints consumed in this pass; on the last pass it may lie beyond end - confirm the buffer is padded */
4131 + v9 = vec_perm(v0,v1,vmask);
4132 + v10 = vec_perm(v1,v2,vmask);
4133 + v11 = vec_perm(v2,v3,vmask);
4134 + v12 = vec_perm(v3,v4,vmask);
4135 + v13 = vec_perm(v4,v5,vmask);
4136 + v14 = vec_perm(v5,v6,vmask);
4137 + v15 = vec_perm(v6,v7,vmask);
4138 + v16 = vec_perm(v7,v8,vmask);
4139 + v0 = v8; /* carry the last quadword into the next pass */
4140 + v1 = vec_perm(v9,v10,vperm1);
4141 + v2 = vec_perm(v9,v10,vperm2);
4142 + v3 = vec_perm(v11,v12,vperm1);
4143 + v4 = vec_perm(v11,v12,vperm2);
4144 + v5 = vec_perm(v13,v14,vperm1);
4145 + v6 = vec_perm(v13,v14,vperm2);
4146 + v7 = vec_perm(v15,v16,vperm1);
4147 + v8 = vec_perm(v15,v16,vperm2);
4148 +
4149 + vx1 = (vector signed int)vec_mladd((vector unsigned short)v1,(vector unsigned short)vxlen,(vector unsigned short)v2); /* first*xlen + second, computed on 16-bit lanes; NOTE(review): relies on each value sitting in the low halfword of its int (big-endian lane order) - confirm */
4150 + vx2 = (vector signed int)vec_mladd((vector unsigned short)v3,(vector unsigned short)vxlen,(vector unsigned short)v4);
4151 + vx3 = (vector signed int)vec_pack(vx1,vx2);
4152 + vx4 = (vector signed int)vec_mladd((vector unsigned short)v5,(vector unsigned short)vxlen,(vector unsigned short)v6);
4153 + vx5 = (vector signed int)vec_mladd((vector unsigned short)v7,(vector unsigned short)vxlen,(vector unsigned short)v8);
4154 + vx6 = (vector signed int)vec_pack(vx4,vx5);
4155 + vx = (vector unsigned char)vec_pack((vector unsigned short)vx3,(vector unsigned short)vx6); /* 16 table indices, one per byte */
4156 +
4157 + v1 = (vector signed int)vec_perm(vhlen11,vhlen12,vx); /* lookup valid for indices 0..31 */
4158 + v2 = (vector signed int)vec_perm(vhlen13,vhlen14,vx); /* lookup valid for indices above 31 */
4159 + v3 = (vector signed int)vec_perm(vhlen21,vhlen22,vx);
4160 + v4 = (vector signed int)vec_perm(vhlen23,vhlen24,vx);
4161 + v5 = (vector signed int)vec_perm(vhlen31,vhlen32,vx);
4162 + v6 = (vector signed int)vec_perm(vhlen33,vhlen34,vx);
4163 + v7 = (vector signed int)vec_cmpgt(vx,v31); /* all-ones bytes where index > 31 */
4164 + vs1 = (vector signed char)vec_sel(v1,v2,(vector unsigned int)v7);
4165 + vs2 = (vector signed char)vec_sel(v3,v4,(vector unsigned int)v7);
4166 + vs3 = (vector signed char)vec_sel(v5,v6,(vector unsigned int)v7);
4167 +
4168 + vsum1 = vec_sum4s(vs1,vsum1); /* accumulate the 16 byte lengths into four 32-bit partial sums */
4169 + vsum2 = vec_sum4s(vs2,vsum2);
4170 + vsum3 = vec_sum4s(vs3,vsum3);
4171 +
4172 + ix += 32;
4173 + } while(ix < end-31); /* repeat while at least 32 ints remain */
4174 +
4175 + vsum1 = vec_sums(vsum1,vzero); /* horizontal add: total ends up in the last 32-bit element */
4176 + vsum2 = vec_sums(vsum2,vzero);
4177 + vsum3 = vec_sums(vsum3,vzero);
4178 +
4179 + vsum1 = vec_perm(vsum1,vsum1,vec_lvsr(4,&sum1)); /* rotate the total into the lane that vec_ste writes for this address */
4180 + vsum2 = vec_perm(vsum2,vsum2,vec_lvsr(4,&sum2));
4181 + vsum3 = vec_perm(vsum3,vsum3,vec_lvsr(4,&sum3));
4182 + vec_ste(vsum1,0,&sum1);
4183 + vec_ste(vsum2,0,&sum2);
4184 + vec_ste(vsum3,0,&sum3);
4185 +
4186 + while (ix < end) { /* scalar tail: fewer than 32 ints left */
4187 + int x = ix[0] * xlen + ix[1];
4188 + ix += 2;
4189 + sum1 += hlen1[x];
4190 + sum2 += hlen2[x];
4191 + sum3 += hlen3[x];
4192 + }
4193 + goto end;
4194 +
4195 + normal:
4196 +
4197 + do { /* scalar-only path; the body runs before the test, so ix < end is assumed on entry */
4198 + int x = ix[0] * xlen + ix[1];
4199 + ix += 2;
4200 + sum1 += hlen1[x];
4201 + sum2 += hlen2[x];
4202 + sum3 += hlen3[x];
4203 + } while (ix < end);
4204 +
4205 + end:
4206 +
4207 + t = t1;
4208 + if (sum1 > sum2) {
4209 + sum1 = sum2;
4210 + t++;
4211 + }
4212 + if (sum1 > sum3) {
4213 + sum1 = sum3;
4214 + t = t1+2;
4215 + }
4216 + *s += sum1;
4217 +
4218 + return t;
4219 +}
4220 +#endif
4221 +
4222
4223 /*************************************************************************/
4224 /* choose table */
4225 @@ -592,7 +1464,11 @@
4226 1, 2, 5, 7, 7, 10, 10, 13, 13, 13, 13, 13, 13, 13, 13
4227 };
4228
4229 +#ifdef ALTIVEC
4230 + max = ix_max_vec(ix, end);
4231 +#else
4232 max = ix_max(ix, end);
4233 +#endif
4234
4235 switch (max) {
4236 case 0:
4237 @@ -602,13 +1478,26 @@
4238 return count_bit_noESC(ix, end, s);
4239
4240 case 2:
4241 +#ifdef ALTIVEC
4242 + return count_bit_noESC_from2_altivec1(ix, end, 2, s);
4243 +#endif
4244 case 3:
4245 +#ifdef ALTIVEC
4246 + return count_bit_noESC_from2_altivec2(ix, end, 5, s);
4247 +#else
4248 return count_bit_noESC_from2(ix, end, huf_tbl_noESC[max - 1], s);
4249 +#endif
4250
4251 case 4:
4252 case 5:
4253 +#ifdef ALTIVEC
4254 + return count_bit_noESC_from3_altivec1(ix, end, 7, s);
4255 +#endif
4256 case 6:
4257 case 7:
4258 +#ifdef ALTIVEC
4259 + return count_bit_noESC_from3_altivec2(ix, end, 10, s);
4260 +#endif
4261 case 8:
4262 case 9:
4263 case 10:
4264 @@ -637,7 +1526,11 @@
4265 break;
4266 }
4267 }
4268 +#if defined(ALTIVEC) && defined(ALTIVEC_970)
4269 + return count_bit_ESC_altivec(ix, end, choice, choice2, s);
4270 +#else
4271 return count_bit_ESC(ix, end, choice, choice2, s);
4272 +#endif
4273 }
4274 }
4275
4276 --- libmp3lame/util.c.orig 2009-04-01 07:37:27.000000000 +0900
4277 +++ libmp3lame/util.c 2010-03-01 14:05:00.000000000 +0900
4278 @@ -26,6 +26,12 @@
4279 # include <config.h>
4280 #endif
4281
4282 +#if defined(ALTIVEC) && !defined(ALTIVEC_970)
4283 +#ifndef __APPLE_CC__
4284 +#include <altivec.h>
4285 +#endif
4286 +#endif
4287 +
4288 #include "lame.h"
4289 #include "machine.h"
4290 #include "encoder.h"
4291 @@ -928,6 +934,108 @@
4292 *
4293 ***********************************************************************/
4294
4295 +#if defined(ALTIVEC) && !defined(ALTIVEC_970)
4296 +
4297 +inline ieee754_float32_t fast_log10_altivec(ieee754_float32_t x)
4298 +{ /* Approximates log10(x) as e*log10(2) + log10(2)/2 + z*(va + z^2*(vb + z^2*vc)), with e the unbiased exponent of x, m its mantissa rebuilt in [1,2) and z = (m-sqrt2)/(m+sqrt2). Assumes x is a positive normalised float; zero, negative, denormal, NaN and Inf inputs get no special handling. */
4299 + vector float va,vb,vc,vhalf,vzero,vsqrt2,vconst4;
4300 + vector float v1,v2,v3,v4,v5,v6,v7,v8,vz,vz2,vlog;
4301 + vector unsigned int vconst1,vconst2,vshamt;
4302 + vector signed int vconst3;
4303 + float out __attribute__ ((aligned (16))); /* 16-byte aligned so that vec_ste at &out writes vector element 0 */
4304 +
4305 + va = (vector float)VINIT4ALL(0.8685890659); /* va, vb, vc: coefficients of z, z^3 and z^5 in the polynomial below */
4306 + vb = (vector float)VINIT4ALL(0.2894672153);
4307 + vc = (vector float)VINIT4ALL(0.1793365895);
4308 + vhalf = (vector float)VINIT4ALL(0.15051499783); /* log10(2)/2, i.e. log10(sqrt 2) */
4309 + vsqrt2 = (vector float)VINIT4ALL(1.4142135623731);
4310 + vconst4 = (vector float)VINIT4ALL(0.301029995664); /* log10(2) */
4311 + vzero = (vector float)vec_splat_s32(0); /* explicit zero instead of xor-ing an uninitialised vector with itself */
4312 + vconst1 = (vector unsigned int)vec_sr(vec_splat_s32(-1),vec_splat_u32(9)); /* 0x007FFFFF: mantissa mask */
4313 + vconst2 = (vector unsigned int)vec_sr(vec_splat_s32(-1),vec_splat_u32(7)); /* 0x01FFFFFF */
4314 + vconst2 = vec_nor(vconst2,vconst2); /* 0xFE000000 */
4315 + vconst3 = (vector signed int)vec_rl(vconst2,vec_splat_u32(7)); /* 0x0000007F = 127, the exponent bias */
4316 + vshamt = vec_add(vec_splat_u32(9),vec_splat_u32(7));
4317 + vshamt = vec_add(vshamt,vec_splat_u32(7)); /* 9+7+7 = 23, the mantissa width */
4318 + vconst2 = vec_sl((vector unsigned int)vconst3,vshamt); /* 127<<23 = 0x3F800000, the bit pattern of 1.0f */
4319 +
4320 + v1 = vec_ld(0,&x); /* load the quadword that contains x */
4321 + v2 = vec_perm(v1,v1,vec_lvsl(0,&x)); /* rotate x into element 0 */
4322 + v3 = vec_splat(v2,0);
4323 +
4324 + v4 = (vector float)vec_sel(vconst2,(vector unsigned int)v3,vconst1); /* m: mantissa bits of x under the exponent of 1.0f */
4325 + v5 = vec_add(v4,vsqrt2);
4326 + v6 = vec_sub(v4,vsqrt2);
4327 + v7 = vec_re(v5); /* reciprocal estimate of (m + sqrt2) */
4328 + vz = vec_madd(v6, vec_madd(vec_nmsub(v7,v5,(vector float)vconst2),v7,v7), vzero); /* one Newton-Raphson refinement of the reciprocal, then z = (m - sqrt2)/(m + sqrt2) */
4329 + v8 = (vector float)vec_sr((vector unsigned int)v3,vshamt); /* bits above the mantissa: the biased exponent when the sign bit is clear */
4330 + vlog = vec_ctf(vec_sub((vector signed int)v8,vconst3),0); /* e = exponent - 127, converted to float */
4331 +
4332 + vz2 = vec_madd(vz,vz,vzero);
4333 + vlog = vec_madd(vlog,vconst4,vhalf); /* e*log10(2) + log10(2)/2 */
4334 +
4335 + v1 = vec_madd(vz2,vc,vb);
4336 + v2 = vec_madd(vz2,v1,va);
4337 + vlog = vec_madd(vz,v2,vlog); /* add z*(va + z^2*(vb + z^2*vc)) */
4338 +
4339 + vec_ste(vlog,0,&out);
4340 +
4341 + return out;
4342 +}
4343 +
4344 +inline ieee754_float32_t fast_loge_altivec(ieee754_float32_t x)
4345 +{ /* Approximates the natural log of x as e*ln(2) + ln(2)/2 + z*(va + z^2*(vb + z^2*vc)), with e the unbiased exponent of x, m its mantissa rebuilt in [1,2) and z = (m-sqrt2)/(m+sqrt2). Assumes x is a positive normalised float; zero, negative, denormal, NaN and Inf inputs get no special handling. */
4346 + vector float va,vb,vc,vhalf,vzero,vsqrt2,vconst4;
4347 + vector float v1,v2,v3,v4,v5,v6,v7,v8,vz,vz2,vlog;
4348 + vector unsigned int vconst1,vconst2,vshamt;
4349 + vector signed int vconst3;
4350 + float out __attribute__ ((aligned (16))); /* 16-byte aligned so that vec_ste at &out writes vector element 0 */
4351 +
4352 + va = (vector float)VINIT4ALL(2.0000006209); /* va, vb, vc: coefficients of z, z^3 and z^5 in the polynomial below */
4353 + vb = (vector float)VINIT4ALL(0.6664778517);
4354 + vc = (vector float)VINIT4ALL(0.4139745860);
4355 + vhalf = (vector float)VINIT4ALL(0.34657359028); /* ln(2)/2, i.e. ln(sqrt 2) */
4356 + vsqrt2 = (vector float)VINIT4ALL(1.4142135623731);
4357 + vconst4 = (vector float)VINIT4ALL(0.6931471805599); /* ln(2) */
4358 + vzero = (vector float)vec_splat_s32(0); /* explicit zero instead of xor-ing an uninitialised vector with itself */
4359 + vconst1 = (vector unsigned int)vec_sr(vec_splat_s32(-1),vec_splat_u32(9)); /* 0x007FFFFF: mantissa mask */
4360 + vconst2 = (vector unsigned int)vec_sr(vec_splat_s32(-1),vec_splat_u32(7)); /* 0x01FFFFFF */
4361 + vconst2 = vec_nor(vconst2,vconst2); /* 0xFE000000 */
4362 + vconst3 = (vector signed int)vec_rl(vconst2,vec_splat_u32(7)); /* 0x0000007F = 127, the exponent bias */
4363 + vshamt = vec_add(vec_splat_u32(9),vec_splat_u32(7));
4364 + vshamt = vec_add(vshamt,vec_splat_u32(7)); /* 9+7+7 = 23, the mantissa width */
4365 + vconst2 = vec_sl((vector unsigned int)vconst3,vshamt); /* 127<<23 = 0x3F800000, the bit pattern of 1.0f */
4366 +
4367 + v1 = vec_ld(0,&x); /* load the quadword that contains x */
4368 + v2 = vec_perm(v1,v1,vec_lvsl(0,&x)); /* rotate x into element 0 */
4369 + v3 = vec_splat(v2,0);
4370 +
4371 + v4 = (vector float)vec_sel(vconst2,(vector unsigned int)v3,vconst1); /* m: mantissa bits of x under the exponent of 1.0f */
4372 + v5 = vec_add(v4,vsqrt2);
4373 + v6 = vec_sub(v4,vsqrt2);
4374 + v7 = vec_re(v5); /* reciprocal estimate of (m + sqrt2) */
4375 + vz = vec_madd(v6, vec_madd(vec_nmsub(v7,v5,(vector float)vconst2),v7,v7), vzero); /* one Newton-Raphson refinement of the reciprocal, then z = (m - sqrt2)/(m + sqrt2) */
4376 + v8 = (vector float)vec_sr((vector unsigned int)v3,vshamt); /* bits above the mantissa: the biased exponent when the sign bit is clear */
4377 + vlog = vec_ctf(vec_sub((vector signed int)v8,vconst3),0); /* e = exponent - 127, converted to float */
4378 +
4379 + vz2 = vec_madd(vz,vz,vzero);
4380 + vlog = vec_madd(vlog,vconst4,vhalf); /* e*ln(2) + ln(2)/2 */
4381 +
4382 + v1 = vec_madd(vz2,vc,vb);
4383 + v2 = vec_madd(vz2,v1,va);
4384 + vlog = vec_madd(vz,v2,vlog); /* add z*(va + z^2*(vb + z^2*vc)) */
4385 +
4386 + vec_ste(vlog,0,&out);
4387 +
4388 + return out;
4389 +}
4390 +
4391 +void
4392 +init_log_table(void)
4393 +{ /* Intentionally empty in the ALTIVEC && !ALTIVEC_970 build: fast_log10_altivec() and fast_loge_altivec() read no lookup table. The symbol is still defined because util.h declares init_log_table() outside the ALTIVEC conditional. */
4394 +}
4395 +
4396 +#else
4397
4398 #define LOG2_SIZE (512)
4399 #define LOG2_SIZE_L2 (9)
4400 @@ -978,6 +1086,8 @@
4401 return log2val;
4402 }
4403
4404 +#endif
4405 +
4406 #else /* Don't use FAST_LOG */
4407
4408
4409 --- libmp3lame/util.h.orig 2009-04-01 07:37:27.000000000 +0900
4410 +++ libmp3lame/util.h 2010-03-01 14:05:00.000000000 +0900
4411 @@ -92,10 +92,17 @@
4412
4413 /* log/log10 approximations */
4414 #ifdef USE_FAST_LOG
4415 +#if defined(ALTIVEC) && !defined(ALTIVEC_970)
4416 +#define FAST_LOG10(x) (fast_log10_altivec(x))
4417 +#define FAST_LOG(x) (fast_loge_altivec(x))
4418 +#define FAST_LOG10_X(x,y) (fast_log10_altivec(x)*(y))
4419 +#define FAST_LOG_X(x,y) (fast_loge_altivec(x)*(y))
4420 +#else
4421 #define FAST_LOG10(x) (fast_log2(x)*(LOG2/LOG10))
4422 #define FAST_LOG(x) (fast_log2(x)*LOG2)
4423 #define FAST_LOG10_X(x,y) (fast_log2(x)*(LOG2/LOG10*(y)))
4424 #define FAST_LOG_X(x,y) (fast_log2(x)*(LOG2*(y)))
4425 +#endif
4426 #else
4427 #define FAST_LOG10(x) log10(x)
4428 #define FAST_LOG(x) log(x)
4429 @@ -247,7 +254,7 @@
4430 #ifndef MFSIZE
4431 # define MFSIZE ( 3*1152 + ENCDELAY - MDCTDELAY )
4432 #endif
4433 - sample_t mfbuf[2][MFSIZE];
4434 + sample_t mfbuf[2][MFSIZE] __attribute__ ((aligned (16)));
4435
4436
4437 struct {
4438 @@ -325,7 +332,7 @@
4439 int OldValue[2];
4440 int CurrentStep[2];
4441
4442 - FLOAT masking_lower;
4443 + FLOAT masking_lower __attribute__ ((aligned (16)));
4444 char bv_scf[576];
4445 int pseudohalf[SFBMAX];
4446
4447 @@ -340,7 +347,7 @@
4448 int sideinfo_len;
4449
4450 /* variables for newmdct.c */
4451 - FLOAT sb_sample[2][2][18][SBLIMIT];
4452 + FLOAT sb_sample[2][2][18][SBLIMIT] __attribute__ ((aligned (16)));
4453 FLOAT amp_filter[32];
4454
4455 /* variables for bitstream.c */
4456 @@ -376,7 +383,7 @@
4457 /* the second index is the "age" of the data. */
4458 FLOAT minval_l[CBANDS];
4459 FLOAT minval_s[CBANDS];
4460 - FLOAT nb_1[4][CBANDS], nb_2[4][CBANDS];
4461 + FLOAT nb_1[4][CBANDS] __attribute__ ((aligned (16))), nb_2[4][CBANDS] __attribute__ ((aligned (16)));
4462 FLOAT nb_s1[4][CBANDS], nb_s2[4][CBANDS];
4463 FLOAT *s3_ss;
4464 FLOAT *s3_ll;
4465 @@ -398,9 +405,8 @@
4466 int bm_l[SBMAX_l], bo_l[SBMAX_l];
4467 int bm_s[SBMAX_s], bo_s[SBMAX_s];
4468 int npart_l, npart_s;
4469 -
4470 - int s3ind[CBANDS][2];
4471 - int s3ind_s[CBANDS][2];
4472 + int s3ind[CBANDS][4] __attribute__ ((aligned (16)));
4473 + int s3ind_s[CBANDS][4];
4474
4475 int numlines_s[CBANDS];
4476 int numlines_l[CBANDS];
4477 @@ -498,7 +504,12 @@
4478
4479 /* log/log10 approximations */
4480 extern void init_log_table(void);
4481 +#if defined(ALTIVEC) && !defined(ALTIVEC_970)
4482 + extern ieee754_float32_t fast_log10_altivec(ieee754_float32_t x);
4483 + extern ieee754_float32_t fast_loge_altivec(ieee754_float32_t x);
4484 +#else
4485 extern ieee754_float32_t fast_log2(ieee754_float32_t x);
4486 +#endif
4487
4488
4489 void fill_buffer(lame_global_flags const *gfp,
4490 --- libmp3lame/vbrquantize.c.orig 2008-08-05 23:16:07.000000000 +0900
4491 +++ libmp3lame/vbrquantize.c 2010-03-01 14:05:00.000000000 +0900
4492 @@ -27,6 +27,13 @@
4493 #endif
4494
4495
4496 +#ifdef ALTIVEC
4497 +#undef TAKEHIRO_IEEE754_HACK
4498 +#ifndef __APPLE_CC__
4499 +#include <altivec.h>
4500 +#endif
4501 +#endif
4502 +
4503 #include "lame.h"
4504 #include "machine.h"
4505 #include "encoder.h"
4506 @@ -248,11 +255,27 @@
4507 * for which holds: sfpow34*xr34 <= IXMAX_VAL
4508 */
4509
4510 +
4511 static FLOAT
4512 calc_sfb_noise_x34(const FLOAT * xr, const FLOAT * xr34, unsigned int bw, uint8_t sf)
4513 {
4514 +#ifdef ALTIVEC
4515 + float vpow[8] __attribute__ ((aligned (16)));
4516 + vector float v0, v1, v2, v3, v4, v5, v6,v7,v8,v9,v10,v11,v12,v13;
4517 + vector unsigned char vperm1, vperm2,vc1,vc2,vc3;
4518 + vector signed int vl1,vl2,vl3;
4519 + vector float vxfsf, vsfpow, vsfpow34, vabs, vzero;
4520 + unsigned int s1,s2,s3,s4,s5,s6,s7,s8;
4521 + const vector float const1 = (vector float)VINIT4(0.4053964553387788,3.404263724373839,5.465086767819913,1.0);
4522 + const vector float const2 = (vector float)VINIT4(7.719205369637751,10.93017829043677,0,0);
4523 +#ifdef ALTIVEC_970
4524 + vector unsigned int vmask1,vmask2,vmask3;
4525 +#else
4526 + vector unsigned char vperm3,vperm4,vc4,vc5,vc6,vmask;
4527 +#endif
4528 +#endif
4529 DOUBLEX x[4];
4530 - int l3[4];
4531 + int l3[4] __attribute__ ((aligned (16)));
4532 const FLOAT sfpow = pow20[sf + Q_MAX2]; /*pow(2.0,sf/4.0); */
4533 const FLOAT sfpow34 = ipow20[sf]; /*pow(sfpow,-3.0/4.0); */
4534
4535 @@ -260,6 +283,231 @@
4536 unsigned int j = bw >> 1;
4537 unsigned int const remaining = (j & 0x01u);
4538
4539 +#ifdef ALTIVEC
4540 + vpow[0] = sfpow;
4541 + vpow[1] = sfpow34;
4542 + vsfpow = vec_ld(0,vpow);
4543 + vxfsf = vec_xor(vxfsf,vxfsf);
4544 + vsfpow34 = vec_splat(vsfpow,1);
4545 + vsfpow = vec_splat(vsfpow,0);
4546 + vperm1 = vec_lvsl(0,xr);
4547 + vperm2 = vec_lvsl(0,xr34);
4548 + v0 = vec_ld(0,xr);
4549 + v1 = vec_ld(0,xr34);
4550 + vabs = (vector float)vec_splat_s32(-1);
4551 + vabs = (vector float)vec_sl((vector unsigned int)vabs, (vector unsigned int)vabs);
4552 + vzero = vec_xor(vzero,vzero);
4553 +#ifdef ALTIVEC_970
4554 + vc1 = vec_splat_u8(1);
4555 + vc2 = vec_splat_u8(5);
4556 + vc3 = vec_sl(vc1,vc2);
4557 + vmask1 = (vector unsigned int)vec_splat_s32(-1);
4558 + vmask1 = vec_sro(vmask1,vc3);
4559 + vmask2 = vec_sro(vmask1,vc3);
4560 + vmask3 = vec_sro(vmask2,vc3);
4561 +#else
4562 + vperm3 = (vector unsigned char)VINIT16(0,0,0,0,0,0,0,0,0,1,2,3,16,17,18,19);
4563 + vperm4 = vec_sld(vperm3,(vector unsigned char)vzero,8);
4564 + vmask = (vector unsigned char)VINIT16ALL(16);
4565 +#endif
4566 + for (j >>= 1; j > 1; j -= 2) {
4567 +
4568 + v2 = vec_ld(16,xr34);
4569 + v3 = vec_ld(32,xr34);
4570 + v4 = vec_perm(v1,v2,vperm2);
4571 + v5 = vec_perm(v2,v3,vperm2);
4572 + v12 = vec_madd(v4,vsfpow34,vzero);
4573 + v13 = vec_madd(v5,vsfpow34,vzero);
4574 + v1 = v3;
4575 +
4576 + v2 = vec_floor(v12);
4577 + v3 = vec_floor(v13);
4578 + v4 = vec_splat(const1,2);
4579 + v5 = vec_splat(const1,1);
4580 + v6 = vec_splat(const2,1);
4581 + v7 = vec_splat(const2,0);
4582 + v8 = vec_madd(v2,v4,v5);
4583 + v9 = vec_madd(v3,v4,v5);
4584 + v10 = vec_madd(v2,v6,v7);
4585 + v11 = vec_madd(v3,v6,v7);
4586 + v4 = vec_splat(const1,0);
4587 + v5 = vec_splat(const1,3);
4588 + v8 = vec_madd(v8,v2,v4);
4589 + v9 = vec_madd(v9,v3,v4);
4590 + v10 = vec_madd(v10,v2,v5);
4591 + v11 = vec_madd(v11,v3,v5);
4592 + v6 = vec_re(v10);
4593 + v7 = vec_re(v11);
4594 + v10 = vec_nmsub(v10,v6,v5);
4595 + v11 = vec_nmsub(v11,v7,v5);
4596 + v10 = vec_madd(v10,v6,v6);
4597 + v11 = vec_madd(v11,v7,v7);
4598 + v10 = vec_madd(v8,v10,v12);
4599 + v11 = vec_madd(v9,v11,v13);
4600 +
4601 + vl1 = vec_cts(v10,0);
4602 + vl2 = vec_cts(v11,0);
4603 + vl3 = (vector signed int)vec_pack(vl1,vl2);
4604 + vec_st(vl3,0,l3);
4605 +
4606 + s1 = l3[0] >> 16;
4607 + s2 = l3[0] & 0xffff;
4608 + s3 = l3[1] >> 16;
4609 + s4 = l3[1] & 0xffff;
4610 + s5 = l3[2] >> 16;
4611 + s6 = l3[2] & 0xffff;
4612 + s7 = l3[3] >> 16;
4613 + s8 = l3[3] & 0xffff;
4614 +
4615 +#ifdef ALTIVEC_970
4616 + v2 = vec_lde(0,pow43+s1);
4617 + v3 = vec_lde(0,pow43+s2);
4618 + v4 = vec_lde(0,pow43+s3);
4619 + v5 = vec_lde(0,pow43+s4);
4620 + v2 = vec_perm(v2,v2,vec_lvsl(0,pow43+s1));
4621 + v3 = vec_perm(v3,v3,vec_lvsl(-4,pow43+s2));
4622 + v4 = vec_perm(v4,v4,vec_lvsl(-8,pow43+s3));
4623 + v5 = vec_perm(v5,v5,vec_lvsl(-12,pow43+s4));
4624 + v12 = vec_sel(v2,v3,vmask1);
4625 + v12 = vec_sel(v12,v4,vmask2);
4626 + v12 = vec_sel(v12,v5,vmask3);
4627 +
4628 + v2 = vec_lde(0,pow43+s5);
4629 + v3 = vec_lde(0,pow43+s6);
4630 + v4 = vec_lde(0,pow43+s7);
4631 + v5 = vec_lde(0,pow43+s8);
4632 + v2 = vec_perm(v2,v2,vec_lvsl(0,pow43+s5));
4633 + v3 = vec_perm(v3,v3,vec_lvsl(-4,pow43+s6));
4634 + v4 = vec_perm(v4,v4,vec_lvsl(-8,pow43+s7));
4635 + v5 = vec_perm(v5,v5,vec_lvsl(-12,pow43+s8));
4636 + v13 = vec_sel(v2,v3,vmask1);
4637 + v13 = vec_sel(v13,v4,vmask2);
4638 + v13 = vec_sel(v13,v5,vmask3);
4639 +#else
4640 + vc1 = vec_lvsl(0,pow43+s1);
4641 + vc2 = vec_lvsl(0,pow43+s2);
4642 + vc3 = vec_lvsl(0,pow43+s3);
4643 + vc4 = vec_lvsl(0,pow43+s4);
4644 + vc2 = vec_or(vc2,vmask);
4645 + vc4 = vec_or(vc4,vmask);
4646 + v2 = vec_lde(0,pow43+s1);
4647 + v3 = vec_lde(0,pow43+s2);
4648 + v4 = vec_lde(0,pow43+s3);
4649 + v5 = vec_lde(0,pow43+s4);
4650 + vc5 = vec_perm(vc1,vc2,vperm3);
4651 + vc6 = vec_perm(vc3,vc4,vperm4);
4652 + v6 = vec_perm(v2,v3,vc5);
4653 + v7 = vec_perm(v4,v5,vc6);
4654 + v12 = vec_sld(v6,v7,8);
4655 +
4656 + vc1 = vec_lvsl(0,pow43+s5);
4657 + vc2 = vec_lvsl(0,pow43+s6);
4658 + vc3 = vec_lvsl(0,pow43+s7);
4659 + vc4 = vec_lvsl(0,pow43+s8);
4660 + vc2 = vec_or(vc2,vmask);
4661 + vc4 = vec_or(vc4,vmask);
4662 + v2 = vec_lde(0,pow43+s5);
4663 + v3 = vec_lde(0,pow43+s6);
4664 + v4 = vec_lde(0,pow43+s7);
4665 + v5 = vec_lde(0,pow43+s8);
4666 + vc5 = vec_perm(vc1,vc2,vperm3);
4667 + vc6 = vec_perm(vc3,vc4,vperm4);
4668 + v6 = vec_perm(v2,v3,vc5);
4669 + v7 = vec_perm(v4,v5,vc6);
4670 + v13 = vec_sld(v6,v7,8);
4671 +#endif
4672 +
4673 + v2 = vec_ld(16, xr);
4674 + v3 = vec_ld(32, xr);
4675 + v6 = vec_perm(v0,v2,vperm1);
4676 + v7 = vec_perm(v2,v3,vperm1);
4677 + v0 = v3;
4678 + v8 = vec_andc(v6,vabs);
4679 + v9 = vec_andc(v7,vabs);
4680 + v10 = vec_nmsub(vsfpow, v12, v8);
4681 + v11 = vec_nmsub(vsfpow, v13, v9);
4682 + vxfsf = vec_madd(v10, v10, vxfsf);
4683 + vxfsf = vec_madd(v11, v11, vxfsf);
4684 +
4685 + xr += 8;
4686 + xr34 += 8;
4687 + }
4688 + if (j) {
4689 +#ifdef ALTIVEC_970
4690 + x[0] = sfpow34 * xr34[0];
4691 + x[1] = sfpow34 * xr34[1];
4692 + x[2] = sfpow34 * xr34[2];
4693 + x[3] = sfpow34 * xr34[3];
4694 +
4695 + k_34_4(x, l3);
4696 +
4697 + vpow[0] = pow43[l3[0]];
4698 + vpow[1] = pow43[l3[1]];
4699 + vpow[2] = pow43[l3[2]];
4700 + vpow[3] = pow43[l3[3]];
4701 + v1 = vec_ld(0, vpow);
4702 + v2 = vec_ld(16, xr);
4703 + v3 = vec_perm(v0,v2,vperm1);
4704 + v4 = vec_andc(v3,vabs);
4705 + v5 = vec_nmsub(vsfpow, v1, v4);
4706 + vxfsf = vec_madd(v5, v5, vxfsf);
4707 +#else
4708 + v2 = vec_ld(16,xr34);
4709 + v3 = vec_perm(v1,v2,vperm2);
4710 + v4 = vec_madd(v3,vsfpow34,vzero);
4711 + vl1 = vec_cts(v4,0);
4712 + vec_st(vl1,0,l3);
4713 +
4714 + v5 = vec_lde(0,adj43+l3[0]);
4715 + v6 = vec_lde(0,adj43+l3[1]);
4716 + v7 = vec_lde(0,adj43+l3[2]);
4717 + v8 = vec_lde(0,adj43+l3[3]);
4718 + v9 = vec_perm(v5,v5,vec_lvsl(0,adj43+l3[0]));
4719 + v10 = vec_perm(v6,v6,vec_lvsl(-4,adj43+l3[1]));
4720 + v11 = vec_perm(v7,v7,vec_lvsl(-8,adj43+l3[2]));
4721 + v12 = vec_perm(v8,v8,vec_lvsl(-12,adj43+l3[3]));
4722 + v9 = vec_or(v9,v10);
4723 + v9 = vec_or(v9,v11);
4724 + v9 = vec_or(v9,v12);
4725 +
4726 + v10 = vec_add(v4,v9);
4727 + vl1 = vec_cts(v10,0);
4728 + vec_st(vl1,0,l3);
4729 +
4730 + v2 = vec_lde(0,pow43+l3[0]);
4731 + v3 = vec_lde(0,pow43+l3[1]);
4732 + v4 = vec_lde(0,pow43+l3[2]);
4733 + v5 = vec_lde(0,pow43+l3[3]);
4734 + v6 = vec_perm(v2,v2,vec_lvsl(0,pow43+l3[0]));
4735 + v7 = vec_perm(v3,v3,vec_lvsl(-4,pow43+l3[1]));
4736 + v8 = vec_perm(v4,v4,vec_lvsl(-8,pow43+l3[2]));
4737 + v9 = vec_perm(v5,v5,vec_lvsl(-12,pow43+l3[3]));
4738 + v6 = vec_or(v6,v7);
4739 + v6 = vec_or(v6,v8);
4740 + v6 = vec_or(v6,v9);
4741 +
4742 + v2 = vec_ld(16, xr);
4743 + v3 = vec_perm(v0,v2,vperm1);
4744 + v4 = vec_andc(v3,vabs);
4745 + v5 = vec_nmsub(vsfpow, v6, v4);
4746 + vxfsf = vec_madd(v5, v5, vxfsf);
4747 +#endif
4748 + xr += 4;
4749 + xr34 += 4;
4750 + }
4751 + if (remaining) {
4752 + x[0] = sfpow34 * xr34[0];
4753 + x[1] = sfpow34 * xr34[1];
4754 +
4755 + k_34_2(x, l3);
4756 +
4757 + x[0] = fabs(xr[0]) - sfpow * pow43[l3[0]];
4758 + x[1] = fabs(xr[1]) - sfpow * pow43[l3[1]];
4759 + xfsf += x[0] * x[0] + x[1] * x[1];
4760 + }
4761 + vec_st(vxfsf,0,vpow);
4762 + return xfsf + vpow[0] + vpow[1] + vpow[2] + vpow[3];
4763 +#else
4764 for (j >>= 1; j > 0; --j) {
4765 x[0] = sfpow34 * xr34[0];
4766 x[1] = sfpow34 * xr34[1];
4767 @@ -288,6 +536,7 @@
4768 xfsf += x[0] * x[0] + x[1] * x[1];
4769 }
4770 return xfsf;
4771 +#endif
4772 }
4773
4774