/*
 * VC-1 and WMV3 - DSP functions Loongson MMI-optimized
 *
 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/avassert.h"
#include "libavutil/mem_internal.h"

#include "libavcodec/vc1dsp.h"
#include "constants.h"
#include "vc1dsp_mips.h"
#include "hpeldsp_mips.h"
#include "libavutil/mips/mmiutils.h"

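/*
 * Shared body of the 8-point inverse transform. Each of r1..r4 packs two
 * 16-bit transform coefficients into one 32-bit immediate (e.g. 0x0010000c
 * packs 16 and 12); li/mtc1/punpcklwd broadcasts the pair, pmaddhw
 * multiply-accumulates it against the input rows held in ftmp5..ftmp12, and
 * a butterfly (paddw/psubw) followed by psraw with the shift count in ftmp0
 * produces the interleaved outputs o1/o2. c0 is the rounding constant added
 * before the shift.
 */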
#define VC1_INV_TRANCS_8_TYPE1(o1, o2, r1, r2, r3, r4, c0) \
    "li %[tmp0], "#r1" \n\t" \
    "mtc1 %[tmp0], %[ftmp13] \n\t" \
    "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \
    "li %[tmp0], "#r2" \n\t" \
    "mtc1 %[tmp0], %[ftmp14] \n\t" \
    "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \
    "pmaddhw %[ftmp1], %[ftmp5], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp2], %[ftmp7], %[ftmp14] \n\t" \
    "paddw %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
    "pmaddhw %[ftmp2], %[ftmp6], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp3], %[ftmp8], %[ftmp14] \n\t" \
    "paddw %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
    \
    "li %[tmp0], "#r3" \n\t" \
    "mtc1 %[tmp0], %[ftmp13] \n\t" \
    "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \
    "li %[tmp0], "#r4" \n\t" \
    "mtc1 %[tmp0], %[ftmp14] \n\t" \
    "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \
    "pmaddhw %[ftmp3], %[ftmp9], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp4], %[ftmp11], %[ftmp14] \n\t" \
    "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "pmaddhw %[ftmp4], %[ftmp10], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp13], %[ftmp12], %[ftmp14] \n\t" \
    "paddw %[ftmp4], %[ftmp4], %[ftmp13] \n\t" \
    \
    "paddw %[ftmp1], %[ftmp1], "#c0" \n\t" \
    "paddw %[ftmp2], %[ftmp2], "#c0" \n\t" \
    "paddw %[ftmp13], %[ftmp1], %[ftmp3] \n\t" \
    "psubw %[ftmp14], %[ftmp1], %[ftmp3] \n\t" \
    "paddw %[ftmp1], %[ftmp2], %[ftmp4] \n\t" \
    "psubw %[ftmp3], %[ftmp2], %[ftmp4] \n\t" \
    "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t" \
    "psraw %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
    "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t" \
    "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
    "punpcklhw %[ftmp2], %[ftmp13], %[ftmp1] \n\t" \
    "punpckhhw %[ftmp4], %[ftmp13], %[ftmp1] \n\t" \
    "punpcklhw "#o1", %[ftmp2], %[ftmp4] \n\t" \
    "punpcklhw %[ftmp2], %[ftmp14], %[ftmp3] \n\t" \
    "punpckhhw %[ftmp4], %[ftmp14], %[ftmp3] \n\t" \
    "punpcklhw "#o2", %[ftmp2], %[ftmp4] \n\t"

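/*
 * Column-pass variant of the helper above: same dataflow, with c1 as the
 * extra +1 bias that the second pass adds to the difference half of the
 * butterfly before rounding with c0 and shifting.
 */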
#define VC1_INV_TRANCS_8_TYPE2(o1, o2, r1, r2, r3, r4, c0, c1) \
    "li %[tmp0], "#r1" \n\t" \
    "mtc1 %[tmp0], %[ftmp13] \n\t" \
    "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \
    "li %[tmp0], "#r2" \n\t" \
    "mtc1 %[tmp0], %[ftmp14] \n\t" \
    "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \
    "pmaddhw %[ftmp1], %[ftmp5], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp2], %[ftmp7], %[ftmp14] \n\t" \
    "paddw %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
    "pmaddhw %[ftmp2], %[ftmp6], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp3], %[ftmp8], %[ftmp14] \n\t" \
    "paddw %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
    \
    "li %[tmp0], "#r3" \n\t" \
    "mtc1 %[tmp0], %[ftmp13] \n\t" \
    "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \
    "li %[tmp0], "#r4" \n\t" \
    "mtc1 %[tmp0], %[ftmp14] \n\t" \
    "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \
    "pmaddhw %[ftmp3], %[ftmp9], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp4], %[ftmp11], %[ftmp14] \n\t" \
    "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "pmaddhw %[ftmp4], %[ftmp10], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp13], %[ftmp12], %[ftmp14] \n\t" \
    "paddw %[ftmp4], %[ftmp4], %[ftmp13] \n\t" \
    \
    "paddw %[ftmp13], %[ftmp1], %[ftmp3] \n\t" \
    "psubw %[ftmp14], %[ftmp1], %[ftmp3] \n\t" \
    "paddw %[ftmp14], %[ftmp14], "#c1" \n\t" \
    "paddw %[ftmp1], %[ftmp2], %[ftmp4] \n\t" \
    "psubw %[ftmp3], %[ftmp2], %[ftmp4] \n\t" \
    "paddw %[ftmp3], %[ftmp3], "#c1" \n\t" \
    "paddw %[ftmp13], %[ftmp13], "#c0" \n\t" \
    "paddw %[ftmp14], %[ftmp14], "#c0" \n\t" \
    "paddw %[ftmp1], %[ftmp1], "#c0" \n\t" \
    "paddw %[ftmp3], %[ftmp3], "#c0" \n\t" \
    "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t" \
    "psraw %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
    "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t" \
    "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
    "punpcklhw %[ftmp2], %[ftmp13], %[ftmp1] \n\t" \
    "punpckhhw %[ftmp4], %[ftmp13], %[ftmp1] \n\t" \
    "punpcklhw "#o1", %[ftmp2], %[ftmp4] \n\t" \
    "punpcklhw %[ftmp2], %[ftmp14], %[ftmp3] \n\t" \
    "punpckhhw %[ftmp4], %[ftmp14], %[ftmp3] \n\t" \
    "punpcklhw "#o2", %[ftmp2], %[ftmp4] \n\t"

/* Do inverse transform on 8x8 block */
void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];
    mips_reg addr[1];
    int count;

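    /* Both passes collapse onto the DC term: the row pass computes
     * (12 * dc + 4) >> 3 == (3 * dc + 1) >> 1 and the column pass
     * (12 * dc + 64) >> 7 == (3 * dc + 16) >> 5. */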
    dc = (3 * dc + 1) >> 1;
    dc = (3 * dc + 16) >> 5;

    __asm__ volatile(
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "pshufh %[dc], %[dc], %[ftmp0] \n\t"
        "li %[count], 0x02 \n\t"

        "1: \n\t"
        MMI_LDC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[addr0], %[dest], %[linesize] \n\t"
        MMI_LDC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
        MMI_LDC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
        MMI_LDC1(%[ftmp4], %[addr0], 0x00)

        "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t"
        "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t"
        "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t"
        "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t"
        "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t"
        "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t"
        "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t"
        "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp8] \n\t"

        MMI_SDC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[addr0], %[dest], %[linesize] \n\t"
        MMI_SDC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
        MMI_SDC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
        MMI_SDC1(%[ftmp4], %[addr0], 0x00)

        "addiu %[count], %[count], -0x01 \n\t"
        PTR_ADDU "%[dest], %[addr0], %[linesize] \n\t"
        "bnez %[count], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [addr0]"=&r"(addr[0]),
          [count]"=&r"(count), [dest]"+&r"(dest)
        : [linesize]"r"((mips_reg)linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
{
    DECLARE_ALIGNED(16, int16_t, temp[64]);
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_1_local) = {0x0000000100000001ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
    double ftmp[23];
    uint64_t tmp[1];

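    /*
     * Two passes over the block, each split into two 8x4 halves: the row
     * pass rounds with ff_pw_4 and shifts by 3 (TYPE1), the column pass
     * rounds with ff_pw_64 plus the ff_pw_1 bias and shifts by 7 (TYPE2),
     * with 4x4 tiles transposed in between.
     */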
    __asm__ volatile (
        /* 1st loop: start */
        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"

        // 1st part
        MMI_LDC1(%[ftmp1], %[block], 0x00)
        MMI_LDC1(%[ftmp11], %[block], 0x10)
        MMI_LDC1(%[ftmp2], %[block], 0x20)
        MMI_LDC1(%[ftmp12], %[block], 0x30)
        MMI_LDC1(%[ftmp3], %[block], 0x40)
        MMI_LDC1(%[ftmp13], %[block], 0x50)
        MMI_LDC1(%[ftmp4], %[block], 0x60)
        MMI_LDC1(%[ftmp14], %[block], 0x70)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
        "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
        "punpcklhw %[ftmp11], %[ftmp13], %[ftmp14] \n\t"
        "punpckhhw %[ftmp12], %[ftmp13], %[ftmp14] \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_4])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_4])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_4])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_4])

        TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])

        TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])

        MMI_SDC1(%[ftmp15], %[temp], 0x00)
        MMI_SDC1(%[ftmp19], %[temp], 0x08)
        MMI_SDC1(%[ftmp16], %[temp], 0x10)
        MMI_SDC1(%[ftmp20], %[temp], 0x18)
        MMI_SDC1(%[ftmp17], %[temp], 0x20)
        MMI_SDC1(%[ftmp21], %[temp], 0x28)
        MMI_SDC1(%[ftmp18], %[temp], 0x30)
        MMI_SDC1(%[ftmp22], %[temp], 0x38)

        // 2nd part
        MMI_LDC1(%[ftmp1], %[block], 0x08)
        MMI_LDC1(%[ftmp11], %[block], 0x18)
        MMI_LDC1(%[ftmp2], %[block], 0x28)
        MMI_LDC1(%[ftmp12], %[block], 0x38)
        MMI_LDC1(%[ftmp3], %[block], 0x48)
        MMI_LDC1(%[ftmp13], %[block], 0x58)
        MMI_LDC1(%[ftmp4], %[block], 0x68)
        MMI_LDC1(%[ftmp14], %[block], 0x78)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
        "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
        "punpcklhw %[ftmp11], %[ftmp13], %[ftmp14] \n\t"
        "punpckhhw %[ftmp12], %[ftmp13], %[ftmp14] \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_4])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_4])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_4])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_4])

        TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])

        TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])

        MMI_SDC1(%[ftmp19], %[temp], 0x48)
        MMI_SDC1(%[ftmp20], %[temp], 0x58)
        MMI_SDC1(%[ftmp21], %[temp], 0x68)
        MMI_SDC1(%[ftmp22], %[temp], 0x78)
        /* 1st loop: end */

        /* 2nd loop: start */
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"

        // 1st part
        MMI_LDC1(%[ftmp1], %[temp], 0x00)
        MMI_LDC1(%[ftmp11], %[temp], 0x10)
        MMI_LDC1(%[ftmp2], %[temp], 0x20)
        MMI_LDC1(%[ftmp12], %[temp], 0x30)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp15], %[ftmp17] \n\t"
        "punpckhhw %[ftmp8], %[ftmp15], %[ftmp17] \n\t"

        "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
        "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
        "punpcklhw %[ftmp11], %[ftmp16], %[ftmp18] \n\t"
        "punpckhhw %[ftmp12], %[ftmp16], %[ftmp18] \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])

        MMI_SDC1(%[ftmp15], %[block], 0x00)
        MMI_SDC1(%[ftmp16], %[block], 0x10)
        MMI_SDC1(%[ftmp17], %[block], 0x20)
        MMI_SDC1(%[ftmp18], %[block], 0x30)
        MMI_SDC1(%[ftmp19], %[block], 0x40)
        MMI_SDC1(%[ftmp20], %[block], 0x50)
        MMI_SDC1(%[ftmp21], %[block], 0x60)
        MMI_SDC1(%[ftmp22], %[block], 0x70)

        // 2nd part
        MMI_LDC1(%[ftmp1], %[temp], 0x08)
        MMI_LDC1(%[ftmp11], %[temp], 0x18)
        MMI_LDC1(%[ftmp2], %[temp], 0x28)
        MMI_LDC1(%[ftmp12], %[temp], 0x38)
        MMI_LDC1(%[ftmp3], %[temp], 0x48)
        MMI_LDC1(%[ftmp13], %[temp], 0x58)
        MMI_LDC1(%[ftmp4], %[temp], 0x68)
        MMI_LDC1(%[ftmp14], %[temp], 0x78)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
        "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
        "punpcklhw %[ftmp11], %[ftmp13], %[ftmp14] \n\t"
        "punpckhhw %[ftmp12], %[ftmp13], %[ftmp14] \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])

        MMI_SDC1(%[ftmp15], %[block], 0x08)
        MMI_SDC1(%[ftmp16], %[block], 0x18)
        MMI_SDC1(%[ftmp17], %[block], 0x28)
        MMI_SDC1(%[ftmp18], %[block], 0x38)
        MMI_SDC1(%[ftmp19], %[block], 0x48)
        MMI_SDC1(%[ftmp20], %[block], 0x58)
        MMI_SDC1(%[ftmp21], %[block], 0x68)
        MMI_SDC1(%[ftmp22], %[block], 0x78)
        /* 2nd loop: end */
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
          [ftmp16]"=&f"(ftmp[16]), [ftmp17]"=&f"(ftmp[17]),
          [ftmp18]"=&f"(ftmp[18]), [ftmp19]"=&f"(ftmp[19]),
          [ftmp20]"=&f"(ftmp[20]), [ftmp21]"=&f"(ftmp[21]),
          [ftmp22]"=&f"(ftmp[22]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_1]"f"(ff_pw_1_local), [ff_pw_64]"f"(ff_pw_64_local),
          [ff_pw_4]"f"(ff_pw_4_local), [block]"r"(block),
          [temp]"r"(temp)
        : "memory"
    );
}
#endif

/* Do inverse transform on 8x4 part of block */
void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];

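    /* Row pass (8-point): (12 * dc + 4) >> 3 == (3 * dc + 1) >> 1;
     * column pass (4-point): (17 * dc + 64) >> 7. */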
    dc = (3 * dc + 1) >> 1;
    dc = (17 * dc + 64) >> 7;

    __asm__ volatile(
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "pshufh %[dc], %[dc], %[ftmp0] \n\t"

        MMI_LDC1(%[ftmp1], %[dest0], 0x00)
        MMI_LDC1(%[ftmp2], %[dest1], 0x00)
        MMI_LDC1(%[ftmp3], %[dest2], 0x00)
        MMI_LDC1(%[ftmp4], %[dest3], 0x00)

        "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t"
        "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t"
        "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t"
        "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t"
        "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t"
        "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t"
        "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t"
        "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp8] \n\t"

        MMI_SDC1(%[ftmp1], %[dest0], 0x00)
        MMI_SDC1(%[ftmp2], %[dest1], 0x00)
        MMI_SDC1(%[ftmp3], %[dest2], 0x00)
        MMI_SDC1(%[ftmp4], %[dest3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8])
        : [dest0]"r"(dest+0*linesize), [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize), [dest3]"r"(dest+3*linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[16];
    uint32_t tmp[1];
    int16_t count = 4;
    DECLARE_ALIGNED(16, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
    DECLARE_ALIGNED(16, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
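    /* The 8-point inverse transform matrix, one row per output coefficient;
     * pmaddhw consumes it four 16-bit entries at a time. */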
    int16_t coeff[64] = {12,  16,  16,  15,  12,   9,   6,   4,
                         12,  15,   6,  -4, -12, -16, -16,  -9,
                         12,   9,  -6, -16, -12,   4,  16,  15,
                         12,   4, -16,  -9,  12,  15,  -6, -16,
                         12,  -4, -16,   9,  12, -15,  -6,  16,
                         12,  -9,  -6,  16, -12,  -4,  16, -15,
                         12, -15,   6,   4, -12,  16, -16,   9,
                         12, -16,  16, -15,  12,  -9,   6,  -4};

    // 1st loop
    __asm__ volatile (
        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"

        "1: \n\t"
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x08)

        /* ftmp11: dst1,dst0 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x00)
        MMI_LDC1(%[ftmp4], %[coeff], 0x08)
        MMI_LDC1(%[ftmp5], %[coeff], 0x10)
        MMI_LDC1(%[ftmp6], %[coeff], 0x18)
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t"
        "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
        "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t"
        "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
        "paddw %[ftmp11], %[ftmp7], %[ftmp8] \n\t"
        "paddw %[ftmp11], %[ftmp11], %[ff_pw_4] \n\t"

        /* ftmp12: dst3,dst2 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x20)
        MMI_LDC1(%[ftmp4], %[coeff], 0x28)
        MMI_LDC1(%[ftmp5], %[coeff], 0x30)
        MMI_LDC1(%[ftmp6], %[coeff], 0x38)
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t"
        "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
        "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t"
        "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
        "paddw %[ftmp12], %[ftmp7], %[ftmp8] \n\t"
        "paddw %[ftmp12], %[ftmp12], %[ff_pw_4] \n\t"

        /* ftmp13: dst5,dst4 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x40)
        MMI_LDC1(%[ftmp4], %[coeff], 0x48)
        MMI_LDC1(%[ftmp5], %[coeff], 0x50)
        MMI_LDC1(%[ftmp6], %[coeff], 0x58)
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t"
        "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
        "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t"
        "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
        "paddw %[ftmp13], %[ftmp7], %[ftmp8] \n\t"
        "paddw %[ftmp13], %[ftmp13], %[ff_pw_4] \n\t"

        /* ftmp14: dst7,dst6 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x60)
        MMI_LDC1(%[ftmp4], %[coeff], 0x68)
        MMI_LDC1(%[ftmp5], %[coeff], 0x70)
        MMI_LDC1(%[ftmp6], %[coeff], 0x78)
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t"
        "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
        "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t"
        "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
        "paddw %[ftmp14], %[ftmp7], %[ftmp8] \n\t"
        "paddw %[ftmp14], %[ftmp14], %[ff_pw_4] \n\t"

        /* ftmp9: dst3,dst2,dst1,dst0 ftmp10: dst7,dst6,dst5,dst4 */
        "psraw %[ftmp11], %[ftmp11], %[ftmp0] \n\t"
        "psraw %[ftmp12], %[ftmp12], %[ftmp0] \n\t"
        "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t"
        "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t"
        "punpcklhw %[ftmp7], %[ftmp11], %[ftmp12] \n\t"
        "punpckhhw %[ftmp8], %[ftmp11], %[ftmp12] \n\t"
        "punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
        "punpcklhw %[ftmp7], %[ftmp13], %[ftmp14] \n\t"
        "punpckhhw %[ftmp8], %[ftmp13], %[ftmp14] \n\t"
        "punpcklhw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
        MMI_SDC1(%[ftmp9], %[dst], 0x00)
        MMI_SDC1(%[ftmp10], %[dst], 0x08)

        PTR_ADDIU "%[src], %[src], 0x10 \n\t"
        PTR_ADDIU "%[dst], %[dst], 0x10 \n\t"
        "addiu %[count], %[count], -0x01 \n\t"
        "bnez %[count], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [tmp0]"=&r"(tmp[0]),
          [src]"+&r"(src), [dst]"+&r"(dst), [count]"+&r"(count)
        : [ff_pw_4]"f"(ff_pw_4_local), [coeff]"r"(coeff)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li %[tmp0], 0x44 \n\t"
        "mtc1 %[tmp0], %[ftmp15] \n\t"

        // 1st part
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x10)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x30)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

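        /* Packed 4-point column coefficients: 0x0011 = 17, 0x0016 = 22,
         * 0x000a = 10, 0xffef = -17, 0xffea = -22, 0xfff6 = -10; pshufh
         * with control 0x44 splats each 32-bit pair across the register. */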
        /* ftmp11: dst03,dst02,dst01,dst00 */
        "li %[tmp0], 0x00160011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp12: dst13,dst12,dst11,dst10 */
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xffeaffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp13: dst23,dst22,dst21,dst20 */
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x0016ffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp14: dst33,dst32,dst31,dst30 */
        "li %[tmp0], 0xffea0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t"

        MMI_LWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t"
        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)

        // 2nd part
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"
        MMI_LDC1(%[ftmp1], %[src], 0x08)
        MMI_LDC1(%[ftmp2], %[src], 0x18)
        MMI_LDC1(%[ftmp3], %[src], 0x28)
        MMI_LDC1(%[ftmp4], %[src], 0x38)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        /* ftmp11: dst03,dst02,dst01,dst00 */
        "li %[tmp0], 0x00160011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp12: dst13,dst12,dst11,dst10 */
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xffeaffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp13: dst23,dst22,dst21,dst20 */
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x0016ffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp14: dst33,dst32,dst31,dst30 */
        "li %[tmp0], 0xffea0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t"

        MMI_LWC1(%[ftmp1], %[dest], 0x04)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x04)
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t"
        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        MMI_SWC1(%[ftmp1], %[dest], 0x04)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x04)

        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_64]"f"(ff_pw_64_local),
          [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
        : "memory"
    );
}
#endif

/* Do inverse transform on 4x8 parts of block */
void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];
    DECLARE_VAR_LOW32;

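    /* Row pass (4-point): (17 * dc + 4) >> 3;
     * column pass (8-point): (12 * dc + 64) >> 7. */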
    dc = (17 * dc + 4) >> 3;
    dc = (12 * dc + 64) >> 7;

    __asm__ volatile(
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "pshufh %[dc], %[dc], %[ftmp0] \n\t"

        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
        MMI_LWC1(%[ftmp4], %[dest3], 0x00)
        MMI_LWC1(%[ftmp5], %[dest4], 0x00)
        MMI_LWC1(%[ftmp6], %[dest5], 0x00)
        MMI_LWC1(%[ftmp7], %[dest6], 0x00)
        MMI_LWC1(%[ftmp8], %[dest7], 0x00)

        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
        "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
        "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"

        "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t"
        "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t"
        "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t"
        "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t"
        "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t"
        "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t"
        "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t"
        "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "packushb %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
        "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
        "packushb %[ftmp8], %[ftmp8], %[ftmp0] \n\t"

        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
        MMI_SWC1(%[ftmp5], %[dest4], 0x00)
        MMI_SWC1(%[ftmp6], %[dest5], 0x00)
        MMI_SWC1(%[ftmp7], %[dest6], 0x00)
        MMI_SWC1(%[ftmp8], %[dest7], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          RESTRICT_ASM_LOW32
          [ftmp8]"=&f"(ftmp[8])
        : [dest0]"r"(dest+0*linesize), [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize), [dest3]"r"(dest+3*linesize),
          [dest4]"r"(dest+4*linesize), [dest5]"r"(dest+5*linesize),
          [dest6]"r"(dest+6*linesize), [dest7]"r"(dest+7*linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[23];
    uint32_t count = 8, tmp[1];
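    /* The 4-point inverse transform matrix, one row per output;
     * pmaddhw consumes a whole row per multiply. */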
    int16_t coeff[16] = {17,  22,  17,  10,
                         17,  10, -17, -22,
                         17, -10, -17,  22,
                         17, -22,  17, -10};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_1_local) = {0x0000000100000001ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};

    // 1st loop
    __asm__ volatile (

        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"

        MMI_LDC1(%[ftmp2], %[coeff], 0x00)
        MMI_LDC1(%[ftmp3], %[coeff], 0x08)
        MMI_LDC1(%[ftmp4], %[coeff], 0x10)
        MMI_LDC1(%[ftmp5], %[coeff], 0x18)
        "1: \n\t"
        /* ftmp8: dst3,dst2,dst1,dst0 */
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        "pmaddhw %[ftmp6], %[ftmp2], %[ftmp1] \n\t"
        "pmaddhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t"
        "pmaddhw %[ftmp8], %[ftmp4], %[ftmp1] \n\t"
        "pmaddhw %[ftmp9], %[ftmp5], %[ftmp1] \n\t"
        "punpcklwd %[ftmp10], %[ftmp6], %[ftmp7] \n\t"
        "punpckhwd %[ftmp11], %[ftmp6], %[ftmp7] \n\t"
        "punpcklwd %[ftmp6], %[ftmp8], %[ftmp9] \n\t"
        "punpckhwd %[ftmp7], %[ftmp8], %[ftmp9] \n\t"
        "paddw %[ftmp8], %[ftmp10], %[ftmp11] \n\t"
        "paddw %[ftmp9], %[ftmp6], %[ftmp7] \n\t"
        "paddw %[ftmp8], %[ftmp8], %[ff_pw_4] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_4] \n\t"
        "psraw %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "punpcklhw %[ftmp6], %[ftmp8], %[ftmp9] \n\t"
        "punpckhhw %[ftmp7], %[ftmp8], %[ftmp9] \n\t"
        "punpcklhw %[ftmp8], %[ftmp6], %[ftmp7] \n\t"
        MMI_SDC1(%[ftmp8], %[dst], 0x00)

        PTR_ADDIU "%[src], %[src], 0x10 \n\t"
        PTR_ADDIU "%[dst], %[dst], 0x10 \n\t"
        "addiu %[count], %[count], -0x01 \n\t"
        "bnez %[count], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [tmp0]"=&r"(tmp[0]), [count]"+&r"(count),
          [src]"+&r"(src), [dst]"+&r"(dst)
        : [ff_pw_4]"f"(ff_pw_4_local), [coeff]"r"(coeff)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x20)
        MMI_LDC1(%[ftmp3], %[src], 0x40)
        MMI_LDC1(%[ftmp4], %[src], 0x60)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        MMI_LDC1(%[ftmp1], %[src], 0x10)
        MMI_LDC1(%[ftmp2], %[src], 0x30)
        MMI_LDC1(%[ftmp3], %[src], 0x50)
        MMI_LDC1(%[ftmp4], %[src], 0x70)
        "punpcklhw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp11], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp12], %[ftmp3], %[ftmp4] \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])

        MMI_LWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp5], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp6], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp7], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp8], %[tmp0], 0x00)
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
        "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
        "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"

        "paddh %[ftmp1], %[ftmp1], %[ftmp15] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ftmp16] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp17] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ftmp18] \n\t"
        "paddh %[ftmp5], %[ftmp5], %[ftmp19] \n\t"
        "paddh %[ftmp6], %[ftmp6], %[ftmp20] \n\t"
        "paddh %[ftmp7], %[ftmp7], %[ftmp21] \n\t"
        "paddh %[ftmp8], %[ftmp8], %[ftmp22] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "packushb %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
        "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
        "packushb %[ftmp8], %[ftmp8], %[ftmp0] \n\t"

        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp5], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp6], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp7], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp8], %[tmp0], 0x00)

        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
          [ftmp16]"=&f"(ftmp[16]), [ftmp17]"=&f"(ftmp[17]),
          [ftmp18]"=&f"(ftmp[18]), [ftmp19]"=&f"(ftmp[19]),
          [ftmp20]"=&f"(ftmp[20]), [ftmp21]"=&f"(ftmp[21]),
          [ftmp22]"=&f"(ftmp[22]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_1]"f"(ff_pw_1_local), [ff_pw_64]"f"(ff_pw_64_local),
          [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
        : "memory"
    );
}
#endif

/* Do inverse transform on 4x4 part of block */
void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[5];
    DECLARE_VAR_LOW32;

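    /* Both passes are 4-point here: row pass (17 * dc + 4) >> 3,
     * column pass (17 * dc + 64) >> 7. */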
    dc = (17 * dc + 4) >> 3;
    dc = (17 * dc + 64) >> 7;

    __asm__ volatile(
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "pshufh %[dc], %[dc], %[ftmp0] \n\t"

        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
        MMI_LWC1(%[ftmp4], %[dest3], 0x00)

        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t"
        "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t"
        "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t"
        "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_LOW32
          [ftmp4]"=&f"(ftmp[4])
        : [dest0]"r"(dest+0*linesize), [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize), [dest3]"r"(dest+3*linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[16];
    uint32_t count = 4, tmp[1];
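    /* Same 4-point matrix as in ff_vc1_inv_trans_4x8_mmi above; the second
     * pass rebuilds the same coefficients inline with li/pshufh. */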
    int16_t coeff[16] = {17,  22,  17,  10,
                         17,  10, -17, -22,
                         17, -10, -17,  22,
                         17, -22,  17, -10};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
    // 1st loop
    __asm__ volatile (

        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"
        MMI_LDC1(%[ftmp2], %[coeff], 0x00)
        MMI_LDC1(%[ftmp3], %[coeff], 0x08)
        MMI_LDC1(%[ftmp4], %[coeff], 0x10)
        MMI_LDC1(%[ftmp5], %[coeff], 0x18)
        "1: \n\t"
        /* ftmp8: dst3,dst2,dst1,dst0 */
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        "pmaddhw %[ftmp6], %[ftmp2], %[ftmp1] \n\t"
        "pmaddhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t"
        "pmaddhw %[ftmp8], %[ftmp4], %[ftmp1] \n\t"
        "pmaddhw %[ftmp9], %[ftmp5], %[ftmp1] \n\t"
        "punpcklwd %[ftmp10], %[ftmp6], %[ftmp7] \n\t"
        "punpckhwd %[ftmp11], %[ftmp6], %[ftmp7] \n\t"
        "punpcklwd %[ftmp6], %[ftmp8], %[ftmp9] \n\t"
        "punpckhwd %[ftmp7], %[ftmp8], %[ftmp9] \n\t"
        "paddw %[ftmp8], %[ftmp10], %[ftmp11] \n\t"
        "paddw %[ftmp9], %[ftmp6], %[ftmp7] \n\t"
        "paddw %[ftmp8], %[ftmp8], %[ff_pw_4] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_4] \n\t"
        "psraw %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "punpcklhw %[ftmp6], %[ftmp8], %[ftmp9] \n\t"
        "punpckhhw %[ftmp7], %[ftmp8], %[ftmp9] \n\t"
        "punpcklhw %[ftmp8], %[ftmp6], %[ftmp7] \n\t"
        MMI_SDC1(%[ftmp8], %[dst], 0x00)

        PTR_ADDIU "%[src], %[src], 0x10 \n\t"
        PTR_ADDIU "%[dst], %[dst], 0x10 \n\t"
        "addiu %[count], %[count], -0x01 \n\t"
        "bnez %[count], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [tmp0]"=&r"(tmp[0]), [count]"+&r"(count),
          [src]"+&r"(src), [dst]"+&r"(dst)
        : [ff_pw_4]"f"(ff_pw_4_local), [coeff]"r"(coeff)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x44 \n\t"
        "mtc1 %[tmp0], %[ftmp15] \n\t"

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x10)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x30)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        /* ftmp11: dst03,dst02,dst01,dst00 */
        "li %[tmp0], 0x00160011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp12: dst13,dst12,dst11,dst10 */
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xffeaffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp13: dst23,dst22,dst21,dst20 */
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x0016ffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp14: dst33,dst32,dst31,dst30 */
        "li %[tmp0], 0xffea0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t"

        MMI_LWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t"
        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)

        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_64]"f"(ff_pw_64_local),
          [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
        : "memory"
    );
}

1379 /* Apply overlap transform to horizontal edge */
1380 void ff_vc1_h_overlap_mmi(uint8_t *src, int stride)
1381 {
1382  int i;
1383  int a, b, c, d;
1384  int d1, d2;
1385  int rnd = 1;
1386  for (i = 0; i < 8; i++) {
1387  a = src[-2];
1388  b = src[-1];
1389  c = src[0];
1390  d = src[1];
1391  d1 = (a - d + 3 + rnd) >> 3;
1392  d2 = (a - d + b - c + 4 - rnd) >> 3;
1393 
1394  src[-2] = a - d1;
1395  src[-1] = av_clip_uint8(b - d2);
1396  src[0] = av_clip_uint8(c + d2);
1397  src[1] = d + d1;
1398  src += stride;
1399  rnd = !rnd;
1400  }
1401 }
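/*
 * Note: rnd toggles between 1 and 0 on every row, so the rounding
 * constants (3 + rnd for d1, 4 - rnd for d2) alternate between (4, 3)
 * and (3, 4), avoiding a systematic rounding bias across the edge.
 */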
1402 
1403 void ff_vc1_h_s_overlap_mmi(int16_t *left, int16_t *right, int left_stride, int right_stride, int flags)
1404 {
1405  int i;
1406  int a, b, c, d;
1407  int d1, d2;
1408  int rnd1 = flags & 2 ? 3 : 4;
1409  int rnd2 = 7 - rnd1;
1410  for (i = 0; i < 8; i++) {
1411  a = left[6];
1412  b = left[7];
1413  c = right[0];
1414  d = right[1];
1415  d1 = a - d;
1416  d2 = a - d + b - c;
1417 
1418  left[6] = ((a << 3) - d1 + rnd1) >> 3;
1419  left[7] = ((b << 3) - d2 + rnd2) >> 3;
1420  right[0] = ((c << 3) + d2 + rnd1) >> 3;
1421  right[1] = ((d << 3) + d1 + rnd2) >> 3;
1422 
1423  right += right_stride;
1424  left += left_stride;
1425  if (flags & 1) {
1426  rnd2 = 7 - rnd2;
1427  rnd1 = 7 - rnd1;
1428  }
1429  }
1430 }
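/*
 * Note on flags: bit 1 (flags & 2) selects the starting rounder pair
 * (3/4 when set, 4/3 when clear) and bit 0 (flags & 1) makes the pair
 * swap after every row, the sliced counterpart of the rnd = !rnd
 * toggle used by the plain overlap filters.
 */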
1431 
1432 /* Apply overlap transform to horizontal edge */
1433 void ff_vc1_v_overlap_mmi(uint8_t *src, int stride)
1434 {
1435  int i;
1436  int a, b, c, d;
1437  int d1, d2;
1438  int rnd = 1;
1439  for (i = 0; i < 8; i++) {
1440  a = src[-2 * stride];
1441  b = src[-stride];
1442  c = src[0];
1443  d = src[stride];
1444  d1 = (a - d + 3 + rnd) >> 3;
1445  d2 = (a - d + b - c + 4 - rnd) >> 3;
1446 
1447  src[-2 * stride] = a - d1;
1448  src[-stride] = av_clip_uint8(b - d2);
1449  src[0] = av_clip_uint8(c + d2);
1450  src[stride] = d + d1;
1451  src++;
1452  rnd = !rnd;
1453  }
1454 }
1455 
1456 void ff_vc1_v_s_overlap_mmi(int16_t *top, int16_t *bottom)
1457 {
1458  int i;
1459  int a, b, c, d;
1460  int d1, d2;
1461  int rnd1 = 4, rnd2 = 3;
1462  for (i = 0; i < 8; i++) {
1463  a = top[48];
1464  b = top[56];
1465  c = bottom[0];
1466  d = bottom[8];
1467  d1 = a - d;
1468  d2 = a - d + b - c;
1469 
1470  top[48] = ((a << 3) - d1 + rnd1) >> 3;
1471  top[56] = ((b << 3) - d2 + rnd2) >> 3;
1472  bottom[0] = ((c << 3) + d2 + rnd1) >> 3;
1473  bottom[8] = ((d << 3) + d1 + rnd2) >> 3;
1474 
1475  bottom++;
1476  top++;
1477  rnd2 = 7 - rnd2;
1478  rnd1 = 7 - rnd1;
1479  }
1480 }
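/*
 * The fixed indices address 8x8 blocks of int16_t with a row stride
 * of 8: top[48]/top[56] are rows 6 and 7 of the upper block and
 * bottom[0]/bottom[8] rows 0 and 1 of the lower one, i.e. the four
 * rows straddling the smoothed edge.
 */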
1481 
1482 /**
1483  * VC-1 in-loop deblocking filter for one line
1484  * @param src pointer to the first pixel past the filtered edge
1485  * @param stride block stride
1486  * @param pq block quantizer
1487  * @return whether other 3 pairs should be filtered or not
1488  * @see 8.6
1489  */
1490 static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq)
1491 {
1492  int a0 = (2 * (src[-2 * stride] - src[1 * stride]) -
1493  5 * (src[-1 * stride] - src[0 * stride]) + 4) >> 3;
1494  int a0_sign = a0 >> 31; /* Store sign */
1495 
1496  a0 = (a0 ^ a0_sign) - a0_sign; /* a0 = FFABS(a0); */
1497  if (a0 < pq) {
1498  int a1 = FFABS((2 * (src[-4 * stride] - src[-1 * stride]) -
1499  5 * (src[-3 * stride] - src[-2 * stride]) + 4) >> 3);
1500  int a2 = FFABS((2 * (src[ 0 * stride] - src[ 3 * stride]) -
1501  5 * (src[ 1 * stride] - src[ 2 * stride]) + 4) >> 3);
1502  if (a1 < a0 || a2 < a0) {
1503  int clip = src[-1 * stride] - src[0 * stride];
1504  int clip_sign = clip >> 31;
1505 
1506  clip = ((clip ^ clip_sign) - clip_sign) >> 1;
1507  if (clip) {
1508  int a3 = FFMIN(a1, a2);
1509  int d = 5 * (a3 - a0);
1510  int d_sign = (d >> 31);
1511 
1512  d = ((d ^ d_sign) - d_sign) >> 3;
1513  d_sign ^= a0_sign;
1514 
1515  if (d_sign ^ clip_sign)
1516  d = 0;
1517  else {
1518  d = FFMIN(d, clip);
1519  d = (d ^ d_sign) - d_sign; /* Restore sign */
1520  src[-1 * stride] = av_clip_uint8(src[-1 * stride] - d);
1521  src[ 0 * stride] = av_clip_uint8(src[ 0 * stride] + d);
1522  }
1523  return 1;
1524  }
1525  }
1526  }
1527  return 0;
1528 }
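/*
 * The sign trick above is the classic branchless absolute value: for a
 * two's-complement int with arithmetic right shift, (x ^ (x >> 31)) -
 * (x >> 31) == FFABS(x). A minimal standalone sketch (the helper name
 * is illustrative, not part of this file):
 */
static inline int branchless_abs(int x)
{
    int s = x >> 31;     /* 0 when x >= 0, -1 (all ones) when x < 0 */
    return (x ^ s) - s;  /* for negatives: flip all bits, then add 1 */
}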
1529 
1530 /**
1531  * VC-1 in-loop deblocking filter
1532  * @param src pointer to the source pixels
1533  * @param step distance between horizontally adjacent elements
1534  * @param stride distance between vertically adjacent elements
1535  * @param len edge length to filter (4 or 8 pixels)
1536  * @param pq block quantizer
1537  * @see 8.6
1538  */
1539 static inline void vc1_loop_filter(uint8_t *src, int step, int stride,
1540  int len, int pq)
1541 {
1542  int i;
1543  int filt3;
1544 
1545  for (i = 0; i < len; i += 4) {
1546  filt3 = vc1_filter_line(src + 2 * step, stride, pq);
1547  if (filt3) {
1548  vc1_filter_line(src + 0 * step, stride, pq);
1549  vc1_filter_line(src + 1 * step, stride, pq);
1550  vc1_filter_line(src + 3 * step, stride, pq);
1551  }
1552  src += step * 4;
1553  }
1554 }
1555 
1556 void ff_vc1_v_loop_filter4_mmi(uint8_t *src, int stride, int pq)
1557 {
1558  vc1_loop_filter(src, 1, stride, 4, pq);
1559 }
1560 
1561 void ff_vc1_h_loop_filter4_mmi(uint8_t *src, int stride, int pq)
1562 {
1563  vc1_loop_filter(src, stride, 1, 4, pq);
1564 }
1565 
1566 void ff_vc1_v_loop_filter8_mmi(uint8_t *src, int stride, int pq)
1567 {
1568  vc1_loop_filter(src, 1, stride, 8, pq);
1569 }
1570 
1571 void ff_vc1_h_loop_filter8_mmi(uint8_t *src, int stride, int pq)
1572 {
1573  vc1_loop_filter(src, stride, 1, 8, pq);
1574 }
1575 
1576 void ff_vc1_v_loop_filter16_mmi(uint8_t *src, int stride, int pq)
1577 {
1578  vc1_loop_filter(src, 1, stride, 16, pq);
1579 }
1580 
1581 void ff_vc1_h_loop_filter16_mmi(uint8_t *src, int stride, int pq)
1582 {
1583  vc1_loop_filter(src, stride, 1, 16, pq);
1584 }
1585 
1586 void ff_put_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
1587  ptrdiff_t stride, int rnd)
1588 {
1589  ff_put_pixels8_8_mmi(dst, src, stride, 8);
1590 }
1591 void ff_put_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
1592  ptrdiff_t stride, int rnd)
1593 {
1594  ff_put_pixels16_8_mmi(dst, src, stride, 16);
1595 }
1596 void ff_avg_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
1597  ptrdiff_t stride, int rnd)
1598 {
1599  ff_avg_pixels8_8_mmi(dst, src, stride, 8);
1600 }
1601 void ff_avg_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
1602  ptrdiff_t stride, int rnd)
1603 {
1604  ff_avg_pixels16_8_mmi(dst, src, stride, 16);
1605 }
1606 
1607 #define OP_PUT(S, D)
1608 #define OP_AVG(S, D) \
1609  "ldc1 $f16, "#S" \n\t" \
1610  "pavgb "#D", "#D", $f16 \n\t"
1611 
1612 /** Add the rounder in $f14 to $f6/$f8 and shift both into final range */
1613 #define NORMALIZE_MMI(SHIFT) \
1614  "paddh $f6, $f6, $f14 \n\t" /* +bias-r */ \
1615  "paddh $f8, $f8, $f14 \n\t" /* +bias-r */ \
1616  "psrah $f6, $f6, "SHIFT" \n\t" \
1617  "psrah $f8, $f8, "SHIFT" \n\t"
1618 
1619 #define TRANSFER_DO_PACK(OP) \
1620  "packushb $f6, $f6, $f8 \n\t" \
1621  OP((%[dst]), $f6) \
1622  "sdc1 $f6, 0x00(%[dst]) \n\t"
1623 
1624 #define TRANSFER_DONT_PACK(OP) \
1625  OP(0(%[dst]), $f6) \
1626  OP(8(%[dst]), $f8) \
1627  "sdc1 $f6, 0x00(%[dst]) \n\t" \
1628  "sdc1 $f8, 0x08(%[dst]) \n\t"
1629 
1630 /** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
1631 #define DO_UNPACK(reg) \
1632  "punpcklbh "reg", "reg", $f0 \n\t"
1633 #define DONT_UNPACK(reg)
1634 
1635 /** Compute the rounder 32-r or 8-r and unpack it to $f14 */
1636 #define LOAD_ROUNDER_MMI(ROUND) \
1637  "lwc1 $f14, "ROUND" \n\t" \
1638  "punpcklhw $f14, $f14, $f14 \n\t" \
1639  "punpcklwd $f14, $f14, $f14 \n\t"
1640 
1641 
1642 #define SHIFT2_LINE(OFF, R0, R1, R2, R3) \
1643  "paddh "#R1", "#R1", "#R2" \n\t" \
1644  PTR_ADDU "$9, %[src], %[stride1] \n\t" \
1645  MMI_ULWC1(R0, $9, 0x00) \
1646  "pmullh "#R1", "#R1", $f6 \n\t" \
1647  "punpcklbh "#R0", "#R0", $f0 \n\t" \
1648  PTR_ADDU "$9, %[src], %[stride] \n\t" \
1649  MMI_ULWC1(R3, $9, 0x00) \
1650  "psubh "#R1", "#R1", "#R0" \n\t" \
1651  "punpcklbh "#R3", "#R3", $f0 \n\t" \
1652  "paddh "#R1", "#R1", $f14 \n\t" \
1653  "psubh "#R1", "#R1", "#R3" \n\t" \
1654  "psrah "#R1", "#R1", %[shift] \n\t" \
1655  MMI_SDC1(R1, %[dst], OFF) \
1656  PTR_ADDU "%[src], %[src], %[stride] \n\t"
1657 
1658 /** Sacrificing $f12 makes it possible to pipeline loads from src */
1659 static void vc1_put_ver_16b_shift2_mmi(int16_t *dst,
1660  const uint8_t *src, mips_reg stride,
1661  int rnd, int64_t shift)
1662 {
1663  DECLARE_VAR_LOW32;
1664  DECLARE_VAR_ADDRT;
1665 
1666  __asm__ volatile(
1667  "xor $f0, $f0, $f0 \n\t"
1668  "li $8, 0x03 \n\t"
1669  LOAD_ROUNDER_MMI("%[rnd]")
1670  "ldc1 $f12, %[ff_pw_9] \n\t"
1671  "1: \n\t"
1672  MMI_ULWC1($f4, %[src], 0x00)
1673  PTR_ADDU "%[src], %[src], %[stride] \n\t"
1674  MMI_ULWC1($f6, %[src], 0x00)
1675  "punpcklbh $f4, $f4, $f0 \n\t"
1676  "punpcklbh $f6, $f6, $f0 \n\t"
1677  SHIFT2_LINE( 0, $f2, $f4, $f6, $f8)
1678  SHIFT2_LINE( 24, $f4, $f6, $f8, $f2)
1679  SHIFT2_LINE( 48, $f6, $f8, $f2, $f4)
1680  SHIFT2_LINE( 72, $f8, $f2, $f4, $f6)
1681  SHIFT2_LINE( 96, $f2, $f4, $f6, $f8)
1682  SHIFT2_LINE(120, $f4, $f6, $f8, $f2)
1683  SHIFT2_LINE(144, $f6, $f8, $f2, $f4)
1684  SHIFT2_LINE(168, $f8, $f2, $f4, $f6)
1685  PTR_SUBU "%[src], %[src], %[stride2] \n\t"
1686  PTR_ADDIU "%[dst], %[dst], 0x08 \n\t"
1687  "addiu $8, $8, -0x01 \n\t"
1688  "bnez $8, 1b \n\t"
1689  : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT
1690  [src]"+r"(src), [dst]"+r"(dst)
1691  : [stride]"r"(stride), [stride1]"r"(-2*stride),
1692  [shift]"f"(shift), [rnd]"m"(rnd),
1693  [stride2]"r"(9*stride-4), [ff_pw_9]"m"(ff_pw_9)
1694  : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
1695  "$f14", "$f16", "memory"
1696  );
1697 }
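/*
 * The loop above runs three times over 4-pixel-wide column groups, so
 * one call emits an 8-row x 12-column block of 16-bit intermediates
 * (row stride 24 bytes); %[stride2] (9*stride - 4) rewinds the nine
 * source rows consumed and steps 4 pixels right for the next group.
 */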
1698 
1699 /**
1700  * The data is already unpacked, so some operations can be done directly
1701  * from memory.
1702  */
1703 #define VC1_HOR_16B_SHIFT2(OP, OPNAME) \
1704 static void OPNAME ## vc1_hor_16b_shift2_mmi(uint8_t *dst, mips_reg stride, \
1705  const int16_t *src, int rnd) \
1706 { \
1707  int h = 8; \
1708  DECLARE_VAR_ALL64; \
1709  DECLARE_VAR_ADDRT; \
1710  \
1711  src -= 1; \
1712  rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */ \
1713  \
1714  __asm__ volatile( \
1715  LOAD_ROUNDER_MMI("%[rnd]") \
1716  "ldc1 $f12, %[ff_pw_128] \n\t" \
1717  "ldc1 $f10, %[ff_pw_9] \n\t" \
1718  "1: \n\t" \
1719  MMI_ULDC1($f2, %[src], 0x00) \
1720  MMI_ULDC1($f4, %[src], 0x08) \
1721  MMI_ULDC1($f6, %[src], 0x02) \
1722  MMI_ULDC1($f8, %[src], 0x0a) \
1723  MMI_ULDC1($f0, %[src], 0x06) \
1724  "paddh $f2, $f2, $f0 \n\t" \
1725  MMI_ULDC1($f0, %[src], 0x0e) \
1726  "paddh $f4, $f4, $f0 \n\t" \
1727  MMI_ULDC1($f0, %[src], 0x04) \
1728  "paddh $f6, $f6, $f0 \n\t" \
1729  MMI_ULDC1($f0, %[src], 0x0b) \
1730  "paddh $f8, $f8, $f0 \n\t" \
1731  "pmullh $f6, $f6, $f10 \n\t" \
1732  "pmullh $f8, $f8, $f10 \n\t" \
1733  "psubh $f6, $f6, $f2 \n\t" \
1734  "psubh $f8, $f8, $f4 \n\t" \
1735  "li $8, 0x07 \n\t" \
1736  "mtc1 $8, $f16 \n\t" \
1737  NORMALIZE_MMI("$f16") \
1738  /* Remove bias */ \
1739  "paddh $f6, $f6, $f12 \n\t" \
1740  "paddh $f8, $f8, $f12 \n\t" \
1741  TRANSFER_DO_PACK(OP) \
1742  "addiu %[h], %[h], -0x01 \n\t" \
1743  PTR_ADDIU "%[src], %[src], 0x18 \n\t" \
1744  PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1745  "bnez %[h], 1b \n\t" \
1746  : RESTRICT_ASM_ALL64 RESTRICT_ASM_ADDRT \
1747  [h]"+r"(h), \
1748  [src]"+r"(src), [dst]"+r"(dst) \
1749  : [stride]"r"(stride), [rnd]"m"(rnd), \
1750  [ff_pw_9]"m"(ff_pw_9), [ff_pw_128]"m"(ff_pw_128) \
1751  : "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", \
1752  "$f16", "memory" \
1753  ); \
1754 }
1755 
1756 VC1_HOR_16B_SHIFT2(OP_PUT, put_)
1757 VC1_HOR_16B_SHIFT2(OP_AVG, avg_)
1758 
1759 /**
1760  * Purely vertical or horizontal 1/2 shift interpolation.
1761  * Sacrifice $f12 for the *9 factor.
1762  */
1763 #define VC1_SHIFT2(OP, OPNAME)\
1764 static void OPNAME ## vc1_shift2_mmi(uint8_t *dst, const uint8_t *src, \
1765  mips_reg stride, int rnd, \
1766  mips_reg offset) \
1767 { \
1768  DECLARE_VAR_LOW32; \
1769  DECLARE_VAR_ADDRT; \
1770  \
1771  rnd = 8 - rnd; \
1772  \
1773  __asm__ volatile( \
1774  "xor $f0, $f0, $f0 \n\t" \
1775  "li $10, 0x08 \n\t" \
1776  LOAD_ROUNDER_MMI("%[rnd]") \
1777  "ldc1 $f12, %[ff_pw_9] \n\t" \
1778  "1: \n\t" \
1779  MMI_ULWC1($f6, %[src], 0x00) \
1780  MMI_ULWC1($f8, %[src], 0x04) \
1781  PTR_ADDU "$9, %[src], %[offset] \n\t" \
1782  MMI_ULWC1($f2, $9, 0x00) \
1783  MMI_ULWC1($f4, $9, 0x04) \
1784  PTR_ADDU "%[src], %[src], %[offset] \n\t" \
1785  "punpcklbh $f6, $f6, $f0 \n\t" \
1786  "punpcklbh $f8, $f8, $f0 \n\t" \
1787  "punpcklbh $f2, $f2, $f0 \n\t" \
1788  "punpcklbh $f4, $f4, $f0 \n\t" \
1789  "paddh $f6, $f6, $f2 \n\t" \
1790  "paddh $f8, $f8, $f4 \n\t" \
1791  PTR_ADDU "$9, %[src], %[offset_x2n] \n\t" \
1792  MMI_ULWC1($f2, $9, 0x00) \
1793  MMI_ULWC1($f4, $9, 0x04) \
1794  "pmullh $f6, $f6, $f12 \n\t" /* 0,9,9,0*/ \
1795  "pmullh $f8, $f8, $f12 \n\t" /* 0,9,9,0*/ \
1796  "punpcklbh $f2, $f2, $f0 \n\t" \
1797  "punpcklbh $f4, $f4, $f0 \n\t" \
1798  "psubh $f6, $f6, $f2 \n\t" /*-1,9,9,0*/ \
1799  "psubh $f8, $f8, $f4 \n\t" /*-1,9,9,0*/ \
1800  PTR_ADDU "$9, %[src], %[offset] \n\t" \
1801  MMI_ULWC1($f2, $9, 0x00) \
1802  MMI_ULWC1($f4, $9, 0x04) \
1803  "punpcklbh $f2, $f2, $f0 \n\t" \
1804  "punpcklbh $f4, $f4, $f0 \n\t" \
1805  "psubh $f6, $f6, $f2 \n\t" /*-1,9,9,-1*/ \
1806  "psubh $f8, $f8, $f4 \n\t" /*-1,9,9,-1*/ \
1807  "li $8, 0x04 \n\t" \
1808  "mtc1 $8, $f16 \n\t" \
1809  NORMALIZE_MMI("$f16") \
1810  "packushb $f6, $f6, $f8 \n\t" \
1811  OP((%[dst]), $f6) \
1812  "sdc1 $f6, 0x00(%[dst]) \n\t" \
1813  "addiu $10, $10, -0x01 \n\t" \
1814  PTR_ADDU "%[src], %[src], %[stride1] \n\t" \
1815  PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1816  "bnez $10, 1b \n\t" \
1817  : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
1818  [src]"+r"(src), [dst]"+r"(dst) \
1819  : [offset]"r"(offset), [offset_x2n]"r"(-2*offset), \
1820  [stride]"r"(stride), [rnd]"m"(rnd), \
1821  [stride1]"r"(stride-offset), \
1822  [ff_pw_9]"m"(ff_pw_9) \
1823  : "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", \
1824  "$f12", "$f14", "$f16", "memory" \
1825  ); \
1826 }
1827 
1828 VC1_SHIFT2(OP_PUT, put_)
1829 VC1_SHIFT2(OP_AVG, avg_)
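/*
 * Scalar model of the two instantiations above (a sketch; the helper
 * name is illustrative): both apply the VC-1 (-1, 9, 9, -1)/16 kernel
 * along `offset`, with rounder 8 - rnd, matching the -1,9,9,-1
 * annotations in the asm.
 */
static inline int vc1_shift2_scalar(const uint8_t *p, int off, int rnd)
{
    int v = 9 * (p[0] + p[off]) - p[-off] - p[2 * off];
    return av_clip_uint8((v + 8 - rnd) >> 4);
}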
1830 
1831 /**
1832  * Core of the 1/4 and 3/4 shift bicubic interpolation.
1833  *
1834  * @param UNPACK Macro unpacking arguments from 8 to 16bits (can be empty).
1835  * @param LOAD "MMI_ULWC1" for packed 8-bit input, "MMI_ULDC1" if the data read is already unpacked.
1836  * @param M Offset scale: "1" for MMI_ULWC1, "2" for MMI_ULDC1.
1837  * @param A1 Stride address of 1st tap (beware of unpacked/packed).
1838  * @param A2 Stride address of 2nd tap
1839  * @param A3 Stride address of 3rd tap
1840  * @param A4 Stride address of 4th tap
1841  */
1842 #define MSPEL_FILTER13_CORE(UNPACK, LOAD, M, A1, A2, A3, A4) \
1843  PTR_ADDU "$9, %[src], "#A1" \n\t" \
1844  LOAD($f2, $9, M*0) \
1845  LOAD($f4, $9, M*4) \
1846  UNPACK("$f2") \
1847  UNPACK("$f4") \
1848  "pmullh $f2, $f2, %[ff_pw_3] \n\t" \
1849  "pmullh $f4, $f4, %[ff_pw_3] \n\t" \
1850  PTR_ADDU "$9, %[src], "#A2" \n\t" \
1851  LOAD($f6, $9, M*0) \
1852  LOAD($f8, $9, M*4) \
1853  UNPACK("$f6") \
1854  UNPACK("$f8") \
1855  "pmullh $f6, $f6, $f12 \n\t" /* *18 */ \
1856  "pmullh $f8, $f8, $f12 \n\t" /* *18 */ \
1857  "psubh $f6, $f6, $f2 \n\t" /* *18, -3 */ \
1858  "psubh $f8, $f8, $f4 \n\t" /* *18, -3 */ \
1859  PTR_ADDU "$9, %[src], "#A4" \n\t" \
1860  LOAD($f2, $9, M*0) \
1861  LOAD($f4, $9, M*4) \
1862  UNPACK("$f2") \
1863  UNPACK("$f4") \
1864  "li $8, 0x02 \n\t" \
1865  "mtc1 $8, $f16 \n\t" \
1866  "psllh $f2, $f2, $f16 \n\t" /* 4* */ \
1867  "psllh $f4, $f4, $f16 \n\t" /* 4* */ \
1868  "psubh $f6, $f6, $f2 \n\t" /* -4,18,-3 */ \
1869  "psubh $f8, $f8, $f4 \n\t" /* -4,18,-3 */ \
1870  PTR_ADDU "$9, %[src], "#A3" \n\t" \
1871  LOAD($f2, $9, M*0) \
1872  LOAD($f4, $9, M*4) \
1873  UNPACK("$f2") \
1874  UNPACK("$f4") \
1875  "pmullh $f2, $f2, $f10 \n\t" /* *53 */ \
1876  "pmullh $f4, $f4, $f10 \n\t" /* *53 */ \
1877  "paddh $f6, $f6, $f2 \n\t" /* 4,53,18,-3 */ \
1878  "paddh $f8, $f8, $f4 \n\t" /* 4,53,18,-3 */
1879 
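/*
 * Scalar model of the core above (a sketch; the helper name is
 * illustrative): per pixel it accumulates the VC-1 bicubic kernel
 * -3*A1 + 18*A2 + 53*A3 - 4*A4, leaving the rounding shift to the
 * caller (6 bits in the 8-bit path, 7 in the horizontal 16-bit path).
 */
static inline int mspel_filter13_scalar(int t1, int t2, int t3, int t4)
{
    return -3 * t1 + 18 * t2 + 53 * t3 - 4 * t4;
}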
1880 /**
1881  * Macro to build the vertical 16bits version of vc1_put_shift[13].
1882  * Here, offset=src_stride. Parameters passed A1 to A4 must use
1883  * %[stride_x1] (src_stride), %[stride_x2] (2*src_stride) and %[stride_x3] (3*src_stride).
1884  *
1885  * @param NAME Either 1 or 3
1886  * @see MSPEL_FILTER13_CORE for information on A1->A4
1887  */
1888 #define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
1889 static void \
1890 vc1_put_ver_16b_ ## NAME ## _mmi(int16_t *dst, const uint8_t *src, \
1891  mips_reg src_stride, \
1892  int rnd, int64_t shift) \
1893 { \
1894  int h = 8; \
1895  DECLARE_VAR_LOW32; \
1896  DECLARE_VAR_ADDRT; \
1897  \
1898  src -= src_stride; \
1899  \
1900  __asm__ volatile( \
1901  "xor $f0, $f0, $f0 \n\t" \
1902  LOAD_ROUNDER_MMI("%[rnd]") \
1903  "ldc1 $f10, %[ff_pw_53] \n\t" \
1904  "ldc1 $f12, %[ff_pw_18] \n\t" \
1905  ".p2align 3 \n\t" \
1906  "1: \n\t" \
1907  MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4) \
1908  NORMALIZE_MMI("%[shift]") \
1909  TRANSFER_DONT_PACK(OP_PUT) \
1910  /* Last 3 (in fact 4) bytes on the line */ \
1911  PTR_ADDU "$9, %[src], "#A1" \n\t" \
1912  MMI_ULWC1($f2, $9, 0x08) \
1913  DO_UNPACK("$f2") \
1914  "mov.d $f6, $f2 \n\t" \
1915  "paddh $f2, $f2, $f2 \n\t" \
1916  "paddh $f2, $f2, $f6 \n\t" /* 3* */ \
1917  PTR_ADDU "$9, %[src], "#A2" \n\t" \
1918  MMI_ULWC1($f6, $9, 0x08) \
1919  DO_UNPACK("$f6") \
1920  "pmullh $f6, $f6, $f12 \n\t" /* *18 */ \
1921  "psubh $f6, $f6, $f2 \n\t" /* *18,-3 */ \
1922  PTR_ADDU "$9, %[src], "#A3" \n\t" \
1923  MMI_ULWC1($f2, $9, 0x08) \
1924  DO_UNPACK("$f2") \
1925  "pmullh $f2, $f2, $f10 \n\t" /* *53 */ \
1926  "paddh $f6, $f6, $f2 \n\t" /* *53,18,-3 */ \
1927  PTR_ADDU "$9, %[src], "#A4" \n\t" \
1928  MMI_ULWC1($f2, $9, 0x08) \
1929  DO_UNPACK("$f2") \
1930  "li $8, 0x02 \n\t" \
1931  "mtc1 $8, $f16 \n\t" \
1932  "psllh $f2, $f2, $f16 \n\t" /* 4* */ \
1933  "psubh $f6, $f6, $f2 \n\t" \
1934  "paddh $f6, $f6, $f14 \n\t" \
1935  "li $8, 0x06 \n\t" \
1936  "mtc1 $8, $f16 \n\t" \
1937  "psrah $f6, $f6, $f16 \n\t" \
1938  "sdc1 $f6, 0x10(%[dst]) \n\t" \
1939  "addiu %[h], %[h], -0x01 \n\t" \
1940  PTR_ADDU "%[src], %[src], %[stride_x1] \n\t" \
1941  PTR_ADDIU "%[dst], %[dst], 0x18 \n\t" \
1942  "bnez %[h], 1b \n\t" \
1943  : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
1944  [h]"+r"(h), \
1945  [src]"+r"(src), [dst]"+r"(dst) \
1946  : [stride_x1]"r"(src_stride), [stride_x2]"r"(2*src_stride), \
1947  [stride_x3]"r"(3*src_stride), \
1948  [rnd]"m"(rnd), [shift]"f"(shift), \
1949  [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18), \
1950  [ff_pw_3]"f"(ff_pw_3) \
1951  : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \
1952  "$f14", "$f16", "memory" \
1953  ); \
1954 }
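/*
 * Each iteration stores 8 filtered columns via TRANSFER_DONT_PACK plus
 * the 4-column tail at dst+0x10, giving the 12-int16 row stride
 * (0x18 bytes) that the horizontal 16-bit pass expects.
 */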
1955 
1956 /**
1957  * Macro to build the horizontal 16bits version of vc1_put_shift[13].
1958  * Here the data is 16 bits wide, so the parameters A1 to A4 are plain immediate offsets.
1959  *
1960  * @param NAME Either 1 or 3
1961  * @see MSPEL_FILTER13_CORE for information on A1->A4
1962  */
1963 #define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \
1964 static void \
1965 OPNAME ## vc1_hor_16b_ ## NAME ## _mmi(uint8_t *dst, mips_reg stride, \
1966  const int16_t *src, int rnd) \
1967 { \
1968  int h = 8; \
1969  DECLARE_VAR_ALL64; \
1970  DECLARE_VAR_ADDRT; \
1971  \
1972  src -= 1; \
1973  rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \
1974  \
1975  __asm__ volatile( \
1976  "xor $f0, $f0, $f0 \n\t" \
1977  LOAD_ROUNDER_MMI("%[rnd]") \
1978  "ldc1 $f10, %[ff_pw_53] \n\t" \
1979  "ldc1 $f12, %[ff_pw_18] \n\t" \
1980  ".p2align 3 \n\t" \
1981  "1: \n\t" \
1982  MSPEL_FILTER13_CORE(DONT_UNPACK, MMI_ULDC1, 2, A1, A2, A3, A4) \
1983  "li $8, 0x07 \n\t" \
1984  "mtc1 $8, $f16 \n\t" \
1985  NORMALIZE_MMI("$f16") \
1986  /* Remove bias */ \
1987  "paddh $f6, $f6, %[ff_pw_128] \n\t" \
1988  "paddh $f8, $f8, %[ff_pw_128] \n\t" \
1989  TRANSFER_DO_PACK(OP) \
1990  "addiu %[h], %[h], -0x01 \n\t" \
1991  PTR_ADDU "%[src], %[src], 0x18 \n\t" \
1992  PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1993  "bnez %[h], 1b \n\t" \
1994  : RESTRICT_ASM_ALL64 RESTRICT_ASM_ADDRT \
1995  [h]"+r"(h), \
1996  [src]"+r"(src), [dst]"+r"(dst) \
1997  : [stride]"r"(stride), [rnd]"m"(rnd), \
1998  [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18), \
1999  [ff_pw_3]"f"(ff_pw_3), [ff_pw_128]"f"(ff_pw_128) \
2000  : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \
2001  "$f14", "$f16", "memory" \
2002  ); \
2003 }
2004 
2005 /**
2006  * Macro to build the 8bits, any direction, version of vc1_put_shift[13].
2007  * Here, offset is the step along the filtered direction. Parameters passed A1 to A4 must use
2008  * %[offset_x1] (offset), %[offset_x2] (2*offset) and %[offset_x3] (3*offset).
2009  *
2010  * @param NAME Either 1 or 3
2011  * @see MSPEL_FILTER13_CORE for information on A1->A4
2012  */
2013 #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \
2014 static void \
2015 OPNAME ## vc1_## NAME ## _mmi(uint8_t *dst, const uint8_t *src, \
2016  mips_reg stride, int rnd, mips_reg offset) \
2017 { \
2018  int h = 8; \
2019  DECLARE_VAR_LOW32; \
2020  DECLARE_VAR_ADDRT; \
2021  \
2022  src -= offset; \
2023  rnd = 32-rnd; \
2024  \
2025  __asm__ volatile ( \
2026  "xor $f0, $f0, $f0 \n\t" \
2027  LOAD_ROUNDER_MMI("%[rnd]") \
2028  "ldc1 $f10, %[ff_pw_53] \n\t" \
2029  "ldc1 $f12, %[ff_pw_18] \n\t" \
2030  ".p2align 3 \n\t" \
2031  "1: \n\t" \
2032  MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4) \
2033  "li $8, 0x06 \n\t" \
2034  "mtc1 $8, $f16 \n\t" \
2035  NORMALIZE_MMI("$f16") \
2036  TRANSFER_DO_PACK(OP) \
2037  "addiu %[h], %[h], -0x01 \n\t" \
2038  PTR_ADDU "%[src], %[src], %[stride] \n\t" \
2039  PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
2040  "bnez %[h], 1b \n\t" \
2041  : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
2042  [h]"+r"(h), \
2043  [src]"+r"(src), [dst]"+r"(dst) \
2044  : [offset_x1]"r"(offset), [offset_x2]"r"(2*offset), \
2045  [offset_x3]"r"(3*offset), [stride]"r"(stride), \
2046  [rnd]"m"(rnd), \
2047  [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18), \
2048  [ff_pw_3]"f"(ff_pw_3) \
2049  : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \
2050  "$f14", "$f16", "memory" \
2051  ); \
2052 }
2053 
2054 
2055 /** 1/4 shift bicubic interpolation */
2056 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_PUT, put_)
2057 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_AVG, avg_)
2058 MSPEL_FILTER13_VER_16B(shift1, %[stride_x3], %[stride_x2], %[stride_x1], $0)
2059 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_PUT, put_)
2060 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_AVG, avg_)
2061 
2062 /** 3/4 shift bicubic interpolation */
2063 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_PUT, put_)
2064 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_AVG, avg_)
2065 MSPEL_FILTER13_VER_16B(shift3, $0, %[stride_x1], %[stride_x2], %[stride_x3])
2066 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_PUT, put_)
2067 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_AVG, avg_)
2068 
2069 typedef void (*vc1_mspel_mc_filter_ver_16bits)
2070  (int16_t *dst, const uint8_t *src, mips_reg src_stride, int rnd,
2071  int64_t shift);
2072 typedef void (*vc1_mspel_mc_filter_hor_16bits)
2073  (uint8_t *dst, mips_reg dst_stride, const int16_t *src, int rnd);
2074 typedef void (*vc1_mspel_mc_filter_8bits)
2075  (uint8_t *dst, const uint8_t *src, mips_reg stride, int rnd,
2076  mips_reg offset);
2077 
2078 /**
2079  * Interpolate fractional pel values by applying proper vertical then
2080  * horizontal filter.
2081  *
2082  * @param dst Destination buffer for interpolated pels.
2083  * @param src Source buffer.
2084  * @param stride Stride for both src and dst buffers.
2085  * @param hmode Horizontal filter (expressed in quarter pixels shift).
2086  * @param vmode Vertical filter.
2087  * @param rnd Rounding bias.
2088  */
2089 #define VC1_MSPEL_MC(OP) \
2090 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
2091  int hmode, int vmode, int rnd) \
2092 { \
2093  static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
2094  { NULL, vc1_put_ver_16b_shift1_mmi, \
2095  vc1_put_ver_16b_shift2_mmi, \
2096  vc1_put_ver_16b_shift3_mmi }; \
2097  static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
2098  { NULL, OP ## vc1_hor_16b_shift1_mmi, \
2099  OP ## vc1_hor_16b_shift2_mmi, \
2100  OP ## vc1_hor_16b_shift3_mmi }; \
2101  static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] = \
2102  { NULL, OP ## vc1_shift1_mmi, \
2103  OP ## vc1_shift2_mmi, \
2104  OP ## vc1_shift3_mmi }; \
2105  \
2106  if (vmode) { /* Vertical filter to apply */ \
2107  if (hmode) { /* Horizontal filter to apply, output to tmp */ \
2108  static const int shift_value[] = { 0, 5, 1, 5 }; \
2109  int shift = (shift_value[hmode]+shift_value[vmode])>>1; \
2110  int r; \
2111  LOCAL_ALIGNED(16, int16_t, tmp, [12*8]); \
2112  \
2113  r = (1<<(shift-1)) + rnd-1; \
2114  vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift); \
2115  \
2116  vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd); \
2117  return; \
2118  } \
2119  else { /* No horizontal filter, output 8 lines to dst */ \
2120  vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride); \
2121  return; \
2122  } \
2123  } \
2124  \
2125  /* Horizontal mode with no vertical mode */ \
2126  vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1); \
2127 } \
2128 static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
2129  int stride, int hmode, int vmode, int rnd)\
2130 { \
2131  OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
2132  OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
2133  dst += 8*stride; src += 8*stride; \
2134  OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
2135  OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
2136 }
2137 
2138 VC1_MSPEL_MC(put_)
2139 VC1_MSPEL_MC(avg_)
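/*
 * About shift_value[]: the vertical pass shifts by
 * (shift_value[hmode] + shift_value[vmode]) >> 1 and the horizontal
 * pass by a fixed 7, so the totals are 5+7=12 bits when both passes
 * are bicubic (kernel gain 64*64), 1+7=8 when both are 1/2-pel
 * (16*16), and 3+7=10 for the mixed cases (64*16).
 */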
2140 
2141 /** Macro to ease declaration of the bicubic filter interpolation functions */
2142 #define DECLARE_FUNCTION(a, b) \
2143 void ff_put_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst, \
2144  const uint8_t *src, \
2145  ptrdiff_t stride, \
2146  int rnd) \
2147 { \
2148  put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
2149 } \
2150 void ff_avg_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst, \
2151  const uint8_t *src, \
2152  ptrdiff_t stride, \
2153  int rnd) \
2154 { \
2155  avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
2156 } \
2157 void ff_put_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst, \
2158  const uint8_t *src, \
2159  ptrdiff_t stride, \
2160  int rnd) \
2161 { \
2162  put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
2163 } \
2164 void ff_avg_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst, \
2165  const uint8_t *src, \
2166  ptrdiff_t stride, \
2167  int rnd) \
2168 { \
2169  avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
2170 }
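/*
 * For instance, DECLARE_FUNCTION(1, 2) emits ff_put_vc1_mspel_mc12_mmi,
 * ff_avg_vc1_mspel_mc12_mmi and their _16 variants, each forwarding
 * hmode = 1, vmode = 2 to the dispatchers above.
 */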
2171 
2172 DECLARE_FUNCTION(0, 1)
2173 DECLARE_FUNCTION(0, 2)
2174 DECLARE_FUNCTION(0, 3)
2175 
2176 DECLARE_FUNCTION(1, 0)
2177 DECLARE_FUNCTION(1, 1)
2178 DECLARE_FUNCTION(1, 2)
2179 DECLARE_FUNCTION(1, 3)
2180 
2181 DECLARE_FUNCTION(2, 0)
2182 DECLARE_FUNCTION(2, 1)
2183 DECLARE_FUNCTION(2, 2)
2184 DECLARE_FUNCTION(2, 3)
2185 
2186 DECLARE_FUNCTION(3, 0)
2187 DECLARE_FUNCTION(3, 1)
2188 DECLARE_FUNCTION(3, 2)
2189 DECLARE_FUNCTION(3, 3)
2190 
2191 #define CHROMA_MC_8_MMI \
2192  "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
2193  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
2194  "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \
2195  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
2196  "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \
2197  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
2198  "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" \
2199  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
2200  \
2201  "pmullh %[ftmp1], %[ftmp1], %[A] \n\t" \
2202  "pmullh %[ftmp5], %[ftmp5], %[A] \n\t" \
2203  "pmullh %[ftmp2], %[ftmp2], %[B] \n\t" \
2204  "pmullh %[ftmp6], %[ftmp6], %[B] \n\t" \
2205  "pmullh %[ftmp3], %[ftmp3], %[C] \n\t" \
2206  "pmullh %[ftmp7], %[ftmp7], %[C] \n\t" \
2207  "pmullh %[ftmp4], %[ftmp4], %[D] \n\t" \
2208  "pmullh %[ftmp8], %[ftmp8], %[D] \n\t" \
2209  \
2210  "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
2211  "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
2212  "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
2213  "paddh %[ftmp1], %[ftmp1], %[ff_pw_28] \n\t" \
2214  \
2215  "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
2216  "paddh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" \
2217  "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
2218  "paddh %[ftmp5], %[ftmp5], %[ff_pw_28] \n\t" \
2219  \
2220  "psrlh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" \
2221  "psrlh %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \
2222  "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
2223 
2224 
2225 #define CHROMA_MC_4_MMI \
2226  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
2227  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
2228  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
2229  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
2230  \
2231  "pmullh %[ftmp1], %[ftmp1], %[A] \n\t" \
2232  "pmullh %[ftmp2], %[ftmp2], %[B] \n\t" \
2233  "pmullh %[ftmp3], %[ftmp3], %[C] \n\t" \
2234  "pmullh %[ftmp4], %[ftmp4], %[D] \n\t" \
2235  \
2236  "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
2237  "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
2238  "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
2239  "paddh %[ftmp1], %[ftmp1], %[ff_pw_28] \n\t" \
2240  \
2241  "psrlh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" \
2242  "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
2243 
2244 
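/*
 * Scalar model of the two chroma kernels above (a sketch; the helper
 * name is illustrative): a bilinear blend with the no-rounding bias 28
 * and a 6-bit shift, using the A/B/C/D weights computed per call in
 * the functions below. Since A+B+C+D == 64, the result always fits in
 * 8 bits without clipping.
 */
static inline uint8_t chroma_mc_scalar(const uint8_t *s, ptrdiff_t stride,
                                       int A, int B, int C, int D)
{
    return (A * s[0] + B * s[1] +
            C * s[stride] + D * s[stride + 1] + 28) >> 6;
}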
2245 void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
2246  uint8_t *src /* align 1 */,
2247  ptrdiff_t stride, int h, int x, int y)
2248 {
2249  const int A = (8 - x) * (8 - y);
2250  const int B = (x) * (8 - y);
2251  const int C = (8 - x) * (y);
2252  const int D = (x) * (y);
2253  double ftmp[10];
2254  uint32_t tmp[1];
2255  DECLARE_VAR_ALL64;
2256  DECLARE_VAR_ADDRT;
2257 
2258  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2259 
2260  __asm__ volatile(
2261  "li %[tmp0], 0x06 \n\t"
2262  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2263  "mtc1 %[tmp0], %[ftmp9] \n\t"
2264  "pshufh %[A], %[A], %[ftmp0] \n\t"
2265  "pshufh %[B], %[B], %[ftmp0] \n\t"
2266  "pshufh %[C], %[C], %[ftmp0] \n\t"
2267  "pshufh %[D], %[D], %[ftmp0] \n\t"
2268 
2269  "1: \n\t"
2270  MMI_ULDC1(%[ftmp1], %[src], 0x00)
2271  MMI_ULDC1(%[ftmp2], %[src], 0x01)
2272  PTR_ADDU "%[src], %[src], %[stride] \n\t"
2273  MMI_ULDC1(%[ftmp3], %[src], 0x00)
2274  MMI_ULDC1(%[ftmp4], %[src], 0x01)
2275 
2276  CHROMA_MC_8_MMI
2277 
2278  MMI_SDC1(%[ftmp1], %[dst], 0x00)
2279  "addiu %[h], %[h], -0x01 \n\t"
2280  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2281  "bnez %[h], 1b \n\t"
2282  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2283  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2284  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2285  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2286  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
2287  RESTRICT_ASM_ALL64
2288  RESTRICT_ASM_ADDRT
2289  [tmp0]"=&r"(tmp[0]),
2290  [src]"+&r"(src), [dst]"+&r"(dst),
2291  [h]"+&r"(h)
2292  : [stride]"r"((mips_reg)stride),
2293  [A]"f"(A), [B]"f"(B),
2294  [C]"f"(C), [D]"f"(D),
2295  [ff_pw_28]"f"(ff_pw_28)
2296  : "memory"
2297  );
2298 }
2299 
2300 void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2301  uint8_t *src /* align 1 */,
2302  ptrdiff_t stride, int h, int x, int y)
2303 {
2304  const int A = (8 - x) * (8 - y);
2305  const int B = (x) * (8 - y);
2306  const int C = (8 - x) * (y);
2307  const int D = (x) * (y);
2308  double ftmp[6];
2309  uint32_t tmp[1];
2310  DECLARE_VAR_LOW32;
2311  DECLARE_VAR_ADDRT;
2312 
2313  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2314 
2315  __asm__ volatile(
2316  "li %[tmp0], 0x06 \n\t"
2317  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2318  "mtc1 %[tmp0], %[ftmp5] \n\t"
2319  "pshufh %[A], %[A], %[ftmp0] \n\t"
2320  "pshufh %[B], %[B], %[ftmp0] \n\t"
2321  "pshufh %[C], %[C], %[ftmp0] \n\t"
2322  "pshufh %[D], %[D], %[ftmp0] \n\t"
2323 
2324  "1: \n\t"
2325  MMI_ULWC1(%[ftmp1], %[src], 0x00)
2326  MMI_ULWC1(%[ftmp2], %[src], 0x01)
2327  PTR_ADDU "%[src], %[src], %[stride] \n\t"
2328  MMI_ULWC1(%[ftmp3], %[src], 0x00)
2329  MMI_ULWC1(%[ftmp4], %[src], 0x01)
2330 
2331  CHROMA_MC_4_MMI
2332 
2333  MMI_SWC1(%[ftmp1], %[dst], 0x00)
2334  "addiu %[h], %[h], -0x01 \n\t"
2335  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2336  "bnez %[h], 1b \n\t"
2337  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2338  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2339  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2340  [tmp0]"=&r"(tmp[0]),
2341  RESTRICT_ASM_LOW32
2342  RESTRICT_ASM_ADDRT
2343  [src]"+&r"(src), [dst]"+&r"(dst),
2344  [h]"+&r"(h)
2345  : [stride]"r"((mips_reg)stride),
2346  [A]"f"(A), [B]"f"(B),
2347  [C]"f"(C), [D]"f"(D),
2348  [ff_pw_28]"f"(ff_pw_28)
2349  : "memory"
2350  );
2351 }
2352 
2353 void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
2354  uint8_t *src /* align 1 */,
2355  ptrdiff_t stride, int h, int x, int y)
2356 {
2357  const int A = (8 - x) * (8 - y);
2358  const int B = (x) * (8 - y);
2359  const int C = (8 - x) * (y);
2360  const int D = (x) * (y);
2361  double ftmp[10];
2362  uint32_t tmp[1];
2363  DECLARE_VAR_ALL64;
2364  DECLARE_VAR_ADDRT;
2365 
2366  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2367 
2368  __asm__ volatile(
2369  "li %[tmp0], 0x06 \n\t"
2370  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2371  "mtc1 %[tmp0], %[ftmp9] \n\t"
2372  "pshufh %[A], %[A], %[ftmp0] \n\t"
2373  "pshufh %[B], %[B], %[ftmp0] \n\t"
2374  "pshufh %[C], %[C], %[ftmp0] \n\t"
2375  "pshufh %[D], %[D], %[ftmp0] \n\t"
2376 
2377  "1: \n\t"
2378  MMI_ULDC1(%[ftmp1], %[src], 0x00)
2379  MMI_ULDC1(%[ftmp2], %[src], 0x01)
2380  PTR_ADDU "%[src], %[src], %[stride] \n\t"
2381  MMI_ULDC1(%[ftmp3], %[src], 0x00)
2382  MMI_ULDC1(%[ftmp4], %[src], 0x01)
2383 
2384  CHROMA_MC_8_MMI
2385 
2386  MMI_LDC1(%[ftmp2], %[dst], 0x00)
2387  "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
2388 
2389  MMI_SDC1(%[ftmp1], %[dst], 0x00)
2390  "addiu %[h], %[h], -0x01 \n\t"
2391  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2392  "bnez %[h], 1b \n\t"
2393  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2394  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2395  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2396  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2397  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
2398  [tmp0]"=&r"(tmp[0]),
2399  RESTRICT_ASM_ALL64
2400  RESTRICT_ASM_ADDRT
2401  [src]"+&r"(src), [dst]"+&r"(dst),
2402  [h]"+&r"(h)
2403  : [stride]"r"((mips_reg)stride),
2404  [A]"f"(A), [B]"f"(B),
2405  [C]"f"(C), [D]"f"(D),
2406  [ff_pw_28]"f"(ff_pw_28)
2407  : "memory"
2408  );
2409 }
2410 
2411 void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2412  uint8_t *src /* align 1 */,
2413  ptrdiff_t stride, int h, int x, int y)
2414 {
2415  const int A = (8 - x) * (8 - y);
2416  const int B = ( x) * (8 - y);
2417  const int C = (8 - x) * ( y);
2418  const int D = ( x) * ( y);
2419  double ftmp[6];
2420  uint32_t tmp[1];
2421  DECLARE_VAR_LOW32;
2422  DECLARE_VAR_ADDRT;
2423 
2424  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2425 
2426  __asm__ volatile(
2427  "li %[tmp0], 0x06 \n\t"
2428  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2429  "mtc1 %[tmp0], %[ftmp5] \n\t"
2430  "pshufh %[A], %[A], %[ftmp0] \n\t"
2431  "pshufh %[B], %[B], %[ftmp0] \n\t"
2432  "pshufh %[C], %[C], %[ftmp0] \n\t"
2433  "pshufh %[D], %[D], %[ftmp0] \n\t"
2434 
2435  "1: \n\t"
2436  MMI_ULWC1(%[ftmp1], %[src], 0x00)
2437  MMI_ULWC1(%[ftmp2], %[src], 0x01)
2438  PTR_ADDU "%[src], %[src], %[stride] \n\t"
2439  MMI_ULWC1(%[ftmp3], %[src], 0x00)
2440  MMI_ULWC1(%[ftmp4], %[src], 0x01)
2441 
2442  CHROMA_MC_4_MMI
2443 
2444  MMI_LWC1(%[ftmp2], %[dst], 0x00)
2445  "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
2446 
2447  MMI_SWC1(%[ftmp1], %[dst], 0x00)
2448  "addiu %[h], %[h], -0x01 \n\t"
2449  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2450  "bnez %[h], 1b \n\t"
2451  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2452  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2453  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2454  [tmp0]"=&r"(tmp[0]),
2455  RESTRICT_ASM_LOW32
2456  RESTRICT_ASM_ADDRT
2457  [src]"+&r"(src), [dst]"+&r"(dst),
2458  [h]"+&r"(h)
2459  : [stride]"r"((mips_reg)stride),
2460  [A]"f"(A), [B]"f"(B),
2461  [C]"f"(C), [D]"f"(D),
2462  [ff_pw_28]"f"(ff_pw_28)
2463  : "memory"
2464  );
2465 }