h264.c@ 10184

最後變更在這個檔案從10184是 5776,由 vboxsync 提交於 17 年前
ffmpeg: exported to OSE
檔案大小: 317.3 KB

行
1	/*
2	* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
3	* Copyright (c) 2003 Michael Niedermayer <[email protected]>
4	*
5	* This library is free software; you can redistribute it and/or
6	* modify it under the terms of the GNU Lesser General Public
7	* License as published by the Free Software Foundation; either
8	* version 2 of the License, or (at your option) any later version.
9	*
10	* This library is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	* Lesser General Public License for more details.
14	*
15	* You should have received a copy of the GNU Lesser General Public
16	* License along with this library; if not, write to the Free Software
17	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18	*
19	*/
20
21	/**
22	* @file h264.c
23	* H.264 / AVC / MPEG4 part10 codec.
24	* @author Michael Niedermayer <[email protected]>
25	*/
26
27	#include "common.h"
28	#include "dsputil.h"
29	#include "avcodec.h"
30	#include "mpegvideo.h"
31	#include "h264data.h"
32	#include "golomb.h"
33
34	#include "cabac.h"
35
36	//#undef NDEBUG
37	#include <assert.h>
38
/* Redirect two identifiers to names that are not declared anywhere in this
 * file, so that any use of them here fails to compile.
 * NOTE(review): presumably these are MpegEncContext fields that must not be
 * used by the H.264 code -- confirm against mpegvideo.h. */
#define interlaced_dct interlaced_dct_is_a_bad_name
#define mb_intra mb_intra_isnt_initalized_see_mb_type

/* Block indices used for the luma and chroma DC blocks. */
#define LUMA_DC_BLOCK_INDEX   25
#define CHROMA_DC_BLOCK_INDEX 26

/* Bit widths associated with the static VLC tables declared further down. */
#define CHROMA_DC_COEFF_TOKEN_VLC_BITS 8
#define COEFF_TOKEN_VLC_BITS           8
#define TOTAL_ZEROS_VLC_BITS           9
#define CHROMA_DC_TOTAL_ZEROS_VLC_BITS 3
#define RUN_VLC_BITS                   3
#define RUN7_VLC_BITS                  6

/* Capacities of H264Context.sps_buffer[] and H264Context.pps_buffer[]. */
#define MAX_SPS_COUNT 32
#define MAX_PPS_COUNT 256

/* Capacity of H264Context.mmco[]. */
#define MAX_MMCO_COUNT 66

/* Compiling in interlaced support reduces the speed
 * of progressive decoding by about 2%. */
#define ALLOW_INTERLACE

/* With ALLOW_INTERLACE the three predicates below read the per-macroblock /
 * per-frame state from the H264Context named `h` in the calling scope.
 * Without it they collapse to the constant 0, and IS_INTERLACED() is
 * overridden to 0 as well, so interlaced-only branches become dead code. */
#ifdef ALLOW_INTERLACE
#define MB_MBAFF h->mb_mbaff
#define MB_FIELD h->mb_field_decoding_flag
#define FRAME_MBAFF h->mb_aff_frame
#else
#define MB_MBAFF 0
#define MB_FIELD 0
#define FRAME_MBAFF 0
#undef  IS_INTERLACED
#define IS_INTERLACED(mb_type) 0
#endif
72
73	/**
74	* Sequence parameter set
75	*/
76	typedef struct SPS{
77
78	int profile_idc;
79	int level_idc;
80	int transform_bypass; ///< qpprime_y_zero_transform_bypass_flag
81	int log2_max_frame_num; ///< log2_max_frame_num_minus4 + 4
82	int poc_type; ///< pic_order_cnt_type
83	int log2_max_poc_lsb; ///< log2_max_pic_order_cnt_lsb_minus4
84	int delta_pic_order_always_zero_flag;
85	int offset_for_non_ref_pic;
86	int offset_for_top_to_bottom_field;
87	int poc_cycle_length; ///< num_ref_frames_in_pic_order_cnt_cycle
88	int ref_frame_count; ///< num_ref_frames
89	int gaps_in_frame_num_allowed_flag;
90	int mb_width; ///< frame_width_in_mbs_minus1 + 1
91	int mb_height; ///< frame_height_in_mbs_minus1 + 1
92	int frame_mbs_only_flag;
93	int mb_aff; ///<mb_adaptive_frame_field_flag
94	int direct_8x8_inference_flag;
95	int crop; ///< frame_cropping_flag
96	int crop_left; ///< frame_cropping_rect_left_offset
97	int crop_right; ///< frame_cropping_rect_right_offset
98	int crop_top; ///< frame_cropping_rect_top_offset
99	int crop_bottom; ///< frame_cropping_rect_bottom_offset
100	int vui_parameters_present_flag;
101	AVRational sar;
102	int timing_info_present_flag;
103	uint32_t num_units_in_tick;
104	uint32_t time_scale;
105	int fixed_frame_rate_flag;
106	short offset_for_ref_frame[256]; //FIXME dyn aloc?
107	int bitstream_restriction_flag;
108	int num_reorder_frames;
109	int scaling_matrix_present;
110	uint8_t scaling_matrix4[6][16];
111	uint8_t scaling_matrix8[2][64];
112	}SPS;
113
/**
 * Picture parameter set.
 *
 * The trailing comment on each field names the bitstream syntax element the
 * field is associated with; where a "+ N" is shown the stored value is the
 * syntax element plus that bias.
 */
typedef struct PPS{
    int sps_id;
    int cabac;                         ///< entropy_coding_mode_flag
    int pic_order_present;             ///< pic_order_present_flag
    int slice_group_count;             ///< num_slice_groups_minus1 + 1
    int mb_slice_group_map_type;
    int ref_count[2];                  ///< num_ref_idx_l0/1_active_minus1 + 1
    int weighted_pred;                 ///< weighted_pred_flag
    int weighted_bipred_idc;
    int init_qp;                       ///< pic_init_qp_minus26 + 26
    int init_qs;                       ///< pic_init_qs_minus26 + 26
    int chroma_qp_index_offset;
    int deblocking_filter_parameters_present; ///< deblocking_filter_parameters_present_flag
    int constrained_intra_pred;        ///< constrained_intra_pred_flag
    int redundant_pic_cnt_present;     ///< redundant_pic_cnt_present_flag
    int transform_8x8_mode;            ///< transform_8x8_mode_flag
    uint8_t scaling_matrix4[6][16];    ///< six 4x4 scaling lists, 16 entries each
    uint8_t scaling_matrix8[2][64];    ///< two 8x8 scaling lists, 64 entries each
}PPS;
136
/**
 * Memory management control operation opcode.
 *
 * MMCO_END is pinned to 0; the remaining enumerators take the consecutive
 * values 1..6 in declaration order.
 */
typedef enum MMCOOpcode{
    MMCO_END=0,         /* 0 */
    MMCO_SHORT2UNUSED,  /* 1 */
    MMCO_LONG2UNUSED,   /* 2 */
    MMCO_SHORT2LONG,    /* 3 */
    MMCO_SET_MAX_LONG,  /* 4 */
    MMCO_RESET,         /* 5 */
    MMCO_LONG,          /* 6 */
} MMCOOpcode;
149
150	/**
151	* Memory management control operation.
152	*/
153	typedef struct MMCO{
154	MMCOOpcode opcode;
155	int short_frame_num;
156	int long_index;
157	} MMCO;
158
159	/**
160	* H264Context
161	*/
162	typedef struct H264Context{
163	MpegEncContext s;
164	int nal_ref_idc;
165	int nal_unit_type;
166	#define NAL_SLICE 1
167	#define NAL_DPA 2
168	#define NAL_DPB 3
169	#define NAL_DPC 4
170	#define NAL_IDR_SLICE 5
171	#define NAL_SEI 6
172	#define NAL_SPS 7
173	#define NAL_PPS 8
174	#define NAL_AUD 9
175	#define NAL_END_SEQUENCE 10
176	#define NAL_END_STREAM 11
177	#define NAL_FILLER_DATA 12
178	#define NAL_SPS_EXT 13
179	#define NAL_AUXILIARY_SLICE 19
180	uint8_t *rbsp_buffer;
181	unsigned int rbsp_buffer_size;
182
183	/**
184	* Used to parse AVC variant of h264
185	*/
186	int is_avc; ///< this flag is != 0 if codec is avc1
187	int got_avcC; ///< flag used to parse avcC data only once
188	int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4)
189
190	int chroma_qp; //QPc
191
192	int prev_mb_skipped;
193	int next_mb_skipped;
194
195	//prediction stuff
196	int chroma_pred_mode;
197	int intra16x16_pred_mode;
198
199	int top_mb_xy;
200	int left_mb_xy[2];
201
202	int8_t intra4x4_pred_mode_cache[5*8];
203	int8_t (*intra4x4_pred_mode)[8];
204	void (pred4x4 [9+3])(uint8_t src, uint8_t *topright, int stride);//FIXME move to dsp?
205	void (pred8x8l [9+3])(uint8_t src, int topleft, int topright, int stride);
206	void (pred8x8 [4+3])(uint8_t src, int stride);
207	void (pred16x16[4+3])(uint8_t src, int stride);
208	unsigned int topleft_samples_available;
209	unsigned int top_samples_available;
210	unsigned int topright_samples_available;
211	unsigned int left_samples_available;
212	uint8_t (top_borders[2])[16+28];
213	uint8_t left_border[2(17+29)];
214
215	/**
216	* non zero coeff count cache.
217	* is 64 if not available.
218	*/
219	DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]);
220	uint8_t (*non_zero_count)[16];
221
222	/**
223	* Motion vector cache.
224	*/
225	DECLARE_ALIGNED_8(int16_t, mv_cache[2][5*8][2]);
226	DECLARE_ALIGNED_8(int8_t, ref_cache[2][5*8]);
227	#define LIST_NOT_USED -1 //FIXME rename?
228	#define PART_NOT_AVAILABLE -2
229
230	/**
231	* is 1 if the specific list MV&references are set to 0,0,-2.
232	*/
233	int mv_cache_clean[2];
234
235	/**
236	* number of neighbors (top and/or left) that used 8x8 dct
237	*/
238	int neighbor_transform_size;
239
240	/**
241	* block_offset[ 0..23] for frame macroblocks
242	* block_offset[24..47] for field macroblocks
243	*/
244	int block_offset[2*(16+8)];
245
246	uint32_t *mb2b_xy; //FIXME are these 4 a good idea?
247	uint32_t *mb2b8_xy;
248	int b_stride; //FIXME use s->b4_stride
249	int b8_stride;
250
251	int mb_linesize; ///< may be equal to s->linesize or s->linesize*2, for mbaff
252	int mb_uvlinesize;
253
254	int emu_edge_width;
255	int emu_edge_height;
256
257	int halfpel_flag;
258	int thirdpel_flag;
259
260	int unknown_svq3_flag;
261	int next_slice_index;
262
263	SPS sps_buffer[MAX_SPS_COUNT];
264	SPS sps; ///< current sps
265
266	PPS pps_buffer[MAX_PPS_COUNT];
267	/**
268	* current pps
269	*/
270	PPS pps; //FIXME move to Picture perhaps? (->no) do we need that?
271
272	uint32_t dequant4_buffer[6][52][16];
273	uint32_t dequant8_buffer[2][52][64];
274	uint32_t (*dequant4_coeff[6])[16];
275	uint32_t (*dequant8_coeff[2])[64];
276	int dequant_coeff_pps; ///< reinit tables when pps changes
277
278	int slice_num;
279	uint8_t *slice_table_base;
280	uint8_t slice_table; ///< slice_table_base + 2mb_stride + 1
281	int slice_type;
282	int slice_type_fixed;
283
284	//interlacing specific flags
285	int mb_aff_frame;
286	int mb_field_decoding_flag;
287	int mb_mbaff; ///< mb_aff_frame && mb_field_decoding_flag
288
289	int sub_mb_type[4];
290
291	//POC stuff
292	int poc_lsb;
293	int poc_msb;
294	int delta_poc_bottom;
295	int delta_poc[2];
296	int frame_num;
297	int prev_poc_msb; ///< poc_msb of the last reference pic for POC type 0
298	int prev_poc_lsb; ///< poc_lsb of the last reference pic for POC type 0
299	int frame_num_offset; ///< for POC type 2
300	int prev_frame_num_offset; ///< for POC type 2
301	int prev_frame_num; ///< frame_num of the last pic for POC type 1/2
302
303	/**
304	* frame_num for frames or 2*frame_num for field pics.
305	*/
306	int curr_pic_num;
307
308	/**
309	* max_frame_num or 2*max_frame_num for field pics.
310	*/
311	int max_pic_num;
312
313	//Weighted pred stuff
314	int use_weight;
315	int use_weight_chroma;
316	int luma_log2_weight_denom;
317	int chroma_log2_weight_denom;
318	int luma_weight[2][48];
319	int luma_offset[2][48];
320	int chroma_weight[2][48][2];
321	int chroma_offset[2][48][2];
322	int implicit_weight[48][48];
323
324	//deblock
325	int deblocking_filter; ///< disable_deblocking_filter_idc with 1<->0
326	int slice_alpha_c0_offset;
327	int slice_beta_offset;
328
329	int redundant_pic_count;
330
331	int direct_spatial_mv_pred;
332	int dist_scale_factor[16];
333	int dist_scale_factor_field[32];
334	int map_col_to_list0[2][16];
335	int map_col_to_list0_field[2][32];
336
337	/**
338	* num_ref_idx_l0/1_active_minus1 + 1
339	*/
340	int ref_count[2]; ///< counts frames or fields, depending on current mb mode
341	Picture *short_ref[32];
342	Picture *long_ref[32];
343	Picture default_ref_list[2][32];
344	Picture ref_list[2][48]; ///< 0..15: frame refs, 16..47: mbaff field refs
345	Picture *delayed_pic[16]; //FIXME size?
346	Picture *delayed_output_pic;
347
348	/**
349	* memory management control operations buffer.
350	*/
351	MMCO mmco[MAX_MMCO_COUNT];
352	int mmco_index;
353
354	int long_ref_count; ///< number of actual long term references
355	int short_ref_count; ///< number of actual short term references
356
357	//data partitioning
358	GetBitContext intra_gb;
359	GetBitContext inter_gb;
360	GetBitContext *intra_gb_ptr;
361	GetBitContext *inter_gb_ptr;
362
363	DECLARE_ALIGNED_8(DCTELEM, mb[16*24]);
364
365	/**
366	* Cabac
367	*/
368	CABACContext cabac;
369	uint8_t cabac_state[460];
370	int cabac_init_idc;
371
372	/* 0x100 -> non null luma_dc, 0x80/0x40 -> non null chroma_dc (cb/cr), 0x?0 -> chroma_cbp(0,1,2), 0x0? luma_cbp */
373	uint16_t *cbp_table;
374	int top_cbp;
375	int left_cbp;
376	/* chroma_pred_mode for i4x4 or i16x16, else 0 */
377	uint8_t *chroma_pred_mode_table;
378	int last_qscale_diff;
379	int16_t (*mvd_table[2])[2];
380	DECLARE_ALIGNED_8(int16_t, mvd_cache[2][5*8][2]);
381	uint8_t *direct_table;
382	uint8_t direct_cache[5*8];
383
384	uint8_t zigzag_scan[16];
385	uint8_t zigzag_scan8x8[64];
386	uint8_t zigzag_scan8x8_cavlc[64];
387	uint8_t field_scan[16];
388	uint8_t field_scan8x8[64];
389	uint8_t field_scan8x8_cavlc[64];
390	const uint8_t *zigzag_scan_q0;
391	const uint8_t *zigzag_scan8x8_q0;
392	const uint8_t *zigzag_scan8x8_cavlc_q0;
393	const uint8_t *field_scan_q0;
394	const uint8_t *field_scan8x8_q0;
395	const uint8_t *field_scan8x8_cavlc_q0;
396
397	int x264_build;
398	}H264Context;
399
400	static VLC coeff_token_vlc[4];
401	static VLC chroma_dc_coeff_token_vlc;
402
403	static VLC total_zeros_vlc[15];
404	static VLC chroma_dc_total_zeros_vlc[3];
405
406	static VLC run_vlc[6];
407	static VLC run7_vlc;
408
409	static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
410	static void svq3_add_idct_c(uint8_t dst, DCTELEM block, int stride, int qp, int dc);
411	static void filter_mb( H264Context h, int mb_x, int mb_y, uint8_t img_y, uint8_t img_cb, uint8_t img_cr, unsigned int linesize, unsigned int uvlinesize);
412
413	static always_inline uint32_t pack16to32(int a, int b){
414	#ifdef WORDS_BIGENDIAN
415	return (b&0xFFFF) + (a<<16);
416	#else
417	return (a&0xFFFF) + (b<<16);
418	#endif
419	}
420
421	/**
422	* fill a rectangle.
423	* @param h height of the rectangle, should be a constant
424	* @param w width of the rectangle, should be a constant
425	* @param size the size of val (1 or 4), should be a constant
426	*/
427	static always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
428	uint8_t p= (uint8_t)vp;
429	assert(size==1 \|\| size==4);
430	assert(w<=4);
431
432	w *= size;
433	stride *= size;
434
435	assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
436	assert((stride&(w-1))==0);
437	if(w==2){
438	const uint16_t v= size==4 ? val : val*0x0101;
439	(uint16_t)(p + 0*stride)= v;
440	if(h==1) return;
441	(uint16_t)(p + 1*stride)= v;
442	if(h==2) return;
443	(uint16_t)(p + 2*stride)=
444	(uint16_t)(p + 3*stride)= v;
445	}else if(w==4){
446	const uint32_t v= size==4 ? val : val*0x01010101;
447	(uint32_t)(p + 0*stride)= v;
448	if(h==1) return;
449	(uint32_t)(p + 1*stride)= v;
450	if(h==2) return;
451	(uint32_t)(p + 2*stride)=
452	(uint32_t)(p + 3*stride)= v;
453	}else if(w==8){
454	//gcc can't optimize 64bit math on x86_32
455	#if defined(ARCH_X86_64) \|\| (defined(MP_WORDSIZE) && MP_WORDSIZE >= 64)
456	const uint64_t v= val*0x0100000001ULL;
457	(uint64_t)(p + 0*stride)= v;
458	if(h==1) return;
459	(uint64_t)(p + 1*stride)= v;
460	if(h==2) return;
461	(uint64_t)(p + 2*stride)=
462	(uint64_t)(p + 3*stride)= v;
463	}else if(w==16){
464	const uint64_t v= val*0x0100000001ULL;
465	(uint64_t)(p + 0+0*stride)=
466	(uint64_t)(p + 8+0*stride)=
467	(uint64_t)(p + 0+1*stride)=
468	(uint64_t)(p + 8+1*stride)= v;
469	if(h==2) return;
470	(uint64_t)(p + 0+2*stride)=
471	(uint64_t)(p + 8+2*stride)=
472	(uint64_t)(p + 0+3*stride)=
473	(uint64_t)(p + 8+3*stride)= v;
474	#else
475	(uint32_t)(p + 0+0*stride)=
476	(uint32_t)(p + 4+0*stride)= val;
477	if(h==1) return;
478	(uint32_t)(p + 0+1*stride)=
479	(uint32_t)(p + 4+1*stride)= val;
480	if(h==2) return;
481	(uint32_t)(p + 0+2*stride)=
482	(uint32_t)(p + 4+2*stride)=
483	(uint32_t)(p + 0+3*stride)=
484	(uint32_t)(p + 4+3*stride)= val;
485	}else if(w==16){
486	(uint32_t)(p + 0+0*stride)=
487	(uint32_t)(p + 4+0*stride)=
488	(uint32_t)(p + 8+0*stride)=
489	(uint32_t)(p +12+0*stride)=
490	(uint32_t)(p + 0+1*stride)=
491	(uint32_t)(p + 4+1*stride)=
492	(uint32_t)(p + 8+1*stride)=
493	(uint32_t)(p +12+1*stride)= val;
494	if(h==2) return;
495	(uint32_t)(p + 0+2*stride)=
496	(uint32_t)(p + 4+2*stride)=
497	(uint32_t)(p + 8+2*stride)=
498	(uint32_t)(p +12+2*stride)=
499	(uint32_t)(p + 0+3*stride)=
500	(uint32_t)(p + 4+3*stride)=
501	(uint32_t)(p + 8+3*stride)=
502	(uint32_t)(p +12+3*stride)= val;
503	#endif
504	}else
505	assert(0);
506	assert(h==4);
507	}
508
509	static void fill_caches(H264Context *h, int mb_type, int for_deblock){
510	MpegEncContext * const s = &h->s;
511	const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
512	int topleft_xy, top_xy, topright_xy, left_xy[2];
513	int topleft_type, top_type, topright_type, left_type[2];
514	int left_block[8];
515	int i;
516
517	//FIXME deblocking can skip fill_caches much of the time with multiple slices too.
518	// the actual condition is whether we're on the edge of a slice,
519	// and even then the intra and nnz parts are unnecessary.
520	if(for_deblock && h->slice_num == 1 && !FRAME_MBAFF)
521	return;
522
523	//wow what a mess, why didn't they simplify the interlacing&intra stuff, i can't imagine that these complex rules are worth it
524
525	top_xy = mb_xy - s->mb_stride;
526	topleft_xy = top_xy - 1;
527	topright_xy= top_xy + 1;
528	left_xy[1] = left_xy[0] = mb_xy-1;
529	left_block[0]= 0;
530	left_block[1]= 1;
531	left_block[2]= 2;
532	left_block[3]= 3;
533	left_block[4]= 7;
534	left_block[5]= 10;
535	left_block[6]= 8;
536	left_block[7]= 11;
537	if(FRAME_MBAFF){
538	const int pair_xy = s->mb_x + (s->mb_y & ~1)*s->mb_stride;
539	const int top_pair_xy = pair_xy - s->mb_stride;
540	const int topleft_pair_xy = top_pair_xy - 1;
541	const int topright_pair_xy = top_pair_xy + 1;
542	const int topleft_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
543	const int top_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
544	const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
545	const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
546	const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
547	const int bottom = (s->mb_y & 1);
548	tprintf("fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
549	if (bottom
550	? !curr_mb_frame_flag // bottom macroblock
551	: (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
552	) {
553	top_xy -= s->mb_stride;
554	}
555	if (bottom
556	? !curr_mb_frame_flag // bottom macroblock
557	: (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
558	) {
559	topleft_xy -= s->mb_stride;
560	}
561	if (bottom
562	? !curr_mb_frame_flag // bottom macroblock
563	: (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
564	) {
565	topright_xy -= s->mb_stride;
566	}
567	if (left_mb_frame_flag != curr_mb_frame_flag) {
568	left_xy[1] = left_xy[0] = pair_xy - 1;
569	if (curr_mb_frame_flag) {
570	if (bottom) {
571	left_block[0]= 2;
572	left_block[1]= 2;
573	left_block[2]= 3;
574	left_block[3]= 3;
575	left_block[4]= 8;
576	left_block[5]= 11;
577	left_block[6]= 8;
578	left_block[7]= 11;
579	} else {
580	left_block[0]= 0;
581	left_block[1]= 0;
582	left_block[2]= 1;
583	left_block[3]= 1;
584	left_block[4]= 7;
585	left_block[5]= 10;
586	left_block[6]= 7;
587	left_block[7]= 10;
588	}
589	} else {
590	left_xy[1] += s->mb_stride;
591	//left_block[0]= 0;
592	left_block[1]= 2;
593	left_block[2]= 0;
594	left_block[3]= 2;
595	//left_block[4]= 7;
596	left_block[5]= 10;
597	left_block[6]= 7;
598	left_block[7]= 10;
599	}
600	}
601	}
602
603	h->top_mb_xy = top_xy;
604	h->left_mb_xy[0] = left_xy[0];
605	h->left_mb_xy[1] = left_xy[1];
606	if(for_deblock){
607	topleft_type = h->slice_table[topleft_xy ] < 255 ? s->current_picture.mb_type[topleft_xy] : 0;
608	top_type = h->slice_table[top_xy ] < 255 ? s->current_picture.mb_type[top_xy] : 0;
609	topright_type= h->slice_table[topright_xy] < 255 ? s->current_picture.mb_type[topright_xy]: 0;
610	left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
611	left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
612
613	if(FRAME_MBAFF && !IS_INTRA(mb_type)){
614	int list;
615	int v = (uint16_t)&h->non_zero_count[mb_xy][14];
616	for(i=0; i<16; i++)
617	h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
618	for(list=0; list<1+(h->slice_type==B_TYPE); list++){
619	if(USES_LIST(mb_type,list)){
620	uint32_t src = (uint32_t)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
621	uint32_t dst = (uint32_t)h->mv_cache[list][scan8[0]];
622	uint8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
623	for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
624	dst[0] = src[0];
625	dst[1] = src[1];
626	dst[2] = src[2];
627	dst[3] = src[3];
628	}
629	(uint32_t)&h->ref_cache[list][scan8[ 0]] =
630	(uint32_t)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
631	ref += h->b8_stride;
632	(uint32_t)&h->ref_cache[list][scan8[ 8]] =
633	(uint32_t)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
634	}else{
635	fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
636	fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
637	}
638	}
639	}
640	}else{
641	topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
642	top_type = h->slice_table[top_xy ] == h->slice_num ? s->current_picture.mb_type[top_xy] : 0;
643	topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
644	left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
645	left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
646	}
647
648	if(IS_INTRA(mb_type)){
649	h->topleft_samples_available=
650	h->top_samples_available=
651	h->left_samples_available= 0xFFFF;
652	h->topright_samples_available= 0xEEEA;
653
654	if(!IS_INTRA(top_type) && (top_type==0 \|\| h->pps.constrained_intra_pred)){
655	h->topleft_samples_available= 0xB3FF;
656	h->top_samples_available= 0x33FF;
657	h->topright_samples_available= 0x26EA;
658	}
659	for(i=0; i<2; i++){
660	if(!IS_INTRA(left_type[i]) && (left_type[i]==0 \|\| h->pps.constrained_intra_pred)){
661	h->topleft_samples_available&= 0xDF5F;
662	h->left_samples_available&= 0x5F5F;
663	}
664	}
665
666	if(!IS_INTRA(topleft_type) && (topleft_type==0 \|\| h->pps.constrained_intra_pred))
667	h->topleft_samples_available&= 0x7FFF;
668
669	if(!IS_INTRA(topright_type) && (topright_type==0 \|\| h->pps.constrained_intra_pred))
670	h->topright_samples_available&= 0xFBFF;
671
672	if(IS_INTRA4x4(mb_type)){
673	if(IS_INTRA4x4(top_type)){
674	h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
675	h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
676	h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
677	h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
678	}else{
679	int pred;
680	if(!top_type \|\| (IS_INTER(top_type) && h->pps.constrained_intra_pred))
681	pred= -1;
682	else{
683	pred= 2;
684	}
685	h->intra4x4_pred_mode_cache[4+8*0]=
686	h->intra4x4_pred_mode_cache[5+8*0]=
687	h->intra4x4_pred_mode_cache[6+8*0]=
688	h->intra4x4_pred_mode_cache[7+8*0]= pred;
689	}
690	for(i=0; i<2; i++){
691	if(IS_INTRA4x4(left_type[i])){
692	h->intra4x4_pred_mode_cache[3+81 + 28i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2i]];
693	h->intra4x4_pred_mode_cache[3+82 + 28i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2i]];
694	}else{
695	int pred;
696	if(!left_type[i] \|\| (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
697	pred= -1;
698	else{
699	pred= 2;
700	}
701	h->intra4x4_pred_mode_cache[3+81 + 28*i]=
702	h->intra4x4_pred_mode_cache[3+82 + 28*i]= pred;
703	}
704	}
705	}
706	}
707
708
709	/*
710	0 . T T. T T T T
711	1 L . .L . . . .
712	2 L . .L . . . .
713	3 . T TL . . . .
714	4 L . .L . . . .
715	5 L . .. . . . .
716	*/
717	//FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
718	if(top_type){
719	h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
720	h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
721	h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
722	h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
723
724	h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
725	h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
726
727	h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
728	h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
729
730	}else{
731	h->non_zero_count_cache[4+8*0]=
732	h->non_zero_count_cache[5+8*0]=
733	h->non_zero_count_cache[6+8*0]=
734	h->non_zero_count_cache[7+8*0]=
735
736	h->non_zero_count_cache[1+8*0]=
737	h->non_zero_count_cache[2+8*0]=
738
739	h->non_zero_count_cache[1+8*3]=
740	h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
741
742	}
743
744	for (i=0; i<2; i++) {
745	if(left_type[i]){
746	h->non_zero_count_cache[3+81 + 28i]= h->non_zero_count[left_xy[i]][left_block[0+2i]];
747	h->non_zero_count_cache[3+82 + 28i]= h->non_zero_count[left_xy[i]][left_block[1+2i]];
748	h->non_zero_count_cache[0+81 + 8i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
749	h->non_zero_count_cache[0+84 + 8i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
750	}else{
751	h->non_zero_count_cache[3+81 + 28*i]=
752	h->non_zero_count_cache[3+82 + 28*i]=
753	h->non_zero_count_cache[0+81 + 8i]=
754	h->non_zero_count_cache[0+84 + 8i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
755	}
756	}
757
758	if( h->pps.cabac ) {
759	// top_cbp
760	if(top_type) {
761	h->top_cbp = h->cbp_table[top_xy];
762	} else if(IS_INTRA(mb_type)) {
763	h->top_cbp = 0x1C0;
764	} else {
765	h->top_cbp = 0;
766	}
767	// left_cbp
768	if (left_type[0]) {
769	h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
770	} else if(IS_INTRA(mb_type)) {
771	h->left_cbp = 0x1C0;
772	} else {
773	h->left_cbp = 0;
774	}
775	if (left_type[0]) {
776	h->left_cbp \|= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
777	}
778	if (left_type[1]) {
779	h->left_cbp \|= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
780	}
781	}
782
783	#if 1
784	//FIXME direct mb can skip much of this
785	if(IS_INTER(mb_type) \|\| IS_DIRECT(mb_type)){
786	int list;
787	for(list=0; list<1+(h->slice_type==B_TYPE); list++){
788	if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
789	/*if(!h->mv_cache_clean[list]){
790	memset(h->mv_cache [list], 0, 852*sizeof(int16_t)); //FIXME clean only input? clean at all?
791	memset(h->ref_cache[list], PART_NOT_AVAILABLE, 85sizeof(int8_t));
792	h->mv_cache_clean[list]= 1;
793	}*/
794	continue;
795	}
796	h->mv_cache_clean[list]= 0;
797
798	if(USES_LIST(top_type, list)){
799	const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
800	const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
801	(uint32_t)h->mv_cache[list][scan8[0] + 0 - 18]= (uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
802	(uint32_t)h->mv_cache[list][scan8[0] + 1 - 18]= (uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
803	(uint32_t)h->mv_cache[list][scan8[0] + 2 - 18]= (uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
804	(uint32_t)h->mv_cache[list][scan8[0] + 3 - 18]= (uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
805	h->ref_cache[list][scan8[0] + 0 - 1*8]=
806	h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
807	h->ref_cache[list][scan8[0] + 2 - 1*8]=
808	h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
809	}else{
810	(uint32_t)h->mv_cache [list][scan8[0] + 0 - 1*8]=
811	(uint32_t)h->mv_cache [list][scan8[0] + 1 - 1*8]=
812	(uint32_t)h->mv_cache [list][scan8[0] + 2 - 1*8]=
813	(uint32_t)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
814	(uint32_t)&h->ref_cache[list][scan8[0] + 0 - 18]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)0x01010101;
815	}
816
817	//FIXME unify cleanup or sth
818	if(USES_LIST(left_type[0], list)){
819	const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
820	const int b8_xy= h->mb2b8_xy[left_xy[0]] + 1;
821	(uint32_t)h->mv_cache[list][scan8[0] - 1 + 08]= (uint32_t)s->current_picture.motion_val[list][b_xy + h->b_strideleft_block[0]];
822	(uint32_t)h->mv_cache[list][scan8[0] - 1 + 18]= (uint32_t)s->current_picture.motion_val[list][b_xy + h->b_strideleft_block[1]];
823	h->ref_cache[list][scan8[0] - 1 + 08]= s->current_picture.ref_index[list][b8_xy + h->b8_stride(left_block[0]>>1)];
824	h->ref_cache[list][scan8[0] - 1 + 18]= s->current_picture.ref_index[list][b8_xy + h->b8_stride(left_block[1]>>1)];
825	}else{
826	(uint32_t)h->mv_cache [list][scan8[0] - 1 + 0*8]=
827	(uint32_t)h->mv_cache [list][scan8[0] - 1 + 1*8]= 0;
828	h->ref_cache[list][scan8[0] - 1 + 0*8]=
829	h->ref_cache[list][scan8[0] - 1 + 1*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
830	}
831
832	if(USES_LIST(left_type[1], list)){
833	const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
834	const int b8_xy= h->mb2b8_xy[left_xy[1]] + 1;
835	(uint32_t)h->mv_cache[list][scan8[0] - 1 + 28]= (uint32_t)s->current_picture.motion_val[list][b_xy + h->b_strideleft_block[2]];
836	(uint32_t)h->mv_cache[list][scan8[0] - 1 + 38]= (uint32_t)s->current_picture.motion_val[list][b_xy + h->b_strideleft_block[3]];
837	h->ref_cache[list][scan8[0] - 1 + 28]= s->current_picture.ref_index[list][b8_xy + h->b8_stride(left_block[2]>>1)];
838	h->ref_cache[list][scan8[0] - 1 + 38]= s->current_picture.ref_index[list][b8_xy + h->b8_stride(left_block[3]>>1)];
839	}else{
840	(uint32_t)h->mv_cache [list][scan8[0] - 1 + 2*8]=
841	(uint32_t)h->mv_cache [list][scan8[0] - 1 + 3*8]= 0;
842	h->ref_cache[list][scan8[0] - 1 + 2*8]=
843	h->ref_cache[list][scan8[0] - 1 + 3*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
844	assert((!left_type[0]) == (!left_type[1]));
845	}
846
847	if(for_deblock \|\| (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred))
848	continue;
849
850	if(USES_LIST(topleft_type, list)){
851	const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
852	const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
853	(uint32_t)h->mv_cache[list][scan8[0] - 1 - 18]= (uint32_t*)s->current_picture.motion_val[list][b_xy];
854	h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
855	}else{
856	(uint32_t)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
857	h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
858	}
859
860	if(USES_LIST(topright_type, list)){
861	const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
862	const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
863	(uint32_t)h->mv_cache[list][scan8[0] + 4 - 18]= (uint32_t*)s->current_picture.motion_val[list][b_xy];
864	h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
865	}else{
866	(uint32_t)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
867	h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
868	}
869
870
871	h->ref_cache[list][scan8[5 ]+1] =
872	h->ref_cache[list][scan8[7 ]+1] =
873	h->ref_cache[list][scan8[13]+1] = //FIXME remove past 3 (init somewhere else)
874	h->ref_cache[list][scan8[4 ]] =
875	h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
876	(uint32_t)h->mv_cache [list][scan8[5 ]+1]=
877	(uint32_t)h->mv_cache [list][scan8[7 ]+1]=
878	(uint32_t)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
879	(uint32_t)h->mv_cache [list][scan8[4 ]]=
880	(uint32_t)h->mv_cache [list][scan8[12]]= 0;
881
882	if( h->pps.cabac ) {
883	/* XXX beurk, Load mvd */
884	if(USES_LIST(top_type, list)){
885	const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
886	(uint32_t)h->mvd_cache[list][scan8[0] + 0 - 18]= (uint32_t*)h->mvd_table[list][b_xy + 0];
887	(uint32_t)h->mvd_cache[list][scan8[0] + 1 - 18]= (uint32_t*)h->mvd_table[list][b_xy + 1];
888	(uint32_t)h->mvd_cache[list][scan8[0] + 2 - 18]= (uint32_t*)h->mvd_table[list][b_xy + 2];
889	(uint32_t)h->mvd_cache[list][scan8[0] + 3 - 18]= (uint32_t*)h->mvd_table[list][b_xy + 3];
890	}else{
891	(uint32_t)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
892	(uint32_t)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
893	(uint32_t)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
894	(uint32_t)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
895	}
896	if(USES_LIST(left_type[0], list)){
897	const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
898	(uint32_t)h->mvd_cache[list][scan8[0] - 1 + 08]= (uint32_t)h->mvd_table[list][b_xy + h->b_strideleft_block[0]];
899	(uint32_t)h->mvd_cache[list][scan8[0] - 1 + 18]= (uint32_t)h->mvd_table[list][b_xy + h->b_strideleft_block[1]];
900	}else{
901	(uint32_t)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
902	(uint32_t)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
903	}
904	if(USES_LIST(left_type[1], list)){
905	const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
906	(uint32_t)h->mvd_cache[list][scan8[0] - 1 + 28]= (uint32_t)h->mvd_table[list][b_xy + h->b_strideleft_block[2]];
907	(uint32_t)h->mvd_cache[list][scan8[0] - 1 + 38]= (uint32_t)h->mvd_table[list][b_xy + h->b_strideleft_block[3]];
908	}else{
909	(uint32_t)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
910	(uint32_t)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
911	}
912	(uint32_t)h->mvd_cache [list][scan8[5 ]+1]=
913	(uint32_t)h->mvd_cache [list][scan8[7 ]+1]=
914	(uint32_t)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
915	(uint32_t)h->mvd_cache [list][scan8[4 ]]=
916	(uint32_t)h->mvd_cache [list][scan8[12]]= 0;
917
918	if(h->slice_type == B_TYPE){
919	fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
920
921	if(IS_DIRECT(top_type)){
922	(uint32_t)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
923	}else if(IS_8X8(top_type)){
924	int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
925	h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
926	h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
927	}else{
928	(uint32_t)&h->direct_cache[scan8[0] - 1*8]= 0;
929	}
930
931	if(IS_DIRECT(left_type[0]))
932	h->direct_cache[scan8[0] - 1 + 0*8]= 1;
933	else if(IS_8X8(left_type[0]))
934	h->direct_cache[scan8[0] - 1 + 08]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride(left_block[0]>>1)];
935	else
936	h->direct_cache[scan8[0] - 1 + 0*8]= 0;
937
938	if(IS_DIRECT(left_type[1]))
939	h->direct_cache[scan8[0] - 1 + 2*8]= 1;
940	else if(IS_8X8(left_type[1]))
941	h->direct_cache[scan8[0] - 1 + 28]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride(left_block[2]>>1)];
942	else
943	h->direct_cache[scan8[0] - 1 + 2*8]= 0;
944	}
945	}
946
947	if(FRAME_MBAFF){
948	#define MAP_MVS\
949	MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
950	MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
951	MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
952	MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
953	MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
954	MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
955	MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
956	MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
957	MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
958	MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
959	if(MB_FIELD){
960	#define MAP_F2F(idx, mb_type)\
961	if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
962	h->ref_cache[list][idx] <<= 1;\
963	h->mv_cache[list][idx][1] /= 2;\
964	h->mvd_cache[list][idx][1] /= 2;\
965	}
966	MAP_MVS
967	#undef MAP_F2F
968	}else{
969	#define MAP_F2F(idx, mb_type)\
970	if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
971	h->ref_cache[list][idx] >>= 1;\
972	h->mv_cache[list][idx][1] <<= 1;\
973	h->mvd_cache[list][idx][1] <<= 1;\
974	}
975	MAP_MVS
976	#undef MAP_F2F
977	}
978	}
979	}
980	}
981	#endif
982
983	h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
984	}
985
986	static inline void write_back_intra_pred_mode(H264Context *h){
987	MpegEncContext * const s = &h->s;
988	const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
989
990	h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
991	h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
992	h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
993	h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
994	h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
995	h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
996	h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
997	}
998
999	/**
1000	* checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
1001	*/
1002	static inline int check_intra4x4_pred_mode(H264Context *h){
1003	MpegEncContext * const s = &h->s;
1004	static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
1005	static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
1006	int i;
1007
1008	if(!(h->top_samples_available&0x8000)){
1009	for(i=0; i<4; i++){
1010	int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
1011	if(status<0){
1012	av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
1013	return -1;
1014	} else if(status){
1015	h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
1016	}
1017	}
1018	}
1019
1020	if(!(h->left_samples_available&0x8000)){
1021	for(i=0; i<4; i++){
1022	int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
1023	if(status<0){
1024	av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
1025	return -1;
1026	} else if(status){
1027	h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
1028	}
1029	}
1030	}
1031
1032	return 0;
1033	} //FIXME cleanup like next
1034
1035	/**
1036	* checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
1037	*/
1038	static inline int check_intra_pred_mode(H264Context *h, int mode){
1039	MpegEncContext * const s = &h->s;
1040	static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
1041	static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
1042
1043	if(mode < 0 \|\| mode > 6) {
1044	av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
1045	return -1;
1046	}
1047
1048	if(!(h->top_samples_available&0x8000)){
1049	mode= top[ mode ];
1050	if(mode<0){
1051	av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
1052	return -1;
1053	}
1054	}
1055
1056	if(!(h->left_samples_available&0x8000)){
1057	mode= left[ mode ];
1058	if(mode<0){
1059	av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
1060	return -1;
1061	}
1062	}
1063
1064	return mode;
1065	}
1066
1067	/**
1068	* gets the predicted intra4x4 prediction mode.
1069	*/
1070	static inline int pred_intra_mode(H264Context *h, int n){
1071	const int index8= scan8[n];
1072	const int left= h->intra4x4_pred_mode_cache[index8 - 1];
1073	const int top = h->intra4x4_pred_mode_cache[index8 - 8];
1074	const int min= FFMIN(left, top);
1075
1076	tprintf("mode:%d %d min:%d\n", left ,top, min);
1077
1078	if(min<0) return DC_PRED;
1079	else return min;
1080	}
1081
1082	static inline void write_back_non_zero_count(H264Context *h){
1083	MpegEncContext * const s = &h->s;
1084	const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
1085
1086	h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
1087	h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
1088	h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
1089	h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
1090	h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
1091	h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
1092	h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
1093
1094	h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
1095	h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
1096	h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
1097
1098	h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
1099	h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
1100	h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
1101
1102	if(FRAME_MBAFF){
1103	// store all luma nnzs, for deblocking
1104	int v = 0, i;
1105	for(i=0; i<16; i++)
1106	v += (!!h->non_zero_count_cache[scan8[i]]) << i;
1107	(uint16_t)&h->non_zero_count[mb_xy][14] = v;
1108	}
1109	}
1110
1111	/**
1112	* gets the predicted number of non zero coefficients.
1113	* @param n block index
1114	*/
1115	static inline int pred_non_zero_count(H264Context *h, int n){
1116	const int index8= scan8[n];
1117	const int left= h->non_zero_count_cache[index8 - 1];
1118	const int top = h->non_zero_count_cache[index8 - 8];
1119	int i= left + top;
1120
1121	if(i<64) i= (i+1)>>1;
1122
1123	tprintf("pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
1124
1125	return i&31;
1126	}
1127
1128	static inline int fetch_diagonal_mv(H264Context h, const int16_t *C, int i, int list, int part_width){
1129	const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
1130
1131	/* there is no consistent mapping of mvs to neighboring locations that will
1132	* make mbaff happy, so we can't move all this logic to fill_caches */
1133	if(FRAME_MBAFF){
1134	MpegEncContext *s = &h->s;
1135	const int *mb_types = s->current_picture_ptr->mb_type;
1136	const int16_t *mv;
1137	(uint32_t)h->mv_cache[list][scan8[0]-2] = 0;
1138	*C = h->mv_cache[list][scan8[0]-2];
1139
1140	if(!MB_FIELD
1141	&& (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
1142	int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
1143	if(IS_INTERLACED(mb_types[topright_xy])){
1144	#define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
1145	const int x4 = X4, y4 = Y4;\
1146	const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
1147	if(!USES_LIST(mb_type,list) && !IS_8X8(mb_type))\
1148	return LIST_NOT_USED;\
1149	mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
1150	h->mv_cache[list][scan8[0]-2][0] = mv[0];\
1151	h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
1152	return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
1153
1154	SET_DIAG_MV(2, >>1, s->mb_x4+(i&7)-4+part_width, s->mb_y*4-1);
1155	}
1156	}
1157	if(topright_ref == PART_NOT_AVAILABLE
1158	&& ((s->mb_y&1) \|\| i >= scan8[0]+8) && (i&7)==4
1159	&& h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
1160	if(!MB_FIELD
1161	&& IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
1162	SET_DIAG_MV(2, >>1, s->mb_x4-1, (s->mb_y\|1)4+(s->mb_y&1)2+(i>>4)-1);
1163	}
1164	if(MB_FIELD
1165	&& !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
1166	&& i >= scan8[0]+8){
1167	// leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
1168	SET_DIAG_MV(>>1, <<1, s->mb_x4-1, (s->mb_y&~1)4 - 1 + ((i-scan8[0])>>3)*2);
1169	}
1170	}
1171	#undef SET_DIAG_MV
1172	}
1173
1174	if(topright_ref != PART_NOT_AVAILABLE){
1175	*C= h->mv_cache[list][ i - 8 + part_width ];
1176	return topright_ref;
1177	}else{
1178	tprintf("topright MV not available\n");
1179
1180	*C= h->mv_cache[list][ i - 8 - 1 ];
1181	return h->ref_cache[list][ i - 8 - 1 ];
1182	}
1183	}
1184
1185	/**
1186	* gets the predicted MV.
1187	* @param n the block index
1188	* @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
1189	* @param mx the x component of the predicted motion vector
1190	* @param my the y component of the predicted motion vector
1191	*/
1192	static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
1193	const int index8= scan8[n];
1194	const int top_ref= h->ref_cache[list][ index8 - 8 ];
1195	const int left_ref= h->ref_cache[list][ index8 - 1 ];
1196	const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
1197	const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
1198	const int16_t * C;
1199	int diagonal_ref, match_count;
1200
1201	assert(part_width==1 \|\| part_width==2 \|\| part_width==4);
1202
1203	/* mv_cache
1204	B . . A T T T T
1205	U . . L . . , .
1206	U . . L . . . .
1207	U . . L . . , .
1208	. . . L . . . .
1209	*/
1210
1211	diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
1212	match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
1213	tprintf("pred_motion match_count=%d\n", match_count);
1214	if(match_count > 1){ //most common
1215	*mx= mid_pred(A[0], B[0], C[0]);
1216	*my= mid_pred(A[1], B[1], C[1]);
1217	}else if(match_count==1){
1218	if(left_ref==ref){
1219	*mx= A[0];
1220	*my= A[1];
1221	}else if(top_ref==ref){
1222	*mx= B[0];
1223	*my= B[1];
1224	}else{
1225	*mx= C[0];
1226	*my= C[1];
1227	}
1228	}else{
1229	if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
1230	*mx= A[0];
1231	*my= A[1];
1232	}else{
1233	*mx= mid_pred(A[0], B[0], C[0]);
1234	*my= mid_pred(A[1], B[1], C[1]);
1235	}
1236	}
1237
1238	tprintf("pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, mx, my, h->s.mb_x, h->s.mb_y, n, list);
1239	}
1240
1241	/**
1242	* gets the directionally predicted 16x8 MV.
1243	* @param n the block index
1244	* @param mx the x component of the predicted motion vector
1245	* @param my the y component of the predicted motion vector
1246	*/
1247	static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
1248	if(n==0){
1249	const int top_ref= h->ref_cache[list][ scan8[0] - 8 ];
1250	const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
1251
1252	tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
1253
1254	if(top_ref == ref){
1255	*mx= B[0];
1256	*my= B[1];
1257	return;
1258	}
1259	}else{
1260	const int left_ref= h->ref_cache[list][ scan8[8] - 1 ];
1261	const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
1262
1263	tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
1264
1265	if(left_ref == ref){
1266	*mx= A[0];
1267	*my= A[1];
1268	return;
1269	}
1270	}
1271
1272	//RARE
1273	pred_motion(h, n, 4, list, ref, mx, my);
1274	}
1275
1276	/**
1277	* gets the directionally predicted 8x16 MV.
1278	* @param n the block index
1279	* @param mx the x component of the predicted motion vector
1280	* @param my the y component of the predicted motion vector
1281	*/
1282	static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
1283	if(n==0){
1284	const int left_ref= h->ref_cache[list][ scan8[0] - 1 ];
1285	const int16_t * const A= h->mv_cache[list][ scan8[0] - 1 ];
1286
1287	tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
1288
1289	if(left_ref == ref){
1290	*mx= A[0];
1291	*my= A[1];
1292	return;
1293	}
1294	}else{
1295	const int16_t * C;
1296	int diagonal_ref;
1297
1298	diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
1299
1300	tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
1301
1302	if(diagonal_ref == ref){
1303	*mx= C[0];
1304	*my= C[1];
1305	return;
1306	}
1307	}
1308
1309	//RARE
1310	pred_motion(h, n, 2, list, ref, mx, my);
1311	}
1312
1313	static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
1314	const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
1315	const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
1316
1317	tprintf("pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
1318
1319	if(top_ref == PART_NOT_AVAILABLE \|\| left_ref == PART_NOT_AVAILABLE
1320	\|\| (top_ref == 0 && (uint32_t)h->mv_cache[0][ scan8[0] - 8 ] == 0)
1321	\|\| (left_ref == 0 && (uint32_t)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
1322
1323	mx = my = 0;
1324	return;
1325	}
1326
1327	pred_motion(h, 0, 4, 0, 0, mx, my);
1328
1329	return;
1330	}
1331
1332	static inline void direct_dist_scale_factor(H264Context * const h){
1333	const int poc = h->s.current_picture_ptr->poc;
1334	const int poc1 = h->ref_list[1][0].poc;
1335	int i;
1336	for(i=0; i<h->ref_count[0]; i++){
1337	int poc0 = h->ref_list[0][i].poc;
1338	int td = clip(poc1 - poc0, -128, 127);
1339	if(td == 0 /* FIXME \|\| pic0 is a long-term ref */){
1340	h->dist_scale_factor[i] = 256;
1341	}else{
1342	int tb = clip(poc - poc0, -128, 127);
1343	int tx = (16384 + (ABS(td) >> 1)) / td;
1344	h->dist_scale_factor[i] = clip((tb*tx + 32) >> 6, -1024, 1023);
1345	}
1346	}
1347	if(FRAME_MBAFF){
1348	for(i=0; i<h->ref_count[0]; i++){
1349	h->dist_scale_factor_field[2*i] =
1350	h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
1351	}
1352	}
1353	}
1354	static inline void direct_ref_list_init(H264Context * const h){
1355	MpegEncContext * const s = &h->s;
1356	Picture * const ref1 = &h->ref_list[1][0];
1357	Picture * const cur = s->current_picture_ptr;
1358	int list, i, j;
1359	if(cur->pict_type == I_TYPE)
1360	cur->ref_count[0] = 0;
1361	if(cur->pict_type != B_TYPE)
1362	cur->ref_count[1] = 0;
1363	for(list=0; list<2; list++){
1364	cur->ref_count[list] = h->ref_count[list];
1365	for(j=0; j<h->ref_count[list]; j++)
1366	cur->ref_poc[list][j] = h->ref_list[list][j].poc;
1367	}
1368	if(cur->pict_type != B_TYPE \|\| h->direct_spatial_mv_pred)
1369	return;
1370	for(list=0; list<2; list++){
1371	for(i=0; i<ref1->ref_count[list]; i++){
1372	const int poc = ref1->ref_poc[list][i];
1373	h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
1374	for(j=0; j<h->ref_count[list]; j++)
1375	if(h->ref_list[list][j].poc == poc){
1376	h->map_col_to_list0[list][i] = j;
1377	break;
1378	}
1379	}
1380	}
1381	if(FRAME_MBAFF){
1382	for(list=0; list<2; list++){
1383	for(i=0; i<ref1->ref_count[list]; i++){
1384	j = h->map_col_to_list0[list][i];
1385	h->map_col_to_list0_field[list][2i] = 2j;
1386	h->map_col_to_list0_field[list][2i+1] = 2j+1;
1387	}
1388	}
1389	}
1390	}
1391
1392	static inline void pred_direct_motion(H264Context * const h, int *mb_type){
1393	MpegEncContext * const s = &h->s;
1394	const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
1395	const int b8_xy = 2s->mb_x + 2s->mb_y*h->b8_stride;
1396	const int b4_xy = 4s->mb_x + 4s->mb_y*h->b_stride;
1397	const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
1398	const int16_t (l1mv0)[2] = (const int16_t ()[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
1399	const int16_t (l1mv1)[2] = (const int16_t ()[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
1400	const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
1401	const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
1402	const int is_b8x8 = IS_8X8(*mb_type);
1403	int sub_mb_type;
1404	int i8, i4;
1405
1406	#define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16\|MB_TYPE_INTRA4x4\|MB_TYPE_INTRA16x16\|MB_TYPE_INTRA_PCM)
1407	if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
1408	/* FIXME save sub mb types from previous frames (or derive from MVs)
1409	* so we know exactly what block size to use */
1410	sub_mb_type = MB_TYPE_8x8\|MB_TYPE_P0L0\|MB_TYPE_P0L1\|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1411	*mb_type = MB_TYPE_8x8\|MB_TYPE_L0L1;
1412	}else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
1413	sub_mb_type = MB_TYPE_16x16\|MB_TYPE_P0L0\|MB_TYPE_P0L1\|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1414	mb_type = MB_TYPE_16x16\|MB_TYPE_P0L0\|MB_TYPE_P0L1\|MB_TYPE_DIRECT2; / B_16x16 */
1415	}else{
1416	sub_mb_type = MB_TYPE_16x16\|MB_TYPE_P0L0\|MB_TYPE_P0L1\|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1417	*mb_type = MB_TYPE_8x8\|MB_TYPE_L0L1;
1418	}
1419	if(!is_b8x8)
1420	*mb_type \|= MB_TYPE_DIRECT2;
1421	if(MB_FIELD)
1422	*mb_type \|= MB_TYPE_INTERLACED;
1423
1424	tprintf("mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
1425
1426	if(h->direct_spatial_mv_pred){
1427	int ref[2];
1428	int mv[2][2];
1429	int list;
1430
1431	/* FIXME interlacing + spatial direct uses wrong colocated block positions */
1432
1433	/* ref = min(neighbors) */
1434	for(list=0; list<2; list++){
1435	int refa = h->ref_cache[list][scan8[0] - 1];
1436	int refb = h->ref_cache[list][scan8[0] - 8];
1437	int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1438	if(refc == -2)
1439	refc = h->ref_cache[list][scan8[0] - 8 - 1];
1440	ref[list] = refa;
1441	if(ref[list] < 0 \|\| (refb < ref[list] && refb >= 0))
1442	ref[list] = refb;
1443	if(ref[list] < 0 \|\| (refc < ref[list] && refc >= 0))
1444	ref[list] = refc;
1445	if(ref[list] < 0)
1446	ref[list] = -1;
1447	}
1448
1449	if(ref[0] < 0 && ref[1] < 0){
1450	ref[0] = ref[1] = 0;
1451	mv[0][0] = mv[0][1] =
1452	mv[1][0] = mv[1][1] = 0;
1453	}else{
1454	for(list=0; list<2; list++){
1455	if(ref[list] >= 0)
1456	pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1457	else
1458	mv[list][0] = mv[list][1] = 0;
1459	}
1460	}
1461
1462	if(ref[1] < 0){
1463	*mb_type &= ~MB_TYPE_P0L1;
1464	sub_mb_type &= ~MB_TYPE_P0L1;
1465	}else if(ref[0] < 0){
1466	*mb_type &= ~MB_TYPE_P0L0;
1467	sub_mb_type &= ~MB_TYPE_P0L0;
1468	}
1469
1470	if(IS_16X16(*mb_type)){
1471	fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1472	fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1473	if(!IS_INTRA(mb_type_col)
1474	&& ( (l1ref0[0] == 0 && ABS(l1mv0[0][0]) <= 1 && ABS(l1mv0[0][1]) <= 1)
1475	\|\| (l1ref0[0] < 0 && l1ref1[0] == 0 && ABS(l1mv1[0][0]) <= 1 && ABS(l1mv1[0][1]) <= 1
1476	&& (h->x264_build>33 \|\| !h->x264_build)))){
1477	if(ref[0] > 0)
1478	fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1479	else
1480	fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
1481	if(ref[1] > 0)
1482	fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1483	else
1484	fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
1485	}else{
1486	fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1487	fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1488	}
1489	}else{
1490	for(i8=0; i8<4; i8++){
1491	const int x8 = i8&1;
1492	const int y8 = i8>>1;
1493
1494	if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1495	continue;
1496	h->sub_mb_type[i8] = sub_mb_type;
1497
1498	fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1499	fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1500	fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1501	fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1502
1503	/* col_zero_flag */
1504	if(!IS_INTRA(mb_type_col) && ( l1ref0[x8 + y8*h->b8_stride] == 0
1505	\|\| (l1ref0[x8 + y8h->b8_stride] < 0 && l1ref1[x8 + y8h->b8_stride] == 0
1506	&& (h->x264_build>33 \|\| !h->x264_build)))){
1507	const int16_t (l1mv)[2]= l1ref0[x8 + y8h->b8_stride] == 0 ? l1mv0 : l1mv1;
1508	if(IS_SUB_8X8(sub_mb_type)){
1509	const int16_t mv_col = l1mv[x83 + y83h->b_stride];
1510	if(ABS(mv_col[0]) <= 1 && ABS(mv_col[1]) <= 1){
1511	if(ref[0] == 0)
1512	fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1513	if(ref[1] == 0)
1514	fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1515	}
1516	}else
1517	for(i4=0; i4<4; i4++){
1518	const int16_t mv_col = l1mv[x82 + (i4&1) + (y82 + (i4>>1))h->b_stride];
1519	if(ABS(mv_col[0]) <= 1 && ABS(mv_col[1]) <= 1){
1520	if(ref[0] == 0)
1521	(uint32_t)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1522	if(ref[1] == 0)
1523	(uint32_t)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1524	}
1525	}
1526	}
1527	}
1528	}
1529	}else{ /* direct temporal mv pred */
1530	const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1531	const int *dist_scale_factor = h->dist_scale_factor;
1532
1533	if(FRAME_MBAFF){
1534	if(IS_INTERLACED(*mb_type)){
1535	map_col_to_list0[0] = h->map_col_to_list0_field[0];
1536	map_col_to_list0[1] = h->map_col_to_list0_field[1];
1537	dist_scale_factor = h->dist_scale_factor_field;
1538	}
1539	if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1540	/* FIXME assumes direct_8x8_inference == 1 */
1541	const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1542	int mb_types_col[2];
1543	int y_shift;
1544
1545	*mb_type = MB_TYPE_8x8\|MB_TYPE_L0L1
1546	\| (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
1547	\| (*mb_type & MB_TYPE_INTERLACED);
1548	sub_mb_type = MB_TYPE_P0L0\|MB_TYPE_P0L1\|MB_TYPE_DIRECT2\|MB_TYPE_16x16;
1549
1550	if(IS_INTERLACED(*mb_type)){
1551	/* frame to field scaling */
1552	mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1553	mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1554	if(s->mb_y&1){
1555	l1ref0 -= 2*h->b8_stride;
1556	l1ref1 -= 2*h->b8_stride;
1557	l1mv0 -= 4*h->b_stride;
1558	l1mv1 -= 4*h->b_stride;
1559	}
1560	y_shift = 0;
1561
1562	if( (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
1563	&& (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
1564	&& !is_b8x8)
1565	*mb_type \|= MB_TYPE_16x8;
1566	else
1567	*mb_type \|= MB_TYPE_8x8;
1568	}else{
1569	/* field to frame scaling */
1570	/* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
1571	* but in MBAFF, top and bottom POC are equal */
1572	int dy = (s->mb_y&1) ? 1 : 2;
1573	mb_types_col[0] =
1574	mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1575	l1ref0 += dy*h->b8_stride;
1576	l1ref1 += dy*h->b8_stride;
1577	l1mv0 += 2dyh->b_stride;
1578	l1mv1 += 2dyh->b_stride;
1579	y_shift = 2;
1580
1581	if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA\|MB_TYPE_16x8))
1582	&& !is_b8x8)
1583	*mb_type \|= MB_TYPE_16x16;
1584	else
1585	*mb_type \|= MB_TYPE_8x8;
1586	}
1587
1588	for(i8=0; i8<4; i8++){
1589	const int x8 = i8&1;
1590	const int y8 = i8>>1;
1591	int ref0, scale;
1592	const int16_t (*l1mv)[2]= l1mv0;
1593
1594	if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1595	continue;
1596	h->sub_mb_type[i8] = sub_mb_type;
1597
1598	fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1599	if(IS_INTRA(mb_types_col[y8])){
1600	fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1601	fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1602	fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1603	continue;
1604	}
1605
1606	ref0 = l1ref0[x8 + (y82>>y_shift)h->b8_stride];
1607	if(ref0 >= 0)
1608	ref0 = map_col_to_list0[0][ref0*2>>y_shift];
1609	else{
1610	ref0 = map_col_to_list0[1][l1ref1[x8 + (y82>>y_shift)h->b8_stride]*2>>y_shift];
1611	l1mv= l1mv1;
1612	}
1613	scale = dist_scale_factor[ref0];
1614	fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1615
1616	{
1617	const int16_t mv_col = l1mv[x83 + (y86>>y_shift)h->b_stride];
1618	int my_col = (mv_col[1]<<y_shift)/2;
1619	int mx = (scale * mv_col[0] + 128) >> 8;
1620	int my = (scale * my_col + 128) >> 8;
1621	fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1622	fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1623	}
1624	}
1625	return;
1626	}
1627	}
1628
1629	/* one-to-one mv scaling */
1630
1631	if(IS_16X16(*mb_type)){
1632	fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1633	if(IS_INTRA(mb_type_col)){
1634	fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
1635	fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
1636	fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
1637	}else{
1638	const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1639	: map_col_to_list0[1][l1ref1[0]];
1640	const int scale = dist_scale_factor[ref0];
1641	const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1642	int mv_l0[2];
1643	mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1644	mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1645	fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref0, 1);
1646	fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0],mv_l0[1]), 4);
1647	fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]), 4);
1648	}
1649	}else{
1650	for(i8=0; i8<4; i8++){
1651	const int x8 = i8&1;
1652	const int y8 = i8>>1;
1653	int ref0, scale;
1654	const int16_t (*l1mv)[2]= l1mv0;
1655
1656	if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1657	continue;
1658	h->sub_mb_type[i8] = sub_mb_type;
1659	fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1660	if(IS_INTRA(mb_type_col)){
1661	fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1662	fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1663	fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1664	continue;
1665	}
1666
1667	ref0 = l1ref0[x8 + y8*h->b8_stride];
1668	if(ref0 >= 0)
1669	ref0 = map_col_to_list0[0][ref0];
1670	else{
1671	ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1672	l1mv= l1mv1;
1673	}
1674	scale = dist_scale_factor[ref0];
1675
1676	fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1677	if(IS_SUB_8X8(sub_mb_type)){
1678	const int16_t mv_col = l1mv[x83 + y83h->b_stride];
1679	int mx = (scale * mv_col[0] + 128) >> 8;
1680	int my = (scale * mv_col[1] + 128) >> 8;
1681	fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1682	fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1683	}else
1684	for(i4=0; i4<4; i4++){
1685	const int16_t mv_col = l1mv[x82 + (i4&1) + (y82 + (i4>>1))h->b_stride];
1686	int16_t mv_l0 = h->mv_cache[0][scan8[i84+i4]];
1687	mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1688	mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1689	(uint32_t)h->mv_cache[1][scan8[i8*4+i4]] =
1690	pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1691	}
1692	}
1693	}
1694	}
1695	}
1696
1697	static inline void write_back_motion(H264Context *h, int mb_type){
1698	MpegEncContext * const s = &h->s;
1699	const int b_xy = 4s->mb_x + 4s->mb_y*h->b_stride;
1700	const int b8_xy= 2s->mb_x + 2s->mb_y*h->b8_stride;
1701	int list;
1702
1703	if(!USES_LIST(mb_type, 0))
1704	fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1705
1706	for(list=0; list<2; list++){
1707	int y;
1708	if(!USES_LIST(mb_type, list))
1709	continue;
1710
1711	for(y=0; y<4; y++){
1712	(uint64_t)s->current_picture.motion_val[list][b_xy + 0 + yh->b_stride]= (uint64_t)h->mv_cache[list][scan8[0]+0 + 8y];
1713	(uint64_t)s->current_picture.motion_val[list][b_xy + 2 + yh->b_stride]= (uint64_t)h->mv_cache[list][scan8[0]+2 + 8y];
1714	}
1715	if( h->pps.cabac ) {
1716	for(y=0; y<4; y++){
1717	(uint64_t)h->mvd_table[list][b_xy + 0 + yh->b_stride]= (uint64_t)h->mvd_cache[list][scan8[0]+0 + 8y];
1718	(uint64_t)h->mvd_table[list][b_xy + 2 + yh->b_stride]= (uint64_t)h->mvd_cache[list][scan8[0]+2 + 8y];
1719	}
1720	}
1721
1722	{
1723	uint8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1724	ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1725	ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1726	ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1727	ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1728	}
1729	}
1730
1731	if(h->slice_type == B_TYPE && h->pps.cabac){
1732	if(IS_8X8(mb_type)){
1733	uint8_t *direct_table = &h->direct_table[b8_xy];
1734	direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1735	direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1736	direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1737	}
1738	}
1739	}
1740
1741	/**
1742	* Decodes a network abstraction layer unit.
1743	* @param consumed is the number of bytes used as input
1744	* @param length is the length of the array
1745	* @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1746	* @returns decoded bytes, might be src+1 if no escapes
1747	*/
1748	static uint8_t decode_nal(H264Context h, uint8_t src, int dst_length, int *consumed, int length){
1749	int i, si, di;
1750	uint8_t *dst;
1751
1752	// src[0]&0x80; //forbidden bit
1753	h->nal_ref_idc= src[0]>>5;
1754	h->nal_unit_type= src[0]&0x1F;
1755
1756	src++; length--;
1757	#if 0
1758	for(i=0; i<length; i++)
1759	printf("%2X ", src[i]);
1760	#endif
1761	for(i=0; i+1<length; i+=2){
1762	if(src[i]) continue;
1763	if(i>0 && src[i-1]==0) i--;
1764	if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1765	if(src[i+2]!=3){
1766	/* startcode, so we must be past the end */
1767	length=i;
1768	}
1769	break;
1770	}
1771	}
1772
1773	if(i>=length-1){ //no escaped 0
1774	*dst_length= length;
1775	*consumed= length+1; //+1 for the header
1776	return src;
1777	}
1778
1779	h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length);
1780	dst= h->rbsp_buffer;
1781
1782	//printf("decoding esc\n");
1783	si=di=0;
1784	while(si<length){
1785	//remove escapes (very rare 1:2^22)
1786	if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1787	if(src[si+2]==3){ //escape
1788	dst[di++]= 0;
1789	dst[di++]= 0;
1790	si+=3;
1791	continue;
1792	}else //next start code
1793	break;
1794	}
1795
1796	dst[di++]= src[si++];
1797	}
1798
1799	*dst_length= di;
1800	*consumed= si + 1;//+1 for the header
1801	//FIXME store exact number of bits in the getbitcontext (its needed for decoding)
1802	return dst;
1803	}
1804
1805	#if 0
1806	/**
1807	* @param src the data which should be escaped
1808	* @param dst the target buffer, dst+1 == src is allowed as a special case
1809	* @param length the length of the src data
1810	* @param dst_length the length of the dst array
1811	* @returns length of escaped data in bytes or -1 if an error occured
1812	*/
1813	static int encode_nal(H264Context h, uint8_t dst, uint8_t *src, int length, int dst_length){
1814	int i, escape_count, si, di;
1815	uint8_t *temp;
1816
1817	assert(length>=0);
1818	assert(dst_length>0);
1819
1820	dst[0]= (h->nal_ref_idc<<5) + h->nal_unit_type;
1821
1822	if(length==0) return 1;
1823
1824	escape_count= 0;
1825	for(i=0; i<length; i+=2){
1826	if(src[i]) continue;
1827	if(i>0 && src[i-1]==0)
1828	i--;
1829	if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1830	escape_count++;
1831	i+=2;
1832	}
1833	}
1834
1835	if(escape_count==0){
1836	if(dst+1 != src)
1837	memcpy(dst+1, src, length);
1838	return length + 1;
1839	}
1840
1841	if(length + escape_count + 1> dst_length)
1842	return -1;
1843
1844	//this should be damn rare (hopefully)
1845
1846	h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length + escape_count);
1847	temp= h->rbsp_buffer;
1848	//printf("encoding esc\n");
1849
1850	si= 0;
1851	di= 0;
1852	while(si < length){
1853	if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1854	temp[di++]= 0; si++;
1855	temp[di++]= 0; si++;
1856	temp[di++]= 3;
1857	temp[di++]= src[si++];
1858	}
1859	else
1860	temp[di++]= src[si++];
1861	}
1862	memcpy(dst+1, temp, length+escape_count);
1863
1864	assert(di == length+escape_count);
1865
1866	return di + 1;
1867	}
1868
1869	/**
1870	* write 1,10,100,1000,... for alignment, yes its exactly inverse to mpeg4
1871	*/
1872	static void encode_rbsp_trailing(PutBitContext *pb){
1873	int length;
1874	put_bits(pb, 1, 1);
1875	length= (-put_bits_count(pb))&7;
1876	if(length) put_bits(pb, length, 0);
1877	}
1878	#endif
1879
/**
 * identifies the exact end of the bitstream
 * Looks for the lowest set bit in the byte at src.
 * @param src pointer to the byte to inspect
 * @return the length of the trailing (1..8, counted from the least
 *         significant bit up to and including the first set bit),
 *         or 0 if damaged (byte is zero)
 */
static int decode_rbsp_trailing(uint8_t *src){
    int bits= *src;
    int pos= 1;

    tprintf("rbsp trailing %X\n", bits);

    while(pos < 9){
        if(bits & 1)
            return pos;
        bits >>= 1;
        pos++;
    }
    return 0;
}
1896
1897	/**
1898	* idct tranforms the 16 dc values and dequantize them.
1899	* @param qp quantization parameter
1900	*/
1901	static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1902	#define stride 16
1903	int i;
1904	int temp[16]; //FIXME check if this is a good idea
1905	static const int x_offset[4]={0, 1stride, 4 stride, 5*stride};
1906	static const int y_offset[4]={0, 2stride, 8 stride, 10*stride};
1907
1908	//memset(block, 64, 2*256);
1909	//return;
1910	for(i=0; i<4; i++){
1911	const int offset= y_offset[i];
1912	const int z0= block[offset+stride0] + block[offset+stride4];
1913	const int z1= block[offset+stride0] - block[offset+stride4];
1914	const int z2= block[offset+stride1] - block[offset+stride5];
1915	const int z3= block[offset+stride1] + block[offset+stride5];
1916
1917	temp[4*i+0]= z0+z3;
1918	temp[4*i+1]= z1+z2;
1919	temp[4*i+2]= z1-z2;
1920	temp[4*i+3]= z0-z3;
1921	}
1922
1923	for(i=0; i<4; i++){
1924	const int offset= x_offset[i];
1925	const int z0= temp[40+i] + temp[42+i];
1926	const int z1= temp[40+i] - temp[42+i];
1927	const int z2= temp[41+i] - temp[43+i];
1928	const int z3= temp[41+i] + temp[43+i];
1929
1930	block[stride0 +offset]= ((((z0 + z3)qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
1931	block[stride2 +offset]= ((((z1 + z2)qmul + 128 ) >> 8));
1932	block[stride8 +offset]= ((((z1 - z2)qmul + 128 ) >> 8));
1933	block[stride10+offset]= ((((z0 - z3)qmul + 128 ) >> 8));
1934	}
1935	}
1936
1937	#if 0
1938	/**
1939	* dct tranforms the 16 dc values.
1940	* @param qp quantization parameter ??? FIXME
1941	*/
1942	static void h264_luma_dc_dct_c(DCTELEM block/, int qp*/){
1943	// const int qmul= dequant_coeff[qp][0];
1944	int i;
1945	int temp[16]; //FIXME check if this is a good idea
1946	static const int x_offset[4]={0, 1stride, 4 stride, 5*stride};
1947	static const int y_offset[4]={0, 2stride, 8 stride, 10*stride};
1948
1949	for(i=0; i<4; i++){
1950	const int offset= y_offset[i];
1951	const int z0= block[offset+stride0] + block[offset+stride4];
1952	const int z1= block[offset+stride0] - block[offset+stride4];
1953	const int z2= block[offset+stride1] - block[offset+stride5];
1954	const int z3= block[offset+stride1] + block[offset+stride5];
1955
1956	temp[4*i+0]= z0+z3;
1957	temp[4*i+1]= z1+z2;
1958	temp[4*i+2]= z1-z2;
1959	temp[4*i+3]= z0-z3;
1960	}
1961
1962	for(i=0; i<4; i++){
1963	const int offset= x_offset[i];
1964	const int z0= temp[40+i] + temp[42+i];
1965	const int z1= temp[40+i] - temp[42+i];
1966	const int z2= temp[41+i] - temp[43+i];
1967	const int z3= temp[41+i] + temp[43+i];
1968
1969	block[stride*0 +offset]= (z0 + z3)>>1;
1970	block[stride*2 +offset]= (z1 + z2)>>1;
1971	block[stride*8 +offset]= (z1 - z2)>>1;
1972	block[stride*10+offset]= (z0 - z3)>>1;
1973	}
1974	}
1975	#endif
1976
1977	#undef xStride
1978	#undef stride
1979
1980	static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1981	const int stride= 16*2;
1982	const int xStride= 16;
1983	int a,b,c,d,e;
1984
1985	a= block[stride0 + xStride0];
1986	b= block[stride0 + xStride1];
1987	c= block[stride1 + xStride0];
1988	d= block[stride1 + xStride1];
1989
1990	e= a-b;
1991	a= a+b;
1992	b= c-d;
1993	c= c+d;
1994
1995	block[stride0 + xStride0]= ((a+c)*qmul) >> 7;
1996	block[stride0 + xStride1]= ((e+b)*qmul) >> 7;
1997	block[stride1 + xStride0]= ((a-c)*qmul) >> 7;
1998	block[stride1 + xStride1]= ((e-b)*qmul) >> 7;
1999	}
2000
2001	#if 0
2002	static void chroma_dc_dct_c(DCTELEM *block){
2003	const int stride= 16*2;
2004	const int xStride= 16;
2005	int a,b,c,d,e;
2006
2007	a= block[stride0 + xStride0];
2008	b= block[stride0 + xStride1];
2009	c= block[stride1 + xStride0];
2010	d= block[stride1 + xStride1];
2011
2012	e= a-b;
2013	a= a+b;
2014	b= c-d;
2015	c= c+d;
2016
2017	block[stride0 + xStride0]= (a+c);
2018	block[stride0 + xStride1]= (e+b);
2019	block[stride1 + xStride0]= (a-c);
2020	block[stride1 + xStride1]= (e-b);
2021	}
2022	#endif
2023
2024	/**
2025	* gets the chroma qp.
2026	*/
2027	static inline int get_chroma_qp(int chroma_qp_index_offset, int qscale){
2028
2029	return chroma_qp[clip(qscale + chroma_qp_index_offset, 0, 51)];
2030	}
2031
2032
2033	#if 0
2034	static void h264_diff_dct_c(DCTELEM block, uint8_t src1, uint8_t *src2, int stride){
2035	int i;
2036	//FIXME try int temp instead of block
2037
2038	for(i=0; i<4; i++){
2039	const int d0= src1[0 + istride] - src2[0 + istride];
2040	const int d1= src1[1 + istride] - src2[1 + istride];
2041	const int d2= src1[2 + istride] - src2[2 + istride];
2042	const int d3= src1[3 + istride] - src2[3 + istride];
2043	const int z0= d0 + d3;
2044	const int z3= d0 - d3;
2045	const int z1= d1 + d2;
2046	const int z2= d1 - d2;
2047
2048	block[0 + 4*i]= z0 + z1;
2049	block[1 + 4i]= 2z3 + z2;
2050	block[2 + 4*i]= z0 - z1;
2051	block[3 + 4i]= z3 - 2z2;
2052	}
2053
2054	for(i=0; i<4; i++){
2055	const int z0= block[04 + i] + block[34 + i];
2056	const int z3= block[04 + i] - block[34 + i];
2057	const int z1= block[14 + i] + block[24 + i];
2058	const int z2= block[14 + i] - block[24 + i];
2059
2060	block[0*4 + i]= z0 + z1;
2061	block[14 + i]= 2z3 + z2;
2062	block[2*4 + i]= z0 - z1;
2063	block[34 + i]= z3 - 2z2;
2064	}
2065	}
2066	#endif
2067
2068	//FIXME need to check that this doesnt overflow signed 32 bit for low qp, i am not sure, it's very close
2069	//FIXME check that gcc inlines this (and optimizes intra & seperate_dc stuff away)
2070	static inline int quantize_c(DCTELEM block, uint8_t scantable, int qscale, int intra, int seperate_dc){
2071	int i;
2072	const int * const quant_table= quant_coeff[qscale];
2073	const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
2074	const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
2075	const unsigned int threshold2= (threshold1<<1);
2076	int last_non_zero;
2077
2078	if(seperate_dc){
2079	if(qscale<=18){
2080	//avoid overflows
2081	const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
2082	const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
2083	const unsigned int dc_threshold2= (dc_threshold1<<1);
2084
2085	int level= block[0]*quant_coeff[qscale+18][0];
2086	if(((unsigned)(level+dc_threshold1))>dc_threshold2){
2087	if(level>0){
2088	level= (dc_bias + level)>>(QUANT_SHIFT-2);
2089	block[0]= level;
2090	}else{
2091	level= (dc_bias - level)>>(QUANT_SHIFT-2);
2092	block[0]= -level;
2093	}
2094	// last_non_zero = i;
2095	}else{
2096	block[0]=0;
2097	}
2098	}else{
2099	const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
2100	const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
2101	const unsigned int dc_threshold2= (dc_threshold1<<1);
2102
2103	int level= block[0]*quant_table[0];
2104	if(((unsigned)(level+dc_threshold1))>dc_threshold2){
2105	if(level>0){
2106	level= (dc_bias + level)>>(QUANT_SHIFT+1);
2107	block[0]= level;
2108	}else{
2109	level= (dc_bias - level)>>(QUANT_SHIFT+1);
2110	block[0]= -level;
2111	}
2112	// last_non_zero = i;
2113	}else{
2114	block[0]=0;
2115	}
2116	}
2117	last_non_zero= 0;
2118	i=1;
2119	}else{
2120	last_non_zero= -1;
2121	i=0;
2122	}
2123
2124	for(; i<16; i++){
2125	const int j= scantable[i];
2126	int level= block[j]*quant_table[j];
2127
2128	// if( bias+level >= (1<<(QMAT_SHIFT - 3))
2129	// \|\| bias-level >= (1<<(QMAT_SHIFT - 3))){
2130	if(((unsigned)(level+threshold1))>threshold2){
2131	if(level>0){
2132	level= (bias + level)>>QUANT_SHIFT;
2133	block[j]= level;
2134	}else{
2135	level= (bias - level)>>QUANT_SHIFT;
2136	block[j]= -level;
2137	}
2138	last_non_zero = i;
2139	}else{
2140	block[j]=0;
2141	}
2142	}
2143
2144	return last_non_zero;
2145	}
2146
/**
 * Fills the 4x4 block at src with copies of the four pixels directly above it.
 * Rows are accessed as 32 bit words.
 * @param src top left pixel of the block
 * @param topright unused
 * @param stride distance in bytes between rows
 */
static void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t a= ((uint32_t*)(src-stride))[0];
    ((uint32_t*)(src+0*stride))[0]= a;
    ((uint32_t*)(src+1*stride))[0]= a;
    ((uint32_t*)(src+2*stride))[0]= a;
    ((uint32_t*)(src+3*stride))[0]= a;
}
2154
/**
 * Fills each row of the 4x4 block at src with the pixel immediately to its left.
 * The multiply is done unsigned so left pixels >= 128 do not overflow a signed int.
 * @param src top left pixel of the block
 * @param topright unused
 * @param stride distance in bytes between rows
 */
static void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){
    ((uint32_t*)(src+0*stride))[0]= src[-1+0*stride]*0x01010101U;
    ((uint32_t*)(src+1*stride))[0]= src[-1+1*stride]*0x01010101U;
    ((uint32_t*)(src+2*stride))[0]= src[-1+2*stride]*0x01010101U;
    ((uint32_t*)(src+3*stride))[0]= src[-1+3*stride]*0x01010101U;
}
2161
/**
 * Fills the 4x4 block at src with the rounded mean of the four pixels above
 * and the four pixels to the left.
 * The splat multiply is done unsigned so a mean >= 128 does not overflow a signed int.
 * @param src top left pixel of the block
 * @param topright unused
 * @param stride distance in bytes between rows
 */
static void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
                   + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;

    ((uint32_t*)(src+0*stride))[0]=
    ((uint32_t*)(src+1*stride))[0]=
    ((uint32_t*)(src+2*stride))[0]=
    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101U;
}
2171
/**
 * Fills the 4x4 block at src with the rounded mean of the four pixels to its left.
 * The splat multiply is done unsigned so a mean >= 128 does not overflow a signed int.
 * @param src top left pixel of the block
 * @param topright unused
 * @param stride distance in bytes between rows
 */
static void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;

    ((uint32_t*)(src+0*stride))[0]=
    ((uint32_t*)(src+1*stride))[0]=
    ((uint32_t*)(src+2*stride))[0]=
    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101U;
}
2180
/**
 * Fills the 4x4 block at src with the rounded mean of the four pixels above it.
 * The splat multiply is done unsigned so a mean >= 128 does not overflow a signed int.
 * @param src top left pixel of the block
 * @param topright unused
 * @param stride distance in bytes between rows
 */
static void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;

    ((uint32_t*)(src+0*stride))[0]=
    ((uint32_t*)(src+1*stride))[0]=
    ((uint32_t*)(src+2*stride))[0]=
    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101U;
}
2189
/**
 * Fills the 4x4 block at src with the constant 128.
 * @param src top left pixel of the block
 * @param topright unused
 * @param stride distance in bytes between rows
 */
static void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){
    ((uint32_t*)(src+0*stride))[0]=
    ((uint32_t*)(src+1*stride))[0]=
    ((uint32_t*)(src+2*stride))[0]=
    ((uint32_t*)(src+3*stride))[0]= 128U*0x01010101U;
}
2196
2197
/* Declares t4..t7 from the four bytes at topright (expects `topright` in scope). */
#define LOAD_TOP_RIGHT_EDGE\
    const int t4= topright[0];\
    const int t5= topright[1];\
    const int t6= topright[2];\
    const int t7= topright[3];

/* Declares l0..l3 from the column left of src (expects `src` and `stride` in scope). */
#define LOAD_LEFT_EDGE\
    const int l0= src[-1+0*stride];\
    const int l1= src[-1+1*stride];\
    const int l2= src[-1+2*stride];\
    const int l3= src[-1+3*stride];

/* Declares t0..t3 from the row above src (expects `src` and `stride` in scope). */
#define LOAD_TOP_EDGE\
    const int t0= src[ 0-1*stride];\
    const int t1= src[ 1-1*stride];\
    const int t2= src[ 2-1*stride];\
    const int t3= src[ 3-1*stride];
2215
2216	static void pred4x4_down_right_c(uint8_t src, uint8_t topright, int stride){
2217	const int lt= src[-1-1*stride];
2218	LOAD_TOP_EDGE
2219	LOAD_LEFT_EDGE
2220
2221	src[0+3stride]=(l3 + 2l2 + l1 + 2)>>2;
2222	src[0+2*stride]=
2223	src[1+3stride]=(l2 + 2l1 + l0 + 2)>>2;
2224	src[0+1*stride]=
2225	src[1+2*stride]=
2226	src[2+3stride]=(l1 + 2l0 + lt + 2)>>2;
2227	src[0+0*stride]=
2228	src[1+1*stride]=
2229	src[2+2*stride]=
2230	src[3+3stride]=(l0 + 2lt + t0 + 2)>>2;
2231	src[1+0*stride]=
2232	src[2+1*stride]=
2233	src[3+2stride]=(lt + 2t0 + t1 + 2)>>2;
2234	src[2+0*stride]=
2235	src[3+1stride]=(t0 + 2t1 + t2 + 2)>>2;
2236	src[3+0stride]=(t1 + 2t2 + t3 + 2)>>2;
2237	}
2238
2239	static void pred4x4_down_left_c(uint8_t src, uint8_t topright, int stride){
2240	LOAD_TOP_EDGE
2241	LOAD_TOP_RIGHT_EDGE
2242	// LOAD_LEFT_EDGE
2243
2244	src[0+0stride]=(t0 + t2 + 2t1 + 2)>>2;
2245	src[1+0*stride]=
2246	src[0+1stride]=(t1 + t3 + 2t2 + 2)>>2;
2247	src[2+0*stride]=
2248	src[1+1*stride]=
2249	src[0+2stride]=(t2 + t4 + 2t3 + 2)>>2;
2250	src[3+0*stride]=
2251	src[2+1*stride]=
2252	src[1+2*stride]=
2253	src[0+3stride]=(t3 + t5 + 2t4 + 2)>>2;
2254	src[3+1*stride]=
2255	src[2+2*stride]=
2256	src[1+3stride]=(t4 + t6 + 2t5 + 2)>>2;
2257	src[3+2*stride]=
2258	src[2+3stride]=(t5 + t7 + 2t6 + 2)>>2;
2259	src[3+3stride]=(t6 + 3t7 + 2)>>2;
2260	}
2261
2262	static void pred4x4_vertical_right_c(uint8_t src, uint8_t topright, int stride){
2263	const int lt= src[-1-1*stride];
2264	LOAD_TOP_EDGE
2265	LOAD_LEFT_EDGE
2266	const __attribute__((unused)) int unu= l3;
2267
2268	src[0+0*stride]=
2269	src[1+2*stride]=(lt + t0 + 1)>>1;
2270	src[1+0*stride]=
2271	src[2+2*stride]=(t0 + t1 + 1)>>1;
2272	src[2+0*stride]=
2273	src[3+2*stride]=(t1 + t2 + 1)>>1;
2274	src[3+0*stride]=(t2 + t3 + 1)>>1;
2275	src[0+1*stride]=
2276	src[1+3stride]=(l0 + 2lt + t0 + 2)>>2;
2277	src[1+1*stride]=
2278	src[2+3stride]=(lt + 2t0 + t1 + 2)>>2;
2279	src[2+1*stride]=
2280	src[3+3stride]=(t0 + 2t1 + t2 + 2)>>2;
2281	src[3+1stride]=(t1 + 2t2 + t3 + 2)>>2;
2282	src[0+2stride]=(lt + 2l0 + l1 + 2)>>2;
2283	src[0+3stride]=(l0 + 2l1 + l2 + 2)>>2;
2284	}
2285
2286	static void pred4x4_vertical_left_c(uint8_t src, uint8_t topright, int stride){
2287	LOAD_TOP_EDGE
2288	LOAD_TOP_RIGHT_EDGE
2289	const __attribute__((unused)) int unu= t7;
2290
2291	src[0+0*stride]=(t0 + t1 + 1)>>1;
2292	src[1+0*stride]=
2293	src[0+2*stride]=(t1 + t2 + 1)>>1;
2294	src[2+0*stride]=
2295	src[1+2*stride]=(t2 + t3 + 1)>>1;
2296	src[3+0*stride]=
2297	src[2+2*stride]=(t3 + t4+ 1)>>1;
2298	src[3+2*stride]=(t4 + t5+ 1)>>1;
2299	src[0+1stride]=(t0 + 2t1 + t2 + 2)>>2;
2300	src[1+1*stride]=
2301	src[0+3stride]=(t1 + 2t2 + t3 + 2)>>2;
2302	src[2+1*stride]=
2303	src[1+3stride]=(t2 + 2t3 + t4 + 2)>>2;
2304	src[3+1*stride]=
2305	src[2+3stride]=(t3 + 2t4 + t5 + 2)>>2;
2306	src[3+3stride]=(t4 + 2t5 + t6 + 2)>>2;
2307	}
2308
2309	static void pred4x4_horizontal_up_c(uint8_t src, uint8_t topright, int stride){
2310	LOAD_LEFT_EDGE
2311
2312	src[0+0*stride]=(l0 + l1 + 1)>>1;
2313	src[1+0stride]=(l0 + 2l1 + l2 + 2)>>2;
2314	src[2+0*stride]=
2315	src[0+1*stride]=(l1 + l2 + 1)>>1;
2316	src[3+0*stride]=
2317	src[1+1stride]=(l1 + 2l2 + l3 + 2)>>2;
2318	src[2+1*stride]=
2319	src[0+2*stride]=(l2 + l3 + 1)>>1;
2320	src[3+1*stride]=
2321	src[1+2stride]=(l2 + 2l3 + l3 + 2)>>2;
2322	src[3+2*stride]=
2323	src[1+3*stride]=
2324	src[0+3*stride]=
2325	src[2+2*stride]=
2326	src[2+3*stride]=
2327	src[3+3*stride]=l3;
2328	}
2329
2330	static void pred4x4_horizontal_down_c(uint8_t src, uint8_t topright, int stride){
2331	const int lt= src[-1-1*stride];
2332	LOAD_TOP_EDGE
2333	LOAD_LEFT_EDGE
2334	const __attribute__((unused)) int unu= t3;
2335
2336	src[0+0*stride]=
2337	src[2+1*stride]=(lt + l0 + 1)>>1;
2338	src[1+0*stride]=
2339	src[3+1stride]=(l0 + 2lt + t0 + 2)>>2;
2340	src[2+0stride]=(lt + 2t0 + t1 + 2)>>2;
2341	src[3+0stride]=(t0 + 2t1 + t2 + 2)>>2;
2342	src[0+1*stride]=
2343	src[2+2*stride]=(l0 + l1 + 1)>>1;
2344	src[1+1*stride]=
2345	src[3+2stride]=(lt + 2l0 + l1 + 2)>>2;
2346	src[0+2*stride]=
2347	src[2+3*stride]=(l1 + l2+ 1)>>1;
2348	src[1+2*stride]=
2349	src[3+3stride]=(l0 + 2l1 + l2 + 2)>>2;
2350	src[0+3*stride]=(l2 + l3 + 1)>>1;
2351	src[1+3stride]=(l1 + 2l2 + l3 + 2)>>2;
2352	}
2353
/**
 * Fills the 16x16 block at src with copies of the 16 pixels directly above it.
 * Rows are accessed as four 32 bit words.
 * @param src top left pixel of the block
 * @param stride distance in bytes between rows
 */
static void pred16x16_vertical_c(uint8_t *src, int stride){
    int i;
    const uint32_t a= ((uint32_t*)(src-stride))[0];
    const uint32_t b= ((uint32_t*)(src-stride))[1];
    const uint32_t c= ((uint32_t*)(src-stride))[2];
    const uint32_t d= ((uint32_t*)(src-stride))[3];

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]= a;
        ((uint32_t*)(src+i*stride))[1]= b;
        ((uint32_t*)(src+i*stride))[2]= c;
        ((uint32_t*)(src+i*stride))[3]= d;
    }
}
2368
/**
 * Fills each row of the 16x16 block at src with the pixel immediately to its left.
 * The multiply is done unsigned so left pixels >= 128 do not overflow a signed int.
 * @param src top left pixel of the block
 * @param stride distance in bytes between rows
 */
static void pred16x16_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= src[-1+i*stride]*0x01010101U;
    }
}
2379
/**
 * Fills the 16x16 block at src with the rounded mean of the 16 pixels to its
 * left and the 16 pixels above it.
 * The splat is computed unsigned so a mean >= 128 does not overflow a signed int.
 * @param src top left pixel of the block
 * @param stride distance in bytes between rows
 */
static void pred16x16_dc_c(uint8_t *src, int stride){
    int i, dc=0;
    uint32_t splat;

    for(i=0;i<16; i++){
        dc+= src[-1+i*stride];
    }

    for(i=0;i<16; i++){
        dc+= src[i-stride];
    }

    splat= 0x01010101U*((dc + 16)>>5);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= splat;
    }
}
2400
/**
 * Fills the 16x16 block at src with the rounded mean of the 16 pixels to its left.
 * The splat is computed unsigned so a mean >= 128 does not overflow a signed int.
 * @param src top left pixel of the block
 * @param stride distance in bytes between rows
 */
static void pred16x16_left_dc_c(uint8_t *src, int stride){
    int i, dc=0;
    uint32_t splat;

    for(i=0;i<16; i++){
        dc+= src[-1+i*stride];
    }

    splat= 0x01010101U*((dc + 8)>>4);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= splat;
    }
}
2417
/**
 * Fills the 16x16 block at src with the rounded mean of the 16 pixels above it.
 * The splat is computed unsigned so a mean >= 128 does not overflow a signed int.
 * @param src top left pixel of the block
 * @param stride distance in bytes between rows
 */
static void pred16x16_top_dc_c(uint8_t *src, int stride){
    int i, dc=0;
    uint32_t splat;

    for(i=0;i<16; i++){
        dc+= src[i-stride];
    }
    splat= 0x01010101U*((dc + 8)>>4);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= splat;
    }
}
2433
/**
 * Fills the 16x16 block at src with the constant 128.
 * @param src top left pixel of the block
 * @param stride distance in bytes between rows
 */
static void pred16x16_128_dc_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= 0x01010101U*128U;
    }
}
2444
2445	static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3){
2446	int i, j, k;
2447	int a;
2448	uint8_t *cm = cropTbl + MAX_NEG_CROP;
2449	const uint8_t * const src0 = src+7-stride;
2450	const uint8_t src1 = src+8stride-1;
2451	const uint8_t src2 = src1-2stride; // == src+6*stride-1;
2452	int H = src0[1] - src0[-1];
2453	int V = src1[0] - src2[ 0];
2454	for(k=2; k<=8; ++k) {
2455	src1 += stride; src2 -= stride;
2456	H += k*(src0[k] - src0[-k]);
2457	V += k*(src1[0] - src2[ 0]);
2458	}
2459	if(svq3){
2460	H = ( 5*(H/4) ) / 16;
2461	V = ( 5*(V/4) ) / 16;
2462
2463	/* required for 100% accuracy */
2464	i = H; H = V; V = i;
2465	}else{
2466	H = ( 5*H+32 ) >> 6;
2467	V = ( 5*V+32 ) >> 6;
2468	}
2469
2470	a = 16(src1[0] + src2[16] + 1) - 7(V+H);
2471	for(j=16; j>0; --j) {
2472	int b = a;
2473	a += V;
2474	for(i=-16; i<0; i+=4) {
2475	src[16+i] = cm[ (b ) >> 5 ];
2476	src[17+i] = cm[ (b+ H) >> 5 ];
2477	src[18+i] = cm[ (b+2*H) >> 5 ];
2478	src[19+i] = cm[ (b+3*H) >> 5 ];
2479	b += 4*H;
2480	}
2481	src += stride;
2482	}
2483	}
2484
/**
 * Plane prediction for the 16x16 block at src; forwards to
 * pred16x16_plane_compat_c() with the svq3 flag cleared.
 * @param src top left pixel of the block
 * @param stride distance in bytes between rows
 */
static void pred16x16_plane_c(uint8_t *src, int stride){
    const int svq3= 0;

    pred16x16_plane_compat_c(src, stride, svq3);
}
2488
/**
 * Fills the 8x8 block at src with copies of the 8 pixels directly above it.
 * Rows are accessed as two 32 bit words.
 * @param src top left pixel of the block
 * @param stride distance in bytes between rows
 */
static void pred8x8_vertical_c(uint8_t *src, int stride){
    int i;
    const uint32_t a= ((uint32_t*)(src-stride))[0];
    const uint32_t b= ((uint32_t*)(src-stride))[1];

    for(i=0; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= a;
        ((uint32_t*)(src+i*stride))[1]= b;
    }
}
2499
/**
 * Fills each row of the 8x8 block at src with the pixel immediately to its left.
 * The multiply is done unsigned so left pixels >= 128 do not overflow a signed int.
 * @param src top left pixel of the block
 * @param stride distance in bytes between rows
 */
static void pred8x8_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= src[-1+i*stride]*0x01010101U;
    }
}
2508
/**
 * Fills the 8x8 block at src with the constant 128.
 * @param src top left pixel of the block
 * @param stride distance in bytes between rows
 */
static void pred8x8_128_dc_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= 0x01010101U*128U;
    }
}
2517
/**
 * Fills the upper four rows of the 8x8 block at src with the rounded mean of
 * the four left pixels beside them, and the lower four rows with the mean of
 * the four left pixels beside those.
 * The splats are computed unsigned so a mean >= 128 does not overflow a signed int.
 * @param src top left pixel of the block
 * @param stride distance in bytes between rows
 */
static void pred8x8_left_dc_c(uint8_t *src, int stride){
    int i;
    int dc0, dc2;
    uint32_t splat0, splat2;

    dc0=dc2=0;
    for(i=0;i<4; i++){
        dc0+= src[-1+i*stride];
        dc2+= src[-1+(i+4)*stride];
    }
    splat0= 0x01010101U*((dc0 + 2)>>2);
    splat2= 0x01010101U*((dc2 + 2)>>2);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= splat0;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= splat2;
    }
}
2539
/**
 * Fills the left four columns of the 8x8 block at src with the rounded mean
 * of the four pixels above them, and the right four columns with the mean of
 * the four pixels above those.
 * The splats are computed unsigned so a mean >= 128 does not overflow a signed int.
 * @param src top left pixel of the block
 * @param stride distance in bytes between rows
 */
static void pred8x8_top_dc_c(uint8_t *src, int stride){
    int i;
    int dc0, dc1;
    uint32_t splat0, splat1;

    dc0=dc1=0;
    for(i=0;i<4; i++){
        dc0+= src[i-stride];
        dc1+= src[4+i-stride];
    }
    splat0= 0x01010101U*((dc0 + 2)>>2);
    splat1= 0x01010101U*((dc1 + 2)>>2);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]= splat0;
        ((uint32_t*)(src+i*stride))[1]= splat1;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= splat0;
        ((uint32_t*)(src+i*stride))[1]= splat1;
    }
}
2561
2562
/**
 * DC prediction of the 8x8 block at src, computed separately per 4x4 quadrant:
 * top-left from its left and top neighbours, top-right from the top
 * neighbours only, bottom-left from the left neighbours only, bottom-right
 * from the top-right and bottom-left neighbour sums combined.
 * The splats are computed unsigned so a mean >= 128 does not overflow a signed int.
 * @param src top left pixel of the block
 * @param stride distance in bytes between rows
 */
static void pred8x8_dc_c(uint8_t *src, int stride){
    int i;
    int dc0, dc1, dc2;
    uint32_t splat0, splat1, splat2, splat3;

    dc0=dc1=dc2=0;
    for(i=0;i<4; i++){
        dc0+= src[-1+i*stride] + src[i-stride];
        dc1+= src[4+i-stride];
        dc2+= src[-1+(i+4)*stride];
    }
    splat3= 0x01010101U*((dc1 + dc2 + 4)>>3);
    splat0= 0x01010101U*((dc0 + 4)>>3);
    splat1= 0x01010101U*((dc1 + 2)>>2);
    splat2= 0x01010101U*((dc2 + 2)>>2);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]= splat0;
        ((uint32_t*)(src+i*stride))[1]= splat1;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= splat2;
        ((uint32_t*)(src+i*stride))[1]= splat3;
    }
}
2587
2588	static void pred8x8_plane_c(uint8_t *src, int stride){
2589	int j, k;
2590	int a;
2591	uint8_t *cm = cropTbl + MAX_NEG_CROP;
2592	const uint8_t * const src0 = src+3-stride;
2593	const uint8_t src1 = src+4stride-1;
2594	const uint8_t src2 = src1-2stride; // == src+2*stride-1;
2595	int H = src0[1] - src0[-1];
2596	int V = src1[0] - src2[ 0];
2597	for(k=2; k<=4; ++k) {
2598	src1 += stride; src2 -= stride;
2599	H += k*(src0[k] - src0[-k]);
2600	V += k*(src1[0] - src2[ 0]);
2601	}
2602	H = ( 17*H+16 ) >> 5;
2603	V = ( 17*V+16 ) >> 5;
2604
2605	a = 16(src1[0] + src2[8]+1) - 3(V+H);
2606	for(j=8; j>0; --j) {
2607	int b = a;
2608	a += V;
2609	src[0] = cm[ (b ) >> 5 ];
2610	src[1] = cm[ (b+ H) >> 5 ];
2611	src[2] = cm[ (b+2*H) >> 5 ];
2612	src[3] = cm[ (b+3*H) >> 5 ];
2613	src[4] = cm[ (b+4*H) >> 5 ];
2614	src[5] = cm[ (b+5*H) >> 5 ];
2615	src[6] = cm[ (b+6*H) >> 5 ];
2616	src[7] = cm[ (b+7*H) >> 5 ];
2617	src += stride;
2618	}
2619	}
2620
/* Helper macros for the pred8x8l_* functions. They expect `src`, `stride`,
 * `has_topleft` and `has_topright` to be in scope at the point of use. */

/* Pixel at column x, row y relative to src. */
#define SRC(x,y) src[(x)+(y)*stride]
/* Declares l<y>: (1,2,1)-weighted, rounded average of the left-column pixels
 * at rows y-1, y and y+1. */
#define PL(y) \
    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
/* Declares the filtered left column l0..l7. l0 uses the top-left pixel only
 * when has_topleft is set (otherwise SRC(-1,0) is used twice); l7 weights the
 * last pixel three times. No trailing semicolon: the user supplies it. */
#define PREDICT_8x8_LOAD_LEFT \
    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
    const int l7 attribute_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2

/* Declares t<x>: (1,2,1)-weighted, rounded average of the top-row pixels at
 * columns x-1, x and x+1. */
#define PT(x) \
    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
/* Declares the filtered top row t0..t7. t0 uses the top-left pixel only when
 * has_topleft is set; t7 uses SRC(8,-1) only when has_topright is set.
 * No trailing semicolon: the user supplies it. */
#define PREDICT_8x8_LOAD_TOP \
    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
    const int t7 attribute_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2

/* Assigns (does not declare) t<x> with the same (1,2,1) filter as PT. */
#define PTR(x) \
    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
/* Declares t8..t15. With has_topright they are the filtered pixels at
 * columns 8..15 of the row above; otherwise all are set to SRC(7,-1). */
#define PREDICT_8x8_LOAD_TOPRIGHT \
    int t8, t9, t10, t11, t12, t13, t14, t15; \
    if(has_topright) { \
        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);

/* Declares lt: the top-left pixel filtered with its right and lower neighbours. */
#define PREDICT_8x8_LOAD_TOPLEFT \
    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2

/* Declares y and writes the 32 bit value v to both words of 8 consecutive
 * rows, advancing src by stride after each row. */
#define PREDICT_8x8_DC(v) \
    int y; \
    for( y = 0; y < 8; y++ ) { \
        ((uint32_t*)src)[0] = \
        ((uint32_t*)src)[1] = v; \
        src += stride; \
    }
2658
/**
 * Fills the 8x8 block at src with the constant 0x80 (128).
 * @param src top left pixel of the block
 * @param has_topleft unused
 * @param has_topright unused
 * @param stride distance in bytes between rows
 */
static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    int row;

    for( row = 0; row < 8; row++ ) {
        uint32_t *line = (uint32_t*)src;

        line[1] = 0x80808080;
        line[0] = line[1];
        src += stride;
    }
}
2663	static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2664	{
2665	PREDICT_8x8_LOAD_LEFT;
2666	const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101;
2667	PREDICT_8x8_DC(dc);
2668	}
2669	static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2670	{
2671	PREDICT_8x8_LOAD_TOP;
2672	const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101;
2673	PREDICT_8x8_DC(dc);
2674	}
2675	static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2676	{
2677	PREDICT_8x8_LOAD_LEFT;
2678	PREDICT_8x8_LOAD_TOP;
2679	const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
2680	+t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101;
2681	PREDICT_8x8_DC(dc);
2682	}
2683	static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2684	{
2685	PREDICT_8x8_LOAD_LEFT;
2686	#define ROW(y) ((uint32_t)(src+ystride))[0] =\
2687	((uint32_t)(src+ystride))[1] = 0x01010101 * l##y
2688	ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
2689	#undef ROW
2690	}
2691	static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2692	{
2693	int y;
2694	PREDICT_8x8_LOAD_TOP;
2695	src[0] = t0;
2696	src[1] = t1;
2697	src[2] = t2;
2698	src[3] = t3;
2699	src[4] = t4;
2700	src[5] = t5;
2701	src[6] = t6;
2702	src[7] = t7;
2703	for( y = 1; y < 8; y++ )
2704	(uint64_t)(src+ystride) = (uint64_t*)src;
2705	}
/**
 * Fills the 8x8 block at src from the filtered top row t0..t7 and top-right
 * row t8..t15. All pixels with the same x+y share one (1,2,1)-filtered
 * value; the last pixel SRC(7,7) weights t15 three times.
 * @param src top left pixel of the block
 * @param has_topleft non zero if the top-left neighbour may be read
 * @param has_topright non zero if the top-right neighbours may be read
 * @param stride distance in bytes between rows
 */
static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_TOPRIGHT;
    SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
    SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
    SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
    SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
    SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
    SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
    SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
    SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
    SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
    SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
}
/**
 * Fills the 8x8 block at src from the filtered left column l0..l7, the
 * filtered top-left value lt and the filtered top row t0..t7. All pixels
 * with the same x-y share one (1,2,1)-filtered value.
 * @param src top left pixel of the block
 * @param has_topleft non zero if the top-left neighbour may be read by the
 *        edge filters (lt itself always reads SRC(-1,-1))
 * @param has_topright non zero if the top-right neighbours may be read
 * @param stride distance in bytes between rows
 */
static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
    SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
    SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
    SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
    SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
    SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
    SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
    SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
    SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
    SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;

}
/**
 * Fills the 8x8 block at src from the filtered top row t0..t7, the filtered
 * top-left value lt and the filtered left column (l0..l6 are used), mixing
 * two-tap averages with (1,2,1)-filtered values. Each assigned value is
 * shared by pixels that lie one column right and two rows down from each other.
 * @param src top left pixel of the block
 * @param has_topleft non zero if the top-left neighbour may be read by the
 *        edge filters (lt itself always reads SRC(-1,-1))
 * @param has_topright non zero if the top-right neighbours may be read
 * @param stride distance in bytes between rows
 */
static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
    SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
    SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
    SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
    SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
    SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
    SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
    SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
    SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
    SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
    SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
    SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
    SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
    SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
    SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
    SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
    SRC(7,0)= (t6 + t7 + 1) >> 1;
}
/**
 * Fills the 8x8 block at src from the filtered left column l0..l7, the
 * filtered top-left value lt and the filtered top row (t0..t6 are used),
 * mixing two-tap averages with (1,2,1)-filtered values. Each assigned value
 * is shared by pixels that lie two columns right and one row down from each other.
 * @param src top left pixel of the block
 * @param has_topleft non zero if the top-left neighbour may be read by the
 *        edge filters (lt itself always reads SRC(-1,-1))
 * @param has_topright non zero if the top-right neighbours may be read
 * @param stride distance in bytes between rows
 */
static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    SRC(0,7)= (l6 + l7 + 1) >> 1;
    SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
    SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
    SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
    SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
    SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
    SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
    SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
    SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
    SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
    SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
    SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
    SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
    SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
    SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
    SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
    SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
    SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
    SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
    SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
    SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
    SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
}
/**
 * 8x8 luma intra predictor; init_pred_ptrs() installs it as
 * h->pred8x8l[VERT_LEFT_PRED].
 * Uses only the t0..t12 values: each SRC(x,y) is a rounded 2-tap average
 * (a + b + 1) >> 1 or a rounded 3-tap average (a + 2*b + c + 2) >> 2 of
 * consecutive t* values.
 * NOTE(review): t0..t12 and SRC() come from the PREDICT_8x8_LOAD_TOP,
 * PREDICT_8x8_LOAD_TOPRIGHT and SRC macros defined earlier in the file (not
 * visible here) - presumably the top and top-right neighbours; confirm.
 */
2804	static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2805	{
2806	PREDICT_8x8_LOAD_TOP;
2807	PREDICT_8x8_LOAD_TOPRIGHT;
/* the statements alternate between the 2-tap and the 3-tap form */
2808	SRC(0,0)= (t0 + t1 + 1) >> 1;
2809	SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
2810	SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
2811	SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
2812	SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
2813	SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
2814	SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
2815	SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
2816	SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
2817	SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2818	SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
2819	SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2820	SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
2821	SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
2822	SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
2823	SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
2824	SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
2825	SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
2826	SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
2827	SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
2828	SRC(7,6)= (t10 + t11 + 1) >> 1;
2829	SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
2830	}
/**
 * 8x8 luma intra predictor; init_pred_ptrs() installs it as
 * h->pred8x8l[HOR_UP_PRED].
 * Uses only the l0..l7 values: rounded 2-tap averages (a + b + 1) >> 1 and
 * rounded 3-tap averages (a + 2*b + c + 2) >> 2 of consecutive l* values,
 * a final (l6 + 3*l7 + 2) >> 2 term, and plain l7 for the remaining
 * positions.
 * NOTE(review): l0..l7 and SRC() come from the PREDICT_8x8_LOAD_LEFT and SRC
 * macros defined earlier in the file (not visible here) - presumably the
 * left neighbours of the block; confirm against the macros.
 */
2831	static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2832	{
2833	PREDICT_8x8_LOAD_LEFT;
2834	SRC(0,0)= (l0 + l1 + 1) >> 1;
2835	SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
2836	SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
2837	SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
2838	SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
2839	SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
2840	SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
2841	SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
2842	SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
2843	SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
2844	SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
2845	SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
2846	SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
/* no value follows l7, so the last 3-tap term counts l7 three times */
2847	SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
/* every remaining position is filled with l7 itself */
2848	SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
2849	SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
2850	SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
2851	SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
2852	}
2853	#undef PREDICT_8x8_LOAD_LEFT
2854	#undef PREDICT_8x8_LOAD_TOP
2855	#undef PREDICT_8x8_LOAD_TOPLEFT
2856	#undef PREDICT_8x8_LOAD_TOPRIGHT
2857	#undef PREDICT_8x8_DC
2858	#undef PTR
2859	#undef PT
2860	#undef PL
2861	#undef SRC
2862
2863	static inline void mc_dir_part(H264Context h, Picture pic, int n, int square, int chroma_height, int delta, int list,
2864	uint8_t dest_y, uint8_t dest_cb, uint8_t *dest_cr,
2865	int src_x_offset, int src_y_offset,
2866	qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
2867	MpegEncContext * const s = &h->s;
2868	const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
2869	int my= h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
2870	const int luma_xy= (mx&3) + ((my&3)<<2);
2871	uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
2872	uint8_t * src_cb, * src_cr;
2873	int extra_width= h->emu_edge_width;
2874	int extra_height= h->emu_edge_height;
2875	int emu=0;
2876	const int full_mx= mx>>2;
2877	const int full_my= my>>2;
2878	const int pic_width = 16*s->mb_width;
2879	const int pic_height = 16*s->mb_height >> MB_MBAFF;
2880
2881	if(!pic->data[0])
2882	return;
2883
2884	if(mx&7) extra_width -= 3;
2885	if(my&7) extra_height -= 3;
2886
2887	if( full_mx < 0-extra_width
2888	\|\| full_my < 0-extra_height
2889	\|\| full_mx + 16/FIXME/ > pic_width + extra_width
2890	\|\| full_my + 16/FIXME/ > pic_height + extra_height){
2891	ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2h->mb_linesize, h->mb_linesize, 16+5, 16+5/FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
2892	src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
2893	emu=1;
2894	}
2895
2896	qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
2897	if(!square){
2898	qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
2899	}
2900
2901	if(s->flags&CODEC_FLAG_GRAY) return;
2902
2903	if(MB_MBAFF){
2904	// chroma offset when predicting from a field of opposite parity
2905	my += 2 * ((s->mb_y & 1) - (h->ref_cache[list][scan8[n]] & 1));
2906	emu \|= (my>>3) < 0 \|\| (my>>3) + 8 >= (pic_height>>1);
2907	}
2908	src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
2909	src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
2910
2911	if(emu){
2912	ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/FIXME/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
2913	src_cb= s->edge_emu_buffer;
2914	}
2915	chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
2916
2917	if(emu){
2918	ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/FIXME/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
2919	src_cr= s->edge_emu_buffer;
2920	}
2921	chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
2922	}
2923
2924	static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
2925	uint8_t dest_y, uint8_t dest_cb, uint8_t *dest_cr,
2926	int x_offset, int y_offset,
2927	qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2928	qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2929	int list0, int list1){
2930	MpegEncContext * const s = &h->s;
2931	qpel_mc_func *qpix_op= qpix_put;
2932	h264_chroma_mc_func chroma_op= chroma_put;
2933
2934	dest_y += 2x_offset + 2y_offset*h-> mb_linesize;
2935	dest_cb += x_offset + y_offset*h->mb_uvlinesize;
2936	dest_cr += x_offset + y_offset*h->mb_uvlinesize;
2937	x_offset += 8*s->mb_x;
2938	y_offset += 8*(s->mb_y >> MB_MBAFF);
2939
2940	if(list0){
2941	Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
2942	mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
2943	dest_y, dest_cb, dest_cr, x_offset, y_offset,
2944	qpix_op, chroma_op);
2945
2946	qpix_op= qpix_avg;
2947	chroma_op= chroma_avg;
2948	}
2949
2950	if(list1){
2951	Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
2952	mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
2953	dest_y, dest_cb, dest_cr, x_offset, y_offset,
2954	qpix_op, chroma_op);
2955	}
2956	}
2957
2958	static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
2959	uint8_t dest_y, uint8_t dest_cb, uint8_t *dest_cr,
2960	int x_offset, int y_offset,
2961	qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2962	h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
2963	h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
2964	int list0, int list1){
2965	MpegEncContext * const s = &h->s;
2966
2967	dest_y += 2x_offset + 2y_offset*h-> mb_linesize;
2968	dest_cb += x_offset + y_offset*h->mb_uvlinesize;
2969	dest_cr += x_offset + y_offset*h->mb_uvlinesize;
2970	x_offset += 8*s->mb_x;
2971	y_offset += 8*(s->mb_y >> MB_MBAFF);
2972
2973	if(list0 && list1){
2974	/* don't optimize for luma-only case, since B-frames usually
2975	* use implicit weights => chroma too. */
2976	uint8_t *tmp_cb = s->obmc_scratchpad;
2977	uint8_t *tmp_cr = s->obmc_scratchpad + 8;
2978	uint8_t tmp_y = s->obmc_scratchpad + 8h->mb_uvlinesize;
2979	int refn0 = h->ref_cache[0][ scan8[n] ];
2980	int refn1 = h->ref_cache[1][ scan8[n] ];
2981
2982	mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
2983	dest_y, dest_cb, dest_cr,
2984	x_offset, y_offset, qpix_put, chroma_put);
2985	mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
2986	tmp_y, tmp_cb, tmp_cr,
2987	x_offset, y_offset, qpix_put, chroma_put);
2988
2989	if(h->use_weight == 2){
2990	int weight0 = h->implicit_weight[refn0][refn1];
2991	int weight1 = 64 - weight0;
2992	luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, 5, weight0, weight1, 0);
2993	chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
2994	chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
2995	}else{
2996	luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
2997	h->luma_weight[0][refn0], h->luma_weight[1][refn1],
2998	h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
2999	chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
3000	h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
3001	h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
3002	chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
3003	h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
3004	h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
3005	}
3006	}else{
3007	int list = list1 ? 1 : 0;
3008	int refn = h->ref_cache[list][ scan8[n] ];
3009	Picture *ref= &h->ref_list[list][refn];
3010	mc_dir_part(h, ref, n, square, chroma_height, delta, list,
3011	dest_y, dest_cb, dest_cr, x_offset, y_offset,
3012	qpix_put, chroma_put);
3013
3014	luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
3015	h->luma_weight[list][refn], h->luma_offset[list][refn]);
3016	if(h->use_weight_chroma){
3017	chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
3018	h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
3019	chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
3020	h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
3021	}
3022	}
3023	}
3024
3025	static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
3026	uint8_t dest_y, uint8_t dest_cb, uint8_t *dest_cr,
3027	int x_offset, int y_offset,
3028	qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
3029	qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
3030	h264_weight_func weight_op, h264_biweight_func weight_avg,
3031	int list0, int list1){
3032	if((h->use_weight==2 && list0 && list1
3033	&& (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
3034	\|\| h->use_weight==1)
3035	mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
3036	x_offset, y_offset, qpix_put, chroma_put,
3037	weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
3038	else
3039	mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
3040	x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
3041	}
3042
3043	static inline void prefetch_motion(H264Context *h, int list){
3044	/* fetch pixels for estimated mv 4 macroblocks ahead
3045	* optimized for 64byte cache lines */
3046	MpegEncContext * const s = &h->s;
3047	const int refn = h->ref_cache[list][scan8[0]];
3048	if(refn >= 0){
3049	const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
3050	const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
3051	uint8_t **src= h->ref_list[list][refn].data;
3052	int off= mx + (my + (s->mb_x&3)4)h->mb_linesize + 64;
3053	s->dsp.prefetch(src[0]+off, s->linesize, 4);
3054	off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
3055	s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
3056	}
3057	}
3058
3059	static void hl_motion(H264Context h, uint8_t dest_y, uint8_t dest_cb, uint8_t dest_cr,
3060	qpel_mc_func (qpix_put)[16], h264_chroma_mc_func (chroma_put),
3061	qpel_mc_func (qpix_avg)[16], h264_chroma_mc_func (chroma_avg),
3062	h264_weight_func weight_op, h264_biweight_func weight_avg){
3063	MpegEncContext * const s = &h->s;
3064	const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
3065	const int mb_type= s->current_picture.mb_type[mb_xy];
3066
3067	assert(IS_INTER(mb_type));
3068
3069	prefetch_motion(h, 0);
3070
3071	if(IS_16X16(mb_type)){
3072	mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
3073	qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
3074	&weight_op[0], &weight_avg[0],
3075	IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
3076	}else if(IS_16X8(mb_type)){
3077	mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
3078	qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
3079	&weight_op[1], &weight_avg[1],
3080	IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
3081	mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
3082	qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
3083	&weight_op[1], &weight_avg[1],
3084	IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
3085	}else if(IS_8X16(mb_type)){
3086	mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
3087	qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
3088	&weight_op[2], &weight_avg[2],
3089	IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
3090	mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
3091	qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
3092	&weight_op[2], &weight_avg[2],
3093	IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
3094	}else{
3095	int i;
3096
3097	assert(IS_8X8(mb_type));
3098
3099	for(i=0; i<4; i++){
3100	const int sub_mb_type= h->sub_mb_type[i];
3101	const int n= 4*i;
3102	int x_offset= (i&1)<<2;
3103	int y_offset= (i&2)<<1;
3104
3105	if(IS_SUB_8X8(sub_mb_type)){
3106	mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
3107	qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
3108	&weight_op[3], &weight_avg[3],
3109	IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3110	}else if(IS_SUB_8X4(sub_mb_type)){
3111	mc_part(h, n , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
3112	qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
3113	&weight_op[4], &weight_avg[4],
3114	IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3115	mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
3116	qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
3117	&weight_op[4], &weight_avg[4],
3118	IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3119	}else if(IS_SUB_4X8(sub_mb_type)){
3120	mc_part(h, n , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
3121	qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
3122	&weight_op[5], &weight_avg[5],
3123	IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3124	mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
3125	qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
3126	&weight_op[5], &weight_avg[5],
3127	IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3128	}else{
3129	int j;
3130	assert(IS_SUB_4X4(sub_mb_type));
3131	for(j=0; j<4; j++){
3132	int sub_x_offset= x_offset + 2*(j&1);
3133	int sub_y_offset= y_offset + (j&2);
3134	mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
3135	qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
3136	&weight_op[6], &weight_avg[6],
3137	IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3138	}
3139	}
3140	}
3141	}
3142
3143	prefetch_motion(h, 1);
3144	}
3145
3146	static void decode_init_vlc(H264Context *h){
3147	static int done = 0;
3148
3149	if (!done) {
3150	int i;
3151	done = 1;
3152
3153	init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
3154	&chroma_dc_coeff_token_len [0], 1, 1,
3155	&chroma_dc_coeff_token_bits[0], 1, 1, 1);
3156
3157	for(i=0; i<4; i++){
3158	init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
3159	&coeff_token_len [i][0], 1, 1,
3160	&coeff_token_bits[i][0], 1, 1, 1);
3161	}
3162
3163	for(i=0; i<3; i++){
3164	init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
3165	&chroma_dc_total_zeros_len [i][0], 1, 1,
3166	&chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
3167	}
3168	for(i=0; i<15; i++){
3169	init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
3170	&total_zeros_len [i][0], 1, 1,
3171	&total_zeros_bits[i][0], 1, 1, 1);
3172	}
3173
3174	for(i=0; i<6; i++){
3175	init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
3176	&run_len [i][0], 1, 1,
3177	&run_bits[i][0], 1, 1, 1);
3178	}
3179	init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
3180	&run_len [6][0], 1, 1,
3181	&run_bits[6][0], 1, 1, 1);
3182	}
3183	}
3184
3185	/**
3186	* Sets the intra prediction function pointers.
3187	*/
3188	static void init_pred_ptrs(H264Context *h){
3189	// MpegEncContext * const s = &h->s;
3190
3191	h->pred4x4[VERT_PRED ]= pred4x4_vertical_c;
3192	h->pred4x4[HOR_PRED ]= pred4x4_horizontal_c;
3193	h->pred4x4[DC_PRED ]= pred4x4_dc_c;
3194	h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c;
3195	h->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c;
3196	h->pred4x4[VERT_RIGHT_PRED ]= pred4x4_vertical_right_c;
3197	h->pred4x4[HOR_DOWN_PRED ]= pred4x4_horizontal_down_c;
3198	h->pred4x4[VERT_LEFT_PRED ]= pred4x4_vertical_left_c;
3199	h->pred4x4[HOR_UP_PRED ]= pred4x4_horizontal_up_c;
3200	h->pred4x4[LEFT_DC_PRED ]= pred4x4_left_dc_c;
3201	h->pred4x4[TOP_DC_PRED ]= pred4x4_top_dc_c;
3202	h->pred4x4[DC_128_PRED ]= pred4x4_128_dc_c;
3203
3204	h->pred8x8l[VERT_PRED ]= pred8x8l_vertical_c;
3205	h->pred8x8l[HOR_PRED ]= pred8x8l_horizontal_c;
3206	h->pred8x8l[DC_PRED ]= pred8x8l_dc_c;
3207	h->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c;
3208	h->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c;
3209	h->pred8x8l[VERT_RIGHT_PRED ]= pred8x8l_vertical_right_c;
3210	h->pred8x8l[HOR_DOWN_PRED ]= pred8x8l_horizontal_down_c;
3211	h->pred8x8l[VERT_LEFT_PRED ]= pred8x8l_vertical_left_c;
3212	h->pred8x8l[HOR_UP_PRED ]= pred8x8l_horizontal_up_c;
3213	h->pred8x8l[LEFT_DC_PRED ]= pred8x8l_left_dc_c;
3214	h->pred8x8l[TOP_DC_PRED ]= pred8x8l_top_dc_c;
3215	h->pred8x8l[DC_128_PRED ]= pred8x8l_128_dc_c;
3216
3217	h->pred8x8[DC_PRED8x8 ]= pred8x8_dc_c;
3218	h->pred8x8[VERT_PRED8x8 ]= pred8x8_vertical_c;
3219	h->pred8x8[HOR_PRED8x8 ]= pred8x8_horizontal_c;
3220	h->pred8x8[PLANE_PRED8x8 ]= pred8x8_plane_c;
3221	h->pred8x8[LEFT_DC_PRED8x8]= pred8x8_left_dc_c;
3222	h->pred8x8[TOP_DC_PRED8x8 ]= pred8x8_top_dc_c;
3223	h->pred8x8[DC_128_PRED8x8 ]= pred8x8_128_dc_c;
3224
3225	h->pred16x16[DC_PRED8x8 ]= pred16x16_dc_c;
3226	h->pred16x16[VERT_PRED8x8 ]= pred16x16_vertical_c;
3227	h->pred16x16[HOR_PRED8x8 ]= pred16x16_horizontal_c;
3228	h->pred16x16[PLANE_PRED8x8 ]= pred16x16_plane_c;
3229	h->pred16x16[LEFT_DC_PRED8x8]= pred16x16_left_dc_c;
3230	h->pred16x16[TOP_DC_PRED8x8 ]= pred16x16_top_dc_c;
3231	h->pred16x16[DC_128_PRED8x8 ]= pred16x16_128_dc_c;
3232	}
3233
3234	static void free_tables(H264Context *h){
3235	av_freep(&h->intra4x4_pred_mode);
3236	av_freep(&h->chroma_pred_mode_table);
3237	av_freep(&h->cbp_table);
3238	av_freep(&h->mvd_table[0]);
3239	av_freep(&h->mvd_table[1]);
3240	av_freep(&h->direct_table);
3241	av_freep(&h->non_zero_count);
3242	av_freep(&h->slice_table_base);
3243	av_freep(&h->top_borders[1]);
3244	av_freep(&h->top_borders[0]);
3245	h->slice_table= NULL;
3246
3247	av_freep(&h->mb2b_xy);
3248	av_freep(&h->mb2b8_xy);
3249
3250	av_freep(&h->s.obmc_scratchpad);
3251	}
3252
3253	static void init_dequant8_coeff_table(H264Context *h){
3254	int i,q,x;
3255	const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
3256	h->dequant8_coeff[0] = h->dequant8_buffer[0];
3257	h->dequant8_coeff[1] = h->dequant8_buffer[1];
3258
3259	for(i=0; i<2; i++ ){
3260	if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
3261	h->dequant8_coeff[1] = h->dequant8_buffer[0];
3262	break;
3263	}
3264
3265	for(q=0; q<52; q++){
3266	int shift = div6[q];
3267	int idx = rem6[q];
3268	for(x=0; x<64; x++)
3269	h->dequant8_coeff[i][q][transpose ? (x>>3)\|((x&7)<<3) : x] =
3270	((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) \| (x&3)] ] *
3271	h->pps.scaling_matrix8[i][x]) << shift;
3272	}
3273	}
3274	}
3275
3276	static void init_dequant4_coeff_table(H264Context *h){
3277	int i,j,q,x;
3278	const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
3279	for(i=0; i<6; i++ ){
3280	h->dequant4_coeff[i] = h->dequant4_buffer[i];
3281	for(j=0; j<i; j++){
3282	if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
3283	h->dequant4_coeff[i] = h->dequant4_buffer[j];
3284	break;
3285	}
3286	}
3287	if(j<i)
3288	continue;
3289
3290	for(q=0; q<52; q++){
3291	int shift = div6[q] + 2;
3292	int idx = rem6[q];
3293	for(x=0; x<16; x++)
3294	h->dequant4_coeff[i][q][transpose ? (x>>2)\|((x<<2)&0xF) : x] =
3295	((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
3296	h->pps.scaling_matrix4[i][x]) << shift;
3297	}
3298	}
3299	}
3300
3301	static void init_dequant_tables(H264Context *h){
3302	int i,x;
3303	init_dequant4_coeff_table(h);
3304	if(h->pps.transform_8x8_mode)
3305	init_dequant8_coeff_table(h);
3306	if(h->sps.transform_bypass){
3307	for(i=0; i<6; i++)
3308	for(x=0; x<16; x++)
3309	h->dequant4_coeff[i][0][x] = 1<<6;
3310	if(h->pps.transform_8x8_mode)
3311	for(i=0; i<2; i++)
3312	for(x=0; x<64; x++)
3313	h->dequant8_coeff[i][0][x] = 1<<6;
3314	}
3315	}
3316
3317
3318	/**
3319	* allocates tables.
3320	* needs width/height
3321	*/
3322	static int alloc_tables(H264Context *h){
3323	MpegEncContext * const s = &h->s;
3324	const int big_mb_num= s->mb_stride * (s->mb_height+1);
3325	int x,y;
3326
3327	CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8 * sizeof(uint8_t))
3328
3329	CHECKED_ALLOCZ(h->non_zero_count , big_mb_num * 16 * sizeof(uint8_t))
3330	CHECKED_ALLOCZ(h->slice_table_base , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
3331	CHECKED_ALLOCZ(h->top_borders[0] , s->mb_width * (16+8+8) * sizeof(uint8_t))
3332	CHECKED_ALLOCZ(h->top_borders[1] , s->mb_width * (16+8+8) * sizeof(uint8_t))
3333	CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
3334
3335	if( h->pps.cabac ) {
3336	CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
3337	CHECKED_ALLOCZ(h->mvd_table[0], 32big_mb_num sizeof(uint16_t));
3338	CHECKED_ALLOCZ(h->mvd_table[1], 32big_mb_num sizeof(uint16_t));
3339	CHECKED_ALLOCZ(h->direct_table, 32big_mb_num sizeof(uint8_t));
3340	}
3341
3342	memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride) * sizeof(uint8_t));
3343	h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
3344
3345	CHECKED_ALLOCZ(h->mb2b_xy , big_mb_num * sizeof(uint32_t));
3346	CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
3347	for(y=0; y<s->mb_height; y++){
3348	for(x=0; x<s->mb_width; x++){
3349	const int mb_xy= x + y*s->mb_stride;
3350	const int b_xy = 4x + 4y*h->b_stride;
3351	const int b8_xy= 2x + 2y*h->b8_stride;
3352
3353	h->mb2b_xy [mb_xy]= b_xy;
3354	h->mb2b8_xy[mb_xy]= b8_xy;
3355	}
3356	}
3357
3358	s->obmc_scratchpad = NULL;
3359
3360	if(!h->dequant4_coeff[0])
3361	init_dequant_tables(h);
3362
3363	return 0;
3364	fail:
3365	free_tables(h);
3366	return -1;
3367	}
3368
3369	static void common_init(H264Context *h){
3370	MpegEncContext * const s = &h->s;
3371
3372	s->width = s->avctx->width;
3373	s->height = s->avctx->height;
3374	s->codec_id= s->avctx->codec->id;
3375
3376	init_pred_ptrs(h);
3377
3378	h->dequant_coeff_pps= -1;
3379	s->unrestricted_mv=1;
3380	s->decode=1; //FIXME
3381
3382	memset(h->pps.scaling_matrix4, 16, 616sizeof(uint8_t));
3383	memset(h->pps.scaling_matrix8, 16, 264sizeof(uint8_t));
3384	}
3385
3386	static int decode_init(AVCodecContext *avctx){
3387	H264Context *h= avctx->priv_data;
3388	MpegEncContext * const s = &h->s;
3389
3390	MPV_decode_defaults(s);
3391
3392	s->avctx = avctx;
3393	common_init(h);
3394
3395	s->out_format = FMT_H264;
3396	s->workaround_bugs= avctx->workaround_bugs;
3397
3398	// set defaults
3399	// s->decode_mb= ff_h263_decode_mb;
3400	s->low_delay= 1;
3401	avctx->pix_fmt= PIX_FMT_YUV420P;
3402
3403	decode_init_vlc(h);
3404
3405	if(avctx->extradata_size > 0 && avctx->extradata &&
3406	(char )avctx->extradata == 1){
3407	h->is_avc = 1;
3408	h->got_avcC = 0;
3409	} else {
3410	h->is_avc = 0;
3411	}
3412
3413	return 0;
3414	}
3415
3416	static int frame_start(H264Context *h){
3417	MpegEncContext * const s = &h->s;
3418	int i;
3419
3420	if(MPV_frame_start(s, s->avctx) < 0)
3421	return -1;
3422	ff_er_frame_start(s);
3423
3424	assert(s->linesize && s->uvlinesize);
3425
3426	for(i=0; i<16; i++){
3427	h->block_offset[i]= 4((scan8[i] - scan8[0])&7) + 4s->linesize*((scan8[i] - scan8[0])>>3);
3428	h->block_offset[24+i]= 4((scan8[i] - scan8[0])&7) + 8s->linesize*((scan8[i] - scan8[0])>>3);
3429	}
3430	for(i=0; i<4; i++){
3431	h->block_offset[16+i]=
3432	h->block_offset[20+i]= 4((scan8[i] - scan8[0])&7) + 4s->uvlinesize*((scan8[i] - scan8[0])>>3);
3433	h->block_offset[24+16+i]=
3434	h->block_offset[24+20+i]= 4((scan8[i] - scan8[0])&7) + 8s->uvlinesize*((scan8[i] - scan8[0])>>3);
3435	}
3436
3437	/* can't be in alloc_tables because linesize isn't known there.
3438	* FIXME: redo bipred weight to not require extra buffer? */
3439	if(!s->obmc_scratchpad)
3440	s->obmc_scratchpad = av_malloc(162s->linesize + 82s->uvlinesize);
3441
3442	/* some macroblocks will be accessed before they're available */
3443	if(FRAME_MBAFF)
3444	memset(h->slice_table, -1, (s->mb_heights->mb_stride-1) sizeof(uint8_t));
3445
3446	// s->decode= (s->flags&CODEC_FLAG_PSNR) \|\| !s->encoding \|\| s->current_picture.reference /\|\| h->contains_intra/ \|\| 1;
3447	return 0;
3448	}
3449
3450	static inline void backup_mb_border(H264Context h, uint8_t src_y, uint8_t src_cb, uint8_t src_cr, int linesize, int uvlinesize){
3451	MpegEncContext * const s = &h->s;
3452	int i;
3453
3454	src_y -= linesize;
3455	src_cb -= uvlinesize;
3456	src_cr -= uvlinesize;
3457
3458	// There are two lines saved, the line above the the top macroblock of a pair,
3459	// and the line above the bottom macroblock
3460	h->left_border[0]= h->top_borders[0][s->mb_x][15];
3461	for(i=1; i<17; i++){
3462	h->left_border[i]= src_y[15+i* linesize];
3463	}
3464
3465	(uint64_t)(h->top_borders[0][s->mb_x]+0)= (uint64_t)(src_y + 16*linesize);
3466	(uint64_t)(h->top_borders[0][s->mb_x]+8)= (uint64_t)(src_y +8+16*linesize);
3467
3468	if(!(s->flags&CODEC_FLAG_GRAY)){
3469	h->left_border[17 ]= h->top_borders[0][s->mb_x][16+7];
3470	h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
3471	for(i=1; i<9; i++){
3472	h->left_border[i+17 ]= src_cb[7+i*uvlinesize];
3473	h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
3474	}
3475	(uint64_t)(h->top_borders[0][s->mb_x]+16)= (uint64_t)(src_cb+8*uvlinesize);
3476	(uint64_t)(h->top_borders[0][s->mb_x]+24)= (uint64_t)(src_cr+8*uvlinesize);
3477	}
3478	}
3479
3480	static inline void xchg_mb_border(H264Context h, uint8_t src_y, uint8_t src_cb, uint8_t src_cr, int linesize, int uvlinesize, int xchg){
3481	MpegEncContext * const s = &h->s;
3482	int temp8, i;
3483	uint64_t temp64;
3484	int deblock_left = (s->mb_x > 0);
3485	int deblock_top = (s->mb_y > 0);
3486
3487	src_y -= linesize + 1;
3488	src_cb -= uvlinesize + 1;
3489	src_cr -= uvlinesize + 1;
3490
3491	#define XCHG(a,b,t,xchg)\
3492	t= a;\
3493	if(xchg)\
3494	a= b;\
3495	b= t;
3496
3497	if(deblock_left){
3498	for(i = !deblock_top; i<17; i++){
3499	XCHG(h->left_border[i ], src_y [i* linesize], temp8, xchg);
3500	}
3501	}
3502
3503	if(deblock_top){
3504	XCHG((uint64_t)(h->top_borders[0][s->mb_x]+0), (uint64_t)(src_y +1), temp64, xchg);
3505	XCHG((uint64_t)(h->top_borders[0][s->mb_x]+8), (uint64_t)(src_y +9), temp64, 1);
3506	if(s->mb_x+1 < s->mb_width){
3507	XCHG((uint64_t)(h->top_borders[0][s->mb_x+1]), (uint64_t)(src_y +17), temp64, 1);
3508	}
3509	}
3510
3511	if(!(s->flags&CODEC_FLAG_GRAY)){
3512	if(deblock_left){
3513	for(i = !deblock_top; i<9; i++){
3514	XCHG(h->left_border[i+17 ], src_cb[i*uvlinesize], temp8, xchg);
3515	XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
3516	}
3517	}
3518	if(deblock_top){
3519	XCHG((uint64_t)(h->top_borders[0][s->mb_x]+16), (uint64_t)(src_cb+1), temp64, 1);
3520	XCHG((uint64_t)(h->top_borders[0][s->mb_x]+24), (uint64_t)(src_cr+1), temp64, 1);
3521	}
3522	}
3523	}
3524
3525	static inline void backup_pair_border(H264Context h, uint8_t src_y, uint8_t src_cb, uint8_t src_cr, int linesize, int uvlinesize){
3526	MpegEncContext * const s = &h->s;
3527	int i;
3528
3529	src_y -= 2 * linesize;
3530	src_cb -= 2 * uvlinesize;
3531	src_cr -= 2 * uvlinesize;
3532
3533	// There are two lines saved, the line above the the top macroblock of a pair,
3534	// and the line above the bottom macroblock
3535	h->left_border[0]= h->top_borders[0][s->mb_x][15];
3536	h->left_border[1]= h->top_borders[1][s->mb_x][15];
3537	for(i=2; i<34; i++){
3538	h->left_border[i]= src_y[15+i* linesize];
3539	}
3540
3541	(uint64_t)(h->top_borders[0][s->mb_x]+0)= (uint64_t)(src_y + 32*linesize);
3542	(uint64_t)(h->top_borders[0][s->mb_x]+8)= (uint64_t)(src_y +8+32*linesize);
3543	(uint64_t)(h->top_borders[1][s->mb_x]+0)= (uint64_t)(src_y + 33*linesize);
3544	(uint64_t)(h->top_borders[1][s->mb_x]+8)= (uint64_t)(src_y +8+33*linesize);
3545
3546	if(!(s->flags&CODEC_FLAG_GRAY)){
3547	h->left_border[34 ]= h->top_borders[0][s->mb_x][16+7];
3548	h->left_border[34+ 1]= h->top_borders[1][s->mb_x][16+7];
3549	h->left_border[34+18 ]= h->top_borders[0][s->mb_x][24+7];
3550	h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
3551	for(i=2; i<18; i++){
3552	h->left_border[i+34 ]= src_cb[7+i*uvlinesize];
3553	h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
3554	}
3555	(uint64_t)(h->top_borders[0][s->mb_x]+16)= (uint64_t)(src_cb+16*uvlinesize);
3556	(uint64_t)(h->top_borders[0][s->mb_x]+24)= (uint64_t)(src_cr+16*uvlinesize);
3557	(uint64_t)(h->top_borders[1][s->mb_x]+16)= (uint64_t)(src_cb+17*uvlinesize);
3558	(uint64_t)(h->top_borders[1][s->mb_x]+24)= (uint64_t)(src_cr+17*uvlinesize);
3559	}
3560	}
3561
3562	static inline void xchg_pair_border(H264Context h, uint8_t src_y, uint8_t src_cb, uint8_t src_cr, int linesize, int uvlinesize, int xchg){
3563	MpegEncContext * const s = &h->s;
3564	int temp8, i;
3565	uint64_t temp64;
3566	int deblock_left = (s->mb_x > 0);
3567	int deblock_top = (s->mb_y > 1);
3568
3569	tprintf("xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
3570
3571	src_y -= 2 * linesize + 1;
3572	src_cb -= 2 * uvlinesize + 1;
3573	src_cr -= 2 * uvlinesize + 1;
3574
3575	#define XCHG(a,b,t,xchg)\
3576	t= a;\
3577	if(xchg)\
3578	a= b;\
3579	b= t;
3580
3581	if(deblock_left){
3582	for(i = (!deblock_top)<<1; i<34; i++){
3583	XCHG(h->left_border[i ], src_y [i* linesize], temp8, xchg);
3584	}
3585	}
3586
3587	if(deblock_top){
3588	XCHG((uint64_t)(h->top_borders[0][s->mb_x]+0), (uint64_t)(src_y +1), temp64, xchg);
3589	XCHG((uint64_t)(h->top_borders[0][s->mb_x]+8), (uint64_t)(src_y +9), temp64, 1);
3590	XCHG((uint64_t)(h->top_borders[1][s->mb_x]+0), (uint64_t)(src_y +1 +linesize), temp64, xchg);
3591	XCHG((uint64_t)(h->top_borders[1][s->mb_x]+8), (uint64_t)(src_y +9 +linesize), temp64, 1);
3592	if(s->mb_x+1 < s->mb_width){
3593	XCHG((uint64_t)(h->top_borders[0][s->mb_x+1]), (uint64_t)(src_y +17), temp64, 1);
3594	XCHG((uint64_t)(h->top_borders[1][s->mb_x+1]), (uint64_t)(src_y +17 +linesize), temp64, 1);
3595	}
3596	}
3597
3598	if(!(s->flags&CODEC_FLAG_GRAY)){
3599	if(deblock_left){
3600	for(i = (!deblock_top) << 1; i<18; i++){
3601	XCHG(h->left_border[i+34 ], src_cb[i*uvlinesize], temp8, xchg);
3602	XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
3603	}
3604	}
3605	if(deblock_top){
3606	XCHG((uint64_t)(h->top_borders[0][s->mb_x]+16), (uint64_t)(src_cb+1), temp64, 1);
3607	XCHG((uint64_t)(h->top_borders[0][s->mb_x]+24), (uint64_t)(src_cr+1), temp64, 1);
3608	XCHG((uint64_t)(h->top_borders[1][s->mb_x]+16), (uint64_t)(src_cb+1 +uvlinesize), temp64, 1);
3609	XCHG((uint64_t)(h->top_borders[1][s->mb_x]+24), (uint64_t)(src_cr+1 +uvlinesize), temp64, 1);
3610	}
3611	}
3612	}
3613
3614	static void hl_decode_mb(H264Context *h){
3615	MpegEncContext * const s = &h->s;
3616	const int mb_x= s->mb_x;
3617	const int mb_y= s->mb_y;
3618	const int mb_xy= mb_x + mb_y*s->mb_stride;
3619	const int mb_type= s->current_picture.mb_type[mb_xy];
3620	uint8_t dest_y, dest_cb, *dest_cr;
3621	int linesize, uvlinesize /dct_offset/;
3622	int i;
3623	int *block_offset = &h->block_offset[0];
3624	const unsigned int bottom = mb_y & 1;
3625	const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass);
3626	void (idct_add)(uint8_t dst, DCTELEM *block, int stride);
3627	void (idct_dc_add)(uint8_t dst, DCTELEM *block, int stride);
3628
3629	if(!s->decode)
3630	return;
3631
3632	dest_y = s->current_picture.data[0] + (mb_y * 16* s->linesize ) + mb_x * 16;
3633	dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3634	dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3635
3636	if (MB_FIELD) {
3637	linesize = h->mb_linesize = s->linesize * 2;
3638	uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
3639	block_offset = &h->block_offset[24];
3640	if(mb_y&1){ //FIXME move out of this func?
3641	dest_y -= s->linesize*15;
3642	dest_cb-= s->uvlinesize*7;
3643	dest_cr-= s->uvlinesize*7;
3644	}
3645	if(FRAME_MBAFF) {
3646	int list;
3647	for(list=0; list<2; list++){
3648	if(!USES_LIST(mb_type, list))
3649	continue;
3650	if(IS_16X16(mb_type)){
3651	int8_t *ref = &h->ref_cache[list][scan8[0]];
3652	fill_rectangle(ref, 4, 4, 8, 16+*ref^(s->mb_y&1), 1);
3653	}else{
3654	for(i=0; i<16; i+=4){
3655	//FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
3656	int ref = h->ref_cache[list][scan8[i]];
3657	if(ref >= 0)
3658	fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, 16+ref^(s->mb_y&1), 1);
3659	}
3660	}
3661	}
3662	}
3663	} else {
3664	linesize = h->mb_linesize = s->linesize;
3665	uvlinesize = h->mb_uvlinesize = s->uvlinesize;
3666	// dct_offset = s->linesize * 16;
3667	}
3668
3669	if(transform_bypass){
3670	idct_dc_add =
3671	idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
3672	}else if(IS_8x8DCT(mb_type)){
3673	idct_dc_add = s->dsp.h264_idct8_dc_add;
3674	idct_add = s->dsp.h264_idct8_add;
3675	}else{
3676	idct_dc_add = s->dsp.h264_idct_dc_add;
3677	idct_add = s->dsp.h264_idct_add;
3678	}
3679
3680	if(FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
3681	&& (!bottom \|\| !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
3682	int mbt_y = mb_y&~1;
3683	uint8_t top_y = s->current_picture.data[0] + (mbt_y 16* s->linesize ) + mb_x * 16;
3684	uint8_t top_cb = s->current_picture.data[1] + (mbt_y 8 * s->uvlinesize) + mb_x * 8;
3685	uint8_t top_cr = s->current_picture.data[2] + (mbt_y 8 * s->uvlinesize) + mb_x * 8;
3686	xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
3687	}
3688
3689	if (IS_INTRA_PCM(mb_type)) {
3690	unsigned int x, y;
3691
3692	// The pixels are stored in h->mb array in the same order as levels,
3693	// copy them in output in the correct order.
3694	for(i=0; i<16; i++) {
3695	for (y=0; y<4; y++) {
3696	for (x=0; x<4; x++) {
3697	(dest_y + block_offset[i] + ylinesize + x) = h->mb[i16+y4+x];
3698	}
3699	}
3700	}
3701	for(i=16; i<16+4; i++) {
3702	for (y=0; y<4; y++) {
3703	for (x=0; x<4; x++) {
3704	(dest_cb + block_offset[i] + yuvlinesize + x) = h->mb[i16+y4+x];
3705	}
3706	}
3707	}
3708	for(i=20; i<20+4; i++) {
3709	for (y=0; y<4; y++) {
3710	for (x=0; x<4; x++) {
3711	(dest_cr + block_offset[i] + yuvlinesize + x) = h->mb[i16+y4+x];
3712	}
3713	}
3714	}
3715	} else {
3716	if(IS_INTRA(mb_type)){
3717	if(h->deblocking_filter && !FRAME_MBAFF)
3718	xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1);
3719
3720	if(!(s->flags&CODEC_FLAG_GRAY)){
3721	h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
3722	h->pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
3723	}
3724
3725	if(IS_INTRA4x4(mb_type)){
3726	if(!s->encoding){
3727	if(IS_8x8DCT(mb_type)){
3728	for(i=0; i<16; i+=4){
3729	uint8_t * const ptr= dest_y + block_offset[i];
3730	const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3731	const int nnz = h->non_zero_count_cache[ scan8[i] ];
3732	h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
3733	(h->topright_samples_available<<(i+1))&0x8000, linesize);
3734	if(nnz){
3735	if(nnz == 1 && h->mb[i*16])
3736	idct_dc_add(ptr, h->mb + i*16, linesize);
3737	else
3738	idct_add(ptr, h->mb + i*16, linesize);
3739	}
3740	}
3741	}else
3742	for(i=0; i<16; i++){
3743	uint8_t * const ptr= dest_y + block_offset[i];
3744	uint8_t *topright;
3745	const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3746	int nnz, tr;
3747
3748	if(dir == DIAG_DOWN_LEFT_PRED \|\| dir == VERT_LEFT_PRED){
3749	const int topright_avail= (h->topright_samples_available<<i)&0x8000;
3750	assert(mb_y \|\| linesize <= block_offset[i]);
3751	if(!topright_avail){
3752	tr= ptr[3 - linesize]*0x01010101;
3753	topright= (uint8_t*) &tr;
3754	}else
3755	topright= ptr + 4 - linesize;
3756	}else
3757	topright= NULL;
3758
3759	h->pred4x4[ dir ](ptr, topright, linesize);
3760	nnz = h->non_zero_count_cache[ scan8[i] ];
3761	if(nnz){
3762	if(s->codec_id == CODEC_ID_H264){
3763	if(nnz == 1 && h->mb[i*16])
3764	idct_dc_add(ptr, h->mb + i*16, linesize);
3765	else
3766	idct_add(ptr, h->mb + i*16, linesize);
3767	}else
3768	svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
3769	}
3770	}
3771	}
3772	}else{
3773	h->pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
3774	if(s->codec_id == CODEC_ID_H264){
3775	if(!transform_bypass)
3776	h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[IS_INTRA(mb_type) ? 0:3][s->qscale][0]);
3777	}else
3778	svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
3779	}
3780	if(h->deblocking_filter && !FRAME_MBAFF)
3781	xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
3782	}else if(s->codec_id == CODEC_ID_H264){
3783	hl_motion(h, dest_y, dest_cb, dest_cr,
3784	s->dsp.put_h264_qpel_pixels_tab, s->dsp.put_h264_chroma_pixels_tab,
3785	s->dsp.avg_h264_qpel_pixels_tab, s->dsp.avg_h264_chroma_pixels_tab,
3786	s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
3787	}
3788
3789
3790	if(!IS_INTRA4x4(mb_type)){
3791	if(s->codec_id == CODEC_ID_H264){
3792	if(IS_INTRA16x16(mb_type)){
3793	for(i=0; i<16; i++){
3794	if(h->non_zero_count_cache[ scan8[i] ])
3795	idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3796	else if(h->mb[i*16])
3797	idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3798	}
3799	}else{
3800	const int di = IS_8x8DCT(mb_type) ? 4 : 1;
3801	for(i=0; i<16; i+=di){
3802	int nnz = h->non_zero_count_cache[ scan8[i] ];
3803	if(nnz){
3804	if(nnz==1 && h->mb[i*16])
3805	idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3806	else
3807	idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3808	}
3809	}
3810	}
3811	}else{
3812	for(i=0; i<16; i++){
3813	if(h->non_zero_count_cache[ scan8[i] ] \|\| h->mb[i*16]){ //FIXME benchmark weird rule, & below
3814	uint8_t * const ptr= dest_y + block_offset[i];
3815	svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
3816	}
3817	}
3818	}
3819	}
3820
3821	if(!(s->flags&CODEC_FLAG_GRAY)){
3822	uint8_t *dest[2] = {dest_cb, dest_cr};
3823	if(transform_bypass){
3824	idct_add = idct_dc_add = s->dsp.add_pixels4;
3825	}else{
3826	idct_add = s->dsp.h264_idct_add;
3827	idct_dc_add = s->dsp.h264_idct_dc_add;
3828	chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp][0]);
3829	chroma_dc_dequant_idct_c(h->mb + 1616+416, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]);
3830	}
3831	if(s->codec_id == CODEC_ID_H264){
3832	for(i=16; i<16+8; i++){
3833	if(h->non_zero_count_cache[ scan8[i] ])
3834	idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3835	else if(h->mb[i*16])
3836	idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3837	}
3838	}else{
3839	for(i=16; i<16+8; i++){
3840	if(h->non_zero_count_cache[ scan8[i] ] \|\| h->mb[i*16]){
3841	uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
3842	svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
3843	}
3844	}
3845	}
3846	}
3847	}
3848	if(h->deblocking_filter) {
3849	if (FRAME_MBAFF) {
3850	//FIXME try deblocking one mb at a time?
3851	// the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
3852	const int mb_y = s->mb_y - 1;
3853	uint8_t pair_dest_y, pair_dest_cb, *pair_dest_cr;
3854	const int mb_xy= mb_x + mb_y*s->mb_stride;
3855	const int mb_type_top = s->current_picture.mb_type[mb_xy];
3856	const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
3857	if (!bottom) return;
3858	pair_dest_y = s->current_picture.data[0] + (mb_y * 16* s->linesize ) + mb_x * 16;
3859	pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3860	pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3861
3862	if(IS_INTRA(mb_type_top \| mb_type_bottom))
3863	xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
3864
3865	backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
3866	// deblock a pair
3867	// top
3868	s->mb_y--;
3869	tprintf("call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
3870	fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
3871	h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy]);
3872	filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
3873	// bottom
3874	s->mb_y++;
3875	tprintf("call mbaff filter_mb\n");
3876	fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
3877	h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
3878	filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3879	} else {
3880	tprintf("call filter_mb\n");
3881	backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3882	fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
3883	filter_mb(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3884	}
3885	}
3886	}
3887
3888	/**
3889	* fills the default_ref_list.
3890	*/
3891	static int fill_default_ref_list(H264Context *h){
3892	MpegEncContext * const s = &h->s;
3893	int i;
3894	int smallest_poc_greater_than_current = -1;
3895	Picture sorted_short_ref[32];
3896
3897	if(h->slice_type==B_TYPE){
3898	int out_i;
3899	int limit= INT_MIN;
3900
3901	/* sort frame according to poc in B slice */
3902	for(out_i=0; out_i<h->short_ref_count; out_i++){
3903	int best_i=INT_MIN;
3904	int best_poc=INT_MAX;
3905
3906	for(i=0; i<h->short_ref_count; i++){
3907	const int poc= h->short_ref[i]->poc;
3908	if(poc > limit && poc < best_poc){
3909	best_poc= poc;
3910	best_i= i;
3911	}
3912	}
3913
3914	assert(best_i != INT_MIN);
3915
3916	limit= best_poc;
3917	sorted_short_ref[out_i]= *h->short_ref[best_i];
3918	tprintf("sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
3919	if (-1 == smallest_poc_greater_than_current) {
3920	if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
3921	smallest_poc_greater_than_current = out_i;
3922	}
3923	}
3924	}
3925	}
3926
3927	if(s->picture_structure == PICT_FRAME){
3928	if(h->slice_type==B_TYPE){
3929	int list;
3930	tprintf("current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
3931
3932	// find the largest poc
3933	for(list=0; list<2; list++){
3934	int index = 0;
3935	int j= -99;
3936	int step= list ? -1 : 1;
3937
3938	for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
3939	while(j<0 \|\| j>= h->short_ref_count){
3940	if(j != -99 && step == (list ? -1 : 1))
3941	return -1;
3942	step = -step;
3943	j= smallest_poc_greater_than_current + (step>>1);
3944	}
3945	if(sorted_short_ref[j].reference != 3) continue;
3946	h->default_ref_list[list][index ]= sorted_short_ref[j];
3947	h->default_ref_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
3948	}
3949
3950	for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
3951	if(h->long_ref[i] == NULL) continue;
3952	if(h->long_ref[i]->reference != 3) continue;
3953
3954	h->default_ref_list[ list ][index ]= *h->long_ref[i];
3955	h->default_ref_list[ list ][index++].pic_id= i;;
3956	}
3957
3958	if(list && (smallest_poc_greater_than_current<=0 \|\| smallest_poc_greater_than_current>=h->short_ref_count) && (1 < index)){
3959	// swap the two first elements of L1 when
3960	// L0 and L1 are identical
3961	Picture temp= h->default_ref_list[1][0];
3962	h->default_ref_list[1][0] = h->default_ref_list[1][1];
3963	h->default_ref_list[1][1] = temp;
3964	}
3965
3966	if(index < h->ref_count[ list ])
3967	memset(&h->default_ref_list[list][index], 0, sizeof(Picture)*(h->ref_count[ list ] - index));
3968	}
3969	}else{
3970	int index=0;
3971	for(i=0; i<h->short_ref_count; i++){
3972	if(h->short_ref[i]->reference != 3) continue; //FIXME refernce field shit
3973	h->default_ref_list[0][index ]= *h->short_ref[i];
3974	h->default_ref_list[0][index++].pic_id= h->short_ref[i]->frame_num;
3975	}
3976	for(i = 0; i < 16; i++){
3977	if(h->long_ref[i] == NULL) continue;
3978	if(h->long_ref[i]->reference != 3) continue;
3979	h->default_ref_list[0][index ]= *h->long_ref[i];
3980	h->default_ref_list[0][index++].pic_id= i;;
3981	}
3982	if(index < h->ref_count[0])
3983	memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
3984	}
3985	}else{ //FIELD
3986	if(h->slice_type==B_TYPE){
3987	}else{
3988	//FIXME second field balh
3989	}
3990	}
3991	#ifdef TRACE
3992	for (i=0; i<h->ref_count[0]; i++) {
3993	tprintf("List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
3994	}
3995	if(h->slice_type==B_TYPE){
3996	for (i=0; i<h->ref_count[1]; i++) {
3997	tprintf("List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[0][i].data[0]);
3998	}
3999	}
4000	#endif
4001	return 0;
4002	}
4003
4004	static void print_short_term(H264Context *h);
4005	static void print_long_term(H264Context *h);
4006
4007	static int decode_ref_pic_list_reordering(H264Context *h){
4008	MpegEncContext * const s = &h->s;
4009	int list, index;
4010
4011	print_short_term(h);
4012	print_long_term(h);
4013	if(h->slice_type==I_TYPE \|\| h->slice_type==SI_TYPE) return 0; //FIXME move before func
4014
4015	for(list=0; list<2; list++){
4016	memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
4017
4018	if(get_bits1(&s->gb)){
4019	int pred= h->curr_pic_num;
4020
4021	for(index=0; ; index++){
4022	int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
4023	int pic_id;
4024	int i;
4025	Picture *ref = NULL;
4026
4027	if(reordering_of_pic_nums_idc==3)
4028	break;
4029
4030	if(index >= h->ref_count[list]){
4031	av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
4032	return -1;
4033	}
4034
4035	if(reordering_of_pic_nums_idc<3){
4036	if(reordering_of_pic_nums_idc<2){
4037	const int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
4038
4039	if(abs_diff_pic_num >= h->max_pic_num){
4040	av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
4041	return -1;
4042	}
4043
4044	if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
4045	else pred+= abs_diff_pic_num;
4046	pred &= h->max_pic_num - 1;
4047
4048	for(i= h->short_ref_count-1; i>=0; i--){
4049	ref = h->short_ref[i];
4050	assert(ref->reference == 3);
4051	assert(!ref->long_ref);
4052	if(ref->data[0] != NULL && ref->frame_num == pred && ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
4053	break;
4054	}
4055	if(i>=0)
4056	ref->pic_id= ref->frame_num;
4057	}else{
4058	pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
4059	ref = h->long_ref[pic_id];
4060	ref->pic_id= pic_id;
4061	assert(ref->reference == 3);
4062	assert(ref->long_ref);
4063	i=0;
4064	}
4065
4066	if (i < 0) {
4067	av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
4068	memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
4069	} else {
4070	for(i=index; i+1<h->ref_count[list]; i++){
4071	if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
4072	break;
4073	}
4074	for(; i > index; i--){
4075	h->ref_list[list][i]= h->ref_list[list][i-1];
4076	}
4077	h->ref_list[list][index]= *ref;
4078	}
4079	}else{
4080	av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
4081	return -1;
4082	}
4083	}
4084	}
4085
4086	if(h->slice_type!=B_TYPE) break;
4087	}
4088	for(list=0; list<2; list++){
4089	for(index= 0; index < h->ref_count[list]; index++){
4090	if(!h->ref_list[list][index].data[0])
4091	h->ref_list[list][index]= s->current_picture;
4092	}
4093	if(h->slice_type!=B_TYPE) break;
4094	}
4095
4096	if(h->slice_type==B_TYPE && !h->direct_spatial_mv_pred)
4097	direct_dist_scale_factor(h);
4098	direct_ref_list_init(h);
4099	return 0;
4100	}
4101
4102	static void fill_mbaff_ref_list(H264Context *h){
4103	int list, i, j;
4104	for(list=0; list<2; list++){
4105	for(i=0; i<h->ref_count[list]; i++){
4106	Picture *frame = &h->ref_list[list][i];
4107	Picture field = &h->ref_list[list][16+2i];
4108	field[0] = *frame;
4109	for(j=0; j<3; j++)
4110	field[0].linesize[j] <<= 1;
4111	field[1] = field[0];
4112	for(j=0; j<3; j++)
4113	field[1].data[j] += frame->linesize[j];
4114
4115	h->luma_weight[list][16+2i] = h->luma_weight[list][16+2i+1] = h->luma_weight[list][i];
4116	h->luma_offset[list][16+2i] = h->luma_offset[list][16+2i+1] = h->luma_offset[list][i];
4117	for(j=0; j<2; j++){
4118	h->chroma_weight[list][16+2i][j] = h->chroma_weight[list][16+2i+1][j] = h->chroma_weight[list][i][j];
4119	h->chroma_offset[list][16+2i][j] = h->chroma_offset[list][16+2i+1][j] = h->chroma_offset[list][i][j];
4120	}
4121	}
4122	}
4123	for(j=0; j<h->ref_count[1]; j++){
4124	for(i=0; i<h->ref_count[0]; i++)
4125	h->implicit_weight[j][16+2i] = h->implicit_weight[j][16+2i+1] = h->implicit_weight[j][i];
4126	memcpy(h->implicit_weight[16+2j], h->implicit_weight[j], sizeof(h->implicit_weight));
4127	memcpy(h->implicit_weight[16+2j+1], h->implicit_weight[j], sizeof(h->implicit_weight));
4128	}
4129	}
4130
4131	static int pred_weight_table(H264Context *h){
4132	MpegEncContext * const s = &h->s;
4133	int list, i;
4134	int luma_def, chroma_def;
4135
4136	h->use_weight= 0;
4137	h->use_weight_chroma= 0;
4138	h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
4139	h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
4140	luma_def = 1<<h->luma_log2_weight_denom;
4141	chroma_def = 1<<h->chroma_log2_weight_denom;
4142
4143	for(list=0; list<2; list++){
4144	for(i=0; i<h->ref_count[list]; i++){
4145	int luma_weight_flag, chroma_weight_flag;
4146
4147	luma_weight_flag= get_bits1(&s->gb);
4148	if(luma_weight_flag){
4149	h->luma_weight[list][i]= get_se_golomb(&s->gb);
4150	h->luma_offset[list][i]= get_se_golomb(&s->gb);
4151	if( h->luma_weight[list][i] != luma_def
4152	\|\| h->luma_offset[list][i] != 0)
4153	h->use_weight= 1;
4154	}else{
4155	h->luma_weight[list][i]= luma_def;
4156	h->luma_offset[list][i]= 0;
4157	}
4158
4159	chroma_weight_flag= get_bits1(&s->gb);
4160	if(chroma_weight_flag){
4161	int j;
4162	for(j=0; j<2; j++){
4163	h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
4164	h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
4165	if( h->chroma_weight[list][i][j] != chroma_def
4166	\|\| h->chroma_offset[list][i][j] != 0)
4167	h->use_weight_chroma= 1;
4168	}
4169	}else{
4170	int j;
4171	for(j=0; j<2; j++){
4172	h->chroma_weight[list][i][j]= chroma_def;
4173	h->chroma_offset[list][i][j]= 0;
4174	}
4175	}
4176	}
4177	if(h->slice_type != B_TYPE) break;
4178	}
4179	h->use_weight= h->use_weight \|\| h->use_weight_chroma;
4180	return 0;
4181	}
4182
4183	static void implicit_weight_table(H264Context *h){
4184	MpegEncContext * const s = &h->s;
4185	int ref0, ref1;
4186	int cur_poc = s->current_picture_ptr->poc;
4187
4188	if( h->ref_count[0] == 1 && h->ref_count[1] == 1
4189	&& h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
4190	h->use_weight= 0;
4191	h->use_weight_chroma= 0;
4192	return;
4193	}
4194
4195	h->use_weight= 2;
4196	h->use_weight_chroma= 2;
4197	h->luma_log2_weight_denom= 5;
4198	h->chroma_log2_weight_denom= 5;
4199
4200	for(ref0=0; ref0 < h->ref_count[0]; ref0++){
4201	int poc0 = h->ref_list[0][ref0].poc;
4202	for(ref1=0; ref1 < h->ref_count[1]; ref1++){
4203	int poc1 = h->ref_list[1][ref1].poc;
4204	int td = clip(poc1 - poc0, -128, 127);
4205	if(td){
4206	int tb = clip(cur_poc - poc0, -128, 127);
4207	int tx = (16384 + (ABS(td) >> 1)) / td;
4208	int dist_scale_factor = clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
4209	if(dist_scale_factor < -64 \|\| dist_scale_factor > 128)
4210	h->implicit_weight[ref0][ref1] = 32;
4211	else
4212	h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
4213	}else
4214	h->implicit_weight[ref0][ref1] = 32;
4215	}
4216	}
4217	}
4218
4219	static inline void unreference_pic(H264Context h, Picture pic){
4220	int i;
4221	pic->reference=0;
4222	if(pic == h->delayed_output_pic)
4223	pic->reference=1;
4224	else{
4225	for(i = 0; h->delayed_pic[i]; i++)
4226	if(pic == h->delayed_pic[i]){
4227	pic->reference=1;
4228	break;
4229	}
4230	}
4231	}
4232
4233	/**
4234	* instantaneous decoder refresh.
4235	*/
4236	static void idr(H264Context *h){
4237	int i;
4238
4239	for(i=0; i<16; i++){
4240	if (h->long_ref[i] != NULL) {
4241	unreference_pic(h, h->long_ref[i]);
4242	h->long_ref[i]= NULL;
4243	}
4244	}
4245	h->long_ref_count=0;
4246
4247	for(i=0; i<h->short_ref_count; i++){
4248	unreference_pic(h, h->short_ref[i]);
4249	h->short_ref[i]= NULL;
4250	}
4251	h->short_ref_count=0;
4252	}
4253
4254	/* forget old pics after a seek */
4255	static void flush_dpb(AVCodecContext *avctx){
4256	H264Context *h= avctx->priv_data;
4257	int i;
4258	for(i=0; i<16; i++) {
4259	if(h->delayed_pic[i])
4260	h->delayed_pic[i]->reference= 0;
4261	h->delayed_pic[i]= NULL;
4262	}
4263	if(h->delayed_output_pic)
4264	h->delayed_output_pic->reference= 0;
4265	h->delayed_output_pic= NULL;
4266	idr(h);
4267	if(h->s.current_picture_ptr)
4268	h->s.current_picture_ptr->reference= 0;
4269	}
4270
4271	/**
4272	*
4273	* @return the removed picture or NULL if an error occurs
4274	*/
4275	static Picture * remove_short(H264Context *h, int frame_num){
4276	MpegEncContext * const s = &h->s;
4277	int i;
4278
4279	if(s->avctx->debug&FF_DEBUG_MMCO)
4280	av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
4281
4282	for(i=0; i<h->short_ref_count; i++){
4283	Picture *pic= h->short_ref[i];
4284	if(s->avctx->debug&FF_DEBUG_MMCO)
4285	av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
4286	if(pic->frame_num == frame_num){
4287	h->short_ref[i]= NULL;
4288	memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i - 1)sizeof(Picture));
4289	h->short_ref_count--;
4290	return pic;
4291	}
4292	}
4293	return NULL;
4294	}
4295
4296	/**
4297	*
4298	* @return the removed picture or NULL if an error occurs
4299	*/
4300	static Picture * remove_long(H264Context *h, int i){
4301	Picture *pic;
4302
4303	pic= h->long_ref[i];
4304	h->long_ref[i]= NULL;
4305	if(pic) h->long_ref_count--;
4306
4307	return pic;
4308	}
4309
4310	/**
4311	* print short term list
4312	*/
4313	static void print_short_term(H264Context *h) {
4314	uint32_t i;
4315	if(h->s.avctx->debug&FF_DEBUG_MMCO) {
4316	av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
4317	for(i=0; i<h->short_ref_count; i++){
4318	Picture *pic= h->short_ref[i];
4319	av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
4320	}
4321	}
4322	}
4323
4324	/**
4325	* print long term list
4326	*/
4327	static void print_long_term(H264Context *h) {
4328	uint32_t i;
4329	if(h->s.avctx->debug&FF_DEBUG_MMCO) {
4330	av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
4331	for(i = 0; i < 16; i++){
4332	Picture *pic= h->long_ref[i];
4333	if (pic) {
4334	av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
4335	}
4336	}
4337	}
4338	}
4339
4340	/**
4341	* Executes the reference picture marking (memory management control operations).
4342	*/
4343	static int execute_ref_pic_marking(H264Context h, MMCO mmco, int mmco_count){
4344	MpegEncContext * const s = &h->s;
4345	int i, j;
4346	int current_is_long=0;
4347	Picture *pic;
4348
4349	if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
4350	av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
4351
4352	for(i=0; i<mmco_count; i++){
4353	if(s->avctx->debug&FF_DEBUG_MMCO)
4354	av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_frame_num, h->mmco[i].long_index);
4355
4356	switch(mmco[i].opcode){
4357	case MMCO_SHORT2UNUSED:
4358	pic= remove_short(h, mmco[i].short_frame_num);
4359	if(pic)
4360	unreference_pic(h, pic);
4361	else if(s->avctx->debug&FF_DEBUG_MMCO)
4362	av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_short() failure\n");
4363	break;
4364	case MMCO_SHORT2LONG:
4365	pic= remove_long(h, mmco[i].long_index);
4366	if(pic) unreference_pic(h, pic);
4367
4368	h->long_ref[ mmco[i].long_index ]= remove_short(h, mmco[i].short_frame_num);
4369	h->long_ref[ mmco[i].long_index ]->long_ref=1;
4370	h->long_ref_count++;
4371	break;
4372	case MMCO_LONG2UNUSED:
4373	pic= remove_long(h, mmco[i].long_index);
4374	if(pic)
4375	unreference_pic(h, pic);
4376	else if(s->avctx->debug&FF_DEBUG_MMCO)
4377	av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_long() failure\n");
4378	break;
4379	case MMCO_LONG:
4380	pic= remove_long(h, mmco[i].long_index);
4381	if(pic) unreference_pic(h, pic);
4382
4383	h->long_ref[ mmco[i].long_index ]= s->current_picture_ptr;
4384	h->long_ref[ mmco[i].long_index ]->long_ref=1;
4385	h->long_ref_count++;
4386
4387	current_is_long=1;
4388	break;
4389	case MMCO_SET_MAX_LONG:
4390	assert(mmco[i].long_index <= 16);
4391	// just remove the long term which index is greater than new max
4392	for(j = mmco[i].long_index; j<16; j++){
4393	pic = remove_long(h, j);
4394	if (pic) unreference_pic(h, pic);
4395	}
4396	break;
4397	case MMCO_RESET:
4398	while(h->short_ref_count){
4399	pic= remove_short(h, h->short_ref[0]->frame_num);
4400	unreference_pic(h, pic);
4401	}
4402	for(j = 0; j < 16; j++) {
4403	pic= remove_long(h, j);
4404	if(pic) unreference_pic(h, pic);
4405	}
4406	break;
4407	default: assert(0);
4408	}
4409	}
4410
4411	if(!current_is_long){
4412	pic= remove_short(h, s->current_picture_ptr->frame_num);
4413	if(pic){
4414	unreference_pic(h, pic);
4415	av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
4416	}
4417
4418	if(h->short_ref_count)
4419	memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_countsizeof(Picture));
4420
4421	h->short_ref[0]= s->current_picture_ptr;
4422	h->short_ref[0]->long_ref=0;
4423	h->short_ref_count++;
4424	}
4425
4426	print_short_term(h);
4427	print_long_term(h);
4428	return 0;
4429	}
4430
4431	static int decode_ref_pic_marking(H264Context *h){
4432	MpegEncContext * const s = &h->s;
4433	int i;
4434
4435	if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
4436	s->broken_link= get_bits1(&s->gb) -1;
4437	h->mmco[0].long_index= get_bits1(&s->gb) - 1; // current_long_term_idx
4438	if(h->mmco[0].long_index == -1)
4439	h->mmco_index= 0;
4440	else{
4441	h->mmco[0].opcode= MMCO_LONG;
4442	h->mmco_index= 1;
4443	}
4444	}else{
4445	if(get_bits1(&s->gb)){ // adaptive_ref_pic_marking_mode_flag
4446	for(i= 0; i<MAX_MMCO_COUNT; i++) {
4447	MMCOOpcode opcode= get_ue_golomb(&s->gb);;
4448
4449	h->mmco[i].opcode= opcode;
4450	if(opcode==MMCO_SHORT2UNUSED \|\| opcode==MMCO_SHORT2LONG){
4451	h->mmco[i].short_frame_num= (h->frame_num - get_ue_golomb(&s->gb) - 1) & ((1<<h->sps.log2_max_frame_num)-1); //FIXME fields
4452	/* if(h->mmco[i].short_frame_num >= h->short_ref_count \|\| h->short_ref[ h->mmco[i].short_frame_num ] == NULL){
4453	av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
4454	return -1;
4455	}*/
4456	}
4457	if(opcode==MMCO_SHORT2LONG \|\| opcode==MMCO_LONG2UNUSED \|\| opcode==MMCO_LONG \|\| opcode==MMCO_SET_MAX_LONG){
4458	h->mmco[i].long_index= get_ue_golomb(&s->gb);
4459	if(/h->mmco[i].long_index >= h->long_ref_count \|\| h->long_ref[ h->mmco[i].long_index ] == NULL/ h->mmco[i].long_index >= 16){
4460	av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
4461	return -1;
4462	}
4463	}
4464
4465	if(opcode > MMCO_LONG){
4466	av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
4467	return -1;
4468	}
4469	if(opcode == MMCO_END)
4470	break;
4471	}
4472	h->mmco_index= i;
4473	}else{
4474	assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
4475
4476	if(h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count){ //FIXME fields
4477	h->mmco[0].opcode= MMCO_SHORT2UNUSED;
4478	h->mmco[0].short_frame_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
4479	h->mmco_index= 1;
4480	}else
4481	h->mmco_index= 0;
4482	}
4483	}
4484
4485	return 0;
4486	}
4487
4488	static int init_poc(H264Context *h){
4489	MpegEncContext * const s = &h->s;
4490	const int max_frame_num= 1<<h->sps.log2_max_frame_num;
4491	int field_poc[2];
4492
4493	if(h->nal_unit_type == NAL_IDR_SLICE){
4494	h->frame_num_offset= 0;
4495	}else{
4496	if(h->frame_num < h->prev_frame_num)
4497	h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
4498	else
4499	h->frame_num_offset= h->prev_frame_num_offset;
4500	}
4501
4502	if(h->sps.poc_type==0){
4503	const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
4504
4505	if(h->nal_unit_type == NAL_IDR_SLICE){
4506	h->prev_poc_msb=
4507	h->prev_poc_lsb= 0;
4508	}
4509
4510	if (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
4511	h->poc_msb = h->prev_poc_msb + max_poc_lsb;
4512	else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
4513	h->poc_msb = h->prev_poc_msb - max_poc_lsb;
4514	else
4515	h->poc_msb = h->prev_poc_msb;
4516	//printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
4517	field_poc[0] =
4518	field_poc[1] = h->poc_msb + h->poc_lsb;
4519	if(s->picture_structure == PICT_FRAME)
4520	field_poc[1] += h->delta_poc_bottom;
4521	}else if(h->sps.poc_type==1){
4522	int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
4523	int i;
4524
4525	if(h->sps.poc_cycle_length != 0)
4526	abs_frame_num = h->frame_num_offset + h->frame_num;
4527	else
4528	abs_frame_num = 0;
4529
4530	if(h->nal_ref_idc==0 && abs_frame_num > 0)
4531	abs_frame_num--;
4532
4533	expected_delta_per_poc_cycle = 0;
4534	for(i=0; i < h->sps.poc_cycle_length; i++)
4535	expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
4536
4537	if(abs_frame_num > 0){
4538	int poc_cycle_cnt = (abs_frame_num - 1) / h->sps.poc_cycle_length;
4539	int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
4540
4541	expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
4542	for(i = 0; i <= frame_num_in_poc_cycle; i++)
4543	expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
4544	} else
4545	expectedpoc = 0;
4546
4547	if(h->nal_ref_idc == 0)
4548	expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
4549
4550	field_poc[0] = expectedpoc + h->delta_poc[0];
4551	field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
4552
4553	if(s->picture_structure == PICT_FRAME)
4554	field_poc[1] += h->delta_poc[1];
4555	}else{
4556	int poc;
4557	if(h->nal_unit_type == NAL_IDR_SLICE){
4558	poc= 0;
4559	}else{
4560	if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
4561	else poc= 2*(h->frame_num_offset + h->frame_num) - 1;
4562	}
4563	field_poc[0]= poc;
4564	field_poc[1]= poc;
4565	}
4566
4567	if(s->picture_structure != PICT_BOTTOM_FIELD)
4568	s->current_picture_ptr->field_poc[0]= field_poc[0];
4569	if(s->picture_structure != PICT_TOP_FIELD)
4570	s->current_picture_ptr->field_poc[1]= field_poc[1];
4571	if(s->picture_structure == PICT_FRAME) // FIXME field pix?
4572	s->current_picture_ptr->poc= FFMIN(field_poc[0], field_poc[1]);
4573
4574	return 0;
4575	}
4576
4577	/**
4578	* decodes a slice header.
4579	* this will allso call MPV_common_init() and frame_start() as needed
4580	*/
4581	static int decode_slice_header(H264Context *h){
4582	MpegEncContext * const s = &h->s;
4583	int first_mb_in_slice, pps_id;
4584	int num_ref_idx_active_override_flag;
4585	static const uint8_t slice_type_map[5]= {P_TYPE, B_TYPE, I_TYPE, SP_TYPE, SI_TYPE};
4586	int slice_type;
4587	int default_ref_list_done = 0;
4588
4589	s->current_picture.reference= h->nal_ref_idc != 0;
4590	s->dropable= h->nal_ref_idc == 0;
4591
4592	first_mb_in_slice= get_ue_golomb(&s->gb);
4593
4594	slice_type= get_ue_golomb(&s->gb);
4595	if(slice_type > 9){
4596	av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
4597	return -1;
4598	}
4599	if(slice_type > 4){
4600	slice_type -= 5;
4601	h->slice_type_fixed=1;
4602	}else
4603	h->slice_type_fixed=0;
4604
4605	slice_type= slice_type_map[ slice_type ];
4606	if (slice_type == I_TYPE
4607	\|\| (h->slice_num != 0 && slice_type == h->slice_type) ) {
4608	default_ref_list_done = 1;
4609	}
4610	h->slice_type= slice_type;
4611
4612	s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
4613
4614	pps_id= get_ue_golomb(&s->gb);
4615	if(pps_id>255){
4616	av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
4617	return -1;
4618	}
4619	h->pps= h->pps_buffer[pps_id];
4620	if(h->pps.slice_group_count == 0){
4621	av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
4622	return -1;
4623	}
4624
4625	h->sps= h->sps_buffer[ h->pps.sps_id ];
4626	if(h->sps.log2_max_frame_num == 0){
4627	av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
4628	return -1;
4629	}
4630
4631	if(h->dequant_coeff_pps != pps_id){
4632	h->dequant_coeff_pps = pps_id;
4633	init_dequant_tables(h);
4634	}
4635
4636	s->mb_width= h->sps.mb_width;
4637	s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
4638
4639	h->b_stride= s->mb_width*4;
4640	h->b8_stride= s->mb_width*2;
4641
4642	s->width = 16s->mb_width - 2(h->sps.crop_left + h->sps.crop_right );
4643	if(h->sps.frame_mbs_only_flag)
4644	s->height= 16s->mb_height - 2(h->sps.crop_top + h->sps.crop_bottom);
4645	else
4646	s->height= 16s->mb_height - 4(h->sps.crop_top + h->sps.crop_bottom); //FIXME recheck
4647
4648	if (s->context_initialized
4649	&& ( s->width != s->avctx->width \|\| s->height != s->avctx->height)) {
4650	free_tables(h);
4651	MPV_common_end(s);
4652	}
4653	if (!s->context_initialized) {
4654	if (MPV_common_init(s) < 0)
4655	return -1;
4656
4657	if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
4658	memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
4659	memcpy(h-> field_scan, field_scan, 16*sizeof(uint8_t));
4660	}else{
4661	int i;
4662	for(i=0; i<16; i++){
4663	#define T(x) (x>>2) \| ((x<<2) & 0xF)
4664	h->zigzag_scan[i] = T(zigzag_scan[i]);
4665	h-> field_scan[i] = T( field_scan[i]);
4666	#undef T
4667	}
4668	}
4669	if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
4670	memcpy(h->zigzag_scan8x8, zigzag_scan8x8, 64*sizeof(uint8_t));
4671	memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
4672	memcpy(h->field_scan8x8, field_scan8x8, 64*sizeof(uint8_t));
4673	memcpy(h->field_scan8x8_cavlc, field_scan8x8_cavlc, 64*sizeof(uint8_t));
4674	}else{
4675	int i;
4676	for(i=0; i<64; i++){
4677	#define T(x) (x>>3) \| ((x&7)<<3)
4678	h->zigzag_scan8x8[i] = T(zigzag_scan8x8[i]);
4679	h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
4680	h->field_scan8x8[i] = T(field_scan8x8[i]);
4681	h->field_scan8x8_cavlc[i] = T(field_scan8x8_cavlc[i]);
4682	#undef T
4683	}
4684	}
4685	if(h->sps.transform_bypass){ //FIXME same ugly
4686	h->zigzag_scan_q0 = zigzag_scan;
4687	h->zigzag_scan8x8_q0 = zigzag_scan8x8;
4688	h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
4689	h->field_scan_q0 = field_scan;
4690	h->field_scan8x8_q0 = field_scan8x8;
4691	h->field_scan8x8_cavlc_q0 = field_scan8x8_cavlc;
4692	}else{
4693	h->zigzag_scan_q0 = h->zigzag_scan;
4694	h->zigzag_scan8x8_q0 = h->zigzag_scan8x8;
4695	h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
4696	h->field_scan_q0 = h->field_scan;
4697	h->field_scan8x8_q0 = h->field_scan8x8;
4698	h->field_scan8x8_cavlc_q0 = h->field_scan8x8_cavlc;
4699	}
4700
4701	alloc_tables(h);
4702
4703	s->avctx->width = s->width;
4704	s->avctx->height = s->height;
4705	s->avctx->sample_aspect_ratio= h->sps.sar;
4706	if(!s->avctx->sample_aspect_ratio.den)
4707	s->avctx->sample_aspect_ratio.den = 1;
4708
4709	if(h->sps.timing_info_present_flag){
4710	s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
4711	if(h->x264_build > 0 && h->x264_build < 44)
4712	s->avctx->time_base.den *= 2;
4713	av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
4714	s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
4715	}
4716	}
4717
4718	if(h->slice_num == 0){
4719	if(frame_start(h) < 0)
4720	return -1;
4721	}
4722
4723	s->current_picture_ptr->frame_num= //FIXME frame_num cleanup
4724	h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
4725
4726	h->mb_mbaff = 0;
4727	h->mb_aff_frame = 0;
4728	if(h->sps.frame_mbs_only_flag){
4729	s->picture_structure= PICT_FRAME;
4730	}else{
4731	if(get_bits1(&s->gb)) { //field_pic_flag
4732	s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
4733	av_log(h->s.avctx, AV_LOG_ERROR, "PAFF interlacing is not implemented\n");
4734	} else {
4735	s->picture_structure= PICT_FRAME;
4736	h->mb_aff_frame = h->sps.mb_aff;
4737	}
4738	}
4739
4740	s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
4741	s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << h->mb_aff_frame;
4742	if(s->mb_y >= s->mb_height){
4743	return -1;
4744	}
4745
4746	if(s->picture_structure==PICT_FRAME){
4747	h->curr_pic_num= h->frame_num;
4748	h->max_pic_num= 1<< h->sps.log2_max_frame_num;
4749	}else{
4750	h->curr_pic_num= 2*h->frame_num;
4751	h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
4752	}
4753
4754	if(h->nal_unit_type == NAL_IDR_SLICE){
4755	get_ue_golomb(&s->gb); /* idr_pic_id */
4756	}
4757
4758	if(h->sps.poc_type==0){
4759	h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
4760
4761	if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
4762	h->delta_poc_bottom= get_se_golomb(&s->gb);
4763	}
4764	}
4765
4766	if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
4767	h->delta_poc[0]= get_se_golomb(&s->gb);
4768
4769	if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
4770	h->delta_poc[1]= get_se_golomb(&s->gb);
4771	}
4772
4773	init_poc(h);
4774
4775	if(h->pps.redundant_pic_cnt_present){
4776	h->redundant_pic_count= get_ue_golomb(&s->gb);
4777	}
4778
4779	//set defaults, might be overriden a few line later
4780	h->ref_count[0]= h->pps.ref_count[0];
4781	h->ref_count[1]= h->pps.ref_count[1];
4782
4783	if(h->slice_type == P_TYPE \|\| h->slice_type == SP_TYPE \|\| h->slice_type == B_TYPE){
4784	if(h->slice_type == B_TYPE){
4785	h->direct_spatial_mv_pred= get_bits1(&s->gb);
4786	if(h->sps.mb_aff && h->direct_spatial_mv_pred)
4787	av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + spatial direct mode is not implemented\n");
4788	}
4789	num_ref_idx_active_override_flag= get_bits1(&s->gb);
4790
4791	if(num_ref_idx_active_override_flag){
4792	h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
4793	if(h->slice_type==B_TYPE)
4794	h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
4795
4796	if(h->ref_count[0] > 32 \|\| h->ref_count[1] > 32){
4797	av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
4798	return -1;
4799	}
4800	}
4801	}
4802
4803	if(!default_ref_list_done){
4804	fill_default_ref_list(h);
4805	}
4806
4807	if(decode_ref_pic_list_reordering(h) < 0)
4808	return -1;
4809
4810	if( (h->pps.weighted_pred && (h->slice_type == P_TYPE \|\| h->slice_type == SP_TYPE ))
4811	\|\| (h->pps.weighted_bipred_idc==1 && h->slice_type==B_TYPE ) )
4812	pred_weight_table(h);
4813	else if(h->pps.weighted_bipred_idc==2 && h->slice_type==B_TYPE)
4814	implicit_weight_table(h);
4815	else
4816	h->use_weight = 0;
4817
4818	if(s->current_picture.reference)
4819	decode_ref_pic_marking(h);
4820
4821	if(FRAME_MBAFF)
4822	fill_mbaff_ref_list(h);
4823
4824	if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE && h->pps.cabac )
4825	h->cabac_init_idc = get_ue_golomb(&s->gb);
4826
4827	h->last_qscale_diff = 0;
4828	s->qscale = h->pps.init_qp + get_se_golomb(&s->gb);
4829	if(s->qscale<0 \|\| s->qscale>51){
4830	av_log(s->avctx, AV_LOG_ERROR, "QP %d out of range\n", s->qscale);
4831	return -1;
4832	}
4833	h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
4834	//FIXME qscale / qp ... stuff
4835	if(h->slice_type == SP_TYPE){
4836	get_bits1(&s->gb); /* sp_for_switch_flag */
4837	}
4838	if(h->slice_type==SP_TYPE \|\| h->slice_type == SI_TYPE){
4839	get_se_golomb(&s->gb); /* slice_qs_delta */
4840	}
4841
4842	h->deblocking_filter = 1;
4843	h->slice_alpha_c0_offset = 0;
4844	h->slice_beta_offset = 0;
4845	if( h->pps.deblocking_filter_parameters_present ) {
4846	h->deblocking_filter= get_ue_golomb(&s->gb);
4847	if(h->deblocking_filter < 2)
4848	h->deblocking_filter^= 1; // 1<->0
4849
4850	if( h->deblocking_filter ) {
4851	h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4852	h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4853	}
4854	}
4855	if( s->avctx->skip_loop_filter >= AVDISCARD_ALL
4856	\|\|(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type != I_TYPE)
4857	\|\|(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR && h->slice_type == B_TYPE)
4858	\|\|(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4859	h->deblocking_filter= 0;
4860
4861	#if 0 //FMO
4862	if( h->pps.num_slice_groups > 1 && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4863	slice_group_change_cycle= get_bits(&s->gb, ?);
4864	#endif
4865
4866	h->slice_num++;
4867
4868	h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4869	h->emu_edge_height= FRAME_MBAFF ? 0 : h->emu_edge_width;
4870
4871	if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4872	av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%d frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s\n",
4873	h->slice_num,
4874	(s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4875	first_mb_in_slice,
4876	av_get_pict_type_char(h->slice_type),
4877	pps_id, h->frame_num,
4878	s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4879	h->ref_count[0], h->ref_count[1],
4880	s->qscale,
4881	h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4882	h->use_weight,
4883	h->use_weight==1 && h->use_weight_chroma ? "c" : ""
4884	);
4885	}
4886
4887	return 0;
4888	}
4889
4890	/**
4891	*
4892	*/
4893	static inline int get_level_prefix(GetBitContext *gb){
4894	unsigned int buf;
4895	int log;
4896
4897	OPEN_READER(re, gb);
4898	UPDATE_CACHE(re, gb);
4899	buf=GET_CACHE(re, gb);
4900
4901	log= 32 - av_log2(buf);
4902	#ifdef TRACE
4903	print_bin(buf>>(32-log), log);
4904	av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4905	#endif
4906
4907	LAST_SKIP_BITS(re, gb, log);
4908	CLOSE_READER(re, gb);
4909
4910	return log-1;
4911	}
4912
4913	static inline int get_dct8x8_allowed(H264Context *h){
4914	int i;
4915	for(i=0; i<4; i++){
4916	if(!IS_SUB_8X8(h->sub_mb_type[i])
4917	\|\| (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4918	return 0;
4919	}
4920	return 1;
4921	}
4922
4923	/**
4924	* decodes a residual block.
4925	* @param n block index
4926	* @param scantable scantable
4927	* @param max_coeff number of coefficients in the block
4928	* @return <0 if an error occured
4929	*/
4930	static int decode_residual(H264Context h, GetBitContext gb, DCTELEM block, int n, const uint8_t scantable, const uint32_t *qmul, int max_coeff){
4931	MpegEncContext * const s = &h->s;
4932	static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4933	int level[16];
4934	int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4935
4936	//FIXME put trailing_onex into the context
4937
4938	if(n == CHROMA_DC_BLOCK_INDEX){
4939	coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4940	total_coeff= coeff_token>>2;
4941	}else{
4942	if(n == LUMA_DC_BLOCK_INDEX){
4943	total_coeff= pred_non_zero_count(h, 0);
4944	coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4945	total_coeff= coeff_token>>2;
4946	}else{
4947	total_coeff= pred_non_zero_count(h, n);
4948	coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4949	total_coeff= coeff_token>>2;
4950	h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4951	}
4952	}
4953
4954	//FIXME set last_non_zero?
4955
4956	if(total_coeff==0)
4957	return 0;
4958
4959	trailing_ones= coeff_token&3;
4960	tprintf("trailing:%d, total:%d\n", trailing_ones, total_coeff);
4961	assert(total_coeff<=16);
4962
4963	for(i=0; i<trailing_ones; i++){
4964	level[i]= 1 - 2*get_bits1(gb);
4965	}
4966
4967	if(i<total_coeff) {
4968	int level_code, mask;
4969	int suffix_length = total_coeff > 10 && trailing_ones < 3;
4970	int prefix= get_level_prefix(gb);
4971
4972	//first coefficient has suffix_length equal to 0 or 1
4973	if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4974	if(suffix_length)
4975	level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4976	else
4977	level_code= (prefix<<suffix_length); //part
4978	}else if(prefix==14){
4979	if(suffix_length)
4980	level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4981	else
4982	level_code= prefix + get_bits(gb, 4); //part
4983	}else if(prefix==15){
4984	level_code= (prefix<<suffix_length) + get_bits(gb, 12); //part
4985	if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4986	}else{
4987	av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4988	return -1;
4989	}
4990
4991	if(trailing_ones < 3) level_code += 2;
4992
4993	suffix_length = 1;
4994	if(level_code > 5)
4995	suffix_length++;
4996	mask= -(level_code&1);
4997	level[i]= (((2+level_code)>>1) ^ mask) - mask;
4998	i++;
4999
5000	//remaining coefficients have suffix_length > 0
5001	for(;i<total_coeff;i++) {
5002	static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
5003	prefix = get_level_prefix(gb);
5004	if(prefix<15){
5005	level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
5006	}else if(prefix==15){
5007	level_code = (prefix<<suffix_length) + get_bits(gb, 12);
5008	}else{
5009	av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
5010	return -1;
5011	}
5012	mask= -(level_code&1);
5013	level[i]= (((2+level_code)>>1) ^ mask) - mask;
5014	if(level_code > suffix_limit[suffix_length])
5015	suffix_length++;
5016	}
5017	}
5018
5019	if(total_coeff == max_coeff)
5020	zeros_left=0;
5021	else{
5022	if(n == CHROMA_DC_BLOCK_INDEX)
5023	zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
5024	else
5025	zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
5026	}
5027
5028	coeff_num = zeros_left + total_coeff - 1;
5029	j = scantable[coeff_num];
5030	if(n > 24){
5031	block[j] = level[0];
5032	for(i=1;i<total_coeff;i++) {
5033	if(zeros_left <= 0)
5034	run_before = 0;
5035	else if(zeros_left < 7){
5036	run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
5037	}else{
5038	run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
5039	}
5040	zeros_left -= run_before;
5041	coeff_num -= 1 + run_before;
5042	j= scantable[ coeff_num ];
5043
5044	block[j]= level[i];
5045	}
5046	}else{
5047	block[j] = (level[0] * qmul[j] + 32)>>6;
5048	for(i=1;i<total_coeff;i++) {
5049	if(zeros_left <= 0)
5050	run_before = 0;
5051	else if(zeros_left < 7){
5052	run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
5053	}else{
5054	run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
5055	}
5056	zeros_left -= run_before;
5057	coeff_num -= 1 + run_before;
5058	j= scantable[ coeff_num ];
5059
5060	block[j]= (level[i] * qmul[j] + 32)>>6;
5061	}
5062	}
5063
5064	if(zeros_left<0){
5065	av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
5066	return -1;
5067	}
5068
5069	return 0;
5070	}
5071
5072	static void predict_field_decoding_flag(H264Context *h){
5073	MpegEncContext * const s = &h->s;
5074	const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5075	int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
5076	? s->current_picture.mb_type[mb_xy-1]
5077	: (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
5078	? s->current_picture.mb_type[mb_xy-s->mb_stride]
5079	: 0;
5080	h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
5081	}
5082
5083	/**
5084	* decodes a P_SKIP or B_SKIP macroblock
5085	*/
5086	static void decode_mb_skip(H264Context *h){
5087	MpegEncContext * const s = &h->s;
5088	const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5089	int mb_type=0;
5090
5091	memset(h->non_zero_count[mb_xy], 0, 16);
5092	memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
5093
5094	if(MB_FIELD)
5095	mb_type\|= MB_TYPE_INTERLACED;
5096
5097	if( h->slice_type == B_TYPE )
5098	{
5099	// just for fill_caches. pred_direct_motion will set the real mb_type
5100	mb_type\|= MB_TYPE_16x16\|MB_TYPE_P0L0\|MB_TYPE_P0L1\|MB_TYPE_DIRECT2\|MB_TYPE_SKIP;
5101
5102	fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
5103	pred_direct_motion(h, &mb_type);
5104	if(h->pps.cabac){
5105	fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5106	fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5107	}
5108	}
5109	else
5110	{
5111	int mx, my;
5112	mb_type\|= MB_TYPE_16x16\|MB_TYPE_P0L0\|MB_TYPE_P1L0\|MB_TYPE_SKIP;
5113
5114	fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
5115	pred_pskip_motion(h, &mx, &my);
5116	fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
5117	fill_rectangle( h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
5118	if(h->pps.cabac)
5119	fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5120	}
5121
5122	write_back_motion(h, mb_type);
5123	s->current_picture.mb_type[mb_xy]= mb_type\|MB_TYPE_SKIP;
5124	s->current_picture.qscale_table[mb_xy]= s->qscale;
5125	h->slice_table[ mb_xy ]= h->slice_num;
5126	h->prev_mb_skipped= 1;
5127	}
5128
5129	/**
5130	* decodes a macroblock
5131	* @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5132	*/
5133	static int decode_mb_cavlc(H264Context *h){
5134	MpegEncContext * const s = &h->s;
5135	const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5136	int mb_type, partition_count, cbp;
5137	int dct8x8_allowed= h->pps.transform_8x8_mode;
5138
5139	s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
5140
5141	tprintf("pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5142	cbp = 0; /* avoid warning. FIXME: find a solution without slowing
5143	down the code */
5144	if(h->slice_type != I_TYPE && h->slice_type != SI_TYPE){
5145	if(s->mb_skip_run==-1)
5146	s->mb_skip_run= get_ue_golomb(&s->gb);
5147
5148	if (s->mb_skip_run--) {
5149	if(FRAME_MBAFF && (s->mb_y&1) == 0){
5150	if(s->mb_skip_run==0)
5151	h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
5152	else
5153	predict_field_decoding_flag(h);
5154	}
5155	decode_mb_skip(h);
5156	return 0;
5157	}
5158	}
5159	if(FRAME_MBAFF){
5160	if( (s->mb_y&1) == 0 )
5161	h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
5162	}else
5163	h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5164
5165	h->prev_mb_skipped= 0;
5166
5167	mb_type= get_ue_golomb(&s->gb);
5168	if(h->slice_type == B_TYPE){
5169	if(mb_type < 23){
5170	partition_count= b_mb_type_info[mb_type].partition_count;
5171	mb_type= b_mb_type_info[mb_type].type;
5172	}else{
5173	mb_type -= 23;
5174	goto decode_intra_mb;
5175	}
5176	}else if(h->slice_type == P_TYPE /\|\| h->slice_type == SP_TYPE /){
5177	if(mb_type < 5){
5178	partition_count= p_mb_type_info[mb_type].partition_count;
5179	mb_type= p_mb_type_info[mb_type].type;
5180	}else{
5181	mb_type -= 5;
5182	goto decode_intra_mb;
5183	}
5184	}else{
5185	assert(h->slice_type == I_TYPE);
5186	decode_intra_mb:
5187	if(mb_type > 25){
5188	av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice to large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
5189	return -1;
5190	}
5191	partition_count=0;
5192	cbp= i_mb_type_info[mb_type].cbp;
5193	h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5194	mb_type= i_mb_type_info[mb_type].type;
5195	}
5196
5197	if(MB_FIELD)
5198	mb_type \|= MB_TYPE_INTERLACED;
5199
5200	h->slice_table[ mb_xy ]= h->slice_num;
5201
5202	if(IS_INTRA_PCM(mb_type)){
5203	unsigned int x, y;
5204
5205	// we assume these blocks are very rare so we dont optimize it
5206	align_get_bits(&s->gb);
5207
5208	// The pixels are stored in the same order as levels in h->mb array.
5209	for(y=0; y<16; y++){
5210	const int index= 4(y&3) + 32((y>>2)&1) + 128*(y>>3);
5211	for(x=0; x<16; x++){
5212	tprintf("LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
5213	h->mb[index + (x&3) + 16((x>>2)&1) + 64(x>>3)]= get_bits(&s->gb, 8);
5214	}
5215	}
5216	for(y=0; y<8; y++){
5217	const int index= 256 + 4(y&3) + 32(y>>2);
5218	for(x=0; x<8; x++){
5219	tprintf("CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
5220	h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
5221	}
5222	}
5223	for(y=0; y<8; y++){
5224	const int index= 256 + 64 + 4(y&3) + 32(y>>2);
5225	for(x=0; x<8; x++){
5226	tprintf("CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
5227	h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
5228	}
5229	}
5230
5231	// In deblocking, the quantizer is 0
5232	s->current_picture.qscale_table[mb_xy]= 0;
5233	h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
5234	// All coeffs are present
5235	memset(h->non_zero_count[mb_xy], 16, 16);
5236
5237	s->current_picture.mb_type[mb_xy]= mb_type;
5238	return 0;
5239	}
5240
5241	if(MB_MBAFF){
5242	h->ref_count[0] <<= 1;
5243	h->ref_count[1] <<= 1;
5244	}
5245
5246	fill_caches(h, mb_type, 0);
5247
5248	//mb_pred
5249	if(IS_INTRA(mb_type)){
5250	// init_top_left_availability(h);
5251	if(IS_INTRA4x4(mb_type)){
5252	int i;
5253	int di = 1;
5254	if(dct8x8_allowed && get_bits1(&s->gb)){
5255	mb_type \|= MB_TYPE_8x8DCT;
5256	di = 4;
5257	}
5258
5259	// fill_intra4x4_pred_table(h);
5260	for(i=0; i<16; i+=di){
5261	int mode= pred_intra_mode(h, i);
5262
5263	if(!get_bits1(&s->gb)){
5264	const int rem_mode= get_bits(&s->gb, 3);
5265	mode = rem_mode + (rem_mode >= mode);
5266	}
5267
5268	if(di==4)
5269	fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5270	else
5271	h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
5272	}
5273	write_back_intra_pred_mode(h);
5274	if( check_intra4x4_pred_mode(h) < 0)
5275	return -1;
5276	}else{
5277	h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
5278	if(h->intra16x16_pred_mode < 0)
5279	return -1;
5280	}
5281	h->chroma_pred_mode= get_ue_golomb(&s->gb);
5282
5283	h->chroma_pred_mode= check_intra_pred_mode(h, h->chroma_pred_mode);
5284	if(h->chroma_pred_mode < 0)
5285	return -1;
5286	}else if(partition_count==4){
5287	int i, j, sub_partition_count[4], list, ref[2][4];
5288
5289	if(h->slice_type == B_TYPE){
5290	for(i=0; i<4; i++){
5291	h->sub_mb_type[i]= get_ue_golomb(&s->gb);
5292	if(h->sub_mb_type[i] >=13){
5293	av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %d out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
5294	return -1;
5295	}
5296	sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5297	h->sub_mb_type[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5298	}
5299	if( IS_DIRECT(h->sub_mb_type[0]) \|\| IS_DIRECT(h->sub_mb_type[1])
5300	\|\| IS_DIRECT(h->sub_mb_type[2]) \|\| IS_DIRECT(h->sub_mb_type[3])) {
5301	pred_direct_motion(h, &mb_type);
5302	h->ref_cache[0][scan8[4]] =
5303	h->ref_cache[1][scan8[4]] =
5304	h->ref_cache[0][scan8[12]] =
5305	h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5306	}
5307	}else{
5308	assert(h->slice_type == P_TYPE \|\| h->slice_type == SP_TYPE); //FIXME SP correct ?
5309	for(i=0; i<4; i++){
5310	h->sub_mb_type[i]= get_ue_golomb(&s->gb);
5311	if(h->sub_mb_type[i] >=4){
5312	av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %d out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
5313	return -1;
5314	}
5315	sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5316	h->sub_mb_type[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5317	}
5318	}
5319
5320	for(list=0; list<2; list++){
5321	int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
5322	if(ref_count == 0) continue;
5323	for(i=0; i<4; i++){
5324	if(IS_DIRECT(h->sub_mb_type[i])) continue;
5325	if(IS_DIR(h->sub_mb_type[i], 0, list)){
5326	ref[list][i] = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
5327	}else{
5328	//FIXME
5329	ref[list][i] = -1;
5330	}
5331	}
5332	}
5333
5334	if(dct8x8_allowed)
5335	dct8x8_allowed = get_dct8x8_allowed(h);
5336
5337	for(list=0; list<2; list++){
5338	const int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
5339	if(ref_count == 0) continue;
5340
5341	for(i=0; i<4; i++){
5342	if(IS_DIRECT(h->sub_mb_type[i])) {
5343	h->ref_cache[list][ scan8[4i] ] = h->ref_cache[list][ scan8[4i]+1 ];
5344	continue;
5345	}
5346	h->ref_cache[list][ scan8[4i] ]=h->ref_cache[list][ scan8[4i]+1 ]=
5347	h->ref_cache[list][ scan8[4i]+8 ]=h->ref_cache[list][ scan8[4i]+9 ]= ref[list][i];
5348
5349	if(IS_DIR(h->sub_mb_type[i], 0, list)){
5350	const int sub_mb_type= h->sub_mb_type[i];
5351	const int block_width= (sub_mb_type & (MB_TYPE_16x16\|MB_TYPE_16x8)) ? 2 : 1;
5352	for(j=0; j<sub_partition_count[i]; j++){
5353	int mx, my;
5354	const int index= 4i + block_widthj;
5355	int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5356	pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
5357	mx += get_se_golomb(&s->gb);
5358	my += get_se_golomb(&s->gb);
5359	tprintf("final mv:%d %d\n", mx, my);
5360
5361	if(IS_SUB_8X8(sub_mb_type)){
5362	mv_cache[ 0 ][0]= mv_cache[ 1 ][0]=
5363	mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5364	mv_cache[ 0 ][1]= mv_cache[ 1 ][1]=
5365	mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5366	}else if(IS_SUB_8X4(sub_mb_type)){
5367	mv_cache[ 0 ][0]= mv_cache[ 1 ][0]= mx;
5368	mv_cache[ 0 ][1]= mv_cache[ 1 ][1]= my;
5369	}else if(IS_SUB_4X8(sub_mb_type)){
5370	mv_cache[ 0 ][0]= mv_cache[ 8 ][0]= mx;
5371	mv_cache[ 0 ][1]= mv_cache[ 8 ][1]= my;
5372	}else{
5373	assert(IS_SUB_4X4(sub_mb_type));
5374	mv_cache[ 0 ][0]= mx;
5375	mv_cache[ 0 ][1]= my;
5376	}
5377	}
5378	}else{
5379	uint32_t p= (uint32_t )&h->mv_cache[list][ scan8[4*i] ][0];
5380	p[0] = p[1]=
5381	p[8] = p[9]= 0;
5382	}
5383	}
5384	}
5385	}else if(IS_DIRECT(mb_type)){
5386	pred_direct_motion(h, &mb_type);
5387	dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5388	}else{
5389	int list, mx, my, i;
5390	//FIXME we should set ref_idx_l? to 0 if we use that later ...
5391	if(IS_16X16(mb_type)){
5392	for(list=0; list<2; list++){
5393	if(h->ref_count[list]>0){
5394	if(IS_DIR(mb_type, 0, list)){
5395	const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
5396	fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
5397	}else
5398	fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (LIST_NOT_USED&0xFF), 1);
5399	}
5400	}
5401	for(list=0; list<2; list++){
5402	if(IS_DIR(mb_type, 0, list)){
5403	pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
5404	mx += get_se_golomb(&s->gb);
5405	my += get_se_golomb(&s->gb);
5406	tprintf("final mv:%d %d\n", mx, my);
5407
5408	fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5409	}else
5410	fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5411	}
5412	}
5413	else if(IS_16X8(mb_type)){
5414	for(list=0; list<2; list++){
5415	if(h->ref_count[list]>0){
5416	for(i=0; i<2; i++){
5417	if(IS_DIR(mb_type, i, list)){
5418	const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
5419	fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
5420	}else
5421	fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5422	}
5423	}
5424	}
5425	for(list=0; list<2; list++){
5426	for(i=0; i<2; i++){
5427	if(IS_DIR(mb_type, i, list)){
5428	pred_16x8_motion(h, 8i, list, h->ref_cache[list][scan8[0] + 16i], &mx, &my);
5429	mx += get_se_golomb(&s->gb);
5430	my += get_se_golomb(&s->gb);
5431	tprintf("final mv:%d %d\n", mx, my);
5432
5433	fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5434	}else
5435	fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5436	}
5437	}
5438	}else{
5439	assert(IS_8X16(mb_type));
5440	for(list=0; list<2; list++){
5441	if(h->ref_count[list]>0){
5442	for(i=0; i<2; i++){
5443	if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5444	const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
5445	fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
5446	}else
5447	fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5448	}
5449	}
5450	}
5451	for(list=0; list<2; list++){
5452	for(i=0; i<2; i++){
5453	if(IS_DIR(mb_type, i, list)){
5454	pred_8x16_motion(h, i4, list, h->ref_cache[list][ scan8[0] + 2i ], &mx, &my);
5455	mx += get_se_golomb(&s->gb);
5456	my += get_se_golomb(&s->gb);
5457	tprintf("final mv:%d %d\n", mx, my);
5458
5459	fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5460	}else
5461	fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5462	}
5463	}
5464	}
5465	}
5466
5467	if(IS_INTER(mb_type))
5468	write_back_motion(h, mb_type);
5469
5470	if(!IS_INTRA16x16(mb_type)){
5471	cbp= get_ue_golomb(&s->gb);
5472	if(cbp > 47){
5473	av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%d) at %d %d\n", cbp, s->mb_x, s->mb_y);
5474	return -1;
5475	}
5476
5477	if(IS_INTRA4x4(mb_type))
5478	cbp= golomb_to_intra4x4_cbp[cbp];
5479	else
5480	cbp= golomb_to_inter_cbp[cbp];
5481	}
5482
5483	if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
5484	if(get_bits1(&s->gb))
5485	mb_type \|= MB_TYPE_8x8DCT;
5486	}
5487	s->current_picture.mb_type[mb_xy]= mb_type;
5488
5489	if(cbp \|\| IS_INTRA16x16(mb_type)){
5490	int i8x8, i4x4, chroma_idx;
5491	int chroma_qp, dquant;
5492	GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
5493	const uint8_t scan, scan8x8, *dc_scan;
5494
5495	// fill_non_zero_count_cache(h);
5496
5497	if(IS_INTERLACED(mb_type)){
5498	scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
5499	scan= s->qscale ? h->field_scan : h->field_scan_q0;
5500	dc_scan= luma_dc_field_scan;
5501	}else{
5502	scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
5503	scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5504	dc_scan= luma_dc_zigzag_scan;
5505	}
5506
5507	dquant= get_se_golomb(&s->gb);
5508
5509	if( dquant > 25 \|\| dquant < -26 ){
5510	av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
5511	return -1;
5512	}
5513
5514	s->qscale += dquant;
5515	if(((unsigned)s->qscale) > 51){
5516	if(s->qscale<0) s->qscale+= 52;
5517	else s->qscale-= 52;
5518	}
5519
5520	h->chroma_qp= chroma_qp= get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
5521	if(IS_INTRA16x16(mb_type)){
5522	if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
5523	return -1; //FIXME continue if partitioned and other return -1 too
5524	}
5525
5526	assert((cbp&15) == 0 \|\| (cbp&15) == 15);
5527
5528	if(cbp&15){
5529	for(i8x8=0; i8x8<4; i8x8++){
5530	for(i4x4=0; i4x4<4; i4x4++){
5531	const int index= i4x4 + 4*i8x8;
5532	if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
5533	return -1;
5534	}
5535	}
5536	}
5537	}else{
5538	fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5539	}
5540	}else{
5541	for(i8x8=0; i8x8<4; i8x8++){
5542	if(cbp & (1<<i8x8)){
5543	if(IS_8x8DCT(mb_type)){
5544	DCTELEM buf = &h->mb[64i8x8];
5545	uint8_t *nnz;
5546	for(i4x4=0; i4x4<4; i4x4++){
5547	if( decode_residual(h, gb, buf, i4x4+4i8x8, scan8x8+16i4x4,
5548	h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
5549	return -1;
5550	}
5551	nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5552	nnz[0] += nnz[1] + nnz[8] + nnz[9];
5553	}else{
5554	for(i4x4=0; i4x4<4; i4x4++){
5555	const int index= i4x4 + 4*i8x8;
5556
5557	if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
5558	return -1;
5559	}
5560	}
5561	}
5562	}else{
5563	uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5564	nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5565	}
5566	}
5567	}
5568
5569	if(cbp&0x30){
5570	for(chroma_idx=0; chroma_idx<2; chroma_idx++)
5571	if( decode_residual(h, gb, h->mb + 256 + 164chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
5572	return -1;
5573	}
5574	}
5575
5576	if(cbp&0x20){
5577	for(chroma_idx=0; chroma_idx<2; chroma_idx++){
5578	for(i4x4=0; i4x4<4; i4x4++){
5579	const int index= 16 + 4*chroma_idx + i4x4;
5580	if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][chroma_qp], 15) < 0){
5581	return -1;
5582	}
5583	}
5584	}
5585	}else{
5586	uint8_t * const nnz= &h->non_zero_count_cache[0];
5587	nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5588	nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5589	}
5590	}else{
5591	uint8_t * const nnz= &h->non_zero_count_cache[0];
5592	fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5593	nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5594	nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5595	}
5596	s->current_picture.qscale_table[mb_xy]= s->qscale;
5597	write_back_non_zero_count(h);
5598
5599	if(MB_MBAFF){
5600	h->ref_count[0] >>= 1;
5601	h->ref_count[1] >>= 1;
5602	}
5603
5604	return 0;
5605	}
5606
5607	static int decode_cabac_field_decoding_flag(H264Context *h) {
5608	MpegEncContext * const s = &h->s;
5609	const int mb_x = s->mb_x;
5610	const int mb_y = s->mb_y & ~1;
5611	const int mba_xy = mb_x - 1 + mb_y *s->mb_stride;
5612	const int mbb_xy = mb_x + (mb_y-2)*s->mb_stride;
5613
5614	unsigned int ctx = 0;
5615
5616	if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
5617	ctx += 1;
5618	}
5619	if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
5620	ctx += 1;
5621	}
5622
5623	return get_cabac( &h->cabac, &h->cabac_state[70 + ctx] );
5624	}
5625
5626	static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
5627	uint8_t *state= &h->cabac_state[ctx_base];
5628	int mb_type;
5629
5630	if(intra_slice){
5631	MpegEncContext * const s = &h->s;
5632	const int mba_xy = h->left_mb_xy[0];
5633	const int mbb_xy = h->top_mb_xy;
5634	int ctx=0;
5635	if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
5636	ctx++;
5637	if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
5638	ctx++;
5639	if( get_cabac( &h->cabac, &state[ctx] ) == 0 )
5640	return 0; /* I4x4 */
5641	state += 2;
5642	}else{
5643	if( get_cabac( &h->cabac, &state[0] ) == 0 )
5644	return 0; /* I4x4 */
5645	}
5646
5647	if( get_cabac_terminate( &h->cabac ) )
5648	return 25; /* PCM */
5649
5650	mb_type = 1; /* I16x16 */
5651	mb_type += 12 * get_cabac( &h->cabac, &state[1] ); /* cbp_luma != 0 */
5652	if( get_cabac( &h->cabac, &state[2] ) ) /* cbp_chroma */
5653	mb_type += 4 + 4 * get_cabac( &h->cabac, &state[2+intra_slice] );
5654	mb_type += 2 * get_cabac( &h->cabac, &state[3+intra_slice] );
5655	mb_type += 1 * get_cabac( &h->cabac, &state[3+2*intra_slice] );
5656	return mb_type;
5657	}
5658
5659	static int decode_cabac_mb_type( H264Context *h ) {
5660	MpegEncContext * const s = &h->s;
5661
5662	if( h->slice_type == I_TYPE ) {
5663	return decode_cabac_intra_mb_type(h, 3, 1);
5664	} else if( h->slice_type == P_TYPE ) {
5665	if( get_cabac( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5666	/* P-type */
5667	if( get_cabac( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5668	/* P_L0_D16x16, P_8x8 */
5669	return 3 * get_cabac( &h->cabac, &h->cabac_state[16] );
5670	} else {
5671	/* P_L0_D8x16, P_L0_D16x8 */
5672	return 2 - get_cabac( &h->cabac, &h->cabac_state[17] );
5673	}
5674	} else {
5675	return decode_cabac_intra_mb_type(h, 17, 0) + 5;
5676	}
5677	} else if( h->slice_type == B_TYPE ) {
5678	const int mba_xy = h->left_mb_xy[0];
5679	const int mbb_xy = h->top_mb_xy;
5680	int ctx = 0;
5681	int bits;
5682
5683	if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
5684	ctx++;
5685	if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
5686	ctx++;
5687
5688	if( !get_cabac( &h->cabac, &h->cabac_state[27+ctx] ) )
5689	return 0; /* B_Direct_16x16 */
5690
5691	if( !get_cabac( &h->cabac, &h->cabac_state[27+3] ) ) {
5692	return 1 + get_cabac( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
5693	}
5694
5695	bits = get_cabac( &h->cabac, &h->cabac_state[27+4] ) << 3;
5696	bits\|= get_cabac( &h->cabac, &h->cabac_state[27+5] ) << 2;
5697	bits\|= get_cabac( &h->cabac, &h->cabac_state[27+5] ) << 1;
5698	bits\|= get_cabac( &h->cabac, &h->cabac_state[27+5] );
5699	if( bits < 8 )
5700	return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5701	else if( bits == 13 ) {
5702	return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5703	} else if( bits == 14 )
5704	return 11; /* B_L1_L0_8x16 */
5705	else if( bits == 15 )
5706	return 22; /* B_8x8 */
5707
5708	bits= ( bits<<1 ) \| get_cabac( &h->cabac, &h->cabac_state[27+5] );
5709	return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5710	} else {
5711	/* TODO SI/SP frames? */
5712	return -1;
5713	}
5714	}
5715
5716	static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5717	MpegEncContext * const s = &h->s;
5718	int mba_xy, mbb_xy;
5719	int ctx = 0;
5720
5721	if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5722	int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5723	mba_xy = mb_xy - 1;
5724	if( (mb_y&1)
5725	&& h->slice_table[mba_xy] == h->slice_num
5726	&& MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5727	mba_xy += s->mb_stride;
5728	if( MB_FIELD ){
5729	mbb_xy = mb_xy - s->mb_stride;
5730	if( !(mb_y&1)
5731	&& h->slice_table[mbb_xy] == h->slice_num
5732	&& IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5733	mbb_xy -= s->mb_stride;
5734	}else
5735	mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5736	}else{
5737	int mb_xy = mb_x + mb_y*s->mb_stride;
5738	mba_xy = mb_xy - 1;
5739	mbb_xy = mb_xy - s->mb_stride;
5740	}
5741
5742	if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5743	ctx++;
5744	if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5745	ctx++;
5746
5747	if( h->slice_type == B_TYPE )
5748	ctx += 13;
5749	return get_cabac( &h->cabac, &h->cabac_state[11+ctx] );
5750	}
5751
5752	static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5753	int mode = 0;
5754
5755	if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5756	return pred_mode;
5757
5758	mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5759	mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5760	mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5761
5762	if( mode >= pred_mode )
5763	return mode + 1;
5764	else
5765	return mode;
5766	}
5767
5768	static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5769	const int mba_xy = h->left_mb_xy[0];
5770	const int mbb_xy = h->top_mb_xy;
5771
5772	int ctx = 0;
5773
5774	/* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5775	if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5776	ctx++;
5777
5778	if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5779	ctx++;
5780
5781	if( get_cabac( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5782	return 0;
5783
5784	if( get_cabac( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5785	return 1;
5786	if( get_cabac( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5787	return 2;
5788	else
5789	return 3;
5790	}
5791
/* x coordinate (0..3) of each of the 16 block indices */
static const uint8_t block_idx_x[16] = {
    0, 1, 0, 1,
    2, 3, 2, 3,
    0, 1, 0, 1,
    2, 3, 2, 3
};

/* y coordinate (0..3) of each of the 16 block indices */
static const uint8_t block_idx_y[16] = {
    0, 0, 1, 1,
    0, 0, 1, 1,
    2, 2, 3, 3,
    2, 2, 3, 3
};

/* inverse mapping: block index stored at [x][y] */
static const uint8_t block_idx_xy[4][4] = {
    { 0, 2,  8, 10 },
    { 1, 3,  9, 11 },
    { 4, 6, 12, 14 },
    { 5, 7, 13, 15 }
};
5804
5805	static int decode_cabac_mb_cbp_luma( H264Context *h) {
5806	int cbp = 0;
5807	int cbp_b = -1;
5808	int i8x8;
5809
5810	if( h->slice_table[h->top_mb_xy] == h->slice_num ) {
5811	cbp_b = h->top_cbp;
5812	tprintf("cbp_b = top_cbp = %x\n", cbp_b);
5813	}
5814
5815	for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5816	int cbp_a = -1;
5817	int x, y;
5818	int ctx = 0;
5819
5820	x = block_idx_x[4*i8x8];
5821	y = block_idx_y[4*i8x8];
5822
5823	if( x > 0 )
5824	cbp_a = cbp;
5825	else if( h->slice_table[h->left_mb_xy[0]] == h->slice_num ) {
5826	cbp_a = h->left_cbp;
5827	tprintf("cbp_a = left_cbp = %x\n", cbp_a);
5828	}
5829
5830	if( y > 0 )
5831	cbp_b = cbp;
5832
5833	/* No need to test for skip as we put 0 for skip block */
5834	/* No need to test for IPCM as we put 1 for IPCM block */
5835	if( cbp_a >= 0 ) {
5836	int i8x8a = block_idx_xy[(x-1)&0x03][y]/4;
5837	if( ((cbp_a >> i8x8a)&0x01) == 0 )
5838	ctx++;
5839	}
5840
5841	if( cbp_b >= 0 ) {
5842	int i8x8b = block_idx_xy[x][(y-1)&0x03]/4;
5843	if( ((cbp_b >> i8x8b)&0x01) == 0 )
5844	ctx += 2;
5845	}
5846
5847	if( get_cabac( &h->cabac, &h->cabac_state[73 + ctx] ) ) {
5848	cbp \|= 1 << i8x8;
5849	}
5850	}
5851	return cbp;
5852	}
5853	static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5854	int ctx;
5855	int cbp_a, cbp_b;
5856
5857	cbp_a = (h->left_cbp>>4)&0x03;
5858	cbp_b = (h-> top_cbp>>4)&0x03;
5859
5860	ctx = 0;
5861	if( cbp_a > 0 ) ctx++;
5862	if( cbp_b > 0 ) ctx += 2;
5863	if( get_cabac( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5864	return 0;
5865
5866	ctx = 4;
5867	if( cbp_a == 2 ) ctx++;
5868	if( cbp_b == 2 ) ctx += 2;
5869	return 1 + get_cabac( &h->cabac, &h->cabac_state[77 + ctx] );
5870	}
5871	static int decode_cabac_mb_dqp( H264Context *h) {
5872	MpegEncContext * const s = &h->s;
5873	int mbn_xy;
5874	int ctx = 0;
5875	int val = 0;
5876
5877	if( s->mb_x > 0 )
5878	mbn_xy = s->mb_x + s->mb_y*s->mb_stride - 1;
5879	else
5880	mbn_xy = s->mb_width - 1 + (s->mb_y-1)*s->mb_stride;
5881
5882	if( h->last_qscale_diff != 0 )
5883	ctx++;
5884
5885	while( get_cabac( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5886	if( ctx < 2 )
5887	ctx = 2;
5888	else
5889	ctx = 3;
5890	val++;
5891	if(val > 102) //prevent infinite loop
5892	return INT_MIN;
5893	}
5894
5895	if( val&0x01 )
5896	return (val + 1)/2;
5897	else
5898	return -(val + 1)/2;
5899	}
5900	static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5901	if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5902	return 0; /* 8x8 */
5903	if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5904	return 1; /* 8x4 */
5905	if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5906	return 2; /* 4x8 */
5907	return 3; /* 4x4 */
5908	}
5909	static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5910	int type;
5911	if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5912	return 0; /* B_Direct_8x8 */
5913	if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5914	return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5915	type = 3;
5916	if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5917	if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5918	return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5919	type += 4;
5920	}
5921	type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5922	type += get_cabac( &h->cabac, &h->cabac_state[39] );
5923	return type;
5924	}
5925
5926	static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5927	return get_cabac( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5928	}
5929
5930	static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5931	int refa = h->ref_cache[list][scan8[n] - 1];
5932	int refb = h->ref_cache[list][scan8[n] - 8];
5933	int ref = 0;
5934	int ctx = 0;
5935
5936	if( h->slice_type == B_TYPE) {
5937	if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5938	ctx++;
5939	if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5940	ctx += 2;
5941	} else {
5942	if( refa > 0 )
5943	ctx++;
5944	if( refb > 0 )
5945	ctx += 2;
5946	}
5947
5948	while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5949	ref++;
5950	if( ctx < 4 )
5951	ctx = 4;
5952	else
5953	ctx = 5;
5954	}
5955	return ref;
5956	}
5957
5958	static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5959	int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5960	abs( h->mvd_cache[list][scan8[n] - 8][l] );
5961	int ctxbase = (l == 0) ? 40 : 47;
5962	int ctx, mvd;
5963
5964	if( amvd < 3 )
5965	ctx = 0;
5966	else if( amvd > 32 )
5967	ctx = 2;
5968	else
5969	ctx = 1;
5970
5971	if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5972	return 0;
5973
5974	mvd= 1;
5975	ctx= 3;
5976	while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5977	mvd++;
5978	if( ctx < 6 )
5979	ctx++;
5980	}
5981
5982	if( mvd >= 9 ) {
5983	int k = 3;
5984	while( get_cabac_bypass( &h->cabac ) ) {
5985	mvd += 1 << k;
5986	k++;
5987	}
5988	while( k-- ) {
5989	if( get_cabac_bypass( &h->cabac ) )
5990	mvd += 1 << k;
5991	}
5992	}
5993	if( get_cabac_bypass( &h->cabac ) ) return -mvd;
5994	else return mvd;
5995	}
5996
5997	static int inline get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
5998	int nza, nzb;
5999	int ctx = 0;
6000
6001	if( cat == 0 ) {
6002	nza = h->left_cbp&0x100;
6003	nzb = h-> top_cbp&0x100;
6004	} else if( cat == 1 \|\| cat == 2 ) {
6005	nza = h->non_zero_count_cache[scan8[idx] - 1];
6006	nzb = h->non_zero_count_cache[scan8[idx] - 8];
6007	} else if( cat == 3 ) {
6008	nza = (h->left_cbp>>(6+idx))&0x01;
6009	nzb = (h-> top_cbp>>(6+idx))&0x01;
6010	} else {
6011	assert(cat == 4);
6012	nza = h->non_zero_count_cache[scan8[16+idx] - 1];
6013	nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
6014	}
6015
6016	if( nza > 0 )
6017	ctx++;
6018
6019	if( nzb > 0 )
6020	ctx += 2;
6021
6022	return ctx + 4 * cat;
6023	}
6024
6025	static int decode_cabac_residual( H264Context h, DCTELEM block, int cat, int n, const uint8_t scantable, const uint32_t qmul, int max_coeff) {
6026	const int mb_xy = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
6027	static const int significant_coeff_flag_offset[2][6] = {
6028	{ 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
6029	{ 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
6030	};
6031	static const int last_coeff_flag_offset[2][6] = {
6032	{ 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
6033	{ 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
6034	};
6035	static const int coeff_abs_level_m1_offset[6] = {
6036	227+0, 227+10, 227+20, 227+30, 227+39, 426
6037	};
6038	static const int significant_coeff_flag_offset_8x8[2][63] = {
6039	{ 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
6040	4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
6041	7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
6042	12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
6043	{ 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
6044	6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
6045	9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
6046	9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
6047	};
6048	static const int last_coeff_flag_offset_8x8[63] = {
6049	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6050	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
6051	3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
6052	5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
6053	};
6054
6055	int index[64];
6056
6057	int i, last;
6058	int coeff_count = 0;
6059
6060	int abslevel1 = 1;
6061	int abslevelgt1 = 0;
6062
6063	uint8_t *significant_coeff_ctx_base;
6064	uint8_t *last_coeff_ctx_base;
6065	uint8_t *abs_level_m1_ctx_base;
6066
6067	/* cat: 0-> DC 16x16 n = 0
6068	* 1-> AC 16x16 n = luma4x4idx
6069	* 2-> Luma4x4 n = luma4x4idx
6070	* 3-> DC Chroma n = iCbCr
6071	* 4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
6072	* 5-> Luma8x8 n = 4 * luma8x8idx
6073	*/
6074
6075	/* read coded block flag */
6076	if( cat != 5 ) {
6077	if( get_cabac( &h->cabac, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
6078	if( cat == 1 \|\| cat == 2 )
6079	h->non_zero_count_cache[scan8[n]] = 0;
6080	else if( cat == 4 )
6081	h->non_zero_count_cache[scan8[16+n]] = 0;
6082
6083	return 0;
6084	}
6085	}
6086
6087	significant_coeff_ctx_base = h->cabac_state
6088	+ significant_coeff_flag_offset[MB_FIELD][cat];
6089	last_coeff_ctx_base = h->cabac_state
6090	+ last_coeff_flag_offset[MB_FIELD][cat];
6091	abs_level_m1_ctx_base = h->cabac_state
6092	+ coeff_abs_level_m1_offset[cat];
6093
6094	if( cat == 5 ) {
6095	#define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
6096	for(last= 0; last < coefs; last++) { \
6097	uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
6098	if( get_cabac( &h->cabac, sig_ctx )) { \
6099	uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
6100	index[coeff_count++] = last; \
6101	if( get_cabac( &h->cabac, last_ctx ) ) { \
6102	last= max_coeff; \
6103	break; \
6104	} \
6105	} \
6106	}
6107	const int *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
6108	DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
6109	} else {
6110	DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
6111	}
6112	if( last == max_coeff -1 ) {
6113	index[coeff_count++] = last;
6114	}
6115	assert(coeff_count > 0);
6116
6117	if( cat == 0 )
6118	h->cbp_table[mb_xy] \|= 0x100;
6119	else if( cat == 1 \|\| cat == 2 )
6120	h->non_zero_count_cache[scan8[n]] = coeff_count;
6121	else if( cat == 3 )
6122	h->cbp_table[mb_xy] \|= 0x40 << n;
6123	else if( cat == 4 )
6124	h->non_zero_count_cache[scan8[16+n]] = coeff_count;
6125	else {
6126	assert( cat == 5 );
6127	fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
6128	}
6129
6130	for( i = coeff_count - 1; i >= 0; i-- ) {
6131	uint8_t *ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + abs_level_m1_ctx_base;
6132	int j= scantable[index[i]];
6133
6134	if( get_cabac( &h->cabac, ctx ) == 0 ) {
6135	if( !qmul ) {
6136	if( get_cabac_bypass( &h->cabac ) ) block[j] = -1;
6137	else block[j] = 1;
6138	}else{
6139	if( get_cabac_bypass( &h->cabac ) ) block[j] = (-qmul[j] + 32) >> 6;
6140	else block[j] = ( qmul[j] + 32) >> 6;
6141	}
6142
6143	abslevel1++;
6144	} else {
6145	int coeff_abs = 2;
6146	ctx = 5 + FFMIN( 4, abslevelgt1 ) + abs_level_m1_ctx_base;
6147	while( coeff_abs < 15 && get_cabac( &h->cabac, ctx ) ) {
6148	coeff_abs++;
6149	}
6150
6151	if( coeff_abs >= 15 ) {
6152	int j = 0;
6153	while( get_cabac_bypass( &h->cabac ) ) {
6154	coeff_abs += 1 << j;
6155	j++;
6156	}
6157
6158	while( j-- ) {
6159	if( get_cabac_bypass( &h->cabac ) )
6160	coeff_abs += 1 << j ;
6161	}
6162	}
6163
6164	if( !qmul ) {
6165	if( get_cabac_bypass( &h->cabac ) ) block[j] = -coeff_abs;
6166	else block[j] = coeff_abs;
6167	}else{
6168	if( get_cabac_bypass( &h->cabac ) ) block[j] = (-coeff_abs * qmul[j] + 32) >> 6;
6169	else block[j] = ( coeff_abs * qmul[j] + 32) >> 6;
6170	}
6171
6172	abslevelgt1++;
6173	}
6174	}
6175	return 0;
6176	}
6177
6178	static void inline compute_mb_neighbors(H264Context *h)
6179	{
6180	MpegEncContext * const s = &h->s;
6181	const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
6182	h->top_mb_xy = mb_xy - s->mb_stride;
6183	h->left_mb_xy[0] = mb_xy - 1;
6184	if(FRAME_MBAFF){
6185	const int pair_xy = s->mb_x + (s->mb_y & ~1)*s->mb_stride;
6186	const int top_pair_xy = pair_xy - s->mb_stride;
6187	const int top_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
6188	const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
6189	const int curr_mb_frame_flag = !MB_FIELD;
6190	const int bottom = (s->mb_y & 1);
6191	if (bottom
6192	? !curr_mb_frame_flag // bottom macroblock
6193	: (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
6194	) {
6195	h->top_mb_xy -= s->mb_stride;
6196	}
6197	if (left_mb_frame_flag != curr_mb_frame_flag) {
6198	h->left_mb_xy[0] = pair_xy - 1;
6199	}
6200	}
6201	return;
6202	}
6203
6204	/**
6205	* decodes a macroblock
6206	* @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
6207	*/
6208	static int decode_mb_cabac(H264Context *h) {
6209	MpegEncContext * const s = &h->s;
6210	const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
6211	int mb_type, partition_count, cbp = 0;
6212	int dct8x8_allowed= h->pps.transform_8x8_mode;
6213
6214	s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
6215
6216	tprintf("pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
6217	if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE ) {
6218	int skip;
6219	/* a skipped mb needs the aff flag from the following mb */
6220	if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
6221	predict_field_decoding_flag(h);
6222	if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
6223	skip = h->next_mb_skipped;
6224	else
6225	skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
6226	/* read skip flags */
6227	if( skip ) {
6228	if( FRAME_MBAFF && (s->mb_y&1)==0 ){
6229	s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
6230	h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
6231	if(h->next_mb_skipped)
6232	predict_field_decoding_flag(h);
6233	else
6234	h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
6235	}
6236
6237	decode_mb_skip(h);
6238
6239	h->cbp_table[mb_xy] = 0;
6240	h->chroma_pred_mode_table[mb_xy] = 0;
6241	h->last_qscale_diff = 0;
6242
6243	return 0;
6244
6245	}
6246	}
6247	if(FRAME_MBAFF){
6248	if( (s->mb_y&1) == 0 )
6249	h->mb_mbaff =
6250	h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
6251	}else
6252	h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
6253
6254	h->prev_mb_skipped = 0;
6255
6256	compute_mb_neighbors(h);
6257	if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
6258	av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
6259	return -1;
6260	}
6261
6262	if( h->slice_type == B_TYPE ) {
6263	if( mb_type < 23 ){
6264	partition_count= b_mb_type_info[mb_type].partition_count;
6265	mb_type= b_mb_type_info[mb_type].type;
6266	}else{
6267	mb_type -= 23;
6268	goto decode_intra_mb;
6269	}
6270	} else if( h->slice_type == P_TYPE ) {
6271	if( mb_type < 5) {
6272	partition_count= p_mb_type_info[mb_type].partition_count;
6273	mb_type= p_mb_type_info[mb_type].type;
6274	} else {
6275	mb_type -= 5;
6276	goto decode_intra_mb;
6277	}
6278	} else {
6279	assert(h->slice_type == I_TYPE);
6280	decode_intra_mb:
6281	partition_count = 0;
6282	cbp= i_mb_type_info[mb_type].cbp;
6283	h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
6284	mb_type= i_mb_type_info[mb_type].type;
6285	}
6286	if(MB_FIELD)
6287	mb_type \|= MB_TYPE_INTERLACED;
6288
6289	h->slice_table[ mb_xy ]= h->slice_num;
6290
6291	if(IS_INTRA_PCM(mb_type)) {
6292	const uint8_t *ptr;
6293	unsigned int x, y;
6294
6295	// We assume these blocks are very rare so we dont optimize it.
6296	// FIXME The two following lines get the bitstream position in the cabac
6297	// decode, I think it should be done by a function in cabac.h (or cabac.c).
6298	ptr= h->cabac.bytestream;
6299	if (h->cabac.low&0x1) ptr-=CABAC_BITS/8;
6300
6301	// The pixels are stored in the same order as levels in h->mb array.
6302	for(y=0; y<16; y++){
6303	const int index= 4(y&3) + 32((y>>2)&1) + 128*(y>>3);
6304	for(x=0; x<16; x++){
6305	tprintf("LUMA ICPM LEVEL (%3d)\n", *ptr);
6306	h->mb[index + (x&3) + 16((x>>2)&1) + 64(x>>3)]= *ptr++;
6307	}
6308	}
6309	for(y=0; y<8; y++){
6310	const int index= 256 + 4(y&3) + 32(y>>2);
6311	for(x=0; x<8; x++){
6312	tprintf("CHROMA U ICPM LEVEL (%3d)\n", *ptr);
6313	h->mb[index + (x&3) + 16(x>>2)]= ptr++;
6314	}
6315	}
6316	for(y=0; y<8; y++){
6317	const int index= 256 + 64 + 4(y&3) + 32(y>>2);
6318	for(x=0; x<8; x++){
6319	tprintf("CHROMA V ICPM LEVEL (%3d)\n", *ptr);
6320	h->mb[index + (x&3) + 16(x>>2)]= ptr++;
6321	}
6322	}
6323
6324	ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
6325
6326	// All blocks are present
6327	h->cbp_table[mb_xy] = 0x1ef;
6328	h->chroma_pred_mode_table[mb_xy] = 0;
6329	// In deblocking, the quantizer is 0
6330	s->current_picture.qscale_table[mb_xy]= 0;
6331	h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
6332	// All coeffs are present
6333	memset(h->non_zero_count[mb_xy], 16, 16);
6334	s->current_picture.mb_type[mb_xy]= mb_type;
6335	return 0;
6336	}
6337
6338	if(MB_MBAFF){
6339	h->ref_count[0] <<= 1;
6340	h->ref_count[1] <<= 1;
6341	}
6342
6343	fill_caches(h, mb_type, 0);
6344
6345	if( IS_INTRA( mb_type ) ) {
6346	int i;
6347	if( IS_INTRA4x4( mb_type ) ) {
6348	if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
6349	mb_type \|= MB_TYPE_8x8DCT;
6350	for( i = 0; i < 16; i+=4 ) {
6351	int pred = pred_intra_mode( h, i );
6352	int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
6353	fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
6354	}
6355	} else {
6356	for( i = 0; i < 16; i++ ) {
6357	int pred = pred_intra_mode( h, i );
6358	h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
6359
6360	//av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
6361	}
6362	}
6363	write_back_intra_pred_mode(h);
6364	if( check_intra4x4_pred_mode(h) < 0 ) return -1;
6365	} else {
6366	h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
6367	if( h->intra16x16_pred_mode < 0 ) return -1;
6368	}
6369	h->chroma_pred_mode_table[mb_xy] =
6370	h->chroma_pred_mode = decode_cabac_mb_chroma_pre_mode( h );
6371
6372	h->chroma_pred_mode= check_intra_pred_mode( h, h->chroma_pred_mode );
6373	if( h->chroma_pred_mode < 0 ) return -1;
6374	} else if( partition_count == 4 ) {
6375	int i, j, sub_partition_count[4], list, ref[2][4];
6376
6377	if( h->slice_type == B_TYPE ) {
6378	for( i = 0; i < 4; i++ ) {
6379	h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
6380	sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
6381	h->sub_mb_type[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
6382	}
6383	if( IS_DIRECT(h->sub_mb_type[0]) \|\| IS_DIRECT(h->sub_mb_type[1])
6384	\|\| IS_DIRECT(h->sub_mb_type[2]) \|\| IS_DIRECT(h->sub_mb_type[3])) {
6385	pred_direct_motion(h, &mb_type);
6386	if( h->ref_count[0] > 1 \|\| h->ref_count[1] > 1 ) {
6387	for( i = 0; i < 4; i++ )
6388	if( IS_DIRECT(h->sub_mb_type[i]) )
6389	fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
6390	}
6391	}
6392	} else {
6393	for( i = 0; i < 4; i++ ) {
6394	h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
6395	sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
6396	h->sub_mb_type[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
6397	}
6398	}
6399
6400	for( list = 0; list < 2; list++ ) {
6401	if( h->ref_count[list] > 0 ) {
6402	for( i = 0; i < 4; i++ ) {
6403	if(IS_DIRECT(h->sub_mb_type[i])) continue;
6404	if(IS_DIR(h->sub_mb_type[i], 0, list)){
6405	if( h->ref_count[list] > 1 )
6406	ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
6407	else
6408	ref[list][i] = 0;
6409	} else {
6410	ref[list][i] = -1;
6411	}
6412	h->ref_cache[list][ scan8[4*i]+1 ]=
6413	h->ref_cache[list][ scan8[4i]+8 ]=h->ref_cache[list][ scan8[4i]+9 ]= ref[list][i];
6414	}
6415	}
6416	}
6417
6418	if(dct8x8_allowed)
6419	dct8x8_allowed = get_dct8x8_allowed(h);
6420
6421	for(list=0; list<2; list++){
6422	for(i=0; i<4; i++){
6423	if(IS_DIRECT(h->sub_mb_type[i])){
6424	fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
6425	continue;
6426	}
6427	h->ref_cache[list][ scan8[4i] ]=h->ref_cache[list][ scan8[4i]+1 ];
6428
6429	if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
6430	const int sub_mb_type= h->sub_mb_type[i];
6431	const int block_width= (sub_mb_type & (MB_TYPE_16x16\|MB_TYPE_16x8)) ? 2 : 1;
6432	for(j=0; j<sub_partition_count[i]; j++){
6433	int mpx, mpy;
6434	int mx, my;
6435	const int index= 4i + block_widthj;
6436	int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
6437	int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
6438	pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
6439
6440	mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
6441	my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
6442	tprintf("final mv:%d %d\n", mx, my);
6443
6444	if(IS_SUB_8X8(sub_mb_type)){
6445	mv_cache[ 0 ][0]= mv_cache[ 1 ][0]=
6446	mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
6447	mv_cache[ 0 ][1]= mv_cache[ 1 ][1]=
6448	mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
6449
6450	mvd_cache[ 0 ][0]= mvd_cache[ 1 ][0]=
6451	mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
6452	mvd_cache[ 0 ][1]= mvd_cache[ 1 ][1]=
6453	mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
6454	}else if(IS_SUB_8X4(sub_mb_type)){
6455	mv_cache[ 0 ][0]= mv_cache[ 1 ][0]= mx;
6456	mv_cache[ 0 ][1]= mv_cache[ 1 ][1]= my;
6457
6458	mvd_cache[ 0 ][0]= mvd_cache[ 1 ][0]= mx- mpx;
6459	mvd_cache[ 0 ][1]= mvd_cache[ 1 ][1]= my - mpy;
6460	}else if(IS_SUB_4X8(sub_mb_type)){
6461	mv_cache[ 0 ][0]= mv_cache[ 8 ][0]= mx;
6462	mv_cache[ 0 ][1]= mv_cache[ 8 ][1]= my;
6463
6464	mvd_cache[ 0 ][0]= mvd_cache[ 8 ][0]= mx - mpx;
6465	mvd_cache[ 0 ][1]= mvd_cache[ 8 ][1]= my - mpy;
6466	}else{
6467	assert(IS_SUB_4X4(sub_mb_type));
6468	mv_cache[ 0 ][0]= mx;
6469	mv_cache[ 0 ][1]= my;
6470
6471	mvd_cache[ 0 ][0]= mx - mpx;
6472	mvd_cache[ 0 ][1]= my - mpy;
6473	}
6474	}
6475	}else{
6476	uint32_t p= (uint32_t )&h->mv_cache[list][ scan8[4*i] ][0];
6477	uint32_t pd= (uint32_t )&h->mvd_cache[list][ scan8[4*i] ][0];
6478	p[0] = p[1] = p[8] = p[9] = 0;
6479	pd[0]= pd[1]= pd[8]= pd[9]= 0;
6480	}
6481	}
6482	}
6483	} else if( IS_DIRECT(mb_type) ) {
6484	pred_direct_motion(h, &mb_type);
6485	fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
6486	fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
6487	dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
6488	} else {
6489	int list, mx, my, i, mpx, mpy;
6490	if(IS_16X16(mb_type)){
6491	for(list=0; list<2; list++){
6492	if(IS_DIR(mb_type, 0, list)){
6493	if(h->ref_count[list] > 0 ){
6494	const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
6495	fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
6496	}
6497	}else
6498	fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
6499	}
6500	for(list=0; list<2; list++){
6501	if(IS_DIR(mb_type, 0, list)){
6502	pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
6503
6504	mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
6505	my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
6506	tprintf("final mv:%d %d\n", mx, my);
6507
6508	fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
6509	fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
6510	}else
6511	fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
6512	}
6513	}
6514	else if(IS_16X8(mb_type)){
6515	for(list=0; list<2; list++){
6516	if(h->ref_count[list]>0){
6517	for(i=0; i<2; i++){
6518	if(IS_DIR(mb_type, i, list)){
6519	const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
6520	fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
6521	}else
6522	fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
6523	}
6524	}
6525	}
6526	for(list=0; list<2; list++){
6527	for(i=0; i<2; i++){
6528	if(IS_DIR(mb_type, i, list)){
6529	pred_16x8_motion(h, 8i, list, h->ref_cache[list][scan8[0] + 16i], &mpx, &mpy);
6530	mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
6531	my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
6532	tprintf("final mv:%d %d\n", mx, my);
6533
6534	fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
6535	fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
6536	}else{
6537	fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
6538	fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
6539	}
6540	}
6541	}
6542	}else{
6543	assert(IS_8X16(mb_type));
6544	for(list=0; list<2; list++){
6545	if(h->ref_count[list]>0){
6546	for(i=0; i<2; i++){
6547	if(IS_DIR(mb_type, i, list)){ //FIXME optimize
6548	const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
6549	fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
6550	}else
6551	fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
6552	}
6553	}
6554	}
6555	for(list=0; list<2; list++){
6556	for(i=0; i<2; i++){
6557	if(IS_DIR(mb_type, i, list)){
6558	pred_8x16_motion(h, i4, list, h->ref_cache[list][ scan8[0] + 2i ], &mpx, &mpy);
6559	mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
6560	my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
6561
6562	tprintf("final mv:%d %d\n", mx, my);
6563	fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
6564	fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
6565	}else{
6566	fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6567	fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6568	}
6569	}
6570	}
6571	}
6572	}
6573
6574	if( IS_INTER( mb_type ) ) {
6575	h->chroma_pred_mode_table[mb_xy] = 0;
6576	write_back_motion( h, mb_type );
6577	}
6578
6579	if( !IS_INTRA16x16( mb_type ) ) {
6580	cbp = decode_cabac_mb_cbp_luma( h );
6581	cbp \|= decode_cabac_mb_cbp_chroma( h ) << 4;
6582	}
6583
6584	h->cbp_table[mb_xy] = cbp;
6585
6586	if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
6587	if( decode_cabac_mb_transform_size( h ) )
6588	mb_type \|= MB_TYPE_8x8DCT;
6589	}
6590	s->current_picture.mb_type[mb_xy]= mb_type;
6591
6592	if( cbp \|\| IS_INTRA16x16( mb_type ) ) {
6593	const uint8_t scan, scan8x8, *dc_scan;
6594	int dqp;
6595
6596	if(IS_INTERLACED(mb_type)){
6597	scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
6598	scan= s->qscale ? h->field_scan : h->field_scan_q0;
6599	dc_scan= luma_dc_field_scan;
6600	}else{
6601	scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
6602	scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
6603	dc_scan= luma_dc_zigzag_scan;
6604	}
6605
6606	h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
6607	if( dqp == INT_MIN ){
6608	av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
6609	return -1;
6610	}
6611	s->qscale += dqp;
6612	if(((unsigned)s->qscale) > 51){
6613	if(s->qscale<0) s->qscale+= 52;
6614	else s->qscale-= 52;
6615	}
6616	h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
6617
6618	if( IS_INTRA16x16( mb_type ) ) {
6619	int i;
6620	//av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
6621	if( decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16) < 0)
6622	return -1;
6623	if( cbp&15 ) {
6624	for( i = 0; i < 16; i++ ) {
6625	//av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
6626	if( decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 )
6627	return -1;
6628	}
6629	} else {
6630	fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
6631	}
6632	} else {
6633	int i8x8, i4x4;
6634	for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
6635	if( cbp & (1<<i8x8) ) {
6636	if( IS_8x8DCT(mb_type) ) {
6637	if( decode_cabac_residual(h, h->mb + 64i8x8, 5, 4i8x8,
6638	scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64) < 0 )
6639	return -1;
6640	} else
6641	for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
6642	const int index = 4*i8x8 + i4x4;
6643	//av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
6644	if( decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) < 0 )
6645	return -1;
6646	}
6647	} else {
6648	uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
6649	nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
6650	}
6651	}
6652	}
6653
6654	if( cbp&0x30 ){
6655	int c;
6656	for( c = 0; c < 2; c++ ) {
6657	//av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
6658	if( decode_cabac_residual(h, h->mb + 256 + 164c, 3, c, chroma_dc_scan, NULL, 4) < 0)
6659	return -1;
6660	}
6661	}
6662
6663	if( cbp&0x20 ) {
6664	int c, i;
6665	for( c = 0; c < 2; c++ ) {
6666	for( i = 0; i < 4; i++ ) {
6667	const int index = 16 + 4 * c + i;
6668	//av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
6669	if( decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp], 15) < 0)
6670	return -1;
6671	}
6672	}
6673	} else {
6674	uint8_t * const nnz= &h->non_zero_count_cache[0];
6675	nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6676	nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6677	}
6678	} else {
6679	uint8_t * const nnz= &h->non_zero_count_cache[0];
6680	fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
6681	nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6682	nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6683	h->last_qscale_diff = 0;
6684	}
6685
6686	s->current_picture.qscale_table[mb_xy]= s->qscale;
6687	write_back_non_zero_count(h);
6688
6689	if(MB_MBAFF){
6690	h->ref_count[0] >>= 1;
6691	h->ref_count[1] >>= 1;
6692	}
6693
6694	return 0;
6695	}
6696
6697
6698	static void filter_mb_edgev( H264Context h, uint8_t pix, int stride, int bS[4], int qp ) {
6699	int i, d;
6700	const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
6701	const int alpha = alpha_table[index_a];
6702	const int beta = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
6703
6704	if( bS[0] < 4 ) {
6705	int8_t tc[4];
6706	for(i=0; i<4; i++)
6707	tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] : -1;
6708	h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6709	} else {
6710	/* 16px edge length, because bS=4 is triggered by being at
6711	* the edge of an intra MB, so all 4 bS are the same */
6712	for( d = 0; d < 16; d++ ) {
6713	const int p0 = pix[-1];
6714	const int p1 = pix[-2];
6715	const int p2 = pix[-3];
6716
6717	const int q0 = pix[0];
6718	const int q1 = pix[1];
6719	const int q2 = pix[2];
6720
6721	if( ABS( p0 - q0 ) < alpha &&
6722	ABS( p1 - p0 ) < beta &&
6723	ABS( q1 - q0 ) < beta ) {
6724
6725	if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6726	if( ABS( p2 - p0 ) < beta)
6727	{
6728	const int p3 = pix[-4];
6729	/* p0', p1', p2' */
6730	pix[-1] = ( p2 + 2p1 + 2p0 + 2*q0 + q1 + 4 ) >> 3;
6731	pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6732	pix[-3] = ( 2p3 + 3p2 + p1 + p0 + q0 + 4 ) >> 3;
6733	} else {
6734	/* p0' */
6735	pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6736	}
6737	if( ABS( q2 - q0 ) < beta)
6738	{
6739	const int q3 = pix[3];
6740	/* q0', q1', q2' */
6741	pix[0] = ( p1 + 2p0 + 2q0 + 2*q1 + q2 + 4 ) >> 3;
6742	pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6743	pix[2] = ( 2q3 + 3q2 + q1 + q0 + p0 + 4 ) >> 3;
6744	} else {
6745	/* q0' */
6746	pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6747	}
6748	}else{
6749	/* p0', q0' */
6750	pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6751	pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6752	}
6753	tprintf("filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
6754	}
6755	pix += stride;
6756	}
6757	}
6758	}
6759	static void filter_mb_edgecv( H264Context h, uint8_t pix, int stride, int bS[4], int qp ) {
6760	int i;
6761	const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
6762	const int alpha = alpha_table[index_a];
6763	const int beta = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
6764
6765	if( bS[0] < 4 ) {
6766	int8_t tc[4];
6767	for(i=0; i<4; i++)
6768	tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] + 1 : 0;
6769	h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6770	} else {
6771	h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6772	}
6773	}
6774
6775	static void filter_mb_mbaff_edgev( H264Context h, uint8_t pix, int stride, int bS[8], int qp[2] ) {
6776	int i;
6777	for( i = 0; i < 16; i++, pix += stride) {
6778	int index_a;
6779	int alpha;
6780	int beta;
6781
6782	int qp_index;
6783	int bS_index = (i >> 1);
6784	if (!MB_FIELD) {
6785	bS_index &= ~1;
6786	bS_index \|= (i & 1);
6787	}
6788
6789	if( bS[bS_index] == 0 ) {
6790	continue;
6791	}
6792
6793	qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6794	index_a = clip( qp[qp_index] + h->slice_alpha_c0_offset, 0, 51 );
6795	alpha = alpha_table[index_a];
6796	beta = beta_table[clip( qp[qp_index] + h->slice_beta_offset, 0, 51 )];
6797
6798	if( bS[bS_index] < 4 ) {
6799	const int tc0 = tc0_table[index_a][bS[bS_index] - 1];
6800	const int p0 = pix[-1];
6801	const int p1 = pix[-2];
6802	const int p2 = pix[-3];
6803	const int q0 = pix[0];
6804	const int q1 = pix[1];
6805	const int q2 = pix[2];
6806
6807	if( ABS( p0 - q0 ) < alpha &&
6808	ABS( p1 - p0 ) < beta &&
6809	ABS( q1 - q0 ) < beta ) {
6810	int tc = tc0;
6811	int i_delta;
6812
6813	if( ABS( p2 - p0 ) < beta ) {
6814	pix[-2] = p1 + clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6815	tc++;
6816	}
6817	if( ABS( q2 - q0 ) < beta ) {
6818	pix[1] = q1 + clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6819	tc++;
6820	}
6821
6822	i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6823	pix[-1] = clip_uint8( p0 + i_delta ); /* p0' */
6824	pix[0] = clip_uint8( q0 - i_delta ); /* q0' */
6825	tprintf("filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6826	}
6827	}else{
6828	const int p0 = pix[-1];
6829	const int p1 = pix[-2];
6830	const int p2 = pix[-3];
6831
6832	const int q0 = pix[0];
6833	const int q1 = pix[1];
6834	const int q2 = pix[2];
6835
6836	if( ABS( p0 - q0 ) < alpha &&
6837	ABS( p1 - p0 ) < beta &&
6838	ABS( q1 - q0 ) < beta ) {
6839
6840	if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6841	if( ABS( p2 - p0 ) < beta)
6842	{
6843	const int p3 = pix[-4];
6844	/* p0', p1', p2' */
6845	pix[-1] = ( p2 + 2p1 + 2p0 + 2*q0 + q1 + 4 ) >> 3;
6846	pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6847	pix[-3] = ( 2p3 + 3p2 + p1 + p0 + q0 + 4 ) >> 3;
6848	} else {
6849	/* p0' */
6850	pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6851	}
6852	if( ABS( q2 - q0 ) < beta)
6853	{
6854	const int q3 = pix[3];
6855	/* q0', q1', q2' */
6856	pix[0] = ( p1 + 2p0 + 2q0 + 2*q1 + q2 + 4 ) >> 3;
6857	pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6858	pix[2] = ( 2q3 + 3q2 + q1 + q0 + p0 + 4 ) >> 3;
6859	} else {
6860	/* q0' */
6861	pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6862	}
6863	}else{
6864	/* p0', q0' */
6865	pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6866	pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6867	}
6868	tprintf("filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6869	}
6870	}
6871	}
6872	}
6873	static void filter_mb_mbaff_edgecv( H264Context h, uint8_t pix, int stride, int bS[8], int qp[2] ) {
6874	int i;
6875	for( i = 0; i < 8; i++, pix += stride) {
6876	int index_a;
6877	int alpha;
6878	int beta;
6879
6880	int qp_index;
6881	int bS_index = i;
6882
6883	if( bS[bS_index] == 0 ) {
6884	continue;
6885	}
6886
6887	qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6888	index_a = clip( qp[qp_index] + h->slice_alpha_c0_offset, 0, 51 );
6889	alpha = alpha_table[index_a];
6890	beta = beta_table[clip( qp[qp_index] + h->slice_beta_offset, 0, 51 )];
6891
6892	if( bS[bS_index] < 4 ) {
6893	const int tc = tc0_table[index_a][bS[bS_index] - 1] + 1;
6894	const int p0 = pix[-1];
6895	const int p1 = pix[-2];
6896	const int q0 = pix[0];
6897	const int q1 = pix[1];
6898
6899	if( ABS( p0 - q0 ) < alpha &&
6900	ABS( p1 - p0 ) < beta &&
6901	ABS( q1 - q0 ) < beta ) {
6902	const int i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6903
6904	pix[-1] = clip_uint8( p0 + i_delta ); /* p0' */
6905	pix[0] = clip_uint8( q0 - i_delta ); /* q0' */
6906	tprintf("filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6907	}
6908	}else{
6909	const int p0 = pix[-1];
6910	const int p1 = pix[-2];
6911	const int q0 = pix[0];
6912	const int q1 = pix[1];
6913
6914	if( ABS( p0 - q0 ) < alpha &&
6915	ABS( p1 - p0 ) < beta &&
6916	ABS( q1 - q0 ) < beta ) {
6917
6918	pix[-1] = ( 2p1 + p0 + q1 + 2 ) >> 2; / p0' */
6919	pix[0] = ( 2q1 + q0 + p1 + 2 ) >> 2; / q0' */
6920	tprintf("filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6921	}
6922	}
6923	}
6924	}
6925
6926	static void filter_mb_edgeh( H264Context h, uint8_t pix, int stride, int bS[4], int qp ) {
6927	int i, d;
6928	const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
6929	const int alpha = alpha_table[index_a];
6930	const int beta = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
6931	const int pix_next = stride;
6932
6933	if( bS[0] < 4 ) {
6934	int8_t tc[4];
6935	for(i=0; i<4; i++)
6936	tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] : -1;
6937	h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6938	} else {
6939	/* 16px edge length, see filter_mb_edgev */
6940	for( d = 0; d < 16; d++ ) {
6941	const int p0 = pix[-1*pix_next];
6942	const int p1 = pix[-2*pix_next];
6943	const int p2 = pix[-3*pix_next];
6944	const int q0 = pix[0];
6945	const int q1 = pix[1*pix_next];
6946	const int q2 = pix[2*pix_next];
6947
6948	if( ABS( p0 - q0 ) < alpha &&
6949	ABS( p1 - p0 ) < beta &&
6950	ABS( q1 - q0 ) < beta ) {
6951
6952	const int p3 = pix[-4*pix_next];
6953	const int q3 = pix[ 3*pix_next];
6954
6955	if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6956	if( ABS( p2 - p0 ) < beta) {
6957	/* p0', p1', p2' */
6958	pix[-1pix_next] = ( p2 + 2p1 + 2p0 + 2q0 + q1 + 4 ) >> 3;
6959	pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6960	pix[-3pix_next] = ( 2p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6961	} else {
6962	/* p0' */
6963	pix[-1pix_next] = ( 2p1 + p0 + q1 + 2 ) >> 2;
6964	}
6965	if( ABS( q2 - q0 ) < beta) {
6966	/* q0', q1', q2' */
6967	pix[0pix_next] = ( p1 + 2p0 + 2q0 + 2q1 + q2 + 4 ) >> 3;
6968	pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6969	pix[2pix_next] = ( 2q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6970	} else {
6971	/* q0' */
6972	pix[0pix_next] = ( 2q1 + q0 + p1 + 2 ) >> 2;
6973	}
6974	}else{
6975	/* p0', q0' */
6976	pix[-1pix_next] = ( 2p1 + p0 + q1 + 2 ) >> 2;
6977	pix[ 0pix_next] = ( 2q1 + q0 + p1 + 2 ) >> 2;
6978	}
6979	tprintf("filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6980	}
6981	pix++;
6982	}
6983	}
6984	}
6985
6986	static void filter_mb_edgech( H264Context h, uint8_t pix, int stride, int bS[4], int qp ) {
6987	int i;
6988	const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
6989	const int alpha = alpha_table[index_a];
6990	const int beta = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
6991
6992	if( bS[0] < 4 ) {
6993	int8_t tc[4];
6994	for(i=0; i<4; i++)
6995	tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] + 1 : 0;
6996	h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6997	} else {
6998	h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6999	}
7000	}
7001
7002	static void filter_mb( H264Context h, int mb_x, int mb_y, uint8_t img_y, uint8_t img_cb, uint8_t img_cr, unsigned int linesize, unsigned int uvlinesize) {
7003	MpegEncContext * const s = &h->s;
7004	const int mb_xy= mb_x + mb_y*s->mb_stride;
7005	const int mb_type = s->current_picture.mb_type[mb_xy];
7006	const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
7007	int first_vertical_edge_done = 0;
7008	int dir;
7009	/* FIXME: A given frame may occupy more than one position in
7010	* the reference list. So ref2frm should be populated with
7011	* frame numbers, not indices. */
7012	static const int ref2frm[34] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
7013	16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
7014
7015	//for sufficiently low qp, filtering wouldn't do anything
7016	//this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
7017	if(!FRAME_MBAFF){
7018	int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, h->pps.chroma_qp_index_offset);
7019	int qp = s->current_picture.qscale_table[mb_xy];
7020	if(qp <= qp_thresh
7021	&& (mb_x == 0 \|\| ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
7022	&& (mb_y == 0 \|\| ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
7023	return;
7024	}
7025	}
7026
7027	if (FRAME_MBAFF
7028	// left mb is in picture
7029	&& h->slice_table[mb_xy-1] != 255
7030	// and current and left pair do not have the same interlaced type
7031	&& (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
7032	// and left mb is in the same slice if deblocking_filter == 2
7033	&& (h->deblocking_filter!=2 \|\| h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
7034	/* First vertical edge is different in MBAFF frames
7035	* There are 8 different bS to compute and 2 different Qp
7036	*/
7037	const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
7038	const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
7039	int bS[8];
7040	int qp[2];
7041	int chroma_qp[2];
7042	int mb_qp, mbn0_qp, mbn1_qp;
7043	int i;
7044	first_vertical_edge_done = 1;
7045
7046	if( IS_INTRA(mb_type) )
7047	bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
7048	else {
7049	for( i = 0; i < 8; i++ ) {
7050	int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
7051
7052	if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
7053	bS[i] = 4;
7054	else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 \|\|
7055	/* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
7056	h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
7057	bS[i] = 2;
7058	else
7059	bS[i] = 1;
7060	}
7061	}
7062
7063	mb_qp = s->current_picture.qscale_table[mb_xy];
7064	mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
7065	mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
7066	qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
7067	chroma_qp[0] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, mb_qp ) +
7068	get_chroma_qp( h->pps.chroma_qp_index_offset, mbn0_qp ) + 1 ) >> 1;
7069	qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
7070	chroma_qp[1] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, mb_qp ) +
7071	get_chroma_qp( h->pps.chroma_qp_index_offset, mbn1_qp ) + 1 ) >> 1;
7072
7073	/* Filter edge */
7074	tprintf("filter mb:%d/%d MBAFF, QPy:%d/%d, QPc:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], chroma_qp[0], chroma_qp[1], linesize, uvlinesize);
7075	{ int i; for (i = 0; i < 8; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
7076	filter_mb_mbaff_edgev ( h, &img_y [0], linesize, bS, qp );
7077	filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, chroma_qp );
7078	filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, chroma_qp );
7079	}
7080	/* dir : 0 -> vertical edge, 1 -> horizontal edge */
7081	for( dir = 0; dir < 2; dir++ )
7082	{
7083	int edge;
7084	const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
7085	const int mbm_type = s->current_picture.mb_type[mbm_xy];
7086	int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
7087
7088	const int edges = (mb_type & (MB_TYPE_16x16\|MB_TYPE_SKIP))
7089	== (MB_TYPE_16x16\|MB_TYPE_SKIP) ? 1 : 4;
7090	// how often to recheck mv-based bS when iterating between edges
7091	const int mask_edge = (mb_type & (MB_TYPE_16x16 \| (MB_TYPE_16x8 << dir))) ? 3 :
7092	(mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
7093	// how often to recheck mv-based bS when iterating along each edge
7094	const int mask_par0 = mb_type & (MB_TYPE_16x16 \| (MB_TYPE_8x16 >> dir));
7095
7096	if (first_vertical_edge_done) {
7097	start = 1;
7098	first_vertical_edge_done = 0;
7099	}
7100
7101	if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
7102	start = 1;
7103
7104	if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
7105	&& !IS_INTERLACED(mb_type)
7106	&& IS_INTERLACED(mbm_type)
7107	) {
7108	// This is a special case in the norm where the filtering must
7109	// be done twice (one each of the field) even if we are in a
7110	// frame macroblock.
7111	//
7112	static const int nnz_idx[4] = {4,5,6,3};
7113	unsigned int tmp_linesize = 2 * linesize;
7114	unsigned int tmp_uvlinesize = 2 * uvlinesize;
7115	int mbn_xy = mb_xy - 2 * s->mb_stride;
7116	int qp, chroma_qp;
7117	int i, j;
7118	int bS[4];
7119
7120	for(j=0; j<2; j++, mbn_xy += s->mb_stride){
7121	if( IS_INTRA(mb_type) \|\|
7122	IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
7123	bS[0] = bS[1] = bS[2] = bS[3] = 3;
7124	} else {
7125	const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
7126	for( i = 0; i < 4; i++ ) {
7127	if( h->non_zero_count_cache[scan8[0]+i] != 0 \|\|
7128	mbn_nnz[nnz_idx[i]] != 0 )
7129	bS[i] = 2;
7130	else
7131	bS[i] = 1;
7132	}
7133	}
7134	// Do not use s->qscale as luma quantizer because it has not the same
7135	// value in IPCM macroblocks.
7136	qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
7137	tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
7138	{ int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
7139	filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
7140	chroma_qp = ( h->chroma_qp +
7141	get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
7142	filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp );
7143	filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp );
7144	}
7145
7146	start = 1;
7147	}
7148
7149	/* Calculate bS */
7150	for( edge = start; edge < edges; edge++ ) {
7151	/* mbn_xy: neighbor macroblock */
7152	const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
7153	const int mbn_type = s->current_picture.mb_type[mbn_xy];
7154	int bS[4];
7155	int qp;
7156
7157	if( (edge&1) && IS_8x8DCT(mb_type) )
7158	continue;
7159
7160	if( IS_INTRA(mb_type) \|\|
7161	IS_INTRA(mbn_type) ) {
7162	int value;
7163	if (edge == 0) {
7164	if ( (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
7165	\|\| ((FRAME_MBAFF \|\| (s->picture_structure != PICT_FRAME)) && (dir == 0))
7166	) {
7167	value = 4;
7168	} else {
7169	value = 3;
7170	}
7171	} else {
7172	value = 3;
7173	}
7174	bS[0] = bS[1] = bS[2] = bS[3] = value;
7175	} else {
7176	int i, l;
7177	int mv_done;
7178
7179	if( edge & mask_edge ) {
7180	bS[0] = bS[1] = bS[2] = bS[3] = 0;
7181	mv_done = 1;
7182	}
7183	else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
7184	bS[0] = bS[1] = bS[2] = bS[3] = 1;
7185	mv_done = 1;
7186	}
7187	else if( mask_par0 && (edge \|\| (mbn_type & (MB_TYPE_16x16 \| (MB_TYPE_8x16 >> dir)))) ) {
7188	int b_idx= 8 + 4 + edge * (dir ? 8:1);
7189	int bn_idx= b_idx - (dir ? 8:1);
7190	int v = 0;
7191	for( l = 0; !v && l < 1 + (h->slice_type == B_TYPE); l++ ) {
7192	v \|= ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] \|\|
7193	ABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 \|\|
7194	ABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
7195	}
7196	bS[0] = bS[1] = bS[2] = bS[3] = v;
7197	mv_done = 1;
7198	}
7199	else
7200	mv_done = 0;
7201
7202	for( i = 0; i < 4; i++ ) {
7203	int x = dir == 0 ? edge : i;
7204	int y = dir == 0 ? i : edge;
7205	int b_idx= 8 + 4 + x + 8*y;
7206	int bn_idx= b_idx - (dir ? 8:1);
7207
7208	if( h->non_zero_count_cache[b_idx] != 0 \|\|
7209	h->non_zero_count_cache[bn_idx] != 0 ) {
7210	bS[i] = 2;
7211	}
7212	else if(!mv_done)
7213	{
7214	bS[i] = 0;
7215	for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
7216	if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] \|\|
7217	ABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 \|\|
7218	ABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
7219	bS[i] = 1;
7220	break;
7221	}
7222	}
7223	}
7224	}
7225
7226	if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
7227	continue;
7228	}
7229
7230	/* Filter edge */
7231	// Do not use s->qscale as luma quantizer because it has not the same
7232	// value in IPCM macroblocks.
7233	qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
7234	//tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
7235	tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
7236	{ int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
7237	if( dir == 0 ) {
7238	filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
7239	if( (edge&1) == 0 ) {
7240	int chroma_qp = ( h->chroma_qp +
7241	get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
7242	filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS, chroma_qp );
7243	filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS, chroma_qp );
7244	}
7245	} else {
7246	filter_mb_edgeh( h, &img_y[4edgelinesize], linesize, bS, qp );
7247	if( (edge&1) == 0 ) {
7248	int chroma_qp = ( h->chroma_qp +
7249	get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
7250	filter_mb_edgech( h, &img_cb[2edgeuvlinesize], uvlinesize, bS, chroma_qp );
7251	filter_mb_edgech( h, &img_cr[2edgeuvlinesize], uvlinesize, bS, chroma_qp );
7252	}
7253	}
7254	}
7255	}
7256	}
7257
7258	static int decode_slice(H264Context *h){
7259	MpegEncContext * const s = &h->s;
7260	const int part_mask= s->partitioned_frame ? (AC_END\|AC_ERROR) : 0x7F;
7261
7262	s->mb_skip_run= -1;
7263
7264	if( h->pps.cabac ) {
7265	int i;
7266
7267	/* realign */
7268	align_get_bits( &s->gb );
7269
7270	/* init cabac */
7271	ff_init_cabac_states( &h->cabac, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64 );
7272	ff_init_cabac_decoder( &h->cabac,
7273	s->gb.buffer + get_bits_count(&s->gb)/8,
7274	( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
7275	/* calculate pre-state */
7276	for( i= 0; i < 460; i++ ) {
7277	int pre;
7278	if( h->slice_type == I_TYPE )
7279	pre = clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
7280	else
7281	pre = clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
7282
7283	if( pre <= 63 )
7284	h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
7285	else
7286	h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
7287	}
7288
7289	for(;;){
7290	int ret = decode_mb_cabac(h);
7291	int eos;
7292
7293	if(ret>=0) hl_decode_mb(h);
7294
7295	if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
7296	s->mb_y++;
7297
7298	if(ret>=0) ret = decode_mb_cabac(h);
7299
7300	if(ret>=0) hl_decode_mb(h);
7301	s->mb_y--;
7302	}
7303	eos = get_cabac_terminate( &h->cabac );
7304
7305	if( ret < 0 \|\| h->cabac.bytestream > h->cabac.bytestream_end + 1) {
7306	av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%d)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
7307	ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR\|DC_ERROR\|MV_ERROR)&part_mask);
7308	return -1;
7309	}
7310
7311	if( ++s->mb_x >= s->mb_width ) {
7312	s->mb_x = 0;
7313	ff_draw_horiz_band(s, 16*s->mb_y, 16);
7314	++s->mb_y;
7315	if(FRAME_MBAFF) {
7316	++s->mb_y;
7317	}
7318	}
7319
7320	if( eos \|\| s->mb_y >= s->mb_height ) {
7321	tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7322	ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END\|DC_END\|MV_END)&part_mask);
7323	return 0;
7324	}
7325	}
7326
7327	} else {
7328	for(;;){
7329	int ret = decode_mb_cavlc(h);
7330
7331	if(ret>=0) hl_decode_mb(h);
7332
7333	if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
7334	s->mb_y++;
7335	ret = decode_mb_cavlc(h);
7336
7337	if(ret>=0) hl_decode_mb(h);
7338	s->mb_y--;
7339	}
7340
7341	if(ret<0){
7342	av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
7343	ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR\|DC_ERROR\|MV_ERROR)&part_mask);
7344
7345	return -1;
7346	}
7347
7348	if(++s->mb_x >= s->mb_width){
7349	s->mb_x=0;
7350	ff_draw_horiz_band(s, 16*s->mb_y, 16);
7351	++s->mb_y;
7352	if(FRAME_MBAFF) {
7353	++s->mb_y;
7354	}
7355	if(s->mb_y >= s->mb_height){
7356	tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7357
7358	if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
7359	ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END\|DC_END\|MV_END)&part_mask);
7360
7361	return 0;
7362	}else{
7363	ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END\|DC_END\|MV_END)&part_mask);
7364
7365	return -1;
7366	}
7367	}
7368	}
7369
7370	if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
7371	tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7372	if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
7373	ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END\|DC_END\|MV_END)&part_mask);
7374
7375	return 0;
7376	}else{
7377	ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR\|DC_ERROR\|MV_ERROR)&part_mask);
7378
7379	return -1;
7380	}
7381	}
7382	}
7383	}
7384
7385	#if 0
7386	for(;s->mb_y < s->mb_height; s->mb_y++){
7387	for(;s->mb_x < s->mb_width; s->mb_x++){
7388	int ret= decode_mb(h);
7389
7390	hl_decode_mb(h);
7391
7392	if(ret<0){
7393	av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
7394	ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR\|DC_ERROR\|MV_ERROR)&part_mask);
7395
7396	return -1;
7397	}
7398
7399	if(++s->mb_x >= s->mb_width){
7400	s->mb_x=0;
7401	if(++s->mb_y >= s->mb_height){
7402	if(get_bits_count(s->gb) == s->gb.size_in_bits){
7403	ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END\|DC_END\|MV_END)&part_mask);
7404
7405	return 0;
7406	}else{
7407	ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END\|DC_END\|MV_END)&part_mask);
7408
7409	return -1;
7410	}
7411	}
7412	}
7413
7414	if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
7415	if(get_bits_count(s->gb) == s->gb.size_in_bits){
7416	ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END\|DC_END\|MV_END)&part_mask);
7417
7418	return 0;
7419	}else{
7420	ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR\|DC_ERROR\|MV_ERROR)&part_mask);
7421
7422	return -1;
7423	}
7424	}
7425	}
7426	s->mb_x=0;
7427	ff_draw_horiz_band(s, 16*s->mb_y, 16);
7428	}
7429	#endif
7430	return -1; //not reached
7431	}
7432
7433	static int decode_unregistered_user_data(H264Context *h, int size){
7434	MpegEncContext * const s = &h->s;
7435	uint8_t user_data[16+256];
7436	int e, build, i;
7437
7438	if(size<16)
7439	return -1;
7440
7441	for(i=0; i<sizeof(user_data)-1 && i<size; i++){
7442	user_data[i]= get_bits(&s->gb, 8);
7443	}
7444
7445	user_data[i]= 0;
7446	e= sscanf(user_data+16, "x264 - core %d"/%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html/, &build);
7447	if(e==1 && build>=0)
7448	h->x264_build= build;
7449
7450	if(s->avctx->debug & FF_DEBUG_BUGS)
7451	av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
7452
7453	for(; i<size; i++)
7454	skip_bits(&s->gb, 8);
7455
7456	return 0;
7457	}
7458
7459	static int decode_sei(H264Context *h){
7460	MpegEncContext * const s = &h->s;
7461
7462	while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
7463	int size, type;
7464
7465	type=0;
7466	do{
7467	type+= show_bits(&s->gb, 8);
7468	}while(get_bits(&s->gb, 8) == 255);
7469
7470	size=0;
7471	do{
7472	size+= show_bits(&s->gb, 8);
7473	}while(get_bits(&s->gb, 8) == 255);
7474
7475	switch(type){
7476	case 5:
7477	if(decode_unregistered_user_data(h, size) < 0)
7478	return -1;
7479	break;
7480	default:
7481	skip_bits(&s->gb, 8*size);
7482	}
7483
7484	//FIXME check bits here
7485	align_get_bits(&s->gb);
7486	}
7487
7488	return 0;
7489	}
7490
7491	static inline void decode_hrd_parameters(H264Context h, SPS sps){
7492	MpegEncContext * const s = &h->s;
7493	int cpb_count, i;
7494	cpb_count = get_ue_golomb(&s->gb) + 1;
7495	get_bits(&s->gb, 4); /* bit_rate_scale */
7496	get_bits(&s->gb, 4); /* cpb_size_scale */
7497	for(i=0; i<cpb_count; i++){
7498	get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
7499	get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
7500	get_bits1(&s->gb); /* cbr_flag */
7501	}
7502	get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
7503	get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
7504	get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
7505	get_bits(&s->gb, 5); /* time_offset_length */
7506	}
7507
7508	static inline int decode_vui_parameters(H264Context h, SPS sps){
7509	MpegEncContext * const s = &h->s;
7510	int aspect_ratio_info_present_flag, aspect_ratio_idc;
7511	int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
7512
7513	aspect_ratio_info_present_flag= get_bits1(&s->gb);
7514
7515	if( aspect_ratio_info_present_flag ) {
7516	aspect_ratio_idc= get_bits(&s->gb, 8);
7517	if( aspect_ratio_idc == EXTENDED_SAR ) {
7518	sps->sar.num= get_bits(&s->gb, 16);
7519	sps->sar.den= get_bits(&s->gb, 16);
7520	}else if(aspect_ratio_idc < 14){
7521	sps->sar= pixel_aspect[aspect_ratio_idc];
7522	}else{
7523	av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7524	return -1;
7525	}
7526	}else{
7527	sps->sar.num=
7528	sps->sar.den= 0;
7529	}
7530	// s->avctx->aspect_ratio= sar_widths->width / (float)(s->heightsar_height);
7531
7532	if(get_bits1(&s->gb)){ /* overscan_info_present_flag */
7533	get_bits1(&s->gb); /* overscan_appropriate_flag */
7534	}
7535
7536	if(get_bits1(&s->gb)){ /* video_signal_type_present_flag */
7537	get_bits(&s->gb, 3); /* video_format */
7538	get_bits1(&s->gb); /* video_full_range_flag */
7539	if(get_bits1(&s->gb)){ /* colour_description_present_flag */
7540	get_bits(&s->gb, 8); /* colour_primaries */
7541	get_bits(&s->gb, 8); /* transfer_characteristics */
7542	get_bits(&s->gb, 8); /* matrix_coefficients */
7543	}
7544	}
7545
7546	if(get_bits1(&s->gb)){ /* chroma_location_info_present_flag */
7547	get_ue_golomb(&s->gb); /* chroma_sample_location_type_top_field */
7548	get_ue_golomb(&s->gb); /* chroma_sample_location_type_bottom_field */
7549	}
7550
7551	sps->timing_info_present_flag = get_bits1(&s->gb);
7552	if(sps->timing_info_present_flag){
7553	sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7554	sps->time_scale = get_bits_long(&s->gb, 32);
7555	sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7556	}
7557
7558	nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7559	if(nal_hrd_parameters_present_flag)
7560	decode_hrd_parameters(h, sps);
7561	vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7562	if(vcl_hrd_parameters_present_flag)
7563	decode_hrd_parameters(h, sps);
7564	if(nal_hrd_parameters_present_flag \|\| vcl_hrd_parameters_present_flag)
7565	get_bits1(&s->gb); /* low_delay_hrd_flag */
7566	get_bits1(&s->gb); /* pic_struct_present_flag */
7567
7568	sps->bitstream_restriction_flag = get_bits1(&s->gb);
7569	if(sps->bitstream_restriction_flag){
7570	get_bits1(&s->gb); /* motion_vectors_over_pic_boundaries_flag */
7571	get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7572	get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7573	get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7574	get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7575	sps->num_reorder_frames = get_ue_golomb(&s->gb);
7576	get_ue_golomb(&s->gb); /* max_dec_frame_buffering */
7577	}
7578
7579	return 0;
7580	}
7581
7582	static void decode_scaling_list(H264Context h, uint8_t factors, int size,
7583	const uint8_t jvt_list, const uint8_t fallback_list){
7584	MpegEncContext * const s = &h->s;
7585	int i, last = 8, next = 8;
7586	const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7587	if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7588	memcpy(factors, fallback_list, size*sizeof(uint8_t));
7589	else
7590	for(i=0;i<size;i++){
7591	if(next)
7592	next = (last + get_se_golomb(&s->gb)) & 0xff;
7593	if(!i && !next){ /* matrix not written, we use the preset one */
7594	memcpy(factors, jvt_list, size*sizeof(uint8_t));
7595	break;
7596	}
7597	last = factors[scan[i]] = next ? next : last;
7598	}
7599	}
7600
7601	static void decode_scaling_matrices(H264Context h, SPS sps, PPS *pps, int is_sps,
7602	uint8_t (scaling_matrix4)[16], uint8_t (scaling_matrix8)[64]){
7603	MpegEncContext * const s = &h->s;
7604	int fallback_sps = !is_sps && sps->scaling_matrix_present;
7605	const uint8_t *fallback[4] = {
7606	fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7607	fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7608	fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7609	fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7610	};
7611	if(get_bits1(&s->gb)){
7612	sps->scaling_matrix_present \|= is_sps;
7613	decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7614	decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7615	decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7616	decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7617	decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7618	decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7619	if(is_sps \|\| pps->transform_8x8_mode){
7620	decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]); // Intra, Y
7621	decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]); // Inter, Y
7622	}
7623	} else if(fallback_sps) {
7624	memcpy(scaling_matrix4, sps->scaling_matrix4, 616sizeof(uint8_t));
7625	memcpy(scaling_matrix8, sps->scaling_matrix8, 264sizeof(uint8_t));
7626	}
7627	}
7628
7629	static inline int decode_seq_parameter_set(H264Context *h){
7630	MpegEncContext * const s = &h->s;
7631	int profile_idc, level_idc;
7632	int sps_id, i;
7633	SPS *sps;
7634
7635	profile_idc= get_bits(&s->gb, 8);
7636	get_bits1(&s->gb); //constraint_set0_flag
7637	get_bits1(&s->gb); //constraint_set1_flag
7638	get_bits1(&s->gb); //constraint_set2_flag
7639	get_bits1(&s->gb); //constraint_set3_flag
7640	get_bits(&s->gb, 4); // reserved
7641	level_idc= get_bits(&s->gb, 8);
7642	sps_id= get_ue_golomb(&s->gb);
7643
7644	sps= &h->sps_buffer[ sps_id ];
7645	sps->profile_idc= profile_idc;
7646	sps->level_idc= level_idc;
7647
7648	if(sps->profile_idc >= 100){ //high profile
7649	if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
7650	get_bits1(&s->gb); //residual_color_transform_flag
7651	get_ue_golomb(&s->gb); //bit_depth_luma_minus8
7652	get_ue_golomb(&s->gb); //bit_depth_chroma_minus8
7653	sps->transform_bypass = get_bits1(&s->gb);
7654	decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7655	}else
7656	sps->scaling_matrix_present = 0;
7657
7658	sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7659	sps->poc_type= get_ue_golomb(&s->gb);
7660
7661	if(sps->poc_type == 0){ //FIXME #define
7662	sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7663	} else if(sps->poc_type == 1){//FIXME #define
7664	sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7665	sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7666	sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7667	sps->poc_cycle_length= get_ue_golomb(&s->gb);
7668
7669	for(i=0; i<sps->poc_cycle_length; i++)
7670	sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7671	}
7672	if(sps->poc_type > 2){
7673	av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7674	return -1;
7675	}
7676
7677	sps->ref_frame_count= get_ue_golomb(&s->gb);
7678	if(sps->ref_frame_count > MAX_PICTURE_COUNT-2){
7679	av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7680	}
7681	sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7682	sps->mb_width= get_ue_golomb(&s->gb) + 1;
7683	sps->mb_height= get_ue_golomb(&s->gb) + 1;
7684	if((unsigned)sps->mb_width >= INT_MAX/16 \|\| (unsigned)sps->mb_height >= INT_MAX/16 \|\|
7685	avcodec_check_dimensions(NULL, 16sps->mb_width, 16sps->mb_height))
7686	return -1;
7687
7688	sps->frame_mbs_only_flag= get_bits1(&s->gb);
7689	if(!sps->frame_mbs_only_flag)
7690	sps->mb_aff= get_bits1(&s->gb);
7691	else
7692	sps->mb_aff= 0;
7693
7694	sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7695
7696	#ifndef ALLOW_INTERLACE
7697	if(sps->mb_aff)
7698	av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it compilation time\n");
7699	#endif
7700	if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7701	av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7702
7703	sps->crop= get_bits1(&s->gb);
7704	if(sps->crop){
7705	sps->crop_left = get_ue_golomb(&s->gb);
7706	sps->crop_right = get_ue_golomb(&s->gb);
7707	sps->crop_top = get_ue_golomb(&s->gb);
7708	sps->crop_bottom= get_ue_golomb(&s->gb);
7709	if(sps->crop_left \|\| sps->crop_top){
7710	av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7711	}
7712	}else{
7713	sps->crop_left =
7714	sps->crop_right =
7715	sps->crop_top =
7716	sps->crop_bottom= 0;
7717	}
7718
7719	sps->vui_parameters_present_flag= get_bits1(&s->gb);
7720	if( sps->vui_parameters_present_flag )
7721	decode_vui_parameters(h, sps);
7722
7723	if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7724	av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%d profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7725	sps_id, sps->profile_idc, sps->level_idc,
7726	sps->poc_type,
7727	sps->ref_frame_count,
7728	sps->mb_width, sps->mb_height,
7729	sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7730	sps->direct_8x8_inference_flag ? "8B8" : "",
7731	sps->crop_left, sps->crop_right,
7732	sps->crop_top, sps->crop_bottom,
7733	sps->vui_parameters_present_flag ? "VUI" : ""
7734	);
7735	}
7736	return 0;
7737	}
7738
7739	static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7740	MpegEncContext * const s = &h->s;
7741	int pps_id= get_ue_golomb(&s->gb);
7742	PPS *pps= &h->pps_buffer[pps_id];
7743
7744	pps->sps_id= get_ue_golomb(&s->gb);
7745	pps->cabac= get_bits1(&s->gb);
7746	pps->pic_order_present= get_bits1(&s->gb);
7747	pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7748	if(pps->slice_group_count > 1 ){
7749	pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7750	av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7751	switch(pps->mb_slice_group_map_type){
7752	case 0:
7753	#if 0
7754	\| for( i = 0; i <= num_slice_groups_minus1; i++ ) \| \| \|
7755	\| run_length[ i ] \|1 \|ue(v) \|
7756	#endif
7757	break;
7758	case 2:
7759	#if 0
7760	\| for( i = 0; i < num_slice_groups_minus1; i++ ) \| \| \|
7761	\|{ \| \| \|
7762	\| top_left_mb[ i ] \|1 \|ue(v) \|
7763	\| bottom_right_mb[ i ] \|1 \|ue(v) \|
7764	\| } \| \| \|
7765	#endif
7766	break;
7767	case 3:
7768	case 4:
7769	case 5:
7770	#if 0
7771	\| slice_group_change_direction_flag \|1 \|u(1) \|
7772	\| slice_group_change_rate_minus1 \|1 \|ue(v) \|
7773	#endif
7774	break;
7775	case 6:
7776	#if 0
7777	\| slice_group_id_cnt_minus1 \|1 \|ue(v) \|
7778	\| for( i = 0; i <= slice_group_id_cnt_minus1; i++ \| \| \|
7779	\|) \| \| \|
7780	\| slice_group_id[ i ] \|1 \|u(v) \|
7781	#endif
7782	break;
7783	}
7784	}
7785	pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7786	pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7787	if(pps->ref_count[0] > 32 \|\| pps->ref_count[1] > 32){
7788	av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7789	return -1;
7790	}
7791
7792	pps->weighted_pred= get_bits1(&s->gb);
7793	pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7794	pps->init_qp= get_se_golomb(&s->gb) + 26;
7795	pps->init_qs= get_se_golomb(&s->gb) + 26;
7796	pps->chroma_qp_index_offset= get_se_golomb(&s->gb);
7797	pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7798	pps->constrained_intra_pred= get_bits1(&s->gb);
7799	pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7800
7801	pps->transform_8x8_mode= 0;
7802	h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7803	memset(pps->scaling_matrix4, 16, 616sizeof(uint8_t));
7804	memset(pps->scaling_matrix8, 16, 264sizeof(uint8_t));
7805
7806	if(get_bits_count(&s->gb) < bit_length){
7807	pps->transform_8x8_mode= get_bits1(&s->gb);
7808	decode_scaling_matrices(h, &h->sps_buffer[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7809	get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7810	}
7811
7812	if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7813	av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%d sps:%d %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d %s %s %s %s\n",
7814	pps_id, pps->sps_id,
7815	pps->cabac ? "CABAC" : "CAVLC",
7816	pps->slice_group_count,
7817	pps->ref_count[0], pps->ref_count[1],
7818	pps->weighted_pred ? "weighted" : "",
7819	pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset,
7820	pps->deblocking_filter_parameters_present ? "LPAR" : "",
7821	pps->constrained_intra_pred ? "CONSTR" : "",
7822	pps->redundant_pic_cnt_present ? "REDU" : "",
7823	pps->transform_8x8_mode ? "8x8DCT" : ""
7824	);
7825	}
7826
7827	return 0;
7828	}
7829
7830	/**
7831	* finds the end of the current frame in the bitstream.
7832	* @return the position of the first byte of the next frame, or -1
7833	*/
7834	static int find_frame_end(H264Context h, const uint8_t buf, int buf_size){
7835	int i;
7836	uint32_t state;
7837	ParseContext *pc = &(h->s.parse_context);
7838	//printf("first %02X%02X%02X%02X\n", buf[0], buf[1],buf[2],buf[3]);
7839	// mb_addr= pc->mb_addr - 1;
7840	state= pc->state;
7841	for(i=0; i<=buf_size; i++){
7842	if((state&0xFFFFFF1F) == 0x101 \|\| (state&0xFFFFFF1F) == 0x102 \|\| (state&0xFFFFFF1F) == 0x105){
7843	tprintf("find_frame_end new startcode = %08x, frame_start_found = %d, pos = %d\n", state, pc->frame_start_found, i);
7844	if(pc->frame_start_found){
7845	// If there isn't one more byte in the buffer
7846	// the test on first_mb_in_slice cannot be done yet
7847	// do it at next call.
7848	if (i >= buf_size) break;
7849	if (buf[i] & 0x80) {
7850	// first_mb_in_slice is 0, probably the first nal of a new
7851	// slice
7852	tprintf("find_frame_end frame_end_found, state = %08x, pos = %d\n", state, i);
7853	pc->state=-1;
7854	pc->frame_start_found= 0;
7855	return i-4;
7856	}
7857	}
7858	pc->frame_start_found = 1;
7859	}
7860	if((state&0xFFFFFF1F) == 0x107 \|\| (state&0xFFFFFF1F) == 0x108 \|\| (state&0xFFFFFF1F) == 0x109){
7861	if(pc->frame_start_found){
7862	pc->state=-1;
7863	pc->frame_start_found= 0;
7864	return i-4;
7865	}
7866	}
7867	if (i<buf_size)
7868	state= (state<<8) \| buf[i];
7869	}
7870
7871	pc->state= state;
7872	return END_NOT_FOUND;
7873	}
7874
7875	#ifdef CONFIG_H264_PARSER
7876	static int h264_parse(AVCodecParserContext *s,
7877	AVCodecContext *avctx,
7878	uint8_t *poutbuf, int poutbuf_size,
7879	const uint8_t *buf, int buf_size)
7880	{
7881	H264Context *h = s->priv_data;
7882	ParseContext *pc = &h->s.parse_context;
7883	int next;
7884
7885	next= find_frame_end(h, buf, buf_size);
7886
7887	if (ff_combine_frame(pc, next, (uint8_t **)&buf, &buf_size) < 0) {
7888	*poutbuf = NULL;
7889	*poutbuf_size = 0;
7890	return buf_size;
7891	}
7892
7893	poutbuf = (uint8_t )buf;
7894	*poutbuf_size = buf_size;
7895	return next;
7896	}
7897
7898	static int h264_split(AVCodecContext *avctx,
7899	const uint8_t *buf, int buf_size)
7900	{
7901	int i;
7902	uint32_t state = -1;
7903	int has_sps= 0;
7904
7905	for(i=0; i<=buf_size; i++){
7906	if((state&0xFFFFFF1F) == 0x107)
7907	has_sps=1;
7908	/* if((state&0xFFFFFF1F) == 0x101 \|\| (state&0xFFFFFF1F) == 0x102 \|\| (state&0xFFFFFF1F) == 0x105){
7909	}*/
7910	if((state&0xFFFFFF00) == 0x100 && (state&0xFFFFFF1F) != 0x107 && (state&0xFFFFFF1F) != 0x108 && (state&0xFFFFFF1F) != 0x109){
7911	if(has_sps){
7912	while(i>4 && buf[i-5]==0) i--;
7913	return i-4;
7914	}
7915	}
7916	if (i<buf_size)
7917	state= (state<<8) \| buf[i];
7918	}
7919	return 0;
7920	}
7921	#endif /* CONFIG_H264_PARSER */
7922
7923	static int decode_nal_units(H264Context h, uint8_t buf, int buf_size){
7924	MpegEncContext * const s = &h->s;
7925	AVCodecContext * const avctx= s->avctx;
7926	int buf_index=0;
7927	#if 0
7928	int i;
7929	for(i=0; i<50; i++){
7930	av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7931	}
7932	#endif
7933	h->slice_num = 0;
7934	s->current_picture_ptr= NULL;
7935	for(;;){
7936	int consumed;
7937	int dst_length;
7938	int bit_length;
7939	uint8_t *ptr;
7940	int i, nalsize = 0;
7941
7942	if(h->is_avc) {
7943	if(buf_index >= buf_size) break;
7944	nalsize = 0;
7945	for(i = 0; i < h->nal_length_size; i++)
7946	nalsize = (nalsize << 8) \| buf[buf_index++];
7947	if(nalsize <= 1){
7948	if(nalsize == 1){
7949	buf_index++;
7950	continue;
7951	}else{
7952	av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7953	break;
7954	}
7955	}
7956	} else {
7957	// start code prefix search
7958	for(; buf_index + 3 < buf_size; buf_index++){
7959	// this should allways succeed in the first iteration
7960	if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7961	break;
7962	}
7963
7964	if(buf_index+3 >= buf_size) break;
7965
7966	buf_index+=3;
7967	}
7968
7969	ptr= decode_nal(h, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7970	while(ptr[dst_length - 1] == 0 && dst_length > 1)
7971	dst_length--;
7972	bit_length= 8*dst_length - decode_rbsp_trailing(ptr + dst_length - 1);
7973
7974	if(s->avctx->debug&FF_DEBUG_STARTCODE){
7975	av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", h->nal_unit_type, buf_index, buf_size, dst_length);
7976	}
7977
7978	if (h->is_avc && (nalsize != consumed))
7979	av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7980
7981	buf_index += consumed;
7982
7983	if( (s->hurry_up == 1 && h->nal_ref_idc == 0) //FIXME dont discard SEI id
7984	\|\|(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
7985	continue;
7986
7987	switch(h->nal_unit_type){
7988	case NAL_IDR_SLICE:
7989	idr(h); //FIXME ensure we don't loose some frames if there is reordering
7990	case NAL_SLICE:
7991	init_get_bits(&s->gb, ptr, bit_length);
7992	h->intra_gb_ptr=
7993	h->inter_gb_ptr= &s->gb;
7994	s->data_partitioning = 0;
7995
7996	if(decode_slice_header(h) < 0){
7997	av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7998	break;
7999	}
8000	s->current_picture_ptr->key_frame= (h->nal_unit_type == NAL_IDR_SLICE);
8001	if(h->redundant_pic_count==0 && s->hurry_up < 5
8002	&& (avctx->skip_frame < AVDISCARD_NONREF \|\| h->nal_ref_idc)
8003	&& (avctx->skip_frame < AVDISCARD_BIDIR \|\| h->slice_type!=B_TYPE)
8004	&& (avctx->skip_frame < AVDISCARD_NONKEY \|\| h->slice_type==I_TYPE)
8005	&& avctx->skip_frame < AVDISCARD_ALL)
8006	decode_slice(h);
8007	break;
8008	case NAL_DPA:
8009	init_get_bits(&s->gb, ptr, bit_length);
8010	h->intra_gb_ptr=
8011	h->inter_gb_ptr= NULL;
8012	s->data_partitioning = 1;
8013
8014	if(decode_slice_header(h) < 0){
8015	av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
8016	}
8017	break;
8018	case NAL_DPB:
8019	init_get_bits(&h->intra_gb, ptr, bit_length);
8020	h->intra_gb_ptr= &h->intra_gb;
8021	break;
8022	case NAL_DPC:
8023	init_get_bits(&h->inter_gb, ptr, bit_length);
8024	h->inter_gb_ptr= &h->inter_gb;
8025
8026	if(h->redundant_pic_count==0 && h->intra_gb_ptr && s->data_partitioning
8027	&& s->hurry_up < 5
8028	&& (avctx->skip_frame < AVDISCARD_NONREF \|\| h->nal_ref_idc)
8029	&& (avctx->skip_frame < AVDISCARD_BIDIR \|\| h->slice_type!=B_TYPE)
8030	&& (avctx->skip_frame < AVDISCARD_NONKEY \|\| h->slice_type==I_TYPE)
8031	&& avctx->skip_frame < AVDISCARD_ALL)
8032	decode_slice(h);
8033	break;
8034	case NAL_SEI:
8035	init_get_bits(&s->gb, ptr, bit_length);
8036	decode_sei(h);
8037	break;
8038	case NAL_SPS:
8039	init_get_bits(&s->gb, ptr, bit_length);
8040	decode_seq_parameter_set(h);
8041
8042	if(s->flags& CODEC_FLAG_LOW_DELAY)
8043	s->low_delay=1;
8044
8045	if(avctx->has_b_frames < 2)
8046	avctx->has_b_frames= !s->low_delay;
8047	break;
8048	case NAL_PPS:
8049	init_get_bits(&s->gb, ptr, bit_length);
8050
8051	decode_picture_parameter_set(h, bit_length);
8052
8053	break;
8054	case NAL_AUD:
8055	case NAL_END_SEQUENCE:
8056	case NAL_END_STREAM:
8057	case NAL_FILLER_DATA:
8058	case NAL_SPS_EXT:
8059	case NAL_AUXILIARY_SLICE:
8060	break;
8061	default:
8062	av_log(avctx, AV_LOG_ERROR, "Unknown NAL code: %d\n", h->nal_unit_type);
8063	}
8064	}
8065
8066	if(!s->current_picture_ptr) return buf_index; //no frame
8067
8068	s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
8069	s->current_picture_ptr->pict_type= s->pict_type;
8070
8071	h->prev_frame_num_offset= h->frame_num_offset;
8072	h->prev_frame_num= h->frame_num;
8073	if(s->current_picture_ptr->reference){
8074	h->prev_poc_msb= h->poc_msb;
8075	h->prev_poc_lsb= h->poc_lsb;
8076	}
8077	if(s->current_picture_ptr->reference)
8078	execute_ref_pic_marking(h, h->mmco, h->mmco_index);
8079
8080	ff_er_frame_end(s);
8081
8082	MPV_frame_end(s);
8083
8084	return buf_index;
8085	}
8086
8087	/**
8088	* returns the number of bytes consumed for building the current frame
8089	*/
8090	static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
8091	if(s->flags&CODEC_FLAG_TRUNCATED){
8092	pos -= s->parse_context.last_index;
8093	if(pos<0) pos=0; // FIXME remove (unneeded?)
8094
8095	return pos;
8096	}else{
8097	if(pos==0) pos=1; //avoid infinite loops (i doubt thats needed but ...)
8098	if(pos+10>buf_size) pos=buf_size; // oops ;)
8099
8100	return pos;
8101	}
8102	}
8103
8104	static int decode_frame(AVCodecContext *avctx,
8105	void data, int data_size,
8106	uint8_t *buf, int buf_size)
8107	{
8108	H264Context *h = avctx->priv_data;
8109	MpegEncContext *s = &h->s;
8110	AVFrame *pict = data;
8111	int buf_index;
8112
8113	s->flags= avctx->flags;
8114	s->flags2= avctx->flags2;
8115
8116	/* no supplementary picture */
8117	if (buf_size == 0) {
8118	return 0;
8119	}
8120
8121	if(s->flags&CODEC_FLAG_TRUNCATED){
8122	int next= find_frame_end(h, buf, buf_size);
8123
8124	if( ff_combine_frame(&s->parse_context, next, &buf, &buf_size) < 0 )
8125	return buf_size;
8126	//printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
8127	}
8128
8129	if(h->is_avc && !h->got_avcC) {
8130	int i, cnt, nalsize;
8131	unsigned char *p = avctx->extradata;
8132	if(avctx->extradata_size < 7) {
8133	av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
8134	return -1;
8135	}
8136	if(*p != 1) {
8137	av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
8138	return -1;
8139	}
8140	/* sps and pps in the avcC always have length coded with 2 bytes,
8141	so put a fake nal_length_size = 2 while parsing them */
8142	h->nal_length_size = 2;
8143	// Decode sps from avcC
8144	cnt = *(p+5) & 0x1f; // Number of sps
8145	p += 6;
8146	for (i = 0; i < cnt; i++) {
8147	nalsize = BE_16(p) + 2;
8148	if(decode_nal_units(h, p, nalsize) < 0) {
8149	av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
8150	return -1;
8151	}
8152	p += nalsize;
8153	}
8154	// Decode pps from avcC
8155	cnt = *(p++); // Number of pps
8156	for (i = 0; i < cnt; i++) {
8157	nalsize = BE_16(p) + 2;
8158	if(decode_nal_units(h, p, nalsize) != nalsize) {
8159	av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
8160	return -1;
8161	}
8162	p += nalsize;
8163	}
8164	// Now store right nal length size, that will be use to parse all other nals
8165	h->nal_length_size = (((((char)(avctx->extradata))+4))&0x03)+1;
8166	// Do not reparse avcC
8167	h->got_avcC = 1;
8168	}
8169
8170	if(!h->is_avc && s->avctx->extradata_size && s->picture_number==0){
8171	if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
8172	return -1;
8173	}
8174
8175	buf_index=decode_nal_units(h, buf, buf_size);
8176	if(buf_index < 0)
8177	return -1;
8178
8179	//FIXME do something with unavailable reference frames
8180
8181	// if(ret==FRAME_SKIPPED) return get_consumed_bytes(s, buf_index, buf_size);
8182	if(!s->current_picture_ptr){
8183	av_log(h->s.avctx, AV_LOG_DEBUG, "error, NO frame\n");
8184	return -1;
8185	}
8186
8187	{
8188	Picture *out = s->current_picture_ptr;
8189	#if 0 //decode order
8190	*data_size = sizeof(AVFrame);
8191	#else
8192	/* Sort B-frames into display order */
8193	Picture *cur = s->current_picture_ptr;
8194	Picture *prev = h->delayed_output_pic;
8195	int i, pics, cross_idr, out_of_order, out_idx;
8196
8197	if(h->sps.bitstream_restriction_flag
8198	&& s->avctx->has_b_frames < h->sps.num_reorder_frames){
8199	s->avctx->has_b_frames = h->sps.num_reorder_frames;
8200	s->low_delay = 0;
8201	}
8202
8203	pics = 0;
8204	while(h->delayed_pic[pics]) pics++;
8205	h->delayed_pic[pics++] = cur;
8206	if(cur->reference == 0)
8207	cur->reference = 1;
8208
8209	cross_idr = 0;
8210	for(i=0; h->delayed_pic[i]; i++)
8211	if(h->delayed_pic[i]->key_frame \|\| h->delayed_pic[i]->poc==0)
8212	cross_idr = 1;
8213
8214	out = h->delayed_pic[0];
8215	out_idx = 0;
8216	for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
8217	if(h->delayed_pic[i]->poc < out->poc){
8218	out = h->delayed_pic[i];
8219	out_idx = i;
8220	}
8221
8222	out_of_order = !cross_idr && prev && out->poc < prev->poc;
8223	if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
8224	{ }
8225	else if(prev && pics <= s->avctx->has_b_frames)
8226	out = prev;
8227	else if((out_of_order && pics-1 == s->avctx->has_b_frames && pics < 15)
8228	\|\| (s->low_delay &&
8229	((!cross_idr && prev && out->poc > prev->poc + 2)
8230	\|\| cur->pict_type == B_TYPE)))
8231	{
8232	s->low_delay = 0;
8233	s->avctx->has_b_frames++;
8234	out = prev;
8235	}
8236	else if(out_of_order)
8237	out = prev;
8238
8239	if(out_of_order \|\| pics > s->avctx->has_b_frames){
8240	for(i=out_idx; h->delayed_pic[i]; i++)
8241	h->delayed_pic[i] = h->delayed_pic[i+1];
8242	}
8243
8244	if(prev == out)
8245	*data_size = 0;
8246	else
8247	*data_size = sizeof(AVFrame);
8248	if(prev && prev != out && prev->reference == 1)
8249	prev->reference = 0;
8250	h->delayed_output_pic = out;
8251	#endif
8252
8253	if(out)
8254	pict= (AVFrame*)out;
8255	else
8256	av_log(avctx, AV_LOG_DEBUG, "no picture\n");
8257	}
8258
8259	assert(pict->data[0] \|\| !*data_size);
8260	ff_print_debug_info(s, pict);
8261	//printf("out %d\n", (int)pict->data[0]);
8262	#if 0 //?
8263
8264	/* Return the Picture timestamp as the frame number */
8265	/* we substract 1 because it is added on utils.c */
8266	avctx->frame_number = s->picture_number - 1;
8267	#endif
8268	return get_consumed_bytes(s, buf_index, buf_size);
8269	}
8270	#if 0
8271	static inline void fill_mb_avail(H264Context *h){
8272	MpegEncContext * const s = &h->s;
8273	const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
8274
8275	if(s->mb_y){
8276	h->mb_avail[0]= s->mb_x && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
8277	h->mb_avail[1]= h->slice_table[mb_xy - s->mb_stride ] == h->slice_num;
8278	h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
8279	}else{
8280	h->mb_avail[0]=
8281	h->mb_avail[1]=
8282	h->mb_avail[2]= 0;
8283	}
8284	h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
8285	h->mb_avail[4]= 1; //FIXME move out
8286	h->mb_avail[5]= 0; //FIXME move out
8287	}
8288	#endif
8289
#if 0 //selftest
#define COUNT 8000
#define SIZE (COUNT*40)
/**
 * Standalone self-test, compiled out by default.
 *
 * Runs, in order:
 *  - an unsigned exp-golomb write/read round trip,
 *  - a signed exp-golomb write/read round trip,
 *  - a 4x4 forward DCT followed by the inverse DCT, reporting the error,
 *  - a NAL encode/decode round trip on random payloads containing an
 *    increasing number of zero bytes.
 *
 * @return 0 when the run completes, -1 as soon as a NAL check fails.
 *         Golomb mismatches are only printed, they do not abort the run.
 */
int main(){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
    DSPContext dsp;
    AVCodecContext avctx;

    /* dsputil_init() receives the codec context; hand it a zeroed one
     * instead of uninitialized stack memory */
    memset(&avctx, 0, sizeof(AVCodecContext));
    dsputil_init(&dsp, &avctx);

    init_put_bits(&pb, temp, SIZE);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        /* peek at the upcoming bits so a mismatch can be reported with them */
        s= show_bits(&gb, 24);

        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            /* reported only; the run continues so every mismatch is listed */
            printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
        }
        STOP_TIMER("get_ue_golomb");
    }


    init_put_bits(&pb, temp, SIZE);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_se_golomb(&gb);
        if(j != i - COUNT/2){
            /* the value written was i - COUNT/2, so that is the expected one */
            printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s);
        }
        STOP_TIMER("get_se_golomb");
    }

    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;

        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);

        //normalize
        for(j=0; j<16; j++){
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }

        /* use the DSPContext initialized above; there is no context
         * pointer named s in this function */
        dsp.h264_idct_add(ref, block, 4);

        /* after adding the inverse transform back, ref is compared to src */
        for(j=0; j<16; j++){
            int diff= ABS(src[j] - ref[j]);

            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
#if 0
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
#endif
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));

    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;

        /* start from a payload without any zero byte ... */
        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }

        /* ... then clear exactly `zeros` distinct positions; zeros < COUNT,
         * so the search for a still-nonzero byte always terminates */
        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }

        START_TIMER

        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }

        out= decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL");

        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }

        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }

        if(memcmp(bitstream, out, COUNT)){
            printf("missmatch\n");
            return -1;
        }
    }

    /* only the banner is printed; no RBSP checks follow */
    printf("Testing RBSP\n");


    return 0;
}
#endif
8462
8463
8464	static int decode_end(AVCodecContext *avctx)
8465	{
8466	H264Context *h = avctx->priv_data;
8467	MpegEncContext *s = &h->s;
8468
8469	av_freep(&h->rbsp_buffer);
8470	free_tables(h); //FIXME cleanup init stuff perhaps
8471	MPV_common_end(s);
8472
8473	// memset(h, 0, sizeof(H264Context));
8474
8475	return 0;
8476	}
8477
8478
8479	AVCodec h264_decoder = {
8480	"h264",
8481	CODEC_TYPE_VIDEO,
8482	CODEC_ID_H264,
8483	sizeof(H264Context),
8484	decode_init,
8485	NULL,
8486	decode_end,
8487	decode_frame,
8488	/CODEC_CAP_DRAW_HORIZ_BAND \|/ CODEC_CAP_DR1 \| CODEC_CAP_TRUNCATED \| CODEC_CAP_DELAY,
8489	.flush= flush_dpb,
8490	};
8491
#ifdef CONFIG_H264_PARSER
/**
 * Registration entry for the H.264 bitstream parser, built only when
 * CONFIG_H264_PARSER is defined.
 * NOTE(review): the per-field roles below assume the AVCodecParser layout
 * from avcodec.h -- confirm against that header.
 */
AVCodecParser h264_parser = {
    { CODEC_ID_H264 },      /* codec ids handled */
    sizeof(H264Context),    /* private context size */
    NULL,                   /* presumably the init callback: none */
    h264_parse,
    ff_parse_close,
    h264_split,
};
#endif
8502
8503	#include "svq3.c"

注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

source: vbox/trunk/src/libs/ffmpeg-20060710/libavcodec/h264.c@ 10184

以其他格式下載: