libaom 源码分析：AV1帧内预测 CfL 模式

CfL预测模式原理

从亮度到色度CfL 是一种色度帧内预测模式，通过建立共位重建亮度采样的线性函数来模拟色度采样；
对于不同的色度采样格式（例如4:2:0和4:2:2），可能需要对重建的亮度像素进行子采样，以匹配色度样本的分辨率；
在CfL模式中，从亮度样本中移除直流分量以形成交流（AC）贡献。预测块是色度直流（DC）分量和缩放的亮度交流（AC）分量之和；
在CfL模式中，指定两个颜色分量之间线性函数的模型参数（如缩放因子α）由编码器优化，并在编码过程中信号化到比特流中；
DC分量是预测块内所有样本的平均值，代表块的直流或平均亮度水平；AC分量是亮度样本与DC分量的差值，代表块内的交流或细节信息。

libaom相关源码分析

色度的预测模式一共有 14 种，而 CfL 模式是序号 13，在 enums.h 种定义；
对外编码参数控制开关： unsigned int disable_cfl，在 aom_encoder.h 文件中 cfg_options 结构体中声明变量；
控制逻辑函数关系：
is_cfl_allowed 函数

该函数判断当前块是否可以使用 CfL 模式，在无损编码模式下，CfL只允许用于4x4的色度块。在有损编码模式下，CfL允许用于亮度分区大小不超过32x32的块。

// Can we use CfL for the current block?
static INLINE CFL_ALLOWED_TYPE is_cfl_allowed(const MACROBLOCKD *xd) {
  const MB_MODE_INFO *mbmi = xd->mi[0];
  const BLOCK_SIZE bsize = mbmi->bsize;
  assert(bsize < BLOCK_SIZES_ALL);
  if (xd->lossless[mbmi->segment_id]) {
    // In lossless, CfL is available when the partition size is equal to the
    // transform size.
    const int ssx = xd->plane[AOM_PLANE_U].subsampling_x;
    const int ssy = xd->plane[AOM_PLANE_U].subsampling_y;
    const int plane_bsize = get_plane_block_size(bsize, ssx, ssy);
    return (CFL_ALLOWED_TYPE)(plane_bsize == BLOCK_4X4);
  }
  // Spec: CfL is available to luma partitions lesser than or equal to 32x32
  return (CFL_ALLOWED_TYPE)(block_size_wide[bsize] <= 32 &&
                            block_size_high[bsize] <= 32);
}

av1_get_tx_size 函数

该函数根据平面编号和宏块描述符来确定变换大小。在无损编码模式下，它返回 TX_4X4。对于亮度平面，它返回宏块模式信息中的变换大小。对于色度平面，它基于块大小和子采样因子来确定最大变换大小。

static INLINE TX_SIZE av1_get_adjusted_tx_size(TX_SIZE tx_size) {
  switch (tx_size) {
    case TX_64X64:
    case TX_64X32:
    case TX_32X64: return TX_32X32;
    case TX_64X16: return TX_32X16;
    case TX_16X64: return TX_16X32;
    default: return tx_size;
  }
}

static INLINE TX_SIZE av1_get_max_uv_txsize(BLOCK_SIZE bsize, int subsampling_x,
                                            int subsampling_y) {
  const BLOCK_SIZE plane_bsize =
      get_plane_block_size(bsize, subsampling_x, subsampling_y);
  assert(plane_bsize < BLOCK_SIZES_ALL);
  const TX_SIZE uv_tx = max_txsize_rect_lookup[plane_bsize];
  return av1_get_adjusted_tx_size(uv_tx);
}

static INLINE TX_SIZE av1_get_tx_size(int plane, const MACROBLOCKD *xd) {
  const MB_MODE_INFO *mbmi = xd->mi[0];
  if (xd->lossless[mbmi->segment_id]) return TX_4X4;
  if (plane == 0) return mbmi->tx_size;
  const MACROBLOCKD_PLANE *pd = &xd->plane[plane];
  return av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x,
                               pd->subsampling_y);
}

cfl_rd_pick_alpha 函数

这个函数是AV1编码过程中CfL参数选择的核心，它通过估计和比较不同CfL参数的RD成本来找到最佳参数。
参数验证和初始化：
验证 cfl_search_range 是否在有效范围内。
初始化RD统计为无效。

启用DC预测数据缓存：
为了提高效率，启用CfL的DC预测数据缓存。

估计最佳CfL参数：
使用 cfl_pick_plane_parameter 函数为U和V平面选择最佳CfL参数。

处理CfL搜索范围为1的特殊情况：
如果 cfl_search_range 为1，不进行进一步的alpha细化，如果两个色度平面的CfL索引都为0，则设置无效的CfL参数并返回。

计算CfL模式的RD成本：
如果RD成本超过参考最佳RD值，则设置无效的CfL参数并返回。

计算每个色度平面的RD成本：
使用 cfl_pick_plane_rd 函数计算每个色度平面的RD成本。

清除DC预测数据缓存标志：
清除CfL的DC预测数据缓存标志，以避免意外使用缓存的DC预测数据。

遍历所有可能的CfL参数组合：
遍历所有可能的CfL参数组合，计算联合RD成本，并更新最佳RD统计和最佳CfL参数。

返回结果：
如果最佳RD统计的RD成本不小于参考最佳RD值，则设置无效的CfL参数并返回0；否则返回1。

/*!\brief Pick the optimal parameters for Chroma to Luma (CFL) component
 *
 * \ingroup intra_mode_search
 * \callergraph
 *
 * This function will use DCT_DCT followed by computing SATD (sum of absolute
 * transformed differences) to estimate the RD score and find the best possible
 * CFL parameter.
 *
 * Then the function will apply a full RD search near the best possible CFL
 * parameter to find the best actual CFL parameter.
 *
 * Side effect:
 * We use ths buffers in x->plane[] and xd->plane[] as throw-away buffers for RD
 * search.
 *
 * \param[in] x                Encoder prediction block structure.
 * \param[in] cpi              Top-level encoder instance structure.
 * \param[in] tx_size          Transform size.
 * \param[in] ref_best_rd      Reference best RD.
 * \param[in] cfl_search_range The search range of full RD search near the
 *                             estimated best CFL parameter.
 *
 * \param[out]   best_rd_stats          RD stats of the best CFL parameter
 * \param[out]   best_cfl_alpha_idx     Best CFL alpha index
 * \param[out]   best_cfl_alpha_signs   Best CFL joint signs
 *
 */
static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi,
                             TX_SIZE tx_size, int64_t ref_best_rd,
                             int cfl_search_range, RD_STATS *best_rd_stats,
                             uint8_t *best_cfl_alpha_idx,
                             int8_t *best_cfl_alpha_signs) {
  assert(cfl_search_range >= 1 && cfl_search_range <= CFL_MAGS_SIZE);
  const ModeCosts *mode_costs = &x->mode_costs;
  RD_STATS cfl_rd_arr_u[CFL_MAGS_SIZE];
  RD_STATS cfl_rd_arr_v[CFL_MAGS_SIZE];
  MACROBLOCKD *const xd = &x->e_mbd;
  int est_best_cfl_idx_u, est_best_cfl_idx_v;

  av1_invalid_rd_stats(best_rd_stats);

  // As the dc pred data is same for different values of alpha, enable the
  // caching of dc pred data. Call clear_cfl_dc_pred_cache_flags() before
  // returning to avoid the unintentional usage of cached dc pred data.
  xd->cfl.use_dc_pred_cache = true;
  // Evaluate alpha parameter of each chroma plane.
  est_best_cfl_idx_u =
      cfl_pick_plane_parameter(cpi, x, 1, tx_size, cfl_search_range);
  est_best_cfl_idx_v =
      cfl_pick_plane_parameter(cpi, x, 2, tx_size, cfl_search_range);

  if (cfl_search_range == 1) {
    // For cfl_search_range=1, further refinement of alpha is not enabled. Hence
    // CfL index=0 for both the chroma planes implies invalid CfL mode.
    if (est_best_cfl_idx_u == CFL_INDEX_ZERO &&
        est_best_cfl_idx_v == CFL_INDEX_ZERO) {
      set_invalid_cfl_parameters(best_cfl_alpha_idx, best_cfl_alpha_signs);
      clear_cfl_dc_pred_cache_flags(&xd->cfl);
      return 0;
    }

    int cfl_alpha_u, cfl_alpha_v;
    CFL_SIGN_TYPE cfl_sign_u, cfl_sign_v;
    const MB_MODE_INFO *mbmi = xd->mi[0];
    cfl_idx_to_sign_and_alpha(est_best_cfl_idx_u, &cfl_sign_u, &cfl_alpha_u);
    cfl_idx_to_sign_and_alpha(est_best_cfl_idx_v, &cfl_sign_v, &cfl_alpha_v);
    const int joint_sign = cfl_sign_u * CFL_SIGNS + cfl_sign_v - 1;
    // Compute alpha and mode signaling rate.
    const int rate_overhead =
        mode_costs->cfl_cost[joint_sign][CFL_PRED_U][cfl_alpha_u] +
        mode_costs->cfl_cost[joint_sign][CFL_PRED_V][cfl_alpha_v] +
        mode_costs
            ->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][UV_CFL_PRED];
    // Skip the CfL mode evaluation if the RD cost derived using the rate needed
    // to signal the CfL mode and alpha parameter exceeds the ref_best_rd.
    if (RDCOST(x->rdmult, rate_overhead, 0) > ref_best_rd) {
      set_invalid_cfl_parameters(best_cfl_alpha_idx, best_cfl_alpha_signs);
      clear_cfl_dc_pred_cache_flags(&xd->cfl);
      return 0;
    }
  }

  // Compute the rd cost of each chroma plane using the alpha parameters which
  // were already evaluated.
  cfl_pick_plane_rd(cpi, x, 1, tx_size, cfl_search_range, cfl_rd_arr_u,
                    est_best_cfl_idx_u);
  cfl_pick_plane_rd(cpi, x, 2, tx_size, cfl_search_range, cfl_rd_arr_v,
                    est_best_cfl_idx_v);

  clear_cfl_dc_pred_cache_flags(&xd->cfl);

  for (int ui = 0; ui < CFL_MAGS_SIZE; ++ui) {
    if (cfl_rd_arr_u[ui].rate == INT_MAX) continue;
    int cfl_alpha_u;
    CFL_SIGN_TYPE cfl_sign_u;
    cfl_idx_to_sign_and_alpha(ui, &cfl_sign_u, &cfl_alpha_u);
    for (int vi = 0; vi < CFL_MAGS_SIZE; ++vi) {
      if (cfl_rd_arr_v[vi].rate == INT_MAX) continue;
      int cfl_alpha_v;
      CFL_SIGN_TYPE cfl_sign_v;
      cfl_idx_to_sign_and_alpha(vi, &cfl_sign_v, &cfl_alpha_v);
      // cfl_sign_u == CFL_SIGN_ZERO && cfl_sign_v == CFL_SIGN_ZERO is not a
      // valid parameter for CFL
      if (cfl_sign_u == CFL_SIGN_ZERO && cfl_sign_v == CFL_SIGN_ZERO) continue;
      int joint_sign = cfl_sign_u * CFL_SIGNS + cfl_sign_v - 1;
      RD_STATS rd_stats = cfl_rd_arr_u[ui];
      av1_merge_rd_stats(&rd_stats, &cfl_rd_arr_v[vi]);
      if (rd_stats.rate != INT_MAX) {
        rd_stats.rate +=
            mode_costs->cfl_cost[joint_sign][CFL_PRED_U][cfl_alpha_u];
        rd_stats.rate +=
            mode_costs->cfl_cost[joint_sign][CFL_PRED_V][cfl_alpha_v];
      }
      av1_rd_cost_update(x->rdmult, &rd_stats);
      if (rd_stats.rdcost < best_rd_stats->rdcost) {
        *best_rd_stats = rd_stats;
        *best_cfl_alpha_idx =
            (cfl_alpha_u << CFL_ALPHABET_SIZE_LOG2) + cfl_alpha_v;
        *best_cfl_alpha_signs = joint_sign;
      }
    }
  }
  if (best_rd_stats->rdcost >= ref_best_rd) {
    av1_invalid_rd_stats(best_rd_stats);
    // Set invalid CFL parameters here since the rdcost is not better than
    // ref_best_rd.
    set_invalid_cfl_parameters(best_cfl_alpha_idx, best_cfl_alpha_signs);
    return 0;
  }
  return 1;
}

cfl_pick_plane_rd 函数

该函数通过在给定的搜索范围内遍历不同的CfL参数，并计算每个参数对应的RD统计，来找到最佳的CfL参数。这个过程涉及到生成预测块（基于亮度信息和CfL参数），计算失真（通常使用SATD），以及计算编码这些预测残差所需的速率。最终，函数会选择使RD成本最小的CfL参数，以优化编码效率和图像质量。

static void cfl_pick_plane_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
                              int plane, TX_SIZE tx_size, int cfl_search_range,
                              RD_STATS cfl_rd_arr[CFL_MAGS_SIZE],
                              int est_best_cfl_idx) {
  assert(cfl_search_range >= 1 && cfl_search_range <= CFL_MAGS_SIZE);
  const MACROBLOCKD *const xd = &x->e_mbd;
  const MB_MODE_INFO *const mbmi = xd->mi[0];
  assert(mbmi->uv_mode == UV_CFL_PRED);
  const MACROBLOCKD_PLANE *pd = &xd->plane[plane];
  const BLOCK_SIZE plane_bsize =
      get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);

  for (int cfl_idx = 0; cfl_idx < CFL_MAGS_SIZE; ++cfl_idx) {
    av1_invalid_rd_stats(&cfl_rd_arr[cfl_idx]);
  }

  int fast_mode = 0;
  int start_cfl_idx = est_best_cfl_idx;
  cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, start_cfl_idx, fast_mode,
                 &cfl_rd_arr[start_cfl_idx]);

  if (cfl_search_range == 1) return;

  for (int si = 0; si < 2; ++si) {
    const int dir = cfl_dir_ls[si];
    for (int i = 1; i < cfl_search_range; ++i) {
      int cfl_idx = start_cfl_idx + dir * i;
      if (cfl_idx < 0 || cfl_idx >= CFL_MAGS_SIZE) break;
      cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, cfl_idx, fast_mode,
                     &cfl_rd_arr[cfl_idx]);
    }
  }
}

cfl_compute_rd 函数

cfl_idx_to_sign_and_alpha函数用于从CfL索引中提取CfL符号和alpha值。
mbmi->cfl_alpha_signs和mbmi->cfl_alpha_idx被设置为新的CfL符号和alpha值，以便生成预测块。
在快速模式下，使用intra_model_rd函数计算CfL成本，该函数可能使用简化的失真度量。
在非快速模式下，使用av1_txfm_rd_in_plane函数执行完整的变换、量化和失真度量，然后使用av1_rd_cost_update函数更新RD统计。
最后，函数返回计算得到的CfL成本。

static int64_t cfl_compute_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
                              int plane, TX_SIZE tx_size,
                              BLOCK_SIZE plane_bsize, int cfl_idx,
                              int fast_mode, RD_STATS *rd_stats) {
  assert(IMPLIES(fast_mode, rd_stats == NULL));
  const AV1_COMMON *const cm = &cpi->common;
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *const mbmi = xd->mi[0];
  int cfl_plane = get_cfl_pred_type(plane);
  CFL_SIGN_TYPE cfl_sign;
  int cfl_alpha;
  cfl_idx_to_sign_and_alpha(cfl_idx, &cfl_sign, &cfl_alpha);
  // We conly build CFL for a given plane, the other plane's sign is dummy
  int dummy_sign = CFL_SIGN_NEG;
  const int8_t orig_cfl_alpha_signs = mbmi->cfl_alpha_signs;
  const uint8_t orig_cfl_alpha_idx = mbmi->cfl_alpha_idx;
  mbmi->cfl_alpha_signs =
      PLANE_SIGN_TO_JOINT_SIGN(cfl_plane, cfl_sign, dummy_sign);
  mbmi->cfl_alpha_idx = (cfl_alpha << CFL_ALPHABET_SIZE_LOG2) + cfl_alpha;
  int64_t cfl_cost;
  if (fast_mode) {
    cfl_cost =
        intra_model_rd(cm, x, plane, plane_bsize, tx_size, /*use_hadamard=*/0);
  } else {
    av1_init_rd_stats(rd_stats);
    av1_txfm_rd_in_plane(x, cpi, rd_stats, INT64_MAX, 0, plane, plane_bsize,
                         tx_size, FTXS_NONE, 0);
    av1_rd_cost_update(x->rdmult, rd_stats);
    cfl_cost = rd_stats->rdcost;
  }
  mbmi->cfl_alpha_signs = orig_cfl_alpha_signs;
  mbmi->cfl_alpha_idx = orig_cfl_alpha_idx;
  return cfl_cost;
}

intra_model_rd 函数

获取宏块描述符和比特深度信息：
从宏块结构中获取宏块描述符xd。
获取比特深度信息bd_info。

参数验证和初始化：
确保当前块不是交织块。
初始化行和列的步长，这些步长基于变换大小tx_size。
计算最大块宽和高。

帧内预测：
对于每个块，执行帧内预测，生成预测块。
使用av1_predict_intra_block_facade 函数进行帧内预测。

计算预测残差：
使用av1_subtract_block函数计算预测残差，即原始块和预测块之间的差异。

快速变换：
使用av1_quick_txfm函数执行快速变换，这个函数简化了变换过程，不需要完整的量化和逆变换步骤。

计算失真：
使用aom_satd函数计算变换后的残差的绝对变换差之和（Sum of Absolute Transformed Differences，SATD），作为失真的度量。

返回RD成本：
将所有块的失真相加，得到总的失真成本，并返回。

/*!\cond */
// Makes a quick intra prediction and estimate the rdcost with a model without
// going through the whole txfm/quantize/itxfm process.
static int64_t intra_model_rd(const AV1_COMMON *cm, MACROBLOCK *const x,
                              int plane, BLOCK_SIZE plane_bsize,
                              TX_SIZE tx_size, int use_hadamard) {
  MACROBLOCKD *const xd = &x->e_mbd;
  const BitDepthInfo bd_info = get_bit_depth_info(xd);
  int row, col;
  assert(!is_inter_block(xd->mi[0]));
  const int stepr = tx_size_high_unit[tx_size];
  const int stepc = tx_size_wide_unit[tx_size];
  const int txbw = tx_size_wide[tx_size];
  const int txbh = tx_size_high[tx_size];
  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
  const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
  int64_t satd_cost = 0;
  struct macroblock_plane *p = &x->plane[plane];
  struct macroblockd_plane *pd = &xd->plane[plane];
  // Prediction.
  for (row = 0; row < max_blocks_high; row += stepr) {
    for (col = 0; col < max_blocks_wide; col += stepc) {
      av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size);
      // Here we use p->src_diff and p->coeff as temporary buffers for
      // prediction residue and transform coefficients. The buffers are only
      // used in this for loop, therefore we don't need to properly add offset
      // to the buffers.
      av1_subtract_block(
          bd_info, txbh, txbw, p->src_diff, block_size_wide[plane_bsize],
          p->src.buf + (((row * p->src.stride) + col) << 2), p->src.stride,
          pd->dst.buf + (((row * pd->dst.stride) + col) << 2), pd->dst.stride);
      av1_quick_txfm(use_hadamard, tx_size, bd_info, p->src_diff,
                     block_size_wide[plane_bsize], p->coeff);
      satd_cost += aom_satd(p->coeff, tx_size_2d[tx_size]);
    }
  }
  return satd_cost;
}