/* Copyright (C) 2001-2021 Artifex Software, Inc.
   All Rights Reserved.

   This software is provided AS-IS with no warranty, either express or
   implied.

   This software is distributed under license and may not be copied,
   modified or distributed except as expressly authorized under the terms
   of the license contained in the file LICENSE in this distribution.

   Refer to licensing information at http://www.artifex.com or contact
   Artifex Software, Inc.,  1305 Grant Avenue - Suite 200, Novato,
   CA 94945, U.S.A., +1(415)492-9861, for further information.
*/


/* Testbed implementation of Even Better Screening. */

/*
 * Code in this module is covered by US Patents 5,055,942 and
 * 5,917,614, and corresponding international patents.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "evenbetter-rll.h"

/* Set this define if compiling with AltiVec optimizations. */
#define noUSE_AVEC

/* Set this define if compiling with SSE optimizations. */
#define noUSE_SSE2

#define EVENBETTER_VERSION 133

#define EVEN_SHIFT 16
#define IMO_SHIFT 14
#define EVEN_RB_CAP (1 << (EVEN_SHIFT - 2))

#define FANCY_COUPLING

#if defined(USE_AVEC) || defined(USE_SSE2)
#define USE_VECTOR
#endif

#ifdef USE_AVEC
#include "eb_avec.h"

#endif

#ifdef USE_SSE2
typedef struct _eb_ctx_sse2 eb_ctx_sse2;
typedef struct _eb_srcbuf eb_srcbuf;

int eb_test_sse2(void);
int eb_sse2_core(eb_ctx_sse2 *ctx, unsigned char **out, eb_srcbuf *in,
                 int offset);
int eb_sse2_rev_rs(eb_ctx_sse2 *ctx, int offset);
int eb_sse2_set_daz(void);
void eb_sse2_restore_daz(int save_mxcsr);

struct _eb_ctx_sse2 {
  int xs;
  int *iir_line;
  int *r_line;
  int *a_line;
  int *b_line;
  char *skip_line;
  int dummy[2];
  float *luts[4];
  float e[4];
  float e_i_1[4];
  int r[4];
  int a[4];
  int b[4];
  int ones[4];
  int twos[4];
  int aspect2[4];
  float ehi[4];
  float elo[4];
  float ohi[4];
  float r_mul[4];
  float kernel[4];
  unsigned int seed1[4];
  unsigned int seed2[4];
};

struct _eb_srcbuf {
  float im[64];
  float rb[64];
  float rs[64];
  int dummy[3];
};

#endif

typedef struct _EBPlaneCtx EBPlaneCtx;
typedef unsigned int uint32;

struct _EvenBetterCtx {
  int source_width;
  int dest_width;
  int n_planes;
  int levels; /* Number of levels on output, <= 256 */
  EBPlaneCtx **plane_ctx;
  int aspect;
  int *strengths;
  int even_elo;
  int even_ehi;
  int *c_line;

  int even_c1;
  int do_shadows;

  uint32  seed1;
  uint32  seed2;

  FILE *dump_file;
  EbDumpLevel dump_level;

#ifdef USE_SSE2
  eb_ctx_sse2 **sse2_ctx;
  int using_vectors;
#endif
#ifdef USE_AVEC
  eb_ctx_avec **avec_ctx;
  int using_vectors;
#endif
};

struct _EBPlaneCtx {
  int source_width;
  int dest_width;
  int *rb_line;
  int *iir_line;
  int *r_line;
  int *a_line;
  int *b_line;
  int *r_line_sh;
  int *a_line_sh;
  int *b_line_sh;
  int *lut;
  int *rb_lut;
  char *rs_lut;
  int *white_count_line;
};

void *
eb_malloc_aligned(int size, int align)
{
  void *result;
  void *alloced = malloc(size + align);
  int pad;

  if (alloced == 0)
    return 0;
  pad = (((int)(size_t)alloced + 12) & 15) + 4;
  result = (void *)(pad + (char *)alloced);
  ((int *)result)[-1] = pad;
  return result;
}

void
eb_free_aligned(void *p)
{
  int pad = ((int *)p)[-1];
  free((char*)p - pad);
}

static double
eb_compute_rbscale(const EvenBetterParams *params)
{
  double rbscale = params->rbscale;

  if (rbscale == 0.0)
    {
      rbscale = params->aspect == 1 ? 0.95 :
        params->aspect == 2 ? 1.8 :
        params->aspect == 4 ? 3.6 : 1;
    }
  return rbscale;
}

static int
eb_compute_randshift(int nl, int rs_base, int do_shadows, int levels)
{
  int rs = rs_base;
  if ((nl > (90 << (EVEN_SHIFT - 10)) &&
       nl < (129 << (EVEN_SHIFT - 10))) ||
      (nl > (162 << (EVEN_SHIFT - 10)) &&
       nl < (180 << (EVEN_SHIFT - 10))))
    rs--;
  else if (nl > (321 << (EVEN_SHIFT - 10)) &&
           nl < (361 << (EVEN_SHIFT - 10)))
    {
      rs--;
      if (nl > (331 << (EVEN_SHIFT - 10)) &&
          nl < (351 << (EVEN_SHIFT - 10)))
        rs--;
    }
  else if ((do_shadows ||
            nl == (levels - 1) << EVEN_SHIFT) &&
           nl > ((levels - 1) << EVEN_SHIFT) -
           (1 << (EVEN_SHIFT - 2)))
    {
      /* don't add randomness in extreme shadows */
    }
  else if ((nl > (3 << (EVEN_SHIFT - 2))))
    {
      nl -= (nl + (1 << (EVEN_SHIFT - 2))) & -(1 << (EVEN_SHIFT - 1));
      if (nl < 0) nl = -nl;
      if (nl < (1 << (EVEN_SHIFT - 4))) rs--;
      if (nl < (1 << (EVEN_SHIFT - 5))) rs--;
      if (nl < (1 << (EVEN_SHIFT - 6))) rs--;
    }
  else
    {
      if (nl < (3 << (EVEN_SHIFT - 3))) nl += 1 << (EVEN_SHIFT - 2);
      nl = nl - (1 << (EVEN_SHIFT - 1));
      if (nl < 0) nl = -nl;
      if (nl < (1 << (EVEN_SHIFT - 4))) rs--;
      if (nl < (1 << (EVEN_SHIFT - 5))) rs--;
      if (nl < (1 << (EVEN_SHIFT - 6))) rs--;
    }
  return rs;
}

#ifdef USE_SSE2
static eb_ctx_sse2 *
eb_ctx_sse2_new(const EvenBetterParams *params, int start_plane, int end_plane)
{
  int xs = params->source_width;
  int aspect2 = params->aspect * params->aspect;
  eb_ctx_sse2 *ctx;
  int i;
  double im_scale;
  float r_mul = 1.0 / (params->aspect * (1 << (6 - params->even_c1_scale)));
  double rbscale = eb_compute_rbscale(params);
  int rs_base;

  ctx = (eb_ctx_sse2 *)eb_malloc_aligned(sizeof(eb_ctx_sse2), 16);
  ctx->xs = xs;
  for (i = 0; i < 4; i++)
    {
      ctx->e[i] = 0.0;
      ctx->e_i_1[i] = 0.0;
      ctx->r[i] = 0;
      ctx->a[i] = 1;
      ctx->b[i] = aspect2;
      ctx->ones[i] = 1;
      ctx->twos[i] = 2;
      ctx->aspect2[i] = aspect2;
      ctx->ohi[i] = params->levels - 1;
      ctx->ehi[i] = 1.1;
      ctx->elo[i] = -0.1;
      ctx->r_mul[i] = r_mul;
      ctx->seed1[i] = (i << 8) + 0x7000;
      ctx->seed2[i] = (i << 16) + 0x9000;
    }
  ctx->kernel[0] = 1.0 / 16;
  ctx->kernel[1] = 3.0 / 16;
  ctx->kernel[2] = 5.0 / 16;
  ctx->kernel[3] = 7.0 / 16;

  im_scale = (params->levels - 1) * 1.0 / (1 << 24);
  rs_base = 35 - EVEN_SHIFT - params->rand_scale;

  for (i = start_plane; i < end_plane; i++)
    {
      float *lut = (float *)malloc((ET_SRC_MAX + 1) * sizeof(float) * 3);
      int j;
      ctx->luts[i - start_plane] = lut;

      for (j = 0; j < ET_SRC_MAX + 1; j++)
        {
          double g = ((1 << 24) - params->luts[i][j]) * im_scale;
          int nl, rs;

          lut[j * 3] = g;
          if (g == 0.0)
            lut[j * 3 + 1] = 0.5;
          else
            lut[j * 3 + 1] = 0.5 - r_mul * rbscale / g;

          nl = (params->levels - 1 - g) * (1 << EVEN_SHIFT);
          rs = eb_compute_randshift(nl, rs_base,
                                    params->do_shadows, params->levels);

          lut[j * 3 + 2] = 1.0 / (1 << EVEN_SHIFT) / (1 << rs);
        }
    }
  for (i = i - start_plane; i < 4; i++)
    ctx->luts[i] = NULL;

  ctx->iir_line = (int *)eb_malloc_aligned(16 * (xs + 32), 16);
  ctx->a_line = (int *)eb_malloc_aligned(16 * (xs + 32), 16);
  ctx->b_line = (int *)eb_malloc_aligned(16 * (xs + 32), 16);
  ctx->r_line = (int *)eb_malloc_aligned(16 * (xs + 32), 16);
  for (i = 0; i < (xs + 32) * 4; i++)
    {
      ((float *)ctx->iir_line)[i] = 0;
      ctx->a_line[i] = 1;
      ctx->b_line[i] = aspect2;
      ctx->r_line[i] = 0;
    }

  ctx->skip_line = (char *)malloc((xs + 15) & -16);

  return ctx;
}

static void
eb_ctx_sse2_free(eb_ctx_sse2 *ctx)
{
  int i;

  for (i = 0; i < 4; i++)
    free(ctx->luts[i]);
  eb_free_aligned(ctx->iir_line);
  eb_free_aligned(ctx->a_line);
  eb_free_aligned(ctx->b_line);
  eb_free_aligned(ctx->r_line);
  free(ctx->skip_line);
  eb_free_aligned(ctx);
}
#endif

#ifdef USE_AVEC
static eb_ctx_avec *
eb_ctx_avec_new(const EvenBetterParams *params, int start_plane, int end_plane)
{
  int xs = params->source_width;
  int aspect2 = params->aspect * params->aspect;
  eb_ctx_avec *ctx;
  int i;
  double im_scale;
  double k;
  float imscale1, imscale2, rbmul, rsbase;
  float r_mul = 1.0 / (params->aspect * (1 << (6 - params->even_c1_scale)));
  double rbscale = eb_compute_rbscale(params);
  vector unsigned int zero = vec_splat_u32(0);
  const vector float kernel = { 1.0 / 16, 3.0 / 16, 5.0 / 16, 7.0 / 16 };
  vector float almostone = { 255.0/256, 255.0/256, 255.0/256, 255.0/256 };
  int rs_base;

  ctx = (eb_ctx_avec *)eb_malloc_aligned(sizeof(eb_ctx_avec), 16);
  ctx->xs = xs;

  ctx->e = (vector float) zero;
  ctx->e_i_1 = (vector float) zero;
  ctx->r = zero;
  ctx->a = zero;
  im_scale = (params->levels - 1) * (1.0 / (1 << 24));
  rs_base = 35 - EVEN_SHIFT - params->rand_scale;

  if (params->gamma == 1.0)
    k = 0;
  else if (params->gamma == 1.8)
    k = 0.835;
  else if (params->gamma == 2.0)
    k = 1.0;
  else
    /* this shouldn't happen! */
    k = 0;

  for (;;)
    {
      vector float foff, f0, f1;

      imscale1 = (1 - k) * (params->levels - 1) * (256.0 / 255.0);
      imscale2 = k * (params->levels - 1) * sqrt(256.0 / 255.0);
      for (i = 0; i < 4; i++)
        {
          ((float *)&ctx->imscale1)[i] = imscale1;
          ((float *)&ctx->imscale2)[i] = imscale2;
        }
      f0 = vec_rsqrte(almostone);
      f0 = vec_madd(f0, almostone, (vector float)zero);
      f1 = vec_madd(f0, ctx->imscale2, (vector float)zero);
      foff = vec_madd(almostone, ctx->imscale1, f1);
      f1 = vec_nmsub(f0, ctx->imscale2, foff);
      f1 = vec_nmsub(almostone, ctx->imscale1, f1);
      if (vec_all_eq(f1, (vector float)zero))
        {
          ctx->foff = foff;
          break;
        }
      k += 1e-5;
    }
  rbmul = -r_mul * rbscale;
  rsbase = 1.0 / (1 << EVEN_SHIFT) / (1 << rs_base);
  for (i = 0; i < 4; i++)
    {
      ((int *)&ctx->b)[i] = aspect2;
      ((int *)&ctx->aspect2)[i] = aspect2;
      ((int *)&ctx->seed1)[i] = (i << 8) + 0x7000;
      ((int *)&ctx->seed2)[i] = (i << 16) + 0x9000;
      ((float *)&ctx->ohi)[i] = params->levels - 1;
      ((float *)&ctx->ehi)[i] = 1.1;
      ((float *)&ctx->elo)[i] = -0.1;
      ((float *)&ctx->r_mul)[i] = r_mul;
      ((float *)&ctx->rsbase)[i] = rsbase;
      ((float *)&ctx->rbmul)[i] = rbmul;
    }
  ctx->kernel = kernel;

  rs_base = 35 - EVEN_SHIFT - params->rand_scale;

  for (i = start_plane; i < end_plane; i++)
    {
      float *lut = (float *)malloc((ET_SRC_MAX + 1) * sizeof(float) * 3);
      int j;
      ctx->luts[i - start_plane] = lut;

      for (j = 0; j < ET_SRC_MAX + 1; j++)
        {
          double g = ((1 << 24) - params->luts[i][j]) * im_scale;
          int nl, rs;

          lut[j * 3] = g;
          if (g == 0.0)
            lut[j * 3 + 1] = 0.5;
          else
            lut[j * 3 + 1] = 0.5 - r_mul * rbscale / g;
          nl = (params->levels - 1 - g) * (1 << EVEN_SHIFT);
          rs = eb_compute_randshift(nl, rs_base,
                                    params->do_shadows, params->levels);

          lut[j * 3 + 2] = 1.0 / (1 << EVEN_SHIFT) / (1 << rs);
        }
    }
  for (i = i - start_plane; i < 4; i++)
    ctx->luts[i] = NULL;

  ctx->iir_line = (vector float *)eb_malloc_aligned(16 * (xs + 32), 16);
  ctx->a_line = (vector unsigned int *)eb_malloc_aligned(16 * (xs + 32), 16);
  ctx->b_line = (vector unsigned int *)eb_malloc_aligned(16 * (xs + 32), 16);
  ctx->r_line = (vector unsigned int *)eb_malloc_aligned(16 * (xs + 32), 16);
  for (i = 0; i < (xs + 32) * 4; i++)
    {
      ((float *)ctx->iir_line)[i] = 0;
      ((int *)ctx->a_line)[i] = 1;
      ((int *)ctx->b_line)[i] = aspect2;
      ((int *)ctx->r_line)[i] = 0;
    }

  ctx->skip_line = (char *)malloc((xs + 15) & -16);

  return ctx;
}

static void
eb_ctx_avec_free(eb_ctx_avec *ctx)
{
  int i;

  for (i = 0; i < 4; i++)
    free(ctx->luts[i]);
  eb_free_aligned(ctx->iir_line);
  eb_free_aligned(ctx->a_line);
  eb_free_aligned(ctx->b_line);
  eb_free_aligned(ctx->r_line);
  free(ctx->skip_line);
  eb_free_aligned(ctx);
}

#endif

#ifdef USE_VECTOR
static int
even_better_line_vector(EvenBetterCtx *ebc, uchar **dest,
                      const ET_Rll *const *src)
{
  int n_planes = ebc->n_planes;
  int xd = ebc->dest_width;
  int strip;
  eb_srcbuf sb_alloc;
  eb_srcbuf *srcbuf;
  uchar dummy_a[32];
  uchar *dummy_dst = (uchar *)(((int)dummy_a + 15) & -16);
#ifdef USE_SSE2
  int save_mxcsr = eb_sse2_set_daz();
#endif

  srcbuf = (eb_srcbuf *)(((int)&sb_alloc + 12) & -16);

  for (strip = 0; strip < n_planes; strip += 4)
    {
#ifdef USE_AVEC
      eb_ctx_avec *ctx = ebc->avec_ctx[strip >> 2];
#endif
#ifdef USE_SSE2
      eb_ctx_sse2 *ctx = ebc->sse2_ctx[strip >> 2];
#endif
      uchar *destbufs[4];
      const ET_Rll *const *sbuf = src + strip;
      int count[4];
      int src_idx[4];
      int plane_idx, last_plane;
      float im[4], rb[4], rs[4];
      int i;

      last_plane = n_planes - strip < 4 ? n_planes - strip : 4;
      for (plane_idx = 0; plane_idx < last_plane; plane_idx++)
        {
          count[plane_idx] = 0;
          src_idx[plane_idx] = 0;
          destbufs[plane_idx] = dest[plane_idx + strip];
        }
      for (; plane_idx < 4; plane_idx++)
        {
          int j;

          for (j = 0; j < 16; j++)
            {
              ((float *)&srcbuf->im)[j * 4 + plane_idx] = 0.0;
              ((float *)&srcbuf->rb)[j * 4 + plane_idx] = 0.0;
              ((float *)&srcbuf->rs)[j * 4 + plane_idx] = 0.0;
            }
        }
      for (i = 0; i < xd; i += 16)
        {
          int jmax = (xd - i) > 16 ? 16 : xd - i;
          int skip = 1;
          int j;

          for (plane_idx = 0; plane_idx < last_plane; plane_idx++)
            {
              if (count[plane_idx] < 16 || im[plane_idx] != 0.0)
                {
                  skip = 0;
                  break;
                }
            }
          ctx->skip_line[i >> 4] = skip;

          if (skip)
            {
              /* all white */

              for (plane_idx = 0; plane_idx < last_plane; plane_idx++)
                {
                  uchar *dst_ptr = destbufs[plane_idx];
                  if (jmax == 16)
                    {
                      ((uint32 *)dst_ptr)[(i >> 2) + 0] = 0;
                      ((uint32 *)dst_ptr)[(i >> 2) + 1] = 0;
                      ((uint32 *)dst_ptr)[(i >> 2) + 2] = 0;
                      ((uint32 *)dst_ptr)[(i >> 2) + 3] = 0;
                    }
                  else
                    {
                      for (j = 0; j < jmax; j++)
                        dst_ptr[i + j] = 0;
                    }
                  count[plane_idx] -= jmax;
                }
            }
          else
            {
              for (plane_idx = 0; plane_idx < last_plane; plane_idx++)
                {
                  const float *lut = ctx->luts[plane_idx];
                  float imp = im[plane_idx];
                  float rbp = rb[plane_idx];
                  float rsp = rs[plane_idx];
                  for (j = 0; j < jmax; j++)
                    {
                      if (count[plane_idx] == 0)
                        {
                          const ET_Rll *src_p = sbuf[plane_idx] +
                            src_idx[plane_idx]++;
                          ET_SrcPixel src_pixel = src_p->value;
                          count[plane_idx] = src_p->length;
                          imp = lut[src_pixel * 3];
                          rbp = lut[src_pixel * 3 + 1];
                          rsp = lut[src_pixel * 3 + 2];
                        }
                      ((float *)&srcbuf->im)[j * 4 + plane_idx] = imp;
                      ((float *)&srcbuf->rb)[j * 4 + plane_idx] = rbp;
                      ((float *)&srcbuf->rs)[j * 4 + plane_idx] = rsp;
                      count[plane_idx]--;
                    }
                  im[plane_idx] = imp;
                  rb[plane_idx] = rbp;
                  rs[plane_idx] = rsp;
                }
              for (; plane_idx < 4; plane_idx++)
                {
                  destbufs[plane_idx] = dummy_dst - i;
                }
#ifdef USE_AVEC
              eb_avec_core(ctx, (vector unsigned char **)destbufs, srcbuf, i);
#endif
#ifdef USE_SSE2
              eb_sse2_core(ctx, destbufs, srcbuf, i);
#endif
            }
        }

      for (i = xd & -16; i >= 0; i -= 16)
        {
          if (!ctx->skip_line[i >> 4])
            {
#ifdef USE_AVEC
              eb_avec_rev_rs(ctx, i + 15);
#endif
#ifdef USE_SSE2
              eb_sse2_rev_rs(ctx, i + 15);
#endif
            }
        }
    }
#ifdef USE_SSE2
  eb_sse2_restore_daz(save_mxcsr);
#endif
  return 0;
}
#endif

#ifdef USE_AVEC
static int
even_better_line_fastprep(EvenBetterCtx *ebc, uchar **dest,
                          const ET_SrcPixel *const *src)
{
  int n_planes = ebc->n_planes;
  int xd = ebc->dest_width;
  int strip;
  eb_srcbuf sb_alloc;
  eb_srcbuf *srcbuf;
  uchar dummy_a[32];
  uchar *dummy_dst = (uchar *)(((int)dummy_a + 15) & -16);

  srcbuf = (eb_srcbuf *)(((int)&sb_alloc + 12) & -16);

  for (strip = 0; strip < n_planes; strip += 4)
    {
#ifdef USE_AVEC
      eb_ctx_avec *ctx = ebc->avec_ctx[strip >> 2];
#endif
#ifdef USE_SSE2
      eb_ctx_sse2 *ctx = ebc->sse2_ctx[strip >> 2];
#endif
      uchar *destbufs[4];
      const ET_SrcPixel *const *sbuf = src + strip;
      int plane_idx, last_plane;
      int i;

      last_plane = n_planes - strip < 4 ? n_planes - strip : 4;
      for (plane_idx = 0; plane_idx < last_plane; plane_idx++)
        {
          destbufs[plane_idx] = dest[plane_idx + strip];
        }
      for (i = 0; i < xd; i += 16)
        {
          int noskip;
          noskip = eb_avec_prep_srcbuf(ctx, last_plane, srcbuf, sbuf, i);
          ctx->skip_line[i >> 4] = noskip;
          if (noskip)
            {
              for (plane_idx = last_plane; plane_idx < 4; plane_idx++)
                destbufs[plane_idx] = dummy_dst - i;
              eb_avec_core(ctx, (vector unsigned char **)destbufs, srcbuf, i);
            }
          else
            {
              /* all white */

              for (plane_idx = 0; plane_idx < last_plane; plane_idx++)
                {
                  uchar *dst_ptr = destbufs[plane_idx];
                  ((uint32 *)dst_ptr)[(i >> 2) + 0] = 0;
                  ((uint32 *)dst_ptr)[(i >> 2) + 1] = 0;
                  ((uint32 *)dst_ptr)[(i >> 2) + 2] = 0;
                  ((uint32 *)dst_ptr)[(i >> 2) + 3] = 0;
                }
            }
        }

      for (i = xd & -16; i >= 0; i -= 16)
        {
          if (ctx->skip_line[i >> 4])
            {
#ifdef USE_AVEC
              eb_avec_rev_rs(ctx, i + 15);
#endif
#ifdef USE_SSE2
              eb_sse2_rev_rs(ctx, i + 15);
#endif
            }
        }
    }
  return 0;
}
#endif

/* Maximum number of planes, but actually we want to dynamically
   allocate all scratch buffers that depend on this. */
#define M 16

static void
even_better_line_hi (EvenBetterCtx *ebc, uchar **dest,
                     const ET_Rll *const *src)
{
  int a[M], b[M];
  int e_1_0[M], e_m1_1[M], e_0_1[M], e_1_1[M];
  int iml[M], rbl[M];
  int i, j;
  int im;
  int *pa, *pb, *piir, *pr;
  int r[M], rg;
  int xd;
  uint32 seed1 = ebc->seed1;
  uint32 seed2 = ebc->seed2;
  uint32 sum;
  int plane_idx;
  int r_scratch[M];
  int n_planes = ebc->n_planes;
  int levels = ebc->levels;
#ifdef OLD_QUANT
  int dith_mul = levels << 8;
#else
  int dith_mul = (levels - 1) << 8;
#endif
  int imo_mul = (1 << (EVEN_SHIFT + IMO_SHIFT)) / (levels - 1);
  int aspect2 = ebc->aspect * ebc->aspect;
  int *strengths = ebc->strengths;
  int even_elo = ebc->even_elo;
  int even_ehi = ebc->even_ehi;
  int coupling;
  int *c_line = ebc->c_line;
  int even_c1 = ebc->even_c1;
  int rand_shift;
  int even_rlimit = 1 << (30 - EVEN_SHIFT + even_c1);
  int count[M], src_idx[M];
  int rs[M];

  xd = ebc->dest_width;

  memset(rbl, 0x00, M * sizeof(int));
  memset(iml, 0x00, M * sizeof(int));
  memset(rs, 0x00, M * sizeof(int));

  for (plane_idx = 0; plane_idx < n_planes; plane_idx++)
    {
      a[plane_idx] = 1;
      b[plane_idx] = aspect2;
      r[plane_idx] = 0;
      e_0_1[plane_idx] = 0;
      e_1_0[plane_idx] = 0;
      e_1_1[plane_idx] = 0;
      count[plane_idx] = 0;
      src_idx[plane_idx] = 0;
    }

  coupling = 0;

  for (i = 0; i < xd;)
    {
      int work_planes[M];
      int n_work = 0;
      int work_idx;
      int jmax;

      jmax = (xd - i) > 16 ? 16 : xd - i;

      for (plane_idx = 0; plane_idx < n_planes; plane_idx++)
        {
          EBPlaneCtx *ctx = ebc->plane_ctx[plane_idx];
          int *wcl = ctx->white_count_line;
          if (count[plane_idx] >= 16 && iml[plane_idx] == 0)
            wcl[i >> 4]++;
          else
            wcl[i >> 4] = 0;
          if (wcl[i >> 4] > 15)
            {
              uchar *dst_ptr = dest[plane_idx];
              if (jmax == 16)
                {
                  ((uint32 *)dst_ptr)[(i >> 2) + 0] = 0;
                  ((uint32 *)dst_ptr)[(i >> 2) + 1] = 0;
                  ((uint32 *)dst_ptr)[(i >> 2) + 2] = 0;
                  ((uint32 *)dst_ptr)[(i >> 2) + 3] = 0;
                }
              else
                {
                  for (j = 0; j < jmax; j++)
                    dst_ptr[i + j] = 0;
                }
              count[plane_idx] -= jmax;
            }
          else
            {
              work_planes[n_work++] = plane_idx;
            }
        }

      if (n_work == 0)
        {
          /* all planes were white */
          i += jmax;
          continue;
        }

      for (j = 0; j < jmax; j++)
        {
#ifdef FANCY_COUPLING
          coupling += c_line[i];
#else
          coupling = 0;
#endif
          /* Lookup image data and compute R for all planes. */
          for (work_idx = 0; work_idx < n_work; work_idx++)
            {
              int plane_idx = work_planes[work_idx];
              EBPlaneCtx *ctx = ebc->plane_ctx[plane_idx];
              ET_SrcPixel src_pixel;
              int new_r;

              pr = ctx->r_line;
              pa = ctx->a_line;
              pb = ctx->b_line;
              if (count[plane_idx] == 0)
                {
                  const ET_Rll *src_p = src[plane_idx] + src_idx[plane_idx]++;
                  int *lut = ctx->lut;
                  int *rblut = ctx->rb_lut;
                  char *rslut = ctx->rs_lut;

                  count[plane_idx] = src_p->length;
                  src_pixel = src_p->value;
                  iml[plane_idx] = lut[src_pixel];
                  rbl[plane_idx] = rblut[src_pixel];
                  rs[plane_idx] = rslut[src_pixel];
                }
              count[plane_idx]--;

              if (r[plane_idx] + a[plane_idx] < pr[i])
                {
                  r[plane_idx] += a[plane_idx];
                  a[plane_idx] += 2;
                }
              else
                {
                  a[plane_idx] = pa[i];
                  b[plane_idx] = pb[i];
                  r[plane_idx] = pr[i];
                }
              if (iml[plane_idx] == 0)
                {
                  r_scratch[plane_idx] = 0;
                }
              else
                {
                  int r_tmp;
                  const int r_max = 0;
                  new_r = r[plane_idx];
                  if (new_r > even_rlimit)
                    new_r = even_rlimit;
                  /* Should we store back with the limit? */

                  rg = new_r << (EVEN_SHIFT - even_c1);
                  r_tmp = rg - rbl[plane_idx];
                  if (r_tmp > r_max) r_tmp >>= 3;
                  r_scratch[plane_idx] = r_tmp;
                }
            }

          /* Dither each plane. */
          for (work_idx = 0; work_idx < n_work; work_idx++)
            {
              int plane_idx = work_planes[work_idx];
              EBPlaneCtx *ctx = ebc->plane_ctx[plane_idx];
              uchar *dst_ptr = dest[plane_idx];
              int new_e_1_0;
              int coupling_contribution;

              pr = ctx->r_line;
              pa = ctx->a_line;
              pb = ctx->b_line;
              piir = ctx->iir_line;

              im = iml[plane_idx];
              e_m1_1[plane_idx] = e_0_1[plane_idx];
              e_0_1[plane_idx] = e_1_1[plane_idx];
              e_1_1[plane_idx] = i == xd - 1 ? 0 : piir[i + 1];
              new_e_1_0 = ((e_1_0[plane_idx] * 7 + e_m1_1[plane_idx] * 3 +
                            e_0_1[plane_idx] * 5 + e_1_1[plane_idx] * 1) >> 4);
              if (im == 0)
                {
                  dst_ptr[i] = 0;
                }
              else
                {
                  int err;
                  int imo;

                  err = new_e_1_0;

                  err += r_scratch[plane_idx];

                  /* Add the two seeds together */
                  sum = seed1 + seed2;

                  /* If the add generated a carry, increment
                   * the result of the addition.
                   */
                  if (sum < seed1 || sum < seed2) sum++;

                  /* Seed2 becomes old seed1, seed1 becomes result */
                  seed2 = seed1;
                  seed1 = sum;

                  rand_shift = rs[plane_idx];
                  err -= (sum >> rand_shift) - (0x80000000 >> rand_shift);

                  if (err < even_elo)
                    err = even_elo;

                  else if (err > even_ehi)
                    err = even_ehi;

#if 1
                  err += coupling;
#endif

#ifdef OLD_QUANT
                  imo = ((err + im) * dith_mul) >> (EVEN_SHIFT + 8);
#else
                  imo = ((err + im) * dith_mul + (1 << (EVEN_SHIFT + 7))) >> (EVEN_SHIFT + 8);
#endif
                  if (imo < 0) imo = 0;
                  else if (imo > levels - 1) imo = levels - 1;
                  dst_ptr[i] = imo;
                  coupling_contribution = im - ((imo * imo_mul) >> IMO_SHIFT);
                  new_e_1_0 += coupling_contribution;
                  coupling += (coupling_contribution * strengths[plane_idx]) >> 8;
                }
              if (dst_ptr[i] != 0)
                {
                  a[plane_idx] = 1;
                  b[plane_idx] = aspect2;
                  r[plane_idx] = 0;
                }
              pa[i] = a[plane_idx];
              pb[i] = b[plane_idx];
              pr[i] = r[plane_idx];
              piir[i] = new_e_1_0;
              e_1_0[plane_idx] = new_e_1_0;
            }
#ifdef FANCY_COUPLING
          coupling = coupling >> 1;
          c_line[i] = coupling;
#endif
          i++;
        }
    }

  /* Note: this isn't white optimized, but the payoff is probably not
     that important. */
#ifdef FANCY_COUPLING
  coupling = 0;
  for (i = xd - 1; i >= 0; i--)
    {
      coupling = (coupling + c_line[i]) >> 1;
      c_line[i] = (coupling - (coupling >> 4));
    }
#endif

  /* Update distances. */
  for (plane_idx = 0; plane_idx < n_planes; plane_idx++)
    {
      EBPlaneCtx *ctx = ebc->plane_ctx[plane_idx];
      int *wcl = ctx->white_count_line;
      int av, bv, rv;
      int jmax;

      pr = ctx->r_line;
      pa = ctx->a_line;
      pb = ctx->b_line;

      av = 1;
      bv = 1;
      rv = 0;
      jmax = ((xd - 1) & 15) + 1;
      for (i = xd - 1; i >= 0;)
        {
          if (wcl[i >> 4] < 16)
            {
              for (j = 0; j < jmax; j++)
                {
                  if (rv + bv + av < pr[i] + pb[i])
                    {
                      rv += av;
                      av += 2;
                    }
                  else
                    {
                      rv = pr[i];
                      av = pa[i];
                      bv = pb[i];
                    }
                  if (rv > even_rlimit) rv = even_rlimit;
                  pa[i] = av;
                  pb[i] = bv + (aspect2 << 1);
                  pr[i] = rv + bv;
                  i--;
                }
            }
          else
            i -= jmax;
          jmax = 16;
        }
    }

   ebc->seed1 = seed1;
   ebc->seed2 = seed2;
}

static void
even_better_line_both (EvenBetterCtx *ebc, uchar **dest,
                       const ET_Rll *const *src)
{
#if 0
  int a[M], b[M];
  int a_sh[M], b_sh[M];
  int e_1_0[M], e_m1_1[M], e_0_1[M], e_1_1[M];
  int imraw[M];
  int iml[M];
  int i;
  int im;
  int *lut;
  const ET_SrcPixel *ps;
  int *pa, *pb, *piir, *pr;
  int *pa_sh, *pb_sh, *pr_sh;
  int r[M], rb, rg;
  int r_sh[M];
  int *rblut;
  int xd, xrem, xs;
  uint32 seed1 = ebc->seed1;
  uint32 seed2 = ebc->seed2;
  uint32 sum;
  int plane_idx;
  int r_scratch[M];
  int src_idx;
  int n_planes = ebc->n_planes;
  int levels = ebc->levels;
#ifdef OLD_QUANT
  int dith_mul = levels << 8;
#else
  int dith_mul = (levels - 1) << 8;
#endif
  int imo_mul = (1 << (EVEN_SHIFT + IMO_SHIFT)) / (levels - 1);
  int aspect2 = ebc->aspect * ebc->aspect;
  int *strengths = ebc->strengths;
  int even_elo= ebc->even_elo;
  int even_ehi= ebc->even_ehi;
  int coupling;
  int *c_line = ebc->c_line;
  int even_c1 = ebc->even_c1;
  int rand_shift = ebc->rand_shift;
  int even_rlimit = 1 << (30 - EVEN_SHIFT + even_c1);

  xs = ebc->source_width;
  xd = ebc->dest_width;
  xrem = xd - xs;

  for (plane_idx = 0; plane_idx < n_planes; plane_idx++)
    {
      a[plane_idx] = 1;
      b[plane_idx] = aspect2;
      a_sh[plane_idx] = 1;
      b_sh[plane_idx] = aspect2;
      r[plane_idx] = 0;
      r_sh[plane_idx] = 0;
      e_0_1[plane_idx] = 0;
      e_1_0[plane_idx] = 0;
      e_1_1[plane_idx] = 0;
    }

  coupling = 0;

  src_idx = 0;
  for (i = 0; i < xd; i++)
    {
#ifdef FANCY_COUPLING
      coupling += c_line[i];
#else
      coupling = 0;
#endif

      xrem += xs;
      if (xrem >= xd)
        {
          for (plane_idx = 0; plane_idx < n_planes; plane_idx++)
            {
              ps = src[plane_idx];
              imraw[plane_idx] = ps[src_idx];
            }
          src_idx++;
          xrem -= xd;
        }

      /* Lookup image data and compute R for all planes. */
      for (plane_idx = 0; plane_idx < n_planes; plane_idx++)
        {
          EBPlaneCtx *ctx = ebc->plane_ctx[plane_idx];
          ET_SrcPixel src_pixel;
          int new_r;

          pr = ctx->r_line;
          pa = ctx->a_line;
          pb = ctx->b_line;
          pr_sh = ctx->r_line_sh;
          pa_sh = ctx->a_line_sh;
          pb_sh = ctx->b_line_sh;
          lut = ctx->lut;
          rblut = ctx->rb_lut;
          src_pixel = imraw[plane_idx];

          im = lut[src_pixel];
          iml[plane_idx] = im;
          rb = rblut[src_pixel];
          if (r[plane_idx] + a[plane_idx] < pr[i])
            {
              r[plane_idx] += a[plane_idx];
              a[plane_idx] += 2;
            }
          else
            {
              a[plane_idx] = pa[i];
              b[plane_idx] = pb[i];
              r[plane_idx] = pr[i];
            }
          if (r_sh[plane_idx] + a_sh[plane_idx] < pr_sh[i])
            {
              r_sh[plane_idx] += a_sh[plane_idx];
              a_sh[plane_idx] += 2;
            }
          else
            {
              a_sh[plane_idx] = pa_sh[i];
              b_sh[plane_idx] = pb_sh[i];
              r_sh[plane_idx] = pr_sh[i];
            }
          if (im == 0 || im == (1 << EVEN_SHIFT))
            {
              r_scratch[plane_idx] = 0;
            }
          else
            {
              new_r = r[plane_idx];
              if (new_r > even_rlimit)
                new_r = even_rlimit;
              /* Should we store back with the limit? */
              rg = new_r << (EVEN_SHIFT - even_c1);

              new_r = r_sh[plane_idx];
              if (new_r > even_rlimit)
                new_r = even_rlimit;
              rg -= new_r << (EVEN_SHIFT - even_c1);
              r_scratch[plane_idx] = rg - rb;
            }
        }

      /* Dither each plane. */
      for (plane_idx = 0; plane_idx < n_planes; plane_idx++)
        {
          EBPlaneCtx *ctx = ebc->plane_ctx[plane_idx];
          uchar *dst_ptr = dest[plane_idx];
          int new_e_1_0;
          int coupling_contribution;

          pr = ctx->r_line;
          pa = ctx->a_line;
          pb = ctx->b_line;
          pr_sh = ctx->r_line_sh;
          pa_sh = ctx->a_line_sh;
          pb_sh = ctx->b_line_sh;
          piir = ctx->iir_line;

          im = iml[plane_idx];
          e_m1_1[plane_idx] = e_0_1[plane_idx];
          e_0_1[plane_idx] = e_1_1[plane_idx];
          e_1_1[plane_idx] = i == xd - 1 ? 0 : piir[i + 1];
          new_e_1_0 = ((e_1_0[plane_idx] * 7 + e_m1_1[plane_idx] * 3 +
                        e_0_1[plane_idx] * 5 + e_1_1[plane_idx] * 1) >> 4);
          if (im == 0)
            {
              dst_ptr[i] = 0;
            }
          else
            {
              int err;
              int imo;

              err = new_e_1_0;

              err += r_scratch[plane_idx];

              /* Add the two seeds together */
              sum = seed1 + seed2;

              /* If the add generated a carry, increment
               * the result of the addition.
               */
              if (sum < seed1 || sum < seed2) sum++;

              /* Seed2 becomes old seed1, seed1 becomes result */
              seed2 = seed1;
              seed1 = sum;

              err -= (sum >> rand_shift) - (0x80000000 >> rand_shift);

              if (err < even_elo)
                err = even_elo;

              else if (err > even_ehi)
                err = even_ehi;

#if 1
              err += coupling;
#endif

#ifdef OLD_QUANT
              imo = ((err + im) * dith_mul) >> (EVEN_SHIFT + 8);
#else
              imo = ((err + im) * dith_mul + (1 << (EVEN_SHIFT + 7))) >> (EVEN_SHIFT + 8);
#endif
              if (imo < 0) imo = 0;
              else if (imo > levels - 1) imo = levels - 1;
              dst_ptr[i] = imo;
              coupling_contribution = im - ((imo * imo_mul) >> IMO_SHIFT);
              new_e_1_0 += coupling_contribution;
              coupling += (coupling_contribution * strengths[plane_idx]) >> 8;
            }
          if (dst_ptr[i] != 0)
            {
              a[plane_idx] = 1;
              b[plane_idx] = aspect2;
              r[plane_idx] = 0;
            }
          if (dst_ptr[i] != levels - 1)
            {
              a_sh[plane_idx] = 1;
              b_sh[plane_idx] = aspect2;
              r_sh[plane_idx] = 0;
            }
          pa[i] = a[plane_idx];
          pb[i] = b[plane_idx];
          pr[i] = r[plane_idx];
          pa_sh[i] = a_sh[plane_idx];
          pb_sh[i] = b_sh[plane_idx];
          pr_sh[i] = r_sh[plane_idx];
          piir[i] = new_e_1_0;
          e_1_0[plane_idx] = new_e_1_0;
        }
#ifdef FANCY_COUPLING
      coupling = coupling >> 1;
      c_line[i] = coupling;
#endif
    }

#ifdef FANCY_COUPLING
  coupling = 0;
  for (i = xd - 1; i >= 0; i--)
    {
      if (plane_idx == 0)
        {
          coupling = (coupling + c_line[i]) >> 1;
          c_line[i] = (coupling - (coupling >> 4));
        }
    }
#endif

  /* Update distances. */
  for (plane_idx = 0; plane_idx < n_planes; plane_idx++)
    {
      EBPlaneCtx *ctx = ebc->plane_ctx[plane_idx];
      int av, bv, rv;
      int av_sh, bv_sh, rv_sh;

      pr = ctx->r_line;
      pa = ctx->a_line;
      pb = ctx->b_line;
      pr_sh = ctx->r_line_sh;
      pa_sh = ctx->a_line_sh;
      pb_sh = ctx->b_line_sh;

      av = 1;
      bv = 1;
      rv = 0;
      av_sh = 1;
      bv_sh = 1;
      rv_sh = 0;
      for (i = xd - 1; i >= 0; i--)
        {
          if (rv + bv + av < pr[i] + pb[i])
            {
              rv += av;
              av += 2;
            }
          else
            {
              rv = pr[i];
              av = pa[i];
              bv = pb[i];
            }
          if (rv > even_rlimit) rv = even_rlimit;
          pa[i] = av;
          pb[i] = bv + (aspect2 << 1);
          pr[i] = rv + bv;

          if (rv_sh + bv_sh + av_sh < pr_sh[i] + pb_sh[i])
            {
              rv_sh += av_sh;
              av_sh += 2;
            }
          else
            {
              rv_sh = pr_sh[i];
              av_sh = pa_sh[i];
              bv_sh = pb_sh[i];
            }
          if (rv_sh > even_rlimit) rv_sh = even_rlimit;
          pa_sh[i] = av_sh;
          pb_sh[i] = bv_sh + (aspect2 << 1);
          pr_sh[i] = rv_sh + bv_sh;
        }
    }

   ebc->seed1 = seed1;
   ebc->seed2 = seed2;
#endif
}

/**
 * even_better_line_rll: Screen a line using Even ToneFS screeing.
 * @ctx: An #EBPlaneCtx context.
 * @dest: Array of destination buffers, 8 bpp pixels each.
 * @src: Array of source buffers, runlength encoded.
 *
 * Screens a single line using Even ToneFS screening.
 **/
void
even_better_line_rll (EvenBetterCtx *ebc, uchar **dest,
                      const ET_Rll *const *src)
{

  if (ebc->dump_file && ebc->dump_level >= EB_DUMP_INPUT)
    {
      int i;

      /* Note: we should calculate the actual number of runlength
         codes here. As it is, it will just waste storage a bit. */
      for (i = 0; i < ebc->n_planes; i++)
        fwrite (src[i], sizeof(ET_Rll), ebc->source_width,
                ebc->dump_file);
    }
#ifdef USE_VECTOR
  if (ebc->using_vectors)
    even_better_line_vector(ebc, dest, src);
  else
#endif
  if (ebc->do_shadows)
    even_better_line_both (ebc, dest, src);
  else
    even_better_line_hi (ebc, dest, src);
  if (ebc->dump_file && ebc->dump_level >= EB_DUMP_INPUT)
    {
      int i;

      for (i = 0; i < ebc->n_planes; i++)
        fwrite (dest[i], 1, ebc->dest_width,
                ebc->dump_file);
    }
}

/**
 * even_better_compress_rll: Compress a single scan line to RLL format.
 * @dst: Destination buffer.
 * @src: Source buffer.
 * @width: Number of source pixels.
 *
 * Return value: number of runlength codes.
 **/
static int
even_better_compress_rll (ET_Rll *dst, const ET_SrcPixel *src,
                          int src_width, int dst_width)
{
  int rll_idx;
  int i;
  int count;
  ET_SrcPixel last_val;
  int whole = dst_width / src_width;
  int frac = dst_width % src_width;
  int rem;

  rll_idx = 0;
  last_val = src[0];
  count = whole;
  if (frac == 0)
    {
      for (i = 1; i < src_width; i++)
        {
          ET_SrcPixel val = src[i];

          if (count > 0xffff - whole || val != last_val)
            {
              dst[rll_idx].length = count;
              dst[rll_idx].value = last_val;
              rll_idx++;
              last_val = val;
              count = 0;
            }
          count += whole;
        }
    }
  else
    {
      rem = frac;
      for (i = 1; i < src_width; i++)
        {
          ET_SrcPixel val = src[i];

          if (count >= 0xffff - whole || val != last_val)
            {
              dst[rll_idx].length = count;
              dst[rll_idx].value = last_val;
              rll_idx++;
              last_val = val;
              count = 0;
            }
          count += whole;
          rem += frac;
          if (rem >= src_width)
            {
              count++;
              rem -= src_width;
            }
        }
    }
  dst[rll_idx].length = count;
  dst[rll_idx].value = last_val;
  rll_idx++;
  return rll_idx;
}

/**
 * even_better_line: Screen a line using Even TonenFS screeing.
 * @ctx: An #EBPlaneCtx context.
 * @dest: Array of destination buffers, 8 bpp pixels each.
 * @src: Array of source buffer, ET_SrcPixel pixels each.
 *
 * Screens a single line using Even ToneFS screening.
 **/
void
even_better_line (EvenBetterCtx *ebc, uchar **dest,
                      const ET_SrcPixel *const *src)
{
  ET_Rll *rll_buf[M];
  int i;
  int source_width = ebc->source_width;
  int dest_width = ebc->dest_width;

#ifdef USE_AVEC
  if (ebc->using_vectors == 2)
    {
      even_better_line_fastprep (ebc, dest, src);
    }
  else
#endif
    {
      for (i = 0; i < ebc->n_planes; i++)
        {
          rll_buf[i] = (ET_Rll *)malloc (source_width * sizeof(ET_Rll));
          even_better_compress_rll (rll_buf[i], src[i], source_width, dest_width);
        }
      even_better_line_rll (ebc, dest, (const ET_Rll * const *)rll_buf);
      for (i = 0; i < ebc->n_planes; i++)
        free (rll_buf[i]);
    }
}

/**
 * even_better_plane_free: Free an #EBPlaneCtx context.
 * @ctx: The #EBPlaneCtx context to free.
 *
 * Frees @ctx.
 **/
static void
even_better_plane_free (EBPlaneCtx *ctx)
{
  free (ctx->rb_line);
  free (ctx->iir_line);
  free (ctx->r_line);
  free (ctx->a_line);
  free (ctx->b_line);
  free (ctx->lut);
  free (ctx->rb_lut);
  free (ctx->rs_lut);
  free (ctx->white_count_line);
  free (ctx);
}

static int
even_log2 (int x)
{
  int y = 0;
  int z;

  for (z = x; z > 1; z = z >> 1)
    y++;
  return y;
}

/**
 * even_better_new: Create new Even ToneFS screening context.
 * @source_width: Width of source buffer.
 * @dest_width: Width of destination buffer, in pixels.
 * @lut: Lookup table for gray values.
 *
 * Creates a new context for Even ToneFS screening.
 *
 * If @dest_width is larger than @source_width, then input lines will
 * be expanded using nearest-neighbor sampling.
 *
 * @lut should be an array of 256 values, one for each possible input
 * gray value. @lut is a lookup table for gray values. Each value
 * ranges from 0 (black) to 2^24 (white).
 *
 * Return value: The new #EBPlaneCtx context.
 **/
static EBPlaneCtx *
even_better_plane_new (const EvenBetterParams *params, EvenBetterCtx *ebc,
                       int plane_idx)
{
  int source_width = params->source_width;
  int dest_width = params->dest_width;
  int *lut = params->luts[plane_idx];
  EBPlaneCtx *result;
  int i;
  int *new_lut;
  int *rb_lut;
  char *rs_lut;
  double rbscale = eb_compute_rbscale(params);
  int even_c1 = ebc->even_c1;
  int even_rlimit = 1 << (30 - EVEN_SHIFT + even_c1);
  int do_shadows = params->do_shadows;
  int log2_levels;
  int rs_base;

  result = (EBPlaneCtx *)malloc (sizeof(EBPlaneCtx));

  result->source_width = source_width;
  result->dest_width = dest_width;

  new_lut = (int *)malloc ((ET_SRC_MAX + 1) * sizeof(int));
  for (i = 0; i < ET_SRC_MAX + 1; i++)
    {
      int nli;

      if (lut == NULL)
        {
#if ET_SRC_MAX == 255
          nli = (i * 65793 + (i >> 7)) >> (24 - EVEN_SHIFT);
#else
          nli = (i * ((double) (1 << EVEN_SHIFT)) / ET_SRC_MAX) + 0.5;
#endif
        }
      else
        nli = lut[i] >> (24 - EVEN_SHIFT);
      new_lut[i] = (1 << EVEN_SHIFT) - nli;
    }

  rb_lut = (int *)malloc ((ET_SRC_MAX + 1) * sizeof(int));
  rs_lut = (char *)malloc ((ET_SRC_MAX + 1) * sizeof(int));

  log2_levels = even_log2 (params->levels);
  rs_base = 35 - EVEN_SHIFT + log2_levels - params->rand_scale;

  for (i = 0; i <= ET_SRC_MAX; i++)
    {
      double rb;
      int nl = new_lut[i] * (params->levels - 1);
      int rs;

      if (nl == 0)
        rb = 0;
      else
        {
          rb = (rbscale * (1 << (2 * EVEN_SHIFT - even_c1))) / nl;
          if (rb > even_rlimit << (EVEN_SHIFT - even_c1))
            rb = even_rlimit << (EVEN_SHIFT - even_c1);
        }

      rs = eb_compute_randshift(nl, rs_base, do_shadows, params->levels);
      rs_lut[i] = rs;

      if (params->do_shadows)
        {
          nl = ((1 << EVEN_SHIFT) - new_lut[i]) * (params->levels - 1);

          if (nl == 0)
            rb = 0;
          else
            {
              int rb_sh;
              rb_sh = (rbscale * (1 << (2 * EVEN_SHIFT - even_c1))) / nl;
              if (rb_sh > even_rlimit << (EVEN_SHIFT - even_c1))
                rb_sh = even_rlimit << (EVEN_SHIFT - even_c1);
              rb -= rb_sh;
            }
        }
      rb_lut[i] = rb;

    }

  result->lut = new_lut;
  result->rb_lut = rb_lut;
  result->rs_lut = rs_lut;

  result->rb_line = (int *)calloc (dest_width, sizeof(int));
  result->iir_line = (int *)calloc (dest_width, sizeof(int));
  result->r_line = (int *)calloc (dest_width, sizeof(int));
  result->a_line = (int *)calloc (dest_width, sizeof(int));
  result->b_line = (int *)calloc (dest_width, sizeof(int));
  result->white_count_line = (int *)calloc ((dest_width + 15) >> 4, sizeof(int));
  if (do_shadows)
    {
      result->r_line_sh = (int *)calloc (dest_width, sizeof(int));
      result->a_line_sh = (int *)calloc (dest_width, sizeof(int));
      result->b_line_sh = (int *)calloc (dest_width, sizeof(int));
    }
  else
    {
      result->r_line_sh = NULL;
      result->a_line_sh = NULL;
      result->b_line_sh = NULL;
    }
  for (i = 0; i < dest_width; i++)
    {
      result->a_line[i] = 1;
      result->b_line[i] = 1;
      result->iir_line[i] = -((rand () & 0x7fff) << 6) >> (24 - EVEN_SHIFT);
      if (do_shadows)
        {
          result->a_line_sh[i] = 1;
          result->b_line_sh[i] = 1;
        }
    }

  return result;
}

EvenBetterCtx *
even_better_new (const EvenBetterParams *params)
{
  EvenBetterCtx *result = (EvenBetterCtx *)malloc (sizeof(EvenBetterCtx));
  int n_planes = params->n_planes;
  int i;
  int log2_levels, log2_aspect;
  int using_vectors = 0;

  if (params->dump_file)
    {
      int header[5];

      header[0] = 0x70644245;
      header[1] = 'M' * 0x1010000 + 'I' * 0x101;
      header[2] = EVENBETTER_VERSION;
      header[3] = ET_SRC_MAX;
      header[4] = sizeof(ET_SrcPixel);
      fwrite (header, sizeof(int), sizeof(header) / sizeof(header[0]),
              params->dump_file);
      if (params->dump_level >= EB_DUMP_PARAMS)
        {

          fwrite (params, 1, sizeof(EvenBetterParams), params->dump_file);
        }
      if (params->dump_level >= EB_DUMP_LUTS)
        {
          int i;
          for (i = 0; i < params->n_planes; i++)
            fwrite (params->luts[i], sizeof(int), ET_SRC_MAX + 1,
                    params->dump_file);
        }
    }

  result->source_width = params->source_width;
  result->dest_width = params->dest_width;
  result->n_planes = n_planes;
  result->levels = params->levels;

  result->aspect = params->aspect;

  result->even_ehi = 0.6 * (1 << EVEN_SHIFT) / (params->levels - 1);
  result->even_elo = -result->even_ehi;

  result->strengths = (int *)malloc (sizeof(int) * n_planes);
  memcpy (result->strengths, params->strengths,
          sizeof(int) * n_planes);

  log2_levels = even_log2 (params->levels);
  log2_aspect = even_log2 (params->aspect);
  result->even_c1 = 6 + log2_aspect + log2_levels - params->even_c1_scale;
  result->do_shadows = params->do_shadows;

  result->c_line = (int *)calloc (params->dest_width, sizeof(int));

  result->seed1 = 0x5324879f;
  result->seed2 = 0xb78d0945;

  result->dump_file = params->dump_file;
  result->dump_level = params->dump_level;

#ifdef USE_SSE2
  using_vectors = eb_test_sse2();
#endif
#ifdef USE_AVEC
  using_vectors = 1; /* todo: Altivec sensing */

  /* select fastprep */
  if (sizeof(ET_SrcPixel) == 1 && using_vectors && params->gamma != 0)
    using_vectors = 2;

#endif

#ifdef USE_VECTOR
  result->using_vectors = using_vectors;
#endif
  if (using_vectors)
    {
#ifdef USE_SSE2
      result->sse2_ctx = (eb_ctx_sse2 **)malloc(sizeof(eb_ctx_sse2 *) *
                                                ((n_planes + 3) >> 2));
      for (i = 0; i < n_planes; i += 4)
        {
          int end_plane = i + 4 < n_planes ? i + 4 : n_planes;
          result->sse2_ctx[i >> 2] = eb_ctx_sse2_new(params, i, end_plane);
        }
#endif
#ifdef USE_AVEC
      result->avec_ctx = (eb_ctx_avec **)malloc(sizeof(eb_ctx_avec *) *
                                                ((n_planes + 3) >> 2));
      for (i = 0; i < n_planes; i += 4)
        {
          int end_plane = i + 4 < n_planes ? i + 4 : n_planes;
          result->avec_ctx[i >> 2] = eb_ctx_avec_new(params, i, end_plane);
        }
#endif
      result->plane_ctx = NULL;
    }
  else
    {
      result->plane_ctx = (EBPlaneCtx **)malloc(sizeof(EBPlaneCtx *) * n_planes);
      for (i = 0; i < n_planes; i++)
        result->plane_ctx[i] = even_better_plane_new (params, result, i);
    }
  return result;
}

/**
 * even_better_free: Free an #EvenBetterCtx context.
 * @ctx: The #EvenBetterCtx context to free.
 *
 * Frees @ctx.
 **/
void
even_better_free (EvenBetterCtx *ctx)
{
  int i;
  int n_planes = ctx->n_planes;

  if (ctx->dump_file)
    fclose (ctx->dump_file);

#ifdef USE_VECTOR
  if (ctx->using_vectors)
    {
#ifdef USE_SSE2
      for (i = 0; i < n_planes; i += 4)
        eb_ctx_sse2_free(ctx->sse2_ctx[i >> 2]);
      free(ctx->sse2_ctx);
#endif
#ifdef USE_AVEC
      for (i = 0; i < n_planes; i += 4)
        eb_ctx_avec_free(ctx->avec_ctx[i >> 2]);
      free(ctx->avec_ctx);
#endif
    }
  else
#endif
    {
      for (i = 0; i < n_planes; i++)
        even_better_plane_free (ctx->plane_ctx[i]);
      free(ctx->plane_ctx);
    }
  free (ctx->strengths);
  free (ctx->c_line);

  free (ctx);
}