/*
 * Copyright (C) 2011, Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.  Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2.  Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 */

#include "third_party/blink/renderer/modules/webaudio/wave_shaper_dsp_kernel.h"

#include <algorithm>
#include <memory>

#include "build/build_config.h"
#include "third_party/blink/renderer/platform/audio/audio_utilities.h"
#include "third_party/blink/renderer/platform/audio/vector_math.h"
#include "third_party/blink/renderer/platform/wtf/threading.h"

namespace blink {

WaveShaperDSPKernel::WaveShaperDSPKernel(WaveShaperProcessor* processor)
    : AudioDSPKernel(processor),
      tail_time_(0),
      // 4 times render size to handle 4x oversampling.
      virtual_index_(4 * audio_utilities::kRenderQuantumFrames),
      index_(4 * audio_utilities::kRenderQuantumFrames),
      v1_(4 * audio_utilities::kRenderQuantumFrames),
      v2_(4 * audio_utilities::kRenderQuantumFrames),
      f_(4 * audio_utilities::kRenderQuantumFrames) {
  if (processor->Oversample() != WaveShaperProcessor::kOverSampleNone)
    LazyInitializeOversampling();
}

void WaveShaperDSPKernel::LazyInitializeOversampling() {
  if (!temp_buffer_) {
    temp_buffer_ = std::make_unique<AudioFloatArray>(
        audio_utilities::kRenderQuantumFrames * 2);
    temp_buffer2_ = std::make_unique<AudioFloatArray>(
        audio_utilities::kRenderQuantumFrames * 4);
    up_sampler_ =
        std::make_unique<UpSampler>(audio_utilities::kRenderQuantumFrames);
    down_sampler_ = std::make_unique<DownSampler>(
        audio_utilities::kRenderQuantumFrames * 2);
    up_sampler2_ =
        std::make_unique<UpSampler>(audio_utilities::kRenderQuantumFrames * 2);
    down_sampler2_ = std::make_unique<DownSampler>(
        audio_utilities::kRenderQuantumFrames * 4);
  }
}

void WaveShaperDSPKernel::Process(const float* source,
                                  float* destination,
                                  uint32_t frames_to_process) {
  switch (GetWaveShaperProcessor()->Oversample()) {
    case WaveShaperProcessor::kOverSampleNone:
      ProcessCurve(source, destination, frames_to_process);
      break;
    case WaveShaperProcessor::kOverSample2x:
      ProcessCurve2x(source, destination, frames_to_process);
      break;
    case WaveShaperProcessor::kOverSample4x:
      ProcessCurve4x(source, destination, frames_to_process);
      break;

    default:
      NOTREACHED();
  }
}

double WaveShaperDSPKernel::WaveShaperCurveValue(float input,
                                                 const float* curve_data,
                                                 int curve_length) const {
  // Calculate a virtual index based on input -1 -> +1 with -1 being curve[0],
  // +1 being curve[curveLength - 1], and 0 being at the center of the curve
  // data. Then linearly interpolate between the two points in the curve.
  double virtual_index = 0.5 * (input + 1) * (curve_length - 1);
  double output;
  if (virtual_index < 0) {
    // input < -1, so use curve[0]
    output = curve_data[0];
  } else if (virtual_index >= curve_length - 1) {
    // input >= 1, so use last curve value
    output = curve_data[curve_length - 1];
  } else {
    // The general case where -1 <= input < 1, where 0 <= virtualIndex <
    // curveLength - 1, so interpolate between the nearest samples on the
    // curve.
    unsigned index1 = static_cast<unsigned>(virtual_index);
    unsigned index2 = index1 + 1;
    double interpolation_factor = virtual_index - index1;

    double value1 = curve_data[index1];
    double value2 = curve_data[index2];

    output =
        (1.0 - interpolation_factor) * value1 + interpolation_factor * value2;
  }

  return output;
}

void WaveShaperDSPKernel::WaveShaperCurveValues(float* destination,
                                                const float* source,
                                                uint32_t frames_to_process,
                                                const float* curve_data,
                                                int curve_length) const {
  DCHECK_LE(frames_to_process, virtual_index_.size());
  // Index into the array computed from the source value.
  float* virtual_index = virtual_index_.Data();

  // virtual_index[k] =
  //   clampTo(0.5 * (source[k] + 1) * (curve_length - 1),
  //           0.0f,
  //           static_cast<float>(curve_length - 1))

  // Add 1 to source puttting  result in virtual_index
  vector_math::Vsadd(source, 1, 1, virtual_index, 1, frames_to_process);

  // Scale virtual_index in place by (curve_lenth -1)/2
  vector_math::Vsmul(virtual_index, 1, 0.5 * (curve_length - 1), virtual_index,
                     1, frames_to_process);

  // Clip virtual_index, in place.
  vector_math::Vclip(virtual_index, 1, 0, curve_length - 1, virtual_index, 1,
                     frames_to_process);

  // index = floor(virtual_index)
  DCHECK_LE(frames_to_process, index_.size());
  float* index = index_.Data();

  // v1 and v2 hold the curve_data corresponding to the closest curve
  // values to the source sample.  To save memory, v1 will use the
  // destination array.
  DCHECK_LE(frames_to_process, v1_.size());
  DCHECK_LE(frames_to_process, v2_.size());
  float* v1 = v1_.Data();
  float* v2 = v2_.Data();

  // Interpolation factor: virtual_index - index.
  DCHECK_LE(frames_to_process, f_.size());
  float* f = f_.Data();

  int max_index = curve_length - 1;
  unsigned k = 0;
#if defined(ARCH_CPU_X86_FAMILY)
  {
    int loop_limit = frames_to_process / 4;

    // one = 1
    __m128i one = _mm_set1_epi32(1);

    // Do 4 eleemnts at a time
    for (int loop = 0; loop < loop_limit; ++loop, k += 4) {
      // v = virtual_index[k]
      __m128 v = _mm_loadu_ps(virtual_index + k);

      // index1 = static_cast<int>(v);
      __m128i index1 = _mm_cvttps_epi32(v);

      // v = static_cast<float>(index1) and save result to index[k:k+3]
      v = _mm_cvtepi32_ps(index1);
      _mm_storeu_ps(&index[k], v);

      // index2 = index2 + 1;
      __m128i index2 = _mm_add_epi32(index1, one);

      // Convert index1/index2 to arrays of 32-bit int values that are our
      // array indices to use to get the curve data.
      int32_t* i1 = reinterpret_cast<int32_t*>(&index1);
      int32_t* i2 = reinterpret_cast<int32_t*>(&index2);

      // Get the curve_data values and save them in v1 and v2,
      // carfully clamping the values.  If the input is NaN, index1
      // could be 0x8000000.
      v1[k] = curve_data[clampTo(i1[0], 0, max_index)];
      v2[k] = curve_data[clampTo(i2[0], 0, max_index)];
      v1[k + 1] = curve_data[clampTo(i1[1], 0, max_index)];
      v2[k + 1] = curve_data[clampTo(i2[1], 0, max_index)];
      v1[k + 2] = curve_data[clampTo(i1[2], 0, max_index)];
      v2[k + 2] = curve_data[clampTo(i2[2], 0, max_index)];
      v1[k + 3] = curve_data[clampTo(i1[3], 0, max_index)];
      v2[k + 3] = curve_data[clampTo(i2[3], 0, max_index)];
    }
  }
#elif defined(CPU_ARM_NEON)
  {
    int loop_limit = frames_to_process / 4;

    // Neon constants:
    //   zero = 0
    //   one  = 1
    //   max  = max_index
    int32x4_t zero = vdupq_n_s32(0);
    int32x4_t one = vdupq_n_s32(1);
    int32x4_t max = vdupq_n_s32(max_index);

    for (int loop = 0; loop < loop_limit; ++loop, k += 4) {
      // v = virtual_index
      float32x4_t v = vld1q_f32(virtual_index + k);

      // index1 = static_cast<int32_t>(v), then clamp to a valid index range for
      // curve_data
      int32x4_t index1 = vcvtq_s32_f32(v);
      index1 = vmaxq_s32(vminq_s32(index1, max), zero);

      // v = static_cast<float>(v) and save it away for later use.
      v = vcvtq_f32_s32(index1);
      vst1q_f32(&index[k], v);

      // index2 = index1 + 1, then clamp to a valid range for curve_data.
      int32x4_t index2 = vaddq_s32(index1, one);
      index2 = vmaxq_s32(vminq_s32(index2, max), zero);

      // Save index1/2 so we can get the individual parts.  Aligned to
      // 16 bytes for vst1q instruction.
      int32_t i1[4] __attribute__((aligned(16)));
      int32_t i2[4] __attribute__((aligned(16)));
      vst1q_s32(i1, index1);
      vst1q_s32(i2, index2);

      // Get curve elements corresponding to the indices.
      v1[k] = curve_data[i1[0]];
      v2[k] = curve_data[i2[0]];
      v1[k + 1] = curve_data[i1[1]];
      v2[k + 1] = curve_data[i2[1]];
      v1[k + 2] = curve_data[i1[2]];
      v2[k + 2] = curve_data[i2[2]];
      v1[k + 3] = curve_data[i1[3]];
      v2[k + 3] = curve_data[i2[3]];
    }
  }
#endif

  // Compute values for index1 and load the curve_data corresponding to indices.
  for (; k < frames_to_process; ++k) {
    unsigned index1 =
        clampTo(static_cast<unsigned>(virtual_index[k]), 0, max_index);
    unsigned index2 = clampTo(index1 + 1, 0, max_index);
    index[k] = index1;
    v1[k] = curve_data[index1];
    v2[k] = curve_data[index2];
  }

  // f[k] = virtual_index[k] - index[k]
  vector_math::Vsub(virtual_index, 1, index, 1, f, 1, frames_to_process);

  // Do the linear interpolation of the curve data:
  // destination[k] = v1[k] + f[k]*(v2[k] - v1[k])
  //
  // 1. v2[k] = v2[k] - v1[k]
  // 2. v2[k] = f[k]*v2[k] = f[k]*(v2[k] - v1[k])
  // 3. destination[k] = destination[k] + v2[k]
  //                   = v1[k] + f[k]*(v2[k] - v1[k])
  vector_math::Vsub(v2, 1, v1, 1, v2, 1, frames_to_process);
  vector_math::Vmul(f, 1, v2, 1, v2, 1, frames_to_process);
  vector_math::Vadd(v2, 1, v1, 1, destination, 1, frames_to_process);
}

void WaveShaperDSPKernel::ProcessCurve(const float* source,
                                       float* destination,
                                       uint32_t frames_to_process) {
  DCHECK(source);
  DCHECK(destination);
  DCHECK(GetWaveShaperProcessor());

  Vector<float>* curve = GetWaveShaperProcessor()->Curve();
  if (!curve) {
    // Act as "straight wire" pass-through if no curve is set.
    memcpy(destination, source, sizeof(float) * frames_to_process);
    return;
  }

  float* curve_data = curve->data();
  int curve_length = curve->size();

  DCHECK(curve_data);

  if (!curve_data || !curve_length) {
    memcpy(destination, source, sizeof(float) * frames_to_process);
    return;
  }

  // Apply waveshaping curve.
  WaveShaperCurveValues(destination, source, frames_to_process, curve_data,
                        curve_length);
}

void WaveShaperDSPKernel::ProcessCurve2x(const float* source,
                                         float* destination,
                                         uint32_t frames_to_process) {
  DCHECK_EQ(frames_to_process, audio_utilities::kRenderQuantumFrames);

  float* temp_p = temp_buffer_->Data();

  up_sampler_->Process(source, temp_p, frames_to_process);

  // Process at 2x up-sampled rate.
  ProcessCurve(temp_p, temp_p, frames_to_process * 2);

  down_sampler_->Process(temp_p, destination, frames_to_process * 2);
}

void WaveShaperDSPKernel::ProcessCurve4x(const float* source,
                                         float* destination,
                                         uint32_t frames_to_process) {
  DCHECK_EQ(frames_to_process, audio_utilities::kRenderQuantumFrames);

  float* temp_p = temp_buffer_->Data();
  float* temp_p2 = temp_buffer2_->Data();

  up_sampler_->Process(source, temp_p, frames_to_process);
  up_sampler2_->Process(temp_p, temp_p2, frames_to_process * 2);

  // Process at 4x up-sampled rate.
  ProcessCurve(temp_p2, temp_p2, frames_to_process * 4);

  down_sampler2_->Process(temp_p2, temp_p, frames_to_process * 4);
  down_sampler_->Process(temp_p, destination, frames_to_process * 2);
}

void WaveShaperDSPKernel::Reset() {
  if (up_sampler_) {
    up_sampler_->Reset();
    down_sampler_->Reset();
    up_sampler2_->Reset();
    down_sampler2_->Reset();
  }
}

bool WaveShaperDSPKernel::RequiresTailProcessing() const {
  // Always return true even if the tail time and latency might both be zero.
  return true;
}

double WaveShaperDSPKernel::TailTime() const {
  return tail_time_;
}

double WaveShaperDSPKernel::LatencyTime() const {
  size_t latency_frames = 0;
  WaveShaperDSPKernel* kernel = const_cast<WaveShaperDSPKernel*>(this);

  switch (kernel->GetWaveShaperProcessor()->Oversample()) {
    case WaveShaperProcessor::kOverSampleNone:
      break;
    case WaveShaperProcessor::kOverSample2x:
      latency_frames += up_sampler_->LatencyFrames();
      latency_frames += down_sampler_->LatencyFrames();
      break;
    case WaveShaperProcessor::kOverSample4x: {
      // Account for first stage upsampling.
      latency_frames += up_sampler_->LatencyFrames();
      latency_frames += down_sampler_->LatencyFrames();

      // Account for second stage upsampling.
      // and divide by 2 to get back down to the regular sample-rate.
      size_t latency_frames2 =
          (up_sampler2_->LatencyFrames() + down_sampler2_->LatencyFrames()) / 2;
      latency_frames += latency_frames2;
      break;
    }
    default:
      NOTREACHED();
  }

  return static_cast<double>(latency_frames) / SampleRate();
}

}  // namespace blink