import _sortBy from 'lodash/sortBy'

import { getClipTranscriptionText } from '../metadata'
import { STOP_WORDS } from './constants'

// Matches each run of consecutive ASCII letters (a-z, A-Z). The global flag makes
// String.prototype.match return every run instead of only the first one.
const ALPHA_ONLY_REGEX = /[a-zA-Z]+/g

/**
 * Turn a text into the set of its distinct, lowercased alphabetic words,
 * leaving out every word listed in STOP_WORDS.
 *
 * A null/undefined text (or one without any alphabetic run) yields an empty set.
 *
 * @param {string} text - The input text to be normalized.
 * @returns {Set<string>} - A set containing the normalized words from the input text.
 */
function normalizeText(text) {
  // `match` returns null when nothing matches, hence the empty-array fallback
  const tokens = text?.match(ALPHA_ONLY_REGEX) || []

  const normalized = new Set()
  for (const token of tokens) {
    const lowered = token.toLowerCase()
    if (STOP_WORDS.includes(lowered)) {
      continue
    }
    normalized.add(lowered)
  }
  return normalized
}

/**
 * Compute the IoU (Intersection over Union) score of two normalized texts:
 * the number of words present in both sets divided by the number of distinct
 * words present in either set.
 *
 * @param {Set<string>} text1Norm - The first text (normalized) to compare for duplication.
 * @param {Set<string>} text2Norm - The second text (normalized) to compare for duplication.
 * @returns {number} - The IoU score, or 0 when neither text contains any word.
 */
function calculateIouScore(text1Norm, text2Norm) {
  const sharedWords = new Set()
  const allWords = new Set()

  // One pass over the first text feeds both the intersection and the union
  for (const word of text1Norm) {
    if (text2Norm.has(word)) {
      sharedWords.add(word)
    }
    allWords.add(word)
  }
  for (const word of text2Norm) {
    allWords.add(word)
  }

  // Guard against dividing by zero when both texts are empty
  if (!allWords.size) {
    return 0
  }
  return sharedWords.size / allWords.size
}

/**
 * Resolve the duplication settings that apply to a service area / channel pair.
 *
 * Settings are looked up under three keys of the configuration object and merged
 * in increasing precedence: `default`, then `sa::<serviceArea>`, then `ch::<channel>`.
 * A missing configuration object or a missing layer contributes nothing.
 *
 * @param {Object} duplicationConfig - The duplication configuration object.
 * @param {string} serviceArea - The service area for the duplication configuration.
 * @param {string} channel - The channel for the duplication configuration.
 * @returns {Object} - Merged duplication configuration settings.
 */
const readDuplicationConfig = (duplicationConfig, serviceArea, channel) => {
  // Ordered from least to most specific: later layers override earlier ones
  const layerKeys = ['default', `sa::${serviceArea}`, `ch::${channel}`]

  return layerKeys.reduce((merged, layerKey) => {
    const layer = duplicationConfig?.[layerKey] ?? {}
    return { ...merged, ...layer }
  }, {})
}

/**
 * Collect the clips that follow a given clip, fall within a time window starting at
 * that clip, and belong to a channel with the same service area.
 *
 * The scan stops at the first later clip whose time exceeds the window, so `clips`
 * is expected to be sorted by time. A clip exactly at the window's end is still included.
 *
 * @param {Object} state - The app state object
 * @param {Array<Object>} clips - The array of clips (sorted).
 * @param {number} currentClipIdx - The current clip index.
 * @param {number} thresholdSeconds - The time threshold in seconds.
 * @returns {Array<Object>} - Filtered clips occurring after the current clip within the threshold.
 */
function getClipsAfterClip(state, clips, currentClipIdx, thresholdSeconds) {
  const { channels } = state.get().radio

  const anchorClip = clips[currentClipIdx]
  const anchorServiceArea = channels[anchorClip.channel].serviceArea
  const anchorMs = new Date(anchorClip.time).getTime()
  // Round-trip through Date so the cutoff is normalized like every other clip timestamp
  const cutoffMs = new Date(anchorMs + thresholdSeconds * 1000).getTime()

  const matches = []
  for (let idx = currentClipIdx + 1; idx < clips.length; idx++) {
    const candidate = clips[idx]

    // Clips are sorted, so the first one past the cutoff ends the search
    if (new Date(candidate.time).getTime() > cutoffMs) {
      break
    }

    if (channels[candidate.channel].serviceArea !== anchorServiceArea) {
      continue
    }
    matches.push(candidate)
  }
  return matches
}

/**
 * Recompute the set of duplicate clip ids stored in the radio state.
 *
 * The previous selection is cleared first. Clips are then kept only when the duplication
 * config resolved for their channel / service area is enabled and complete, and when they
 * have a transcription. The remaining clips are sorted by time (ties broken by id); each
 * one in turn acts as the original, and the later clips returned by getClipsAfterClip for
 * that clip's own time window are compared against it using that clip's own IoU threshold.
 * Only the ids of the later, matching clips are recorded as duplicates.
 *
 * @param {Object} state - The app state object.
 * @param {Array<Object>} clips - The array of clips.
 */
export function detectAllDuplicates(state, clips) {
  const { duplicateDetectionConfig } = state.get().global
  const { channels } = state.get().radio

  // Forget whatever a previous run flagged before recomputing
  state.get().radio.duplicateClips.clear()

  // Keep the clips eligible for detection, enriched with their transcription and settings
  const eligibleClips = []
  clips.forEach((clip) => {
    const clipChannel = channels[clip.channel]
    const settings = readDuplicationConfig(
      duplicateDetectionConfig,
      clipChannel.serviceArea,
      clipChannel.id
    )
    const iouThreshold = settings.iou_threshold
    const latestClipsSeconds = settings.latest_clips_seconds
    const transcription = getClipTranscriptionText(clip)

    // A falsy threshold or window (including 0) counts as "not configured"
    const isProcessable =
      settings.is_enabled && iouThreshold && latestClipsSeconds && transcription
    if (isProcessable) {
      eligibleClips.push({ ...clip, transcription, iouThreshold, latestClipsSeconds })
    }
  })

  // Chronological order, with the clip id as a deterministic tie-breaker
  const byTimestamp = (clip) => new Date(clip.time).getTime()
  const byId = (clip) => clip.id
  const chronologicalClips = _sortBy(eligibleClips, byTimestamp, byId)

  for (const [position, clip] of chronologicalClips.entries()) {
    const followingClips = getClipsAfterClip(
      state,
      chronologicalClips,
      position,
      clip.latestClipsSeconds
    )
    for (const duplicateClip of getDuplicates(clip, followingClips, clip.iouThreshold)) {
      state.get().radio.duplicateClips.add(duplicateClip.id)
    }
  }
}

/**
 * Select, among a list of candidate clips, those whose transcription is a duplicate
 * of the current clip's transcription.
 *
 * A candidate is a duplicate when the IoU score of the two normalized transcriptions
 * is non-zero and strictly greater than the threshold. A candidate whose comparison
 * throws is logged and treated as not being a duplicate.
 *
 * @param {Object} currentClip - The current clip to compare for duplication.
 * @param {Array<Object>} latestClips - The list of latest clips.
 * @param {number} threshold - The threshold for considering two texts as duplicates.
 * @returns {Array<Object>} - The list of duplicate clips.
 */
function getDuplicates(currentClip, latestClips, threshold) {
  const hasCandidates = Boolean(latestClips?.length)
  if (!hasCandidates) {
    return []
  }

  // Normalized once and reused for every candidate
  const referenceWords = normalizeText(currentClip.transcription)

  const isDuplicateOfCurrent = (candidate) => {
    try {
      const candidateWords = normalizeText(candidate.transcription)
      const score = calculateIouScore(referenceWords, candidateWords)
      return Boolean(score) && score > threshold
    } catch (e) {
      console.error(`Error normalizing clip transcription. Error=${e}`)
      return false
    }
  }

  return latestClips.filter(isDuplicateOfCurrent)
}

// Default export bundling this module's public API; detectAllDuplicates is also
// available as a named export.
export default {
  detectAllDuplicates,
}
