import { stripHtml } from "string-strip-html";

// Transliteration table for German special characters.
// Code points for reference:
//   Ä \u00c4   ä \u00e4
//   Ö \u00d6   ö \u00f6
//   Ü \u00dc   ü \u00fc
//   ß \u00df
//
// Each entry replaces every occurrence matched by `letters` with `base`.
// Only the lowercase letters are active: tokenizeString lowercases its input
// before applying this table. The uppercase mappings are kept here, disabled,
// for reference:
//   { base: "A", letters: /[\u00c4]/g }
//   { base: "U", letters: /[\u00dc]/g }
//   { base: "O", letters: /[\u00d6]/g }
const diacriticsMap: { base: string; letters: RegExp }[] = [
  { letters: /[\u00e4]/g, base: "a" },
  { letters: /[\u00fc]/g, base: "u" },
  { letters: /[\u00f6]/g, base: "o" },
  { letters: /[\u00df]/g, base: "ss" },
];

// Shared DOM parser instance; undefined where no global DOMParser exists.
const domParser =
  typeof DOMParser !== "undefined" ? new DOMParser() : undefined;

// Word-granularity segmenter for German text; undefined when the runtime has
// no Intl object or no Intl.Segmenter implementation.
const intlSegmenter =
  typeof Intl !== "undefined" && Intl.Segmenter
    ? new Intl.Segmenter("de-DE", { granularity: "word" })
    : undefined;

// Fallback word matcher: maximal runs of ASCII word characters and German
// umlauts / sharp s. No `\b` anchors are used on purpose: `\b` is defined in
// terms of ASCII `\w`, so it does not see a boundary next to ä/ö/ü/ß and would
// cut words that start or end with one of them (e.g. "über" -> "ber").
// The greedy character class already yields whole runs on its own.
const wordMatcher = /([\w\u00c4\u00d6\u00dc\u00df\u00e4\u00f6\u00fc]+)/g;

/** Optional settings accepted by tokenizeString. */
interface TokenizeStringOptions {
  /**
   * Minimum length a token must have to be kept. tokenizeString defaults this
   * to 3; a falsy value (e.g. 0) disables the length filter.
   */
  tokenMinLength?: number;
  /**
   * Whether German umlauts and sharp s are replaced by their base letters
   * (see diacriticsMap). tokenizeString defaults this to true.
   */
  replaceDiacritics?: boolean;
}

/**
 * Reduces an HTML string to its text content.
 *
 * Uses the shared DOMParser when one exists and falls back to the
 * `string-strip-html` package otherwise. This is best effort: empty input
 * and any error raised while stripping both yield an empty string.
 *
 * NOTE(review): `textContent` concatenates text nodes without separators, so
 * text of adjacent elements may run together — confirm this is acceptable for
 * the tokenizer.
 *
 * @param text - Markup (or plain text) to strip; may be undefined.
 * @returns The extracted text, or "" when nothing could be extracted.
 */
const stripHtmlFromString = (text?: string) => {
  if (!text) {
    return "";
  }

  try {
    if (!domParser) {
      return stripHtml(text).result;
    }
    const parsedDocument = domParser.parseFromString(text, "text/html");
    return parsedDocument.body.textContent || "";
  } catch {
    // Deliberately swallow parser/stripper failures and report "no text".
    return "";
  }
};

/**
 * Turns a (possibly HTML) string into a list of distinct, lowercase tokens.
 *
 * Processing order: strip HTML, normalize to Unicode NFC, lowercase,
 * optionally replace German umlauts / sharp s via diacriticsMap, split into
 * words, drop tokens that are too short, de-duplicate.
 *
 * @param text - Raw input; empty or undefined input yields an empty array.
 * @param options - Optional settings; `tokenMinLength` defaults to 3 and
 *   `replaceDiacritics` defaults to true.
 * @returns The distinct tokens in order of first occurrence.
 */
export const tokenizeString = (
  text?: string,
  options?: TokenizeStringOptions
): string[] => {
  if (!text) {
    return [];
  }
  const { tokenMinLength = 3, replaceDiacritics = true } = options ?? {};

  // Compose to NFC first so that decomposed umlauts ("a" + U+0308) become the
  // single code points diacriticsMap matches; otherwise the same visible word
  // would produce different tokens depending on its normalization form.
  // We only want lowercase tokens.
  let normalized = stripHtmlFromString(text).normalize("NFC").toLowerCase();

  // replace diacritics
  if (replaceDiacritics) {
    for (const { letters, base } of diacriticsMap) {
      normalized = normalized.replace(letters, base);
    }
  }

  // Get tokens from the string: prefer the locale-aware segmenter and keep
  // only word-like segments; fall back to the regex matcher without it.
  const words: string[] = intlSegmenter
    ? Array.from(intlSegmenter.segment(normalized))
        .filter((part) => part.isWordLike)
        .map((part) => part.segment)
    : (normalized.match(wordMatcher) ?? []);

  // Remove tokens which are too short (a falsy minimum disables the filter).
  const longEnough = tokenMinLength
    ? words.filter((word) => word.length >= tokenMinLength)
    : words;

  // Return distinct tokens; Set preserves first-occurrence order.
  return Array.from(new Set(longEnough));
};
