trisquel-icecat/icecat/intl/components/src/String.h

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef intl_components_String_h_
#define intl_components_String_h_

#include "mozilla/Assertions.h"
#include "mozilla/Casting.h"
#include "mozilla/intl/ICU4CGlue.h"
#include "mozilla/intl/ICUError.h"
#include "mozilla/PodOperations.h"
#include "mozilla/Span.h"
#include "mozilla/Try.h"

#include "unicode/uchar.h"
#include "unicode/unorm2.h"
#include "unicode/ustring.h"
#include "unicode/utext.h"
#include "unicode/utypes.h"

namespace mozilla::intl {

/**
 * This component is a Mozilla-focused API for working with strings in
 * internationalization code.
 */
class String final {
 public:
  String() = delete;

  /**
   * Return the locale-sensitive lower case string of the input.
   */
  template <typename B>
  static Result<Ok, ICUError> ToLocaleLowerCase(const char* aLocale,
                                                Span<const char16_t> aString,
                                                B& aBuffer) {
    if (!aBuffer.reserve(aString.size())) {
      return Err(ICUError::OutOfMemory);
    }
    return FillBufferWithICUCall(
        aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) {
          return u_strToLower(target, length, aString.data(), aString.size(),
                              aLocale, status);
        });
  }

  /**
   * Return the locale-sensitive upper case string of the input.
   */
  template <typename B>
  static Result<Ok, ICUError> ToLocaleUpperCase(const char* aLocale,
                                                Span<const char16_t> aString,
                                                B& aBuffer) {
    if (!aBuffer.reserve(aString.size())) {
      return Err(ICUError::OutOfMemory);
    }
    return FillBufferWithICUCall(
        aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) {
          return u_strToUpper(target, length, aString.data(), aString.size(),
                              aLocale, status);
        });
  }

  /**
   * Normalization form constants to describe which normalization algorithm
   * should be performed.
   *
   * Also see:
   * - Unicode Standard, §2.12 Equivalent Sequences
   * - Unicode Standard, §3.11 Normalization Forms
   * - https://unicode.org/reports/tr15/
   */
  enum class NormalizationForm {
    /**
     * Normalization Form C
     */
    NFC,

    /**
     * Normalization Form D
     */
    NFD,

    /**
     * Normalization Form KC
     */
    NFKC,

    /**
     * Normalization Form KD
     */
    NFKD,
  };

  enum class AlreadyNormalized : bool { No, Yes };

  /**
   * Normalize the input string according to requested normalization form.
   *
   * Returns `AlreadyNormalized::Yes` when the string is already in normalized
   * form. The output buffer is unchanged in this case. Otherwise returns
   * `AlreadyNormalized::No` and places the normalized string into the output
   * buffer.
   */
  template <typename B>
  static Result<AlreadyNormalized, ICUError> Normalize(
      NormalizationForm aForm, Span<const char16_t> aString, B& aBuffer) {
    // The unorm2_getXXXInstance() methods return a shared instance which must
    // not be deleted.
    UErrorCode status = U_ZERO_ERROR;
    const UNormalizer2* normalizer;
    switch (aForm) {
      case NormalizationForm::NFC:
        normalizer = unorm2_getNFCInstance(&status);
        break;
      case NormalizationForm::NFD:
        normalizer = unorm2_getNFDInstance(&status);
        break;
      case NormalizationForm::NFKC:
        normalizer = unorm2_getNFKCInstance(&status);
        break;
      case NormalizationForm::NFKD:
        normalizer = unorm2_getNFKDInstance(&status);
        break;
    }
    if (U_FAILURE(status)) {
      return Err(ToICUError(status));
    }

    int32_t spanLengthInt = unorm2_spanQuickCheckYes(normalizer, aString.data(),
                                                     aString.size(), &status);
    if (U_FAILURE(status)) {
      return Err(ToICUError(status));
    }

    size_t spanLength = AssertedCast<size_t>(spanLengthInt);
    MOZ_ASSERT(spanLength <= aString.size());

    // Return if the input string is already normalized.
    if (spanLength == aString.size()) {
      return AlreadyNormalized::Yes;
    }

    if (!aBuffer.reserve(aString.size())) {
      return Err(ICUError::OutOfMemory);
    }

    // Copy the already normalized prefix.
    if (spanLength > 0) {
      PodCopy(aBuffer.data(), aString.data(), spanLength);

      aBuffer.written(spanLength);
    }

    MOZ_TRY(FillBufferWithICUCall(
        aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) {
          Span<const char16_t> remaining = aString.From(spanLength);
          return unorm2_normalizeSecondAndAppend(normalizer, target, spanLength,
                                                 length, remaining.data(),
                                                 remaining.size(), status);
        }));

    return AlreadyNormalized::No;
  }

  /**
   * Return true if the code point has the binary property "Cased".
   */
  static bool IsCased(char32_t codePoint) {
    return u_hasBinaryProperty(static_cast<UChar32>(codePoint), UCHAR_CASED);
  }

  /**
   * Return true if the code point has the binary property "Case_Ignorable".
   */
  static bool IsCaseIgnorable(char32_t codePoint) {
    return u_hasBinaryProperty(static_cast<UChar32>(codePoint),
                               UCHAR_CASE_IGNORABLE);
  }

  /**
   * Return the NFC pairwise composition of the two input characters, if any;
   * returns 0 (which we know is not a composed char!) if none exists.
   */
  static char32_t ComposePairNFC(char32_t a, char32_t b) {
    // unorm2_getNFCInstance returns a static instance that does not have to be
    // released here. If it fails, we just return 0 (no composition) always.
    static UErrorCode status = U_ZERO_ERROR;
    static const UNormalizer2* normalizer = unorm2_getNFCInstance(&status);
    if (U_FAILURE(status)) {
      return 0;
    }
    UChar32 ch = unorm2_composePair(normalizer, static_cast<UChar32>(a),
                                    static_cast<UChar32>(b));
    return ch < 0 ? 0 : static_cast<char32_t>(ch);
  }

  /**
   * Put the "raw" (single-level) canonical decomposition of the input char, if
   * any, into the provided buffer. Canonical decomps are never more than two
   * chars in length (although full normalization may result in longer output
   * due to recursion).
   * Returns the length of the decomposition (0 if none, else 1 or 2).
   */
  static int DecomposeRawNFD(char32_t ab, char32_t decomp[2]) {
    // unorm2_getNFCInstance returns a static instance that does not have to be
    // released here. If it fails, we just return 0 (no decomposition) always.
    // Although we are using it to query for a decomposition, the mode of the
    // Normalizer2 is irrelevant here, so we may as well use the same singleton
    // instance as ComposePairNFC.
    static UErrorCode status = U_ZERO_ERROR;
    static const UNormalizer2* normalizer = unorm2_getNFCInstance(&status);
    if (U_FAILURE(status)) {
      return 0;
    }

    // Canonical decompositions are never more than two Unicode characters,
    // or a maximum of 4 utf-16 code units.
    const unsigned MAX_DECOMP_LENGTH = 4;
    UErrorCode error = U_ZERO_ERROR;
    UChar decompUtf16[MAX_DECOMP_LENGTH];
    int32_t len =
        unorm2_getRawDecomposition(normalizer, static_cast<UChar32>(ab),
                                   decompUtf16, MAX_DECOMP_LENGTH, &error);
    if (U_FAILURE(error) || len < 0) {
      return 0;
    }
    UText text = UTEXT_INITIALIZER;
    utext_openUChars(&text, decompUtf16, len, &error);
    MOZ_ASSERT(U_SUCCESS(error));
    UChar32 ch = UTEXT_NEXT32(&text);
    len = 0;
    if (ch != U_SENTINEL) {
      decomp[0] = static_cast<char32_t>(ch);
      ++len;
      ch = UTEXT_NEXT32(&text);
      if (ch != U_SENTINEL) {
        decomp[1] = static_cast<char32_t>(ch);
        ++len;
      }
    }
    utext_close(&text);
    return len;
  }

  /**
   * Return the Unicode version, for example "13.0".
   */
  static Span<const char> GetUnicodeVersion();
};

}  // namespace mozilla::intl

#endif