/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
|
|
|
#ifndef intl_components_String_h_
|
|
#define intl_components_String_h_
|
|
|
|
#include "mozilla/Assertions.h"
|
|
#include "mozilla/Casting.h"
|
|
#include "mozilla/intl/ICU4CGlue.h"
|
|
#include "mozilla/intl/ICUError.h"
|
|
#include "mozilla/PodOperations.h"
|
|
#include "mozilla/Span.h"
|
|
#include "mozilla/Try.h"
|
|
|
|
#include "unicode/uchar.h"
|
|
#include "unicode/unorm2.h"
|
|
#include "unicode/ustring.h"
|
|
#include "unicode/utext.h"
|
|
#include "unicode/utypes.h"
|
|
|
|
namespace mozilla::intl {
|
|
|
|
/**
|
|
* This component is a Mozilla-focused API for working with strings in
|
|
* internationalization code.
|
|
*/
|
|
class String final {
|
|
public:
|
|
String() = delete;
|
|
|
|
/**
|
|
* Return the locale-sensitive lower case string of the input.
|
|
*/
|
|
template <typename B>
|
|
static Result<Ok, ICUError> ToLocaleLowerCase(const char* aLocale,
|
|
Span<const char16_t> aString,
|
|
B& aBuffer) {
|
|
if (!aBuffer.reserve(aString.size())) {
|
|
return Err(ICUError::OutOfMemory);
|
|
}
|
|
return FillBufferWithICUCall(
|
|
aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) {
|
|
return u_strToLower(target, length, aString.data(), aString.size(),
|
|
aLocale, status);
|
|
});
|
|
}
|
|
|
|
/**
|
|
* Return the locale-sensitive upper case string of the input.
|
|
*/
|
|
template <typename B>
|
|
static Result<Ok, ICUError> ToLocaleUpperCase(const char* aLocale,
|
|
Span<const char16_t> aString,
|
|
B& aBuffer) {
|
|
if (!aBuffer.reserve(aString.size())) {
|
|
return Err(ICUError::OutOfMemory);
|
|
}
|
|
return FillBufferWithICUCall(
|
|
aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) {
|
|
return u_strToUpper(target, length, aString.data(), aString.size(),
|
|
aLocale, status);
|
|
});
|
|
}
|
|
|
|
/**
|
|
* Normalization form constants to describe which normalization algorithm
|
|
* should be performed.
|
|
*
|
|
* Also see:
|
|
* - Unicode Standard, §2.12 Equivalent Sequences
|
|
* - Unicode Standard, §3.11 Normalization Forms
|
|
* - https://unicode.org/reports/tr15/
|
|
*/
|
|
enum class NormalizationForm {
|
|
/**
|
|
* Normalization Form C
|
|
*/
|
|
NFC,
|
|
|
|
/**
|
|
* Normalization Form D
|
|
*/
|
|
NFD,
|
|
|
|
/**
|
|
* Normalization Form KC
|
|
*/
|
|
NFKC,
|
|
|
|
/**
|
|
* Normalization Form KD
|
|
*/
|
|
NFKD,
|
|
};
|
|
|
|
enum class AlreadyNormalized : bool { No, Yes };
|
|
|
|
/**
|
|
* Normalize the input string according to requested normalization form.
|
|
*
|
|
* Returns `AlreadyNormalized::Yes` when the string is already in normalized
|
|
* form. The output buffer is unchanged in this case. Otherwise returns
|
|
* `AlreadyNormalized::No` and places the normalized string into the output
|
|
* buffer.
|
|
*/
|
|
template <typename B>
|
|
static Result<AlreadyNormalized, ICUError> Normalize(
|
|
NormalizationForm aForm, Span<const char16_t> aString, B& aBuffer) {
|
|
// The unorm2_getXXXInstance() methods return a shared instance which must
|
|
// not be deleted.
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
const UNormalizer2* normalizer;
|
|
switch (aForm) {
|
|
case NormalizationForm::NFC:
|
|
normalizer = unorm2_getNFCInstance(&status);
|
|
break;
|
|
case NormalizationForm::NFD:
|
|
normalizer = unorm2_getNFDInstance(&status);
|
|
break;
|
|
case NormalizationForm::NFKC:
|
|
normalizer = unorm2_getNFKCInstance(&status);
|
|
break;
|
|
case NormalizationForm::NFKD:
|
|
normalizer = unorm2_getNFKDInstance(&status);
|
|
break;
|
|
}
|
|
if (U_FAILURE(status)) {
|
|
return Err(ToICUError(status));
|
|
}
|
|
|
|
int32_t spanLengthInt = unorm2_spanQuickCheckYes(normalizer, aString.data(),
|
|
aString.size(), &status);
|
|
if (U_FAILURE(status)) {
|
|
return Err(ToICUError(status));
|
|
}
|
|
|
|
size_t spanLength = AssertedCast<size_t>(spanLengthInt);
|
|
MOZ_ASSERT(spanLength <= aString.size());
|
|
|
|
// Return if the input string is already normalized.
|
|
if (spanLength == aString.size()) {
|
|
return AlreadyNormalized::Yes;
|
|
}
|
|
|
|
if (!aBuffer.reserve(aString.size())) {
|
|
return Err(ICUError::OutOfMemory);
|
|
}
|
|
|
|
// Copy the already normalized prefix.
|
|
if (spanLength > 0) {
|
|
PodCopy(aBuffer.data(), aString.data(), spanLength);
|
|
|
|
aBuffer.written(spanLength);
|
|
}
|
|
|
|
MOZ_TRY(FillBufferWithICUCall(
|
|
aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) {
|
|
Span<const char16_t> remaining = aString.From(spanLength);
|
|
return unorm2_normalizeSecondAndAppend(normalizer, target, spanLength,
|
|
length, remaining.data(),
|
|
remaining.size(), status);
|
|
}));
|
|
|
|
return AlreadyNormalized::No;
|
|
}
|
|
|
|
/**
|
|
* Return true if the code point has the binary property "Cased".
|
|
*/
|
|
static bool IsCased(char32_t codePoint) {
|
|
return u_hasBinaryProperty(static_cast<UChar32>(codePoint), UCHAR_CASED);
|
|
}
|
|
|
|
/**
|
|
* Return true if the code point has the binary property "Case_Ignorable".
|
|
*/
|
|
static bool IsCaseIgnorable(char32_t codePoint) {
|
|
return u_hasBinaryProperty(static_cast<UChar32>(codePoint),
|
|
UCHAR_CASE_IGNORABLE);
|
|
}
|
|
|
|
/**
|
|
* Return the NFC pairwise composition of the two input characters, if any;
|
|
* returns 0 (which we know is not a composed char!) if none exists.
|
|
*/
|
|
static char32_t ComposePairNFC(char32_t a, char32_t b) {
|
|
// unorm2_getNFCInstance returns a static instance that does not have to be
|
|
// released here. If it fails, we just return 0 (no composition) always.
|
|
static UErrorCode status = U_ZERO_ERROR;
|
|
static const UNormalizer2* normalizer = unorm2_getNFCInstance(&status);
|
|
if (U_FAILURE(status)) {
|
|
return 0;
|
|
}
|
|
UChar32 ch = unorm2_composePair(normalizer, static_cast<UChar32>(a),
|
|
static_cast<UChar32>(b));
|
|
return ch < 0 ? 0 : static_cast<char32_t>(ch);
|
|
}
|
|
|
|
/**
|
|
* Put the "raw" (single-level) canonical decomposition of the input char, if
|
|
* any, into the provided buffer. Canonical decomps are never more than two
|
|
* chars in length (although full normalization may result in longer output
|
|
* due to recursion).
|
|
* Returns the length of the decomposition (0 if none, else 1 or 2).
|
|
*/
|
|
static int DecomposeRawNFD(char32_t ab, char32_t decomp[2]) {
|
|
// unorm2_getNFCInstance returns a static instance that does not have to be
|
|
// released here. If it fails, we just return 0 (no decomposition) always.
|
|
// Although we are using it to query for a decomposition, the mode of the
|
|
// Normalizer2 is irrelevant here, so we may as well use the same singleton
|
|
// instance as ComposePairNFC.
|
|
static UErrorCode status = U_ZERO_ERROR;
|
|
static const UNormalizer2* normalizer = unorm2_getNFCInstance(&status);
|
|
if (U_FAILURE(status)) {
|
|
return 0;
|
|
}
|
|
|
|
// Canonical decompositions are never more than two Unicode characters,
|
|
// or a maximum of 4 utf-16 code units.
|
|
const unsigned MAX_DECOMP_LENGTH = 4;
|
|
UErrorCode error = U_ZERO_ERROR;
|
|
UChar decompUtf16[MAX_DECOMP_LENGTH];
|
|
int32_t len =
|
|
unorm2_getRawDecomposition(normalizer, static_cast<UChar32>(ab),
|
|
decompUtf16, MAX_DECOMP_LENGTH, &error);
|
|
if (U_FAILURE(error) || len < 0) {
|
|
return 0;
|
|
}
|
|
UText text = UTEXT_INITIALIZER;
|
|
utext_openUChars(&text, decompUtf16, len, &error);
|
|
MOZ_ASSERT(U_SUCCESS(error));
|
|
UChar32 ch = UTEXT_NEXT32(&text);
|
|
len = 0;
|
|
if (ch != U_SENTINEL) {
|
|
decomp[0] = static_cast<char32_t>(ch);
|
|
++len;
|
|
ch = UTEXT_NEXT32(&text);
|
|
if (ch != U_SENTINEL) {
|
|
decomp[1] = static_cast<char32_t>(ch);
|
|
++len;
|
|
}
|
|
}
|
|
utext_close(&text);
|
|
return len;
|
|
}
|
|
|
|
/**
|
|
* Return the Unicode version, for example "13.0".
|
|
*/
|
|
static Span<const char> GetUnicodeVersion();
|
|
};
|
|
|
|
} // namespace mozilla::intl
|
|
|
|
#endif
|