trisquel-icecat/icecat/intl/icu/source/data/brkitr/rules/line.txt
2025-10-06 02:35:48 -06:00

406 lines
14 KiB
Text
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
#
# file: line.txt
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 (https://www.unicode.org/reports/tr14/)
# for Unicode 14.0, with the following modification:
#
# Boundaries between hyphens and following letters are suppressed when
# there is a boundary preceding the hyphen. See rule LB 20a below.
#
# This corresponds to CSS line-break=strict (BCP47 -u-lb-strict).
# It sets characters of class CJ to behave like NS.
#
# Character Classes defined by TR 14.
#
# Rule compiler options (a leading "!!" introduces an option, not a break rule).
#   !!chain                  enables chaining from one rule match into the next;
#                            several rules below depend on it (see the notes at
#                            LB 14 and LB 30a).
#   !!quoted_literals_only   literal characters in rules must be quoted or escaped
#                            (per the ICU break-rule syntax).
!!chain;
!!quoted_literals_only;
# One set variable per UAX #14 line break class, named after the class's short
# alias, followed by two helper sets ($EastAsian, $ExtPictUnassigned).
$AI = [:LineBreak = Ambiguous:];
$AK = [:LineBreak = Aksara:];
$AL = [:LineBreak = Alphabetic:];
$AP = [:LineBreak = Aksara_Prebase:];
$AS = [:LineBreak = Aksara_Start:];
$BA = [:LineBreak = Break_After:];
# $HH is named separately so that the LB 20a rules can refer to HYPHEN explicitly
# alongside $HY; the character itself is still a member of $BA.
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
$BB = [:LineBreak = Break_Before:];
$BK = [:LineBreak = Mandatory_Break:];
$B2 = [:LineBreak = Break_Both:];
$CB = [:LineBreak = Contingent_Break:];
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
$CL = [:LineBreak = Close_Punctuation:];
# $CM is intentionally not defined here; its definition appears after this list,
# where ZWJ and the SA combining marks are folded into it.
# $CM = [:LineBreak = Combining_Mark:];
$CP = [:LineBreak = Close_Parenthesis:];
$CR = [:LineBreak = Carriage_Return:];
$EB = [:LineBreak = EB:];
$EM = [:LineBreak = EM:];
$EX = [:LineBreak = Exclamation:];
$GL = [:LineBreak = Glue:];
$HL = [:LineBreak = Hebrew_Letter:];
$HY = [:LineBreak = Hyphen:];
$H2 = [:LineBreak = H2:];
$H3 = [:LineBreak = H3:];
$ID = [:LineBreak = Ideographic:];
# NOTE(review): "Inseperable" is presumably the alternate alias that the Unicode
# property data accepts for Inseparable; confirm before "correcting" the spelling.
$IN = [:LineBreak = Inseperable:];
$IS = [:LineBreak = Infix_Numeric:];
$JL = [:LineBreak = JL:];
$JV = [:LineBreak = JV:];
$JT = [:LineBreak = JT:];
$LF = [:LineBreak = Line_Feed:];
$NL = [:LineBreak = Next_Line:];
# NS includes CJ for CSS strict line breaking.
$NS = [[:LineBreak = Nonstarter:] $CJ];
$NU = [:LineBreak = Numeric:];
$OP = [:LineBreak = Open_Punctuation:];
$PO = [:LineBreak = Postfix_Numeric:];
$PR = [:LineBreak = Prefix_Numeric:];
$QU = [:LineBreak = Quotation:];
$RI = [:LineBreak = Regional_Indicator:];
$SA = [:LineBreak = Complex_Context:];
$SG = [:LineBreak = Surrogate:];
$SP = [:LineBreak = Space:];
$SY = [:LineBreak = Break_Symbols:];
$VF = [:LineBreak = Virama_Final:];
$VI = [:LineBreak = Virama:];
$WJ = [:LineBreak = Word_Joiner:];
$XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
# Characters whose East_Asian_Width is Fullwidth (F), Wide (W) or Halfwidth (H).
$EastAsian = [\p{ea=F}\p{ea=W}\p{ea=H}];
# Unassigned (Cn) code points that are Extended_Pictographic; used by LB 30b.
$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}];
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
$CM = [[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]];
# $CMX is $CM with ZWJ removed, for rules that must treat a trailing ZWJ
# differently from other combining marks (see LB 15c, LB 19).
$CMX = [[$CM] - [$ZWJ]];
# Dictionary character set, for triggering language-based break engines. Currently
# limited to LineBreak=Complex_Context (SA).
$dictionary = [$SA];
#
# Rule LB1. By default, treat
#   AI (characters with ambiguous East Asian width),
#   SA (dictionary characters, excluding those with general category Mn or Mc,
#       which are part of $CM instead),
#   SG (unpaired surrogates),
#   XX (unknown, unassigned)
# as $AL (Alphabetic). $ALPlus is the resulting "alphabetic" set used by the rules.
#
$ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
## -------------------------------------------------
#
# CAN_CM is the set of characters that may combine with CM combining chars.
# Note that Linebreak UAX 14's concept of a combining char and the rules
# for what they can combine with are _very_ different from the rest of Unicode.
#
# Note that $CM itself is left out of this set. If CM is needed as a base
# it must be listed separately in the rule.
#
# $CANT_CM is the exact complement of $CAN_CM.
# NOTE(review): no rule in this file appears to reference $CANT_CM.
#
$CAN_CM = [^$SP $BK $CR $LF $NL $ZW $CM]; # Bases that can take CMs
$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
#
# AL_FOLLOW set of chars that can unconditionally follow an AL
# Needed in rules where stand-alone $CM s are treated as AL.
# Note that only the non-East-Asian part of $OP is included.
#
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL [$OP - $EastAsian] $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
#
# Rule LB 4, 5 Mandatory (Hard) breaks.
#
# $LB4Breaks: the characters that force a break.
# $LB4NonBreaks: everything else, with $CM also excluded.
$LB4Breaks = [$BK $CR $LF $NL];
$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
# Keep CR LF together as one hard break. The {100} suffix is the rule status value
# attached to the boundary following a match; every hard-break rule here uses it
# (100 corresponds to UBRK_LINE_HARD in ICU's ubrk.h).
$CR $LF {100};
#
# LB 6 Do not break before hard line breaks.
#
# Three variants: plain preceding character (optional, so a lone hard break also
# matches), a base followed by combining marks, and a run of marks with no base.
# A leading '^' means the rule cannot be chained into from a preceding rule match
# (per ICU rule syntax), so $CM+ there has no base character before it.
$LB4NonBreaks? $LB4Breaks {100}; # LB 6 do not break before hard breaks.
$CAN_CM $CM* $LB4Breaks {100};
^$CM+ $LB4Breaks {100};
# LB 7 Do not break before spaces or zero width space.
# x SP
# x ZW
$LB4NonBreaks [$SP $ZW];
$CAN_CM $CM* [$SP $ZW];
^$CM+ [$SP $ZW];
#
# LB 8 Break after zero width space
# ZW SP* ÷
#
# $LB8Breaks / $LB8NonBreaks extend the LB 4 sets so that ZW counts as a break character.
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
# The '/' marks where the boundary is placed: after ZW SP*. The set after it is
# look-ahead context only (anything except SP, ZW and the hard-break characters).
$ZW $SP* / [^$SP $ZW $LB4Breaks];
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
# The following character is any non-CM; ZWJ followed by a CM is covered by the
# combining-mark rules, since ZWJ is itself part of $CM.
#
$ZWJ [^$CM];
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK, $CR, $LF, $NL or $ZW
# (LB 10) $CM not covered by the above needs to behave like $AL
# See definition of $CAN_CM.
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
# Run of combining marks with no base character in front of it.
^$CM+;
#
# LB 11 Do not break before or after WORD JOINER & related characters.
#
# First three rules: no break before WJ (base + marks, plain character, bare marks).
# Fourth rule: no break after WJ, whatever the next character is.
$CAN_CM $CM* $WJ;
$LB8NonBreaks $WJ;
^$CM+ $WJ;
$WJ $CM* .;
#
# LB 12 Do not break after NBSP and related characters.
# GL x
#
$GL $CM* .;
#
# LB 12a Do not break before NBSP and related characters ...
# [^SP BA HY] x GL
#
# The first rule excludes SP, BA and HY as the preceding character; the second
# covers a run of combining marks with no base.
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL;
^$CM+ $GL;
# LB 13 Don't break before ']' or '!' or '/', even after spaces.
#
# The same three-rule pattern is repeated for each of CL, CP, EX and SY:
#   any non-breaking character (which includes SP) directly before the target,
#   a base with combining marks before the target,
#   a run of combining marks with no base before the target.
#
$LB8NonBreaks $CL;
$CAN_CM $CM* $CL;
^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $CP;
$CAN_CM $CM* $CP;
^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $EX;
$CAN_CM $CM* $EX;
^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $SY;
$CAN_CM $CM* $SY;
^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
#
# LB 14 Do not break after OP, even after spaces
# Note subtle interaction with the "SP IS /" rules in LB 15c.
# This rule consumes the SP, chaining happens on the IS, effectively overriding the SP IS rules,
# which is the desired behavior.
#
$OP $CM* $SP* .;
# Variant for OP SP+ followed by combining marks that have no base: the marks are
# consumed together with one optional following character from $AL_FOLLOW.
$OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
# by rule 8, CM following a SP is stand-alone.
# LB 15a Do not break after an initial quotation mark (general category Pi,
# line break class QU), even after spaces.
# The quotation mark(s) must follow OP SP+, or OP / QU / GL directly (first pair
# of rules), or begin the match with no chaining into it (the '^' pair).
# In each pair, the second rule handles spaces followed by combining marks that
# have no base: the marks are treated as AL and may take one $AL_FOLLOW character.
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ .;
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;
^([\p{Pi} & $QU] $CM* $SP*)+ .;
^([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;
# LB 15b Do not break before a final quotation mark (general category Pf,
# line break class QU) when it is followed by one of the listed classes or by
# end of input ({eof}).
# Three variants by what precedes the quotation mark: a plain non-breaking
# character, a base with combining marks, or a run of marks with no base.
$LB8NonBreaks [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}];
$CAN_CM $CM* [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}];
^$CM+ [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}];
# Messy interaction: manually chain between LB 15b and LB 15a on Pf Pi.
# Each LB 15b prefix above is combined with both LB 15a tails (". " and
# "$SP $CM+ $AL_FOLLOW?"), giving six rules.
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .;
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .;
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .;
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?;
# LB 15c Force a break before start of a number with a leading decimal pt, e.g. " .23"
# Note: would be simpler to express as "$SP / $IS $CM* $NU;", but ICU rules have limitations.
# See issue ICU-20303
# $CanFollowIS: classes for which "SP IS <class>" is kept together by LB 15d below.
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
# Place a boundary (at '/') after "SP IS" when the next character is neither in
# $CanFollowIS, nor NU, nor CM. The second rule is the same with combining marks
# after IS, the last of which must not be ZWJ ($CMX).
$SP $IS / [^ $CanFollowIS $NU $CM];
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
#
# LB 15d Do not break before numeric separators (IS), even after spaces.
# SP IS QU is handled below as part of LB 19.
# Non-space character directly before IS.
[$LB8NonBreaks - $SP] $IS;
# SP IS followed by a $CanFollowIS character or end of input.
$SP $IS $CM* [$CanFollowIS {eof}];
# SP IS ... ZWJ followed by anything other than CM or NU.
$SP $IS $CM* $ZWJ [^$CM $NU];
$CAN_CM $CM* $IS;
^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
# LB 16 (CL | CP) SP* x NS
# Do not break between closing punctuation and a nonstarter, even with intervening spaces.
($CL | $CP) $CM* $SP* $NS;
# LB 17 B2 SP* x B2
# Do not break between two B2 characters, even with intervening spaces.
$B2 $CM* $SP* $B2;
#
# LB 18 Break after spaces.
#
# No explicit rule is needed; from here on the "non-break" set simply excludes SP.
$LB18NonBreaks = [$LB8NonBreaks - [$SP]];
$LB18Breaks = [$LB8Breaks $SP];
# LB 19 and LB 19a.
# Instead of implementing both as keep-together rules as in UAX #14, we have an
# East_Asian_Width and General_Category-insensitive keep-together rule
# equivalent to the old LB19 × QU and QU ×, and then we poke holes into it based
# on context. This avoids having to do manual chaining over multiple characters
# with many other rules over multiple characters, as a keep-together LB19a would
# overlap in context with at least LB14, LB15a, LB15b, LB15d, LB30a, and itself.
# Keep-together: × QU
$LB18NonBreaks $CM* $QU;
^$CM+ $QU;
# Hole: allow a break (at '/') before an initial quotation mark that sits between
# two East Asian characters, unless the preceding one is OP or GL. The second
# rule is the same with combining marks (not ending in ZWJ) on the preceding character.
[$LB18NonBreaks & $EastAsian - [$OP $GL]] / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
[$LB18NonBreaks & $EastAsian - [$OP $GL]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
# Keep-together: QU ×
$QU $CM* .;
# Hole: allow a break (at '/') after a final quotation mark that sits between two
# East Asian characters, unless the following one is in a class that must not be
# broken before (NS BA EX CL IN IS GL) or is a CM.
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] $CM* $CMX / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
# LB 20 Break before and after contingent-break characters.
# <break> $CB
# $CB <break>
#
# No explicit rule; the "non-break" set used from here on excludes CB.
$LB20NonBreaks = [$LB18NonBreaks - $CB];
# LB 20a Don't break between Hyphens and Letters when there is a break preceding the hyphen.
# Originally added as a Finnish tailoring, promoted to default ICU behavior (ICU-8151),
# and then to default UAX #14 behaviour (UTC-179-C32).
#
# Each rule below pairs "(HY | HYPHEN) CM* letter" with one context in which the
# hyphen is at the start of a match or follows something that an earlier rule
# keeps attached to it.
# Hyphen at the start of the match (no chaining into the rule):
^($HY | $HH) $CM* $ALPlus;
# Hyphen after GL:
$GL $CM* ($HY | $HH) $CM* $ALPlus;
# Non-breaking CB from LB8a:
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB14:
$OP $CM* $SP+ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB15a:
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
^([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB15a following LB15b:
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
# LB 21 x (BA | HY | NS)
# BB x
#
# Do not break before BA, HY or NS; do not break after BB.
$LB20NonBreaks $CM* ($BA | $HY | $NS);
^$CM+ ($BA | $HY | $NS);
$BB $CM* [^$CB]; # $BB x
$BB $CM* $LB20NonBreaks;
# LB 21a Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew
# HL (HY | BA) x [^HL]
#
# Only the non-East-Asian part of BA is accepted here. The trailing character is
# optional and excludes CB as well as HL.
$HL $CM* ($HY | [ $BA - $EastAsian ] ) $CM* [^$CB $HL]?;
# LB 21b (forward) Don't break between SY and HL
# (break between HL and SY already disallowed by LB 13 above)
$SY $CM* $HL;
# LB 22 Do not break before ellipses
# x IN
#
$LB20NonBreaks $CM* $IN;
# Run of combining marks with no base, followed by IN.
^$CM+ $IN;
# LB 23 Do not break between letters and digits.
# (AL | HL) x NU
# NU x (AL | HL)
#
($ALPlus | $HL) $CM* $NU;
^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL
$NU $CM* ($ALPlus | $HL);
# LB 23a Do not break between a numeric prefix and ideographs / emoji bases /
# emoji modifiers, nor between those and a numeric postfix.
# PR x (ID | EB | EM)
# (ID | EB | EM) x PO
#
$PR $CM* ($ID | $EB | $EM);
($ID | $EB | $EM) $CM* $PO;
#
# LB 24 Do not break between a numeric prefix/postfix and letters, in either order.
# (PR | PO) x (AL | HL)
# (AL | HL) x (PR | PO)
#
($PR | $PO) $CM* ($ALPlus | $HL);
($ALPlus | $HL) $CM* ($PR | $PO);
^$CM+ ($PR | $PO); # Rule 10, any otherwise unattached CM behaves as AL
#
# LB 25 Numbers.
#
# One rule (spanning two lines) that keeps a whole numeric expression together:
#   optional prefix/postfix (PR | PO), optional (OP | HY), optional leading IS,
#   a digit, then any run of NU / SY / IS,
#   optional closing (CL | CP), optional trailing (PR | PO).
# Combining marks are allowed after every element.
#
(($PR | $PO) $CM*)? (($OP | $HY) $CM*)? ($IS $CM*)? $NU ($CM* ($NU | $SY | $IS))*
($CM* ($CL | $CP))? ($CM* ($PR | $PO))?;
# LB 26 Do not break a Korean syllable
# JL x (JL | JV | H2 | H3)
# (JV | H2) x (JV | JT)
# (JT | H3) x JT
#
$JL $CM* ($JL | $JV | $H2 | $H3);
($JV | $H2) $CM* ($JV | $JT);
($JT | $H3) $CM* $JT;
# LB 27 Treat Korean Syllable Block the same as ID (don't break it)
# Mirrors the ID part of LB 23a for the Hangul classes: no break before PO, no break after PR.
($JL | $JV | $JT | $H2 | $H3) $CM* $PO;
$PR $CM* ($JL | $JV | $JT | $H2 | $H3);
# LB 28 Do not break between alphabetics
#
($ALPlus | $HL) $CM* ($ALPlus | $HL);
^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL
# LB 28a Do not break Orthographic syllables
# The literal [◌] is U+25CC DOTTED CIRCLE, accepted wherever AK is (and as a
# syllable start alongside AS).
# Shape of the single rule:
#   optional AP, then a start (AS | AK | ◌),
#   then any number of "VI (AK | ◌)" continuations,
#   then optionally either a trailing VI, or an optional (AS | AK | ◌) followed by VF.
# Combining marks are allowed between elements.
($AP $CM*)? ($AS | $AK | [◌] ) ($CM* $VI $CM* ($AK | [◌] ))* ($CM* $VI | (($CM* ($AS | $AK | [◌] ) )? $CM* $VF))?;
# LB 29 Do not break between a numeric separator and letters.
# IS x (AL | HL)
$IS $CM* ($ALPlus | $HL);
# LB 30 Do not break between letters or digits and non-East-Asian opening
# punctuation, nor between non-East-Asian closing parentheses and letters or digits.
# (AL | HL | NU) x [OP - EastAsian]
# [CP - EastAsian] x (AL | HL | NU)
($ALPlus | $HL | $NU) $CM* [$OP - $EastAsian];
^$CM+ [$OP - $EastAsian]; # The $CM+ is from rule 10, an unattached CM is treated as AL.
[$CP - $EastAsian] $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
# Rules 1 and 2: place a boundary (at '/') after an RI pair when the next
# character is not in one of the listed no-break-before classes. Rule 2 covers a
# pair followed by combining marks whose last mark is not ZWJ.
# Rule 3: no boundary after the pair when the next character is in one of those
# classes, is ZWJ, or the input ends.
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $ZWJ {eof}];
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
# LB 30b Do not break between an emoji base (or potential emoji) and an emoji modifier.
# "Potential emoji" here means an unassigned Extended_Pictographic code point
# ($ExtPictUnassigned).
$EB $CM* $EM;
$ExtPictUnassigned $CM* $EM;
# LB 31 Break everywhere else.
# Match a single code point if no other rule applies.
.;