// trisquel-icecat/icecat/third_party/rust/regex/src/builders.rs

#![allow(warnings)]
// This module defines an internal builder that encapsulates all interaction
// with meta::Regex construction, and then 4 public API builders that wrap
// around it. The docs are essentially repeated on each of the 4 public
// builders, with tweaks to the examples as needed.
//
// The reason why there are so many builders is partially because of a misstep
// in the initial API design: the builder constructor takes in the pattern
// strings instead of using the `build` method to accept the pattern strings.
// This means `new` has a different signature for each builder. It probably
// would have been nicer to use one builder with `fn new()`, and then add
// `build(pat)` and `build_many(pats)` constructors.
//
// The other reason is because I think the `bytes` module should probably
// have its own builder type. That way, it is completely isolated from the
// top-level API.
//
// If I could do it again, I'd probably have a `regex::Builder` and a
// `regex::bytes::Builder`. Each would have `build` and `build_set` (or
// `build_many`) methods for constructing a single pattern `Regex` and a
// multi-pattern `RegexSet`, respectively.
use alloc::{
string::{String, ToString},
sync::Arc,
vec,
vec::Vec,
};
use regex_automata::{
meta, nfa::thompson::WhichCaptures, util::syntax, MatchKind,
};
use crate::error::Error;
/// A builder for constructing a `Regex`, `bytes::Regex`, `RegexSet` or a
/// `bytes::RegexSet`.
///
/// This is essentially the implementation of the four different builder types
/// in the public API: `RegexBuilder`, `bytes::RegexBuilder`, `RegexSetBuilder`
/// and `bytes::RegexSetBuilder`.
#[derive(Clone, Debug)]
struct Builder {
pats: Vec<String>,
metac: meta::Config,
syntaxc: syntax::Config,
}
impl Default for Builder {
fn default() -> Builder {
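// Default resource limits: a 10 MiB NFA size limit and a 2 MiB lazy
// DFA (hybrid) cache capacity.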
let metac = meta::Config::new()
.nfa_size_limit(Some(10 * (1 << 20)))
.hybrid_cache_capacity(2 * (1 << 20));
Builder { pats: vec![], metac, syntaxc: syntax::Config::default() }
}
}
impl Builder {
fn new<I, S>(patterns: I) -> Builder
where
S: AsRef<str>,
I: IntoIterator<Item = S>,
{
let mut b = Builder::default();
b.pats.extend(patterns.into_iter().map(|p| p.as_ref().to_string()));
b
}
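// Builds a single-pattern `Regex` over `&str`: leftmost-first match
// semantics with UTF-8 mode enabled for both the matcher and the
// parser. `build_one_bytes` below is identical except that it disables
// UTF-8 mode and produces a `bytes::Regex`.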
fn build_one_string(&self) -> Result<crate::Regex, Error> {
assert_eq!(1, self.pats.len());
let metac = self
.metac
.clone()
.match_kind(MatchKind::LeftmostFirst)
.utf8_empty(true);
let syntaxc = self.syntaxc.clone().utf8(true);
let pattern = Arc::from(self.pats[0].as_str());
meta::Builder::new()
.configure(metac)
.syntax(syntaxc)
.build(&pattern)
.map(|meta| crate::Regex { meta, pattern })
.map_err(Error::from_meta_build_error)
}
fn build_one_bytes(&self) -> Result<crate::bytes::Regex, Error> {
assert_eq!(1, self.pats.len());
let metac = self
.metac
.clone()
.match_kind(MatchKind::LeftmostFirst)
.utf8_empty(false);
let syntaxc = self.syntaxc.clone().utf8(false);
let pattern = Arc::from(self.pats[0].as_str());
meta::Builder::new()
.configure(metac)
.syntax(syntaxc)
.build(&pattern)
.map(|meta| crate::bytes::Regex { meta, pattern })
.map_err(Error::from_meta_build_error)
}
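// Builds a multi-pattern `RegexSet` over `&str`: match kind `All` with
// capture groups disabled, since a `RegexSet` only reports which
// patterns matched. `build_many_bytes` below differs only in disabling
// UTF-8 mode and producing a `bytes::RegexSet`.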
fn build_many_string(&self) -> Result<crate::RegexSet, Error> {
let metac = self
.metac
.clone()
.match_kind(MatchKind::All)
.utf8_empty(true)
.which_captures(WhichCaptures::None);
let syntaxc = self.syntaxc.clone().utf8(true);
let patterns = Arc::from(self.pats.as_slice());
meta::Builder::new()
.configure(metac)
.syntax(syntaxc)
.build_many(&patterns)
.map(|meta| crate::RegexSet { meta, patterns })
.map_err(Error::from_meta_build_error)
}
fn build_many_bytes(&self) -> Result<crate::bytes::RegexSet, Error> {
let metac = self
.metac
.clone()
.match_kind(MatchKind::All)
.utf8_empty(false)
.which_captures(WhichCaptures::None);
let syntaxc = self.syntaxc.clone().utf8(false);
let patterns = Arc::from(self.pats.as_slice());
meta::Builder::new()
.configure(metac)
.syntax(syntaxc)
.build_many(&patterns)
.map(|meta| crate::bytes::RegexSet { meta, patterns })
.map_err(Error::from_meta_build_error)
}
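// The remaining methods are thin setters: each mirrors one option on
// the public builders and simply records it on the syntax and/or meta
// config.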
fn case_insensitive(&mut self, yes: bool) -> &mut Builder {
self.syntaxc = self.syntaxc.case_insensitive(yes);
self
}
fn multi_line(&mut self, yes: bool) -> &mut Builder {
self.syntaxc = self.syntaxc.multi_line(yes);
self
}
fn dot_matches_new_line(&mut self, yes: bool) -> &mut Builder {
self.syntaxc = self.syntaxc.dot_matches_new_line(yes);
self
}
fn crlf(&mut self, yes: bool) -> &mut Builder {
self.syntaxc = self.syntaxc.crlf(yes);
self
}
fn line_terminator(&mut self, byte: u8) -> &mut Builder {
self.metac = self.metac.clone().line_terminator(byte);
self.syntaxc = self.syntaxc.line_terminator(byte);
self
}
fn swap_greed(&mut self, yes: bool) -> &mut Builder {
self.syntaxc = self.syntaxc.swap_greed(yes);
self
}
fn ignore_whitespace(&mut self, yes: bool) -> &mut Builder {
self.syntaxc = self.syntaxc.ignore_whitespace(yes);
self
}
fn unicode(&mut self, yes: bool) -> &mut Builder {
self.syntaxc = self.syntaxc.unicode(yes);
self
}
fn octal(&mut self, yes: bool) -> &mut Builder {
self.syntaxc = self.syntaxc.octal(yes);
self
}
fn size_limit(&mut self, limit: usize) -> &mut Builder {
self.metac = self.metac.clone().nfa_size_limit(Some(limit));
self
}
fn dfa_size_limit(&mut self, limit: usize) -> &mut Builder {
self.metac = self.metac.clone().hybrid_cache_capacity(limit);
self
}
fn nest_limit(&mut self, limit: u32) -> &mut Builder {
self.syntaxc = self.syntaxc.nest_limit(limit);
self
}
}
pub(crate) mod string {
use crate::{error::Error, Regex, RegexSet};
use super::Builder;
/// A configurable builder for a [`Regex`].
///
/// This builder can be used to programmatically set flags such as `i`
/// (case insensitive) and `x` (for verbose mode). This builder can also be
/// used to configure things like the line terminator and a size limit on
/// the compiled regular expression.
#[derive(Clone, Debug)]
pub struct RegexBuilder {
builder: Builder,
}
impl RegexBuilder {
/// Create a new builder with a default configuration for the given
/// pattern.
///
/// If the pattern is invalid or exceeds the configured size limits,
/// then an error will be returned when [`RegexBuilder::build`] is
/// called.
pub fn new(pattern: &str) -> RegexBuilder {
RegexBuilder { builder: Builder::new([pattern]) }
}
/// Compiles the pattern given to `RegexBuilder::new` with the
/// configuration set on this builder.
///
/// If the pattern isn't a valid regex or if a configured size limit
/// was exceeded, then an error is returned.
pub fn build(&self) -> Result<Regex, Error> {
self.builder.build_one_string()
}
/// This configures Unicode mode for the entire pattern.
///
/// Enabling Unicode mode does a number of things:
///
/// * Most fundamentally, it causes the fundamental atom of matching
/// to be a single codepoint. When Unicode mode is disabled, it's a
/// single byte. For example, when Unicode mode is enabled, `.` will
/// match `💩` once, whereas it will match 4 times when Unicode mode
/// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
/// * Case insensitive matching uses Unicode simple case folding rules.
/// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
/// available.
/// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
/// `\d`.
/// * The word boundary assertions, `\b` and `\B`, use the Unicode
/// definition of a word character.
///
/// Note that if Unicode mode is disabled, then the regex will fail to
/// compile if it could match invalid UTF-8. For example, when Unicode
/// mode is disabled, `.` matches any byte (except for `\n`), so it
/// can match invalid UTF-8 and thus building a regex
/// from it will fail. Another example is `\w` and `\W`. Since `\w` can
/// only match ASCII bytes when Unicode mode is disabled, it's allowed.
/// But `\W` can match more than ASCII bytes, including invalid UTF-8,
/// and so it is not allowed. This restriction can be lifted only by
/// using a [`bytes::Regex`](crate::bytes::Regex).
///
/// For more details on the Unicode support in this crate, see the
/// [Unicode section](crate#unicode) in this crate's top-level
/// documentation.
///
/// The default for this is `true`.
///
/// # Example
///
/// ```
/// use regex::RegexBuilder;
///
/// let re = RegexBuilder::new(r"\w")
/// .unicode(false)
/// .build()
/// .unwrap();
/// // Normally greek letters would be included in \w, but since
/// // Unicode mode is disabled, it only matches ASCII letters.
/// assert!(!re.is_match("δ"));
///
/// let re = RegexBuilder::new(r"s")
/// .case_insensitive(true)
/// .unicode(false)
/// .build()
/// .unwrap();
/// // Normally 'ſ' is included when searching for 's' case
/// // insensitively due to Unicode's simple case folding rules. But
/// // when Unicode mode is disabled, only ASCII case insensitive rules
/// // are used.
/// assert!(!re.is_match("ſ"));
/// ```
pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
self.builder.unicode(yes);
self
}
/// This configures whether to enable case insensitive matching for the
/// entire pattern.
///
/// This setting can also be configured using the inline flag `i`
/// in the pattern. For example, `(?i:foo)` matches `foo` case
/// insensitively while `(?-i:foo)` matches `foo` case sensitively.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::RegexBuilder;
///
/// let re = RegexBuilder::new(r"foo(?-i:bar)quux")
/// .case_insensitive(true)
/// .build()
/// .unwrap();
/// assert!(re.is_match("FoObarQuUx"));
/// // Even though case insensitive matching is enabled in the builder,
/// // it can be locally disabled within the pattern. In this case,
/// // `bar` is matched case sensitively.
/// assert!(!re.is_match("fooBARquux"));
/// ```
pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
self.builder.case_insensitive(yes);
self
}
/// This configures multi-line mode for the entire pattern.
///
/// Enabling multi-line mode changes the behavior of the `^` and `$`
/// anchor assertions. Instead of only matching at the beginning and
/// end of a haystack, respectively, multi-line mode causes them to
/// match at the beginning and end of a line *in addition* to the
/// beginning and end of a haystack. More precisely, `^` will match at
/// the position immediately following a `\n` and `$` will match at the
/// position immediately preceding a `\n`.
///
/// The behavior of this option can be impacted by other settings too:
///
/// * The [`RegexBuilder::line_terminator`] option changes `\n` above
/// to any ASCII byte.
/// * The [`RegexBuilder::crlf`] option changes the line terminator to
/// be either `\r` or `\n`, but never at the position between a `\r`
/// and `\n`.
///
/// This setting can also be configured using the inline flag `m` in
/// the pattern.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::RegexBuilder;
///
/// let re = RegexBuilder::new(r"^foo$")
/// .multi_line(true)
/// .build()
/// .unwrap();
/// assert_eq!(Some(1..4), re.find("\nfoo\n").map(|m| m.range()));
/// ```
pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
self.builder.multi_line(yes);
self
}
/// This configures dot-matches-new-line mode for the entire pattern.
///
/// Perhaps surprisingly, the default behavior for `.` is not to match
/// any character, but rather, to match any character except for the
/// line terminator (which is `\n` by default). When this mode is
/// enabled, the behavior changes such that `.` truly matches any
/// character.
///
/// This setting can also be configured using the inline flag `s` in
/// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
/// regexes.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::RegexBuilder;
///
/// let re = RegexBuilder::new(r"foo.bar")
/// .dot_matches_new_line(true)
/// .build()
/// .unwrap();
/// let hay = "foo\nbar";
/// assert_eq!(Some("foo\nbar"), re.find(hay).map(|m| m.as_str()));
/// ```
pub fn dot_matches_new_line(
&mut self,
yes: bool,
) -> &mut RegexBuilder {
self.builder.dot_matches_new_line(yes);
self
}
/// This configures CRLF mode for the entire pattern.
///
/// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
/// short) and `\n` ("line feed" or LF for short) are treated as line
/// terminators. This results in the following:
///
/// * Unless dot-matches-new-line mode is enabled, `.` will now match
/// any character except for `\n` and `\r`.
/// * When multi-line mode is enabled, `^` will match immediately
/// following a `\n` or a `\r`. Similarly, `$` will match immediately
/// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
/// between `\r` and `\n`.
///
/// This setting can also be configured using the inline flag `R` in
/// the pattern.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::RegexBuilder;
///
/// let re = RegexBuilder::new(r"^foo$")
/// .multi_line(true)
/// .crlf(true)
/// .build()
/// .unwrap();
/// let hay = "\r\nfoo\r\n";
/// // If CRLF mode weren't enabled here, then '$' wouldn't match
/// // immediately after 'foo', and thus no match would be found.
/// assert_eq!(Some("foo"), re.find(hay).map(|m| m.as_str()));
/// ```
///
/// This example demonstrates that `^` will never match at a position
/// between `\r` and `\n`. (`$` will similarly not match between a `\r`
/// and a `\n`.)
///
/// ```
/// use regex::RegexBuilder;
///
/// let re = RegexBuilder::new(r"^")
/// .multi_line(true)
/// .crlf(true)
/// .build()
/// .unwrap();
/// let hay = "\r\n\r\n";
/// let ranges: Vec<_> = re.find_iter(hay).map(|m| m.range()).collect();
/// assert_eq!(ranges, vec![0..0, 2..2, 4..4]);
/// ```
pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder {
self.builder.crlf(yes);
self
}
/// Configures the line terminator to be used by the regex.
///
/// The line terminator is relevant in two ways for a particular regex:
///
/// * When dot-matches-new-line mode is *not* enabled (the default),
/// then `.` will match any character except for the configured line
/// terminator.
/// * When multi-line mode is enabled (not the default), then `^` and
/// `$` will match immediately after and before, respectively, a line
/// terminator.
///
/// In both cases, if CRLF mode is enabled in a particular context,
/// then it takes precedence over any configured line terminator.
///
/// This option cannot be configured from within the pattern.
///
/// The default line terminator is `\n`.
///
/// # Example
///
/// This shows how to treat the NUL byte as a line terminator. This can
/// be a useful heuristic when searching binary data.
///
/// ```
/// use regex::RegexBuilder;
///
/// let re = RegexBuilder::new(r"^foo$")
/// .multi_line(true)
/// .line_terminator(b'\x00')
/// .build()
/// .unwrap();
/// let hay = "\x00foo\x00";
/// assert_eq!(Some(1..4), re.find(hay).map(|m| m.range()));
/// ```
///
/// This example shows that the behavior of `.` is impacted by this
/// setting as well:
///
/// ```
/// use regex::RegexBuilder;
///
/// let re = RegexBuilder::new(r".")
/// .line_terminator(b'\x00')
/// .build()
/// .unwrap();
/// assert!(re.is_match("\n"));
/// assert!(!re.is_match("\x00"));
/// ```
///
/// This shows that building a regex will fail if the byte given
/// is not ASCII and the pattern could result in matching invalid
/// UTF-8. This is because any singular non-ASCII byte is not valid
/// UTF-8, and it is not permitted for a [`Regex`] to match invalid
/// UTF-8. (It is permissible to use a non-ASCII byte when building a
/// [`bytes::Regex`](crate::bytes::Regex).)
///
/// ```
/// use regex::RegexBuilder;
///
/// assert!(RegexBuilder::new(r".").line_terminator(0x80).build().is_err());
/// // Note that using a non-ASCII byte isn't enough on its own to
/// // cause regex compilation to fail. You actually have to make use
/// // of it in the regex in a way that leads to matching invalid
/// // UTF-8. If you don't, then regex compilation will succeed!
/// assert!(RegexBuilder::new(r"a").line_terminator(0x80).build().is_ok());
/// ```
pub fn line_terminator(&mut self, byte: u8) -> &mut RegexBuilder {
self.builder.line_terminator(byte);
self
}
/// This configures swap-greed mode for the entire pattern.
///
/// When swap-greed mode is enabled, patterns like `a+` will become
/// non-greedy and patterns like `a+?` will become greedy. In other
/// words, the meanings of `a+` and `a+?` are switched.
///
/// This setting can also be configured using the inline flag `U` in
/// the pattern.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::RegexBuilder;
///
/// let re = RegexBuilder::new(r"a+")
/// .swap_greed(true)
/// .build()
/// .unwrap();
/// assert_eq!(Some("a"), re.find("aaa").map(|m| m.as_str()));
/// ```
pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
self.builder.swap_greed(yes);
self
}
/// This configures verbose mode for the entire pattern.
///
/// When enabled, whitespace will be treated as insignificant in the
/// pattern and `#` can be used to start a comment until the next new
/// line.
///
/// Normally, in most places in a pattern, whitespace is treated
/// literally. For example ` +` will match one or more ASCII whitespace
/// characters.
///
/// When verbose mode is enabled, `\#` can be used to match a literal
/// `#` and `\ ` can be used to match a literal ASCII whitespace
/// character.
///
/// Verbose mode is useful for permitting regexes to be formatted and
/// broken up more nicely. This may make them more easily readable.
///
/// This setting can also be configured using the inline flag `x` in
/// the pattern.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::RegexBuilder;
///
/// let pat = r"
/// \b
/// (?<first>\p{Uppercase}\w*) # always start with uppercase letter
/// [\s--\n]+ # whitespace should separate names
/// (?: # middle name can be an initial!
/// (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
/// [\s--\n]+
/// )?
/// (?<last>\p{Uppercase}\w*)
/// \b
/// ";
/// let re = RegexBuilder::new(pat)
/// .ignore_whitespace(true)
/// .build()
/// .unwrap();
///
/// let caps = re.captures("Harry Potter").unwrap();
/// assert_eq!("Harry", &caps["first"]);
/// assert_eq!("Potter", &caps["last"]);
///
/// let caps = re.captures("Harry J. Potter").unwrap();
/// assert_eq!("Harry", &caps["first"]);
/// // Since a middle name/initial isn't required for an overall match,
/// // we can't assume that 'initial' or 'middle' will be populated!
/// assert_eq!(Some("J"), caps.name("initial").map(|m| m.as_str()));
/// assert_eq!(None, caps.name("middle").map(|m| m.as_str()));
/// assert_eq!("Potter", &caps["last"]);
///
/// let caps = re.captures("Harry James Potter").unwrap();
/// assert_eq!("Harry", &caps["first"]);
/// // Since a middle name/initial isn't required for an overall match,
/// // we can't assume that 'initial' or 'middle' will be populated!
/// assert_eq!(None, caps.name("initial").map(|m| m.as_str()));
/// assert_eq!(Some("James"), caps.name("middle").map(|m| m.as_str()));
/// assert_eq!("Potter", &caps["last"]);
/// ```
pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
self.builder.ignore_whitespace(yes);
self
}
/// This configures octal mode for the entire pattern.
///
/// Octal syntax is a little-known way of uttering Unicode codepoints
/// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
/// equivalent patterns, where the last example shows octal syntax.
///
/// While supporting octal syntax isn't in and of itself a problem,
/// it does make good error messages harder. That is, in PCRE based
/// regex engines, syntax like `\1` invokes a backreference, which is
/// explicitly unsupported by this library. However, many users expect
/// backreferences to be supported. Therefore, when octal support
/// is disabled, the error message will explicitly mention that
/// backreferences aren't supported.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::RegexBuilder;
///
/// // Normally this pattern would not compile, with an error message
/// // about backreferences not being supported. But with octal mode
/// // enabled, octal escape sequences work.
/// let re = RegexBuilder::new(r"\141")
/// .octal(true)
/// .build()
/// .unwrap();
/// assert!(re.is_match("a"));
/// ```
pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
self.builder.octal(yes);
self
}
/// Sets the approximate size limit, in bytes, of the compiled regex.
///
/// This roughly corresponds to the amount of heap memory, in
/// bytes, occupied by a single regex. If the regex would otherwise
/// approximately exceed this limit, then compiling that regex will
/// fail.
///
/// The main utility of a method like this is to avoid compiling
/// regexes that use an unexpected amount of resources, such as
/// time and memory. Even if the memory usage of a large regex is
/// acceptable, its search time may not be. Namely, worst case time
/// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
/// `n ~ len(haystack)`. That is, search time depends, in part, on the
/// size of the compiled regex. This means that putting a limit on the
/// size of the regex limits how much a regex can impact search time.
///
/// For more information about regex size limits, see the section on
/// [untrusted inputs](crate#untrusted-input) in the top-level crate
/// documentation.
///
/// The default for this is some reasonable number that permits most
/// patterns to compile successfully.
///
/// # Example
///
/// ```
/// use regex::RegexBuilder;
///
/// // It may surprise you how big some seemingly small patterns can
/// // be! Since \w is Unicode aware, this generates a regex that can
/// // match approximately 140,000 distinct codepoints.
/// assert!(RegexBuilder::new(r"\w").size_limit(45_000).build().is_err());
/// ```
pub fn size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
self.builder.size_limit(bytes);
self
}
/// Set the approximate capacity, in bytes, of the cache of transitions
/// used by the lazy DFA.
///
/// While the lazy DFA isn't always used, it tends to be the most
/// commonly used regex engine in default configurations. It tends to
/// adopt the performance profile of a fully built DFA, but without the
/// downside of taking worst case exponential time to build.
///
/// The downside is that it needs to keep a cache of transitions and
/// states that are built while running a search, and this cache
/// can fill up. When it fills up, the cache will reset itself. Any
/// previously generated states and transitions will then need to be
/// re-generated. If this happens too many times, then this library
/// will bail out of using the lazy DFA and switch to a different regex
/// engine.
///
/// If your regex provokes this particular downside of the lazy DFA,
/// then it may be beneficial to increase its cache capacity. This will
/// potentially reduce the frequency of cache resetting (ideally to
/// `0`). While it won't fix all potential performance problems with
/// the lazy DFA, increasing the cache capacity does fix some.
///
/// There is no easy way to determine, a priori, whether increasing
/// this cache capacity will help. In general, the larger your regex,
/// the more cache it's likely to use. But that isn't an ironclad rule.
/// For example, a regex like `[01]*1[01]{N}` would normally produce a
/// fully built DFA that is exponential in size with respect to `N`.
/// The lazy DFA will prevent exponential space blow-up, but its cache
/// is likely to fill up, even when it's large and even for smallish
/// values of `N`.
///
/// If you aren't sure whether this helps or not, it is sensible to
/// set this to some arbitrarily large number in testing, such as
/// `usize::MAX`. Namely, this represents the amount of capacity that
/// *may* be used. It's probably not a good idea to use `usize::MAX` in
/// production though, since it implies there are no controls on heap
/// memory used by this library during a search. In effect, set it to
/// whatever you're willing to allocate for a single regex search.
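///
/// # Example
///
/// A minimal sketch, assuming the default cache is too small for your
/// workload; the exact capacity here is arbitrary.
///
/// ```
/// use regex::RegexBuilder;
///
/// // Give the lazy DFA a 10 MiB transition cache.
/// let re = RegexBuilder::new(r"\w{5}")
///     .dfa_size_limit(10 * (1 << 20))
///     .build()
///     .unwrap();
/// assert!(re.is_match("hello"));
/// ```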
pub fn dfa_size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
self.builder.dfa_size_limit(bytes);
self
}
/// Set the nesting limit for this parser.
///
/// The nesting limit controls how deep the abstract syntax tree is
/// allowed to be. If the AST exceeds the given limit (e.g., with too
/// many nested groups), then an error is returned by the parser.
///
/// The purpose of this limit is to act as a heuristic to prevent stack
/// overflow for consumers that do structural induction on an AST using
/// explicit recursion. While this crate never does this (instead using
/// constant stack space and moving the call stack to the heap), other
/// crates may.
///
/// This limit is not checked until the entire AST is parsed.
/// Therefore, if callers want to put a limit on the amount of heap
/// space used, then they should impose a limit on the length, in
/// bytes, of the concrete pattern string. In particular, this is
/// viable since this parser implementation will limit itself to heap
/// space proportional to the length of the pattern string. See also
/// the [untrusted inputs](crate#untrusted-input) section in the
/// top-level crate documentation for more information about this.
///
/// Note that a nest limit of `0` will return a nest limit error for
/// most patterns but not all. For example, a nest limit of `0` permits
/// `a` but not `ab`, since `ab` requires an explicit concatenation,
/// which results in a nest depth of `1`. In general, a nest limit is
/// not something that manifests in an obvious way in the concrete
/// syntax, therefore, it should not be used in a granular way.
///
/// # Example
///
/// ```
/// use regex::RegexBuilder;
///
/// assert!(RegexBuilder::new(r"a").nest_limit(0).build().is_ok());
/// assert!(RegexBuilder::new(r"ab").nest_limit(0).build().is_err());
/// ```
pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
self.builder.nest_limit(limit);
self
}
}
/// A configurable builder for a [`RegexSet`].
///
/// This builder can be used to programmatically set flags such as
/// `i` (case insensitive) and `x` (for verbose mode). This builder
/// can also be used to configure things like the line terminator
/// and a size limit on the compiled regular expression.
#[derive(Clone, Debug)]
pub struct RegexSetBuilder {
builder: Builder,
}
impl RegexSetBuilder {
/// Create a new builder with a default configuration for the given
/// patterns.
///
/// If the patterns are invalid or exceed the configured size limits,
/// then an error will be returned when [`RegexSetBuilder::build`] is
/// called.
pub fn new<I, S>(patterns: I) -> RegexSetBuilder
where
I: IntoIterator<Item = S>,
S: AsRef<str>,
{
RegexSetBuilder { builder: Builder::new(patterns) }
}
/// Compiles the patterns given to `RegexSetBuilder::new` with the
/// configuration set on this builder.
///
/// If the patterns aren't valid regexes or if a configured size limit
/// was exceeded, then an error is returned.
pub fn build(&self) -> Result<RegexSet, Error> {
self.builder.build_many_string()
}
/// This configures Unicode mode for all of the patterns.
///
/// Enabling Unicode mode does a number of things:
///
/// * Most fundamentally, it causes the fundamental atom of matching
/// to be a single codepoint. When Unicode mode is disabled, it's a
/// single byte. For example, when Unicode mode is enabled, `.` will
/// match `💩` once, whereas it will match 4 times when Unicode mode
/// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
/// * Case insensitive matching uses Unicode simple case folding rules.
/// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
/// available.
/// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
/// `\d`.
/// * The word boundary assertions, `\b` and `\B`, use the Unicode
/// definition of a word character.
///
/// Note that if Unicode mode is disabled, then the regex will fail to
/// compile if it could match invalid UTF-8. For example, when Unicode
/// mode is disabled, `.` matches any byte (except for `\n`), so it
/// can match invalid UTF-8 and thus building a regex
/// from it will fail. Another example is `\w` and `\W`. Since `\w` can
/// only match ASCII bytes when Unicode mode is disabled, it's allowed.
/// But `\W` can match more than ASCII bytes, including invalid UTF-8,
/// and so it is not allowed. This restriction can be lifted only by
/// using a [`bytes::RegexSet`](crate::bytes::RegexSet).
///
/// For more details on the Unicode support in this crate, see the
/// [Unicode section](crate#unicode) in this crate's top-level
/// documentation.
///
/// The default for this is `true`.
///
/// # Example
///
/// ```
/// use regex::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"\w"])
/// .unicode(false)
/// .build()
/// .unwrap();
/// // Normally greek letters would be included in \w, but since
/// // Unicode mode is disabled, it only matches ASCII letters.
/// assert!(!re.is_match("δ"));
///
/// let re = RegexSetBuilder::new([r"s"])
/// .case_insensitive(true)
/// .unicode(false)
/// .build()
/// .unwrap();
/// // Normally 'ſ' is included when searching for 's' case
/// // insensitively due to Unicode's simple case folding rules. But
/// // when Unicode mode is disabled, only ASCII case insensitive rules
/// // are used.
/// assert!(!re.is_match("ſ"));
/// ```
pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
self.builder.unicode(yes);
self
}
/// This configures whether to enable case insensitive matching for all
/// of the patterns.
///
/// This setting can also be configured using the inline flag `i`
/// in the pattern. For example, `(?i:foo)` matches `foo` case
/// insensitively while `(?-i:foo)` matches `foo` case sensitively.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"foo(?-i:bar)quux"])
/// .case_insensitive(true)
/// .build()
/// .unwrap();
/// assert!(re.is_match("FoObarQuUx"));
/// // Even though case insensitive matching is enabled in the builder,
/// // it can be locally disabled within the pattern. In this case,
/// // `bar` is matched case sensitively.
/// assert!(!re.is_match("fooBARquux"));
/// ```
pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder {
self.builder.case_insensitive(yes);
self
}
/// This configures multi-line mode for all of the patterns.
///
/// Enabling multi-line mode changes the behavior of the `^` and `$`
/// anchor assertions. Instead of only matching at the beginning and
/// end of a haystack, respectively, multi-line mode causes them to
/// match at the beginning and end of a line *in addition* to the
/// beginning and end of a haystack. More precisely, `^` will match at
/// the position immediately following a `\n` and `$` will match at the
/// position immediately preceding a `\n`.
///
/// The behavior of this option can be impacted by other settings too:
///
/// * The [`RegexSetBuilder::line_terminator`] option changes `\n`
/// above to any ASCII byte.
/// * The [`RegexSetBuilder::crlf`] option changes the line terminator
/// to be either `\r` or `\n`, but never at the position between a `\r`
/// and `\n`.
///
/// This setting can also be configured using the inline flag `m` in
/// the pattern.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"^foo$"])
/// .multi_line(true)
/// .build()
/// .unwrap();
/// assert!(re.is_match("\nfoo\n"));
/// ```
pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
self.builder.multi_line(yes);
self
}
/// This configures dot-matches-new-line mode for all of the patterns.
///
/// Perhaps surprisingly, the default behavior for `.` is not to match
/// any character, but rather, to match any character except for the
/// line terminator (which is `\n` by default). When this mode is
/// enabled, the behavior changes such that `.` truly matches any
/// character.
///
/// This setting can also be configured using the inline flag `s` in
/// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
/// regexes.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"foo.bar"])
/// .dot_matches_new_line(true)
/// .build()
/// .unwrap();
/// let hay = "foo\nbar";
/// assert!(re.is_match(hay));
/// ```
pub fn dot_matches_new_line(
&mut self,
yes: bool,
) -> &mut RegexSetBuilder {
self.builder.dot_matches_new_line(yes);
self
}
/// This configures CRLF mode for all of the patterns.
///
/// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
/// short) and `\n` ("line feed" or LF for short) are treated as line
/// terminators. This results in the following:
///
/// * Unless dot-matches-new-line mode is enabled, `.` will now match
/// any character except for `\n` and `\r`.
/// * When multi-line mode is enabled, `^` will match immediately
/// following a `\n` or a `\r`. Similarly, `$` will match immediately
/// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
/// between `\r` and `\n`.
///
/// This setting can also be configured using the inline flag `R` in
/// the pattern.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"^foo$"])
/// .multi_line(true)
/// .crlf(true)
/// .build()
/// .unwrap();
/// let hay = "\r\nfoo\r\n";
/// // If CRLF mode weren't enabled here, then '$' wouldn't match
/// // immediately after 'foo', and thus no match would be found.
/// assert!(re.is_match(hay));
/// ```
///
/// This example demonstrates that `^` will never match at a position
/// between `\r` and `\n`. (`$` will similarly not match between a `\r`
/// and a `\n`.)
///
/// ```
/// use regex::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"^\n"])
/// .multi_line(true)
/// .crlf(true)
/// .build()
/// .unwrap();
/// assert!(!re.is_match("\r\n"));
/// ```
pub fn crlf(&mut self, yes: bool) -> &mut RegexSetBuilder {
self.builder.crlf(yes);
self
}
/// Configures the line terminator to be used by the regex.
///
/// The line terminator is relevant in two ways for a particular regex:
///
/// * When dot-matches-new-line mode is *not* enabled (the default),
/// then `.` will match any character except for the configured line
/// terminator.
/// * When multi-line mode is enabled (not the default), then `^` and
/// `$` will match immediately after and before, respectively, a line
/// terminator.
///
/// In both cases, if CRLF mode is enabled in a particular context,
/// then it takes precedence over any configured line terminator.
///
/// This option cannot be configured from within the pattern.
///
/// The default line terminator is `\n`.
///
/// # Example
///
/// This shows how to treat the NUL byte as a line terminator. This can
/// be a useful heuristic when searching binary data.
///
/// ```
/// use regex::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"^foo$"])
/// .multi_line(true)
/// .line_terminator(b'\x00')
/// .build()
/// .unwrap();
/// let hay = "\x00foo\x00";
/// assert!(re.is_match(hay));
/// ```
///
/// This example shows that the behavior of `.` is impacted by this
/// setting as well:
///
/// ```
/// use regex::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"."])
/// .line_terminator(b'\x00')
/// .build()
/// .unwrap();
/// assert!(re.is_match("\n"));
/// assert!(!re.is_match("\x00"));
/// ```
///
/// This shows that building a regex will fail if the byte given
/// is not ASCII and the pattern could result in matching invalid
/// UTF-8. This is because any singular non-ASCII byte is not valid
/// UTF-8, and it is not permitted for a [`RegexSet`] to match invalid
/// UTF-8. (It is permissible to use a non-ASCII byte when building a
/// [`bytes::RegexSet`](crate::bytes::RegexSet).)
///
/// ```
/// use regex::RegexSetBuilder;
///
/// assert!(
/// RegexSetBuilder::new([r"."])
/// .line_terminator(0x80)
/// .build()
/// .is_err()
/// );
/// // Note that using a non-ASCII byte isn't enough on its own to
/// // cause regex compilation to fail. You actually have to make use
/// // of it in the regex in a way that leads to matching invalid
/// // UTF-8. If you don't, then regex compilation will succeed!
/// assert!(
/// RegexSetBuilder::new([r"a"])
/// .line_terminator(0x80)
/// .build()
/// .is_ok()
/// );
/// ```
pub fn line_terminator(&mut self, byte: u8) -> &mut RegexSetBuilder {
self.builder.line_terminator(byte);
self
}
/// This configures swap-greed mode for all of the patterns.
///
/// When swap-greed mode is enabled, patterns like `a+` will become
/// non-greedy and patterns like `a+?` will become greedy. In other
/// words, the meanings of `a+` and `a+?` are switched.
///
/// This setting can also be configured using the inline flag `U` in
/// the pattern.
///
/// Note that this is generally not useful for a `RegexSet` since a
/// `RegexSet` can only report whether a pattern matches or not. Since
/// greediness never impacts whether a match is found or not (only the
/// offsets of the match), it follows that whether parts of a pattern
/// are greedy or not doesn't matter for a `RegexSet`.
///
/// The default for this is `false`.
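///
/// # Example
///
/// A small sketch showing that swapping greediness has no effect on
/// whether a `RegexSet` reports a match:
///
/// ```
/// use regex::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"a+"])
///     .swap_greed(true)
///     .build()
///     .unwrap();
/// assert!(re.is_match("aaa"));
/// ```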
pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder {
self.builder.swap_greed(yes);
self
}
/// This configures verbose mode for all of the patterns.
///
/// When enabled, whitespace will be treated as insignificant in the
/// pattern and `#` can be used to start a comment until the next new
/// line.
///
/// Normally, in most places in a pattern, whitespace is treated
/// literally. For example ` +` will match one or more ASCII whitespace
/// characters.
///
/// When verbose mode is enabled, `\#` can be used to match a literal
/// `#` and `\ ` can be used to match a literal ASCII whitespace
/// character.
///
/// Verbose mode is useful for permitting regexes to be formatted and
/// broken up more nicely. This may make them more easily readable.
///
/// This setting can also be configured using the inline flag `x` in
/// the pattern.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::RegexSetBuilder;
///
/// let pat = r"
/// \b
/// (?<first>\p{Uppercase}\w*) # always start with uppercase letter
/// [\s--\n]+ # whitespace should separate names
/// (?: # middle name can be an initial!
/// (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
/// [\s--\n]+
/// )?
/// (?<last>\p{Uppercase}\w*)
/// \b
/// ";
/// let re = RegexSetBuilder::new([pat])
/// .ignore_whitespace(true)
/// .build()
/// .unwrap();
/// assert!(re.is_match("Harry Potter"));
/// assert!(re.is_match("Harry J. Potter"));
/// assert!(re.is_match("Harry James Potter"));
/// assert!(!re.is_match("harry J. Potter"));
/// ```
pub fn ignore_whitespace(
&mut self,
yes: bool,
) -> &mut RegexSetBuilder {
self.builder.ignore_whitespace(yes);
self
}
/// This configures octal mode for all of the patterns.
///
/// Octal syntax is a little-known way of uttering Unicode codepoints
/// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
/// equivalent patterns, where the last example shows octal syntax.
///
/// While supporting octal syntax isn't in and of itself a problem,
/// it does make good error messages harder. That is, in PCRE based
/// regex engines, syntax like `\1` invokes a backreference, which is
/// explicitly unsupported by this library. However, many users expect
/// backreferences to be supported. Therefore, when octal support
/// is disabled, the error message will explicitly mention that
/// backreferences aren't supported.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::RegexSetBuilder;
///
/// // Normally this pattern would not compile, with an error message
/// // about backreferences not being supported. But with octal mode
/// // enabled, octal escape sequences work.
/// let re = RegexSetBuilder::new([r"\141"])
/// .octal(true)
/// .build()
/// .unwrap();
/// assert!(re.is_match("a"));
/// ```
pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
self.builder.octal(yes);
self
}
/// Sets the approximate size limit, in bytes, of the compiled regex.
///
/// This roughly corresponds to the amount of heap memory, in
/// bytes, occupied by a single regex. If the regex would otherwise
/// approximately exceed this limit, then compiling that regex will
/// fail.
///
/// The main utility of a method like this is to avoid compiling
/// regexes that use an unexpected amount of resources, such as
/// time and memory. Even if the memory usage of a large regex is
/// acceptable, its search time may not be. Namely, worst case time
/// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
/// `n ~ len(haystack)`. That is, search time depends, in part, on the
/// size of the compiled regex. This means that putting a limit on the
/// size of the regex limits how much a regex can impact search time.
///
/// For more information about regex size limits, see the section on
/// [untrusted inputs](crate#untrusted-input) in the top-level crate
/// documentation.
///
/// The default for this is some reasonable number that permits most
/// patterns to compile successfully.
///
/// # Example
///
/// ```
/// use regex::RegexSetBuilder;
///
/// // It may surprise you how big some seemingly small patterns can
/// // be! Since \w is Unicode aware, this generates a regex that can
/// // match approximately 140,000 distinct codepoints.
/// assert!(
/// RegexSetBuilder::new([r"\w"])
/// .size_limit(45_000)
/// .build()
/// .is_err()
/// );
/// ```
pub fn size_limit(&mut self, bytes: usize) -> &mut RegexSetBuilder {
self.builder.size_limit(bytes);
self
}
/// Set the approximate capacity, in bytes, of the cache of transitions
/// used by the lazy DFA.
///
/// While the lazy DFA isn't always used, it tends to be the most
/// commonly used regex engine in default configurations. It tends to
/// adopt the performance profile of a fully built DFA, but without the
/// downside of taking worst case exponential time to build.
///
/// The downside is that it needs to keep a cache of transitions and
/// states that are built while running a search, and this cache
/// can fill up. When it fills up, the cache will reset itself. Any
/// previously generated states and transitions will then need to be
/// re-generated. If this happens too many times, then this library
/// will bail out of using the lazy DFA and switch to a different regex
/// engine.
///
/// If your regex provokes this particular downside of the lazy DFA,
/// then it may be beneficial to increase its cache capacity. This will
/// potentially reduce the frequency of cache resetting (ideally to
/// `0`). While it won't fix all potential performance problems with
/// the lazy DFA, increasing the cache capacity does fix some.
///
/// There is no easy way to determine, a priori, whether increasing
/// this cache capacity will help. In general, the larger your regex,
/// the more cache it's likely to use. But that isn't an ironclad rule.
/// For example, a regex like `[01]*1[01]{N}` would normally produce a
/// fully built DFA that is exponential in size with respect to `N`.
/// The lazy DFA will prevent exponential space blow-up, but its cache
/// is likely to fill up, even when it's large and even for smallish
/// values of `N`.
///
/// If you aren't sure whether this helps or not, it is sensible to
/// set this to some arbitrarily large number in testing, such as
/// `usize::MAX`. Namely, this represents the amount of capacity that
/// *may* be used. It's probably not a good idea to use `usize::MAX` in
/// production though, since it implies there are no controls on heap
/// memory used by this library during a search. In effect, set it to
/// whatever you're willing to allocate for a single regex search.
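///
/// # Example
///
/// A minimal sketch; the exact capacity here is arbitrary.
///
/// ```
/// use regex::RegexSetBuilder;
///
/// // Give the lazy DFA a 10 MiB transition cache.
/// let re = RegexSetBuilder::new([r"\w{5}"])
///     .dfa_size_limit(10 * (1 << 20))
///     .build()
///     .unwrap();
/// assert!(re.is_match("hello"));
/// ```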
pub fn dfa_size_limit(
&mut self,
bytes: usize,
) -> &mut RegexSetBuilder {
self.builder.dfa_size_limit(bytes);
self
}
/// Set the nesting limit for this parser.
///
/// The nesting limit controls how deep the abstract syntax tree is
/// allowed to be. If the AST exceeds the given limit (e.g., with too
/// many nested groups), then an error is returned by the parser.
///
/// The purpose of this limit is to act as a heuristic to prevent stack
/// overflow for consumers that do structural induction on an AST using
/// explicit recursion. While this crate never does this (instead using
/// constant stack space and moving the call stack to the heap), other
/// crates may.
///
/// This limit is not checked until the entire AST is parsed.
/// Therefore, if callers want to put a limit on the amount of heap
/// space used, then they should impose a limit on the length, in
/// bytes, of the concrete pattern string. In particular, this is
/// viable since this parser implementation will limit itself to heap
/// space proportional to the length of the pattern string. See also
/// the [untrusted inputs](crate#untrusted-input) section in the
/// top-level crate documentation for more information about this.
///
/// Note that a nest limit of `0` will return a nest limit error for
/// most patterns but not all. For example, a nest limit of `0` permits
/// `a` but not `ab`, since `ab` requires an explicit concatenation,
/// which results in a nest depth of `1`. In general, a nest limit is
/// not something that manifests in an obvious way in the concrete
/// syntax, therefore, it should not be used in a granular way.
///
/// # Example
///
/// ```
/// use regex::RegexSetBuilder;
///
/// assert!(RegexSetBuilder::new([r"a"]).nest_limit(0).build().is_ok());
/// assert!(RegexSetBuilder::new([r"ab"]).nest_limit(0).build().is_err());
/// ```
pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder {
self.builder.nest_limit(limit);
self
}
}
}
pub(crate) mod bytes {
use crate::{
bytes::{Regex, RegexSet},
error::Error,
};
use super::Builder;
/// A configurable builder for a [`Regex`].
///
/// This builder can be used to programmatically set flags such as `i`
/// (case insensitive) and `x` (for verbose mode). This builder can also be
/// used to configure things like the line terminator and a size limit on
/// the compiled regular expression.
#[derive(Clone, Debug)]
pub struct RegexBuilder {
builder: Builder,
}
impl RegexBuilder {
/// Create a new builder with a default configuration for the given
/// pattern.
///
/// If the pattern is invalid or exceeds the configured size limits,
/// then an error will be returned when [`RegexBuilder::build`] is
/// called.
pub fn new(pattern: &str) -> RegexBuilder {
RegexBuilder { builder: Builder::new([pattern]) }
}
/// Compiles the pattern given to `RegexBuilder::new` with the
/// configuration set on this builder.
///
/// If the pattern isn't a valid regex or if a configured size limit
/// was exceeded, then an error is returned.
pub fn build(&self) -> Result<Regex, Error> {
self.builder.build_one_bytes()
}
/// This configures Unicode mode for the entire pattern.
///
/// Enabling Unicode mode does a number of things:
///
/// * Most fundamentally, it causes the fundamental atom of matching
/// to be a single codepoint. When Unicode mode is disabled, it's a
/// single byte. For example, when Unicode mode is enabled, `.` will
/// match `💩` once, whereas it will match 4 times when Unicode mode
/// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
/// * Case insensitive matching uses Unicode simple case folding rules.
/// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
/// available.
/// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
/// `\d`.
/// * The word boundary assertions, `\b` and `\B`, use the Unicode
/// definition of a word character.
///
/// Note that unlike the top-level `Regex` for searching `&str`, it
/// is permitted to disable Unicode mode even if the resulting pattern
/// could match invalid UTF-8. For example, `(?-u:.)` is not a valid
/// pattern for a top-level `Regex`, but is valid for a `bytes::Regex`.
///
/// For more details on the Unicode support in this crate, see the
/// [Unicode section](crate#unicode) in this crate's top-level
/// documentation.
///
/// The default for this is `true`.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexBuilder;
///
/// let re = RegexBuilder::new(r"\w")
/// .unicode(false)
/// .build()
/// .unwrap();
/// // Normally greek letters would be included in \w, but since
/// // Unicode mode is disabled, it only matches ASCII letters.
/// assert!(!re.is_match("δ".as_bytes()));
///
/// let re = RegexBuilder::new(r"s")
/// .case_insensitive(true)
/// .unicode(false)
/// .build()
/// .unwrap();
/// // Normally 'ſ' is included when searching for 's' case
/// // insensitively due to Unicode's simple case folding rules. But
/// // when Unicode mode is disabled, only ASCII case insensitive rules
/// // are used.
/// assert!(!re.is_match("ſ".as_bytes()));
/// ```
///
/// Since this builder is for constructing a [`bytes::Regex`](Regex),
/// one can disable Unicode mode even if it would match invalid UTF-8:
///
/// ```
/// use regex::bytes::RegexBuilder;
///
/// let re = RegexBuilder::new(r".")
/// .unicode(false)
/// .build()
/// .unwrap();
/// // Normally `.` only matches valid UTF-8, but with Unicode mode
/// // disabled, it matches any single byte, including invalid UTF-8.
/// assert!(re.is_match(b"\xFF"));
/// ```
pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
self.builder.unicode(yes);
self
}
/// This configures whether to enable case insensitive matching for the
/// entire pattern.
///
/// This setting can also be configured using the inline flag `i`
/// in the pattern. For example, `(?i:foo)` matches `foo` case
/// insensitively while `(?-i:foo)` matches `foo` case sensitively.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexBuilder;
///
/// let re = RegexBuilder::new(r"foo(?-i:bar)quux")
/// .case_insensitive(true)
/// .build()
/// .unwrap();
/// assert!(re.is_match(b"FoObarQuUx"));
/// // Even though case insensitive matching is enabled in the builder,
/// // it can be locally disabled within the pattern. In this case,
/// // `bar` is matched case sensitively.
/// assert!(!re.is_match(b"fooBARquux"));
/// ```
pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
self.builder.case_insensitive(yes);
self
}
/// This configures multi-line mode for the entire pattern.
///
/// Enabling multi-line mode changes the behavior of the `^` and `$`
/// anchor assertions. Instead of only matching at the beginning and
/// end of a haystack, respectively, multi-line mode causes them to
/// match at the beginning and end of a line *in addition* to the
/// beginning and end of a haystack. More precisely, `^` will match at
/// the position immediately following a `\n` and `$` will match at the
/// position immediately preceding a `\n`.
///
/// The behavior of this option can be impacted by other settings too:
///
/// * The [`RegexBuilder::line_terminator`] option changes `\n` above
/// to any ASCII byte.
/// * The [`RegexBuilder::crlf`] option changes the line terminator to
/// be either `\r` or `\n`, but never at the position between a `\r`
/// and `\n`.
///
/// This setting can also be configured using the inline flag `m` in
/// the pattern.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexBuilder;
///
/// let re = RegexBuilder::new(r"^foo$")
/// .multi_line(true)
/// .build()
/// .unwrap();
/// assert_eq!(Some(1..4), re.find(b"\nfoo\n").map(|m| m.range()));
/// ```
pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
self.builder.multi_line(yes);
self
}
/// This configures dot-matches-new-line mode for the entire pattern.
///
/// Perhaps surprisingly, the default behavior for `.` is not to match
/// any character, but rather, to match any character except for the
/// line terminator (which is `\n` by default). When this mode is
/// enabled, the behavior changes such that `.` truly matches any
/// character.
///
/// This setting can also be configured using the inline flag `s` in
/// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
/// regexes.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexBuilder;
///
/// let re = RegexBuilder::new(r"foo.bar")
/// .dot_matches_new_line(true)
/// .build()
/// .unwrap();
/// let hay = b"foo\nbar";
/// assert_eq!(Some(&b"foo\nbar"[..]), re.find(hay).map(|m| m.as_bytes()));
/// ```
pub fn dot_matches_new_line(
&mut self,
yes: bool,
) -> &mut RegexBuilder {
self.builder.dot_matches_new_line(yes);
self
}
/// This configures CRLF mode for the entire pattern.
///
/// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
/// short) and `\n` ("line feed" or LF for short) are treated as line
/// terminators. This results in the following:
///
/// * Unless dot-matches-new-line mode is enabled, `.` will now match
/// any character except for `\n` and `\r`.
/// * When multi-line mode is enabled, `^` will match immediately
/// following a `\n` or a `\r`. Similarly, `$` will match immediately
/// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
/// between `\r` and `\n`.
///
/// This setting can also be configured using the inline flag `R` in
/// the pattern.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexBuilder;
///
/// let re = RegexBuilder::new(r"^foo$")
/// .multi_line(true)
/// .crlf(true)
/// .build()
/// .unwrap();
/// let hay = b"\r\nfoo\r\n";
/// // If CRLF mode weren't enabled here, then '$' wouldn't match
/// // immediately after 'foo', and thus no match would be found.
/// assert_eq!(Some(&b"foo"[..]), re.find(hay).map(|m| m.as_bytes()));
/// ```
///
/// This example demonstrates that `^` will never match at a position
/// between `\r` and `\n`. (`$` will similarly not match between a `\r`
/// and a `\n`.)
///
/// ```
/// use regex::bytes::RegexBuilder;
///
/// let re = RegexBuilder::new(r"^")
/// .multi_line(true)
/// .crlf(true)
/// .build()
/// .unwrap();
/// let hay = b"\r\n\r\n";
/// let ranges: Vec<_> = re.find_iter(hay).map(|m| m.range()).collect();
/// assert_eq!(ranges, vec![0..0, 2..2, 4..4]);
/// ```
pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder {
self.builder.crlf(yes);
self
}
/// Configures the line terminator to be used by the regex.
///
/// The line terminator is relevant in two ways for a particular regex:
///
/// * When dot-matches-new-line mode is *not* enabled (the default),
/// then `.` will match any character except for the configured line
/// terminator.
/// * When multi-line mode is enabled (not the default), then `^` and
/// `$` will match immediately after and before, respectively, a line
/// terminator.
///
/// In both cases, if CRLF mode is enabled in a particular context,
/// then it takes precedence over any configured line terminator.
///
/// This option cannot be configured from within the pattern.
///
/// The default line terminator is `\n`.
///
/// # Example
///
/// This shows how to treat the NUL byte as a line terminator. This can
/// be a useful heuristic when searching binary data.
///
/// ```
/// use regex::bytes::RegexBuilder;
///
/// let re = RegexBuilder::new(r"^foo$")
/// .multi_line(true)
/// .line_terminator(b'\x00')
/// .build()
/// .unwrap();
/// let hay = b"\x00foo\x00";
/// assert_eq!(Some(1..4), re.find(hay).map(|m| m.range()));
/// ```
///
/// This example shows that the behavior of `.` is impacted by this
/// setting as well:
///
/// ```
/// use regex::bytes::RegexBuilder;
///
/// let re = RegexBuilder::new(r".")
/// .line_terminator(b'\x00')
/// .build()
/// .unwrap();
/// assert!(re.is_match(b"\n"));
/// assert!(!re.is_match(b"\x00"));
/// ```
///
/// This shows that building a regex will work even when the byte
/// given is not ASCII. This is unlike the top-level `Regex` API where
/// matching invalid UTF-8 is not allowed.
///
/// Note though that you must disable Unicode mode. This is required
/// because Unicode mode requires matching one codepoint at a time,
/// and there is no way to match a non-ASCII byte as if it were a
/// codepoint.
///
/// ```
/// use regex::bytes::RegexBuilder;
///
/// assert!(
/// RegexBuilder::new(r".")
/// .unicode(false)
/// .line_terminator(0x80)
/// .build()
/// .is_ok(),
/// );
/// ```
pub fn line_terminator(&mut self, byte: u8) -> &mut RegexBuilder {
self.builder.line_terminator(byte);
self
}
/// This configures swap-greed mode for the entire pattern.
///
/// When swap-greed mode is enabled, patterns like `a+` will become
/// non-greedy and patterns like `a+?` will become greedy. In other
/// words, the meanings of `a+` and `a+?` are switched.
///
/// This setting can also be configured using the inline flag `U` in
/// the pattern.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexBuilder;
///
/// let re = RegexBuilder::new(r"a+")
/// .swap_greed(true)
/// .build()
/// .unwrap();
/// assert_eq!(Some(&b"a"[..]), re.find(b"aaa").map(|m| m.as_bytes()));
/// ```
pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
self.builder.swap_greed(yes);
self
}
/// This configures verbose mode for the entire pattern.
///
/// When enabled, whitespace will be treated as insignificant in the
/// pattern and `#` can be used to start a comment until the next
/// newline.
///
/// Normally, in most places in a pattern, whitespace is treated
/// literally. For example ` +` will match one or more ASCII whitespace
/// characters.
///
/// When verbose mode is enabled, `\#` can be used to match a literal
/// `#` and `\ ` can be used to match a literal ASCII whitespace
/// character.
///
/// Verbose mode is useful for permitting regexes to be formatted and
/// broken up more nicely. This may make them more easily readable.
///
/// This setting can also be configured using the inline flag `x` in
/// the pattern.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexBuilder;
///
/// let pat = r"
/// \b
/// (?<first>\p{Uppercase}\w*) # always start with uppercase letter
/// [\s--\n]+ # whitespace should separate names
/// (?: # middle name can be an initial!
/// (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
/// [\s--\n]+
/// )?
/// (?<last>\p{Uppercase}\w*)
/// \b
/// ";
/// let re = RegexBuilder::new(pat)
/// .ignore_whitespace(true)
/// .build()
/// .unwrap();
///
/// let caps = re.captures(b"Harry Potter").unwrap();
/// assert_eq!(&b"Harry"[..], &caps["first"]);
/// assert_eq!(&b"Potter"[..], &caps["last"]);
///
/// let caps = re.captures(b"Harry J. Potter").unwrap();
/// assert_eq!(&b"Harry"[..], &caps["first"]);
/// // Since a middle name/initial isn't required for an overall match,
/// // we can't assume that 'initial' or 'middle' will be populated!
/// assert_eq!(
/// Some(&b"J"[..]),
/// caps.name("initial").map(|m| m.as_bytes()),
/// );
/// assert_eq!(None, caps.name("middle").map(|m| m.as_bytes()));
/// assert_eq!(&b"Potter"[..], &caps["last"]);
///
/// let caps = re.captures(b"Harry James Potter").unwrap();
/// assert_eq!(&b"Harry"[..], &caps["first"]);
/// // Since a middle name/initial isn't required for an overall match,
/// // we can't assume that 'initial' or 'middle' will be populated!
/// assert_eq!(None, caps.name("initial").map(|m| m.as_bytes()));
/// assert_eq!(
/// Some(&b"James"[..]),
/// caps.name("middle").map(|m| m.as_bytes()),
/// );
/// assert_eq!(&b"Potter"[..], &caps["last"]);
/// ```
pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
self.builder.ignore_whitespace(yes);
self
}
/// This configures octal mode for the entire pattern.
///
/// Octal syntax is a little-known way of uttering Unicode codepoints
/// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
/// equivalent patterns, where the last example shows octal syntax.
///
/// While supporting octal syntax isn't in and of itself a problem,
/// it does make good error messages harder. That is, in PCRE based
/// regex engines, syntax like `\1` invokes a backreference, which is
/// explicitly unsupported by this library. However, many users expect
/// backreferences to be supported. Therefore, when octal support
/// is disabled, the error message will explicitly mention that
/// backreferences aren't supported.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexBuilder;
///
/// // Normally this pattern would not compile, with an error message
/// // about backreferences not being supported. But with octal mode
/// // enabled, octal escape sequences work.
/// let re = RegexBuilder::new(r"\141")
/// .octal(true)
/// .build()
/// .unwrap();
/// assert!(re.is_match(b"a"));
/// ```
pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
self.builder.octal(yes);
self
}
/// Sets the approximate size limit, in bytes, of the compiled regex.
///
/// This roughly corresponds to the amount of heap memory, in
/// bytes, occupied by a single regex. If the regex would otherwise
/// approximately exceed this limit, then compiling that regex will
/// fail.
///
/// The main utility of a method like this is to avoid compiling
/// regexes that use an unexpected amount of resources, such as
/// time and memory. Even if the memory usage of a large regex is
/// acceptable, its search time may not be. Namely, worst case time
/// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
/// `n ~ len(haystack)`. That is, search time depends, in part, on the
/// size of the compiled regex. This means that putting a limit on the
/// size of the regex limits how much a regex can impact search time.
///
/// For more information about regex size limits, see the section on
/// [untrusted inputs](crate#untrusted-input) in the top-level crate
/// documentation.
///
/// The default for this is some reasonable number that permits most
/// patterns to compile successfully.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexBuilder;
///
/// // It may surprise you how big some seemingly small patterns can
/// // be! Since \w is Unicode aware, this generates a regex that can
/// // match approximately 140,000 distinct codepoints.
/// assert!(RegexBuilder::new(r"\w").size_limit(45_000).build().is_err());
/// ```
pub fn size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
self.builder.size_limit(bytes);
self
}
/// Set the approximate capacity, in bytes, of the cache of transitions
/// used by the lazy DFA.
///
/// While the lazy DFA isn't always used, it tends to be the most
/// commonly used regex engine in default configurations. It tends to
/// adopt the performance profile of a fully built DFA, but without the
/// downside of taking worst case exponential time to build.
///
/// The downside is that it needs to keep a cache of transitions and
/// states that are built while running a search, and this cache
/// can fill up. When it fills up, the cache will reset itself. Any
/// previously generated states and transitions will then need to be
/// re-generated. If this happens too many times, then this library
/// will bail out of using the lazy DFA and switch to a different regex
/// engine.
///
/// If your regex provokes this particular downside of the lazy DFA,
/// then it may be beneficial to increase its cache capacity. This will
/// potentially reduce the frequency of cache resetting (ideally to
/// `0`). While it won't fix all potential performance problems with
/// the lazy DFA, increasing the cache capacity does fix some.
///
/// There is no easy way to determine, a priori, whether increasing
/// this cache capacity will help. In general, the larger your regex,
/// the more cache it's likely to use. But that isn't an ironclad rule.
/// For example, a regex like `[01]*1[01]{N}` would normally produce a
/// fully built DFA that is exponential in size with respect to `N`.
/// The lazy DFA will prevent exponential space blow-up, but its cache
/// is likely to fill up, even when it's large and even for smallish
/// values of `N`.
///
/// If you aren't sure whether this helps or not, it is sensible to
/// set this to some arbitrarily large number in testing, such as
/// `usize::MAX`. Namely, this represents the amount of capacity that
/// *may* be used. It's probably not a good idea to use `usize::MAX` in
/// production though, since it implies there are no controls on heap
/// memory used by this library during a search. In effect, set it to
/// whatever you're willing to allocate for a single regex search.
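///
/// # Example
///
/// A minimal sketch of raising the cache capacity. The `10 * (1 << 20)`
/// (10 MiB) value below is arbitrary and chosen purely for illustration.
///
/// ```
/// use regex::bytes::RegexBuilder;
///
/// // Permit the lazy DFA to use up to 10 MiB for its transition cache.
/// let re = RegexBuilder::new(r"\w{5}")
/// .dfa_size_limit(10 * (1 << 20))
/// .build()
/// .unwrap();
/// assert!(re.is_match(b"hello"));
/// ```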
pub fn dfa_size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
self.builder.dfa_size_limit(bytes);
self
}
/// Set the nesting limit for this parser.
///
/// The nesting limit controls how deep the abstract syntax tree is
/// allowed to be. If the AST exceeds the given limit (e.g., with too
/// many nested groups), then an error is returned by the parser.
///
/// The purpose of this limit is to act as a heuristic to prevent stack
/// overflow for consumers that do structural induction on an AST using
/// explicit recursion. While this crate never does this (instead using
/// constant stack space and moving the call stack to the heap), other
/// crates may.
///
/// This limit is not checked until the entire AST is parsed.
/// Therefore, if callers want to put a limit on the amount of heap
/// space used, then they should impose a limit on the length, in
/// bytes, of the concrete pattern string. In particular, this is
/// viable since this parser implementation will limit itself to heap
/// space proportional to the length of the pattern string. See also
/// the [untrusted inputs](crate#untrusted-input) section in the
/// top-level crate documentation for more information about this.
///
/// Note that a nest limit of `0` will return a nest limit error for
/// most patterns but not all. For example, a nest limit of `0` permits
/// `a` but not `ab`, since `ab` requires an explicit concatenation,
/// which results in a nest depth of `1`. In general, a nest limit is
/// not something that manifests in an obvious way in the concrete
/// syntax; therefore, it should not be used in a granular way.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexBuilder;
///
/// assert!(RegexBuilder::new(r"a").nest_limit(0).build().is_ok());
/// assert!(RegexBuilder::new(r"ab").nest_limit(0).build().is_err());
/// ```
pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
self.builder.nest_limit(limit);
self
}
}
/// A configurable builder for a [`RegexSet`].
///
/// This builder can be used to programmatically set flags such as `i`
/// (case insensitive) and `x` (for verbose mode). This builder can also be
/// used to configure things like the line terminator and a size limit on
/// the compiled regular expression.
#[derive(Clone, Debug)]
pub struct RegexSetBuilder {
builder: Builder,
}
impl RegexSetBuilder {
/// Create a new builder with a default configuration for the given
/// patterns.
///
/// If the patterns are invalid or exceed the configured size limits,
/// then an error will be returned when [`RegexSetBuilder::build`] is
/// called.
pub fn new<I, S>(patterns: I) -> RegexSetBuilder
where
I: IntoIterator<Item = S>,
S: AsRef<str>,
{
RegexSetBuilder { builder: Builder::new(patterns) }
}
/// Compiles the patterns given to `RegexSetBuilder::new` with the
/// configuration set on this builder.
///
/// If the patterns aren't valid regexes or if a configured size limit
/// was exceeded, then an error is returned.
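///
/// # Example
///
/// A minimal sketch of the typical flow, configure then build, using
/// two arbitrary patterns chosen only for illustration:
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// let set = RegexSetBuilder::new([r"foo", r"bar"])
/// .case_insensitive(true)
/// .build()
/// .unwrap();
/// assert!(set.is_match(b"BAR"));
/// ```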
pub fn build(&self) -> Result<RegexSet, Error> {
self.builder.build_many_bytes()
}
/// This configures Unicode mode for all of the patterns.
///
/// Enabling Unicode mode does a number of things:
///
/// * Most fundamentally, it causes the fundamental atom of matching
/// to be a single codepoint. When Unicode mode is disabled, it's a
/// single byte. For example, when Unicode mode is enabled, `.` will
/// match `💩` once, whereas it will match 4 times when Unicode mode
/// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
/// * Case insensitive matching uses Unicode simple case folding rules.
/// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
/// available.
/// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
/// `\d`.
/// * The word boundary assertions, `\b` and `\B`, use the Unicode
/// definition of a word character.
///
/// Note that unlike the top-level `RegexSet` for searching `&str`,
/// it is permitted to disable Unicode mode even if the resulting
/// pattern could match invalid UTF-8. For example, `(?-u:.)` is not
/// a valid pattern for a top-level `RegexSet`, but is valid for a
/// `bytes::RegexSet`.
///
/// For more details on the Unicode support in this crate, see the
/// [Unicode section](crate#unicode) in this crate's top-level
/// documentation.
///
/// The default for this is `true`.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"\w"])
/// .unicode(false)
/// .build()
/// .unwrap();
/// // Normally Greek letters would be included in \w, but since
/// // Unicode mode is disabled, it only matches ASCII letters.
/// assert!(!re.is_match("δ".as_bytes()));
///
/// let re = RegexSetBuilder::new([r"s"])
/// .case_insensitive(true)
/// .unicode(false)
/// .build()
/// .unwrap();
/// // Normally 'ſ' is included when searching for 's' case
/// // insensitively due to Unicode's simple case folding rules. But
/// // when Unicode mode is disabled, only ASCII case insensitive rules
/// // are used.
/// assert!(!re.is_match("ſ".as_bytes()));
/// ```
///
/// Since this builder is for constructing a
/// [`bytes::RegexSet`](RegexSet), one can disable Unicode mode even if
/// it would match invalid UTF-8:
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"."])
/// .unicode(false)
/// .build()
/// .unwrap();
/// // Normally, '.' will never match invalid UTF-8 such as b"\xFF". But
/// // since Unicode mode is disabled here, '.' can match any byte.
/// assert!(re.is_match(b"\xFF"));
/// ```
pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
self.builder.unicode(yes);
self
}
/// This configures whether to enable case insensitive matching for all
/// of the patterns.
///
/// This setting can also be configured using the inline flag `i`
/// in the pattern. For example, `(?i:foo)` matches `foo` case
/// insensitively while `(?-i:foo)` matches `foo` case sensitively.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"foo(?-i:bar)quux"])
/// .case_insensitive(true)
/// .build()
/// .unwrap();
/// assert!(re.is_match(b"FoObarQuUx"));
/// // Even though case insensitive matching is enabled in the builder,
/// // it can be locally disabled within the pattern. In this case,
/// // `bar` is matched case sensitively.
/// assert!(!re.is_match(b"fooBARquux"));
/// ```
pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder {
self.builder.case_insensitive(yes);
self
}
/// This configures multi-line mode for all of the patterns.
///
/// Enabling multi-line mode changes the behavior of the `^` and `$`
/// anchor assertions. Instead of only matching at the beginning and
/// end of a haystack, respectively, multi-line mode causes them to
/// match at the beginning and end of a line *in addition* to the
/// beginning and end of a haystack. More precisely, `^` will match at
/// the position immediately following a `\n` and `$` will match at the
/// position immediately preceding a `\n`.
///
/// The behavior of this option can be impacted by other settings too:
///
/// * The [`RegexSetBuilder::line_terminator`] option changes `\n`
/// above to any ASCII byte.
/// * The [`RegexSetBuilder::crlf`] option changes the line terminator
/// to be either `\r` or `\n`, with the additional rule that `^` and
/// `$` never match at the position between a `\r` and a `\n`.
///
/// This setting can also be configured using the inline flag `m` in
/// the pattern.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"^foo$"])
/// .multi_line(true)
/// .build()
/// .unwrap();
/// assert!(re.is_match(b"\nfoo\n"));
/// ```
pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
self.builder.multi_line(yes);
self
}
/// This configures dot-matches-new-line mode for all of the patterns.
///
/// Perhaps surprisingly, the default behavior for `.` is not to match
/// any character, but rather, to match any character except for the
/// line terminator (which is `\n` by default). When this mode is
/// enabled, the behavior changes such that `.` truly matches any
/// character.
///
/// This setting can also be configured using the inline flag `s` in
/// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
/// regexes.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"foo.bar"])
/// .dot_matches_new_line(true)
/// .build()
/// .unwrap();
/// let hay = b"foo\nbar";
/// assert!(re.is_match(hay));
/// ```
pub fn dot_matches_new_line(
&mut self,
yes: bool,
) -> &mut RegexSetBuilder {
self.builder.dot_matches_new_line(yes);
self
}
/// This configures CRLF mode for all of the patterns.
///
/// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
/// short) and `\n` ("line feed" or LF for short) are treated as line
/// terminators. This results in the following:
///
/// * Unless dot-matches-new-line mode is enabled, `.` will now match
/// any character except for `\n` and `\r`.
/// * When multi-line mode is enabled, `^` will match immediately
/// following a `\n` or a `\r`. Similarly, `$` will match immediately
/// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
/// between `\r` and `\n`.
///
/// This setting can also be configured using the inline flag `R` in
/// the pattern.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"^foo$"])
/// .multi_line(true)
/// .crlf(true)
/// .build()
/// .unwrap();
/// let hay = b"\r\nfoo\r\n";
/// // If CRLF mode weren't enabled here, then '$' wouldn't match
/// // immediately after 'foo', and thus no match would be found.
/// assert!(re.is_match(hay));
/// ```
///
/// This example demonstrates that `^` will never match at a position
/// between `\r` and `\n`. (`$` will similarly not match between a `\r`
/// and a `\n`.)
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"^\n"])
/// .multi_line(true)
/// .crlf(true)
/// .build()
/// .unwrap();
/// assert!(!re.is_match(b"\r\n"));
/// ```
pub fn crlf(&mut self, yes: bool) -> &mut RegexSetBuilder {
self.builder.crlf(yes);
self
}
/// Configures the line terminator to be used by the regex.
///
/// The line terminator is relevant in two ways for a particular regex:
///
/// * When dot-matches-new-line mode is *not* enabled (the default),
/// then `.` will match any character except for the configured line
/// terminator.
/// * When multi-line mode is enabled (not the default), then `^` and
/// `$` will match immediately after and before, respectively, a line
/// terminator.
///
/// In both cases, if CRLF mode is enabled in a particular context,
/// then it takes precedence over any configured line terminator.
///
/// This option cannot be configured from within the pattern.
///
/// The default line terminator is `\n`.
///
/// # Example
///
/// This shows how to treat the NUL byte as a line terminator. This can
/// be a useful heuristic when searching binary data.
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"^foo$"])
/// .multi_line(true)
/// .line_terminator(b'\x00')
/// .build()
/// .unwrap();
/// let hay = b"\x00foo\x00";
/// assert!(re.is_match(hay));
/// ```
///
/// This example shows that the behavior of `.` is impacted by this
/// setting as well:
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"."])
/// .line_terminator(b'\x00')
/// .build()
/// .unwrap();
/// assert!(re.is_match(b"\n"));
/// assert!(!re.is_match(b"\x00"));
/// ```
///
/// This shows that building a regex will work even when the byte given
/// is not ASCII. This is unlike the top-level `RegexSet` API where
/// matching invalid UTF-8 is not allowed.
///
/// Note though that you must disable Unicode mode. This is required
/// because Unicode mode requires matching one codepoint at a time,
/// and there is no way to match a non-ASCII byte as if it were a
/// codepoint.
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// assert!(
/// RegexSetBuilder::new([r"."])
/// .unicode(false)
/// .line_terminator(0x80)
/// .build()
/// .is_ok(),
/// );
/// ```
pub fn line_terminator(&mut self, byte: u8) -> &mut RegexSetBuilder {
self.builder.line_terminator(byte);
self
}
/// This configures swap-greed mode for all of the patterns.
///
/// When swap-greed mode is enabled, patterns like `a+` will become
/// non-greedy and patterns like `a+?` will become greedy. In other
/// words, the meanings of `a+` and `a+?` are switched.
///
/// This setting can also be configured using the inline flag `U` in
/// the pattern.
///
/// Note that this is generally not useful for a `RegexSet` since a
/// `RegexSet` can only report whether a pattern matches or not. Since
/// greediness never impacts whether a match is found or not (only the
/// offsets of the match), it follows that whether parts of a pattern
/// are greedy or not doesn't matter for a `RegexSet`.
///
/// The default for this is `false`.
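///
/// # Example
///
/// A small illustration of the point above: even with swap-greed
/// enabled, `is_match` still reports a match, since greediness only
/// affects the offsets a match would report, not whether one exists.
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"a+"])
/// .swap_greed(true)
/// .build()
/// .unwrap();
/// assert!(re.is_match(b"aaa"));
/// ```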
pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder {
self.builder.swap_greed(yes);
self
}
/// This configures verbose mode for all of the patterns.
///
/// When enabled, whitespace will be treated as insignificant in the
/// pattern and `#` can be used to start a comment until the next
/// newline.
///
/// Normally, in most places in a pattern, whitespace is treated
/// literally. For example ` +` will match one or more ASCII whitespace
/// characters.
///
/// When verbose mode is enabled, `\#` can be used to match a literal
/// `#` and `\ ` can be used to match a literal ASCII whitespace
/// character.
///
/// Verbose mode is useful for permitting regexes to be formatted and
/// broken up more nicely. This may make them more easily readable.
///
/// This setting can also be configured using the inline flag `x` in
/// the pattern.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// let pat = r"
/// \b
/// (?<first>\p{Uppercase}\w*) # always start with uppercase letter
/// [\s--\n]+ # whitespace should separate names
/// (?: # middle name can be an initial!
/// (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
/// [\s--\n]+
/// )?
/// (?<last>\p{Uppercase}\w*)
/// \b
/// ";
/// let re = RegexSetBuilder::new([pat])
/// .ignore_whitespace(true)
/// .build()
/// .unwrap();
/// assert!(re.is_match(b"Harry Potter"));
/// assert!(re.is_match(b"Harry J. Potter"));
/// assert!(re.is_match(b"Harry James Potter"));
/// assert!(!re.is_match(b"harry J. Potter"));
/// ```
pub fn ignore_whitespace(
&mut self,
yes: bool,
) -> &mut RegexSetBuilder {
self.builder.ignore_whitespace(yes);
self
}
/// This configures octal mode for all of the patterns.
///
/// Octal syntax is a little-known way of uttering Unicode codepoints
/// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
/// equivalent patterns, where the last example shows octal syntax.
///
/// While supporting octal syntax isn't in and of itself a problem,
/// it does make good error messages harder. That is, in PCRE based
/// regex engines, syntax like `\1` invokes a backreference, which is
/// explicitly unsupported by this library. However, many users expect
/// backreferences to be supported. Therefore, when octal support
/// is disabled, the error message will explicitly mention that
/// backreferences aren't supported.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// // Normally this pattern would not compile, with an error message
/// // about backreferences not being supported. But with octal mode
/// // enabled, octal escape sequences work.
/// let re = RegexSetBuilder::new([r"\141"])
/// .octal(true)
/// .build()
/// .unwrap();
/// assert!(re.is_match(b"a"));
/// ```
pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
self.builder.octal(yes);
self
}
/// Sets the approximate size limit, in bytes, of the compiled regex.
///
/// This roughly corresponds to the amount of heap memory, in
/// bytes, occupied by a single regex. If the regex would otherwise
/// approximately exceed this limit, then compiling that regex will
/// fail.
///
/// The main utility of a method like this is to avoid compiling
/// regexes that use an unexpected amount of resources, such as
/// time and memory. Even if the memory usage of a large regex is
/// acceptable, its search time may not be. Namely, worst case time
/// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
/// `n ~ len(haystack)`. That is, search time depends, in part, on the
/// size of the compiled regex. This means that putting a limit on the
/// size of the regex limits how much a regex can impact search time.
///
/// For more information about regex size limits, see the section on
/// [untrusted inputs](crate#untrusted-input) in the top-level crate
/// documentation.
///
/// The default for this is some reasonable number that permits most
/// patterns to compile successfully.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// // It may surprise you how big some seemingly small patterns can
/// // be! Since \w is Unicode aware, this generates a regex that can
/// // match approximately 140,000 distinct codepoints.
/// assert!(
/// RegexSetBuilder::new([r"\w"])
/// .size_limit(45_000)
/// .build()
/// .is_err()
/// );
/// ```
pub fn size_limit(&mut self, bytes: usize) -> &mut RegexSetBuilder {
self.builder.size_limit(bytes);
self
}
/// Set the approximate capacity, in bytes, of the cache of transitions
/// used by the lazy DFA.
///
/// While the lazy DFA isn't always used, it tends to be the most
/// commonly used regex engine in default configurations. It tends to
/// adopt the performance profile of a fully built DFA, but without the
/// downside of taking worst case exponential time to build.
///
/// The downside is that it needs to keep a cache of transitions and
/// states that are built while running a search, and this cache
/// can fill up. When it fills up, the cache will reset itself. Any
/// previously generated states and transitions will then need to be
/// re-generated. If this happens too many times, then this library
/// will bail out of using the lazy DFA and switch to a different regex
/// engine.
///
/// If your regex provokes this particular downside of the lazy DFA,
/// then it may be beneficial to increase its cache capacity. This will
/// potentially reduce the frequency of cache resetting (ideally to
/// `0`). While it won't fix all potential performance problems with
/// the lazy DFA, increasing the cache capacity does fix some.
///
/// There is no easy way to determine, a priori, whether increasing
/// this cache capacity will help. In general, the larger your regex,
/// the more cache it's likely to use. But that isn't an ironclad rule.
/// For example, a regex like `[01]*1[01]{N}` would normally produce a
/// fully built DFA that is exponential in size with respect to `N`.
/// The lazy DFA will prevent exponential space blow-up, but its cache
/// is likely to fill up, even when it's large and even for smallish
/// values of `N`.
///
/// If you aren't sure whether this helps or not, it is sensible to
/// set this to some arbitrarily large number in testing, such as
/// `usize::MAX`. Namely, this represents the amount of capacity that
/// *may* be used. It's probably not a good idea to use `usize::MAX` in
/// production though, since it implies there are no controls on heap
/// memory used by this library during a search. In effect, set it to
/// whatever you're willing to allocate for a single regex search.
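///
/// # Example
///
/// A minimal sketch of raising the cache capacity. The `10 * (1 << 20)`
/// (10 MiB) value is arbitrary and only illustrative.
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// // Permit the lazy DFA to use up to 10 MiB for its transition cache.
/// let re = RegexSetBuilder::new([r"foo", r"bar"])
/// .dfa_size_limit(10 * (1 << 20))
/// .build()
/// .unwrap();
/// assert!(re.is_match(b"foo"));
/// ```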
pub fn dfa_size_limit(
&mut self,
bytes: usize,
) -> &mut RegexSetBuilder {
self.builder.dfa_size_limit(bytes);
self
}
/// Set the nesting limit for this parser.
///
/// The nesting limit controls how deep the abstract syntax tree is
/// allowed to be. If the AST exceeds the given limit (e.g., with too
/// many nested groups), then an error is returned by the parser.
///
/// The purpose of this limit is to act as a heuristic to prevent stack
/// overflow for consumers that do structural induction on an AST using
/// explicit recursion. While this crate never does this (instead using
/// constant stack space and moving the call stack to the heap), other
/// crates may.
///
/// This limit is not checked until the entire AST is parsed.
/// Therefore, if callers want to put a limit on the amount of heap
/// space used, then they should impose a limit on the length, in
/// bytes, of the concrete pattern string. In particular, this is
/// viable since this parser implementation will limit itself to heap
/// space proportional to the length of the pattern string. See also
/// the [untrusted inputs](crate#untrusted-input) section in the
/// top-level crate documentation for more information about this.
///
/// Note that a nest limit of `0` will return a nest limit error for
/// most patterns but not all. For example, a nest limit of `0` permits
/// `a` but not `ab`, since `ab` requires an explicit concatenation,
/// which results in a nest depth of `1`. In general, a nest limit is
/// not something that manifests in an obvious way in the concrete
/// syntax; therefore, it should not be used in a granular way.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// assert!(RegexSetBuilder::new([r"a"]).nest_limit(0).build().is_ok());
/// assert!(RegexSetBuilder::new([r"ab"]).nest_limit(0).build().is_err());
/// ```
pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder {
self.builder.nest_limit(limit);
self
}
}
}