2535 lines
105 KiB
Rust
2535 lines
105 KiB
Rust
#![allow(warnings)]
|
||
|
||
// This module defines an internal builder that encapsulates all interaction
|
||
// with meta::Regex construction, and then 4 public API builders that wrap
|
||
// around it. The docs are essentially repeated on each of the 4 public
|
||
// builders, with tweaks to the examples as needed.
|
||
//
|
||
// The reason why there are so many builders is partially because of a misstep
|
||
// in the initial API design: the builder constructor takes in the pattern
|
||
// strings instead of using the `build` method to accept the pattern strings.
|
||
// This means `new` has a different signature for each builder. It probably
|
||
// would have been nicer to use one builder with `fn new()`, and then add
|
||
// `build(pat)` and `build_many(pats)` constructors.
|
||
//
|
||
// The other reason is because I think the `bytes` module should probably
|
||
// have its own builder type. That way, it is completely isolated from the
|
||
// top-level API.
|
||
//
|
||
// If I could do it again, I'd probably have a `regex::Builder` and a
|
||
// `regex::bytes::Builder`. Each would have `build` and `build_set` (or
|
||
// `build_many`) methods for constructing a single pattern `Regex` and a
|
||
// multi-pattern `RegexSet`, respectively.
|
||
|
||
use alloc::{
|
||
string::{String, ToString},
|
||
sync::Arc,
|
||
vec,
|
||
vec::Vec,
|
||
};
|
||
|
||
use regex_automata::{
|
||
meta, nfa::thompson::WhichCaptures, util::syntax, MatchKind,
|
||
};
|
||
|
||
use crate::error::Error;
|
||
|
||
/// A builder for constructing a `Regex`, `bytes::Regex`, `RegexSet` or a
|
||
/// `bytes::RegexSet`.
|
||
///
|
||
/// This is essentially the implementation of the four different builder types
|
||
/// in the public API: `RegexBuilder`, `bytes::RegexBuilder`, `RegexSetBuilder`
|
||
/// and `bytes::RegexSetBuilder`.
|
||
#[derive(Clone, Debug)]
|
||
struct Builder {
|
||
pats: Vec<String>,
|
||
metac: meta::Config,
|
||
syntaxc: syntax::Config,
|
||
}
|
||
|
||
impl Default for Builder {
|
||
fn default() -> Builder {
|
||
let metac = meta::Config::new()
|
||
.nfa_size_limit(Some(10 * (1 << 20)))
|
||
.hybrid_cache_capacity(2 * (1 << 20));
|
||
Builder { pats: vec![], metac, syntaxc: syntax::Config::default() }
|
||
}
|
||
}
|
||
|
||
impl Builder {
|
||
fn new<I, S>(patterns: I) -> Builder
|
||
where
|
||
S: AsRef<str>,
|
||
I: IntoIterator<Item = S>,
|
||
{
|
||
let mut b = Builder::default();
|
||
b.pats.extend(patterns.into_iter().map(|p| p.as_ref().to_string()));
|
||
b
|
||
}
|
||
|
||
fn build_one_string(&self) -> Result<crate::Regex, Error> {
|
||
assert_eq!(1, self.pats.len());
|
||
let metac = self
|
||
.metac
|
||
.clone()
|
||
.match_kind(MatchKind::LeftmostFirst)
|
||
.utf8_empty(true);
|
||
let syntaxc = self.syntaxc.clone().utf8(true);
|
||
let pattern = Arc::from(self.pats[0].as_str());
|
||
meta::Builder::new()
|
||
.configure(metac)
|
||
.syntax(syntaxc)
|
||
.build(&pattern)
|
||
.map(|meta| crate::Regex { meta, pattern })
|
||
.map_err(Error::from_meta_build_error)
|
||
}
|
||
|
||
fn build_one_bytes(&self) -> Result<crate::bytes::Regex, Error> {
|
||
assert_eq!(1, self.pats.len());
|
||
let metac = self
|
||
.metac
|
||
.clone()
|
||
.match_kind(MatchKind::LeftmostFirst)
|
||
.utf8_empty(false);
|
||
let syntaxc = self.syntaxc.clone().utf8(false);
|
||
let pattern = Arc::from(self.pats[0].as_str());
|
||
meta::Builder::new()
|
||
.configure(metac)
|
||
.syntax(syntaxc)
|
||
.build(&pattern)
|
||
.map(|meta| crate::bytes::Regex { meta, pattern })
|
||
.map_err(Error::from_meta_build_error)
|
||
}
|
||
|
||
fn build_many_string(&self) -> Result<crate::RegexSet, Error> {
|
||
let metac = self
|
||
.metac
|
||
.clone()
|
||
.match_kind(MatchKind::All)
|
||
.utf8_empty(true)
|
||
.which_captures(WhichCaptures::None);
|
||
let syntaxc = self.syntaxc.clone().utf8(true);
|
||
let patterns = Arc::from(self.pats.as_slice());
|
||
meta::Builder::new()
|
||
.configure(metac)
|
||
.syntax(syntaxc)
|
||
.build_many(&patterns)
|
||
.map(|meta| crate::RegexSet { meta, patterns })
|
||
.map_err(Error::from_meta_build_error)
|
||
}
|
||
|
||
fn build_many_bytes(&self) -> Result<crate::bytes::RegexSet, Error> {
|
||
let metac = self
|
||
.metac
|
||
.clone()
|
||
.match_kind(MatchKind::All)
|
||
.utf8_empty(false)
|
||
.which_captures(WhichCaptures::None);
|
||
let syntaxc = self.syntaxc.clone().utf8(false);
|
||
let patterns = Arc::from(self.pats.as_slice());
|
||
meta::Builder::new()
|
||
.configure(metac)
|
||
.syntax(syntaxc)
|
||
.build_many(&patterns)
|
||
.map(|meta| crate::bytes::RegexSet { meta, patterns })
|
||
.map_err(Error::from_meta_build_error)
|
||
}
|
||
|
||
fn case_insensitive(&mut self, yes: bool) -> &mut Builder {
|
||
self.syntaxc = self.syntaxc.case_insensitive(yes);
|
||
self
|
||
}
|
||
|
||
fn multi_line(&mut self, yes: bool) -> &mut Builder {
|
||
self.syntaxc = self.syntaxc.multi_line(yes);
|
||
self
|
||
}
|
||
|
||
fn dot_matches_new_line(&mut self, yes: bool) -> &mut Builder {
|
||
self.syntaxc = self.syntaxc.dot_matches_new_line(yes);
|
||
self
|
||
}
|
||
|
||
fn crlf(&mut self, yes: bool) -> &mut Builder {
|
||
self.syntaxc = self.syntaxc.crlf(yes);
|
||
self
|
||
}
|
||
|
||
fn line_terminator(&mut self, byte: u8) -> &mut Builder {
|
||
self.metac = self.metac.clone().line_terminator(byte);
|
||
self.syntaxc = self.syntaxc.line_terminator(byte);
|
||
self
|
||
}
|
||
|
||
fn swap_greed(&mut self, yes: bool) -> &mut Builder {
|
||
self.syntaxc = self.syntaxc.swap_greed(yes);
|
||
self
|
||
}
|
||
|
||
fn ignore_whitespace(&mut self, yes: bool) -> &mut Builder {
|
||
self.syntaxc = self.syntaxc.ignore_whitespace(yes);
|
||
self
|
||
}
|
||
|
||
fn unicode(&mut self, yes: bool) -> &mut Builder {
|
||
self.syntaxc = self.syntaxc.unicode(yes);
|
||
self
|
||
}
|
||
|
||
fn octal(&mut self, yes: bool) -> &mut Builder {
|
||
self.syntaxc = self.syntaxc.octal(yes);
|
||
self
|
||
}
|
||
|
||
fn size_limit(&mut self, limit: usize) -> &mut Builder {
|
||
self.metac = self.metac.clone().nfa_size_limit(Some(limit));
|
||
self
|
||
}
|
||
|
||
fn dfa_size_limit(&mut self, limit: usize) -> &mut Builder {
|
||
self.metac = self.metac.clone().hybrid_cache_capacity(limit);
|
||
self
|
||
}
|
||
|
||
fn nest_limit(&mut self, limit: u32) -> &mut Builder {
|
||
self.syntaxc = self.syntaxc.nest_limit(limit);
|
||
self
|
||
}
|
||
}
|
||
|
||
pub(crate) mod string {
|
||
use crate::{error::Error, Regex, RegexSet};
|
||
|
||
use super::Builder;
|
||
|
||
/// A configurable builder for a [`Regex`].
|
||
///
|
||
/// This builder can be used to programmatically set flags such as `i`
|
||
/// (case insensitive) and `x` (for verbose mode). This builder can also be
|
||
/// used to configure things like the line terminator and a size limit on
|
||
/// the compiled regular expression.
|
||
#[derive(Clone, Debug)]
|
||
pub struct RegexBuilder {
|
||
builder: Builder,
|
||
}
|
||
|
||
impl RegexBuilder {
|
||
/// Create a new builder with a default configuration for the given
|
||
/// pattern.
|
||
///
|
||
/// If the pattern is invalid or exceeds the configured size limits,
|
||
/// then an error will be returned when [`RegexBuilder::build`] is
|
||
/// called.
|
||
pub fn new(pattern: &str) -> RegexBuilder {
|
||
RegexBuilder { builder: Builder::new([pattern]) }
|
||
}
|
||
|
||
/// Compiles the pattern given to `RegexBuilder::new` with the
|
||
/// configuration set on this builder.
|
||
///
|
||
/// If the pattern isn't a valid regex or if a configured size limit
|
||
/// was exceeded, then an error is returned.
|
||
pub fn build(&self) -> Result<Regex, Error> {
|
||
self.builder.build_one_string()
|
||
}
|
||
|
||
/// This configures Unicode mode for the entire pattern.
|
||
///
|
||
/// Enabling Unicode mode does a number of things:
|
||
///
|
||
/// * Most fundamentally, it causes the fundamental atom of matching
|
||
/// to be a single codepoint. When Unicode mode is disabled, it's a
|
||
/// single byte. For example, when Unicode mode is enabled, `.` will
|
||
/// match `💩` once, whereas it will match 4 times when Unicode mode
|
||
/// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
|
||
/// * Case insensitive matching uses Unicode simple case folding rules.
|
||
/// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
|
||
/// available.
|
||
/// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
|
||
/// `\d`.
|
||
/// * The word boundary assertions, `\b` and `\B`, use the Unicode
|
||
/// definition of a word character.
|
||
///
|
||
/// Note that if Unicode mode is disabled, then the regex will fail to
|
||
/// compile if it could match invalid UTF-8. For example, when Unicode
|
||
/// mode is disabled, then since `.` matches any byte (except for
|
||
/// `\n`), then it can match invalid UTF-8 and thus building a regex
|
||
/// from it will fail. Another example is `\w` and `\W`. Since `\w` can
|
||
/// only match ASCII bytes when Unicode mode is disabled, it's allowed.
|
||
/// But `\W` can match more than ASCII bytes, including invalid UTF-8,
|
||
/// and so it is not allowed. This restriction can be lifted only by
|
||
/// using a [`bytes::Regex`](crate::bytes::Regex).
|
||
///
|
||
/// For more details on the Unicode support in this crate, see the
|
||
/// [Unicode section](crate#unicode) in this crate's top-level
|
||
/// documentation.
|
||
///
|
||
/// The default for this is `true`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::RegexBuilder;
|
||
///
|
||
/// let re = RegexBuilder::new(r"\w")
|
||
/// .unicode(false)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// // Normally greek letters would be included in \w, but since
|
||
/// // Unicode mode is disabled, it only matches ASCII letters.
|
||
/// assert!(!re.is_match("δ"));
|
||
///
|
||
/// let re = RegexBuilder::new(r"s")
|
||
/// .case_insensitive(true)
|
||
/// .unicode(false)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// // Normally 'ſ' is included when searching for 's' case
|
||
/// // insensitively due to Unicode's simple case folding rules. But
|
||
/// // when Unicode mode is disabled, only ASCII case insensitive rules
|
||
/// // are used.
|
||
/// assert!(!re.is_match("ſ"));
|
||
/// ```
|
||
pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
|
||
self.builder.unicode(yes);
|
||
self
|
||
}
|
||
|
||
/// This configures whether to enable case insensitive matching for the
|
||
/// entire pattern.
|
||
///
|
||
/// This setting can also be configured using the inline flag `i`
|
||
/// in the pattern. For example, `(?i:foo)` matches `foo` case
|
||
/// insensitively while `(?-i:foo)` matches `foo` case sensitively.
|
||
///
|
||
/// The default for this is `false`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::RegexBuilder;
|
||
///
|
||
/// let re = RegexBuilder::new(r"foo(?-i:bar)quux")
|
||
/// .case_insensitive(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// assert!(re.is_match("FoObarQuUx"));
|
||
/// // Even though case insensitive matching is enabled in the builder,
|
||
/// // it can be locally disabled within the pattern. In this case,
|
||
/// // `bar` is matched case sensitively.
|
||
/// assert!(!re.is_match("fooBARquux"));
|
||
/// ```
|
||
pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
|
||
self.builder.case_insensitive(yes);
|
||
self
|
||
}
|
||
|
||
/// This configures multi-line mode for the entire pattern.
|
||
///
|
||
/// Enabling multi-line mode changes the behavior of the `^` and `$`
|
||
/// anchor assertions. Instead of only matching at the beginning and
|
||
/// end of a haystack, respectively, multi-line mode causes them to
|
||
/// match at the beginning and end of a line *in addition* to the
|
||
/// beginning and end of a haystack. More precisely, `^` will match at
|
||
/// the position immediately following a `\n` and `$` will match at the
|
||
/// position immediately preceding a `\n`.
|
||
///
|
||
/// The behavior of this option can be impacted by other settings too:
|
||
///
|
||
/// * The [`RegexBuilder::line_terminator`] option changes `\n` above
|
||
/// to any ASCII byte.
|
||
/// * The [`RegexBuilder::crlf`] option changes the line terminator to
|
||
/// be either `\r` or `\n`, but never at the position between a `\r`
|
||
/// and `\n`.
|
||
///
|
||
/// This setting can also be configured using the inline flag `m` in
|
||
/// the pattern.
|
||
///
|
||
/// The default for this is `false`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::RegexBuilder;
|
||
///
|
||
/// let re = RegexBuilder::new(r"^foo$")
|
||
/// .multi_line(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// assert_eq!(Some(1..4), re.find("\nfoo\n").map(|m| m.range()));
|
||
/// ```
|
||
pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
|
||
self.builder.multi_line(yes);
|
||
self
|
||
}
|
||
|
||
/// This configures dot-matches-new-line mode for the entire pattern.
|
||
///
|
||
/// Perhaps surprisingly, the default behavior for `.` is not to match
|
||
/// any character, but rather, to match any character except for the
|
||
/// line terminator (which is `\n` by default). When this mode is
|
||
/// enabled, the behavior changes such that `.` truly matches any
|
||
/// character.
|
||
///
|
||
/// This setting can also be configured using the inline flag `s` in
|
||
/// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
|
||
/// regexes.
|
||
///
|
||
/// The default for this is `false`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::RegexBuilder;
|
||
///
|
||
/// let re = RegexBuilder::new(r"foo.bar")
|
||
/// .dot_matches_new_line(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// let hay = "foo\nbar";
|
||
/// assert_eq!(Some("foo\nbar"), re.find(hay).map(|m| m.as_str()));
|
||
/// ```
|
||
pub fn dot_matches_new_line(
|
||
&mut self,
|
||
yes: bool,
|
||
) -> &mut RegexBuilder {
|
||
self.builder.dot_matches_new_line(yes);
|
||
self
|
||
}
|
||
|
||
/// This configures CRLF mode for the entire pattern.
|
||
///
|
||
/// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
|
||
/// short) and `\n` ("line feed" or LF for short) are treated as line
|
||
/// terminators. This results in the following:
|
||
///
|
||
/// * Unless dot-matches-new-line mode is enabled, `.` will now match
|
||
/// any character except for `\n` and `\r`.
|
||
/// * When multi-line mode is enabled, `^` will match immediately
|
||
/// following a `\n` or a `\r`. Similarly, `$` will match immediately
|
||
/// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
|
||
/// between `\r` and `\n`.
|
||
///
|
||
/// This setting can also be configured using the inline flag `R` in
|
||
/// the pattern.
|
||
///
|
||
/// The default for this is `false`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::RegexBuilder;
|
||
///
|
||
/// let re = RegexBuilder::new(r"^foo$")
|
||
/// .multi_line(true)
|
||
/// .crlf(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// let hay = "\r\nfoo\r\n";
|
||
/// // If CRLF mode weren't enabled here, then '$' wouldn't match
|
||
/// // immediately after 'foo', and thus no match would be found.
|
||
/// assert_eq!(Some("foo"), re.find(hay).map(|m| m.as_str()));
|
||
/// ```
|
||
///
|
||
/// This example demonstrates that `^` will never match at a position
|
||
/// between `\r` and `\n`. (`$` will similarly not match between a `\r`
|
||
/// and a `\n`.)
|
||
///
|
||
/// ```
|
||
/// use regex::RegexBuilder;
|
||
///
|
||
/// let re = RegexBuilder::new(r"^")
|
||
/// .multi_line(true)
|
||
/// .crlf(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// let hay = "\r\n\r\n";
|
||
/// let ranges: Vec<_> = re.find_iter(hay).map(|m| m.range()).collect();
|
||
/// assert_eq!(ranges, vec![0..0, 2..2, 4..4]);
|
||
/// ```
|
||
pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder {
|
||
self.builder.crlf(yes);
|
||
self
|
||
}
|
||
|
||
/// Configures the line terminator to be used by the regex.
|
||
///
|
||
/// The line terminator is relevant in two ways for a particular regex:
|
||
///
|
||
/// * When dot-matches-new-line mode is *not* enabled (the default),
|
||
/// then `.` will match any character except for the configured line
|
||
/// terminator.
|
||
/// * When multi-line mode is enabled (not the default), then `^` and
|
||
/// `$` will match immediately after and before, respectively, a line
|
||
/// terminator.
|
||
///
|
||
/// In both cases, if CRLF mode is enabled in a particular context,
|
||
/// then it takes precedence over any configured line terminator.
|
||
///
|
||
/// This option cannot be configured from within the pattern.
|
||
///
|
||
/// The default line terminator is `\n`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// This shows how to treat the NUL byte as a line terminator. This can
|
||
/// be a useful heuristic when searching binary data.
|
||
///
|
||
/// ```
|
||
/// use regex::RegexBuilder;
|
||
///
|
||
/// let re = RegexBuilder::new(r"^foo$")
|
||
/// .multi_line(true)
|
||
/// .line_terminator(b'\x00')
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// let hay = "\x00foo\x00";
|
||
/// assert_eq!(Some(1..4), re.find(hay).map(|m| m.range()));
|
||
/// ```
|
||
///
|
||
/// This example shows that the behavior of `.` is impacted by this
|
||
/// setting as well:
|
||
///
|
||
/// ```
|
||
/// use regex::RegexBuilder;
|
||
///
|
||
/// let re = RegexBuilder::new(r".")
|
||
/// .line_terminator(b'\x00')
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// assert!(re.is_match("\n"));
|
||
/// assert!(!re.is_match("\x00"));
|
||
/// ```
|
||
///
|
||
/// This shows that building a regex will fail if the byte given
|
||
/// is not ASCII and the pattern could result in matching invalid
|
||
/// UTF-8. This is because any singular non-ASCII byte is not valid
|
||
/// UTF-8, and it is not permitted for a [`Regex`] to match invalid
|
||
/// UTF-8. (It is permissible to use a non-ASCII byte when building a
|
||
/// [`bytes::Regex`](crate::bytes::Regex).)
|
||
///
|
||
/// ```
|
||
/// use regex::RegexBuilder;
|
||
///
|
||
/// assert!(RegexBuilder::new(r".").line_terminator(0x80).build().is_err());
|
||
/// // Note that using a non-ASCII byte isn't enough on its own to
|
||
/// // cause regex compilation to fail. You actually have to make use
|
||
/// // of it in the regex in a way that leads to matching invalid
|
||
/// // UTF-8. If you don't, then regex compilation will succeed!
|
||
/// assert!(RegexBuilder::new(r"a").line_terminator(0x80).build().is_ok());
|
||
/// ```
|
||
pub fn line_terminator(&mut self, byte: u8) -> &mut RegexBuilder {
|
||
self.builder.line_terminator(byte);
|
||
self
|
||
}
|
||
|
||
/// This configures swap-greed mode for the entire pattern.
|
||
///
|
||
/// When swap-greed mode is enabled, patterns like `a+` will become
|
||
/// non-greedy and patterns like `a+?` will become greedy. In other
|
||
/// words, the meanings of `a+` and `a+?` are switched.
|
||
///
|
||
/// This setting can also be configured using the inline flag `U` in
|
||
/// the pattern.
|
||
///
|
||
/// The default for this is `false`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::RegexBuilder;
|
||
///
|
||
/// let re = RegexBuilder::new(r"a+")
|
||
/// .swap_greed(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// assert_eq!(Some("a"), re.find("aaa").map(|m| m.as_str()));
|
||
/// ```
|
||
pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
|
||
self.builder.swap_greed(yes);
|
||
self
|
||
}
|
||
|
||
/// This configures verbose mode for the entire pattern.
|
||
///
|
||
/// When enabled, whitespace will be treated as insignificant in the
|
||
/// pattern and `#` can be used to start a comment until the next new
|
||
/// line.
|
||
///
|
||
/// Normally, in most places in a pattern, whitespace is treated
|
||
/// literally. For example ` +` will match one or more ASCII whitespace
|
||
/// characters.
|
||
///
|
||
/// When verbose mode is enabled, `\#` can be used to match a literal
|
||
/// `#` and `\ ` can be used to match a literal ASCII whitespace
|
||
/// character.
|
||
///
|
||
/// Verbose mode is useful for permitting regexes to be formatted and
|
||
/// broken up more nicely. This may make them more easily readable.
|
||
///
|
||
/// This setting can also be configured using the inline flag `x` in
|
||
/// the pattern.
|
||
///
|
||
/// The default for this is `false`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::RegexBuilder;
|
||
///
|
||
/// let pat = r"
|
||
/// \b
|
||
/// (?<first>\p{Uppercase}\w*) # always start with uppercase letter
|
||
/// [\s--\n]+ # whitespace should separate names
|
||
/// (?: # middle name can be an initial!
|
||
/// (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
|
||
/// [\s--\n]+
|
||
/// )?
|
||
/// (?<last>\p{Uppercase}\w*)
|
||
/// \b
|
||
/// ";
|
||
/// let re = RegexBuilder::new(pat)
|
||
/// .ignore_whitespace(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
///
|
||
/// let caps = re.captures("Harry Potter").unwrap();
|
||
/// assert_eq!("Harry", &caps["first"]);
|
||
/// assert_eq!("Potter", &caps["last"]);
|
||
///
|
||
/// let caps = re.captures("Harry J. Potter").unwrap();
|
||
/// assert_eq!("Harry", &caps["first"]);
|
||
/// // Since a middle name/initial isn't required for an overall match,
|
||
/// // we can't assume that 'initial' or 'middle' will be populated!
|
||
/// assert_eq!(Some("J"), caps.name("initial").map(|m| m.as_str()));
|
||
/// assert_eq!(None, caps.name("middle").map(|m| m.as_str()));
|
||
/// assert_eq!("Potter", &caps["last"]);
|
||
///
|
||
/// let caps = re.captures("Harry James Potter").unwrap();
|
||
/// assert_eq!("Harry", &caps["first"]);
|
||
/// // Since a middle name/initial isn't required for an overall match,
|
||
/// // we can't assume that 'initial' or 'middle' will be populated!
|
||
/// assert_eq!(None, caps.name("initial").map(|m| m.as_str()));
|
||
/// assert_eq!(Some("James"), caps.name("middle").map(|m| m.as_str()));
|
||
/// assert_eq!("Potter", &caps["last"]);
|
||
/// ```
|
||
pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
|
||
self.builder.ignore_whitespace(yes);
|
||
self
|
||
}
|
||
|
||
/// This configures octal mode for the entire pattern.
|
||
///
|
||
/// Octal syntax is a little-known way of uttering Unicode codepoints
|
||
/// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
|
||
/// equivalent patterns, where the last example shows octal syntax.
|
||
///
|
||
/// While supporting octal syntax isn't in and of itself a problem,
|
||
/// it does make good error messages harder. That is, in PCRE based
|
||
/// regex engines, syntax like `\1` invokes a backreference, which is
|
||
/// explicitly unsupported by this library. However, many users expect
|
||
/// backreferences to be supported. Therefore, when octal support
|
||
/// is disabled, the error message will explicitly mention that
|
||
/// backreferences aren't supported.
|
||
///
|
||
/// The default for this is `false`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::RegexBuilder;
|
||
///
|
||
/// // Normally this pattern would not compile, with an error message
|
||
/// // about backreferences not being supported. But with octal mode
|
||
/// // enabled, octal escape sequences work.
|
||
/// let re = RegexBuilder::new(r"\141")
|
||
/// .octal(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// assert!(re.is_match("a"));
|
||
/// ```
|
||
pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
|
||
self.builder.octal(yes);
|
||
self
|
||
}
|
||
|
||
/// Sets the approximate size limit, in bytes, of the compiled regex.
|
||
///
|
||
/// This roughly corresponds to the amount of heap memory, in
|
||
/// bytes, occupied by a single regex. If the regex would otherwise
|
||
/// approximately exceed this limit, then compiling that regex will
|
||
/// fail.
|
||
///
|
||
/// The main utility of a method like this is to avoid compiling
|
||
/// regexes that use an unexpected amount of resources, such as
|
||
/// time and memory. Even if the memory usage of a large regex is
|
||
/// acceptable, its search time may not be. Namely, worst case time
|
||
/// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
|
||
/// `n ~ len(haystack)`. That is, search time depends, in part, on the
|
||
/// size of the compiled regex. This means that putting a limit on the
|
||
/// size of the regex limits how much a regex can impact search time.
|
||
///
|
||
/// For more information about regex size limits, see the section on
|
||
/// [untrusted inputs](crate#untrusted-input) in the top-level crate
|
||
/// documentation.
|
||
///
|
||
/// The default for this is some reasonable number that permits most
|
||
/// patterns to compile successfully.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::RegexBuilder;
|
||
///
|
||
/// // It may surprise you how big some seemingly small patterns can
|
||
/// // be! Since \w is Unicode aware, this generates a regex that can
|
||
/// // match approximately 140,000 distinct codepoints.
|
||
/// assert!(RegexBuilder::new(r"\w").size_limit(45_000).build().is_err());
|
||
/// ```
|
||
pub fn size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
|
||
self.builder.size_limit(bytes);
|
||
self
|
||
}
|
||
|
||
/// Set the approximate capacity, in bytes, of the cache of transitions
|
||
/// used by the lazy DFA.
|
||
///
|
||
/// While the lazy DFA isn't always used, it tends to be the most
/// commonly used regex engine in default configurations. It tends to
/// adopt the performance profile of a fully built DFA, but without the
/// downside of taking worst case exponential time to build.
|
||
///
|
||
/// The downside is that it needs to keep a cache of transitions and
|
||
/// states that are built while running a search, and this cache
|
||
/// can fill up. When it fills up, the cache will reset itself. Any
|
||
/// previously generated states and transitions will then need to be
|
||
/// re-generated. If this happens too many times, then this library
|
||
/// will bail out of using the lazy DFA and switch to a different regex
|
||
/// engine.
|
||
///
|
||
/// If your regex provokes this particular downside of the lazy DFA,
|
||
/// then it may be beneficial to increase its cache capacity. This will
|
||
/// potentially reduce the frequency of cache resetting (ideally to
|
||
/// `0`). While it won't fix all potential performance problems with
|
||
/// the lazy DFA, increasing the cache capacity does fix some.
|
||
///
|
||
/// There is no easy way to determine, a priori, whether increasing
|
||
/// this cache capacity will help. In general, the larger your regex,
|
||
/// the more cache it's likely to use. But that isn't an ironclad rule.
|
||
/// For example, a regex like `[01]*1[01]{N}` would normally produce a
|
||
/// fully built DFA that is exponential in size with respect to `N`.
/// The lazy DFA will prevent exponential space blow-up, but its cache
|
||
/// is likely to fill up, even when it's large and even for smallish
|
||
/// values of `N`.
|
||
///
|
||
/// If you aren't sure whether this helps or not, it is sensible to
|
||
/// set this to some arbitrarily large number in testing, such as
|
||
/// `usize::MAX`. Namely, this represents the amount of capacity that
|
||
/// *may* be used. It's probably not a good idea to use `usize::MAX` in
|
||
/// production though, since it implies there are no controls on heap
|
||
/// memory used by this library during a search. In effect, set it to
|
||
/// whatever you're willing to allocate for a single regex search.
|
||
pub fn dfa_size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
|
||
self.builder.dfa_size_limit(bytes);
|
||
self
|
||
}
|
||
|
||
/// Set the nesting limit for this parser.
|
||
///
|
||
/// The nesting limit controls how deep the abstract syntax tree is
|
||
/// allowed to be. If the AST exceeds the given limit (e.g., with too
|
||
/// many nested groups), then an error is returned by the parser.
|
||
///
|
||
/// The purpose of this limit is to act as a heuristic to prevent stack
|
||
/// overflow for consumers that do structural induction on an AST using
|
||
/// explicit recursion. While this crate never does this (instead using
|
||
/// constant stack space and moving the call stack to the heap), other
|
||
/// crates may.
|
||
///
|
||
/// This limit is not checked until the entire AST is parsed.
|
||
/// Therefore, if callers want to put a limit on the amount of heap
|
||
/// space used, then they should impose a limit on the length, in
|
||
/// bytes, of the concrete pattern string. In particular, this is
|
||
/// viable since this parser implementation will limit itself to heap
|
||
/// space proportional to the length of the pattern string. See also
|
||
/// the [untrusted inputs](crate#untrusted-input) section in the
|
||
/// top-level crate documentation for more information about this.
|
||
///
|
||
/// Note that a nest limit of `0` will return a nest limit error for
|
||
/// most patterns but not all. For example, a nest limit of `0` permits
|
||
/// `a` but not `ab`, since `ab` requires an explicit concatenation,
|
||
/// which results in a nest depth of `1`. In general, a nest limit is
|
||
/// not something that manifests in an obvious way in the concrete
|
||
/// syntax, therefore, it should not be used in a granular way.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::RegexBuilder;
|
||
///
|
||
/// assert!(RegexBuilder::new(r"a").nest_limit(0).build().is_ok());
|
||
/// assert!(RegexBuilder::new(r"ab").nest_limit(0).build().is_err());
|
||
/// ```
|
||
pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
|
||
self.builder.nest_limit(limit);
|
||
self
|
||
}
|
||
}
|
||
|
||
/// A configurable builder for a [`RegexSet`].
|
||
///
|
||
/// This builder can be used to programmatically set flags such as
|
||
/// `i` (case insensitive) and `x` (for verbose mode). This builder
|
||
/// can also be used to configure things like the line terminator
|
||
/// and a size limit on the compiled regular expression.
|
||
#[derive(Clone, Debug)]
|
||
pub struct RegexSetBuilder {
|
||
builder: Builder,
|
||
}
|
||
|
||
impl RegexSetBuilder {
|
||
/// Create a new builder with a default configuration for the given
|
||
/// patterns.
|
||
///
|
||
/// If the patterns are invalid or exceed the configured size limits,
|
||
/// then an error will be returned when [`RegexSetBuilder::build`] is
|
||
/// called.
|
||
pub fn new<I, S>(patterns: I) -> RegexSetBuilder
|
||
where
|
||
I: IntoIterator<Item = S>,
|
||
S: AsRef<str>,
|
||
{
|
||
RegexSetBuilder { builder: Builder::new(patterns) }
|
||
}
|
||
|
||
/// Compiles the patterns given to `RegexSetBuilder::new` with the
|
||
/// configuration set on this builder.
|
||
///
|
||
/// If the patterns aren't valid regexes or if a configured size limit
|
||
/// was exceeded, then an error is returned.
|
||
pub fn build(&self) -> Result<RegexSet, Error> {
|
||
self.builder.build_many_string()
|
||
}
|
||
|
||
/// This configures Unicode mode for all of the patterns.
|
||
///
|
||
/// Enabling Unicode mode does a number of things:
|
||
///
|
||
/// * Most fundamentally, it causes the fundamental atom of matching
|
||
/// to be a single codepoint. When Unicode mode is disabled, it's a
|
||
/// single byte. For example, when Unicode mode is enabled, `.` will
|
||
/// match `💩` once, whereas it will match 4 times when Unicode mode
|
||
/// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
|
||
/// * Case insensitive matching uses Unicode simple case folding rules.
|
||
/// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
|
||
/// available.
|
||
/// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
|
||
/// `\d`.
|
||
/// * The word boundary assertions, `\b` and `\B`, use the Unicode
|
||
/// definition of a word character.
|
||
///
|
||
/// Note that if Unicode mode is disabled, then the regex will fail to
|
||
/// compile if it could match invalid UTF-8. For example, when Unicode
|
||
/// mode is disabled, then since `.` matches any byte (except for
|
||
/// `\n`), then it can match invalid UTF-8 and thus building a regex
|
||
/// from it will fail. Another example is `\w` and `\W`. Since `\w` can
|
||
/// only match ASCII bytes when Unicode mode is disabled, it's allowed.
|
||
/// But `\W` can match more than ASCII bytes, including invalid UTF-8,
|
||
/// and so it is not allowed. This restriction can be lifted only by
|
||
/// using a [`bytes::RegexSet`](crate::bytes::RegexSet).
|
||
///
|
||
/// For more details on the Unicode support in this crate, see the
|
||
/// [Unicode section](crate#unicode) in this crate's top-level
|
||
/// documentation.
|
||
///
|
||
/// The default for this is `true`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::RegexSetBuilder;
|
||
///
|
||
/// let re = RegexSetBuilder::new([r"\w"])
|
||
/// .unicode(false)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// // Normally greek letters would be included in \w, but since
|
||
/// // Unicode mode is disabled, it only matches ASCII letters.
|
||
/// assert!(!re.is_match("δ"));
|
||
///
|
||
/// let re = RegexSetBuilder::new([r"s"])
|
||
/// .case_insensitive(true)
|
||
/// .unicode(false)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// // Normally 'ſ' is included when searching for 's' case
|
||
/// // insensitively due to Unicode's simple case folding rules. But
|
||
/// // when Unicode mode is disabled, only ASCII case insensitive rules
|
||
/// // are used.
|
||
/// assert!(!re.is_match("ſ"));
|
||
/// ```
|
||
pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
    // Forward the flag to the internal builder, then return `self` so
    // that calls can be chained.
    let inner = &mut self.builder;
    inner.unicode(yes);
    self
}
|
||
|
||
/// This configures whether to enable case insensitive matching for all
|
||
/// of the patterns.
|
||
///
|
||
/// This setting can also be configured using the inline flag `i`
|
||
/// in the pattern. For example, `(?i:foo)` matches `foo` case
|
||
/// insensitively while `(?-i:foo)` matches `foo` case sensitively.
|
||
///
|
||
/// The default for this is `false`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::RegexSetBuilder;
|
||
///
|
||
/// let re = RegexSetBuilder::new([r"foo(?-i:bar)quux"])
|
||
/// .case_insensitive(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// assert!(re.is_match("FoObarQuUx"));
|
||
/// // Even though case insensitive matching is enabled in the builder,
|
||
/// // it can be locally disabled within the pattern. In this case,
|
||
/// // `bar` is matched case sensitively.
|
||
/// assert!(!re.is_match("fooBARquux"));
|
||
/// ```
|
||
pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder {
    // Forward the flag to the internal builder, then return `self` so
    // that calls can be chained.
    let inner = &mut self.builder;
    inner.case_insensitive(yes);
    self
}
|
||
|
||
/// This configures multi-line mode for all of the patterns.
|
||
///
|
||
/// Enabling multi-line mode changes the behavior of the `^` and `$`
|
||
/// anchor assertions. Instead of only matching at the beginning and
|
||
/// end of a haystack, respectively, multi-line mode causes them to
|
||
/// match at the beginning and end of a line *in addition* to the
|
||
/// beginning and end of a haystack. More precisely, `^` will match at
|
||
/// the position immediately following a `\n` and `$` will match at the
|
||
/// position immediately preceding a `\n`.
|
||
///
|
||
/// The behavior of this option can be impacted by other settings too:
|
||
///
|
||
/// * The [`RegexSetBuilder::line_terminator`] option changes `\n`
|
||
/// above to any ASCII byte.
|
||
/// * The [`RegexSetBuilder::crlf`] option changes the line terminator
|
||
/// to be either `\r` or `\n`, but never at the position between a `\r`
|
||
/// and `\n`.
|
||
///
|
||
/// This setting can also be configured using the inline flag `m` in
|
||
/// the pattern.
|
||
///
|
||
/// The default for this is `false`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::RegexSetBuilder;
|
||
///
|
||
/// let re = RegexSetBuilder::new([r"^foo$"])
|
||
/// .multi_line(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// assert!(re.is_match("\nfoo\n"));
|
||
/// ```
|
||
pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
    // Forward the flag to the internal builder, then return `self` so
    // that calls can be chained.
    let inner = &mut self.builder;
    inner.multi_line(yes);
    self
}
|
||
|
||
/// This configures dot-matches-new-line mode for all of the patterns.
|
||
///
|
||
/// Perhaps surprisingly, the default behavior for `.` is not to match
|
||
/// any character, but rather, to match any character except for the
|
||
/// line terminator (which is `\n` by default). When this mode is
|
||
/// enabled, the behavior changes such that `.` truly matches any
|
||
/// character.
|
||
///
|
||
/// This setting can also be configured using the inline flag `s` in
|
||
/// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
|
||
/// regexes.
|
||
///
|
||
/// The default for this is `false`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::RegexSetBuilder;
|
||
///
|
||
/// let re = RegexSetBuilder::new([r"foo.bar"])
|
||
/// .dot_matches_new_line(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// let hay = "foo\nbar";
|
||
/// assert!(re.is_match(hay));
|
||
/// ```
|
||
pub fn dot_matches_new_line(
|
||
&mut self,
|
||
yes: bool,
|
||
) -> &mut RegexSetBuilder {
|
||
self.builder.dot_matches_new_line(yes);
|
||
self
|
||
}
|
||
|
||
/// This configures CRLF mode for all of the patterns.
|
||
///
|
||
/// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
|
||
/// short) and `\n` ("line feed" or LF for short) are treated as line
|
||
/// terminators. This results in the following:
|
||
///
|
||
/// * Unless dot-matches-new-line mode is enabled, `.` will now match
|
||
/// any character except for `\n` and `\r`.
|
||
/// * When multi-line mode is enabled, `^` will match immediately
|
||
/// following a `\n` or a `\r`. Similarly, `$` will match immediately
|
||
/// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
|
||
/// between `\r` and `\n`.
|
||
///
|
||
/// This setting can also be configured using the inline flag `R` in
|
||
/// the pattern.
|
||
///
|
||
/// The default for this is `false`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::RegexSetBuilder;
|
||
///
|
||
/// let re = RegexSetBuilder::new([r"^foo$"])
|
||
/// .multi_line(true)
|
||
/// .crlf(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// let hay = "\r\nfoo\r\n";
|
||
/// // If CRLF mode weren't enabled here, then '$' wouldn't match
|
||
/// // immediately after 'foo', and thus no match would be found.
|
||
/// assert!(re.is_match(hay));
|
||
/// ```
|
||
///
|
||
/// This example demonstrates that `^` will never match at a position
|
||
/// between `\r` and `\n`. (`$` will similarly not match between a `\r`
|
||
/// and a `\n`.)
|
||
///
|
||
/// ```
|
||
/// use regex::RegexSetBuilder;
|
||
///
|
||
/// let re = RegexSetBuilder::new([r"^\n"])
|
||
/// .multi_line(true)
|
||
/// .crlf(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// assert!(!re.is_match("\r\n"));
|
||
/// ```
|
||
pub fn crlf(&mut self, yes: bool) -> &mut RegexSetBuilder {
    // Forward the flag to the internal builder, then return `self` so
    // that calls can be chained.
    let inner = &mut self.builder;
    inner.crlf(yes);
    self
}
|
||
|
||
/// Configures the line terminator to be used by the regex.
|
||
///
|
||
/// The line terminator is relevant in two ways for a particular regex:
|
||
///
|
||
/// * When dot-matches-new-line mode is *not* enabled (the default),
|
||
/// then `.` will match any character except for the configured line
|
||
/// terminator.
|
||
/// * When multi-line mode is enabled (not the default), then `^` and
|
||
/// `$` will match immediately after and before, respectively, a line
|
||
/// terminator.
|
||
///
|
||
/// In both cases, if CRLF mode is enabled in a particular context,
|
||
/// then it takes precedence over any configured line terminator.
|
||
///
|
||
/// This option cannot be configured from within the pattern.
|
||
///
|
||
/// The default line terminator is `\n`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// This shows how to treat the NUL byte as a line terminator. This can
|
||
/// be a useful heuristic when searching binary data.
|
||
///
|
||
/// ```
|
||
/// use regex::RegexSetBuilder;
|
||
///
|
||
/// let re = RegexSetBuilder::new([r"^foo$"])
|
||
/// .multi_line(true)
|
||
/// .line_terminator(b'\x00')
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// let hay = "\x00foo\x00";
|
||
/// assert!(re.is_match(hay));
|
||
/// ```
|
||
///
|
||
/// This example shows that the behavior of `.` is impacted by this
|
||
/// setting as well:
|
||
///
|
||
/// ```
|
||
/// use regex::RegexSetBuilder;
|
||
///
|
||
/// let re = RegexSetBuilder::new([r"."])
|
||
/// .line_terminator(b'\x00')
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// assert!(re.is_match("\n"));
|
||
/// assert!(!re.is_match("\x00"));
|
||
/// ```
|
||
///
|
||
/// This shows that building a regex will fail if the byte given
|
||
/// is not ASCII and the pattern could result in matching invalid
|
||
/// UTF-8. This is because any singular non-ASCII byte is not valid
|
||
/// UTF-8, and it is not permitted for a [`RegexSet`] to match invalid
|
||
/// UTF-8. (It is permissible to use a non-ASCII byte when building a
|
||
/// [`bytes::RegexSet`](crate::bytes::RegexSet).)
|
||
///
|
||
/// ```
|
||
/// use regex::RegexSetBuilder;
|
||
///
|
||
/// assert!(
|
||
/// RegexSetBuilder::new([r"."])
|
||
/// .line_terminator(0x80)
|
||
/// .build()
|
||
/// .is_err()
|
||
/// );
|
||
/// // Note that using a non-ASCII byte isn't enough on its own to
|
||
/// // cause regex compilation to fail. You actually have to make use
|
||
/// // of it in the regex in a way that leads to matching invalid
|
||
/// // UTF-8. If you don't, then regex compilation will succeed!
|
||
/// assert!(
|
||
/// RegexSetBuilder::new([r"a"])
|
||
/// .line_terminator(0x80)
|
||
/// .build()
|
||
/// .is_ok()
|
||
/// );
|
||
/// ```
|
||
pub fn line_terminator(&mut self, byte: u8) -> &mut RegexSetBuilder {
    // Forward the byte to the internal builder, then return `self` so
    // that calls can be chained.
    let inner = &mut self.builder;
    inner.line_terminator(byte);
    self
}
|
||
|
||
/// This configures swap-greed mode for all of the patterns.
|
||
///
|
||
/// When swap-greed mode is enabled, patterns like `a+` will become
|
||
/// non-greedy and patterns like `a+?` will become greedy. In other
|
||
/// words, the meanings of `a+` and `a+?` are switched.
|
||
///
|
||
/// This setting can also be configured using the inline flag `U` in
|
||
/// the pattern.
|
||
///
|
||
/// Note that this is generally not useful for a `RegexSet` since a
|
||
/// `RegexSet` can only report whether a pattern matches or not. Since
|
||
/// greediness never impacts whether a match is found or not (only the
|
||
/// offsets of the match), it follows that whether parts of a pattern
|
||
/// are greedy or not doesn't matter for a `RegexSet`.
|
||
///
|
||
/// The default for this is `false`.
|
||
pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder {
    // Forward the flag to the internal builder, then return `self` so
    // that calls can be chained.
    let inner = &mut self.builder;
    inner.swap_greed(yes);
    self
}
|
||
|
||
/// This configures verbose mode for all of the patterns.
|
||
///
|
||
/// When enabled, whitespace will be treated as insignificant in the
|
||
/// pattern and `#` can be used to start a comment until the next new
|
||
/// line.
|
||
///
|
||
/// Normally, in most places in a pattern, whitespace is treated
|
||
/// literally. For example ` +` will match one or more ASCII whitespace
|
||
/// characters.
|
||
///
|
||
/// When verbose mode is enabled, `\#` can be used to match a literal
|
||
/// `#` and `\ ` can be used to match a literal ASCII whitespace
|
||
/// character.
|
||
///
|
||
/// Verbose mode is useful for permitting regexes to be formatted and
|
||
/// broken up more nicely. This may make them more easily readable.
|
||
///
|
||
/// This setting can also be configured using the inline flag `x` in
|
||
/// the pattern.
|
||
///
|
||
/// The default for this is `false`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::RegexSetBuilder;
|
||
///
|
||
/// let pat = r"
|
||
/// \b
|
||
/// (?<first>\p{Uppercase}\w*) # always start with uppercase letter
|
||
/// [\s--\n]+ # whitespace should separate names
|
||
/// (?: # middle name can be an initial!
|
||
/// (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
|
||
/// [\s--\n]+
|
||
/// )?
|
||
/// (?<last>\p{Uppercase}\w*)
|
||
/// \b
|
||
/// ";
|
||
/// let re = RegexSetBuilder::new([pat])
|
||
/// .ignore_whitespace(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// assert!(re.is_match("Harry Potter"));
|
||
/// assert!(re.is_match("Harry J. Potter"));
|
||
/// assert!(re.is_match("Harry James Potter"));
|
||
/// assert!(!re.is_match("harry J. Potter"));
|
||
/// ```
|
||
pub fn ignore_whitespace(
|
||
&mut self,
|
||
yes: bool,
|
||
) -> &mut RegexSetBuilder {
|
||
self.builder.ignore_whitespace(yes);
|
||
self
|
||
}
|
||
|
||
/// This configures octal mode for all of the patterns.
|
||
///
|
||
/// Octal syntax is a little-known way of uttering Unicode codepoints
|
||
/// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
|
||
/// equivalent patterns, where the last example shows octal syntax.
|
||
///
|
||
/// While supporting octal syntax isn't in and of itself a problem,
|
||
/// it does make good error messages harder. That is, in PCRE based
|
||
/// regex engines, syntax like `\1` invokes a backreference, which is
|
||
/// explicitly unsupported by this library. However, many users expect
|
||
/// backreferences to be supported. Therefore, when octal support
|
||
/// is disabled, the error message will explicitly mention that
|
||
/// backreferences aren't supported.
|
||
///
|
||
/// The default for this is `false`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::RegexSetBuilder;
|
||
///
|
||
/// // Normally this pattern would not compile, with an error message
|
||
/// // about backreferences not being supported. But with octal mode
|
||
/// // enabled, octal escape sequences work.
|
||
/// let re = RegexSetBuilder::new([r"\141"])
|
||
/// .octal(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// assert!(re.is_match("a"));
|
||
/// ```
|
||
pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
    // Forward the flag to the internal builder, then return `self` so
    // that calls can be chained.
    let inner = &mut self.builder;
    inner.octal(yes);
    self
}
|
||
|
||
/// Sets the approximate size limit, in bytes, of the compiled regex.
|
||
///
|
||
/// This roughly corresponds to the amount of heap memory, in
|
||
/// bytes, occupied by a single regex. If the regex would otherwise
|
||
/// approximately exceed this limit, then compiling that regex will
|
||
/// fail.
|
||
///
|
||
/// The main utility of a method like this is to avoid compiling
|
||
/// regexes that use an unexpected amount of resources, such as
|
||
/// time and memory. Even if the memory usage of a large regex is
|
||
/// acceptable, its search time may not be. Namely, worst case time
|
||
/// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
|
||
/// `n ~ len(haystack)`. That is, search time depends, in part, on the
|
||
/// size of the compiled regex. This means that putting a limit on the
|
||
/// size of the regex limits how much a regex can impact search time.
|
||
///
|
||
/// For more information about regex size limits, see the section on
|
||
/// [untrusted inputs](crate#untrusted-input) in the top-level crate
|
||
/// documentation.
|
||
///
|
||
/// The default for this is some reasonable number that permits most
|
||
/// patterns to compile successfully.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::RegexSetBuilder;
|
||
///
|
||
/// // It may surprise you how big some seemingly small patterns can
|
||
/// // be! Since \w is Unicode aware, this generates a regex that can
|
||
/// // match approximately 140,000 distinct codepoints.
|
||
/// assert!(
|
||
/// RegexSetBuilder::new([r"\w"])
|
||
/// .size_limit(45_000)
|
||
/// .build()
|
||
/// .is_err()
|
||
/// );
|
||
/// ```
|
||
pub fn size_limit(&mut self, bytes: usize) -> &mut RegexSetBuilder {
    // Forward the limit to the internal builder, then return `self` so
    // that calls can be chained.
    let inner = &mut self.builder;
    inner.size_limit(bytes);
    self
}
|
||
|
||
/// Set the approximate capacity, in bytes, of the cache of transitions
|
||
/// used by the lazy DFA.
|
||
///
|
||
/// While the lazy DFA isn't always used, it tends to be the most
/// commonly used regex engine in default configurations. It tends to
/// adopt the performance profile of a fully built DFA, but without the
|
||
/// downside of taking worst case exponential time to build.
|
||
///
|
||
/// The downside is that it needs to keep a cache of transitions and
|
||
/// states that are built while running a search, and this cache
|
||
/// can fill up. When it fills up, the cache will reset itself. Any
|
||
/// previously generated states and transitions will then need to be
|
||
/// re-generated. If this happens too many times, then this library
|
||
/// will bail out of using the lazy DFA and switch to a different regex
|
||
/// engine.
|
||
///
|
||
/// If your regex provokes this particular downside of the lazy DFA,
|
||
/// then it may be beneficial to increase its cache capacity. This will
|
||
/// potentially reduce the frequency of cache resetting (ideally to
|
||
/// `0`). While it won't fix all potential performance problems with
|
||
/// the lazy DFA, increasing the cache capacity does fix some.
|
||
///
|
||
/// There is no easy way to determine, a priori, whether increasing
|
||
/// this cache capacity will help. In general, the larger your regex,
|
||
/// the more cache it's likely to use. But that isn't an ironclad rule.
|
||
/// For example, a regex like `[01]*1[01]{N}` would normally produce a
|
||
/// fully built DFA that is exponential in size with respect to `N`.
/// The lazy DFA will prevent exponential space blow-up, but its cache
|
||
/// is likely to fill up, even when it's large and even for smallish
|
||
/// values of `N`.
|
||
///
|
||
/// If you aren't sure whether this helps or not, it is sensible to
|
||
/// set this to some arbitrarily large number in testing, such as
|
||
/// `usize::MAX`. Namely, this represents the amount of capacity that
|
||
/// *may* be used. It's probably not a good idea to use `usize::MAX` in
|
||
/// production though, since it implies there are no controls on heap
|
||
/// memory used by this library during a search. In effect, set it to
|
||
/// whatever you're willing to allocate for a single regex search.
|
||
pub fn dfa_size_limit(
|
||
&mut self,
|
||
bytes: usize,
|
||
) -> &mut RegexSetBuilder {
|
||
self.builder.dfa_size_limit(bytes);
|
||
self
|
||
}
|
||
|
||
/// Set the nesting limit for this parser.
|
||
///
|
||
/// The nesting limit controls how deep the abstract syntax tree is
|
||
/// allowed to be. If the AST exceeds the given limit (e.g., with too
|
||
/// many nested groups), then an error is returned by the parser.
|
||
///
|
||
/// The purpose of this limit is to act as a heuristic to prevent stack
|
||
/// overflow for consumers that do structural induction on an AST using
|
||
/// explicit recursion. While this crate never does this (instead using
|
||
/// constant stack space and moving the call stack to the heap), other
|
||
/// crates may.
|
||
///
|
||
/// This limit is not checked until the entire AST is parsed.
|
||
/// Therefore, if callers want to put a limit on the amount of heap
|
||
/// space used, then they should impose a limit on the length, in
|
||
/// bytes, of the concrete pattern string. In particular, this is
|
||
/// viable since this parser implementation will limit itself to heap
|
||
/// space proportional to the length of the pattern string. See also
|
||
/// the [untrusted inputs](crate#untrusted-input) section in the
|
||
/// top-level crate documentation for more information about this.
|
||
///
|
||
/// Note that a nest limit of `0` will return a nest limit error for
|
||
/// most patterns but not all. For example, a nest limit of `0` permits
|
||
/// `a` but not `ab`, since `ab` requires an explicit concatenation,
|
||
/// which results in a nest depth of `1`. In general, a nest limit is
|
||
/// not something that manifests in an obvious way in the concrete
|
||
/// syntax, therefore, it should not be used in a granular way.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::RegexSetBuilder;
|
||
///
|
||
/// assert!(RegexSetBuilder::new([r"a"]).nest_limit(0).build().is_ok());
|
||
/// assert!(RegexSetBuilder::new([r"ab"]).nest_limit(0).build().is_err());
|
||
/// ```
|
||
pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder {
    // Forward the limit to the internal builder, then return `self` so
    // that calls can be chained.
    let inner = &mut self.builder;
    inner.nest_limit(limit);
    self
}
|
||
}
|
||
}
|
||
|
||
pub(crate) mod bytes {
|
||
use crate::{
|
||
bytes::{Regex, RegexSet},
|
||
error::Error,
|
||
};
|
||
|
||
use super::Builder;
|
||
|
||
/// A configurable builder for a [`Regex`].
|
||
///
|
||
/// This builder can be used to programmatically set flags such as `i`
|
||
/// (case insensitive) and `x` (for verbose mode). This builder can also be
|
||
/// used to configure things like the line terminator and a size limit on
|
||
/// the compiled regular expression.
|
||
#[derive(Clone, Debug)]
|
||
pub struct RegexBuilder {
|
||
builder: Builder,
|
||
}
|
||
|
||
impl RegexBuilder {
|
||
/// Create a new builder with a default configuration for the given
|
||
/// pattern.
|
||
///
|
||
/// If the pattern is invalid or exceeds the configured size limits,
|
||
/// then an error will be returned when [`RegexBuilder::build`] is
|
||
/// called.
|
||
pub fn new(pattern: &str) -> RegexBuilder {
|
||
RegexBuilder { builder: Builder::new([pattern]) }
|
||
}
|
||
|
||
/// Compiles the pattern given to `RegexBuilder::new` with the
|
||
/// configuration set on this builder.
|
||
///
|
||
/// If the pattern isn't a valid regex or if a configured size limit
|
||
/// was exceeded, then an error is returned.
|
||
pub fn build(&self) -> Result<Regex, Error> {
    // Compilation is delegated entirely to the internal builder.
    let inner = &self.builder;
    inner.build_one_bytes()
}
|
||
|
||
/// This configures Unicode mode for the entire pattern.
|
||
///
|
||
/// Enabling Unicode mode does a number of things:
|
||
///
|
||
/// * Most fundamentally, it causes the fundamental atom of matching
|
||
/// to be a single codepoint. When Unicode mode is disabled, it's a
|
||
/// single byte. For example, when Unicode mode is enabled, `.` will
|
||
/// match `💩` once, whereas it will match 4 times when Unicode mode
|
||
/// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
|
||
/// * Case insensitive matching uses Unicode simple case folding rules.
|
||
/// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
|
||
/// available.
|
||
/// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
|
||
/// `\d`.
|
||
/// * The word boundary assertions, `\b` and `\B`, use the Unicode
|
||
/// definition of a word character.
|
||
///
|
||
/// Note that unlike the top-level `Regex` for searching `&str`, it
|
||
/// is permitted to disable Unicode mode even if the resulting pattern
|
||
/// could match invalid UTF-8. For example, `(?-u:.)` is not a valid
|
||
/// pattern for a top-level `Regex`, but is valid for a `bytes::Regex`.
|
||
///
|
||
/// For more details on the Unicode support in this crate, see the
|
||
/// [Unicode section](crate#unicode) in this crate's top-level
|
||
/// documentation.
|
||
///
|
||
/// The default for this is `true`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexBuilder;
|
||
///
|
||
/// let re = RegexBuilder::new(r"\w")
|
||
/// .unicode(false)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// // Normally greek letters would be included in \w, but since
|
||
/// // Unicode mode is disabled, it only matches ASCII letters.
|
||
/// assert!(!re.is_match("δ".as_bytes()));
|
||
///
|
||
/// let re = RegexBuilder::new(r"s")
|
||
/// .case_insensitive(true)
|
||
/// .unicode(false)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// // Normally 'ſ' is included when searching for 's' case
|
||
/// // insensitively due to Unicode's simple case folding rules. But
|
||
/// // when Unicode mode is disabled, only ASCII case insensitive rules
|
||
/// // are used.
|
||
/// assert!(!re.is_match("ſ".as_bytes()));
|
||
/// ```
|
||
///
|
||
/// Since this builder is for constructing a [`bytes::Regex`](Regex),
|
||
/// one can disable Unicode mode even if it would match invalid UTF-8:
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexBuilder;
|
||
///
|
||
/// let re = RegexBuilder::new(r".")
|
||
/// .unicode(false)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// // With Unicode mode disabled, `.` matches any single byte (except
/// // for `\n`), including bytes like 0xFF that are never valid UTF-8.
|
||
/// assert!(re.is_match(b"\xFF"));
|
||
/// ```
|
||
pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
    // Forward the flag to the internal builder, then return `self` so
    // that calls can be chained.
    let inner = &mut self.builder;
    inner.unicode(yes);
    self
}
|
||
|
||
/// This configures whether to enable case insensitive matching for the
|
||
/// entire pattern.
|
||
///
|
||
/// This setting can also be configured using the inline flag `i`
|
||
/// in the pattern. For example, `(?i:foo)` matches `foo` case
|
||
/// insensitively while `(?-i:foo)` matches `foo` case sensitively.
|
||
///
|
||
/// The default for this is `false`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexBuilder;
|
||
///
|
||
/// let re = RegexBuilder::new(r"foo(?-i:bar)quux")
|
||
/// .case_insensitive(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// assert!(re.is_match(b"FoObarQuUx"));
|
||
/// // Even though case insensitive matching is enabled in the builder,
|
||
/// // it can be locally disabled within the pattern. In this case,
|
||
/// // `bar` is matched case sensitively.
|
||
/// assert!(!re.is_match(b"fooBARquux"));
|
||
/// ```
|
||
pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
    // Forward the flag to the internal builder, then return `self` so
    // that calls can be chained.
    let inner = &mut self.builder;
    inner.case_insensitive(yes);
    self
}
|
||
|
||
/// This configures multi-line mode for the entire pattern.
|
||
///
|
||
/// Enabling multi-line mode changes the behavior of the `^` and `$`
|
||
/// anchor assertions. Instead of only matching at the beginning and
|
||
/// end of a haystack, respectively, multi-line mode causes them to
|
||
/// match at the beginning and end of a line *in addition* to the
|
||
/// beginning and end of a haystack. More precisely, `^` will match at
|
||
/// the position immediately following a `\n` and `$` will match at the
|
||
/// position immediately preceding a `\n`.
|
||
///
|
||
/// The behavior of this option can be impacted by other settings too:
|
||
///
|
||
/// * The [`RegexBuilder::line_terminator`] option changes `\n` above
|
||
/// to any ASCII byte.
|
||
/// * The [`RegexBuilder::crlf`] option changes the line terminator to
|
||
/// be either `\r` or `\n`, but never at the position between a `\r`
|
||
/// and `\n`.
|
||
///
|
||
/// This setting can also be configured using the inline flag `m` in
|
||
/// the pattern.
|
||
///
|
||
/// The default for this is `false`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexBuilder;
|
||
///
|
||
/// let re = RegexBuilder::new(r"^foo$")
|
||
/// .multi_line(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// assert_eq!(Some(1..4), re.find(b"\nfoo\n").map(|m| m.range()));
|
||
/// ```
|
||
pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
    // Forward the flag to the internal builder, then return `self` so
    // that calls can be chained.
    let inner = &mut self.builder;
    inner.multi_line(yes);
    self
}
|
||
|
||
/// This configures dot-matches-new-line mode for the entire pattern.
|
||
///
|
||
/// Perhaps surprisingly, the default behavior for `.` is not to match
|
||
/// any character, but rather, to match any character except for the
|
||
/// line terminator (which is `\n` by default). When this mode is
|
||
/// enabled, the behavior changes such that `.` truly matches any
|
||
/// character.
|
||
///
|
||
/// This setting can also be configured using the inline flag `s` in
|
||
/// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
|
||
/// regexes.
|
||
///
|
||
/// The default for this is `false`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexBuilder;
|
||
///
|
||
/// let re = RegexBuilder::new(r"foo.bar")
|
||
/// .dot_matches_new_line(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// let hay = b"foo\nbar";
|
||
/// assert_eq!(Some(&b"foo\nbar"[..]), re.find(hay).map(|m| m.as_bytes()));
|
||
/// ```
|
||
pub fn dot_matches_new_line(
|
||
&mut self,
|
||
yes: bool,
|
||
) -> &mut RegexBuilder {
|
||
self.builder.dot_matches_new_line(yes);
|
||
self
|
||
}
|
||
|
||
/// This configures CRLF mode for the entire pattern.
|
||
///
|
||
/// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
|
||
/// short) and `\n` ("line feed" or LF for short) are treated as line
|
||
/// terminators. This results in the following:
|
||
///
|
||
/// * Unless dot-matches-new-line mode is enabled, `.` will now match
|
||
/// any character except for `\n` and `\r`.
|
||
/// * When multi-line mode is enabled, `^` will match immediately
|
||
/// following a `\n` or a `\r`. Similarly, `$` will match immediately
|
||
/// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
|
||
/// between `\r` and `\n`.
|
||
///
|
||
/// This setting can also be configured using the inline flag `R` in
|
||
/// the pattern.
|
||
///
|
||
/// The default for this is `false`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexBuilder;
|
||
///
|
||
/// let re = RegexBuilder::new(r"^foo$")
|
||
/// .multi_line(true)
|
||
/// .crlf(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// let hay = b"\r\nfoo\r\n";
|
||
/// // If CRLF mode weren't enabled here, then '$' wouldn't match
|
||
/// // immediately after 'foo', and thus no match would be found.
|
||
/// assert_eq!(Some(&b"foo"[..]), re.find(hay).map(|m| m.as_bytes()));
|
||
/// ```
|
||
///
|
||
/// This example demonstrates that `^` will never match at a position
|
||
/// between `\r` and `\n`. (`$` will similarly not match between a `\r`
|
||
/// and a `\n`.)
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexBuilder;
|
||
///
|
||
/// let re = RegexBuilder::new(r"^")
|
||
/// .multi_line(true)
|
||
/// .crlf(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// let hay = b"\r\n\r\n";
|
||
/// let ranges: Vec<_> = re.find_iter(hay).map(|m| m.range()).collect();
|
||
/// assert_eq!(ranges, vec![0..0, 2..2, 4..4]);
|
||
/// ```
|
||
pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder {
    // Forward the flag to the internal builder, then return `self` so
    // that calls can be chained.
    let inner = &mut self.builder;
    inner.crlf(yes);
    self
}
|
||
|
||
/// Configures the line terminator to be used by the regex.
|
||
///
|
||
/// The line terminator is relevant in two ways for a particular regex:
|
||
///
|
||
/// * When dot-matches-new-line mode is *not* enabled (the default),
|
||
/// then `.` will match any character except for the configured line
|
||
/// terminator.
|
||
/// * When multi-line mode is enabled (not the default), then `^` and
|
||
/// `$` will match immediately after and before, respectively, a line
|
||
/// terminator.
|
||
///
|
||
/// In both cases, if CRLF mode is enabled in a particular context,
|
||
/// then it takes precedence over any configured line terminator.
|
||
///
|
||
/// This option cannot be configured from within the pattern.
|
||
///
|
||
/// The default line terminator is `\n`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// This shows how to treat the NUL byte as a line terminator. This can
|
||
/// be a useful heuristic when searching binary data.
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexBuilder;
|
||
///
|
||
/// let re = RegexBuilder::new(r"^foo$")
|
||
/// .multi_line(true)
|
||
/// .line_terminator(b'\x00')
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// let hay = b"\x00foo\x00";
|
||
/// assert_eq!(Some(1..4), re.find(hay).map(|m| m.range()));
|
||
/// ```
|
||
///
|
||
/// This example shows that the behavior of `.` is impacted by this
|
||
/// setting as well:
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexBuilder;
|
||
///
|
||
/// let re = RegexBuilder::new(r".")
|
||
/// .line_terminator(b'\x00')
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// assert!(re.is_match(b"\n"));
|
||
/// assert!(!re.is_match(b"\x00"));
|
||
/// ```
|
||
///
|
||
/// This shows that building a regex will work even when the byte
|
||
/// given is not ASCII. This is unlike the top-level `Regex` API where
|
||
/// matching invalid UTF-8 is not allowed.
|
||
///
|
||
/// Note though that you must disable Unicode mode. This is required
|
||
/// because Unicode mode requires matching one codepoint at a time,
|
||
/// and there is no way to match a non-ASCII byte as if it were a
|
||
/// codepoint.
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexBuilder;
|
||
///
|
||
/// assert!(
|
||
/// RegexBuilder::new(r".")
|
||
/// .unicode(false)
|
||
/// .line_terminator(0x80)
|
||
/// .build()
|
||
/// .is_ok(),
|
||
/// );
|
||
/// ```
|
||
pub fn line_terminator(&mut self, byte: u8) -> &mut RegexBuilder {
    // Forward the byte to the internal `Builder`, then hand back `self` so
    // that further configuration calls can be chained.
    let inner = &mut self.builder;
    inner.line_terminator(byte);
    self
}
|
||
|
||
/// This configures swap-greed mode for the entire pattern.
|
||
///
|
||
/// When swap-greed mode is enabled, patterns like `a+` will become
|
||
/// non-greedy and patterns like `a+?` will become greedy. In other
|
||
/// words, the meanings of `a+` and `a+?` are switched.
|
||
///
|
||
/// This setting can also be configured using the inline flag `U` in
|
||
/// the pattern.
|
||
///
|
||
/// The default for this is `false`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexBuilder;
|
||
///
|
||
/// let re = RegexBuilder::new(r"a+")
|
||
/// .swap_greed(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// assert_eq!(Some(&b"a"[..]), re.find(b"aaa").map(|m| m.as_bytes()));
|
||
/// ```
|
||
pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
    // Forward the flag to the internal `Builder`, then hand back `self` so
    // that further configuration calls can be chained.
    let inner = &mut self.builder;
    inner.swap_greed(yes);
    self
}
|
||
|
||
/// This configures verbose mode for the entire pattern.
|
||
///
|
||
/// When enabled, whitespace will be treated as insignificant in the
|
||
/// pattern and `#` can be used to start a comment until the next new
|
||
/// line.
|
||
///
|
||
/// Normally, in most places in a pattern, whitespace is treated
|
||
/// literally. For example ` +` will match one or more ASCII whitespace
|
||
/// characters.
|
||
///
|
||
/// When verbose mode is enabled, `\#` can be used to match a literal
|
||
/// `#` and `\ ` can be used to match a literal ASCII whitespace
|
||
/// character.
|
||
///
|
||
/// Verbose mode is useful for permitting regexes to be formatted and
|
||
/// broken up more nicely. This may make them more easily readable.
|
||
///
|
||
/// This setting can also be configured using the inline flag `x` in
|
||
/// the pattern.
|
||
///
|
||
/// The default for this is `false`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexBuilder;
|
||
///
|
||
/// let pat = r"
|
||
/// \b
|
||
/// (?<first>\p{Uppercase}\w*) # always start with uppercase letter
|
||
/// [\s--\n]+ # whitespace should separate names
|
||
/// (?: # middle name can be an initial!
|
||
/// (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
|
||
/// [\s--\n]+
|
||
/// )?
|
||
/// (?<last>\p{Uppercase}\w*)
|
||
/// \b
|
||
/// ";
|
||
/// let re = RegexBuilder::new(pat)
|
||
/// .ignore_whitespace(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
///
|
||
/// let caps = re.captures(b"Harry Potter").unwrap();
|
||
/// assert_eq!(&b"Harry"[..], &caps["first"]);
|
||
/// assert_eq!(&b"Potter"[..], &caps["last"]);
|
||
///
|
||
/// let caps = re.captures(b"Harry J. Potter").unwrap();
|
||
/// assert_eq!(&b"Harry"[..], &caps["first"]);
|
||
/// // Since a middle name/initial isn't required for an overall match,
|
||
/// // we can't assume that 'initial' or 'middle' will be populated!
|
||
/// assert_eq!(
|
||
/// Some(&b"J"[..]),
|
||
/// caps.name("initial").map(|m| m.as_bytes()),
|
||
/// );
|
||
/// assert_eq!(None, caps.name("middle").map(|m| m.as_bytes()));
|
||
/// assert_eq!(&b"Potter"[..], &caps["last"]);
|
||
///
|
||
/// let caps = re.captures(b"Harry James Potter").unwrap();
|
||
/// assert_eq!(&b"Harry"[..], &caps["first"]);
|
||
/// // Since a middle name/initial isn't required for an overall match,
|
||
/// // we can't assume that 'initial' or 'middle' will be populated!
|
||
/// assert_eq!(None, caps.name("initial").map(|m| m.as_bytes()));
|
||
/// assert_eq!(
|
||
/// Some(&b"James"[..]),
|
||
/// caps.name("middle").map(|m| m.as_bytes()),
|
||
/// );
|
||
/// assert_eq!(&b"Potter"[..], &caps["last"]);
|
||
/// ```
|
||
pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
    // Forward the flag to the internal `Builder`, then hand back `self` so
    // that further configuration calls can be chained.
    let inner = &mut self.builder;
    inner.ignore_whitespace(yes);
    self
}
|
||
|
||
/// This configures octal mode for the entire pattern.
|
||
///
|
||
/// Octal syntax is a little-known way of uttering Unicode codepoints
|
||
/// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
|
||
/// equivalent patterns, where the last example shows octal syntax.
|
||
///
|
||
/// While supporting octal syntax isn't in and of itself a problem,
|
||
/// it does make good error messages harder. That is, in PCRE based
|
||
/// regex engines, syntax like `\1` invokes a backreference, which is
|
||
/// explicitly unsupported by this library. However, many users expect
|
||
/// backreferences to be supported. Therefore, when octal support
|
||
/// is disabled, the error message will explicitly mention that
|
||
/// backreferences aren't supported.
|
||
///
|
||
/// The default for this is `false`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexBuilder;
|
||
///
|
||
/// // Normally this pattern would not compile, with an error message
|
||
/// // about backreferences not being supported. But with octal mode
|
||
/// // enabled, octal escape sequences work.
|
||
/// let re = RegexBuilder::new(r"\141")
|
||
/// .octal(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// assert!(re.is_match(b"a"));
|
||
/// ```
|
||
pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
    // Forward the flag to the internal `Builder`, then hand back `self` so
    // that further configuration calls can be chained.
    let inner = &mut self.builder;
    inner.octal(yes);
    self
}
|
||
|
||
/// Sets the approximate size limit, in bytes, of the compiled regex.
|
||
///
|
||
/// This roughly corresponds to the amount of heap memory, in
|
||
/// bytes, occupied by a single regex. If the regex would otherwise
|
||
/// approximately exceed this limit, then compiling that regex will
|
||
/// fail.
|
||
///
|
||
/// The main utility of a method like this is to avoid compiling
|
||
/// regexes that use an unexpected amount of resources, such as
|
||
/// time and memory. Even if the memory usage of a large regex is
|
||
/// acceptable, its search time may not be. Namely, worst case time
|
||
/// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
|
||
/// `n ~ len(haystack)`. That is, search time depends, in part, on the
|
||
/// size of the compiled regex. This means that putting a limit on the
|
||
/// size of the regex limits how much a regex can impact search time.
|
||
///
|
||
/// For more information about regex size limits, see the section on
|
||
/// [untrusted inputs](crate#untrusted-input) in the top-level crate
|
||
/// documentation.
|
||
///
|
||
/// The default for this is some reasonable number that permits most
|
||
/// patterns to compile successfully.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexBuilder;
|
||
///
|
||
/// // It may surprise you how big some seemingly small patterns can
|
||
/// // be! Since \w is Unicode aware, this generates a regex that can
|
||
/// // match approximately 140,000 distinct codepoints.
|
||
/// assert!(RegexBuilder::new(r"\w").size_limit(45_000).build().is_err());
|
||
/// ```
|
||
pub fn size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
    // Forward the limit to the internal `Builder`, then hand back `self` so
    // that further configuration calls can be chained.
    let inner = &mut self.builder;
    inner.size_limit(bytes);
    self
}
|
||
|
||
/// Set the approximate capacity, in bytes, of the cache of transitions
|
||
/// used by the lazy DFA.
|
||
///
|
||
/// While the lazy DFA isn't always used, it tends to be the most
/// commonly used regex engine in default configurations. It tends to
/// adopt the performance profile of a fully built DFA, but without the
|
||
/// downside of taking worst case exponential time to build.
|
||
///
|
||
/// The downside is that it needs to keep a cache of transitions and
|
||
/// states that are built while running a search, and this cache
|
||
/// can fill up. When it fills up, the cache will reset itself. Any
|
||
/// previously generated states and transitions will then need to be
|
||
/// re-generated. If this happens too many times, then this library
|
||
/// will bail out of using the lazy DFA and switch to a different regex
|
||
/// engine.
|
||
///
|
||
/// If your regex provokes this particular downside of the lazy DFA,
|
||
/// then it may be beneficial to increase its cache capacity. This will
|
||
/// potentially reduce the frequency of cache resetting (ideally to
|
||
/// `0`). While it won't fix all potential performance problems with
|
||
/// the lazy DFA, increasing the cache capacity does fix some.
|
||
///
|
||
/// There is no easy way to determine, a priori, whether increasing
|
||
/// this cache capacity will help. In general, the larger your regex,
|
||
/// the more cache it's likely to use. But that isn't an ironclad rule.
|
||
/// For example, a regex like `[01]*1[01]{N}` would normally produce a
|
||
/// fully built DFA that is exponential in size with respect to `N`.
/// The lazy DFA will prevent exponential space blow-up, but its cache
|
||
/// is likely to fill up, even when it's large and even for smallish
|
||
/// values of `N`.
|
||
///
|
||
/// If you aren't sure whether this helps or not, it is sensible to
|
||
/// set this to some arbitrarily large number in testing, such as
|
||
/// `usize::MAX`. Namely, this represents the amount of capacity that
|
||
/// *may* be used. It's probably not a good idea to use `usize::MAX` in
|
||
/// production though, since it implies there are no controls on heap
|
||
/// memory used by this library during a search. In effect, set it to
|
||
/// whatever you're willing to allocate for a single regex search.
|
||
pub fn dfa_size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
    // Forward the capacity to the internal `Builder`, then hand back `self`
    // so that further configuration calls can be chained.
    let inner = &mut self.builder;
    inner.dfa_size_limit(bytes);
    self
}
|
||
|
||
/// Set the nesting limit for this parser.
|
||
///
|
||
/// The nesting limit controls how deep the abstract syntax tree is
|
||
/// allowed to be. If the AST exceeds the given limit (e.g., with too
|
||
/// many nested groups), then an error is returned by the parser.
|
||
///
|
||
/// The purpose of this limit is to act as a heuristic to prevent stack
|
||
/// overflow for consumers that do structural induction on an AST using
|
||
/// explicit recursion. While this crate never does this (instead using
|
||
/// constant stack space and moving the call stack to the heap), other
|
||
/// crates may.
|
||
///
|
||
/// This limit is not checked until the entire AST is parsed.
|
||
/// Therefore, if callers want to put a limit on the amount of heap
|
||
/// space used, then they should impose a limit on the length, in
|
||
/// bytes, of the concrete pattern string. In particular, this is
|
||
/// viable since this parser implementation will limit itself to heap
|
||
/// space proportional to the length of the pattern string. See also
|
||
/// the [untrusted inputs](crate#untrusted-input) section in the
|
||
/// top-level crate documentation for more information about this.
|
||
///
|
||
/// Note that a nest limit of `0` will return a nest limit error for
|
||
/// most patterns but not all. For example, a nest limit of `0` permits
|
||
/// `a` but not `ab`, since `ab` requires an explicit concatenation,
|
||
/// which results in a nest depth of `1`. In general, a nest limit is
|
||
/// not something that manifests in an obvious way in the concrete
|
||
/// syntax, therefore, it should not be used in a granular way.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexBuilder;
|
||
///
|
||
/// assert!(RegexBuilder::new(r"a").nest_limit(0).build().is_ok());
|
||
/// assert!(RegexBuilder::new(r"ab").nest_limit(0).build().is_err());
|
||
/// ```
|
||
pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
    // Forward the limit to the internal `Builder`, then hand back `self` so
    // that further configuration calls can be chained.
    let inner = &mut self.builder;
    inner.nest_limit(limit);
    self
}
|
||
}
|
||
|
||
/// A configurable builder for a [`RegexSet`].
|
||
///
|
||
/// This builder can be used to programmatically set flags such as `i`
|
||
/// (case insensitive) and `x` (for verbose mode). This builder can also be
|
||
/// used to configure things like the line terminator and a size limit on
|
||
/// the compiled regular expression.
|
||
#[derive(Clone, Debug)]
|
||
pub struct RegexSetBuilder {
|
||
builder: Builder,
|
||
}
|
||
|
||
impl RegexSetBuilder {
|
||
/// Create a new builder with a default configuration for the given
|
||
/// patterns.
|
||
///
|
||
/// If the patterns are invalid or exceed the configured size limits,
|
||
/// then an error will be returned when [`RegexSetBuilder::build`] is
|
||
/// called.
|
||
pub fn new<I, S>(patterns: I) -> RegexSetBuilder
|
||
where
|
||
I: IntoIterator<Item = S>,
|
||
S: AsRef<str>,
|
||
{
|
||
RegexSetBuilder { builder: Builder::new(patterns) }
|
||
}
|
||
|
||
/// Compiles the patterns given to `RegexSetBuilder::new` with the
|
||
/// configuration set on this builder.
|
||
///
|
||
/// If the patterns aren't valid regexes or if a configured size limit
|
||
/// was exceeded, then an error is returned.
|
||
pub fn build(&self) -> Result<RegexSet, Error> {
    // This is the `bytes` flavor of the set builder, so delegate to the
    // multi-pattern, byte-oriented constructor on the internal `Builder`.
    let inner = &self.builder;
    inner.build_many_bytes()
}
|
||
|
||
/// This configures Unicode mode for all of the patterns.
|
||
///
|
||
/// Enabling Unicode mode does a number of things:
|
||
///
|
||
/// * Most fundamentally, it causes the fundamental atom of matching
|
||
/// to be a single codepoint. When Unicode mode is disabled, it's a
|
||
/// single byte. For example, when Unicode mode is enabled, `.` will
|
||
/// match `💩` once, whereas it will match 4 times when Unicode mode
|
||
/// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
|
||
/// * Case insensitive matching uses Unicode simple case folding rules.
|
||
/// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
|
||
/// available.
|
||
/// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
|
||
/// `\d`.
|
||
/// * The word boundary assertions, `\b` and `\B`, use the Unicode
|
||
/// definition of a word character.
|
||
///
|
||
/// Note that unlike the top-level `RegexSet` for searching `&str`,
|
||
/// it is permitted to disable Unicode mode even if the resulting
|
||
/// pattern could match invalid UTF-8. For example, `(?-u:.)` is not
|
||
/// a valid pattern for a top-level `RegexSet`, but is valid for a
|
||
/// `bytes::RegexSet`.
|
||
///
|
||
/// For more details on the Unicode support in this crate, see the
|
||
/// [Unicode section](crate#unicode) in this crate's top-level
|
||
/// documentation.
|
||
///
|
||
/// The default for this is `true`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexSetBuilder;
|
||
///
|
||
/// let re = RegexSetBuilder::new([r"\w"])
|
||
/// .unicode(false)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// // Normally greek letters would be included in \w, but since
|
||
/// // Unicode mode is disabled, it only matches ASCII letters.
|
||
/// assert!(!re.is_match("δ".as_bytes()));
|
||
///
|
||
/// let re = RegexSetBuilder::new([r"s"])
|
||
/// .case_insensitive(true)
|
||
/// .unicode(false)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// // Normally 'ſ' is included when searching for 's' case
|
||
/// // insensitively due to Unicode's simple case folding rules. But
|
||
/// // when Unicode mode is disabled, only ASCII case insensitive rules
|
||
/// // are used.
|
||
/// assert!(!re.is_match("ſ".as_bytes()));
|
||
/// ```
|
||
///
|
||
/// Since this builder is for constructing a
|
||
/// [`bytes::RegexSet`](RegexSet), one can disable Unicode mode even if
|
||
/// it would match invalid UTF-8:
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexSetBuilder;
|
||
///
|
||
/// let re = RegexSetBuilder::new([r"."])
|
||
/// .unicode(false)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// // With Unicode mode disabled, `.` matches any single byte other than
/// // the line terminator, including bytes that are not valid UTF-8.
|
||
/// assert!(re.is_match(b"\xFF"));
|
||
/// ```
|
||
pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
    // Forward the flag to the internal `Builder`, then hand back `self` so
    // that further configuration calls can be chained.
    let inner = &mut self.builder;
    inner.unicode(yes);
    self
}
|
||
|
||
/// This configures whether to enable case insensitive matching for all
|
||
/// of the patterns.
|
||
///
|
||
/// This setting can also be configured using the inline flag `i`
|
||
/// in the pattern. For example, `(?i:foo)` matches `foo` case
|
||
/// insensitively while `(?-i:foo)` matches `foo` case sensitively.
|
||
///
|
||
/// The default for this is `false`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexSetBuilder;
|
||
///
|
||
/// let re = RegexSetBuilder::new([r"foo(?-i:bar)quux"])
|
||
/// .case_insensitive(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// assert!(re.is_match(b"FoObarQuUx"));
|
||
/// // Even though case insensitive matching is enabled in the builder,
|
||
/// // it can be locally disabled within the pattern. In this case,
|
||
/// // `bar` is matched case sensitively.
|
||
/// assert!(!re.is_match(b"fooBARquux"));
|
||
/// ```
|
||
pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder {
    // Forward the flag to the internal `Builder`, then hand back `self` so
    // that further configuration calls can be chained.
    let inner = &mut self.builder;
    inner.case_insensitive(yes);
    self
}
|
||
|
||
/// This configures multi-line mode for all of the patterns.
|
||
///
|
||
/// Enabling multi-line mode changes the behavior of the `^` and `$`
|
||
/// anchor assertions. Instead of only matching at the beginning and
|
||
/// end of a haystack, respectively, multi-line mode causes them to
|
||
/// match at the beginning and end of a line *in addition* to the
|
||
/// beginning and end of a haystack. More precisely, `^` will match at
|
||
/// the position immediately following a `\n` and `$` will match at the
|
||
/// position immediately preceding a `\n`.
|
||
///
|
||
/// The behavior of this option can be impacted by other settings too:
|
||
///
|
||
/// * The [`RegexSetBuilder::line_terminator`] option changes `\n`
|
||
/// above to any ASCII byte.
|
||
/// * The [`RegexSetBuilder::crlf`] option changes the line terminator
|
||
/// to be either `\r` or `\n`, but never at the position between a `\r`
|
||
/// and `\n`.
|
||
///
|
||
/// This setting can also be configured using the inline flag `m` in
|
||
/// the pattern.
|
||
///
|
||
/// The default for this is `false`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexSetBuilder;
|
||
///
|
||
/// let re = RegexSetBuilder::new([r"^foo$"])
|
||
/// .multi_line(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// assert!(re.is_match(b"\nfoo\n"));
|
||
/// ```
|
||
pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
    // Forward the flag to the internal `Builder`, then hand back `self` so
    // that further configuration calls can be chained.
    let inner = &mut self.builder;
    inner.multi_line(yes);
    self
}
|
||
|
||
/// This configures dot-matches-new-line mode for all of the patterns.
|
||
///
|
||
/// Perhaps surprisingly, the default behavior for `.` is not to match
|
||
/// any character, but rather, to match any character except for the
|
||
/// line terminator (which is `\n` by default). When this mode is
|
||
/// enabled, the behavior changes such that `.` truly matches any
|
||
/// character.
|
||
///
|
||
/// This setting can also be configured using the inline flag `s` in
|
||
/// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
|
||
/// regexes.
|
||
///
|
||
/// The default for this is `false`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexSetBuilder;
|
||
///
|
||
/// let re = RegexSetBuilder::new([r"foo.bar"])
|
||
/// .dot_matches_new_line(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// let hay = b"foo\nbar";
|
||
/// assert!(re.is_match(hay));
|
||
/// ```
|
||
pub fn dot_matches_new_line(
|
||
&mut self,
|
||
yes: bool,
|
||
) -> &mut RegexSetBuilder {
|
||
self.builder.dot_matches_new_line(yes);
|
||
self
|
||
}
|
||
|
||
/// This configures CRLF mode for all of the patterns.
|
||
///
|
||
/// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
|
||
/// short) and `\n` ("line feed" or LF for short) are treated as line
|
||
/// terminators. This results in the following:
|
||
///
|
||
/// * Unless dot-matches-new-line mode is enabled, `.` will now match
|
||
/// any character except for `\n` and `\r`.
|
||
/// * When multi-line mode is enabled, `^` will match immediately
|
||
/// following a `\n` or a `\r`. Similarly, `$` will match immediately
|
||
/// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
|
||
/// between `\r` and `\n`.
|
||
///
|
||
/// This setting can also be configured using the inline flag `R` in
|
||
/// the pattern.
|
||
///
|
||
/// The default for this is `false`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexSetBuilder;
|
||
///
|
||
/// let re = RegexSetBuilder::new([r"^foo$"])
|
||
/// .multi_line(true)
|
||
/// .crlf(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// let hay = b"\r\nfoo\r\n";
|
||
/// // If CRLF mode weren't enabled here, then '$' wouldn't match
|
||
/// // immediately after 'foo', and thus no match would be found.
|
||
/// assert!(re.is_match(hay));
|
||
/// ```
|
||
///
|
||
/// This example demonstrates that `^` will never match at a position
|
||
/// between `\r` and `\n`. (`$` will similarly not match between a `\r`
|
||
/// and a `\n`.)
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexSetBuilder;
|
||
///
|
||
/// let re = RegexSetBuilder::new([r"^\n"])
|
||
/// .multi_line(true)
|
||
/// .crlf(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// assert!(!re.is_match(b"\r\n"));
|
||
/// ```
|
||
pub fn crlf(&mut self, yes: bool) -> &mut RegexSetBuilder {
    // Forward the flag to the internal `Builder`, then hand back `self` so
    // that further configuration calls can be chained.
    let inner = &mut self.builder;
    inner.crlf(yes);
    self
}
|
||
|
||
/// Configures the line terminator to be used by the regex.
|
||
///
|
||
/// The line terminator is relevant in two ways for a particular regex:
|
||
///
|
||
/// * When dot-matches-new-line mode is *not* enabled (the default),
|
||
/// then `.` will match any character except for the configured line
|
||
/// terminator.
|
||
/// * When multi-line mode is enabled (not the default), then `^` and
|
||
/// `$` will match immediately after and before, respectively, a line
|
||
/// terminator.
|
||
///
|
||
/// In both cases, if CRLF mode is enabled in a particular context,
|
||
/// then it takes precedence over any configured line terminator.
|
||
///
|
||
/// This option cannot be configured from within the pattern.
|
||
///
|
||
/// The default line terminator is `\n`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// This shows how to treat the NUL byte as a line terminator. This can
|
||
/// be a useful heuristic when searching binary data.
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexSetBuilder;
|
||
///
|
||
/// let re = RegexSetBuilder::new([r"^foo$"])
|
||
/// .multi_line(true)
|
||
/// .line_terminator(b'\x00')
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// let hay = b"\x00foo\x00";
|
||
/// assert!(re.is_match(hay));
|
||
/// ```
|
||
///
|
||
/// This example shows that the behavior of `.` is impacted by this
|
||
/// setting as well:
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexSetBuilder;
|
||
///
|
||
/// let re = RegexSetBuilder::new([r"."])
|
||
/// .line_terminator(b'\x00')
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// assert!(re.is_match(b"\n"));
|
||
/// assert!(!re.is_match(b"\x00"));
|
||
/// ```
|
||
///
|
||
/// This shows that building a regex will work even when the byte given
|
||
/// is not ASCII. This is unlike the top-level `RegexSet` API where
|
||
/// matching invalid UTF-8 is not allowed.
|
||
///
|
||
/// Note though that you must disable Unicode mode. This is required
|
||
/// because Unicode mode requires matching one codepoint at a time,
|
||
/// and there is no way to match a non-ASCII byte as if it were a
|
||
/// codepoint.
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexSetBuilder;
|
||
///
|
||
/// assert!(
|
||
/// RegexSetBuilder::new([r"."])
|
||
/// .unicode(false)
|
||
/// .line_terminator(0x80)
|
||
/// .build()
|
||
/// .is_ok(),
|
||
/// );
|
||
/// ```
|
||
pub fn line_terminator(&mut self, byte: u8) -> &mut RegexSetBuilder {
    // Forward the byte to the internal `Builder`, then hand back `self` so
    // that further configuration calls can be chained.
    let inner = &mut self.builder;
    inner.line_terminator(byte);
    self
}
|
||
|
||
/// This configures swap-greed mode for all of the patterns.
|
||
///
|
||
/// When swap-greed mode is enabled, patterns like `a+` will become
|
||
/// non-greedy and patterns like `a+?` will become greedy. In other
|
||
/// words, the meanings of `a+` and `a+?` are switched.
|
||
///
|
||
/// This setting can also be configured using the inline flag `U` in
|
||
/// the pattern.
|
||
///
|
||
/// Note that this is generally not useful for a `RegexSet` since a
|
||
/// `RegexSet` can only report whether a pattern matches or not. Since
|
||
/// greediness never impacts whether a match is found or not (only the
|
||
/// offsets of the match), it follows that whether parts of a pattern
|
||
/// are greedy or not doesn't matter for a `RegexSet`.
|
||
///
|
||
/// The default for this is `false`.
|
||
pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder {
    // Forward the flag to the internal `Builder`, then hand back `self` so
    // that further configuration calls can be chained.
    let inner = &mut self.builder;
    inner.swap_greed(yes);
    self
}
|
||
|
||
/// This configures verbose mode for all of the patterns.
|
||
///
|
||
/// When enabled, whitespace will be treated as insignificant in the
|
||
/// pattern and `#` can be used to start a comment until the next new
|
||
/// line.
|
||
///
|
||
/// Normally, in most places in a pattern, whitespace is treated
|
||
/// literally. For example ` +` will match one or more ASCII whitespace
|
||
/// characters.
|
||
///
|
||
/// When verbose mode is enabled, `\#` can be used to match a literal
|
||
/// `#` and `\ ` can be used to match a literal ASCII whitespace
|
||
/// character.
|
||
///
|
||
/// Verbose mode is useful for permitting regexes to be formatted and
|
||
/// broken up more nicely. This may make them more easily readable.
|
||
///
|
||
/// This setting can also be configured using the inline flag `x` in
|
||
/// the pattern.
|
||
///
|
||
/// The default for this is `false`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexSetBuilder;
|
||
///
|
||
/// let pat = r"
|
||
/// \b
|
||
/// (?<first>\p{Uppercase}\w*) # always start with uppercase letter
|
||
/// [\s--\n]+ # whitespace should separate names
|
||
/// (?: # middle name can be an initial!
|
||
/// (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
|
||
/// [\s--\n]+
|
||
/// )?
|
||
/// (?<last>\p{Uppercase}\w*)
|
||
/// \b
|
||
/// ";
|
||
/// let re = RegexSetBuilder::new([pat])
|
||
/// .ignore_whitespace(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// assert!(re.is_match(b"Harry Potter"));
|
||
/// assert!(re.is_match(b"Harry J. Potter"));
|
||
/// assert!(re.is_match(b"Harry James Potter"));
|
||
/// assert!(!re.is_match(b"harry J. Potter"));
|
||
/// ```
|
||
pub fn ignore_whitespace(
|
||
&mut self,
|
||
yes: bool,
|
||
) -> &mut RegexSetBuilder {
|
||
self.builder.ignore_whitespace(yes);
|
||
self
|
||
}
|
||
|
||
/// This configures octal mode for all of the patterns.
|
||
///
|
||
/// Octal syntax is a little-known way of uttering Unicode codepoints
|
||
/// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
|
||
/// equivalent patterns, where the last example shows octal syntax.
|
||
///
|
||
/// While supporting octal syntax isn't in and of itself a problem,
|
||
/// it does make good error messages harder. That is, in PCRE based
|
||
/// regex engines, syntax like `\1` invokes a backreference, which is
|
||
/// explicitly unsupported by this library. However, many users expect
|
||
/// backreferences to be supported. Therefore, when octal support
|
||
/// is disabled, the error message will explicitly mention that
|
||
/// backreferences aren't supported.
|
||
///
|
||
/// The default for this is `false`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexSetBuilder;
|
||
///
|
||
/// // Normally this pattern would not compile, with an error message
|
||
/// // about backreferences not being supported. But with octal mode
|
||
/// // enabled, octal escape sequences work.
|
||
/// let re = RegexSetBuilder::new([r"\141"])
|
||
/// .octal(true)
|
||
/// .build()
|
||
/// .unwrap();
|
||
/// assert!(re.is_match(b"a"));
|
||
/// ```
|
||
pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
    // Forward the flag to the internal `Builder`, then hand back `self` so
    // that further configuration calls can be chained.
    let inner = &mut self.builder;
    inner.octal(yes);
    self
}
|
||
|
||
/// Sets the approximate size limit, in bytes, of the compiled regex.
|
||
///
|
||
/// This roughly corresponds to the amount of heap memory, in
|
||
/// bytes, occupied by a single regex. If the regex would otherwise
|
||
/// approximately exceed this limit, then compiling that regex will
|
||
/// fail.
|
||
///
|
||
/// The main utility of a method like this is to avoid compiling
|
||
/// regexes that use an unexpected amount of resources, such as
|
||
/// time and memory. Even if the memory usage of a large regex is
|
||
/// acceptable, its search time may not be. Namely, worst case time
|
||
/// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
|
||
/// `n ~ len(haystack)`. That is, search time depends, in part, on the
|
||
/// size of the compiled regex. This means that putting a limit on the
|
||
/// size of the regex limits how much a regex can impact search time.
|
||
///
|
||
/// For more information about regex size limits, see the section on
|
||
/// [untrusted inputs](crate#untrusted-input) in the top-level crate
|
||
/// documentation.
|
||
///
|
||
/// The default for this is some reasonable number that permits most
|
||
/// patterns to compile successfully.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexSetBuilder;
|
||
///
|
||
/// // It may surprise you how big some seemingly small patterns can
|
||
/// // be! Since \w is Unicode aware, this generates a regex that can
|
||
/// // match approximately 140,000 distinct codepoints.
|
||
/// assert!(
|
||
/// RegexSetBuilder::new([r"\w"])
|
||
/// .size_limit(45_000)
|
||
/// .build()
|
||
/// .is_err()
|
||
/// );
|
||
/// ```
|
||
pub fn size_limit(&mut self, bytes: usize) -> &mut RegexSetBuilder {
    // Forward the limit to the internal `Builder`, then hand back `self` so
    // that further configuration calls can be chained.
    let inner = &mut self.builder;
    inner.size_limit(bytes);
    self
}
|
||
|
||
/// Set the approximate capacity, in bytes, of the cache of transitions
|
||
/// used by the lazy DFA.
|
||
///
|
||
/// While the lazy DFA isn't always used, it tends to be the most
/// commonly used regex engine in default configurations. It tends to
/// adopt the performance profile of a fully built DFA, but without the
|
||
/// downside of taking worst case exponential time to build.
|
||
///
|
||
/// The downside is that it needs to keep a cache of transitions and
|
||
/// states that are built while running a search, and this cache
|
||
/// can fill up. When it fills up, the cache will reset itself. Any
|
||
/// previously generated states and transitions will then need to be
|
||
/// re-generated. If this happens too many times, then this library
|
||
/// will bail out of using the lazy DFA and switch to a different regex
|
||
/// engine.
|
||
///
|
||
/// If your regex provokes this particular downside of the lazy DFA,
|
||
/// then it may be beneficial to increase its cache capacity. This will
|
||
/// potentially reduce the frequency of cache resetting (ideally to
|
||
/// `0`). While it won't fix all potential performance problems with
|
||
/// the lazy DFA, increasing the cache capacity does fix some.
|
||
///
|
||
/// There is no easy way to determine, a priori, whether increasing
|
||
/// this cache capacity will help. In general, the larger your regex,
|
||
/// the more cache it's likely to use. But that isn't an ironclad rule.
|
||
/// For example, a regex like `[01]*1[01]{N}` would normally produce a
|
||
/// fully build DFA that is exponential in size with respect to `N`.
|
||
/// The lazy DFA will prevent exponential space blow-up, but it cache
|
||
/// is likely to fill up, even when it's large and even for smallish
|
||
/// values of `N`.
|
||
///
|
||
/// If you aren't sure whether this helps or not, it is sensible to
|
||
/// set this to some arbitrarily large number in testing, such as
|
||
/// `usize::MAX`. Namely, this represents the amount of capacity that
|
||
/// *may* be used. It's probably not a good idea to use `usize::MAX` in
|
||
/// production though, since it implies there are no controls on heap
|
||
/// memory used by this library during a search. In effect, set it to
|
||
/// whatever you're willing to allocate for a single regex search.
|
||
pub fn dfa_size_limit(
|
||
&mut self,
|
||
bytes: usize,
|
||
) -> &mut RegexSetBuilder {
|
||
self.builder.dfa_size_limit(bytes);
|
||
self
|
||
}
|
||
|
||
/// Set the nesting limit for this parser.
|
||
///
|
||
/// The nesting limit controls how deep the abstract syntax tree is
|
||
/// allowed to be. If the AST exceeds the given limit (e.g., with too
|
||
/// many nested groups), then an error is returned by the parser.
|
||
///
|
||
/// The purpose of this limit is to act as a heuristic to prevent stack
|
||
/// overflow for consumers that do structural induction on an AST using
|
||
/// explicit recursion. While this crate never does this (instead using
|
||
/// constant stack space and moving the call stack to the heap), other
|
||
/// crates may.
|
||
///
|
||
/// This limit is not checked until the entire AST is parsed.
|
||
/// Therefore, if callers want to put a limit on the amount of heap
|
||
/// space used, then they should impose a limit on the length, in
|
||
/// bytes, of the concrete pattern string. In particular, this is
|
||
/// viable since this parser implementation will limit itself to heap
|
||
/// space proportional to the length of the pattern string. See also
|
||
/// the [untrusted inputs](crate#untrusted-input) section in the
|
||
/// top-level crate documentation for more information about this.
|
||
///
|
||
/// Note that a nest limit of `0` will return a nest limit error for
|
||
/// most patterns but not all. For example, a nest limit of `0` permits
|
||
/// `a` but not `ab`, since `ab` requires an explicit concatenation,
|
||
/// which results in a nest depth of `1`. In general, a nest limit is
|
||
/// not something that manifests in an obvious way in the concrete
|
||
/// syntax; therefore, it should not be used in a granular way.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use regex::bytes::RegexSetBuilder;
|
||
///
|
||
/// assert!(RegexSetBuilder::new([r"a"]).nest_limit(0).build().is_ok());
|
||
/// assert!(RegexSetBuilder::new([r"ab"]).nest_limit(0).build().is_err());
|
||
/// ```
|
||
pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder {
|
||
// Delegate to the wrapped `builder`, then return `self` so calls can be chained.
self.builder.nest_limit(limit);
|
||
self
|
||
}
|
||
}
|
||
}
|