mirror of https://github.com/prometheus/prometheus
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1086 lines
32 KiB
1086 lines
32 KiB
// Copyright 2020 The Prometheus Authors |
|
// Licensed under the Apache License, Version 2.0 (the "License"); |
|
// you may not use this file except in compliance with the License. |
|
// You may obtain a copy of the License at |
|
// |
|
// http://www.apache.org/licenses/LICENSE-2.0 |
|
// |
|
// Unless required by applicable law or agreed to in writing, software |
|
// distributed under the License is distributed on an "AS IS" BASIS, |
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
// See the License for the specific language governing permissions and |
|
// limitations under the License. |
|
|
|
package labels |
|
|
|
import ( |
|
"slices" |
|
"strings" |
|
"unicode" |
|
"unicode/utf8" |
|
|
|
"github.com/grafana/regexp" |
|
"github.com/grafana/regexp/syntax" |
|
"golang.org/x/text/unicode/norm" |
|
) |
|
|
|
const ( |
|
maxSetMatches = 256 |
|
|
|
// The minimum number of alternate values a regex should have to trigger |
|
// the optimization done by optimizeEqualOrPrefixStringMatchers() and so use a map |
|
// to match values instead of iterating over a list. This value has |
|
// been computed running BenchmarkOptimizeEqualStringMatchers. |
|
minEqualMultiStringMatcherMapThreshold = 16 |
|
) |
|
|
|
type FastRegexMatcher struct { |
|
// Under some conditions, re is nil because the expression is never parsed. |
|
// We store the original string to be able to return it in GetRegexString(). |
|
reString string |
|
re *regexp.Regexp |
|
|
|
setMatches []string |
|
stringMatcher StringMatcher |
|
prefix string |
|
suffix string |
|
contains []string |
|
|
|
// matchString is the "compiled" function to run by MatchString(). |
|
matchString func(string) bool |
|
} |
|
|
|
func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) { |
|
m := &FastRegexMatcher{ |
|
reString: v, |
|
} |
|
|
|
m.stringMatcher, m.setMatches = optimizeAlternatingLiterals(v) |
|
if m.stringMatcher != nil { |
|
// If we already have a string matcher, we don't need to parse the regex |
|
// or compile the matchString function. This also avoids the behavior in |
|
// compileMatchStringFunction where it prefers to use setMatches when |
|
// available, even if the string matcher is faster. |
|
m.matchString = m.stringMatcher.Matches |
|
} else { |
|
parsed, err := syntax.Parse(v, syntax.Perl|syntax.DotNL) |
|
if err != nil { |
|
return nil, err |
|
} |
|
// Simplify the syntax tree to run faster. |
|
parsed = parsed.Simplify() |
|
m.re, err = regexp.Compile("^(?s:" + parsed.String() + ")$") |
|
if err != nil { |
|
return nil, err |
|
} |
|
if parsed.Op == syntax.OpConcat { |
|
m.prefix, m.suffix, m.contains = optimizeConcatRegex(parsed) |
|
} |
|
if matches, caseSensitive := findSetMatches(parsed); caseSensitive { |
|
m.setMatches = matches |
|
} |
|
m.stringMatcher = stringMatcherFromRegexp(parsed) |
|
m.matchString = m.compileMatchStringFunction() |
|
} |
|
|
|
return m, nil |
|
} |
|
|
|
// compileMatchStringFunction returns the function to run by MatchString(). |
|
func (m *FastRegexMatcher) compileMatchStringFunction() func(string) bool { |
|
// If the only optimization available is the string matcher, then we can just run it. |
|
if len(m.setMatches) == 0 && m.prefix == "" && m.suffix == "" && len(m.contains) == 0 && m.stringMatcher != nil { |
|
return m.stringMatcher.Matches |
|
} |
|
|
|
return func(s string) bool { |
|
if len(m.setMatches) != 0 { |
|
for _, match := range m.setMatches { |
|
if match == s { |
|
return true |
|
} |
|
} |
|
return false |
|
} |
|
if m.prefix != "" && !strings.HasPrefix(s, m.prefix) { |
|
return false |
|
} |
|
if m.suffix != "" && !strings.HasSuffix(s, m.suffix) { |
|
return false |
|
} |
|
if len(m.contains) > 0 && !containsInOrder(s, m.contains) { |
|
return false |
|
} |
|
if m.stringMatcher != nil { |
|
return m.stringMatcher.Matches(s) |
|
} |
|
return m.re.MatchString(s) |
|
} |
|
} |
|
|
|
// IsOptimized returns true if any fast-path optimization is applied to the |
|
// regex matcher. |
|
func (m *FastRegexMatcher) IsOptimized() bool { |
|
return len(m.setMatches) > 0 || m.stringMatcher != nil || m.prefix != "" || m.suffix != "" || len(m.contains) > 0 |
|
} |
|
|
|
// findSetMatches extract equality matches from a regexp. |
|
// Returns nil if we can't replace the regexp by only equality matchers or the regexp contains |
|
// a mix of case sensitive and case insensitive matchers. |
|
func findSetMatches(re *syntax.Regexp) (matches []string, caseSensitive bool) { |
|
clearBeginEndText(re) |
|
|
|
return findSetMatchesInternal(re, "") |
|
} |
|
|
|
func findSetMatchesInternal(re *syntax.Regexp, base string) (matches []string, caseSensitive bool) { |
|
switch re.Op { |
|
case syntax.OpBeginText: |
|
// Correctly handling the begin text operator inside a regex is tricky, |
|
// so in this case we fallback to the regex engine. |
|
return nil, false |
|
case syntax.OpEndText: |
|
// Correctly handling the end text operator inside a regex is tricky, |
|
// so in this case we fallback to the regex engine. |
|
return nil, false |
|
case syntax.OpLiteral: |
|
return []string{base + string(re.Rune)}, isCaseSensitive(re) |
|
case syntax.OpEmptyMatch: |
|
if base != "" { |
|
return []string{base}, isCaseSensitive(re) |
|
} |
|
case syntax.OpAlternate: |
|
return findSetMatchesFromAlternate(re, base) |
|
case syntax.OpCapture: |
|
clearCapture(re) |
|
return findSetMatchesInternal(re, base) |
|
case syntax.OpConcat: |
|
return findSetMatchesFromConcat(re, base) |
|
case syntax.OpCharClass: |
|
if len(re.Rune)%2 != 0 { |
|
return nil, false |
|
} |
|
var matches []string |
|
var totalSet int |
|
for i := 0; i+1 < len(re.Rune); i += 2 { |
|
totalSet += int(re.Rune[i+1]-re.Rune[i]) + 1 |
|
} |
|
// limits the total characters that can be used to create matches. |
|
// In some case like negation [^0-9] a lot of possibilities exists and that |
|
// can create thousands of possible matches at which points we're better off using regexp. |
|
if totalSet > maxSetMatches { |
|
return nil, false |
|
} |
|
for i := 0; i+1 < len(re.Rune); i += 2 { |
|
lo, hi := re.Rune[i], re.Rune[i+1] |
|
for c := lo; c <= hi; c++ { |
|
matches = append(matches, base+string(c)) |
|
} |
|
} |
|
return matches, isCaseSensitive(re) |
|
default: |
|
return nil, false |
|
} |
|
return nil, false |
|
} |
|
|
|
func findSetMatchesFromConcat(re *syntax.Regexp, base string) (matches []string, matchesCaseSensitive bool) { |
|
if len(re.Sub) == 0 { |
|
return nil, false |
|
} |
|
clearCapture(re.Sub...) |
|
|
|
matches = []string{base} |
|
|
|
for i := 0; i < len(re.Sub); i++ { |
|
var newMatches []string |
|
for j, b := range matches { |
|
m, caseSensitive := findSetMatchesInternal(re.Sub[i], b) |
|
if m == nil { |
|
return nil, false |
|
} |
|
if tooManyMatches(newMatches, m...) { |
|
return nil, false |
|
} |
|
|
|
// All matches must have the same case sensitivity. If it's the first set of matches |
|
// returned, we store its sensitivity as the expected case, and then we'll check all |
|
// other ones. |
|
if i == 0 && j == 0 { |
|
matchesCaseSensitive = caseSensitive |
|
} |
|
if matchesCaseSensitive != caseSensitive { |
|
return nil, false |
|
} |
|
|
|
newMatches = append(newMatches, m...) |
|
} |
|
matches = newMatches |
|
} |
|
|
|
return matches, matchesCaseSensitive |
|
} |
|
|
|
func findSetMatchesFromAlternate(re *syntax.Regexp, base string) (matches []string, matchesCaseSensitive bool) { |
|
for i, sub := range re.Sub { |
|
found, caseSensitive := findSetMatchesInternal(sub, base) |
|
if found == nil { |
|
return nil, false |
|
} |
|
if tooManyMatches(matches, found...) { |
|
return nil, false |
|
} |
|
|
|
// All matches must have the same case sensitivity. If it's the first set of matches |
|
// returned, we store its sensitivity as the expected case, and then we'll check all |
|
// other ones. |
|
if i == 0 { |
|
matchesCaseSensitive = caseSensitive |
|
} |
|
if matchesCaseSensitive != caseSensitive { |
|
return nil, false |
|
} |
|
|
|
matches = append(matches, found...) |
|
} |
|
|
|
return matches, matchesCaseSensitive |
|
} |
|
|
|
// clearCapture removes capture operation as they are not used for matching. |
|
func clearCapture(regs ...*syntax.Regexp) { |
|
for _, r := range regs { |
|
// Iterate on the regexp because capture groups could be nested. |
|
for r.Op == syntax.OpCapture { |
|
*r = *r.Sub[0] |
|
} |
|
} |
|
} |
|
|
|
// clearBeginEndText removes the begin and end text from the regexp. Prometheus regexp are anchored to the beginning and end of the string. |
|
func clearBeginEndText(re *syntax.Regexp) { |
|
// Do not clear begin/end text from an alternate operator because it could |
|
// change the actual regexp properties. |
|
if re.Op == syntax.OpAlternate { |
|
return |
|
} |
|
|
|
if len(re.Sub) == 0 { |
|
return |
|
} |
|
if len(re.Sub) == 1 { |
|
if re.Sub[0].Op == syntax.OpBeginText || re.Sub[0].Op == syntax.OpEndText { |
|
// We need to remove this element. Since it's the only one, we convert into a matcher of an empty string. |
|
// OpEmptyMatch is regexp's nop operator. |
|
re.Op = syntax.OpEmptyMatch |
|
re.Sub = nil |
|
return |
|
} |
|
} |
|
if re.Sub[0].Op == syntax.OpBeginText { |
|
re.Sub = re.Sub[1:] |
|
} |
|
if re.Sub[len(re.Sub)-1].Op == syntax.OpEndText { |
|
re.Sub = re.Sub[:len(re.Sub)-1] |
|
} |
|
} |
|
|
|
// isCaseInsensitive tells if a regexp is case insensitive. |
|
// The flag should be check at each level of the syntax tree. |
|
func isCaseInsensitive(reg *syntax.Regexp) bool { |
|
return (reg.Flags & syntax.FoldCase) != 0 |
|
} |
|
|
|
// isCaseSensitive tells if a regexp is case sensitive. |
|
// The flag should be check at each level of the syntax tree. |
|
func isCaseSensitive(reg *syntax.Regexp) bool { |
|
return !isCaseInsensitive(reg) |
|
} |
|
|
|
// tooManyMatches guards against creating too many set matches. |
|
func tooManyMatches(matches []string, added ...string) bool { |
|
return len(matches)+len(added) > maxSetMatches |
|
} |
|
|
|
func (m *FastRegexMatcher) MatchString(s string) bool { |
|
return m.matchString(s) |
|
} |
|
|
|
func (m *FastRegexMatcher) SetMatches() []string { |
|
// IMPORTANT: always return a copy, otherwise if the caller manipulate this slice it will |
|
// also get manipulated in the cached FastRegexMatcher instance. |
|
return slices.Clone(m.setMatches) |
|
} |
|
|
|
func (m *FastRegexMatcher) GetRegexString() string { |
|
return m.reString |
|
} |
|
|
|
// optimizeAlternatingLiterals optimizes a regex of the form |
|
// |
|
// `literal1|literal2|literal3|...` |
|
// |
|
// this function returns an optimized StringMatcher or nil if the regex |
|
// cannot be optimized in this way, and a list of setMatches up to maxSetMatches. |
|
func optimizeAlternatingLiterals(s string) (StringMatcher, []string) { |
|
if len(s) == 0 { |
|
return emptyStringMatcher{}, nil |
|
} |
|
|
|
estimatedAlternates := strings.Count(s, "|") + 1 |
|
|
|
// If there are no alternates, check if the string is a literal |
|
if estimatedAlternates == 1 { |
|
if regexp.QuoteMeta(s) == s { |
|
return &equalStringMatcher{s: s, caseSensitive: true}, []string{s} |
|
} |
|
return nil, nil |
|
} |
|
|
|
multiMatcher := newEqualMultiStringMatcher(true, estimatedAlternates, 0, 0) |
|
|
|
for end := strings.IndexByte(s, '|'); end > -1; end = strings.IndexByte(s, '|') { |
|
// Split the string into the next literal and the remainder |
|
subMatch := s[:end] |
|
s = s[end+1:] |
|
|
|
// break if any of the submatches are not literals |
|
if regexp.QuoteMeta(subMatch) != subMatch { |
|
return nil, nil |
|
} |
|
|
|
multiMatcher.add(subMatch) |
|
} |
|
|
|
// break if the remainder is not a literal |
|
if regexp.QuoteMeta(s) != s { |
|
return nil, nil |
|
} |
|
multiMatcher.add(s) |
|
|
|
return multiMatcher, multiMatcher.setMatches() |
|
} |
|
|
|
// optimizeConcatRegex returns literal prefix/suffix text that can be safely |
|
// checked against the label value before running the regexp matcher. |
|
func optimizeConcatRegex(r *syntax.Regexp) (prefix, suffix string, contains []string) { |
|
sub := r.Sub |
|
clearCapture(sub...) |
|
|
|
// We can safely remove begin and end text matchers respectively |
|
// at the beginning and end of the regexp. |
|
if len(sub) > 0 && sub[0].Op == syntax.OpBeginText { |
|
sub = sub[1:] |
|
} |
|
if len(sub) > 0 && sub[len(sub)-1].Op == syntax.OpEndText { |
|
sub = sub[:len(sub)-1] |
|
} |
|
|
|
if len(sub) == 0 { |
|
return |
|
} |
|
|
|
// Given Prometheus regex matchers are always anchored to the begin/end |
|
// of the text, if the first/last operations are literals, we can safely |
|
// treat them as prefix/suffix. |
|
if sub[0].Op == syntax.OpLiteral && (sub[0].Flags&syntax.FoldCase) == 0 { |
|
prefix = string(sub[0].Rune) |
|
} |
|
if last := len(sub) - 1; sub[last].Op == syntax.OpLiteral && (sub[last].Flags&syntax.FoldCase) == 0 { |
|
suffix = string(sub[last].Rune) |
|
} |
|
|
|
// If contains any literal which is not a prefix/suffix, we keep track of |
|
// all the ones which are case-sensitive. |
|
for i := 1; i < len(sub)-1; i++ { |
|
if sub[i].Op == syntax.OpLiteral && (sub[i].Flags&syntax.FoldCase) == 0 { |
|
contains = append(contains, string(sub[i].Rune)) |
|
} |
|
} |
|
|
|
return |
|
} |
|
|
|
// StringMatcher is a matcher that matches a string in place of a regular expression. |
|
type StringMatcher interface { |
|
Matches(s string) bool |
|
} |
|
|
|
// stringMatcherFromRegexp attempts to replace a common regexp with a string matcher. |
|
// It returns nil if the regexp is not supported. |
|
func stringMatcherFromRegexp(re *syntax.Regexp) StringMatcher { |
|
clearBeginEndText(re) |
|
|
|
m := stringMatcherFromRegexpInternal(re) |
|
m = optimizeEqualOrPrefixStringMatchers(m, minEqualMultiStringMatcherMapThreshold) |
|
|
|
return m |
|
} |
|
|
|
func stringMatcherFromRegexpInternal(re *syntax.Regexp) StringMatcher { |
|
clearCapture(re) |
|
|
|
switch re.Op { |
|
case syntax.OpBeginText: |
|
// Correctly handling the begin text operator inside a regex is tricky, |
|
// so in this case we fallback to the regex engine. |
|
return nil |
|
case syntax.OpEndText: |
|
// Correctly handling the end text operator inside a regex is tricky, |
|
// so in this case we fallback to the regex engine. |
|
return nil |
|
case syntax.OpPlus: |
|
if re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL { |
|
return nil |
|
} |
|
return &anyNonEmptyStringMatcher{ |
|
matchNL: re.Sub[0].Op == syntax.OpAnyChar, |
|
} |
|
case syntax.OpStar: |
|
if re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL { |
|
return nil |
|
} |
|
|
|
// If the newline is valid, than this matcher literally match any string (even empty). |
|
if re.Sub[0].Op == syntax.OpAnyChar { |
|
return trueMatcher{} |
|
} |
|
|
|
// Any string is fine (including an empty one), as far as it doesn't contain any newline. |
|
return anyStringWithoutNewlineMatcher{} |
|
case syntax.OpQuest: |
|
// Only optimize for ".?". |
|
if len(re.Sub) != 1 || (re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL) { |
|
return nil |
|
} |
|
|
|
return &zeroOrOneCharacterStringMatcher{ |
|
matchNL: re.Sub[0].Op == syntax.OpAnyChar, |
|
} |
|
case syntax.OpEmptyMatch: |
|
return emptyStringMatcher{} |
|
|
|
case syntax.OpLiteral: |
|
return &equalStringMatcher{ |
|
s: string(re.Rune), |
|
caseSensitive: !isCaseInsensitive(re), |
|
} |
|
case syntax.OpAlternate: |
|
or := make([]StringMatcher, 0, len(re.Sub)) |
|
for _, sub := range re.Sub { |
|
m := stringMatcherFromRegexpInternal(sub) |
|
if m == nil { |
|
return nil |
|
} |
|
or = append(or, m) |
|
} |
|
return orStringMatcher(or) |
|
case syntax.OpConcat: |
|
clearCapture(re.Sub...) |
|
|
|
if len(re.Sub) == 0 { |
|
return emptyStringMatcher{} |
|
} |
|
if len(re.Sub) == 1 { |
|
return stringMatcherFromRegexpInternal(re.Sub[0]) |
|
} |
|
|
|
var left, right StringMatcher |
|
|
|
// Let's try to find if there's a first and last any matchers. |
|
if re.Sub[0].Op == syntax.OpPlus || re.Sub[0].Op == syntax.OpStar || re.Sub[0].Op == syntax.OpQuest { |
|
left = stringMatcherFromRegexpInternal(re.Sub[0]) |
|
if left == nil { |
|
return nil |
|
} |
|
re.Sub = re.Sub[1:] |
|
} |
|
if re.Sub[len(re.Sub)-1].Op == syntax.OpPlus || re.Sub[len(re.Sub)-1].Op == syntax.OpStar || re.Sub[len(re.Sub)-1].Op == syntax.OpQuest { |
|
right = stringMatcherFromRegexpInternal(re.Sub[len(re.Sub)-1]) |
|
if right == nil { |
|
return nil |
|
} |
|
re.Sub = re.Sub[:len(re.Sub)-1] |
|
} |
|
|
|
matches, matchesCaseSensitive := findSetMatchesInternal(re, "") |
|
|
|
if len(matches) == 0 && len(re.Sub) == 2 { |
|
// We have not find fixed set matches. We look for other known cases that |
|
// we can optimize. |
|
switch { |
|
// Prefix is literal. |
|
case right == nil && re.Sub[0].Op == syntax.OpLiteral: |
|
right = stringMatcherFromRegexpInternal(re.Sub[1]) |
|
if right != nil { |
|
matches = []string{string(re.Sub[0].Rune)} |
|
matchesCaseSensitive = !isCaseInsensitive(re.Sub[0]) |
|
} |
|
|
|
// Suffix is literal. |
|
case left == nil && re.Sub[1].Op == syntax.OpLiteral: |
|
left = stringMatcherFromRegexpInternal(re.Sub[0]) |
|
if left != nil { |
|
matches = []string{string(re.Sub[1].Rune)} |
|
matchesCaseSensitive = !isCaseInsensitive(re.Sub[1]) |
|
} |
|
} |
|
} |
|
|
|
// Ensure we've found some literals to match (optionally with a left and/or right matcher). |
|
// If not, then this optimization doesn't trigger. |
|
if len(matches) == 0 { |
|
return nil |
|
} |
|
|
|
// Use the right (and best) matcher based on what we've found. |
|
switch { |
|
// No left and right matchers (only fixed set matches). |
|
case left == nil && right == nil: |
|
// if there's no any matchers on both side it's a concat of literals |
|
or := make([]StringMatcher, 0, len(matches)) |
|
for _, match := range matches { |
|
or = append(or, &equalStringMatcher{ |
|
s: match, |
|
caseSensitive: matchesCaseSensitive, |
|
}) |
|
} |
|
return orStringMatcher(or) |
|
|
|
// Right matcher with 1 fixed set match. |
|
case left == nil && len(matches) == 1: |
|
return newLiteralPrefixStringMatcher(matches[0], matchesCaseSensitive, right) |
|
|
|
// Left matcher with 1 fixed set match. |
|
case right == nil && len(matches) == 1: |
|
return &literalSuffixStringMatcher{ |
|
left: left, |
|
suffix: matches[0], |
|
suffixCaseSensitive: matchesCaseSensitive, |
|
} |
|
|
|
// We found literals in the middle. We can trigger the fast path only if |
|
// the matches are case sensitive because containsStringMatcher doesn't |
|
// support case insensitive. |
|
case matchesCaseSensitive: |
|
return &containsStringMatcher{ |
|
substrings: matches, |
|
left: left, |
|
right: right, |
|
} |
|
} |
|
} |
|
return nil |
|
} |
|
|
|
// containsStringMatcher matches a string if it contains any of the substrings. |
|
// If left and right are not nil, it's a contains operation where left and right must match. |
|
// If left is nil, it's a hasPrefix operation and right must match. |
|
// Finally, if right is nil it's a hasSuffix operation and left must match. |
|
type containsStringMatcher struct { |
|
// The matcher that must match the left side. Can be nil. |
|
left StringMatcher |
|
|
|
// At least one of these strings must match in the "middle", between left and right matchers. |
|
substrings []string |
|
|
|
// The matcher that must match the right side. Can be nil. |
|
right StringMatcher |
|
} |
|
|
|
func (m *containsStringMatcher) Matches(s string) bool { |
|
for _, substr := range m.substrings { |
|
switch { |
|
case m.right != nil && m.left != nil: |
|
searchStartPos := 0 |
|
|
|
for { |
|
pos := strings.Index(s[searchStartPos:], substr) |
|
if pos < 0 { |
|
break |
|
} |
|
|
|
// Since we started searching from searchStartPos, we have to add that offset |
|
// to get the actual position of the substring inside the text. |
|
pos += searchStartPos |
|
|
|
// If both the left and right matchers match, then we can stop searching because |
|
// we've found a match. |
|
if m.left.Matches(s[:pos]) && m.right.Matches(s[pos+len(substr):]) { |
|
return true |
|
} |
|
|
|
// Continue searching for another occurrence of the substring inside the text. |
|
searchStartPos = pos + 1 |
|
} |
|
case m.left != nil: |
|
// If we have to check for characters on the left then we need to match a suffix. |
|
if strings.HasSuffix(s, substr) && m.left.Matches(s[:len(s)-len(substr)]) { |
|
return true |
|
} |
|
case m.right != nil: |
|
if strings.HasPrefix(s, substr) && m.right.Matches(s[len(substr):]) { |
|
return true |
|
} |
|
} |
|
} |
|
return false |
|
} |
|
|
|
func newLiteralPrefixStringMatcher(prefix string, prefixCaseSensitive bool, right StringMatcher) StringMatcher { |
|
if prefixCaseSensitive { |
|
return &literalPrefixSensitiveStringMatcher{ |
|
prefix: prefix, |
|
right: right, |
|
} |
|
} |
|
|
|
return &literalPrefixInsensitiveStringMatcher{ |
|
prefix: prefix, |
|
right: right, |
|
} |
|
} |
|
|
|
// literalPrefixSensitiveStringMatcher matches a string with the given literal case-sensitive prefix and right side matcher. |
|
type literalPrefixSensitiveStringMatcher struct { |
|
prefix string |
|
|
|
// The matcher that must match the right side. Can be nil. |
|
right StringMatcher |
|
} |
|
|
|
func (m *literalPrefixSensitiveStringMatcher) Matches(s string) bool { |
|
if !strings.HasPrefix(s, m.prefix) { |
|
return false |
|
} |
|
|
|
// Ensure the right side matches. |
|
return m.right.Matches(s[len(m.prefix):]) |
|
} |
|
|
|
// literalPrefixInsensitiveStringMatcher matches a string with the given literal case-insensitive prefix and right side matcher. |
|
type literalPrefixInsensitiveStringMatcher struct { |
|
prefix string |
|
|
|
// The matcher that must match the right side. Can be nil. |
|
right StringMatcher |
|
} |
|
|
|
func (m *literalPrefixInsensitiveStringMatcher) Matches(s string) bool { |
|
if !hasPrefixCaseInsensitive(s, m.prefix) { |
|
return false |
|
} |
|
|
|
// Ensure the right side matches. |
|
return m.right.Matches(s[len(m.prefix):]) |
|
} |
|
|
|
// literalSuffixStringMatcher matches a string with the given literal suffix and left side matcher. |
|
type literalSuffixStringMatcher struct { |
|
// The matcher that must match the left side. Can be nil. |
|
left StringMatcher |
|
|
|
suffix string |
|
suffixCaseSensitive bool |
|
} |
|
|
|
func (m *literalSuffixStringMatcher) Matches(s string) bool { |
|
// Ensure the suffix matches. |
|
if m.suffixCaseSensitive && !strings.HasSuffix(s, m.suffix) { |
|
return false |
|
} |
|
if !m.suffixCaseSensitive && !hasSuffixCaseInsensitive(s, m.suffix) { |
|
return false |
|
} |
|
|
|
// Ensure the left side matches. |
|
return m.left.Matches(s[:len(s)-len(m.suffix)]) |
|
} |
|
|
|
// emptyStringMatcher matches an empty string. |
|
type emptyStringMatcher struct{} |
|
|
|
func (m emptyStringMatcher) Matches(s string) bool { |
|
return len(s) == 0 |
|
} |
|
|
|
// orStringMatcher matches any of the sub-matchers. |
|
type orStringMatcher []StringMatcher |
|
|
|
func (m orStringMatcher) Matches(s string) bool { |
|
for _, matcher := range m { |
|
if matcher.Matches(s) { |
|
return true |
|
} |
|
} |
|
return false |
|
} |
|
|
|
// equalStringMatcher matches a string exactly and support case insensitive. |
|
type equalStringMatcher struct { |
|
s string |
|
caseSensitive bool |
|
} |
|
|
|
func (m *equalStringMatcher) Matches(s string) bool { |
|
if m.caseSensitive { |
|
return m.s == s |
|
} |
|
return strings.EqualFold(m.s, s) |
|
} |
|
|
|
type multiStringMatcherBuilder interface { |
|
StringMatcher |
|
add(s string) |
|
addPrefix(prefix string, prefixCaseSensitive bool, matcher StringMatcher) |
|
setMatches() []string |
|
} |
|
|
|
func newEqualMultiStringMatcher(caseSensitive bool, estimatedSize, estimatedPrefixes, minPrefixLength int) multiStringMatcherBuilder { |
|
// If the estimated size is low enough, it's faster to use a slice instead of a map. |
|
if estimatedSize < minEqualMultiStringMatcherMapThreshold && estimatedPrefixes == 0 { |
|
return &equalMultiStringSliceMatcher{caseSensitive: caseSensitive, values: make([]string, 0, estimatedSize)} |
|
} |
|
|
|
return &equalMultiStringMapMatcher{ |
|
values: make(map[string]struct{}, estimatedSize), |
|
prefixes: make(map[string][]StringMatcher, estimatedPrefixes), |
|
minPrefixLen: minPrefixLength, |
|
caseSensitive: caseSensitive, |
|
} |
|
} |
|
|
|
// equalMultiStringSliceMatcher matches a string exactly against a slice of valid values. |
|
type equalMultiStringSliceMatcher struct { |
|
values []string |
|
|
|
caseSensitive bool |
|
} |
|
|
|
func (m *equalMultiStringSliceMatcher) add(s string) { |
|
m.values = append(m.values, s) |
|
} |
|
|
|
func (m *equalMultiStringSliceMatcher) addPrefix(_ string, _ bool, _ StringMatcher) { |
|
panic("not implemented") |
|
} |
|
|
|
func (m *equalMultiStringSliceMatcher) setMatches() []string { |
|
return m.values |
|
} |
|
|
|
func (m *equalMultiStringSliceMatcher) Matches(s string) bool { |
|
if m.caseSensitive { |
|
for _, v := range m.values { |
|
if s == v { |
|
return true |
|
} |
|
} |
|
} else { |
|
for _, v := range m.values { |
|
if strings.EqualFold(s, v) { |
|
return true |
|
} |
|
} |
|
} |
|
return false |
|
} |
|
|
|
// equalMultiStringMapMatcher matches a string exactly against a map of valid values |
|
// or against a set of prefix matchers. |
|
type equalMultiStringMapMatcher struct { |
|
// values contains values to match a string against. If the matching is case insensitive, |
|
// the values here must be lowercase. |
|
values map[string]struct{} |
|
// prefixes maps strings, all of length minPrefixLen, to sets of matchers to check the rest of the string. |
|
// If the matching is case insensitive, prefixes are all lowercase. |
|
prefixes map[string][]StringMatcher |
|
// minPrefixLen can be zero, meaning there are no prefix matchers. |
|
minPrefixLen int |
|
caseSensitive bool |
|
} |
|
|
|
func (m *equalMultiStringMapMatcher) add(s string) { |
|
if !m.caseSensitive { |
|
s = toNormalisedLower(s) |
|
} |
|
|
|
m.values[s] = struct{}{} |
|
} |
|
|
|
func (m *equalMultiStringMapMatcher) addPrefix(prefix string, prefixCaseSensitive bool, matcher StringMatcher) { |
|
if m.minPrefixLen == 0 { |
|
panic("addPrefix called when no prefix length defined") |
|
} |
|
if len(prefix) < m.minPrefixLen { |
|
panic("addPrefix called with a too short prefix") |
|
} |
|
if m.caseSensitive != prefixCaseSensitive { |
|
panic("addPrefix called with a prefix whose case sensitivity is different than the expected one") |
|
} |
|
|
|
s := prefix[:m.minPrefixLen] |
|
if !m.caseSensitive { |
|
s = strings.ToLower(s) |
|
} |
|
|
|
m.prefixes[s] = append(m.prefixes[s], matcher) |
|
} |
|
|
|
func (m *equalMultiStringMapMatcher) setMatches() []string { |
|
if len(m.values) >= maxSetMatches || len(m.prefixes) > 0 { |
|
return nil |
|
} |
|
|
|
matches := make([]string, 0, len(m.values)) |
|
for s := range m.values { |
|
matches = append(matches, s) |
|
} |
|
return matches |
|
} |
|
|
|
func (m *equalMultiStringMapMatcher) Matches(s string) bool { |
|
if !m.caseSensitive { |
|
s = toNormalisedLower(s) |
|
} |
|
|
|
if _, ok := m.values[s]; ok { |
|
return true |
|
} |
|
if m.minPrefixLen > 0 && len(s) >= m.minPrefixLen { |
|
for _, matcher := range m.prefixes[s[:m.minPrefixLen]] { |
|
if matcher.Matches(s) { |
|
return true |
|
} |
|
} |
|
} |
|
return false |
|
} |
|
|
|
// toNormalisedLower normalise the input string using "Unicode Normalization Form D" and then convert |
|
// it to lower case. |
|
func toNormalisedLower(s string) string { |
|
var buf []byte |
|
for i := 0; i < len(s); i++ { |
|
c := s[i] |
|
if c >= utf8.RuneSelf { |
|
return strings.Map(unicode.ToLower, norm.NFKD.String(s)) |
|
} |
|
if 'A' <= c && c <= 'Z' { |
|
if buf == nil { |
|
buf = []byte(s) |
|
} |
|
buf[i] = c + 'a' - 'A' |
|
} |
|
} |
|
if buf == nil { |
|
return s |
|
} |
|
return yoloString(buf) |
|
} |
|
|
|
// anyStringWithoutNewlineMatcher is a stringMatcher which matches any string |
|
// (including an empty one) as far as it doesn't contain any newline character. |
|
type anyStringWithoutNewlineMatcher struct{} |
|
|
|
func (m anyStringWithoutNewlineMatcher) Matches(s string) bool { |
|
// We need to make sure it doesn't contain a newline. Since the newline is |
|
// an ASCII character, we can use strings.IndexByte(). |
|
return strings.IndexByte(s, '\n') == -1 |
|
} |
|
|
|
// anyNonEmptyStringMatcher is a stringMatcher which matches any non-empty string. |
|
type anyNonEmptyStringMatcher struct { |
|
matchNL bool |
|
} |
|
|
|
func (m *anyNonEmptyStringMatcher) Matches(s string) bool { |
|
if m.matchNL { |
|
// It's OK if the string contains a newline so we just need to make |
|
// sure it's non-empty. |
|
return len(s) > 0 |
|
} |
|
|
|
// We need to make sure it non-empty and doesn't contain a newline. |
|
// Since the newline is an ASCII character, we can use strings.IndexByte(). |
|
return len(s) > 0 && strings.IndexByte(s, '\n') == -1 |
|
} |
|
|
|
// zeroOrOneCharacterStringMatcher is a StringMatcher which matches zero or one occurrence |
|
// of any character. The newline character is matches only if matchNL is set to true. |
|
type zeroOrOneCharacterStringMatcher struct { |
|
matchNL bool |
|
} |
|
|
|
func (m *zeroOrOneCharacterStringMatcher) Matches(s string) bool { |
|
// If there's more than one rune in the string, then it can't match. |
|
if r, size := utf8.DecodeRuneInString(s); r == utf8.RuneError { |
|
// Size is 0 for empty strings, 1 for invalid rune. |
|
// Empty string matches, invalid rune matches if there isn't anything else. |
|
return size == len(s) |
|
} else if size < len(s) { |
|
return false |
|
} |
|
|
|
// No need to check for the newline if the string is empty or matching a newline is OK. |
|
if m.matchNL || len(s) == 0 { |
|
return true |
|
} |
|
|
|
return s[0] != '\n' |
|
} |
|
|
|
// trueMatcher is a stringMatcher which matches any string (always returns true). |
|
type trueMatcher struct{} |
|
|
|
func (m trueMatcher) Matches(_ string) bool { |
|
return true |
|
} |
|
|
|
// optimizeEqualOrPrefixStringMatchers optimize a specific case where all matchers are made by an |
|
// alternation (orStringMatcher) of strings checked for equality (equalStringMatcher) or |
|
// with a literal prefix (literalPrefixSensitiveStringMatcher or literalPrefixInsensitiveStringMatcher). |
|
// |
|
// In this specific case, when we have many strings to match against we can use a map instead |
|
// of iterating over the list of strings. |
|
func optimizeEqualOrPrefixStringMatchers(input StringMatcher, threshold int) StringMatcher { |
|
var ( |
|
caseSensitive bool |
|
caseSensitiveSet bool |
|
numValues int |
|
numPrefixes int |
|
minPrefixLength int |
|
) |
|
|
|
// Analyse the input StringMatcher to count the number of occurrences |
|
// and ensure all of them have the same case sensitivity. |
|
analyseEqualMatcherCallback := func(matcher *equalStringMatcher) bool { |
|
// Ensure we don't have mixed case sensitivity. |
|
if caseSensitiveSet && caseSensitive != matcher.caseSensitive { |
|
return false |
|
} else if !caseSensitiveSet { |
|
caseSensitive = matcher.caseSensitive |
|
caseSensitiveSet = true |
|
} |
|
|
|
numValues++ |
|
return true |
|
} |
|
|
|
analysePrefixMatcherCallback := func(prefix string, prefixCaseSensitive bool, matcher StringMatcher) bool { |
|
// Ensure we don't have mixed case sensitivity. |
|
if caseSensitiveSet && caseSensitive != prefixCaseSensitive { |
|
return false |
|
} else if !caseSensitiveSet { |
|
caseSensitive = prefixCaseSensitive |
|
caseSensitiveSet = true |
|
} |
|
if numPrefixes == 0 || len(prefix) < minPrefixLength { |
|
minPrefixLength = len(prefix) |
|
} |
|
|
|
numPrefixes++ |
|
return true |
|
} |
|
|
|
if !findEqualOrPrefixStringMatchers(input, analyseEqualMatcherCallback, analysePrefixMatcherCallback) { |
|
return input |
|
} |
|
|
|
// If the number of values and prefixes found is less than the threshold, then we should skip the optimization. |
|
if (numValues + numPrefixes) < threshold { |
|
return input |
|
} |
|
|
|
// Parse again the input StringMatcher to extract all values and storing them. |
|
// We can skip the case sensitivity check because we've already checked it and |
|
// if the code reach this point then it means all matchers have the same case sensitivity. |
|
multiMatcher := newEqualMultiStringMatcher(caseSensitive, numValues, numPrefixes, minPrefixLength) |
|
|
|
// Ignore the return value because we already iterated over the input StringMatcher |
|
// and it was all good. |
|
findEqualOrPrefixStringMatchers(input, func(matcher *equalStringMatcher) bool { |
|
multiMatcher.add(matcher.s) |
|
return true |
|
}, func(prefix string, prefixCaseSensitive bool, matcher StringMatcher) bool { |
|
multiMatcher.addPrefix(prefix, caseSensitive, matcher) |
|
return true |
|
}) |
|
|
|
return multiMatcher |
|
} |
|
|
|
// findEqualOrPrefixStringMatchers analyze the input StringMatcher and calls the equalMatcherCallback for each |
|
// equalStringMatcher found, and prefixMatcherCallback for each literalPrefixSensitiveStringMatcher and literalPrefixInsensitiveStringMatcher found. |
|
// |
|
// Returns true if and only if the input StringMatcher is *only* composed by an alternation of equalStringMatcher and/or |
|
// literal prefix matcher. Returns false if prefixMatcherCallback is nil and a literal prefix matcher is encountered. |
|
func findEqualOrPrefixStringMatchers(input StringMatcher, equalMatcherCallback func(matcher *equalStringMatcher) bool, prefixMatcherCallback func(prefix string, prefixCaseSensitive bool, matcher StringMatcher) bool) bool { |
|
orInput, ok := input.(orStringMatcher) |
|
if !ok { |
|
return false |
|
} |
|
|
|
for _, m := range orInput { |
|
switch casted := m.(type) { |
|
case orStringMatcher: |
|
if !findEqualOrPrefixStringMatchers(m, equalMatcherCallback, prefixMatcherCallback) { |
|
return false |
|
} |
|
|
|
case *equalStringMatcher: |
|
if !equalMatcherCallback(casted) { |
|
return false |
|
} |
|
|
|
case *literalPrefixSensitiveStringMatcher: |
|
if prefixMatcherCallback == nil || !prefixMatcherCallback(casted.prefix, true, casted) { |
|
return false |
|
} |
|
|
|
case *literalPrefixInsensitiveStringMatcher: |
|
if prefixMatcherCallback == nil || !prefixMatcherCallback(casted.prefix, false, casted) { |
|
return false |
|
} |
|
|
|
default: |
|
// It's not an equal or prefix string matcher, so we have to stop searching |
|
// cause this optimization can't be applied. |
|
return false |
|
} |
|
} |
|
|
|
return true |
|
} |
|
|
|
func hasPrefixCaseInsensitive(s, prefix string) bool { |
|
return len(s) >= len(prefix) && strings.EqualFold(s[0:len(prefix)], prefix) |
|
} |
|
|
|
func hasSuffixCaseInsensitive(s, suffix string) bool { |
|
return len(s) >= len(suffix) && strings.EqualFold(s[len(s)-len(suffix):], suffix) |
|
} |
|
|
|
func containsInOrder(s string, contains []string) bool { |
|
// Optimization for the case we only have to look for 1 substring. |
|
if len(contains) == 1 { |
|
return strings.Contains(s, contains[0]) |
|
} |
|
|
|
return containsInOrderMulti(s, contains) |
|
} |
|
|
|
func containsInOrderMulti(s string, contains []string) bool { |
|
offset := 0 |
|
|
|
for _, substr := range contains { |
|
at := strings.Index(s[offset:], substr) |
|
if at == -1 { |
|
return false |
|
} |
|
|
|
offset += at + len(substr) |
|
} |
|
|
|
return true |
|
}
|
|
|