From 78fdd2188dcd955c22ee903814c3e624ee354b94 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 31 May 2024 15:35:58 +0200 Subject: [PATCH 1/3] Improve contains check done by FastRegexMatcher Signed-off-by: Marco Pracucci --- model/labels/regexp.go | 35 ++++++++++++++++------- model/labels/regexp_test.go | 57 +++++++++++++++++++++++-------------- 2 files changed, 60 insertions(+), 32 deletions(-) diff --git a/model/labels/regexp.go b/model/labels/regexp.go index b484e2716..9a9d846fd 100644 --- a/model/labels/regexp.go +++ b/model/labels/regexp.go @@ -42,7 +42,7 @@ type FastRegexMatcher struct { stringMatcher StringMatcher prefix string suffix string - contains string + contains []string // matchString is the "compiled" function to run by MatchString(). matchString func(string) bool @@ -87,7 +87,7 @@ func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) { // compileMatchStringFunction returns the function to run by MatchString(). func (m *FastRegexMatcher) compileMatchStringFunction() func(string) bool { // If the only optimization available is the string matcher, then we can just run it. - if len(m.setMatches) == 0 && m.prefix == "" && m.suffix == "" && m.contains == "" && m.stringMatcher != nil { + if len(m.setMatches) == 0 && m.prefix == "" && m.suffix == "" && len(m.contains) == 0 && m.stringMatcher != nil { return m.stringMatcher.Matches } @@ -106,7 +106,7 @@ func (m *FastRegexMatcher) compileMatchStringFunction() func(string) bool { if m.suffix != "" && !strings.HasSuffix(s, m.suffix) { return false } - if m.contains != "" && !strings.Contains(s, m.contains) { + if len(m.contains) > 0 && !containsInOrder(s, m.contains) { return false } if m.stringMatcher != nil { @@ -119,7 +119,7 @@ func (m *FastRegexMatcher) compileMatchStringFunction() func(string) bool { // IsOptimized returns true if any fast-path optimization is applied to the // regex matcher. func (m *FastRegexMatcher) IsOptimized() bool { - return len(m.setMatches) > 0 || m.stringMatcher != nil || m.prefix != "" || m.suffix != "" || m.contains != "" + return len(m.setMatches) > 0 || m.stringMatcher != nil || m.prefix != "" || m.suffix != "" || len(m.contains) > 0 } // findSetMatches extract equality matches from a regexp. @@ -361,8 +361,9 @@ func optimizeAlternatingLiterals(s string) (StringMatcher, []string) { // optimizeConcatRegex returns literal prefix/suffix text that can be safely // checked against the label value before running the regexp matcher. -func optimizeConcatRegex(r *syntax.Regexp) (prefix, suffix, contains string) { +func optimizeConcatRegex(r *syntax.Regexp) (prefix, suffix string, contains []string) { sub := r.Sub + clearCapture(sub...) // We can safely remove begin and end text matchers respectively // at the beginning and end of the regexp. @@ -387,13 +388,12 @@ func optimizeConcatRegex(r *syntax.Regexp) (prefix, suffix, contains string) { suffix = string(sub[last].Rune) } - // If contains any literal which is not a prefix/suffix, we keep the - // 1st one. We do not keep the whole list of literals to simplify the - // fast path. + // If contains any literal which is not a prefix/suffix, we keep track of + // all the ones which are case sensitive. for i := 1; i < len(sub)-1; i++ { + // TODO if it's case insensitive we should return an contains list or is it safe to keep searching for case sensitive ones? if sub[i].Op == syntax.OpLiteral && (sub[i].Flags&syntax.FoldCase) == 0 { - contains = string(sub[i].Rune) - break + contains = append(contains, string(sub[i].Rune)) } } @@ -940,3 +940,18 @@ func hasPrefixCaseInsensitive(s, prefix string) bool { func hasSuffixCaseInsensitive(s, suffix string) bool { return len(s) >= len(suffix) && strings.EqualFold(s[len(s)-len(suffix):], suffix) } + +func containsInOrder(s string, contains []string) bool { + offset := 0 + + for _, substr := range contains { + at := strings.Index(s[offset:], substr) + if at == -1 { + return false + } + + offset += at + len(substr) + } + + return true +} diff --git a/model/labels/regexp_test.go b/model/labels/regexp_test.go index 1db90a473..0a75841c9 100644 --- a/model/labels/regexp_test.go +++ b/model/labels/regexp_test.go @@ -81,6 +81,10 @@ var ( ".*foo.?", ".?foo.+", "foo.?|bar", + // Concat of literals and wildcards. + ".*-.*-.*-.*-.*", + "(.+)-(.+)-(.+)-(.+)-(.+)", + "((.*))-((.*))-((.*))-((.*))-((.*))", } values = []string{ "foo", " foo bar", "bar", "buzz\nbar", "bar foo", "bfoo", "\n", "\nfoo", "foo\n", "hello foo world", "hello foo\n world", "", @@ -132,29 +136,29 @@ func TestOptimizeConcatRegex(t *testing.T) { regex string prefix string suffix string - contains string + contains []string }{ - {regex: "foo(hello|bar)", prefix: "foo", suffix: "", contains: ""}, - {regex: "foo(hello|bar)world", prefix: "foo", suffix: "world", contains: ""}, - {regex: "foo.*", prefix: "foo", suffix: "", contains: ""}, - {regex: "foo.*hello.*bar", prefix: "foo", suffix: "bar", contains: "hello"}, - {regex: ".*foo", prefix: "", suffix: "foo", contains: ""}, - {regex: "^.*foo$", prefix: "", suffix: "foo", contains: ""}, - {regex: ".*foo.*", prefix: "", suffix: "", contains: "foo"}, - {regex: ".*foo.*bar.*", prefix: "", suffix: "", contains: "foo"}, - {regex: ".*(foo|bar).*", prefix: "", suffix: "", contains: ""}, - {regex: ".*[abc].*", prefix: "", suffix: "", contains: ""}, - {regex: ".*((?i)abc).*", prefix: "", suffix: "", contains: ""}, - {regex: ".*(?i:abc).*", prefix: "", suffix: "", contains: ""}, - {regex: "(?i:abc).*", prefix: "", suffix: "", contains: ""}, - {regex: ".*(?i:abc)", prefix: "", suffix: "", contains: ""}, - {regex: ".*(?i:abc)def.*", prefix: "", suffix: "", contains: "def"}, - {regex: "(?i).*(?-i:abc)def", prefix: "", suffix: "", contains: "abc"}, - {regex: ".*(?msU:abc).*", prefix: "", suffix: "", contains: "abc"}, - {regex: "[aA]bc.*", prefix: "", suffix: "", contains: "bc"}, - {regex: "^5..$", prefix: "5", suffix: "", contains: ""}, - {regex: "^release.*", prefix: "release", suffix: "", contains: ""}, - {regex: "^env-[0-9]+laio[1]?[^0-9].*", prefix: "env-", suffix: "", contains: "laio"}, + {regex: "foo(hello|bar)", prefix: "foo", suffix: "", contains: nil}, + {regex: "foo(hello|bar)world", prefix: "foo", suffix: "world", contains: nil}, + {regex: "foo.*", prefix: "foo", suffix: "", contains: nil}, + {regex: "foo.*hello.*bar", prefix: "foo", suffix: "bar", contains: []string{"hello"}}, + {regex: ".*foo", prefix: "", suffix: "foo", contains: nil}, + {regex: "^.*foo$", prefix: "", suffix: "foo", contains: nil}, + {regex: ".*foo.*", prefix: "", suffix: "", contains: []string{"foo"}}, + {regex: ".*foo.*bar.*", prefix: "", suffix: "", contains: []string{"foo", "bar"}}, + {regex: ".*(foo|bar).*", prefix: "", suffix: "", contains: nil}, + {regex: ".*[abc].*", prefix: "", suffix: "", contains: nil}, + {regex: ".*((?i)abc).*", prefix: "", suffix: "", contains: nil}, + {regex: ".*(?i:abc).*", prefix: "", suffix: "", contains: nil}, + {regex: "(?i:abc).*", prefix: "", suffix: "", contains: nil}, + {regex: ".*(?i:abc)", prefix: "", suffix: "", contains: nil}, + {regex: ".*(?i:abc)def.*", prefix: "", suffix: "", contains: []string{"def"}}, + {regex: "(?i).*(?-i:abc)def", prefix: "", suffix: "", contains: []string{"abc"}}, + {regex: ".*(?msU:abc).*", prefix: "", suffix: "", contains: []string{"abc"}}, + {regex: "[aA]bc.*", prefix: "", suffix: "", contains: []string{"bc"}}, + {regex: "^5..$", prefix: "5", suffix: "", contains: nil}, + {regex: "^release.*", prefix: "release", suffix: "", contains: nil}, + {regex: "^env-[0-9]+laio[1]?[^0-9].*", prefix: "env-", suffix: "", contains: []string{"laio"}}, } for _, c := range cases { @@ -1089,6 +1093,15 @@ func TestHasSuffixCaseInsensitive(t *testing.T) { require.False(t, hasSuffixCaseInsensitive("marco", "abcdefghi")) } +func TestContainsInOrder(t *testing.T) { + require.True(t, containsInOrder("abcdefghilmno", []string{"ab", "cd", "no"})) + require.True(t, containsInOrder("abcdefghilmno", []string{"def", "hil"})) + + require.False(t, containsInOrder("abcdefghilmno", []string{"ac"})) + require.False(t, containsInOrder("abcdefghilmno", []string{"ab", "cd", "de"})) + require.False(t, containsInOrder("abcdefghilmno", []string{"cd", "ab"})) +} + func getTestNameFromRegexp(re string) string { if len(re) > 32 { return re[:32] From a0807733be25c2988ff936a679327ceba5644696 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 3 Jun 2024 11:05:20 +0200 Subject: [PATCH 2/3] Improved tests Signed-off-by: Marco Pracucci --- model/labels/regexp.go | 3 +-- model/labels/regexp_test.go | 5 +++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/model/labels/regexp.go b/model/labels/regexp.go index 9a9d846fd..11fadc687 100644 --- a/model/labels/regexp.go +++ b/model/labels/regexp.go @@ -389,9 +389,8 @@ func optimizeConcatRegex(r *syntax.Regexp) (prefix, suffix string, contains []st } // If contains any literal which is not a prefix/suffix, we keep track of - // all the ones which are case sensitive. + // all the ones which are case-sensitive. for i := 1; i < len(sub)-1; i++ { - // TODO if it's case insensitive we should return an contains list or is it safe to keep searching for case sensitive ones? if sub[i].Op == syntax.OpLiteral && (sub[i].Flags&syntax.FoldCase) == 0 { contains = append(contains, string(sub[i].Rune)) } diff --git a/model/labels/regexp_test.go b/model/labels/regexp_test.go index 0a75841c9..400b5721b 100644 --- a/model/labels/regexp_test.go +++ b/model/labels/regexp_test.go @@ -84,11 +84,12 @@ var ( // Concat of literals and wildcards. ".*-.*-.*-.*-.*", "(.+)-(.+)-(.+)-(.+)-(.+)", - "((.*))-((.*))-((.*))-((.*))-((.*))", + "((.*))(?i:f)((.*))o((.*))o((.*))", + "((.*))f((.*))(?i:o)((.*))o((.*))", } values = []string{ "foo", " foo bar", "bar", "buzz\nbar", "bar foo", "bfoo", "\n", "\nfoo", "foo\n", "hello foo world", "hello foo\n world", "", - "FOO", "Foo", "OO", "Oo", "\nfoo\n", strings.Repeat("f", 20), "prometheus", "prometheus_api_v1", "prometheus_api_v1_foo", + "FOO", "Foo", "fOo", "foO", "OO", "Oo", "\nfoo\n", strings.Repeat("f", 20), "prometheus", "prometheus_api_v1", "prometheus_api_v1_foo", "10.0.1.20", "10.0.2.10", "10.0.3.30", "10.0.4.40", "foofoo0", "foofoo", "😀foo0", From d966ae6400625bf58626838b162e0a7fd83eaed4 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 4 Jun 2024 10:24:36 +0200 Subject: [PATCH 3/3] Optimize containsInOrder() inlining it Signed-off-by: Marco Pracucci --- model/labels/regexp.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/model/labels/regexp.go b/model/labels/regexp.go index 11fadc687..f228d7ff1 100644 --- a/model/labels/regexp.go +++ b/model/labels/regexp.go @@ -941,6 +941,15 @@ func hasSuffixCaseInsensitive(s, suffix string) bool { } func containsInOrder(s string, contains []string) bool { + // Optimization for the case we only have to look for 1 substring. + if len(contains) == 1 { + return strings.Contains(s, contains[0]) + } + + return containsInOrderMulti(s, contains) +} + +func containsInOrderMulti(s string, contains []string) bool { offset := 0 for _, substr := range contains {