From 78fdd2188dcd955c22ee903814c3e624ee354b94 Mon Sep 17 00:00:00 2001
From: Marco Pracucci <marco@pracucci.com>
Date: Fri, 31 May 2024 15:35:58 +0200
Subject: [PATCH 1/3] Improve contains check done by FastRegexMatcher

Signed-off-by: Marco Pracucci <marco@pracucci.com>
---
 model/labels/regexp.go      | 35 ++++++++++++++++-------
 model/labels/regexp_test.go | 57 +++++++++++++++++++++++--------------
 2 files changed, 60 insertions(+), 32 deletions(-)

diff --git a/model/labels/regexp.go b/model/labels/regexp.go
index b484e2716..9a9d846fd 100644
--- a/model/labels/regexp.go
+++ b/model/labels/regexp.go
@@ -42,7 +42,7 @@ type FastRegexMatcher struct {
 	stringMatcher StringMatcher
 	prefix        string
 	suffix        string
-	contains      string
+	contains      []string
 
 	// matchString is the "compiled" function to run by MatchString().
 	matchString func(string) bool
@@ -87,7 +87,7 @@ func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) {
 // compileMatchStringFunction returns the function to run by MatchString().
 func (m *FastRegexMatcher) compileMatchStringFunction() func(string) bool {
 	// If the only optimization available is the string matcher, then we can just run it.
-	if len(m.setMatches) == 0 && m.prefix == "" && m.suffix == "" && m.contains == "" && m.stringMatcher != nil {
+	if len(m.setMatches) == 0 && m.prefix == "" && m.suffix == "" && len(m.contains) == 0 && m.stringMatcher != nil {
 		return m.stringMatcher.Matches
 	}
 
@@ -106,7 +106,7 @@ func (m *FastRegexMatcher) compileMatchStringFunction() func(string) bool {
 		if m.suffix != "" && !strings.HasSuffix(s, m.suffix) {
 			return false
 		}
-		if m.contains != "" && !strings.Contains(s, m.contains) {
+		if len(m.contains) > 0 && !containsInOrder(s, m.contains) {
 			return false
 		}
 		if m.stringMatcher != nil {
@@ -119,7 +119,7 @@ func (m *FastRegexMatcher) compileMatchStringFunction() func(string) bool {
 // IsOptimized returns true if any fast-path optimization is applied to the
 // regex matcher.
 func (m *FastRegexMatcher) IsOptimized() bool {
-	return len(m.setMatches) > 0 || m.stringMatcher != nil || m.prefix != "" || m.suffix != "" || m.contains != ""
+	return len(m.setMatches) > 0 || m.stringMatcher != nil || m.prefix != "" || m.suffix != "" || len(m.contains) > 0
 }
 
 // findSetMatches extract equality matches from a regexp.
@@ -361,8 +361,9 @@ func optimizeAlternatingLiterals(s string) (StringMatcher, []string) {
 
 // optimizeConcatRegex returns literal prefix/suffix text that can be safely
 // checked against the label value before running the regexp matcher.
-func optimizeConcatRegex(r *syntax.Regexp) (prefix, suffix, contains string) {
+func optimizeConcatRegex(r *syntax.Regexp) (prefix, suffix string, contains []string) {
 	sub := r.Sub
+	clearCapture(sub...)
 
 	// We can safely remove begin and end text matchers respectively
 	// at the beginning and end of the regexp.
@@ -387,13 +388,12 @@ func optimizeConcatRegex(r *syntax.Regexp) (prefix, suffix, contains string) {
 		suffix = string(sub[last].Rune)
 	}
 
-	// If contains any literal which is not a prefix/suffix, we keep the
-	// 1st one. We do not keep the whole list of literals to simplify the
-	// fast path.
+	// If contains any literal which is not a prefix/suffix, we keep track of
+	// all the ones which are case sensitive.
 	for i := 1; i < len(sub)-1; i++ {
+		// TODO if it's case insensitive we should return an contains list or is it safe to keep searching for case sensitive ones?
 		if sub[i].Op == syntax.OpLiteral && (sub[i].Flags&syntax.FoldCase) == 0 {
-			contains = string(sub[i].Rune)
-			break
+			contains = append(contains, string(sub[i].Rune))
 		}
 	}
 
@@ -940,3 +940,18 @@ func hasPrefixCaseInsensitive(s, prefix string) bool {
 func hasSuffixCaseInsensitive(s, suffix string) bool {
 	return len(s) >= len(suffix) && strings.EqualFold(s[len(s)-len(suffix):], suffix)
 }
+
+func containsInOrder(s string, contains []string) bool {
+	offset := 0
+
+	for _, substr := range contains {
+		at := strings.Index(s[offset:], substr)
+		if at == -1 {
+			return false
+		}
+
+		offset += at + len(substr)
+	}
+
+	return true
+}
diff --git a/model/labels/regexp_test.go b/model/labels/regexp_test.go
index 1db90a473..0a75841c9 100644
--- a/model/labels/regexp_test.go
+++ b/model/labels/regexp_test.go
@@ -81,6 +81,10 @@ var (
 		".*foo.?",
 		".?foo.+",
 		"foo.?|bar",
+		// Concat of literals and wildcards.
+		".*-.*-.*-.*-.*",
+		"(.+)-(.+)-(.+)-(.+)-(.+)",
+		"((.*))-((.*))-((.*))-((.*))-((.*))",
 	}
 	values = []string{
 		"foo", " foo bar", "bar", "buzz\nbar", "bar foo", "bfoo", "\n", "\nfoo", "foo\n", "hello foo world", "hello foo\n world", "",
@@ -132,29 +136,29 @@ func TestOptimizeConcatRegex(t *testing.T) {
 		regex    string
 		prefix   string
 		suffix   string
-		contains string
+		contains []string
 	}{
-		{regex: "foo(hello|bar)", prefix: "foo", suffix: "", contains: ""},
-		{regex: "foo(hello|bar)world", prefix: "foo", suffix: "world", contains: ""},
-		{regex: "foo.*", prefix: "foo", suffix: "", contains: ""},
-		{regex: "foo.*hello.*bar", prefix: "foo", suffix: "bar", contains: "hello"},
-		{regex: ".*foo", prefix: "", suffix: "foo", contains: ""},
-		{regex: "^.*foo$", prefix: "", suffix: "foo", contains: ""},
-		{regex: ".*foo.*", prefix: "", suffix: "", contains: "foo"},
-		{regex: ".*foo.*bar.*", prefix: "", suffix: "", contains: "foo"},
-		{regex: ".*(foo|bar).*", prefix: "", suffix: "", contains: ""},
-		{regex: ".*[abc].*", prefix: "", suffix: "", contains: ""},
-		{regex: ".*((?i)abc).*", prefix: "", suffix: "", contains: ""},
-		{regex: ".*(?i:abc).*", prefix: "", suffix: "", contains: ""},
-		{regex: "(?i:abc).*", prefix: "", suffix: "", contains: ""},
-		{regex: ".*(?i:abc)", prefix: "", suffix: "", contains: ""},
-		{regex: ".*(?i:abc)def.*", prefix: "", suffix: "", contains: "def"},
-		{regex: "(?i).*(?-i:abc)def", prefix: "", suffix: "", contains: "abc"},
-		{regex: ".*(?msU:abc).*", prefix: "", suffix: "", contains: "abc"},
-		{regex: "[aA]bc.*", prefix: "", suffix: "", contains: "bc"},
-		{regex: "^5..$", prefix: "5", suffix: "", contains: ""},
-		{regex: "^release.*", prefix: "release", suffix: "", contains: ""},
-		{regex: "^env-[0-9]+laio[1]?[^0-9].*", prefix: "env-", suffix: "", contains: "laio"},
+		{regex: "foo(hello|bar)", prefix: "foo", suffix: "", contains: nil},
+		{regex: "foo(hello|bar)world", prefix: "foo", suffix: "world", contains: nil},
+		{regex: "foo.*", prefix: "foo", suffix: "", contains: nil},
+		{regex: "foo.*hello.*bar", prefix: "foo", suffix: "bar", contains: []string{"hello"}},
+		{regex: ".*foo", prefix: "", suffix: "foo", contains: nil},
+		{regex: "^.*foo$", prefix: "", suffix: "foo", contains: nil},
+		{regex: ".*foo.*", prefix: "", suffix: "", contains: []string{"foo"}},
+		{regex: ".*foo.*bar.*", prefix: "", suffix: "", contains: []string{"foo", "bar"}},
+		{regex: ".*(foo|bar).*", prefix: "", suffix: "", contains: nil},
+		{regex: ".*[abc].*", prefix: "", suffix: "", contains: nil},
+		{regex: ".*((?i)abc).*", prefix: "", suffix: "", contains: nil},
+		{regex: ".*(?i:abc).*", prefix: "", suffix: "", contains: nil},
+		{regex: "(?i:abc).*", prefix: "", suffix: "", contains: nil},
+		{regex: ".*(?i:abc)", prefix: "", suffix: "", contains: nil},
+		{regex: ".*(?i:abc)def.*", prefix: "", suffix: "", contains: []string{"def"}},
+		{regex: "(?i).*(?-i:abc)def", prefix: "", suffix: "", contains: []string{"abc"}},
+		{regex: ".*(?msU:abc).*", prefix: "", suffix: "", contains: []string{"abc"}},
+		{regex: "[aA]bc.*", prefix: "", suffix: "", contains: []string{"bc"}},
+		{regex: "^5..$", prefix: "5", suffix: "", contains: nil},
+		{regex: "^release.*", prefix: "release", suffix: "", contains: nil},
+		{regex: "^env-[0-9]+laio[1]?[^0-9].*", prefix: "env-", suffix: "", contains: []string{"laio"}},
 	}
 
 	for _, c := range cases {
@@ -1089,6 +1093,15 @@ func TestHasSuffixCaseInsensitive(t *testing.T) {
 	require.False(t, hasSuffixCaseInsensitive("marco", "abcdefghi"))
 }
 
+func TestContainsInOrder(t *testing.T) {
+	require.True(t, containsInOrder("abcdefghilmno", []string{"ab", "cd", "no"}))
+	require.True(t, containsInOrder("abcdefghilmno", []string{"def", "hil"}))
+
+	require.False(t, containsInOrder("abcdefghilmno", []string{"ac"}))
+	require.False(t, containsInOrder("abcdefghilmno", []string{"ab", "cd", "de"}))
+	require.False(t, containsInOrder("abcdefghilmno", []string{"cd", "ab"}))
+}
+
 func getTestNameFromRegexp(re string) string {
 	if len(re) > 32 {
 		return re[:32]

From a0807733be25c2988ff936a679327ceba5644696 Mon Sep 17 00:00:00 2001
From: Marco Pracucci <marco@pracucci.com>
Date: Mon, 3 Jun 2024 11:05:20 +0200
Subject: [PATCH 2/3] Improved tests

Signed-off-by: Marco Pracucci <marco@pracucci.com>
---
 model/labels/regexp.go      | 3 +--
 model/labels/regexp_test.go | 5 +++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/model/labels/regexp.go b/model/labels/regexp.go
index 9a9d846fd..11fadc687 100644
--- a/model/labels/regexp.go
+++ b/model/labels/regexp.go
@@ -389,9 +389,8 @@ func optimizeConcatRegex(r *syntax.Regexp) (prefix, suffix string, contains []st
 	}
 
 	// If contains any literal which is not a prefix/suffix, we keep track of
-	// all the ones which are case sensitive.
+	// all the ones which are case-sensitive.
 	for i := 1; i < len(sub)-1; i++ {
-		// TODO if it's case insensitive we should return an contains list or is it safe to keep searching for case sensitive ones?
 		if sub[i].Op == syntax.OpLiteral && (sub[i].Flags&syntax.FoldCase) == 0 {
 			contains = append(contains, string(sub[i].Rune))
 		}
diff --git a/model/labels/regexp_test.go b/model/labels/regexp_test.go
index 0a75841c9..400b5721b 100644
--- a/model/labels/regexp_test.go
+++ b/model/labels/regexp_test.go
@@ -84,11 +84,12 @@ var (
 		// Concat of literals and wildcards.
 		".*-.*-.*-.*-.*",
 		"(.+)-(.+)-(.+)-(.+)-(.+)",
-		"((.*))-((.*))-((.*))-((.*))-((.*))",
+		"((.*))(?i:f)((.*))o((.*))o((.*))",
+		"((.*))f((.*))(?i:o)((.*))o((.*))",
 	}
 	values = []string{
 		"foo", " foo bar", "bar", "buzz\nbar", "bar foo", "bfoo", "\n", "\nfoo", "foo\n", "hello foo world", "hello foo\n world", "",
-		"FOO", "Foo", "OO", "Oo", "\nfoo\n", strings.Repeat("f", 20), "prometheus", "prometheus_api_v1", "prometheus_api_v1_foo",
+		"FOO", "Foo", "fOo", "foO", "OO", "Oo", "\nfoo\n", strings.Repeat("f", 20), "prometheus", "prometheus_api_v1", "prometheus_api_v1_foo",
 		"10.0.1.20", "10.0.2.10", "10.0.3.30", "10.0.4.40",
 		"foofoo0", "foofoo", "😀foo0",
 

From d966ae6400625bf58626838b162e0a7fd83eaed4 Mon Sep 17 00:00:00 2001
From: Marco Pracucci <marco@pracucci.com>
Date: Tue, 4 Jun 2024 10:24:36 +0200
Subject: [PATCH 3/3] Optimize containsInOrder() with an inlinable single-substring fast path

Signed-off-by: Marco Pracucci <marco@pracucci.com>
---
 model/labels/regexp.go | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/model/labels/regexp.go b/model/labels/regexp.go
index 11fadc687..f228d7ff1 100644
--- a/model/labels/regexp.go
+++ b/model/labels/regexp.go
@@ -941,6 +941,15 @@ func hasSuffixCaseInsensitive(s, suffix string) bool {
 }
 
 func containsInOrder(s string, contains []string) bool {
+	// Optimization for the case we only have to look for 1 substring.
+	if len(contains) == 1 {
+		return strings.Contains(s, contains[0])
+	}
+
+	return containsInOrderMulti(s, contains)
+}
+
+func containsInOrderMulti(s string, contains []string) bool {
 	offset := 0
 
 	for _, substr := range contains {