statping/types/services/routine.go

442 lines
11 KiB
Go
Raw Normal View History

2020-03-04 10:29:00 +00:00
package services
2018-06-10 01:31:13 +00:00
import (
"bytes"
2020-05-20 06:59:28 +00:00
"crypto/tls"
2018-06-10 01:31:13 +00:00
"fmt"
2020-06-16 09:42:21 +00:00
"github.com/prometheus/client_golang/prometheus"
"github.com/statping/statping/types/metrics"
"google.golang.org/grpc"
"net"
2018-06-10 01:31:13 +00:00
"net/http"
"net/url"
2018-06-10 01:31:13 +00:00
"regexp"
"strings"
2018-06-10 01:31:13 +00:00
"time"
2020-03-14 03:13:20 +00:00
"github.com/statping/statping/types/failures"
"github.com/statping/statping/types/hits"
"github.com/statping/statping/utils"
2018-06-10 01:31:13 +00:00
)
// checkServices will start the checking go routine for each service
2020-03-04 10:29:00 +00:00
func CheckServices() {
log.Infoln(fmt.Sprintf("Starting monitoring process for %v Services", len(allServices)))
for _, s := range allServices {
time.Sleep(50 * time.Millisecond)
2020-02-26 05:38:03 +00:00
go ServiceCheckQueue(s, true)
2018-06-10 01:31:13 +00:00
}
}
// CheckQueue is the main go routine for checking a service
2020-02-26 05:38:03 +00:00
func ServiceCheckQueue(s *Service, record bool) {
2020-03-04 10:29:00 +00:00
s.Start()
s.Checkpoint = utils.Now()
2020-02-25 07:41:28 +00:00
s.SleepDuration = (time.Duration(s.Id) * 100) * time.Millisecond
2020-03-04 14:20:47 +00:00
2018-08-19 00:37:00 +00:00
CheckLoop:
for {
select {
2018-08-19 00:37:00 +00:00
case <-s.Running:
log.Infoln(fmt.Sprintf("Stopping service: %v", s.Name))
2018-08-19 00:37:00 +00:00
break CheckLoop
case <-time.After(s.SleepDuration):
2020-03-04 14:20:47 +00:00
s.CheckService(record)
2020-03-12 04:58:56 +00:00
s.UpdateStats()
2020-02-26 05:38:03 +00:00
s.Checkpoint = s.Checkpoint.Add(s.Duration())
if !s.Online {
2020-02-26 05:38:03 +00:00
s.SleepDuration = s.Duration()
} else {
s.SleepDuration = s.Checkpoint.Sub(time.Now())
}
}
2018-06-15 04:30:10 +00:00
}
2018-06-10 01:31:13 +00:00
}
2020-02-26 05:38:03 +00:00
func parseHost(s *Service) string {
if s.Type == "tcp" || s.Type == "udp" || s.Type == "grpc" {
return s.Domain
} else {
2019-05-27 17:35:51 +00:00
u, err := url.Parse(s.Domain)
if err != nil {
return s.Domain
}
2019-05-27 17:35:51 +00:00
return strings.Split(u.Host, ":")[0]
}
}
// dnsCheck will check the domain name and return a float64 for the amount of time the DNS check took
2020-03-10 05:24:35 +00:00
func dnsCheck(s *Service) (int64, error) {
var err error
2020-03-10 05:24:35 +00:00
t1 := utils.Now()
2020-02-26 05:38:03 +00:00
host := parseHost(s)
if s.Type == "tcp" || s.Type == "udp" || s.Type == "grpc" {
_, err = net.LookupHost(host)
} else {
_, err = net.LookupIP(host)
}
if err != nil {
return 0, err
}
return utils.Now().Sub(t1).Microseconds(), err
}
2019-06-06 19:10:52 +00:00
// isIPv6 reports whether address contains at least two ':' characters, the
// heuristic the TCP and gRPC checks use to decide whether the host must be
// wrapped in square brackets before a port is appended.
func isIPv6(address string) bool {
	first := strings.IndexByte(address, ':')
	if first < 0 {
		return false
	}
	return strings.IndexByte(address[first+1:], ':') >= 0
}
2019-04-20 03:41:09 +00:00
// checkIcmp will send a ICMP ping packet to the service
func CheckIcmp(s *Service, record bool) (*Service, error) {
2020-03-05 08:27:51 +00:00
defer s.updateLastCheck()
timer := prometheus.NewTimer(metrics.ServiceTimer(s.Name))
2020-06-16 09:42:21 +00:00
defer timer.ObserveDuration()
2020-03-05 08:27:51 +00:00
dur, err := utils.Ping(s.Domain, s.Timeout)
if err != nil {
if record {
recordFailure(s, fmt.Sprintf("Could not send ICMP to service %v, %v", s.Domain, err))
}
return s, err
2019-04-20 03:41:09 +00:00
}
s.PingTime = dur
s.Latency = dur
2019-04-20 03:41:09 +00:00
s.LastResponse = ""
s.Online = true
if record {
recordSuccess(s)
}
return s, nil
2019-04-20 03:41:09 +00:00
}
// CheckGrpc will check a gRPC service
func CheckGrpc(s *Service, record bool) (*Service, error) {
defer s.updateLastCheck()
timer := prometheus.NewTimer(metrics.ServiceTimer(s.Name))
2020-06-16 09:42:21 +00:00
defer timer.ObserveDuration()
dnsLookup, err := dnsCheck(s)
if err != nil {
if record {
recordFailure(s, fmt.Sprintf("Could not get IP address for GRPC service %v, %v", s.Domain, err))
}
return s, err
}
s.PingTime = dnsLookup
t1 := utils.Now()
domain := fmt.Sprintf("%v", s.Domain)
if s.Port != 0 {
domain = fmt.Sprintf("%v:%v", s.Domain, s.Port)
if isIPv6(s.Domain) {
domain = fmt.Sprintf("[%v]:%v", s.Domain, s.Port)
}
}
conn, err := grpc.Dial(domain, grpc.WithInsecure(), grpc.WithBlock())
if err != nil {
log.Fatalf("did not connect: %v", err)
}
if err != nil {
if record {
recordFailure(s, fmt.Sprintf("Dial Error %v", err))
}
return s, err
}
if err := conn.Close(); err != nil {
if record {
recordFailure(s, fmt.Sprintf("%v Socket Close Error %v", strings.ToUpper(s.Type), err))
}
return s, err
}
s.Latency = utils.Now().Sub(t1).Microseconds()
s.LastResponse = ""
s.Online = true
if record {
recordSuccess(s)
}
return s, nil
}
// checkTcp will check a TCP service
func CheckTcp(s *Service, record bool) (*Service, error) {
2020-03-05 08:27:51 +00:00
defer s.updateLastCheck()
timer := prometheus.NewTimer(metrics.ServiceTimer(s.Name))
2020-06-16 09:42:21 +00:00
defer timer.ObserveDuration()
2020-03-05 08:27:51 +00:00
2020-02-26 05:38:03 +00:00
dnsLookup, err := dnsCheck(s)
if err != nil {
if record {
2020-02-26 05:38:03 +00:00
recordFailure(s, fmt.Sprintf("Could not get IP address for TCP service %v, %v", s.Domain, err))
}
return s, err
}
s.PingTime = dnsLookup
2020-03-10 05:24:35 +00:00
t1 := utils.Now()
domain := fmt.Sprintf("%v", s.Domain)
if s.Port != 0 {
domain = fmt.Sprintf("%v:%v", s.Domain, s.Port)
2019-06-06 19:10:52 +00:00
if isIPv6(s.Domain) {
domain = fmt.Sprintf("[%v]:%v", s.Domain, s.Port)
}
}
2020-05-20 06:59:28 +00:00
tlsConfig, err := s.LoadTLSCert()
if err != nil {
log.Errorln(err)
}
// test TCP connection if there is no TLS Certificate set
if s.TLSCert.String == "" {
conn, err := net.DialTimeout(s.Type, domain, time.Duration(s.Timeout)*time.Second)
if err != nil {
if record {
recordFailure(s, fmt.Sprintf("Dial Error: %v", err))
}
return s, err
2018-08-19 00:37:00 +00:00
}
defer conn.Close()
} else {
// test TCP connection if TLS Certificate was set
dialer := &net.Dialer{
KeepAlive: time.Duration(s.Timeout) * time.Second,
Timeout: time.Duration(s.Timeout) * time.Second,
}
conn, err := tls.DialWithDialer(dialer, s.Type, domain, tlsConfig)
if err != nil {
if record {
recordFailure(s, fmt.Sprintf("Dial Error: %v", err))
}
return s, err
2018-08-19 00:37:00 +00:00
}
defer conn.Close()
}
s.Latency = utils.Now().Sub(t1).Microseconds()
s.LastResponse = ""
s.Online = true
2018-08-19 00:37:00 +00:00
if record {
2018-09-25 07:03:49 +00:00
recordSuccess(s)
2018-08-19 00:37:00 +00:00
}
return s, nil
}
2020-03-05 08:27:51 +00:00
// updateLastCheck sets s.LastCheck to the current time. The Check* functions
// defer it so the timestamp is refreshed on every exit path.
func (s *Service) updateLastCheck() {
	now := time.Now()
	s.LastCheck = now
}
// checkHttp will check a HTTP service
func CheckHttp(s *Service, record bool) (*Service, error) {
2020-03-05 08:27:51 +00:00
defer s.updateLastCheck()
timer := prometheus.NewTimer(metrics.ServiceTimer(s.Name))
2020-06-16 09:42:21 +00:00
defer timer.ObserveDuration()
2020-03-05 08:27:51 +00:00
2020-02-26 05:38:03 +00:00
dnsLookup, err := dnsCheck(s)
if err != nil {
2018-08-19 00:37:00 +00:00
if record {
2020-02-26 05:38:03 +00:00
recordFailure(s, fmt.Sprintf("Could not get IP address for domain %v, %v", s.Domain, err))
2018-08-19 00:37:00 +00:00
}
return s, err
}
s.PingTime = dnsLookup
2020-03-10 05:24:35 +00:00
t1 := utils.Now()
2018-11-25 10:18:21 +00:00
timeout := time.Duration(s.Timeout) * time.Second
var content []byte
var res *http.Response
var data *bytes.Buffer
2019-03-16 22:55:45 +00:00
var headers []string
contentType := "application/json" // default Content-Type
2019-03-16 22:55:45 +00:00
if s.Headers.Valid {
headers = strings.Split(s.Headers.String, ",")
} else {
headers = nil
}
// check if 'Content-Type' header was defined
for _, header := range headers {
if strings.Split(header, "=")[0] == "Content-Type" {
contentType = strings.Split(header, "=")[1]
break
}
2019-03-16 22:55:45 +00:00
}
if s.Redirect.Bool {
headers = append(headers, "Redirect=true")
}
if s.PostData.String != "" {
data = bytes.NewBuffer([]byte(s.PostData.String))
} else {
data = bytes.NewBuffer(nil)
}
// force set Content-Type to 'application/json' if requests are made
// with POST method
if s.Method == "POST" && contentType != "application/json" {
contentType = "application/json"
}
2020-05-20 06:41:50 +00:00
customTLS, err := s.LoadTLSCert()
if err != nil {
log.Errorln(err)
}
content, res, err = utils.HttpRequest(s.Domain, s.Method, contentType, headers, data, timeout, s.VerifySSL.Bool, customTLS)
2018-06-30 22:37:01 +00:00
if err != nil {
2018-08-19 00:37:00 +00:00
if record {
2020-02-26 05:38:03 +00:00
recordFailure(s, fmt.Sprintf("HTTP Error %v", err))
2018-08-19 00:37:00 +00:00
}
return s, err
2018-06-30 22:37:01 +00:00
}
s.Latency = utils.Now().Sub(t1).Microseconds()
2018-11-25 10:18:21 +00:00
s.LastResponse = string(content)
s.LastStatusCode = res.StatusCode
2020-06-16 09:42:21 +00:00
metrics.Gauge("status_code", float64(res.StatusCode), s.Name)
if s.Expected.String != "" {
2018-11-25 10:18:21 +00:00
match, err := regexp.MatchString(s.Expected.String, string(content))
if err != nil {
log.Warnln(fmt.Sprintf("Service %v expected: %v to match %v", s.Name, string(content), s.Expected.String))
}
2018-06-10 01:31:13 +00:00
if !match {
2018-08-19 00:37:00 +00:00
if record {
2020-02-26 05:38:03 +00:00
recordFailure(s, fmt.Sprintf("HTTP Response Body did not match '%v'", s.Expected))
2018-08-19 00:37:00 +00:00
}
return s, err
2018-06-10 01:31:13 +00:00
}
}
2018-11-25 10:18:21 +00:00
if s.ExpectedStatus != res.StatusCode {
2018-08-19 00:37:00 +00:00
if record {
2020-02-26 05:38:03 +00:00
recordFailure(s, fmt.Sprintf("HTTP Status Code %v did not match %v", res.StatusCode, s.ExpectedStatus))
2018-08-19 00:37:00 +00:00
}
return s, err
2018-06-10 01:31:13 +00:00
}
2018-08-19 00:37:00 +00:00
if record {
2018-09-25 07:03:49 +00:00
recordSuccess(s)
2018-08-19 00:37:00 +00:00
}
s.Online = true
return s, err
2018-06-15 04:30:10 +00:00
}
2018-09-25 07:03:49 +00:00
// recordSuccess will create a new 'hit' record in the database for a successful/online service
2020-02-26 05:38:03 +00:00
func recordSuccess(s *Service) {
2020-03-10 05:24:35 +00:00
s.LastOnline = utils.Now()
2020-03-04 14:20:47 +00:00
s.Online = true
2020-03-04 10:29:00 +00:00
hit := &hits.Hit{
Service: s.Id,
Latency: s.Latency,
PingTime: s.PingTime,
2020-03-10 05:24:35 +00:00
CreatedAt: utils.Now(),
2018-06-15 04:30:10 +00:00
}
2020-03-04 14:20:47 +00:00
if err := hit.Create(); err != nil {
log.Error(err)
}
log.WithFields(utils.ToFields(hit, s)).Infoln(
2020-03-13 04:06:06 +00:00
fmt.Sprintf("Service #%d '%v' Successful Response: %s | Lookup in: %s | Online: %v | Interval: %d seconds", s.Id, s.Name, humanMicro(hit.Latency), humanMicro(hit.PingTime), s.Online, s.Interval))
2020-03-10 05:24:35 +00:00
s.LastLookupTime = hit.PingTime
s.LastLatency = hit.Latency
metrics.Gauge("online", 1., s.Name, s.Type)
metrics.Inc("success", s.Name)
2020-03-14 03:13:20 +00:00
sendSuccess(s)
s.SuccessNotified = true
2018-06-10 01:31:13 +00:00
}
2020-03-14 03:13:20 +00:00
func AddNotifier(n ServiceNotifier) {
notif := n.Select()
allNotifiers[notif.Method] = n
2020-03-14 03:13:20 +00:00
}
func sendSuccess(s *Service) {
if !s.AllowNotifications.Bool {
return
}
// dont send notification if server was already previous online
if s.SuccessNotified {
return
}
2020-03-25 18:46:50 +00:00
2020-03-14 03:13:20 +00:00
for _, n := range allNotifiers {
notif := n.Select()
if notif.CanSend() {
log.Infof("Sending notification to: %s!", notif.Method)
if _, err := n.OnSuccess(*s); err != nil {
notif.Logger().Errorln(err)
2020-03-14 03:13:20 +00:00
}
s.UserNotified = true
s.SuccessNotified = true
//s.UpdateNotify.Bool
}
}
s.notifyAfterCount = 0
2020-03-14 03:13:20 +00:00
}
2018-12-06 19:03:55 +00:00
// recordFailure will create a new 'Failure' record in the database for a offline service
2020-02-26 05:38:03 +00:00
func recordFailure(s *Service, issue string) {
2020-03-12 04:58:56 +00:00
s.LastOffline = utils.Now()
2020-03-05 08:27:51 +00:00
2020-03-04 10:29:00 +00:00
fail := &failures.Failure{
Service: s.Id,
Issue: issue,
PingTime: s.PingTime,
2020-03-12 04:58:56 +00:00
CreatedAt: utils.Now(),
2019-01-29 12:02:13 +00:00
ErrorCode: s.LastStatusCode,
}
2020-02-25 07:41:28 +00:00
log.WithFields(utils.ToFields(fail, s)).
2020-03-12 04:58:56 +00:00
Warnln(fmt.Sprintf("Service %v Failing: %v | Lookup in: %v", s.Name, issue, humanMicro(fail.PingTime)))
2020-03-04 10:29:00 +00:00
2020-03-04 14:20:47 +00:00
if err := fail.Create(); err != nil {
log.Error(err)
}
s.Online = false
s.SuccessNotified = false
2020-02-26 05:38:03 +00:00
s.DownText = s.DowntimeText()
metrics.Gauge("online", 0., s.Name, s.Type)
metrics.Inc("failure", s.Name)
2020-03-14 03:13:20 +00:00
sendFailure(s, fail)
}
func sendFailure(s *Service, f *failures.Failure) {
if !s.AllowNotifications.Bool {
return
}
// ignore failure if user was already notified and
// they have "continuous notifications" switched off.
if s.UserNotified && !s.UpdateNotify.Bool {
return
}
if s.notifyAfterCount > s.NotifyAfter {
for _, n := range allNotifiers {
notif := n.Select()
if notif.CanSend() {
log.Infof("Sending Failure notification to: %s!", notif.Method)
if _, err := n.OnFailure(*s, *f); err != nil {
notif.Logger().WithField("failure", f.Issue).Errorln(err)
}
s.UserNotified = true
s.SuccessNotified = true
//s.UpdateNotify.Bool
2020-03-14 03:13:20 +00:00
}
}
}
s.notifyAfterCount++
2020-03-04 10:29:00 +00:00
}
// Check will run checkHttp for HTTP services and checkTcp for TCP services
// if record param is set to true, it will add a record into the database.
2020-03-04 14:20:47 +00:00
func (s *Service) CheckService(record bool) {
switch s.Type {
2020-03-04 10:29:00 +00:00
case "http":
2020-03-04 14:20:47 +00:00
CheckHttp(s, record)
2020-03-04 10:29:00 +00:00
case "tcp", "udp":
2020-03-04 14:20:47 +00:00
CheckTcp(s, record)
case "grpc":
CheckGrpc(s, record)
2020-03-04 10:29:00 +00:00
case "icmp":
2020-03-04 14:20:47 +00:00
CheckIcmp(s, record)
2020-03-04 10:29:00 +00:00
}
2018-06-10 01:31:13 +00:00
}