diff --git a/configs/prod.yml b/configs/prod.yml index bb17b7cb..ab674342 100644 --- a/configs/prod.yml +++ b/configs/prod.yml @@ -3,7 +3,7 @@ admin_password: admin admin_user: superadmin allow_reports: false base_path: "" -cleanup_interval: 12h0m0s +cleanup_interval: 24h0m0s cmd_file: /bin/bash db_conn: postgres db_database: ${DATABASE_NAME} @@ -22,7 +22,7 @@ max_life_conn: 180 max_open_conn: 25 name: Razorpay Status Page postgres_sslmode: require -remove_after: 2160h0m0s +remove_after: 720h0m0s sample_data: true sass: /usr/local/bin/sass use_assets: true diff --git a/handlers/services.go b/handlers/services.go index d4364d61..0b69ba4c 100644 --- a/handlers/services.go +++ b/handlers/services.go @@ -19,6 +19,13 @@ type serviceOrder struct { Order int `json:"order"` } +var ( + zeroTime time.Time + zeroBool bool + zeroInt int + zeroInt64 int64 +) + func findService(r *http.Request) (*services.Service, error) { vars := mux.Vars(r) id := utils.ToInt(vars["id"]) @@ -132,19 +139,31 @@ func apiServicePatchHandler(w http.ResponseWriter, r *http.Request) { func apiServiceUpdateHandler(w http.ResponseWriter, r *http.Request) { service, err := findService(r) + if err != nil { sendErrorJson(err, w, r) return } - if err := DecodeJSON(r, &service); err != nil { + + s2 := *service + + s2.SubServicesDetails = map[int64]services.SubService{} + + if err := DecodeJSON(r, &s2); err != nil { sendErrorJson(err, w, r) return } - if err := service.Update(); err != nil { + + s2.LastProcessingTime = zeroTime + s2.Online = zeroBool + s2.FailureCounter = zeroInt + s2.CurrentDowntime = zeroInt64 + + if err := s2.Update(); err != nil { sendErrorJson(err, w, r) return } - go service.CheckService(true) + go s2.CheckService(true) sendJsonAction(service, "update", w, r) } diff --git a/types/services/database.go b/types/services/database.go index 29e5435a..585db3da 100644 --- a/types/services/database.go +++ b/types/services/database.go @@ -80,6 +80,12 @@ func Find(id int64) (*Service, error) { return srv, res.Error() } +func FindFirstFromDB(id int64) (*Service, error) { + var srv = Service{} + res := db.First(&srv, id) + return &srv, res.Error() +} + func FindOne(id int64) (*Service, error) { srv := allServices[id] if srv == nil { diff --git a/types/services/routine.go b/types/services/routine.go index daeb4895..0821d909 100644 --- a/types/services/routine.go +++ b/types/services/routine.go @@ -434,36 +434,40 @@ func CheckCollection(s *Service, record bool) (*Service, error) { timer := prometheus.NewTimer(metrics.ServiceTimer(s.Name)) defer timer.ObserveDuration() + hcStartTime := time.Now() + combinedStatus := STATUS_UP var impactedSubService SubService - var latency, pingtime int64 downCount := 0 for id, subServiceDetail := range s.SubServicesDetails { - if subService, err := FindOne(id); err != nil { + if subService, err := FindFirstFromDB(id); err != nil { log.Errorf("[Ignored]Failed to find Sub Service : %s %s %s %s", s.Id, s.Name, id, subServiceDetail.DisplayName) continue } else { - hit := subService.LastHit() - failure := subService.LastFailure() - pingtime = hit.PingTime - if failure.CreatedAt.After(hit.CreatedAt) { - pingtime = failure.PingTime + if !subService.Online && subService.CurrentDowntime > 0 { + downtimeType := STATUS_DOWN + if d, de := downtimes.Find(subService.CurrentDowntime); de != nil { + log.Errorf("[Ignored]Failed to find Sub Service Downtime : %s %s %s %s", s.Id, s.Name, id, subServiceDetail.DisplayName) + continue + } else { + downtimeType = d.SubStatus + } + if combinedStatus != STATUS_DOWN { switch subServiceDetail.DependencyType { case CRITICAL: - combinedStatus = HandleEmptyStatus(failure.Type) + combinedStatus = HandleEmptyStatus(downtimeType) impactedSubService = subServiceDetail case DELAYED, PARTIAL: combinedStatus = STATUS_DEGRADED - if failure.Type == STATUS_DOWN { + if downtimeType == STATUS_DOWN { downCount++ } impactedSubService = subServiceDetail } } } - latency += pingtime } } @@ -471,8 +475,8 @@ func CheckCollection(s *Service, record bool) (*Service, error) { combinedStatus = STATUS_DOWN } - s.Latency = latency - s.PingTime = latency + s.Latency = time.Now().Sub(hcStartTime).Milliseconds() + s.PingTime = time.Now().Sub(hcStartTime).Milliseconds() s.LastFailureType = combinedStatus if combinedStatus == STATUS_DOWN || combinedStatus == STATUS_DEGRADED { if record { @@ -571,7 +575,7 @@ func (s *Service) CheckService(record bool) (err error) { func (s *Service) HandleDowntime(err error, record bool) { if err != nil { s.FailureCounter++ - if s.FailureCounter >= s.GetFtc() { + if s.FailureCounter >= s.GetFtc() || s.Type == "collection" { s.Online = false @@ -591,7 +595,15 @@ func (s *Service) HandleDowntime(err error, record bool) { } downtime.End = time.Now() - downtime.SubStatus = ApplyStatus(downtime.SubStatus, HandleEmptyStatus(s.LastFailureType), STATUS_DEGRADED) + newStatus := HandleEmptyStatus(s.LastFailureType) + + if downtime.SubStatus != "" && downtime.SubStatus != newStatus { + downtime.Id = 0 + downtime.Start = time.Now().Add(time.Duration(-s.Interval) * (time.Second)) + } + + downtime.SubStatus = newStatus + downtime.Failures = s.FailureCounter if downtime.Id > 0 { diff --git a/types/services/struct.go b/types/services/struct.go index 1b13e067..879a5ed4 100644 --- a/types/services/struct.go +++ b/types/services/struct.go @@ -67,7 +67,7 @@ type Service struct { Incidents []*incidents.Incident `gorm:"foreignkey:service;association_foreignkey:id" json:"incidents,omitempty" yaml:"incidents"` Checkins []*checkins.Checkin `gorm:"foreignkey:service;association_foreignkey:id" json:"checkins,omitempty" yaml:"-" scope:"user,admin"` Failures []*failures.Failure `gorm:"-" json:"failures,omitempty" yaml:"-" scope:"user,admin"` - LastProcessingTime time.Time `gorm:"column:last_processing_time" json:"last_processing_time"` + LastProcessingTime time.Time `gorm:"column:last_processing_time" json:"-"` notifyAfterCount int64 `gorm:"column:notify_after_count" yaml:"-"` prevOnline bool `gorm:"column:prev_online" yaml:"-"`