cronsun/job.go

702 lines
13 KiB
Go
Raw Normal View History

2017-05-12 06:48:24 +00:00
package cronsun
2017-01-09 02:32:14 +00:00
import (
"bytes"
"encoding/json"
2017-01-19 11:03:18 +00:00
"fmt"
"os/exec"
"os/user"
2017-03-09 09:43:08 +00:00
"runtime"
2017-02-17 09:22:43 +00:00
"strconv"
2017-01-12 08:35:30 +00:00
"strings"
"sync/atomic"
"syscall"
2017-02-17 09:22:43 +00:00
"time"
2017-03-09 07:39:08 +00:00
"golang.org/x/net/context"
client "github.com/coreos/etcd/clientv3"
2017-01-19 11:03:18 +00:00
"github.com/coreos/etcd/mvcc/mvccpb"
2017-05-09 10:27:32 +00:00
"github.com/shunfei/cronsun/conf"
2017-05-12 07:38:50 +00:00
"github.com/shunfei/cronsun/log"
2017-05-09 10:27:32 +00:00
"github.com/shunfei/cronsun/node/cron"
)
2017-01-12 08:35:30 +00:00
const (
	// DefaultJobGroup is the group name given to a job whose Group is
	// empty (see Job.Check).
	DefaultJobGroup = "default"
)
// Job kinds, stored in Job.Kind.
const (
	KindCommon = iota
	KindAlone // only a single node may execute the job at any given time
	KindInterval // the job may execute once within one execution interval
)
2017-01-09 09:13:56 +00:00
// 需要执行的 cron cmd 命令
2017-01-12 08:35:30 +00:00
// 注册到 /cronsun/cmd/groupName/<id>
2017-01-09 02:32:14 +00:00
type Job struct {
ID string `json:"id"`
Name string `json:"name"`
Group string `json:"group"`
Command string `json:"cmd"`
User string `json:"user"`
Rules []*JobRule `json:"rules"`
Pause bool `json:"pause"` // 可手工控制的状态
Timeout int64 `json:"timeout"` // 任务执行时间超时设置,大于 0 时有效
// 设置任务在单个节点上可以同时允许多少个
// 针对两次任务执行间隔比任务执行时间要长的任务启用
Parallels int64 `json:"parallels"`
2017-03-17 03:20:13 +00:00
// 执行任务失败重试次数
// 默认为 0不重试
Retry int `json:"retry"`
// 执行任务失败重试时间间隔
// 单位秒,如果不大于 0 则马上重试
Interval int `json:"interval"`
// 任务类型
// 0: 普通任务
// 1: 单机任务
// 如果为单机任务node 加载任务的时候 Parallels 设置 1
Kind int `json:"kind"`
// 平均执行时间,单位 ms
AvgTime int64 `json:"avg_time"`
2017-04-05 10:12:35 +00:00
// 执行失败发送通知
FailNotify bool `json:"fail_notify"`
// 发送通知地址
2017-04-07 08:28:41 +00:00
To []string `json:"to"`
2017-02-13 01:48:02 +00:00
// 执行任务的结点,用于记录 job log
runOn string
// 用于存储分隔后的任务
cmd []string
// 控制同时执行任务数
Count *int64 `json:"-"`
2017-01-09 02:32:14 +00:00
}
type JobRule struct {
ID string `json:"id"`
2017-01-11 08:12:37 +00:00
Timer string `json:"timer"`
2017-01-09 09:13:56 +00:00
GroupIDs []string `json:"gids"`
2017-01-11 08:12:37 +00:00
NodeIDs []string `json:"nids"`
ExcludeNodeIDs []string `json:"exclude_nids"`
Schedule cron.Schedule `json:"-"`
2017-01-09 02:32:14 +00:00
}
// locker is a job lock tied to a lease (see Cmd.lock).
type locker struct {
	kind int // job kind the lock was created for
	ttl int64 // lease TTL in seconds
	lID client.LeaseID // ID of the lease granted for this lock
	timer *time.Timer // keep-alive timer, created by keepAlive
	done chan struct{} // closed by unlock to stop keepAlive
}
// keepAlive refreshes the lock's lease over and over until done is
// closed or a refresh fails. Each refresh is scheduled 500ms before the
// TTL would elapse.
func (l *locker) keepAlive() {
	interval := time.Duration(l.ttl)*time.Second - 500*time.Millisecond
	l.timer = time.NewTimer(interval)

	for {
		select {
		case <-l.timer.C:
			if _, err := DefalutClient.KeepAliveOnce(l.lID); err != nil {
				log.Warnf("lock keep alive err: %s", err.Error())
				return
			}
			l.timer.Reset(interval)
		case <-l.done:
			return
		}
	}
}
// unlock releases a KindAlone lock: it stops the keep-alive loop and
// refreshes the lease one last time. The lease is not revoked here.
// Locks of any other kind are left untouched.
func (l *locker) unlock() {
	if l.kind != KindAlone {
		return
	}

	close(l.done)
	// The timer is created inside the keepAlive goroutine, which may not
	// have been scheduled yet when a job finishes quickly; guard against
	// a nil timer instead of panicking.
	if l.timer != nil {
		l.timer.Stop()
	}
	if _, err := DefalutClient.KeepAliveOnce(l.lID); err != nil {
		log.Warnf("unlock keep alive err: %s", err.Error())
	}
}
// Cmd pairs a Job with one of its JobRules; both are embedded, so their
// fields and methods are reachable through the Cmd.
type Cmd struct {
	*Job
	*JobRule
}
// GetID returns the identifier of the command: the job ID immediately
// followed by the rule ID.
func (c *Cmd) GetID() string {
	jobID, ruleID := c.Job.ID, c.JobRule.ID
	return jobID + ruleID
}
2017-03-17 03:20:13 +00:00
func (c *Cmd) Run() {
// 同时执行任务数限制
if c.Job.limit() {
return
}
defer c.Job.unlimit()
if c.Job.Kind != KindCommon {
lk := c.lock()
if lk == nil {
return
}
defer lk.unlock()
}
2017-03-17 03:20:13 +00:00
if c.Job.Retry <= 0 {
c.Job.Run()
return
}
for i := 0; i < c.Job.Retry; i++ {
if c.Job.Run() {
return
}
if c.Job.Interval > 0 {
time.Sleep(time.Duration(c.Job.Interval) * time.Second)
}
}
}
func (j *Job) limit() bool {
if j.Parallels == 0 {
return false
}
2017-07-18 03:47:18 +00:00
count := atomic.AddInt64(j.Count, 1)
if j.Parallels < count {
atomic.AddInt64(j.Count, -1)
j.Fail(time.Now(), fmt.Sprintf("job[%s] running on[%s] running:[%d]", j.Key(), j.runOn, count))
return true
}
return false
}
// unlimit gives back the execution slot reserved by limit. It does
// nothing when Parallels is 0 (no limit in effect).
func (j *Job) unlimit() {
	if j.Parallels != 0 {
		atomic.AddInt64(j.Count, -1)
	}
}
// Init prepares the job for running on node n: it records the node and
// installs a fresh running-instance counter starting at 0.
func (j *Job) Init(n string) {
	j.Count = new(int64)
	j.runOn = n
}
// lockTtl computes the TTL, in seconds, of the lease backing the job
// lock. It is derived from the gap between the rule's next two
// activations and is capped at conf.Config.LockTtl.
//
// It returns 0 when that gap is below one second; lock treats that as
// an invalid rule.
func (c *Cmd) lockTtl() int64 {
	next := c.JobRule.Schedule.Next(time.Now())
	ttl := int64(c.JobRule.Schedule.Next(next).Sub(next) / time.Second)
	if ttl == 0 {
		return 0
	}

	if c.Job.Kind == KindInterval {
		ttl -= 2
		if ttl > conf.Config.LockTtl {
			ttl = conf.Config.LockTtl
		}
		if ttl < 1 {
			ttl = 1
		}
		return ttl
	}

	// Average execution time in whole seconds, rounded up.
	cost := c.Job.AvgTime / 1000
	if c.Job.AvgTime%1000 > 0 {
		cost++
	}

	// Shorten the TTL by the expected execution time when the interval
	// can absorb it, so the lock can be acquired again by the time of
	// the next run.
	if ttl >= cost {
		ttl -= cost
	}

	if ttl > conf.Config.LockTtl {
		ttl = conf.Config.LockTtl
	}

	// The smallest supported interval is 2s.
	if ttl < 2 {
		ttl = 2
	}

	return ttl
}
// newLock builds a locker for the job, not yet bound to any lease, with
// its TTL taken from lockTtl.
func (c *Cmd) newLock() *locker {
	lk := new(locker)
	lk.kind = c.Job.Kind
	lk.ttl = c.lockTtl()
	lk.done = make(chan struct{})
	return lk
}
func (c *Cmd) lock() *locker {
lk := c.newLock()
// 非法的 rule
if lk.ttl == 0 {
return nil
}
resp, err := DefalutClient.Grant(lk.ttl)
if err != nil {
2017-05-12 07:38:50 +00:00
log.Infof("job[%s] didn't get a lock, err: %s", c.Job.Key(), err.Error())
return nil
}
ok, err := DefalutClient.GetLock(c.Job.ID, resp.ID)
if err != nil {
2017-05-12 07:38:50 +00:00
log.Infof("job[%s] didn't get a lock, err: %s", c.Job.Key(), err.Error())
return nil
}
if !ok {
return nil
}
lk.lID = resp.ID
if lk.kind == KindAlone {
go lk.keepAlive()
}
return lk
}
// included reports whether the rule applies to node nid. The rule's own
// node list is checked first, then every group the rule references that
// is present in gs. (Group IDs let a caller decide whether a job needs
// handling when a group is updated.)
func (j *JobRule) included(nid string, gs map[string]*Group) bool {
	for _, id := range j.NodeIDs {
		if id == nid {
			return true
		}
	}

	for _, gid := range j.GroupIDs {
		g, known := gs[gid]
		if !known {
			continue
		}
		if g.Included(nid) {
			return true
		}
	}

	return false
}
// Valid validates the Timer field, parsing it into Schedule when that
// has not been done yet. It returns ErrNilRule for an empty Timer and a
// descriptive error when the expression cannot be parsed.
func (j *JobRule) Valid() error {
	// Mind the comparison of an interface value with nil.
	if j.Schedule == nil {
		if j.Timer == "" {
			return ErrNilRule
		}

		parsed, err := cron.Parse(j.Timer)
		if err != nil {
			return fmt.Errorf("invalid JobRule[%s], parse err: %s", j.Timer, err.Error())
		}
		j.Schedule = parsed
	}

	return nil
}
2017-01-16 06:30:55 +00:00
// GetJob loads the job stored under the given group and id. It is
// GetJobAndRev without the revision.
func GetJob(group, id string) (job *Job, err error) {
	job, _, err = GetJobAndRev(group, id)
	return job, err
}
func GetJobAndRev(group, id string) (job *Job, rev int64, err error) {
2017-01-16 06:30:55 +00:00
resp, err := DefalutClient.Get(JobKey(group, id))
if err != nil {
return
}
if resp.Count == 0 {
err = ErrNotFound
return
}
rev = resp.Kvs[0].ModRevision
if err = json.Unmarshal(resp.Kvs[0].Value, &job); err != nil {
return
}
job.splitCmd()
2017-01-16 06:30:55 +00:00
return
}
// DeleteJob deletes the key of the job identified by group and id.
func DeleteJob(group, id string) (resp *client.DeleteResponse, err error) {
	key := JobKey(group, id)
	return DefalutClient.Delete(key)
}
func GetJobs() (jobs map[string]*Job, err error) {
resp, err := DefalutClient.Get(conf.Config.Cmd, client.WithPrefix())
if err != nil {
return
}
count := len(resp.Kvs)
2017-02-17 06:19:35 +00:00
jobs = make(map[string]*Job, count)
if count == 0 {
return
}
for _, j := range resp.Kvs {
job := new(Job)
if e := json.Unmarshal(j.Value, job); e != nil {
log.Warnf("job[%s] umarshal err: %s", string(j.Key), e.Error())
continue
}
if err := job.Valid(); err != nil {
log.Warnf("job[%s] is invalid: %s", string(j.Key), err.Error())
2017-03-03 08:00:01 +00:00
continue
}
job.alone()
jobs[job.ID] = job
}
return
}
2017-01-19 11:03:18 +00:00
func WatchJobs() client.WatchChan {
return DefalutClient.Watch(conf.Config.Cmd, client.WithPrefix())
2017-01-19 11:03:18 +00:00
}
func GetJobFromKv(kv *mvccpb.KeyValue) (job *Job, err error) {
2017-01-19 11:03:18 +00:00
job = new(Job)
if err = json.Unmarshal(kv.Value, job); err != nil {
err = fmt.Errorf("job[%s] umarshal err: %s", string(kv.Key), err.Error())
return
2017-01-19 11:03:18 +00:00
}
err = job.Valid()
job.alone()
2017-01-19 11:03:18 +00:00
return
}
// alone forces Parallels to 1 for KindAlone jobs and leaves any other
// job untouched.
func (j *Job) alone() {
	if j.Kind != KindAlone {
		return
	}
	j.Parallels = 1
}
// splitCmd splits Command into the program name and its arguments and
// stores the result in cmd.
//
// The split is on any run of white space, so leading, trailing or
// repeated blanks and tabs never produce empty tokens. Quoting is not
// interpreted. cmd always holds at least one element: for a blank
// Command it is a single empty string, so indexing cmd[0] stays safe.
func (j *Job) splitCmd() {
	j.cmd = strings.Fields(j.Command)
	if len(j.cmd) == 0 {
		j.cmd = []string{""}
	}
}
// String returns the job encoded as JSON, or the text of the encoding
// error when the job cannot be encoded.
func (j *Job) String() string {
	encoded, err := json.Marshal(j)
	if err == nil {
		return string(encoded)
	}
	return err.Error()
}
// Run 执行任务
2017-03-17 03:20:13 +00:00
func (j *Job) Run() bool {
2017-03-09 07:39:08 +00:00
var (
cmd *exec.Cmd
proc *Process
2017-03-09 07:39:08 +00:00
sysProcAttr *syscall.SysProcAttr
)
2017-03-09 07:39:08 +00:00
t := time.Now()
2017-03-09 08:12:32 +00:00
// 用户权限控制
2017-02-16 04:00:31 +00:00
if len(j.User) > 0 {
u, err := user.Lookup(j.User)
if err != nil {
j.Fail(t, err.Error())
2017-03-17 03:20:13 +00:00
return false
2017-02-16 04:00:31 +00:00
}
uid, err := strconv.Atoi(u.Uid)
if err != nil {
2017-03-09 08:12:32 +00:00
j.Fail(t, "not support run with user on windows")
2017-03-17 03:20:13 +00:00
return false
}
2017-05-27 02:26:39 +00:00
if uid != _Uid {
gid, _ := strconv.Atoi(u.Gid)
sysProcAttr = &syscall.SysProcAttr{
Credential: &syscall.Credential{
Uid: uint32(uid),
Gid: uint32(gid),
},
}
}
2017-02-16 04:00:31 +00:00
}
2017-03-09 08:12:32 +00:00
// 超时控制
2017-03-09 07:39:08 +00:00
if j.Timeout > 0 {
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(j.Timeout)*time.Second)
defer cancel()
cmd = exec.CommandContext(ctx, j.cmd[0], j.cmd[1:]...)
} else {
cmd = exec.Command(j.cmd[0], j.cmd[1:]...)
}
cmd.SysProcAttr = sysProcAttr
var b bytes.Buffer
cmd.Stdout = &b
cmd.Stderr = &b
if err := cmd.Start(); err != nil {
j.Fail(t, fmt.Sprintf("%s\n%s", b.String(), err.Error()))
2017-03-17 03:20:13 +00:00
return false
}
proc = &Process{
ID: strconv.Itoa(cmd.Process.Pid),
JobID: j.ID,
Group: j.Group,
NodeID: j.runOn,
Time: t,
}
proc.Start()
2017-03-14 07:23:53 +00:00
defer proc.Stop()
if err := cmd.Wait(); err != nil {
j.Fail(t, fmt.Sprintf("%s\n%s", b.String(), err.Error()))
2017-03-17 03:20:13 +00:00
return false
}
j.Success(t, b.String())
2017-03-17 03:20:13 +00:00
return true
2017-01-12 08:35:30 +00:00
}
2017-03-09 09:43:08 +00:00
// RunWithRecovery runs the job and, should the run panic, logs the
// panic value with up to 64 KiB of the current goroutine's stack
// instead of letting the panic propagate.
func (j *Job) RunWithRecovery() {
	defer func() {
		r := recover()
		if r == nil {
			return
		}

		const size = 64 << 10
		stack := make([]byte, size)
		n := runtime.Stack(stack, false)
		log.Warnf("panic running job: %v\n%s", r, stack[:n])
	}()

	j.Run()
}
2017-02-17 13:29:04 +00:00
// GetIDFromKey extracts the ID from an etcd key: everything after the
// last "/". It returns "" when the key contains no "/".
func GetIDFromKey(key string) string {
	if slash := strings.LastIndex(key, "/"); slash >= 0 {
		return key[slash+1:]
	}
	return ""
}
2017-01-16 06:30:55 +00:00
// JobKey builds the key of a job: conf.Config.Cmd followed by the
// group, a "/" and the job id.
func JobKey(group, id string) string {
	prefix := conf.Config.Cmd + group
	return prefix + "/" + id
}
2017-01-12 08:35:30 +00:00
func (j *Job) Key() string {
2017-01-16 06:30:55 +00:00
return JobKey(j.Group, j.ID)
2017-01-12 08:35:30 +00:00
}
func (j *Job) Check() error {
j.ID = strings.TrimSpace(j.ID)
if !IsValidAsKeyPath(j.ID) {
return ErrIllegalJobId
}
2017-01-12 08:35:30 +00:00
j.Name = strings.TrimSpace(j.Name)
if len(j.Name) == 0 {
return ErrEmptyJobName
}
j.Group = strings.TrimSpace(j.Group)
if len(j.Group) == 0 {
j.Group = DefaultJobGroup
}
if !IsValidAsKeyPath(j.Group) {
return ErrIllegalJobGroupName
}
2017-02-16 06:42:28 +00:00
j.User = strings.TrimSpace(j.User)
2017-02-17 09:22:43 +00:00
for i := range j.Rules {
id := strings.TrimSpace(j.Rules[i].ID)
2017-02-17 09:22:43 +00:00
if id == "" || strings.HasPrefix(id, "NEW") {
j.Rules[i].ID = NextID()
2017-02-17 09:22:43 +00:00
}
}
2017-01-12 08:35:30 +00:00
// 不修改 Command 的内容,简单判断是否为空
if len(strings.TrimSpace(j.Command)) == 0 {
return ErrEmptyJobCommand
}
return j.Valid()
}
// 执行结果写入 mongoDB
2017-02-16 07:05:59 +00:00
func (j *Job) Success(t time.Time, out string) {
CreateJobLog(j, t, out, true)
}
2017-02-16 07:05:59 +00:00
func (j *Job) Fail(t time.Time, msg string) {
j.Notify(t, msg)
2017-02-16 07:05:59 +00:00
CreateJobLog(j, t, msg, false)
}
func (j *Job) Notify(t time.Time, msg string) {
if !conf.Config.Mail.Enable || !j.FailNotify {
return
}
ts := t.Format(time.RFC3339)
body := "job: " + j.Key() + "\n" +
"job name: " + j.Name + "\n" +
"job cmd: " + j.Command + "\n" +
"node: " + j.runOn + "\n" +
"time: " + ts + "\n" +
"err: " + msg
m := Message{
Subject: "node[" + j.runOn + "] job[" + j.ShortName() + "] time[" + ts + "] exec failed",
Body: body,
2017-04-14 07:33:14 +00:00
To: j.To,
}
data, err := json.Marshal(m)
if err != nil {
log.Warnf("job[%s] send notice fail, err: %s", j.Key(), err.Error())
return
}
_, err = DefalutClient.Put(conf.Config.Noticer+"/"+j.runOn, string(data))
if err != nil {
log.Warnf("job[%s] send notice fail, err: %s", j.Key(), err.Error())
return
}
}
// Avg folds the duration of a run that started at t and ended at et
// into AvgTime (milliseconds). The first sample is stored as is; every
// later one is averaged with the current value.
func (j *Job) Avg(t, et time.Time) {
	elapsed := int64(et.Sub(t) / time.Millisecond)
	if j.AvgTime != 0 {
		elapsed = (j.AvgTime + elapsed) / 2
	}
	j.AvgTime = elapsed
}
// Cmds returns the commands of the job that apply to node nid, keyed by
// Cmd.GetID. One command is produced for each rule that includes the
// node (see JobRule.included) and does not list it in ExcludeNodeIDs.
// A paused job yields an empty map.
//
// An exclusion only affects the rule that declares it: the node may
// still run the job through another rule.
func (j *Job) Cmds(nid string, gs map[string]*Group) (cmds map[string]*Cmd) {
	cmds = make(map[string]*Cmd)
	if j.Pause {
		return
	}

rules:
	for _, r := range j.Rules {
		for _, id := range r.ExcludeNodeIDs {
			if nid == id {
				// Skip the whole rule, not just this exclude entry.
				continue rules
			}
		}

		if r.included(nid, gs) {
			cmd := &Cmd{
				Job:     j,
				JobRule: r,
			}
			cmds[cmd.GetID()] = cmd
		}
	}

	return
}
2017-03-03 08:00:01 +00:00
2017-03-09 09:43:08 +00:00
// IsRunOn reports whether the job runs on node nid: at least one rule
// must include the node (see JobRule.included) without listing it in
// its ExcludeNodeIDs.
func (j Job) IsRunOn(nid string, gs map[string]*Group) bool {
rules:
	for _, r := range j.Rules {
		for _, id := range r.ExcludeNodeIDs {
			if nid == id {
				// The node is excluded from this rule; try the next rule.
				continue rules
			}
		}

		if r.included(nid, gs) {
			return true
		}
	}

	return false
}
2017-03-03 08:00:01 +00:00
// 安全选项验证
func (j *Job) Valid() error {
2017-03-03 08:00:01 +00:00
if len(j.cmd) == 0 {
j.splitCmd()
}
if err := j.ValidRules(); err != nil {
return err
}
2017-03-03 08:00:01 +00:00
security := conf.Config.Security
if !security.Open {
return nil
}
if !j.validUser() {
return ErrSecurityInvalidUser
2017-03-03 08:00:01 +00:00
}
if !j.validCmd() {
return ErrSecurityInvalidCmd
}
return nil
2017-03-03 08:00:01 +00:00
}
// validUser reports whether the job's user is allowed: either the
// configured user list is empty or it contains User.
func (j *Job) validUser() bool {
	allowed := conf.Config.Security.Users
	if len(allowed) == 0 {
		return true
	}

	for i := range allowed {
		if allowed[i] == j.User {
			return true
		}
	}
	return false
}
// validCmd reports whether the job's program is allowed: either the
// configured extension list is empty or the program name (cmd[0]) ends
// with one of the listed extensions.
func (j *Job) validCmd() bool {
	exts := conf.Config.Security.Ext
	if len(exts) == 0 {
		return true
	}

	program := j.cmd[0]
	for i := range exts {
		if strings.HasSuffix(program, exts[i]) {
			return true
		}
	}
	return false
}
// ValidRules validates every rule of the job in order and returns the
// first error found, or nil when all rules are valid.
func (j *Job) ValidRules() error {
	for i := range j.Rules {
		if err := j.Rules[i].Valid(); err != nil {
			return err
		}
	}
	return nil
}
// ShortName returns the job name shortened for display: names of at
// most 10 characters are returned unchanged, longer ones are cut to
// their first 10 characters followed by "...".
func (j *Job) ShortName() string {
	const maxRunes = 10

	// A name of at most maxRunes bytes cannot exceed maxRunes characters.
	if len(j.Name) <= maxRunes {
		return j.Name
	}

	runes := []rune(j.Name)
	if len(runes) > maxRunes {
		return string(runes[:maxRunes]) + "..."
	}
	return j.Name
}