diff --git a/doc/code-docs/locales/en/LC_MESSAGES/monitor.po b/doc/code-docs/locales/en/LC_MESSAGES/monitor.po index 0108368..c9d3045 100644 --- a/doc/code-docs/locales/en/LC_MESSAGES/monitor.po +++ b/doc/code-docs/locales/en/LC_MESSAGES/monitor.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: InternLM \n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2023-09-07 10:56+0800\n" +"POT-Creation-Date: 2023-09-25 13:44+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language: en\n" @@ -19,180 +19,280 @@ msgstr "" "Content-Transfer-Encoding: 8bit\n" "Generated-By: Babel 2.12.1\n" -#: ../../source/monitor.rst:2 f95ef3bff8574c77a28ca2f6212cc4b8 +#: ../../source/monitor.rst:2 msgid "监控和告警" msgstr "Monitor and Alert" -#: ../../source/monitor.rst:5 959bd4a6061f4483875c7950ab4546cf +#: ../../source/monitor.rst:5 msgid "监控" msgstr "Monitoring" -#: ../../source/monitor.rst:7 6071bc878d894865b73380cb887847c1 +#: ../../source/monitor.rst:7 msgid "" "InternLM 使用 ``internlm.monitor.monitor.initialize_monitor_manager()`` " "来初始化上下文监控管理。其中,一个实例化的单例对象 ``internlm.monitor.monitor.MonitorManager`` " "将管理监控线程并使用 ``internlm.monitor.monitor.MonitorTracker`` 来跟踪模型训练生命周期和训练状态。" msgstr "" -"InternLM uses ``internlm.monitor.monitor.initialize_monitor_manager()`` to initialize context monitor. During this time, " -"a singleton ``internlm.monitor.monitor.MonitorManager`` will manage monitoring thread and track training status " -"with ``internlm.monitor.monitor.MonitorTracker``." +"InternLM uses ``internlm.monitor.monitor.initialize_monitor_manager()`` " +"to initialize context monitor. During this time, a singleton " +"``internlm.monitor.monitor.MonitorManager`` will manage monitoring thread" +" and track training status with " +"``internlm.monitor.monitor.MonitorTracker``." 
-#: 9256a063b6dd449786f29e03ce085176 #: internlm.monitor.monitor.initialize_monitor_manager:1 of msgid "" "Initialize monitor manager for monitoring training lifetime and alerting " "exception info to Feishu." msgstr "" -#: 138340fca72a4226be901f7f16c8a590 904b7938fdea46bf81c1ef738aa7bfae -#: 9ed2a7b4af2243b289e72b2751aec902 aa0dd0dc6bee4a5bb15cc9705f7c13ee +#: internlm.monitor.alert.initialize_light_monitor #: internlm.monitor.alert.send_feishu_msg_with_webhook +#: internlm.monitor.alert.send_heartbeat #: internlm.monitor.monitor.MonitorManager.start_monitor #: internlm.monitor.monitor.MonitorTracker #: internlm.monitor.monitor.initialize_monitor_manager of msgid "参数" msgstr "" -#: 3b302339e1d143b6b1d782ff59c9396d 6a06f053828b4c80aef56970750e2085 #: internlm.monitor.monitor.MonitorManager.start_monitor:3 #: internlm.monitor.monitor.initialize_monitor_manager:3 of msgid "The training job name." msgstr "" -#: 3330d06145ee4d35b0b3632e799a35b3 c105473f2f6a4f838a9f0d098762d698 #: internlm.monitor.monitor.MonitorManager.start_monitor:5 #: internlm.monitor.monitor.initialize_monitor_manager:5 of msgid "The Feishu webhook address for sending alert messages." msgstr "" -#: 774c6ff82a2e452295a1a7dcabaded3d internlm.monitor.monitor.MonitorManager:1 -#: of +#: internlm.monitor.monitor.MonitorManager:1 of msgid "" "Monitor Manager for managing monitor thread and monitoring training " "status." msgstr "" -#: 72e696c0ce8f41ea8c7947d35cf322f0 #: internlm.monitor.monitor.MonitorManager.monitor_loss_spike:1 of msgid "Check loss value, if loss spike occurs, send alert message to Feishu." msgstr "" -#: 2b668b057fa84e8b92c65bfd49bfb3e9 #: internlm.monitor.monitor.MonitorManager.monitor_exception:1 of msgid "Catch and format exception information, send alert message to Feishu." msgstr "" -#: 9852b7143026476d89e1a175223e6d79 #: internlm.monitor.monitor.MonitorManager.handle_sigterm:1 of msgid "Catch SIGTERM signal, and send alert message to Feishu." 
msgstr "" -#: 2e3827bad7b1445fb0d9a7c5a28def5d #: internlm.monitor.monitor.MonitorManager.start_monitor:1 of msgid "" "Initialize and start monitor thread for checking training job status, " "loss spike and so on." msgstr "" -#: 271cc3e1b0834a7ba6a1ba4d5cce0ef1 #: internlm.monitor.monitor.MonitorManager.start_monitor:7 of msgid "The time of monitor interval in seconds, defaults to 300." msgstr "" -#: e4a06091fce8401b83e31ce26c8075a0 #: internlm.monitor.monitor.MonitorManager.start_monitor:9 of msgid "" "The limit multiple of current loss to previous loss value, which means " "loss spike may be occurs, defaults to 1.5." msgstr "" -#: 28bde748477e41f39fa6ca3e1855923d #: internlm.monitor.monitor.MonitorManager.stop_monitor:1 of msgid "Stop the monitor and alert thread." msgstr "" -#: ffb3dda227664748bdb326b6630bc827 internlm.monitor.monitor.MonitorTracker:1 -#: of +#: internlm.monitor.monitor.MonitorTracker:1 of msgid "Track job status and alert to Feishu during job training." msgstr "" -#: a1e93683cbb04d8ab825e2776e76efa7 internlm.monitor.monitor.MonitorTracker:3 -#: of +#: internlm.monitor.monitor.MonitorTracker:3 of msgid "The Feishu webhook address for sending alerting messages." msgstr "" -#: 7913eeecc0904c128046e80cec1553f2 internlm.monitor.monitor.MonitorTracker:5 -#: of +#: internlm.monitor.monitor.MonitorTracker:5 of msgid "The interval in seconds for monitoring checks. Defaults to 300." msgstr "" -#: 8d1abc3067584866983139dd3d85c59c internlm.monitor.monitor.MonitorTracker:7 -#: of +#: internlm.monitor.monitor.MonitorTracker:7 of msgid "The threshold for detecting loss value spikes. Defaults to 1.5." msgstr "" -#: a0416fd68700450793daa2167f776618 #: internlm.monitor.monitor.MonitorTracker.run:1 of msgid "start the monitor tracker." msgstr "" -#: f55eb990c07b4e8f9388236dd60f0017 #: internlm.monitor.monitor.MonitorTracker.stop:1 of msgid "Stop the monitor tracker." 
msgstr "" -#: ../../source/monitor.rst:18 2202bc091aab417097a1b0268dfe6785 +#: ../../source/monitor.rst:18 msgid "告警" msgstr "Alerting" -#: ../../source/monitor.rst:20 69334f83e644455aa619dde70b8ed1f2 +#: ../../source/monitor.rst:20 msgid "" "InternLM 监控线程会周期性地检查模型训练过程中是否出现 loss spike、潜在的 training stuck、运行时异常等,并捕获 " "SIGTERM 异常信号。当出现上述情况时,将触发警报,并通过调用 " "``internlm.monitor.alert.send_feishu_msg_with_webhook()`` 向飞书的 Webhook " "地址发送报警消息。" msgstr "" -"InternLM monitor thread periodically tracks loss spike, potential stuck condition, runtime exception, and SIGTERM signal. " -"When above situation occurs, an alert will be triggered and a message will be sent to the Feishu webhook address by calling " +"InternLM monitor thread periodically tracks loss spike, potential stuck " +"condition, runtime exception, and SIGTERM signal. When above situation " +"occurs, an alert will be triggered and a message will be sent to the " +"Feishu webhook address by calling " "``internlm.monitor.alert.send_feishu_msg_with_webhook()``." -#: 15980526c2fa4ed8befa1604f271a3f1 #: internlm.monitor.alert.send_feishu_msg_with_webhook:1 of msgid "Use Feishu robot to send messages with the given webhook." msgstr "" -#: 38e5738c2b914c8096e1a0f345e6c0b4 #: internlm.monitor.alert.send_feishu_msg_with_webhook:3 of msgid "The webhook to be used to send message." msgstr "" -#: 4984f1a3bb0d46b48b2aad4fba8b43d9 #: internlm.monitor.alert.send_feishu_msg_with_webhook:5 of msgid "The message title." msgstr "" -#: a9822a4cf30d4947b12f70a0efe62a5e #: internlm.monitor.alert.send_feishu_msg_with_webhook:7 of msgid "The message body." msgstr "" -#: 57d9ab65fe9f45c28351839fecf2f31e #: internlm.monitor.alert.send_feishu_msg_with_webhook of msgid "返回" msgstr "" -#: 2b6ac97fd152498183a8624a9087812b #: internlm.monitor.alert.send_feishu_msg_with_webhook:10 of msgid "The response from the request. Or catch the exception and return None." 
msgstr "" -#: ec45dedf976046eb909f5b7f79a7d44c +#: internlm.monitor.alert.initialize_light_monitor #: internlm.monitor.alert.send_feishu_msg_with_webhook of msgid "抛出" msgstr "" -#: 4c6aeec19a6041cfbfa577b1c5a85ac1 #: internlm.monitor.alert.send_feishu_msg_with_webhook:12 of msgid "An exception rasied by the HTTP post request." msgstr "" +#: ../../source/monitor.rst:25 +msgid "轻量监控" +msgstr "Lightweight Monitoring" + +#: ../../source/monitor.rst:27 +msgid "" +"InternLM轻量级监控工具采用心跳机制实时监测训练过程中的各项指标,如loss、grad_norm、训练阶段的耗时等。同时,InternLM还可以通过" +" `grafana dashboard `_ " +"直观地呈现这些指标信息,以便用户进行更加全面和深入的训练分析。" +msgstr "" +"The InternLM lightweight monitoring tool employs a heartbeat mechanism to " +"monitor various metrics in real time during the training process, such as loss, " +"grad_norm, and training phase duration. Additionally, InternLM can " +"present these metric details through a `grafana dashboard " +"`_, allowing users to conduct " +"more comprehensive and in-depth training analysis in an intuitive manner." + +#: ../../source/monitor.rst:29 +msgid "" +"轻量监控的配置由配置文件中的 ``monitor`` 字段指定, 用户可以通过修改配置文件 `config file " +"`_ " +"来更改监控配置。以下是一个监控配置的示例:" +msgstr "" +"The configuration for lightweight monitoring is specified by the ``monitor`` " +"field in the configuration file. Users can modify monitoring settings by " +"editing the configuration file `config file " +"`_. " +"Here is an example of a monitoring configuration:" + +#: ../../source/monitor.rst:41 +msgid "enable_feishu_alert (bool):是否启用飞书告警。默认值:False。" +msgstr "enable_feishu_alert (bool): Whether to enable Feishu alerts. Defaults to False." + +#: ../../source/monitor.rst:42 +msgid "feishu_alert_address (str):飞书告警的 Webhook 地址。默认值:None。" +msgstr "feishu_alert_address (str): The webhook address for Feishu alerts. Defaults to None." + +#: ../../source/monitor.rst:43 +msgid "light_monitor_address (str):轻量监控的地址。默认值:None。" +msgstr "light_monitor_address (str): The address for lightweight monitoring. Defaults to None."
+ +#: ../../source/monitor.rst:45 +msgid "" +"InternLM 使用 ``internlm.monitor.alert.initialize_light_monitor`` " +"来初始化轻量监控客户端。一旦初始化完成,它会建立与监控服务器的连接。在训练过程中,使用 " +"``internlm.monitor.alert.send_heartbeat`` " +"来发送不同类型的心跳信息至监控服务器。监控服务器会根据这些心跳信息来检测训练是否出现异常,并在需要时发送警报消息。" +msgstr "" +"InternLM uses ``internlm.monitor.alert.initialize_light_monitor`` to " +"initialize the lightweight monitoring client. Once initialization is " +"complete, it establishes a connection with the monitoring server. During " +"the training process, it uses ``internlm.monitor.alert.send_heartbeat`` " +"to send various types of heartbeat messages to the monitoring server. The" +" monitoring server uses these heartbeat messages to detect if the " +"training encounters any abnormalities and sends alert messages as needed." + +#: internlm.monitor.alert.initialize_light_monitor:1 of +msgid "Initialize the lightweight monitoring module." +msgstr "" + +#: internlm.monitor.alert.initialize_light_monitor:3 of +msgid "The address of the monitor. Defaults to 'MONITOR_SERVER' environment." +msgstr "" + +#: internlm.monitor.alert.initialize_light_monitor:6 of +msgid "" +"If any exceptions occur during initialization, they will be caught and " +"logged as warnings." +msgstr "" + +#: internlm.monitor.alert.initialize_light_monitor:9 +#: internlm.monitor.alert.send_heartbeat:9 of +msgid "示例" +msgstr "Example" + +#: internlm.monitor.alert.initialize_light_monitor:10 of +msgid "" +"Initialize the monitoring module with the default address " +"``initialize_light_monitor()``" +msgstr "" + +#: internlm.monitor.alert.send_heartbeat:1 of +msgid "Send a heartbeat message to a monitoring server." +msgstr "" + +#: internlm.monitor.alert.send_heartbeat:3 of +msgid "" +"The type of heartbeat message, e.g., \"train_metrics\", \"init_time\", " +"\"stage_time\"." +msgstr "" + +#: internlm.monitor.alert.send_heartbeat:5 of +msgid "A dictionary containing message data to be included in the heartbeat." 
+msgstr "" + +#: internlm.monitor.alert.send_heartbeat:10 of +#, fuzzy +msgid "" +"Sending a heartbeat message for training metrics " +"``send_heartbeat(\"train_metrics\", {\"loss\": 0.1, \"accuracy\": " +"0.95})``" +msgstr "" + +#: internlm.monitor.alert.send_heartbeat:13 of +msgid "" +"Sending a heartbeat message for initialization time " +"``send_heartbeat(\"init_time\", {\"import_time\": 0.25})``" +msgstr "" + +#: internlm.monitor.alert.send_heartbeat:16 of +msgid "" +"Sending a heartbeat message for stage time " +"``send_heartbeat(\"stage_time\", {\"fwd_time\": 2.3, \"bwd_time\": " +"6.2})``" +msgstr "" + +#~ msgid "" +#~ "InternLM轻量监控基于心跳机制来监控训练过程中是否出现 " +#~ "loss、grad_norm异常、训练各阶段时间超时等异常,并通过dashboard展示训练指标信息等。" +#~ msgstr "" diff --git a/doc/code-docs/source/monitor.rst b/doc/code-docs/source/monitor.rst index de150fd..b3c684c 100644 --- a/doc/code-docs/source/monitor.rst +++ b/doc/code-docs/source/monitor.rst @@ -20,3 +20,30 @@ InternLM 使用 ``internlm.monitor.monitor.initialize_monitor_manager()`` 来初 InternLM 监控线程会周期性地检查模型训练过程中是否出现 loss spike、潜在的 training stuck、运行时异常等,并捕获 SIGTERM 异常信号。当出现上述情况时,将触发警报,并通过调用 ``internlm.monitor.alert.send_feishu_msg_with_webhook()`` 向飞书的 Webhook 地址发送报警消息。 .. autofunction:: internlm.monitor.alert.send_feishu_msg_with_webhook + +轻量监控 +----------------- + +InternLM轻量级监控工具采用心跳机制实时监测训练过程中的各项指标,如loss、grad_norm、训练阶段的耗时等。同时,InternLM还可以通过 `grafana dashboard `_ 直观地呈现这些指标信息,以便用户进行更加全面和深入的训练分析。 + +轻量监控的配置由配置文件中的 ``monitor`` 字段指定, 用户可以通过修改配置文件 `config file `_ 来更改监控配置。以下是一个监控配置的示例: + +.. 
code-block:: python + + monitor = dict( + alert=dict( + enable_feishu_alert=False, + feishu_alert_address=None, + light_monitor_address=None, + ), + ) + +- enable_feishu_alert (bool):是否启用飞书告警。默认值:False。 +- feishu_alert_address (str):飞书告警的 Webhook 地址。默认值:None。 +- light_monitor_address (str):轻量监控的地址。默认值:None。 + +InternLM 使用 ``internlm.monitor.alert.initialize_light_monitor`` 来初始化轻量监控客户端。一旦初始化完成,它会建立与监控服务器的连接。在训练过程中,使用 ``internlm.monitor.alert.send_heartbeat`` 来发送不同类型的心跳信息至监控服务器。监控服务器会根据这些心跳信息来检测训练是否出现异常,并在需要时发送警报消息。 + +.. autofunction:: internlm.monitor.alert.initialize_light_monitor + +.. autofunction:: internlm.monitor.alert.send_heartbeat diff --git a/internlm/monitor/alert.py b/internlm/monitor/alert.py index 1772e7f..e04aa0c 100644 --- a/internlm/monitor/alert.py +++ b/internlm/monitor/alert.py @@ -13,6 +13,19 @@ logger = get_logger(__file__) def initialize_light_monitor(monitor_address: str = None): + """ + Initialize the lightweight monitoring module. + + Args: + monitor_address (str, optional): The address of the monitor. Defaults to 'MONITOR_SERVER' environment. + + Raises: + Exception: If any exceptions occur during initialization, they will be caught and logged as warnings. + + Example: + Initialize the monitoring module with the default address + ``initialize_light_monitor()`` + """ try: from uniscale_monitoring import init_monitor @@ -22,6 +35,24 @@ def initialize_light_monitor(monitor_address: str = None): def send_heartbeat(msg_type: str, msg: Dict): + """ + Send a heartbeat message to a monitoring server. + + Args: + msg_type (str): The type of heartbeat message, e.g., "train_metrics", "init_time", "stage_time". + msg (Dict): A dictionary containing message data to be included in the heartbeat. 
+ + Example: + Sending a heartbeat message for training metrics + ``send_heartbeat("train_metrics", {"loss": 0.1, "accuracy": 0.95})`` + + Sending a heartbeat message for initialization time + ``send_heartbeat("init_time", {"import_time": 0.25})`` + + Sending a heartbeat message for stage time + ``send_heartbeat("stage_time", {"fwd_time": 2.3, "bwd_time": 6.2})`` + """ + def nan2none(v): if isinstance(v, float) and math.isnan(v): return None