diff --git a/applications/ColossalChat/coati/dataset/__init__.py b/applications/ColossalChat/coati/dataset/__init__.py
index 78bd46359..f36bb05e5 100755
--- a/applications/ColossalChat/coati/dataset/__init__.py
+++ b/applications/ColossalChat/coati/dataset/__init__.py
@@ -7,7 +7,7 @@ from .loader import (
     StatefulDistributedSampler,
     load_tokenized_dataset,
 )
-from .tokenization_utils import tokenize_kto, tokenize_prompt, tokenize_rlhf, tokenize_sft, tokenize_process_reward
+from .tokenization_utils import tokenize_kto, tokenize_process_reward, tokenize_prompt, tokenize_rlhf, tokenize_sft
 
 __all__ = [
     "tokenize_prompt",
@@ -23,5 +23,5 @@ __all__ = [
     "tokenize_kto",
     "setup_conversation_template",
     "Conversation",
-    "tokenize_process_reward"
+    "tokenize_process_reward",
 ]
diff --git a/applications/ColossalChat/coati/dataset/conversation.py b/applications/ColossalChat/coati/dataset/conversation.py
index f66deb885..0eb735286 100755
--- a/applications/ColossalChat/coati/dataset/conversation.py
+++ b/applications/ColossalChat/coati/dataset/conversation.py
@@ -3,7 +3,6 @@ import os
 from dataclasses import dataclass, field
 from typing import Any, Dict, List
 
-import torch.distributed as dist
 from transformers import AutoTokenizer, PreTrainedTokenizer
 
 from colossalai.logging import get_dist_logger
diff --git a/applications/ColossalChat/examples/data_preparation_scripts/prepare_dataset.py b/applications/ColossalChat/examples/data_preparation_scripts/prepare_dataset.py
index ede6fa531..5eeeadbac 100644
--- a/applications/ColossalChat/examples/data_preparation_scripts/prepare_dataset.py
+++ b/applications/ColossalChat/examples/data_preparation_scripts/prepare_dataset.py
@@ -12,7 +12,14 @@ import random
 import time
 from multiprocessing import cpu_count
 
-from coati.dataset import setup_conversation_template, tokenize_kto, tokenize_prompt, tokenize_rlhf, tokenize_sft, tokenize_process_reward
+from coati.dataset import (
+    setup_conversation_template,
+    tokenize_kto,
+    tokenize_process_reward,
+    tokenize_prompt,
+    tokenize_rlhf,
+    tokenize_sft,
+)
 from datasets import dataset_dict, load_dataset
 from transformers import AutoTokenizer
 
@@ -28,7 +35,7 @@ def main():
         type=str,
         required=True,
         default=None,
-        choices=["sft", "prompt", "preference", "kto", 'prm'],
+        choices=["sft", "prompt", "preference", "kto", "prm"],
         help="Type of dataset, chose from 'sft', 'prompt', 'preference'. 'kto'",
     )
     parser.add_argument(