ColossalAI/applications/ColossalChat/tests/verify_chat_data.py

import argparse
import json

def read_jsonl(path):
    """Load a JSON Lines file and return its decoded records as a list.

    Whitespace-only lines (for example a trailing blank line) are skipped
    rather than passed to ``json.loads``, which would reject them.

    Args:
        path: Path of the UTF-8 encoded file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.
    """
    with open(path, "r", encoding="utf8") as f:
        return [json.loads(line) for line in f if line.strip()]


def verify_sft(sample, tokenized):
    """Check the decoded SFT labels of one record against its raw sample.

    Every stripped message whose ``from`` is ``"assistant"`` must be a
    substring of at least one entry of ``tokenized["labels_decode"]``, and no
    stripped message whose ``from`` is ``"human"`` may be a substring of any
    entry.

    Args:
        sample: Raw record containing a ``"messages"`` list of dicts with
            ``"from"`` and ``"content"`` keys.
        tokenized: Record to verify, containing ``"labels_decode"``.

    Raises:
        AssertionError: If an assistant message is missing from the decoded
            labels or a human message is present in them.
    """
    messages = sample["messages"]
    expected = [m["content"].strip() for m in messages if m["from"] == "assistant"]
    forbidden = [m["content"].strip() for m in messages if m["from"] == "human"]
    decoded = tokenized["labels_decode"]

    # Explicit raises (not ``assert``) so the checks still run under ``python -O``.
    for label in expected:
        if not any(label in s for s in decoded):
            raise AssertionError(f"Label {label} not in target label {decoded}")
    for label in forbidden:
        if any(label in s for s in decoded):
            raise AssertionError(f"Negative label {label} in target label {decoded}")


def verify_dpo(sample, tokenized):
    """Check the decoded DPO labels of one record against its raw sample.

    The stripped content of the first ``"chosen"`` message must be a substring
    of at least one entry of ``tokenized["chosen_label_decode"]``; likewise the
    first ``"rejected"`` message against ``tokenized["rejected_label_decode"]``.

    Args:
        sample: Raw record containing ``"chosen"`` and ``"rejected"`` lists
            whose first element is a dict with a ``"content"`` key.
        tokenized: Record to verify, containing ``"chosen_label_decode"`` and
            ``"rejected_label_decode"``.

    Raises:
        AssertionError: If either text is missing from its decoded labels.
    """
    chosen = sample["chosen"][0]["content"].strip()
    rejected = sample["rejected"][0]["content"].strip()
    chosen_decoded = tokenized["chosen_label_decode"]
    rejected_decoded = tokenized["rejected_label_decode"]

    if not any(chosen in s for s in chosen_decoded):
        raise AssertionError(f"Chosen label {chosen} not in target chosen label {chosen_decoded}")
    if not any(rejected in s for s in rejected_decoded):
        # Report the rejected decode here; the chosen one is irrelevant to this failure.
        raise AssertionError(f"Rejected label {rejected} not in target rejected label {rejected_decoded}")


def main(argv=None):
    """Parse the command line and verify the first record of the given files.

    Only the first record of ``--data_source`` is compared with the first
    record of ``--to_verify_file``.

    Args:
        argv: Argument list to parse; ``None`` makes argparse use ``sys.argv[1:]``.

    Raises:
        ValueError: If ``--data_type`` is not supported or a file has no records.
        AssertionError: If the verification itself fails.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_source",
        type=str,
        required=True,
        help="The raw data file",
    )
    parser.add_argument(
        "--to_verify_file",
        type=str,
        required=True,
        help="The file that contains the data to be verified",
    )
    parser.add_argument(
        "--data_type",
        type=str,
        required=True,
        help="The data type",
    )
    args = parser.parse_args(argv)

    verifiers = {"sft": verify_sft, "dpo": verify_dpo}
    if args.data_type not in verifiers:
        # An unknown type must fail loudly; silently exiting 0 would look like a pass.
        raise ValueError(f"Unsupported data type {args.data_type!r}; expected one of {sorted(verifiers)}")

    # Read data
    data = read_jsonl(args.data_source)
    to_verify_data = read_jsonl(args.to_verify_file)
    if not data:
        raise ValueError(f"No records found in data source {args.data_source}")
    if not to_verify_data:
        raise ValueError(f"No records found in file to verify {args.to_verify_file}")

    verifiers[args.data_type](data[0], to_verify_data[0])


if __name__ == "__main__":
    main()
[ColossalChat] Update RLHF V2 (#5286) * Add dpo. Fix sft, ppo, lora. Refactor all * fix and tested ppo * 2 nd round refactor * add ci tests * fix ci * fix ci * fix readme, style * fix readme style * fix style, fix benchmark * reproduce benchmark result, remove useless files * rename to ColossalChat * use new image * fix ci workflow * fix ci * use local model/tokenizer for ci tests * fix ci * fix ci * fix ci * fix ci timeout * fix rm progress bar. fix ci timeout * fix ci * fix ci typo * remove 3d plugin from ci temporary * test environment * cannot save optimizer * support chat template * fix readme * fix path * test ci locally * restore build_or_pr * fix ci data path * fix benchmark * fix ci, move ci tests to 3080, disable fast tokenizer * move ci to 85 * support flash attention 2 * add all-in-one data preparation script. Fix colossal-llama2-chat chat template * add hardware requirements * move ci test data * fix save_model, add unwrap * fix missing bos * fix missing bos; support grad accumulation with gemini * fix ci * fix ci * fix ci * fix llama2 chat template config * debug sft * debug sft * fix colossalai version requirement * fix ci * add sanity check to prevent NaN loss * fix requirements * add dummy data generation script * add dummy data generation script * add dummy data generation script * add dummy data generation script * update readme * update readme * update readme and ignore * fix logger bug * support parallel_output * modify data preparation logic * fix tokenization * update lr * fix inference * run pre-commit --------- Co-authored-by: Tong Li <tong.li352711588@gmail.com> 8 months ago			`import argparse`
			`import json`

			`if __name__ == "__main__":`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument(`
			`"--data_source",`
			`type=str,`
			`required=True,`
			`default=None,`
			`help="The raw data file",`
			`)`
			`parser.add_argument(`
			`"--to_verify_file",`
			`type=str,`
			`required=True,`
			`default=None,`
			`help="The file that contains the data to be verified",`
			`)`
			`parser.add_argument(`
			`"--data_type",`
			`type=str,`
			`required=True,`
			`default=None,`
			`help="The data type",`
			`)`
			`args = parser.parse_args()`

			`# Read data`
			`data = []`
			`with open(args.data_source, "r", encoding="utf8") as f:`
			`for line in f.readlines():`
			`data.append(json.loads(line))`
			`to_verify_data = []`
			`with open(args.to_verify_file, "r", encoding="utf8") as f:`
			`for line in f.readlines():`
			`to_verify_data.append(json.loads(line))`

			`if args.data_type == "sft":`
			`target_lable = [msg["content"].strip() for msg in data[0]["messages"] if msg["from"] == "assistant"]`
			`target_negative_label = [msg["content"].strip() for msg in data[0]["messages"] if msg["from"] == "human"]`

			`# Read to verify file`

			`to_verify_lable = to_verify_data[0]["labels_decode"]`
			`for label in target_lable:`
			`assert any([label in s for s in to_verify_lable]), f"Label {label} not in target label {to_verify_lable}"`
			`for label in target_negative_label:`
			`assert all(`
			`[label not in s for s in to_verify_lable]`
			`), f"Negative label {label} in target label {to_verify_lable}"`
			`elif args.data_type == "dpo":`
			`chosen_lable = data[0]["chosen"][0]["content"].strip()`
			`rejected_lable = data[0]["rejected"][0]["content"].strip()`

			`# Read to verify file`
			`to_verify_lable_chosen = to_verify_data[0]["chosen_label_decode"]`
			`to_verify_lable_rejected = to_verify_data[0]["rejected_label_decode"]`
			`assert any(`
			`[chosen_lable in s for s in to_verify_lable_chosen]`
			`), f"Chosen label {chosen_lable} not in target chosen label {to_verify_lable_chosen}"`
			`assert any(`
			`[rejected_lable in s for s in to_verify_lable_rejected]`
			`), f"Rejected label {rejected_lable} not in target rejected label {to_verify_lable_chosen}"`