Merge pull request #3916 from FrankLeeeee/sync/dtensor-with-develop

[sync] sync feature/dtensor with develop
pull/3926/head
Frank Lee 2023-06-07 11:50:43 +08:00 committed by GitHub
commit d51e83d642
175 changed files with 4744 additions and 2784 deletions


@@ -43,10 +43,18 @@ I will provide the details of each workflow below.
| Workflow Name | File name | Description |
| ---------------------- | -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `Build on PR` | `build_on_pr.yml` | This workflow is triggered when a PR changes essential files. It will run all the unit tests in the repository with 4 GPUs. |
+| `Build on PR` | `build_on_pr.yml` | This workflow is triggered when a PR changes essential files and a branch is created/deleted. It will run all the unit tests in the repository with 4 GPUs. |
| `Build on Schedule` | `build_on_schedule.yml` | This workflow will run the unit tests every day with 8 GPUs. The result is sent to Lark. |
| `Report test coverage` | `report_test_coverage.yml` | This workflow will post a comment to report the test coverage results when `Build` is done. |
To reduce the average unit test time on PRs, the `Build on PR` workflow manages the testmon cache as follows (a sketch of the equivalent file operations is given after this list).
1. When a new branch is created, it copies `cache/main/.testmondata*` to `cache/<branch>/`.
2. When a new PR is created or the base branch of a PR is changed, it copies `cache/<base_ref>/.testmondata*` to `cache/_pull/<pr_number>/`.
3. When running unit tests for each PR, it restores the testmon cache from `cache/_pull/<pr_number>/`. After the tests, it stores the cache back to `cache/_pull/<pr_number>/`.
4. When a PR is closed and merged, it copies `cache/_pull/<pr_number>/.testmondata*` to `cache/<base_ref>/`. Otherwise, it just removes `cache/_pull/<pr_number>`.
5. When a branch is deleted, it removes `cache/<ref>`.
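The cache bookkeeping above amounts to copying and deleting directories under a shared cache root. Below is a minimal, illustrative Python sketch of steps 1-5; the `copy_cache` helper, the branch name, and the PR number are hypothetical, and the real logic lives in `build_on_pr.yml` as shell steps.
```python
import shutil
from pathlib import Path

CACHE_ROOT = Path("/github/home/testmon_cache")

def copy_cache(src: str, dst: str) -> None:
    """Copy the .testmondata* files from one cache directory to another."""
    src_dir, dst_dir = CACHE_ROOT / src, CACHE_ROOT / dst
    if src_dir.is_dir() and any(src_dir.iterdir()):
        dst_dir.mkdir(parents=True, exist_ok=True)
        for f in src_dir.glob(".testmondata*"):
            shutil.copy2(f, dst_dir / f.name)

copy_cache("main", "feature-x")         # 1. branch `feature-x` created from main
copy_cache("feature-x", "_pull/123")    # 2. PR #123 opened with base `feature-x`
# 3. the test job restores from and stores back to cache/_pull/123 around each run
copy_cache("_pull/123", "feature-x")    # 4. PR #123 merged back into its base
shutil.rmtree(CACHE_ROOT / "_pull/123", ignore_errors=True)
shutil.rmtree(CACHE_ROOT / "feature-x", ignore_errors=True)    # 5. branch deleted
```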
### Example Test
| Workflow Name | File name | Description |


@@ -2,7 +2,7 @@ name: Build on PR
on:
pull_request:
-types: [synchronize, opened, reopened]
+types: [synchronize, opened, reopened, ready_for_review, closed, edited]
branches:
- "main"
- "develop"
@@ -18,11 +18,63 @@ on:
- "!tests/**.md" # ignore doc change
- "pytest.ini" # test config change
- "setup.py" # install command change
create:
delete:
jobs:
prepare_cache:
name: Prepare testmon cache
if: |
github.event_name == 'create' &&
github.event.ref_type == 'branch' &&
github.event.repository.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --rm
timeout-minutes: 5
defaults:
run:
shell: bash
steps:
- name: Copy testmon cache
run: | # a branch name may contain a slash, which we replace with a space
export REF_BRANCH=$(echo ${{ github.event.ref }} | sed "s/\// /")
if [ -d /github/home/testmon_cache/${MAIN_BRANCH} ]; then
[ ! -z "$(ls -A /github/home/testmon_cache/${MAIN_BRANCH})" ] && cp -p -r /github/home/testmon_cache/${MAIN_BRANCH} "/github/home/testmon_cache/${REF_BRANCH}"
fi
env:
MAIN_BRANCH: ${{ github.event.master_branch }}
prepare_cache_for_pr:
name: Prepare testmon cache for PR
if: |
github.event_name == 'pull_request' &&
(github.event.action == 'opened' || github.event.action == 'reopened' || (github.event.action == 'edited' && github.event.changes.base != null)) &&
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --rm
timeout-minutes: 5
defaults:
run:
shell: bash
steps:
- name: Copy testmon cache
run: | # a branch name may contain a slash, which we replace with a space
export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
if [ -d "/github/home/testmon_cache/${BASE}" ]; then
[ ! -z "$(ls -A "/github/home/testmon_cache/${BASE}")" ] && mkdir -p /github/home/testmon_cache/_pull && cp -p -r "/github/home/testmon_cache/${BASE}" /github/home/testmon_cache/_pull/${PR_NUMBER}
fi
env:
PR_NUMBER: ${{ github.event.number }}
detect:
name: Detect file change
if: |
github.event_name == 'pull_request' &&
(github.event.action == 'synchronize' || github.event.action == 'opened' || github.event.action == 'reopened' || github.event.action == 'ready_for_review') &&
github.event.pull_request.draft == false &&
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
outputs:
@@ -135,9 +187,11 @@ jobs:
- name: Restore Testmon Cache
run: |
-if [ -d /github/home/testmon_cache ]; then
-[ ! -z "$(ls -A /github/home/testmon_cache)" ] && cp -p -r /github/home/testmon_cache/.testmondata /__w/ColossalAI/ColossalAI/
+if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ]; then
+[ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ] && cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* /__w/ColossalAI/ColossalAI/
fi
+env:
+PR_NUMBER: ${{ github.event.number }}
- name: Execute Unit Testing
run: |
@@ -149,8 +203,10 @@
- name: Store Testmon Cache
run: |
-[ -d /github/home/testmon_cache ] || mkdir /github/home/testmon_cache
-cp -p -r /__w/ColossalAI/ColossalAI/.testmondata /github/home/testmon_cache/
+mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER}
+cp -p -r /__w/ColossalAI/ColossalAI/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}/
+env:
+PR_NUMBER: ${{ github.event.number }}
- name: Collate artifact
env:
@@ -188,3 +244,55 @@
with:
name: report
path: report/
store_cache:
name: Store testmon cache for PR
if: |
github.event_name == 'pull_request' &&
github.event.action == 'closed' &&
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --rm
timeout-minutes: 5
defaults:
run:
shell: bash
steps:
- name: Store testmon cache if possible
if: github.event.pull_request.merged == true
run: | # a branch name may contain a slash, which we replace with a space
export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ]; then
[ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ] && cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* "/github/home/testmon_cache/${BASE}/"
fi
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Remove testmon cache
if: github.event.pull_request.merged != true
run: |
rm -rf /github/home/testmon_cache/_pull/${PR_NUMBER}
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
remove_cache:
name: Remove testmon cache
if: |
github.event_name == 'delete' &&
github.event.ref_type == 'branch' &&
github.event.repository.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --rm
timeout-minutes: 5
defaults:
run:
shell: bash
steps:
- name: Remove testmon cache
run: | # a branch name may contain a slash, which we replace with a space
export BASE=$(echo ${{ github.event.ref }} | sed "s/\// /")
rm -rf "/github/home/testmon_cache/${BASE}"


@@ -19,26 +19,26 @@ jobs:
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
steps:
- id: set-matrix
env:
TORCH_VERSIONS: ${{ inputs.torch_version }}
CUDA_VERSIONS: ${{ inputs.cuda_version }}
run: |
IFS=','
DOCKER_IMAGE=()
for tv in $TORCH_VERSIONS
do
for cv in $CUDA_VERSIONS
do
DOCKER_IMAGE+=("\"hpcaitech/pytorch-cuda:${tv}-${cv}\"")
done
done
container=$( IFS=',' ; echo "${DOCKER_IMAGE[*]}" )
container="[${container}]"
echo "$container"
echo "::set-output name=matrix::{\"container\":$(echo "$container")}"
build:
name: Test for PyTorch Compatibility
@ -70,6 +70,17 @@ jobs:
- uses: actions/checkout@v2
with:
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
- name: Download cub for CUDA 10.2
run: |
CUDA_VERSION=$(cat $CUDA_HOME/version.txt | grep "CUDA Version" | awk '{print $NF}' | cut -d. -f1,2)
# check if it is CUDA 10.2
# download cub
if [ "$CUDA_VERSION" = "10.2" ]; then
wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip
unzip 1.8.0.zip
cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/
fi
- name: Install Colossal-AI
run: |
pip install -r requirements/requirements.txt


@@ -3,8 +3,8 @@ name: Compatibility Test on PR
on:
pull_request:
paths:
-- 'version.txt'
-- '.compatibility'
+- "version.txt"
+- ".compatibility"
jobs:
matrix_preparation:
@@ -58,6 +58,18 @@ jobs:
- uses: actions/checkout@v2
with:
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
- name: Download cub for CUDA 10.2
run: |
CUDA_VERSION=$(cat $CUDA_HOME/version.txt | grep "CUDA Version" | awk '{print $NF}' | cut -d. -f1,2)
# check if it is CUDA 10.2
# download cub
if [ "$CUDA_VERSION" = "10.2" ]; then
wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip
unzip 1.8.0.zip
cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/
fi
- name: Install Colossal-AI
run: |
pip install -v --no-cache-dir .


@@ -8,7 +8,7 @@ on:
jobs:
release:
name: Publish Docker Image to DockerHub
-if: ( github.event_name == 'workflow_dispatch' || github.event.pull_request.merged == true ) && github.repository == 'hpcaitech/ColossalAI'
+if: github.repository == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: "hpcaitech/docker-in-docker:latest"


@@ -20,7 +20,7 @@ jobs:
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
-options: --gpus all --rm -v /data/scratch/github_actions/chat:/data/scratch/github_actions/chat
+options: --gpus all --rm -v /data/scratch/github_actions/chat:/data/scratch/github_actions/chat --shm-size=10.24gb
timeout-minutes: 30
defaults:
run:

.gitignore

@@ -155,3 +155,7 @@ colossalai/version.py
# ignore coverage test file
coverage.lcov
coverage.xml
# ignore testmon and coverage files
.coverage
.testmondata*


@@ -362,6 +362,22 @@ If you want to install and enable CUDA kernel fusion (compulsory installation wh
CUDA_EXT=1 pip install .
```
For users with CUDA 10.2, you can still build ColossalAI from source, but you need to manually download the cub library and copy it to the corresponding directory.
```bash
# clone the repository
git clone https://github.com/hpcaitech/ColossalAI.git
cd ColossalAI
# download the cub library
wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip
unzip 1.8.0.zip
cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/
# install
CUDA_EXT=1 pip install .
```
<p align="right">(<a href="#top">back to top</a>)</p>
## Use Docker


@@ -0,0 +1,178 @@
import argparse
import os
import socket
from functools import partial
import ray
import torch
from coati.quant import llama_load_quant, low_resource_init
from coati.ray.detached_trainer_ppo import DetachedPPOTrainer
from coati.ray.experience_maker_holder import ExperienceMakerHolder
from coati.ray.utils import (
get_actor_from_args,
get_critic_from_args,
get_receivers_per_sender,
get_reward_model_from_args,
get_strategy_from_args,
)
from torch.utils.data import DataLoader
from transformers import AutoConfig, AutoTokenizer
from transformers.modeling_utils import no_init_weights
def get_free_port():
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(('', 0))
return s.getsockname()[1]
def get_local_ip():
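# NB: connecting a UDP socket sends no packets; it only selects the outbound
# interface, whose address getsockname() then reports.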
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
s.connect(('8.8.8.8', 80))
return s.getsockname()[0]
def main(args):
master_addr = str(get_local_ip())
# trainer_env_info
trainer_port = str(get_free_port())
env_info_trainers = [{
'local_rank': '0',
'rank': str(rank),
'world_size': str(args.num_trainers),
'master_port': trainer_port,
'master_addr': master_addr
} for rank in range(args.num_trainers)]
# maker_env_info
maker_port = str(get_free_port())
env_info_maker = {
'local_rank': '0',
'rank': '0',
'world_size': '1',
'master_port': maker_port,
'master_addr': master_addr
}
# configure tokenizer
tokenizer = AutoTokenizer.from_pretrained(args.pretrain)
tokenizer.pad_token = tokenizer.eos_token
def model_fn():
actor_cfg = AutoConfig.from_pretrained(args.pretrain)
critic_cfg = AutoConfig.from_pretrained(args.critic_pretrain)
actor = get_actor_from_args(args.model, config=actor_cfg).requires_grad_(False).half().cuda()
critic = get_critic_from_args(args.critic_model, config=critic_cfg).requires_grad_(False).half().cuda()
reward_model = get_reward_model_from_args(args.critic_model,
config=critic_cfg).requires_grad_(False).half().cuda()
if args.initial_model_quant_ckpt is not None and args.model == 'llama':
# quantize initial model
with low_resource_init(), no_init_weights():
initial_model = get_actor_from_args(args.model, config=actor_cfg)
initial_model.model = llama_load_quant(initial_model.model, args.initial_model_quant_ckpt, args.quant_bits,
args.quant_group_size).cuda().requires_grad_(False)
else:
initial_model = get_actor_from_args(args.model, config=actor_cfg).requires_grad_(False).half().cuda()
return actor, critic, reward_model, initial_model
# configure Experience Maker
experience_holder_ref = ExperienceMakerHolder.options(name="maker0", num_gpus=1, max_concurrency=2).remote(
detached_trainer_name_list=[f'trainer{i}' for i in range(args.num_trainers)],
strategy_fn=partial(get_strategy_from_args, args.maker_strategy),
model_fn=model_fn,
env_info=env_info_maker,
kl_coef=0.1,
debug=args.debug,
# sync_models_from_trainers=True,
# generation kwargs:
max_length=512,
do_sample=True,
temperature=1.0,
top_k=50,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=tokenizer.eos_token_id,
eval_performance=True,
use_cache=True,
)
def trainer_model_fn():
actor = get_actor_from_args(args.model, config=AutoConfig.from_pretrained(args.pretrain)).half().cuda()
critic = get_critic_from_args(args.critic_model,
config=AutoConfig.from_pretrained(args.critic_pretrain)).half().cuda()
return actor, critic
# configure Trainer
trainer_refs = [
DetachedPPOTrainer.options(name=f"trainer{i}", num_gpus=1, max_concurrency=2).remote(
experience_maker_holder_name_list=[
f'maker{x}' for x in get_receivers_per_sender(i, args.num_trainers, 1, allow_idle_sender=True)
],
strategy_fn=partial(get_strategy_from_args, args.trainer_strategy),
model_fn=trainer_model_fn,
env_info=env_info_trainer,
train_batch_size=args.train_batch_size,
buffer_limit=16,
eval_performance=True,
debug=args.debug,
) for i, env_info_trainer in enumerate(env_info_trainers)
]
dataset_size = args.experience_batch_size * 4
def data_gen_fn():
input_ids = torch.randint(tokenizer.vocab_size, (256,), device=torch.cuda.current_device())
attn_mask = torch.ones_like(input_ids)
return {'input_ids': input_ids, 'attention_mask': attn_mask}
def build_dataloader(size):
dataset = [data_gen_fn() for _ in range(size)]
dataloader = DataLoader(dataset, batch_size=args.experience_batch_size)
return dataloader
# uncomment this function if sync_models_from_trainers is True
# ray.get([
# trainer_ref.sync_models_to_remote_makers.remote()
# for trainer_ref in trainer_refs
# ])
wait_tasks = []
wait_tasks.append(
experience_holder_ref.workingloop.remote(partial(build_dataloader, dataset_size),
num_steps=args.experience_steps))
total_steps = args.experience_batch_size * args.experience_steps // (args.num_trainers * args.train_batch_size)
for trainer_ref in trainer_refs:
wait_tasks.append(trainer_ref.fit.remote(total_steps, args.update_steps, args.train_epochs))
ray.get(wait_tasks)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--num_trainers', type=int, default=1)
parser.add_argument('--trainer_strategy',
choices=[
'naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu',
'colossalai_zero2_cpu'
],
default='naive')
parser.add_argument('--maker_strategy', choices=['naive'], default='naive')
parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
parser.add_argument('--pretrain', type=str, default=None)
parser.add_argument('--critic_pretrain', type=str, default=None)
parser.add_argument('--experience_steps', type=int, default=4)
parser.add_argument('--experience_batch_size', type=int, default=8)
parser.add_argument('--train_epochs', type=int, default=1)
parser.add_argument('--update_steps', type=int, default=2)
parser.add_argument('--train_batch_size', type=int, default=8)
parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
parser.add_argument('--initial_model_quant_ckpt', type=str, default=None)
parser.add_argument('--quant_bits', type=int, default=4)
parser.add_argument('--quant_group_size', type=int, default=128)
parser.add_argument('--debug', action='store_true')
args = parser.parse_args()
ray.init(namespace=os.environ["RAY_NAMESPACE"], runtime_env={"env_vars": dict(os.environ)})
main(args)


@@ -0,0 +1,189 @@
import argparse
import os
import socket
from functools import partial
import ray
import torch
from coati.quant import llama_load_quant, low_resource_init
from coati.ray.detached_trainer_ppo import DetachedPPOTrainer
from coati.ray.experience_maker_holder import ExperienceMakerHolder
from coati.ray.utils import (
get_actor_from_args,
get_critic_from_args,
get_receivers_per_sender,
get_reward_model_from_args,
get_strategy_from_args,
)
from torch.utils.data import DataLoader
from transformers import AutoConfig, AutoTokenizer
from transformers.modeling_utils import no_init_weights
def get_free_port():
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(('', 0))
return s.getsockname()[1]
def get_local_ip():
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
s.connect(('8.8.8.8', 80))
return s.getsockname()[0]
def main(args):
master_addr = str(get_local_ip())
# trainer_env_info
trainer_port = str(get_free_port())
env_info_trainers = [{
'local_rank': '0',
'rank': str(rank),
'world_size': str(args.num_trainers),
'master_port': trainer_port,
'master_addr': master_addr
} for rank in range(args.num_trainers)]
# maker_env_info
maker_port = str(get_free_port())
env_info_makers = [{
'local_rank': '0',
'rank': str(rank),
'world_size': str(args.num_makers),
'master_port': maker_port,
'master_addr': master_addr
} for rank in range(args.num_makers)]
# configure tokenizer
tokenizer = AutoTokenizer.from_pretrained(args.pretrain)
tokenizer.pad_token = tokenizer.eos_token
def model_fn():
actor_cfg = AutoConfig.from_pretrained(args.pretrain)
critic_cfg = AutoConfig.from_pretrained(args.critic_pretrain)
actor = get_actor_from_args(args.model, config=actor_cfg).requires_grad_(False).half().cuda()
critic = get_critic_from_args(args.critic_model, config=critic_cfg).requires_grad_(False).half().cuda()
reward_model = get_reward_model_from_args(args.critic_model,
config=critic_cfg).requires_grad_(False).half().cuda()
if args.initial_model_quant_ckpt is not None and args.model == 'llama':
# quantize initial model
with low_resource_init(), no_init_weights():
initial_model = get_actor_from_args(args.model, config=actor_cfg)
initial_model.model = llama_load_quant(initial_model.model, args.initial_model_quant_ckpt, args.quant_bits,
args.quant_group_size).cuda().requires_grad_(False)
else:
initial_model = get_actor_from_args(args.model, config=actor_cfg).requires_grad_(False).half().cuda()
return actor, critic, reward_model, initial_model
# configure Experience Maker
experience_holder_refs = [
ExperienceMakerHolder.options(name=f"maker{i}", num_gpus=1, max_concurrency=2).remote(
detached_trainer_name_list=[
f'trainer{x}'
for x in get_receivers_per_sender(i, args.num_makers, args.num_trainers, allow_idle_sender=False)
],
strategy_fn=partial(get_strategy_from_args, args.maker_strategy),
model_fn=model_fn,
env_info=env_info_maker,
kl_coef=0.1,
debug=args.debug,
# sync_models_from_trainers=True,
# generation kwargs:
max_length=512,
do_sample=True,
temperature=1.0,
top_k=50,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=tokenizer.eos_token_id,
eval_performance=True,
use_cache=True,
)
for i, env_info_maker in enumerate(env_info_makers)
]
def trainer_model_fn():
actor = get_actor_from_args(args.model, config=AutoConfig.from_pretrained(args.pretrain)).half().cuda()
critic = get_critic_from_args(args.critic_model,
config=AutoConfig.from_pretrained(args.critic_pretrain)).half().cuda()
return actor, critic
# configure Trainer
trainer_refs = [
DetachedPPOTrainer.options(name=f"trainer{i}", num_gpus=1, max_concurrency=2).remote(
experience_maker_holder_name_list=[
f"maker{x}"
for x in get_receivers_per_sender(i, args.num_trainers, args.num_makers, allow_idle_sender=True)
],
strategy_fn=partial(get_strategy_from_args, args.trainer_strategy),
model_fn=trainer_model_fn,
env_info=env_info_trainer,
train_batch_size=args.train_batch_size,
buffer_limit=16,
eval_performance=True,
debug=args.debug,
)
for i, env_info_trainer in enumerate(env_info_trainers)
]
dataset_size = args.experience_batch_size * 4
def data_gen_fn():
input_ids = torch.randint(tokenizer.vocab_size, (256,), device=torch.cuda.current_device())
attn_mask = torch.ones_like(input_ids)
return {'input_ids': input_ids, 'attention_mask': attn_mask}
def build_dataloader(size):
dataset = [data_gen_fn() for _ in range(size)]
dataloader = DataLoader(dataset, batch_size=args.experience_batch_size)
return dataloader
# uncomment this function if sync_models_from_trainers is True
# ray.get([
# trainer_ref.sync_models_to_remote_makers.remote()
# for trainer_ref in trainer_refs
# ])
wait_tasks = []
for experience_holder_ref in experience_holder_refs:
wait_tasks.append(
experience_holder_ref.workingloop.remote(partial(build_dataloader, dataset_size),
num_steps=args.experience_steps))
total_steps = args.experience_batch_size * args.experience_steps * \
args.num_makers // (args.num_trainers * args.train_batch_size)
for trainer_ref in trainer_refs:
wait_tasks.append(trainer_ref.fit.remote(total_steps, args.update_steps, args.train_epochs))
ray.get(wait_tasks)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--num_makers', type=int, default=1)
parser.add_argument('--num_trainers', type=int, default=1)
parser.add_argument('--trainer_strategy',
choices=[
'naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu',
'colossalai_zero2_cpu'
],
default='naive')
parser.add_argument('--maker_strategy', choices=['naive'], default='naive')
parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
parser.add_argument('--pretrain', type=str, default=None)
parser.add_argument('--critic_pretrain', type=str, default=None)
parser.add_argument('--experience_steps', type=int, default=4)
parser.add_argument('--experience_batch_size', type=int, default=8)
parser.add_argument('--train_epochs', type=int, default=1)
parser.add_argument('--update_steps', type=int, default=2)
parser.add_argument('--train_batch_size', type=int, default=8)
parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
parser.add_argument('--initial_model_quant_ckpt', type=str, default=None)
parser.add_argument('--quant_bits', type=int, default=4)
parser.add_argument('--quant_group_size', type=int, default=128)
parser.add_argument('--debug', action='store_true')
args = parser.parse_args()
ray.init(namespace=os.environ["RAY_NAMESPACE"], runtime_env={"env_vars": dict(os.environ)})
main(args)


@@ -61,7 +61,13 @@ class LoraLinear(lora.LoRALayer, nn.Module):
if self.merge_weights and self.merged:
# Make sure that the weights are not merged
if self.r > 0:
-self.weight.data -= T(self.lora_B @ self.lora_A) * self.scaling
+if not hasattr(self, "lora_A") or not hasattr(self, "lora_B"):
+# FIXME(csric): temporary fix
+self.lora_A = nn.Parameter(self.weight.new_empty((self.r, self.in_features)))
+self.lora_B = nn.Parameter(self.weight.new_empty((self.out_features, self.r)))
+self.reset_parameters()
+else:
+self.weight.data -= T(self.lora_B @ self.lora_A) * self.scaling
self.merged = False
def eval(self):


@@ -0,0 +1,7 @@
from .llama_gptq import load_quant as llama_load_quant
from .utils import low_resource_init
__all__ = [
'llama_load_quant',
'low_resource_init',
]


@@ -0,0 +1,5 @@
from .loader import load_quant
__all__ = [
'load_quant',
]


@@ -0,0 +1,26 @@
import torch
import torch.nn as nn
from .model_utils import find_layers
from .quant import make_quant
def load_quant(model: nn.Module, checkpoint: str, wbits: int, groupsize: int):
model = model.eval()
layers = find_layers(model)
# ignore lm head
for name in ['lm_head']:
if name in layers:
del layers[name]
make_quant(model, layers, wbits, groupsize)
if checkpoint.endswith('.safetensors'):
from safetensors.torch import load_file as safe_load
model.load_state_dict(safe_load(checkpoint))
else:
model.load_state_dict(torch.load(checkpoint))
return model


@@ -0,0 +1,13 @@
# copied from https://github.com/qwopqwop200/GPTQ-for-LLaMa/blob/past/modelutils.py
import torch
import torch.nn as nn
def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''):
if type(module) in layers:
return {name: module}
res = {}
for name1, child in module.named_children():
res.update(find_layers(child, layers=layers, name=name + '.' + name1 if name != '' else name1))
return res


@@ -0,0 +1,283 @@
# copied from https://github.com/qwopqwop200/GPTQ-for-LLaMa/blob/past/quant.py
import math
import numpy as np
import torch
import torch.nn as nn
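# Uniform affine quantization: q = clamp(round(x / scale) + zero, 0, maxq),
# dequantized below as scale * (q - zero).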
def quantize(x, scale, zero, maxq):
q = torch.clamp(torch.round(x / scale) + zero, 0, maxq)
return scale * (q - zero)
class Quantizer(nn.Module):
def __init__(self, shape=1):
super(Quantizer, self).__init__()
self.register_buffer('maxq', torch.tensor(0))
self.register_buffer('scale', torch.zeros(shape))
self.register_buffer('zero', torch.zeros(shape))
def configure(self, bits, perchannel=False, sym=True, mse=False, norm=2.4, grid=100, maxshrink=.8):
self.maxq = torch.tensor(2**bits - 1)
self.perchannel = perchannel
self.sym = sym
self.mse = mse
self.norm = norm
self.grid = grid
self.maxshrink = maxshrink
def find_params(self, x, weight=False):
dev = x.device
self.maxq = self.maxq.to(dev)
shape = x.shape
if self.perchannel:
if weight:
x = x.flatten(1)
else:
if len(shape) == 4:
x = x.permute([1, 0, 2, 3])
x = x.flatten(1)
if len(shape) == 3:
x = x.reshape((-1, shape[-1])).t()
if len(shape) == 2:
x = x.t()
else:
x = x.flatten().unsqueeze(0)
tmp = torch.zeros(x.shape[0], device=dev)
xmin = torch.minimum(x.min(1)[0], tmp)
xmax = torch.maximum(x.max(1)[0], tmp)
if self.sym:
xmax = torch.maximum(torch.abs(xmin), xmax)
tmp = xmin < 0
if torch.any(tmp):
xmin[tmp] = -xmax[tmp]
tmp = (xmin == 0) & (xmax == 0)
xmin[tmp] = -1
xmax[tmp] = +1
self.scale = (xmax - xmin) / self.maxq
if self.sym:
self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2)
else:
self.zero = torch.round(-xmin / self.scale)
if self.mse:
best = torch.full([x.shape[0]], float('inf'), device=dev)
for i in range(int(self.maxshrink * self.grid)):
p = 1 - i / self.grid
xmin1 = p * xmin
xmax1 = p * xmax
scale1 = (xmax1 - xmin1) / self.maxq
zero1 = torch.round(-xmin1 / scale1) if not self.sym else self.zero
q = quantize(x, scale1.unsqueeze(1), zero1.unsqueeze(1), self.maxq)
q -= x
q.abs_()
q.pow_(self.norm)
err = torch.sum(q, 1)
tmp = err < best
if torch.any(tmp):
best[tmp] = err[tmp]
self.scale[tmp] = scale1[tmp]
self.zero[tmp] = zero1[tmp]
if not self.perchannel:
if weight:
tmp = shape[0]
else:
tmp = shape[1] if len(shape) != 3 else shape[2]
self.scale = self.scale.repeat(tmp)
self.zero = self.zero.repeat(tmp)
if weight:
shape = [-1] + [1] * (len(shape) - 1)
self.scale = self.scale.reshape(shape)
self.zero = self.zero.reshape(shape)
return
if len(shape) == 4:
self.scale = self.scale.reshape((1, -1, 1, 1))
self.zero = self.zero.reshape((1, -1, 1, 1))
if len(shape) == 3:
self.scale = self.scale.reshape((1, 1, -1))
self.zero = self.zero.reshape((1, 1, -1))
if len(shape) == 2:
self.scale = self.scale.unsqueeze(0)
self.zero = self.zero.unsqueeze(0)
def quantize(self, x):
if self.ready():
return quantize(x, self.scale, self.zero, self.maxq)
return x
def enabled(self):
return self.maxq > 0
def ready(self):
return torch.all(self.scale != 0)
try:
import quant_cuda
except ImportError:
print('CUDA extension not installed.')
# Assumes layer is perfectly divisible into 256 * 256 blocks
class QuantLinear(nn.Module):
def __init__(self, bits, groupsize, infeatures, outfeatures):
super().__init__()
if bits not in [2, 3, 4, 8]:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
self.infeatures = infeatures
self.outfeatures = outfeatures
self.bits = bits
if groupsize != -1 and groupsize < 32 and groupsize != int(math.pow(2, int(math.log2(groupsize)))):
raise NotImplementedError("groupsize supports powers of 2 greater than 32. (e.g. : 32,64,128,etc)")
groupsize = groupsize if groupsize != -1 else infeatures
self.groupsize = groupsize
self.register_buffer(
'qzeros', torch.zeros((math.ceil(infeatures / groupsize), outfeatures // 256 * (bits * 8)),
dtype=torch.int))
self.register_buffer('scales', torch.zeros((math.ceil(infeatures / groupsize), outfeatures)))
self.register_buffer('bias', torch.zeros(outfeatures))
self.register_buffer('qweight', torch.zeros((infeatures // 256 * (bits * 8), outfeatures), dtype=torch.int))
self._initialized_quant_state = False
def pack(self, linear, scales, zeros):
scales = scales.t().contiguous()
zeros = zeros.t().contiguous()
scale_zeros = zeros * scales
self.scales = scales.clone()
if linear.bias is not None:
self.bias = linear.bias.clone()
intweight = []
for idx in range(self.infeatures):
g_idx = idx // self.groupsize
intweight.append(
torch.round((linear.weight.data[:, idx] + scale_zeros[g_idx]) / self.scales[g_idx]).to(torch.int)[:,
None])
intweight = torch.cat(intweight, dim=1)
intweight = intweight.t().contiguous()
intweight = intweight.numpy().astype(np.uint32)
qweight = np.zeros((intweight.shape[0] // 256 * (self.bits * 8), intweight.shape[1]), dtype=np.uint32)
i = 0
row = 0
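# Pack the quantized integers into 32-bit words. For 2/4/8 bits, a word holds a
# whole number of values; for 3 bits, 32 values span exactly three words (96 bits),
# hence the special-cased carries across word boundaries below.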
while row < qweight.shape[0]:
if self.bits in [2, 4, 8]:
for j in range(i, i + (32 // self.bits)):
qweight[row] |= intweight[j] << (self.bits * (j - i))
i += 32 // self.bits
row += 1
elif self.bits == 3:
for j in range(i, i + 10):
qweight[row] |= intweight[j] << (3 * (j - i))
i += 10
qweight[row] |= intweight[i] << 30
row += 1
qweight[row] |= (intweight[i] >> 2) & 1
i += 1
for j in range(i, i + 10):
qweight[row] |= intweight[j] << (3 * (j - i) + 1)
i += 10
qweight[row] |= intweight[i] << 31
row += 1
qweight[row] |= (intweight[i] >> 1) & 0x3
i += 1
for j in range(i, i + 10):
qweight[row] |= intweight[j] << (3 * (j - i) + 2)
i += 10
row += 1
else:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
qweight = qweight.astype(np.int32)
self.qweight = torch.from_numpy(qweight)
zeros -= 1
zeros = zeros.numpy().astype(np.uint32)
qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 256 * (self.bits * 8)), dtype=np.uint32)
i = 0
col = 0
while col < qzeros.shape[1]:
if self.bits in [2, 4, 8]:
for j in range(i, i + (32 // self.bits)):
qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
i += 32 // self.bits
col += 1
elif self.bits == 3:
for j in range(i, i + 10):
qzeros[:, col] |= zeros[:, j] << (3 * (j - i))
i += 10
qzeros[:, col] |= zeros[:, i] << 30
col += 1
qzeros[:, col] |= (zeros[:, i] >> 2) & 1
i += 1
for j in range(i, i + 10):
qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 1)
i += 10
qzeros[:, col] |= zeros[:, i] << 31
col += 1
qzeros[:, col] |= (zeros[:, i] >> 1) & 0x3
i += 1
for j in range(i, i + 10):
qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 2)
i += 10
col += 1
else:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
qzeros = qzeros.astype(np.int32)
self.qzeros = torch.from_numpy(qzeros)
def forward(self, x):
intermediate_dtype = torch.float32
if not self._initialized_quant_state:
# Do we even have a bias? Check for at least one non-zero element.
if self.bias is not None and bool(torch.any(self.bias != 0)):
# Then make sure it's the right type.
self.bias.data = self.bias.data.to(intermediate_dtype)
else:
self.bias = None
outshape = list(x.shape)
outshape[-1] = self.outfeatures
x = x.reshape(-1, x.shape[-1])
if self.bias is None:
y = torch.zeros(x.shape[0], outshape[-1], dtype=intermediate_dtype, device=x.device)
else:
y = self.bias.clone().repeat(x.shape[0], 1)
output_dtype = x.dtype
x = x.to(intermediate_dtype)
if self.bits == 2:
quant_cuda.vecquant2matmul(x, self.qweight, y, self.scales, self.qzeros, self.groupsize)
elif self.bits == 3:
quant_cuda.vecquant3matmul(x, self.qweight, y, self.scales, self.qzeros, self.groupsize)
elif self.bits == 4:
quant_cuda.vecquant4matmul(x, self.qweight, y, self.scales, self.qzeros, self.groupsize)
elif self.bits == 8:
quant_cuda.vecquant8matmul(x, self.qweight, y, self.scales, self.qzeros, self.groupsize)
else:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
y = y.to(output_dtype)
return y.reshape(outshape)
def make_quant(module, names, bits, groupsize, name=''):
if isinstance(module, QuantLinear):
return
for attr in dir(module):
tmp = getattr(module, attr)
name1 = name + '.' + attr if name != '' else attr
if name1 in names:
setattr(module, attr, QuantLinear(bits, groupsize, tmp.in_features, tmp.out_features))
for name1, child in module.named_children():
make_quant(child, names, bits, groupsize, name + '.' + name1 if name != '' else name1)


@@ -0,0 +1,28 @@
from contextlib import contextmanager
import torch
def _noop(*args, **kwargs):
pass
@contextmanager
def low_resource_init():
"""This context manager disables weight initialization and sets the default float dtype to half.
"""
old_kaiming_uniform_ = torch.nn.init.kaiming_uniform_
old_uniform_ = torch.nn.init.uniform_
old_normal_ = torch.nn.init.normal_
dtype = torch.get_default_dtype()
try:
torch.nn.init.kaiming_uniform_ = _noop
torch.nn.init.uniform_ = _noop
torch.nn.init.normal_ = _noop
torch.set_default_dtype(torch.half)
yield
finally:
torch.nn.init.kaiming_uniform_ = old_kaiming_uniform_
torch.nn.init.uniform_ = old_uniform_
torch.nn.init.normal_ = old_normal_
torch.set_default_dtype(dtype)
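# For reference, the ray example scripts above pair this context manager with
# transformers' `no_init_weights` when building the quantized initial model:
#
#     with low_resource_init(), no_init_weights():
#         initial_model = get_actor_from_args(args.model, config=actor_cfg)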


@@ -0,0 +1,160 @@
# Distributed PPO Training on Stage 3
## Detach Experience Makers and Trainers
We can completely separate the trainers and makers.
<p align="center">
<img src="https://github.com/hpcaitech/public_assets/blob/main/applications/chat/basic_structure.png?raw=true" width=600/>
</p>
- The experience maker performs inference, produces experience, and remotely delivers it to the trainer (1).
- The trainer consumes experience to train models, and periodically transmits new model parameters to the maker (2.1, 2.2).
- An experience buffer is used to overlap transmission and computation.
In this manner, each node works continuously without idle time, and different optimization strategies can be applied to inference and training to meet the needs of speed or storage. This design also helps with scalability.
`DetachedPPOTrainer` and `ExperienceMakerHolder` are Ray Actors (not to be confused with the PPO actor model), representing the Trainer and the Experience Maker in the figure above, respectively.
[More about Ray Core](https://docs.ray.io/en/latest/ray-core/walkthrough.html)
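As a toy illustration of this decoupling (these classes are stand-ins, not the actual Coati API), two Ray actors can exchange experience through a shared queue so that making and consuming experience overlap:
```python
import ray
from ray.util.queue import Queue

@ray.remote
class ToyMaker:
    def __init__(self, buffer: Queue):
        self.buffer = buffer

    def workingloop(self, num_steps: int):
        for step in range(num_steps):
            experience = f"exp-{step}"    # (1) inference produces experience
            self.buffer.put(experience)   # delivered to the trainer via the buffer

@ray.remote
class ToyTrainer:
    def __init__(self, buffer: Queue):
        self.buffer = buffer

    def fit(self, num_steps: int):
        for _ in range(num_steps):
            experience = self.buffer.get()    # consume experience to train models
            # ... update actor/critic here, then (2) push new weights to the maker

ray.init()
buffer = Queue(maxsize=16)    # the experience buffer that overlaps (1) and (2)
maker, trainer = ToyMaker.remote(buffer), ToyTrainer.remote(buffer)
ray.get([maker.workingloop.remote(4), trainer.fit.remote(4)])
```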
## Usage
See the examples at `ColossalAI/applications/Chat/examples/ray`.
### Setup Makers
- define the makers' environment variables:
```python
env_info_makers = [{
'local_rank': '0',
'rank': str(rank),
'world_size': str(num_makers),
'master_port': maker_port,
'master_addr': master_addr
} for rank in range(num_makers)]
```
- define the maker models:
```python
def model_fn():
actor = get_actor_from_args(...)
critic = get_critic_from_args(...)
reward_model = get_reward_model_from_args(...)
initial_model = get_actor_from_args(...)
return actor, critic, reward_model, initial_model
```
- set `experience_holder_refs`:
```python
experience_holder_refs = [
ExperienceMakerHolder.options(
name=f"maker_{i}",
num_gpus=1,
max_concurrency=2
).remote(
detached_trainer_name_list=[f"trainer_{x}" for x in target_trainers(...)],
model_fn=model_fn,
...)
for i, env_info_maker in enumerate(env_info_makers)
]
```
The names in `detached_trainer_name_list` refer to the target trainers that the maker should send experience to.
A trainer is given its name the same way, via `.options(name="str")`; see below.
### Setup Trainers
- define the trainers' environment variables:
```python
env_info_trainers = [{
'local_rank': '0',
'rank': str(rank),
'world_size': str(num_trainers),
'master_port': trainer_port,
'master_addr': master_addr
} for rank in range(num_trainers)]
```
- define the trainer models:
```python
def trainer_model_fn():
actor = get_actor_from_args(...)
critic = get_critic_from_args(...)
return actor, critic
```
- set `trainer_refs`:
```python
trainer_refs = [
DetachedPPOTrainer.options(
name=f"trainer{i}",
num_gpus=1,
max_concurrency=2
).remote(
experience_maker_holder_name_list=[f"maker{x}" for x in target_makers(...)],
model_fn=trainer_model_fn,
...)
for i, env_info_trainer in enumerate(env_info_trainers)
]
```
The names in `experience_maker_holder_name_list` refer to the target makers that the trainer should send updated models to.
By setting `detached_trainer_name_list` and `experience_maker_holder_name_list`, we can customize the transmission graph, for example:
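For instance, a fully connected 2-maker / 2-trainer graph (names illustrative, following the `maker{i}` / `trainer{i}` convention above) would use:
```python
# passed to every maker: send experience to both trainers
detached_trainer_name_list = ["trainer0", "trainer1"]
# passed to every trainer: push updated models back to both makers
experience_maker_holder_name_list = ["maker0", "maker1"]
```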
### Launch Jobs
- define the data loader:
```python
def data_loader_fn():
return torch.utils.data.DataLoader(dataset=dataset)
```
- launch the makers:
```python
wait_tasks = []
for experience_holder_ref in experience_holder_refs:
wait_tasks.append(
experience_holder_ref.workingloop.remote(data_loader_fn,
num_steps=experience_steps))
```
- launch the trainers:
```python
for trainer_ref in trainer_refs:
wait_tasks.append(trainer_ref.fit.remote(total_steps, update_steps, train_epochs))
```
- wait for completion:
```python
ray.get(wait_tasks)
```
## Flexible Structure
We can deploy different strategies to makers and trainers; here are some possible configurations, with a code sketch below.
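For example, since each side takes its own `strategy_fn`, a maker can run a plain inference strategy while its trainer uses ZeRO-2. A rough sketch using the argument names from the example scripts above (other required arguments omitted):
```python
from functools import partial

# inference-only maker: a naive strategy, optionally with a quantized initial model
maker = ExperienceMakerHolder.options(name="maker0", num_gpus=1, max_concurrency=2).remote(
    detached_trainer_name_list=["trainer0"],
    strategy_fn=partial(get_strategy_from_args, 'naive'),
    model_fn=model_fn,
)

# trainer: ZeRO-2 to shard optimizer states during PPO updates
trainer = DetachedPPOTrainer.options(name="trainer0", num_gpus=1, max_concurrency=2).remote(
    experience_maker_holder_name_list=["maker0"],
    strategy_fn=partial(get_strategy_from_args, 'colossalai_zero2'),
    model_fn=trainer_model_fn,
)
```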
### 2 Makers 1 Trainer
<p align="center">
<img src="https://github.com/hpcaitech/public_assets/blob/main/applications/chat/2m1t.png?raw=true" width=600/>
</p>
### 2 Makers 2 Trainers
<p align="center">
<img src="https://github.com/hpcaitech/public_assets/blob/main/applications/chat/2m2t.png?raw=true" width=600/>
</p>
### Maker Inference Quantization
<p align="center">
<img src="https://github.com/hpcaitech/public_assets/blob/main/applications/chat/2m2t_quantize.png?raw=true" width=600/>
</p>
### Tensor Parallel
<p align="center">
<img src="https://github.com/hpcaitech/public_assets/blob/main/applications/chat/tp_ddp_hybrid.png?raw=true" width=600/>
</p>
## TODO
- [ ] Support LoRA
- [ ] Support TP & PP


@@ -1,2 +0,0 @@
from .src.detached_replay_buffer import DetachedReplayBuffer
from .src.detached_trainer_ppo import DetachedPPOTrainer


@@ -0,0 +1,9 @@
from .base import MakerCallback, TrainerCallback
from .performance_evaluator import ExperienceMakerPerformanceEvaluator, TrainerPerformanceEvaluator
__all__ = [
"TrainerCallback",
"MakerCallback",
"ExperienceMakerPerformanceEvaluator",
"TrainerPerformanceEvaluator",
]


@@ -0,0 +1,66 @@
from abc import ABC
from coati.experience_maker import Experience
class TrainerCallback(ABC):
"""
Base callback class. It defines the interface for callbacks.
"""
def on_fit_start(self) -> None:
pass
def on_fit_end(self) -> None:
pass
def on_episode_start(self, episode: int) -> None:
pass
def on_episode_end(self, episode: int) -> None:
pass
def on_epoch_start(self, epoch: int) -> None:
pass
def on_epoch_end(self, epoch: int) -> None:
pass
def on_batch_start(self) -> None:
pass
def on_batch_end(self, metrics: dict, experience: Experience) -> None:
pass
def on_update_start(self) -> None:
pass
def on_update_end(self) -> None:
pass
class MakerCallback(ABC):
def on_loop_start(self) -> None:
pass
def on_loop_end(self) -> None:
pass
def on_make_experience_start(self) -> None:
pass
def on_make_experience_end(self, experience: Experience) -> None:
pass
def on_send_start(self) -> None:
pass
def on_send_end(self) -> None:
pass
def on_batch_start(self) -> None:
pass
def on_batch_end(self) -> None:
pass


@@ -0,0 +1,212 @@
from time import time
from typing import Optional
import torch
import torch.distributed as dist
from coati.experience_maker import Experience
from .base import MakerCallback, TrainerCallback
def get_world_size() -> int:
if dist.is_initialized():
return dist.get_world_size()
return 1
def print_rank_0(*args, **kwargs) -> None:
if not dist.is_initialized() or dist.get_rank() == 0:
print(*args, **kwargs)
@torch.no_grad()
def all_reduce_mean(x: float, world_size: int) -> float:
if world_size == 1:
return x
tensor = torch.tensor([x], device=torch.cuda.current_device())
dist.all_reduce(tensor)
tensor = tensor / world_size
return tensor.item()
class Timer:
def __init__(self) -> None:
self.start_time: Optional[float] = None
self.duration: float = 0.
def start(self) -> None:
self.start_time = time()
def end(self) -> None:
self.duration += time() - self.start_time
def reset(self) -> None:
self.duration = 0.
class ExperienceMakerPerformanceEvaluator(MakerCallback):
def __init__(self, actor_num_params: int, critic_num_params: int, initial_model_num_params: int,
reward_model_num_params: int) -> None:
super().__init__()
self.world_size = get_world_size()
self.actor_num_params = actor_num_params
self.critic_num_params = critic_num_params
self.initial_model_num_params = initial_model_num_params
self.reward_model_num_params = reward_model_num_params
self.batch_timer = Timer()
self.send_timer = Timer()
self.make_experience_timer = Timer()
self.total_samples: int = 0
self.make_experience_flop: int = 0
print_rank_0(
f'ExperienceMaker actor: {actor_num_params/1024**3:.2f}B, critic: {critic_num_params/1024**3:.2f}B, initial model: {initial_model_num_params/1024**3:.2f}B, reward model: {reward_model_num_params/1024**3:.2f}B, world size: {self.world_size}'
)
def on_make_experience_start(self) -> None:
self.make_experience_timer.start()
def on_make_experience_end(self, experience: Experience) -> None:
self.make_experience_timer.end()
batch_size, seq_len = experience.sequences.shape
self.total_samples += batch_size
# actor generate
num_actions = experience.action_mask.size(1)
input_len = seq_len - num_actions
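# generation step t attends over (input_len + t) tokens; summing over num_actions
# steps gives the arithmetic series (input_len + seq_len - 1) * num_actions / 2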
total_seq_len = (input_len + seq_len - 1) * num_actions / 2
self.make_experience_flop += self.actor_num_params * batch_size * total_seq_len * 2
# actor forward
self.make_experience_flop += self.actor_num_params * batch_size * seq_len * 2
# critic forward
self.make_experience_flop += self.critic_num_params * batch_size * seq_len * 2
# initial model forward
self.make_experience_flop += self.initial_model_num_params * batch_size * seq_len * 2
# reward model forward
self.make_experience_flop += self.reward_model_num_params * batch_size * seq_len * 2
def on_send_start(self) -> None:
self.send_timer.start()
def on_send_end(self) -> None:
self.send_timer.end()
def on_batch_start(self) -> None:
self.batch_timer.start()
def on_batch_end(self) -> None:
self.batch_timer.end()
def on_loop_end(self) -> None:
avg_make_experience_duration = all_reduce_mean(self.make_experience_timer.duration, self.world_size)
avg_overall_duration = all_reduce_mean(self.batch_timer.duration, self.world_size)
avg_send_duration = all_reduce_mean(self.send_timer.duration, self.world_size)
avg_throughput = self.total_samples * self.world_size / (avg_overall_duration + 1e-12)
avg_make_experience_tflops = self.make_experience_flop / 1e12 / (avg_make_experience_duration + 1e-12)
avg_time_per_sample = (avg_overall_duration + 1e-12) / (self.total_samples * self.world_size)
avg_make_experience_time_per_sample = (avg_make_experience_duration + 1e-12) / \
(self.total_samples * self.world_size)
avg_send_time_per_sample = (avg_send_duration + 1e-12) / (self.total_samples * self.world_size)
print_rank_0(
'Making Experience Performance Summary:\n' + f'Throughput: {avg_throughput:.3f} samples/sec\n' +
f'TFLOPS per GPU: {avg_make_experience_tflops:.3f}\n' +
f'Sample time (overall): {avg_time_per_sample:.3f} s\n' +
f'Sample time (make experience): {avg_make_experience_time_per_sample:.3f} s, {avg_make_experience_time_per_sample/avg_time_per_sample*100:.2f}%\n'
+
f'Sample time (send): {avg_send_time_per_sample:.3f} s, {avg_send_time_per_sample/avg_time_per_sample*100:.2f}%\n'
)
class TrainerPerformanceEvaluator(TrainerCallback):
def __init__(self,
actor_num_params: int,
critic_num_params: int,
enable_grad_checkpoint: bool = False,
ignore_first_episodes: int = 1) -> None:
super().__init__()
self.world_size = get_world_size()
self.actor_num_params = actor_num_params
self.critic_num_params = critic_num_params
self.enable_grad_checkpoint = enable_grad_checkpoint
self.ignore_first_episodes = ignore_first_episodes
self.ignore_this_episode = False
self.episode_timer = Timer()
self.batch_timer = Timer()
self.update_timer = Timer()
self.total_samples: int = 0
self.learn_flop: int = 0
print_rank_0(
f'Trainer actor: {self.actor_num_params/1024**3:.2f}B, critic: {self.critic_num_params/1024**3:.2f}B, world size: {self.world_size}'
)
def on_episode_start(self, episodes: int) -> None:
self.ignore_this_episode = episodes < self.ignore_first_episodes
if self.ignore_this_episode:
return
self.episode_timer.start()
def on_episode_end(self, episodes: int) -> None:
if self.ignore_this_episode:
return
self.episode_timer.end()
def on_batch_start(self) -> None:
if self.ignore_this_episode:
return
self.batch_timer.start()
def on_batch_end(self, metrics: dict, experience: Experience) -> None:
if self.ignore_this_episode:
return
self.batch_timer.end()
batch_size, seq_len = experience.sequences.shape
self.total_samples += batch_size
# actor forward-backward, 3 means forward(1) + backward(2)
self.learn_flop += self.actor_num_params * batch_size * seq_len * 2 * (3 + int(self.enable_grad_checkpoint))
# critic forward-backward
self.learn_flop += self.critic_num_params * batch_size * seq_len * 2 * (3 + int(self.enable_grad_checkpoint))
def on_update_start(self) -> None:
if self.ignore_this_episode:
return
self.update_timer.start()
def on_update_end(self) -> None:
if self.ignore_this_episode:
return
self.update_timer.end()
def on_fit_end(self) -> None:
if self.total_samples == 0:
print_rank_0('No samples are collected, skip trainer performance evaluation')
return
avg_train_duration = all_reduce_mean(self.batch_timer.duration, self.world_size)
avg_update_duration = all_reduce_mean(self.update_timer.duration, self.world_size)
avg_episode_duration = all_reduce_mean(self.episode_timer.duration, self.world_size)
avg_throughput = self.total_samples * self.world_size / (avg_episode_duration + 1e-12)
avg_learn_tflops = self.learn_flop / 1e12 / (avg_train_duration + 1e-12)
avg_time_per_sample = (avg_episode_duration + 1e-12) / (self.total_samples * self.world_size)
avg_train_time_per_sample = (avg_train_duration + 1e-12) / (self.total_samples * self.world_size)
avg_update_time_per_sample = (avg_update_duration + 1e-12) / (self.total_samples * self.world_size)
print_rank_0(
'Learning Performance Summary:\n' + f'Throughput: {avg_throughput:.3f} samples/sec\n' +
f'TFLOPS per GPU: {avg_learn_tflops:.3f}\n' + f'Sample time (overall): {avg_time_per_sample:.3f} s\n' +
f'Sample time (train): {avg_train_time_per_sample:.3f} s, {avg_train_time_per_sample/avg_time_per_sample*100:.2f}%\n'
+
f'Sample time (update): {avg_update_time_per_sample:.3f} s, {avg_update_time_per_sample/avg_time_per_sample*100:.2f}%\n'
)


@@ -1,22 +1,24 @@
-import torch
+import asyncio
+import copy
import random
-from typing import List, Any
+from threading import Lock
+from typing import Any, List
+import ray
+import torch
+from coati.experience_maker.base import Experience
+from coati.replay_buffer import ReplayBuffer
+from coati.replay_buffer.utils import BufferItem, make_experience_batch, split_experience_batch
# from torch.multiprocessing import Queue
from ray.util.queue import Queue
-import ray
-import asyncio
-from coati.experience_maker.base import Experience
-from coati.replay_buffer.utils import BufferItem, make_experience_batch, split_experience_batch
-from coati.replay_buffer import ReplayBuffer
-from threading import Lock
-import copy
class DetachedReplayBuffer:
'''
Detached replay buffer. Share Experience across workers on the same node.
Therefore a trainer node is expected to have only one instance.
It is ExperienceMakerHolder's duty to call append(exp) method, remotely.
Args:
sample_batch_size: Batch size when sampling. Exp won't enqueue until they formed a batch.
tp_world_size: Number of workers in the same tp group
@@ -24,31 +26,25 @@ class DetachedReplayBuffer:
cpu_offload: Whether to offload experience to cpu when sampling. Defaults to True.
'''
-def __init__(self, sample_batch_size: int, tp_world_size: int = 1, limit : int = 0, cpu_offload: bool = True) -> None:
-self.cpu_offload = cpu_offload
+def __init__(self, sample_batch_size: int, limit: int = 0) -> None:
self.sample_batch_size = sample_batch_size
self.limit = limit
-self.items = Queue(self.limit, actor_options={"num_cpus":1})
-self.batch_collector : List[BufferItem] = []
-'''
-Workers in the same tp group share this buffer and need same sample for one step.
-Therefore a held_sample should be returned tp_world_size times before it could be dropped.
-worker_state records wheter a worker got the held_sample
-'''
-self.tp_world_size = tp_world_size
-self.worker_state = [False] * self.tp_world_size
-self.held_sample = None
-self._worker_state_lock = Lock()
+self.items = Queue(self.limit, actor_options={"num_cpus": 1})
+self.batch_collector: List[BufferItem] = []
@torch.no_grad()
def append(self, experience: Experience) -> None:
'''
Expected to be called remotely.
'''
-if self.cpu_offload:
-experience.to_device(torch.device('cpu'))
items = split_experience_batch(experience)
self.extend(items)
@torch.no_grad()
def extend(self, items: List[BufferItem]) -> None:
'''
Expected to be called remotely.
'''
self.batch_collector.extend(items)
while len(self.batch_collector) >= self.sample_batch_size:
items = self.batch_collector[:self.sample_batch_size]
@@ -62,19 +58,10 @@
self.items = Queue(self.limit)
-self.worker_state = [False] * self.tp_world_size
self.batch_collector = []
-@torch.no_grad()
-def sample(self, worker_rank = 0, to_device = "cpu") -> Experience:
-self._worker_state_lock.acquire()
-if not any(self.worker_state):
-self.held_sample = self._sample_and_erase()
-self.worker_state[worker_rank] = True
-if all(self.worker_state):
-self.worker_state = [False] * self.tp_world_size
-ret = self.held_sample
-else:
-ret = copy.deepcopy(self.held_sample)
-self._worker_state_lock.release()
+def sample(self, worker_rank=0, to_device="cpu") -> Experience:
+ret = self._sample_and_erase()
ret.to_device(to_device)
return ret
@@ -85,4 +72,4 @@
def get_length(self) -> int:
ret = self.items.qsize()
return ret


@@ -0,0 +1,179 @@
import os
from abc import ABC, abstractmethod
from typing import Any, Callable, Dict, Iterable, List, Optional, Union
import ray
import torch
from coati.experience_maker import Experience
from coati.replay_buffer.utils import BufferItem
from torch.utils.data import DataLoader
from tqdm import tqdm
from .callbacks import TrainerCallback
from .detached_replay_buffer import DetachedReplayBuffer
from .utils import is_rank_0
class DetachedTrainer(ABC):
'''
Base class for detached RLHF trainers.
'Detached' means that the experience maker runs separately from the trainer, unlike a normal Trainer.
Please set the name attribute during init:
>>> trainer = DetachedTrainer.options(..., name = "xxx", ...).remote()
so that an ExperienceMakerHolder can reach the detached_replay_buffer by the Actor's name.
Args:
experience_maker_holder_name_list (List[str]): names of the ExperienceMakerHolder actors to push updated models to
train_batch_size (int, defaults to 8): batch size used when sampling from the replay buffer
buffer_limit (int, defaults to 0): max size of the internal replay buffer queue (0 means unbounded)
dataloader_pin_memory (bool, defaults to True): whether to pin memory for data loader
callbacks (List[TrainerCallback], defaults to []): the callbacks to call during training process
debug (bool, defaults to False): whether to print verbose debugging information
'''
def __init__(self,
experience_maker_holder_name_list: List[str],
train_batch_size: int = 8,
buffer_limit: int = 0,
dataloader_pin_memory: bool = True,
callbacks: List[TrainerCallback] = [],
debug: bool = False) -> None:
super().__init__()
self.detached_replay_buffer = DetachedReplayBuffer(train_batch_size, limit=buffer_limit)
self.dataloader_pin_memory = dataloader_pin_memory
self.callbacks = callbacks
self.target_holder_name_list = experience_maker_holder_name_list
self.target_holder_list = []
self._is_target_holder_initialized = False
self._debug = debug
def update_target_holder_list(self):
# target_holder_list may legitimately be empty, so initialization is tracked with a bool flag instead of checking its length
if not self._is_target_holder_initialized:
for name in self.target_holder_name_list:
self.target_holder_list.append(ray.get_actor(name, namespace=os.environ["RAY_NAMESPACE"]))
self._is_target_holder_initialized = True
@abstractmethod
def _update_remote_makers(self, fully_update: bool = False, **kwargs):
pass
def sync_models_to_remote_makers(self, **kwargs):
self._update_remote_makers(fully_update=True, **kwargs)
@abstractmethod
def training_step(self, experience: Experience) -> Dict[str, Any]:
pass
def _learn(self, update_steps: int, train_epochs: int) -> None:
data = []
# warmup
pbar = tqdm(range(update_steps), desc=f'Train epoch [1/{train_epochs}]', disable=not is_rank_0())
self._on_epoch_start(0)
self._learn_epoch(pbar, data)
self._on_epoch_end(0)
# item is already a batch
dataloader = DataLoader(data,
batch_size=1,
shuffle=True,
pin_memory=self.dataloader_pin_memory,
collate_fn=lambda x: x[0])
for epoch in range(1, train_epochs):
pbar = tqdm(dataloader, desc=f'Train epoch [{epoch + 1}/{train_epochs}]', disable=not is_rank_0())
self._on_epoch_start(epoch)
self._learn_epoch(pbar, data)
self._on_epoch_end(epoch)
def _learn_epoch(self, pbar: tqdm, data: List[Experience]) -> None:
is_warmup = len(data) == 0
for x in pbar:
if self._debug:
print("[trainer] training step")
# sample a batch and then train to avoid waiting
experience = x if not is_warmup else self._buffer_sample()
experience.to_device(torch.cuda.current_device())
self._on_batch_start()
metrics = self.training_step(experience)
self._on_batch_end(metrics, experience)
if self._debug:
print("[trainer] step over")
experience.to_device("cpu")
if is_warmup:
data.append(experience)
pbar.set_postfix(metrics)
def fit(self, total_steps: int, update_steps: int, train_epochs: int = 1) -> None:
self._on_fit_start()
for i in tqdm(range(total_steps // update_steps), desc='Trainer', disable=not is_rank_0()):
self._on_episode_start(i)
self._learn(update_steps, train_epochs)
self._on_update_start()
self._update_remote_makers()
self._on_update_end()
self._on_episode_end(i)
self._on_fit_end()
@ray.method(concurrency_group="buffer_length")
def buffer_get_length(self):
# called by ExperienceMakerHolder
if self._debug:
print("[trainer] telling length")
return self.detached_replay_buffer.get_length()
@ray.method(concurrency_group="buffer_append")
def buffer_append(self, experience: Experience):
# called by ExperienceMakerHolder
if self._debug:
print(f"[trainer] receiving exp.")
self.detached_replay_buffer.append(experience)
@ray.method(concurrency_group="buffer_append")
def buffer_extend(self, items: List[BufferItem]):
# called by ExperienceMakerHolder
if self._debug:
print(f"[trainer] receiving exp.")
self.detached_replay_buffer.extend(items)
@ray.method(concurrency_group="buffer_sample")
def _buffer_sample(self):
return self.detached_replay_buffer.sample()
def _on_fit_start(self) -> None:
for callback in self.callbacks:
callback.on_fit_start()
def _on_fit_end(self) -> None:
for callback in self.callbacks:
callback.on_fit_end()
def _on_episode_start(self, episode: int) -> None:
for callback in self.callbacks:
callback.on_episode_start(episode)
def _on_episode_end(self, episode: int) -> None:
for callback in self.callbacks:
callback.on_episode_end(episode)
def _on_epoch_start(self, epoch: int) -> None:
for callback in self.callbacks:
callback.on_epoch_start(epoch)
def _on_epoch_end(self, epoch: int) -> None:
for callback in self.callbacks:
callback.on_epoch_end(epoch)
def _on_batch_start(self) -> None:
for callback in self.callbacks:
callback.on_batch_start()
def _on_batch_end(self, metrics: dict, experience: Experience) -> None:
for callback in self.callbacks:
callback.on_batch_end(metrics, experience)
def _on_update_start(self) -> None:
for callback in self.callbacks:
callback.on_update_start()
def _on_update_end(self) -> None:
for callback in self.callbacks:
callback.on_update_end()

View File

@ -1,24 +1,38 @@
from typing import Any, Callable, Dict, List, Optional
import torch
from torch.optim import Adam
from typing import Any, Callable, Dict, List, Optional, Tuple
import ray
import torch
from coati.experience_maker import Experience, NaiveExperienceMaker
from coati.models.base import Actor, Critic
from coati.models.generation_utils import update_model_kwargs_fn
from coati.models.loss import PolicyLoss, ValueLoss
from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy, Strategy
from coati.trainer.callbacks import Callback
from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy, Strategy
from torch.optim import Adam
from colossalai.nn.optimizer import HybridAdam
import ray
from .utils import is_rank_0, get_cuda_actor_critic_from_args, get_strategy_from_args, set_dist_env
from .callbacks import TrainerCallback, TrainerPerformanceEvaluator
from .detached_trainer_base import DetachedTrainer
from .lora_constructor import LoRAConstructor
from .utils import (
get_actor_from_args,
get_critic_from_args,
get_model_numel,
get_rank,
get_strategy_from_args,
is_rank_0,
set_dist_env,
state_dict_to,
)
@ray.remote(concurrency_groups={"buffer_length": 1, "buffer_append":1, "buffer_sample":1,"model_io": 1, "compute": 1})
@ray.remote(concurrency_groups={
"buffer_length": 1,
"buffer_append": 1,
"buffer_sample": 1,
"model_io": 1,
"compute": 1
})
class DetachedPPOTrainer(DetachedTrainer):
'''
Detached Trainer for PPO algorithm
@ -40,86 +54,102 @@ class DetachedPPOTrainer(DetachedTrainer):
generate_kwargs (dict, optional): the kwargs to use while model generating
'''
def __init__(self,
experience_maker_holder_name_list: List[str],
strategy: str,
model: str,
env_info: Dict[str, str] = None,
pretrained: str = None,
lora_rank: int = 0,
train_batch_size: int = 8,
buffer_limit: int = 0,
buffer_cpu_offload: bool = True,
eps_clip: float = 0.2,
value_clip: float = 0.4,
experience_batch_size: int = 8,
max_epochs: int = 10,
dataloader_pin_memory: bool = True,
callbacks: List[Callback] = [],
**generate_kwargs) -> None:
def __init__(
self,
experience_maker_holder_name_list: List[str],
strategy_fn: Callable[[], Strategy],
model_fn: Callable[[], Tuple[Actor, Critic]],
env_info: Dict[str, str] = None,
train_batch_size: int = 8,
buffer_limit: int = 0,
eps_clip: float = 0.2,
value_clip: float = 0.4,
dataloader_pin_memory: bool = True,
callbacks: List[TrainerCallback] = [],
eval_performance: bool = False,
debug: bool = False,
update_lora_weights: bool = False,
) -> None:
# set environment variables
if env_info:
set_dist_env(env_info=env_info)
# configure strategy
self.strategy = get_strategy_from_args(strategy)
self.strategy = strategy_fn()
# configure models, loss and optimizers
with self.strategy.model_init_context():
self.actor, self.critic = get_cuda_actor_critic_from_args(model, pretrained, lora_rank)
self.actor, self.critic = model_fn()
if strategy != 'colossalai_gemini':
self.actor.to(torch.float16).to(torch.cuda.current_device())
self.critic.to(torch.float16).to(torch.cuda.current_device())
if eval_performance:
actor_numel = get_model_numel(self.actor)
critic_numel = get_model_numel(self.critic)
evaluator = TrainerPerformanceEvaluator(actor_numel, critic_numel)
callbacks = callbacks + [evaluator]
if strategy.startswith('colossalai'):
self.actor_optim = HybridAdam(self.actor.parameters(), lr=5e-6)
self.critic_optim = HybridAdam(self.critic.parameters(), lr=5e-6)
if isinstance(self.strategy, ColossalAIStrategy):
self.actor_optim = HybridAdam(self.actor.parameters(), lr=1e-7)
self.critic_optim = HybridAdam(self.critic.parameters(), lr=1e-7)
else:
self.actor_optim = Adam(self.actor.parameters(), lr=5e-6)
self.critic_optim = Adam(self.critic.parameters(), lr=5e-6)
self.actor_optim = Adam(self.actor.parameters(), lr=1e-7)
self.critic_optim = Adam(self.critic.parameters(), lr=1e-7)
(self.actor, self.actor_optim), (self.critic, self.critic_optim) = \
self.strategy.prepare((self.actor, self.actor_optim), (self.critic, self.critic_optim))
generate_kwargs = _set_default_generate_kwargs(self.strategy, generate_kwargs, self.actor)
# configure trainer
self.actor_loss_fn = PolicyLoss(eps_clip)
self.critic_loss_fn = ValueLoss(value_clip)
super().__init__(experience_maker_holder_name_list,
train_batch_size=train_batch_size,
buffer_limit=buffer_limit,
buffer_cpu_offload=buffer_cpu_offload,
experience_batch_size=experience_batch_size,
max_epochs=max_epochs,
dataloader_pin_memory=dataloader_pin_memory,
callbacks=callbacks,
**generate_kwargs)
debug=debug)
if self._debug:
print(f'[trainer{get_rank()}] will send state dict to {experience_maker_holder_name_list}')
self._update_lora_weights = update_lora_weights
@ray.method(concurrency_group="model_io")
def _update_remote_makers(self):
@torch.no_grad()
def _update_remote_makers(self, fully_update: bool = False, **config):
# TODO: balance duties
if is_rank_0():
self.update_target_holder_list(self.target_holder_name_list)
if not fully_update:
config['requires_grad_only'] = True
self.update_target_holder_list()
# mark start, ensure order
tasks = []
for target_holder in self.target_holder_list:
tasks.append(target_holder.update_experience_maker.remote(chunk_start=True, fully_update=fully_update))
ray.get(tasks)
# sending loop
tasks = []
for state_dict_shard in self._get_model_state_dict_shard(self.actor, fully_update=fully_update, **config):
for target_holder in self.target_holder_list:
# TODO: reduce malloc
with torch.no_grad():
ray.get(target_holder.update_experience_maker.remote(self._get_unwrapped_actor(), self._get_unwrapped_critic()))
@ray.method(concurrency_group="model_io")
def initialize_remote_makers(self):
# TODO: balance duties
if is_rank_0():
self.update_target_holder_list(self.target_holder_name_list)
tasks.append(
target_holder.update_experience_maker.remote(
new_actor_state_dict=state_dict_shard,
new_actor_lora_config_dict=self._get_model_lora_config_dict(self.actor),
fully_update=fully_update))
# sending loop
for state_dict_shard in self._get_model_state_dict_shard(self.critic, fully_update=fully_update, **config):
for target_holder in self.target_holder_list:
# TODO: reduce malloc
with torch.no_grad():
ray.get(target_holder.initialize_experience_maker.remote(self._get_unwrapped_actor(), self._get_unwrapped_critic()))
tasks.append(
target_holder.update_experience_maker.remote(
new_critic_state_dict=state_dict_shard,
new_critic_lora_config_dict=self._get_model_lora_config_dict(self.critic),
fully_update=fully_update))
ray.get(tasks)
# mark end
for target_holder in self.target_holder_list:
target_holder.update_experience_maker.remote(chunk_end=True, fully_update=fully_update)
@ray.method(concurrency_group="compute")
def training_step(self, experience: Experience) -> Dict[str, float]:
self.actor.train()
self.critic.train()
experience.to_device(torch.cuda.current_device())
num_actions = experience.action_mask.size(1)
action_log_probs = self.actor(experience.sequences, num_actions, attention_mask=experience.attention_mask)
actor_loss = self.actor_loss_fn(action_log_probs,
@ -155,38 +185,16 @@ class DetachedPPOTrainer(DetachedTrainer):
def strategy_save_critic_optim(self, path: str, only_rank0: bool = False) -> None:
self.strategy.save_optimizer(self.critic_optim, path, only_rank0)
def _get_unwrapped_actor(self):
if False:
pass
elif isinstance(self.strategy, ColossalAIStrategy):
ret = Actor(self.strategy._unwrap_model(self.actor))
return ret
elif isinstance(self.strategy, DDPStrategy):
return Actor(self.strategy._unwrap_actor(self.actor))
elif isinstance(self.strategy, NaiveStrategy):
return self.actor
def _get_model_state_dict_shard(self, model: torch.nn.Module, fully_update=False, **config):
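# when syncing only LoRA weights outside a full update, keep just the lora_A/lora_B
# tensors of each shard before shipping it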
for state_dict in self.strategy.get_model_state_dict_shard(model, **config):
if not self._update_lora_weights or fully_update:
yield state_dict_to(state_dict)
else:
state_dict_lora, _ = LoRAConstructor.filter_state_dict_lora(state_dict)
yield state_dict_to(state_dict_lora)
def _get_unwrapped_critic(self):
if False:
pass
elif isinstance(self.strategy, ColossalAIStrategy):
ret = self.strategy._unwrap_model(self.critic)
return ret
elif isinstance(self.strategy, DDPStrategy):
return self.critic.module
elif isinstance(self.strategy, NaiveStrategy):
return self.critic
def _set_default_generate_kwargs(strategy: Strategy, generate_kwargs: dict, actor: Actor) -> None:
origin_model = strategy._unwrap_actor(actor)
new_kwargs = {**generate_kwargs}
# use huggingface models method directly
if 'prepare_inputs_fn' not in generate_kwargs and hasattr(origin_model, 'prepare_inputs_for_generation'):
new_kwargs['prepare_inputs_fn'] = origin_model.prepare_inputs_for_generation
if 'update_model_kwargs_fn' not in generate_kwargs:
new_kwargs['update_model_kwargs_fn'] = update_model_kwargs_fn
return new_kwargs
def _get_model_lora_config_dict(self, model: torch.nn.Module):
if not self._update_lora_weights:
return None
unwrapped_model = self.strategy.unwrap_model(model)
return LoRAConstructor.extract_lora_config(unwrapped_model)

View File

@ -1,153 +0,0 @@
import argparse
from copy import deepcopy
import pandas as pd
import torch
from coati.trainer import PPOTrainer
from coati.ray.src.experience_maker_holder import ExperienceMakerHolder
from coati.ray.src.detached_trainer_ppo import DetachedPPOTrainer
from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
from coati.experience_maker import NaiveExperienceMaker
from torch.optim import Adam
from transformers import AutoTokenizer, BloomTokenizerFast
from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
from colossalai.nn.optimizer import HybridAdam
import ray
import os
import socket
def get_free_port():
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(('', 0))
return s.getsockname()[1]
def get_local_ip():
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
s.connect(('8.8.8.8', 80))
return s.getsockname()[0]
def main(args):
master_addr = str(get_local_ip())
# trainer_env_info
trainer_port = str(get_free_port())
env_info_trainer = {'local_rank' : '0',
'rank' : '0',
'world_size' : '1',
'master_port' : trainer_port,
'master_addr' : master_addr}
# maker_env_info
maker_port = str(get_free_port())
env_info_maker = {'local_rank' : '0',
'rank' : '0',
'world_size' : '1',
'master_port' : maker_port,
'master_addr' : master_addr}
# configure tokenizer
if args.model == 'gpt2':
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
elif args.model == 'bloom':
tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain)
tokenizer.pad_token = tokenizer.eos_token
elif args.model == 'opt':
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
else:
raise ValueError(f'Unsupported model "{args.model}"')
# configure Trainer
trainer_ref = DetachedPPOTrainer.options(name="trainer1", num_gpus=1, max_concurrency=2).remote(
experience_maker_holder_name_list=["maker1"],
strategy=args.trainer_strategy,
model=args.model,
env_info = env_info_trainer,
pretrained=args.pretrain,
lora_rank=args.lora_rank,
train_batch_size=args.train_batch_size,
buffer_limit=16,
experience_batch_size=args.experience_batch_size,
max_epochs=args.max_epochs,
#kwargs:
max_length=128,
do_sample=True,
temperature=1.0,
top_k=50,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=tokenizer.eos_token_id,
debug=args.debug,
)
# configure Experience Maker
experience_holder_ref = ExperienceMakerHolder.options(name="maker1", num_gpus=1, max_concurrency=2).remote(
detached_trainer_name_list=["trainer1"],
strategy=args.maker_strategy,
env_info = env_info_maker,
experience_batch_size=args.experience_batch_size,
kl_coef=0.1,
#kwargs:
max_length=128,
do_sample=True,
temperature=1.0,
top_k=50,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=tokenizer.eos_token_id,
debug=args.debug,
)
# trainer send its actor and critic to experience holders.
ray.get(trainer_ref.initialize_remote_makers.remote())
# configure sampler
dataset = pd.read_csv(args.prompt_path)['prompt']
def tokenize_fn(texts):
# MUST pad to max length to ensure inputs on all ranks have the same length
# Different lengths may hang gemini, since ranks would run different numbers of generation steps
batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True)
return {k: v.cuda() for k, v in batch.items()}
trainer_done_ref = trainer_ref.fit.remote(num_episodes=args.num_episodes, max_timesteps=args.max_timesteps, update_timesteps=args.update_timesteps)
num_exp_per_maker = args.num_episodes * args.max_timesteps // args.update_timesteps * args.max_epochs + 3 # +3 for fault tolerance
maker_done_ref = experience_holder_ref.workingloop.remote(dataset, tokenize_fn, times=num_exp_per_maker)
ray.get([trainer_done_ref, maker_done_ref])
# save model checkpoint after fitting
trainer_ref.strategy_save_actor.remote(args.save_path, only_rank0=True)
# save optimizer checkpoint on all ranks
if args.need_optim_ckpt:
trainer_ref.strategy_save_actor_optim.remote('actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()),
only_rank0=False)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('prompt_path')
parser.add_argument('--trainer_strategy',
choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
default='naive')
parser.add_argument('--maker_strategy',
choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
default='naive')
parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt'])
parser.add_argument('--pretrain', type=str, default=None)
parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts.pt')
parser.add_argument('--need_optim_ckpt', type=bool, default=False)
parser.add_argument('--num_episodes', type=int, default=10)
parser.add_argument('--max_timesteps', type=int, default=10)
parser.add_argument('--update_timesteps', type=int, default=10)
parser.add_argument('--max_epochs', type=int, default=5)
parser.add_argument('--train_batch_size', type=int, default=8)
parser.add_argument('--experience_batch_size', type=int, default=8)
parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
parser.add_argument('--debug', action='store_true')
args = parser.parse_args()
ray.init(namespace=os.environ["RAY_NAMESPACE"])
main(args)

View File

@ -1,23 +0,0 @@
set_n_least_used_CUDA_VISIBLE_DEVICES() {
local n=${1:-"9999"}
echo "GPU Memory Usage:"
local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \
| tail -n +2 \
| nl -v 0 \
| tee /dev/tty \
| sort -g -k 2 \
| awk '{print $1}' \
| head -n $n)
export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
echo "Now CUDA_VISIBLE_DEVICES is set to:"
echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
}
set_n_least_used_CUDA_VISIBLE_DEVICES 2
export RAY_NAMESPACE="admin"
python 1m1t.py "/path/to/prompts.csv" \
--trainer_strategy colossalai_zero2 --maker_strategy naive --lora_rank 2 --pretrain "facebook/opt-350m" --model 'opt' \
--num_episodes 10 --max_timesteps 10 --update_timesteps 10 \
--max_epochs 10 --debug

View File

@ -1,186 +0,0 @@
import argparse
from copy import deepcopy
import pandas as pd
import torch
from coati.trainer import PPOTrainer
from coati.ray.src.experience_maker_holder import ExperienceMakerHolder
from coati.ray.src.detached_trainer_ppo import DetachedPPOTrainer
from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
from coati.experience_maker import NaiveExperienceMaker
from torch.optim import Adam
from transformers import AutoTokenizer, BloomTokenizerFast
from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
from colossalai.nn.optimizer import HybridAdam
import ray
import os
import socket
def get_free_port():
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(('', 0))
return s.getsockname()[1]
def get_local_ip():
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
s.connect(('8.8.8.8', 80))
return s.getsockname()[0]
def main(args):
master_addr = str(get_local_ip())
# trainer_env_info
trainer_port = str(get_free_port())
env_info_trainer_1 = {'local_rank' : '0',
'rank' : '0',
'world_size' : '2',
'master_port' : trainer_port,
'master_addr' : master_addr}
env_info_trainer_2 = {'local_rank' : '0',
'rank' : '1',
'world_size' : '2',
'master_port' : trainer_port,
'master_addr' : master_addr}
# maker_env_info
maker_port = str(get_free_port())
env_info_maker_1 = {'local_rank' : '0',
'rank' : '0',
'world_size' : '2',
'master_port' : maker_port,
'master_addr' : master_addr}
print([env_info_trainer_1,
env_info_trainer_2,
env_info_maker_1])
ray.init(dashboard_port = 1145)
# configure tokenizer
if args.model == 'gpt2':
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
elif args.model == 'bloom':
tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain)
tokenizer.pad_token = tokenizer.eos_token
elif args.model == 'opt':
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
else:
raise ValueError(f'Unsupported model "{args.model}"')
# configure Trainer
trainer_1_ref = DetachedPPOTrainer.options(name="trainer1", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2).remote(
experience_maker_holder_name_list=["maker1"],
strategy=args.trainer_strategy,
model=args.model,
env_info=env_info_trainer_1,
pretrained=args.pretrain,
lora_rank=args.lora_rank,
train_batch_size=args.train_batch_size,
buffer_limit=16,
experience_batch_size=args.experience_batch_size,
max_epochs=args.max_epochs,
#kwargs:
max_length=128,
do_sample=True,
temperature=1.0,
top_k=50,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=tokenizer.eos_token_id,
debug=args.debug,
)
trainer_2_ref = DetachedPPOTrainer.options(name="trainer2", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2).remote(
experience_maker_holder_name_list=["maker1"],
strategy=args.trainer_strategy,
model=args.model,
env_info=env_info_trainer_2,
pretrained=args.pretrain,
lora_rank=args.lora_rank,
train_batch_size=args.train_batch_size,
buffer_limit=16,
experience_batch_size=args.experience_batch_size,
max_epochs=args.max_epochs,
#kwargs:
max_length=128,
do_sample=True,
temperature=1.0,
top_k=50,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=tokenizer.eos_token_id,
debug= args.debug,
)
# configure Experience Maker
experience_holder_1_ref = ExperienceMakerHolder.options(name="maker1", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2).remote(
detached_trainer_name_list=["trainer1", "trainer2"],
strategy=args.maker_strategy,
env_info=env_info_maker_1,
experience_batch_size=args.experience_batch_size,
kl_coef=0.1,
#kwargs:
max_length=128,
do_sample=True,
temperature=1.0,
top_k=50,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=tokenizer.eos_token_id,
debug=args.debug,
)
# trainer send its actor and critic to experience holders.
# TODO: balance duty
ray.get(trainer_1_ref.initialize_remote_makers.remote())
# configure sampler
dataset = pd.read_csv(args.prompt_path)['prompt']
def tokenize_fn(texts):
# MUST pad to max length to ensure inputs on all ranks have the same length
# Different lengths may hang gemini, since ranks would run different numbers of generation steps
batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True)
return {k: v.cuda() for k, v in batch.items()}
trainer_1_done_ref = trainer_1_ref.fit.remote(num_episodes=args.num_episodes, max_timesteps=args.max_timesteps, update_timesteps=args.update_timesteps)
trainer_2_done_ref = trainer_2_ref.fit.remote(num_episodes=args.num_episodes, max_timesteps=args.max_timesteps, update_timesteps=args.update_timesteps)
num_exp_per_maker = args.num_episodes * args.max_timesteps // args.update_timesteps * args.max_epochs * 2 + 3 # +3 for fault tolerance
maker_1_done_ref = experience_holder_1_ref.workingloop.remote(dataset, tokenize_fn, times=num_exp_per_maker)
ray.get([trainer_1_done_ref, trainer_2_done_ref, maker_1_done_ref])
# save model checkpoint after fitting
trainer_1_ref.strategy_save_actor.remote(args.save_path, only_rank0=True)
trainer_2_ref.strategy_save_actor.remote(args.save_path, only_rank0=True)
# save optimizer checkpoint on all ranks
if args.need_optim_ckpt:
trainer_1_ref.strategy_save_actor_optim.remote('actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()),
only_rank0=False)
trainer_2_ref.strategy_save_actor_optim.remote('actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()),
only_rank0=False)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('prompt_path')
parser.add_argument('--trainer_strategy',
choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
default='naive')
parser.add_argument('--maker_strategy',
choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
default='naive')
parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt'])
parser.add_argument('--pretrain', type=str, default=None)
parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts.pt')
parser.add_argument('--need_optim_ckpt', type=bool, default=False)
parser.add_argument('--num_episodes', type=int, default=10)
parser.add_argument('--max_timesteps', type=int, default=10)
parser.add_argument('--update_timesteps', type=int, default=10)
parser.add_argument('--max_epochs', type=int, default=5)
parser.add_argument('--train_batch_size', type=int, default=8)
parser.add_argument('--experience_batch_size', type=int, default=8)
parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
parser.add_argument('--debug', action='store_true')
args = parser.parse_args()
main(args)

View File

@ -1,23 +0,0 @@
set_n_least_used_CUDA_VISIBLE_DEVICES() {
local n=${1:-"9999"}
echo "GPU Memory Usage:"
local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \
| tail -n +2 \
| nl -v 0 \
| tee /dev/tty \
| sort -g -k 2 \
| awk '{print $1}' \
| head -n $n)
export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
echo "Now CUDA_VISIBLE_DEVICES is set to:"
echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
}
set_n_least_used_CUDA_VISIBLE_DEVICES 2
export RAY_NAMESPACE="admin"
python 1m2t.py "/path/to/prompts.csv" --model gpt2 \
--maker_strategy naive --trainer_strategy ddp --lora_rank 2 \
--num_episodes 10 --max_timesteps 10 --update_timesteps 10 \
--max_epochs 10 #--debug

View File

@ -1,140 +0,0 @@
import argparse
from copy import deepcopy
import pandas as pd
import torch
from coati.trainer import PPOTrainer
from coati.ray.src.experience_maker_holder import ExperienceMakerHolder
from coati.ray.src.detached_trainer_ppo import DetachedPPOTrainer
from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
from coati.experience_maker import NaiveExperienceMaker
from torch.optim import Adam
from transformers import AutoTokenizer, BloomTokenizerFast
from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
from colossalai.nn.optimizer import HybridAdam
import ray
import os
import socket
def main(args):
# configure tokenizer
if args.model == 'gpt2':
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
elif args.model == 'bloom':
tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain)
tokenizer.pad_token = tokenizer.eos_token
elif args.model == 'opt':
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
else:
raise ValueError(f'Unsupported model "{args.model}"')
# configure Trainer
trainer_ref = DetachedPPOTrainer.options(name="trainer1", num_gpus=1, max_concurrency=2).remote(
experience_maker_holder_name_list=["maker1", "maker2"],
strategy=args.trainer_strategy,
model=args.model,
pretrained=args.pretrain,
lora_rank=args.lora_rank,
train_batch_size=args.train_batch_size,
buffer_limit=16,
experience_batch_size=args.experience_batch_size,
max_epochs=args.max_epochs,
#kwargs:
max_length=128,
do_sample=True,
temperature=1.0,
top_k=50,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=tokenizer.eos_token_id,
debug=args.debug,
)
# configure Experience Maker
experience_holder_1_ref = ExperienceMakerHolder.options(name="maker1", num_gpus=1, max_concurrency=2).remote(
detached_trainer_name_list=["trainer1"],
strategy=args.maker_strategy,
experience_batch_size=args.experience_batch_size,
kl_coef=0.1,
#kwargs:
max_length=128,
do_sample=True,
temperature=1.0,
top_k=50,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=tokenizer.eos_token_id,
debug=args.debug,
)
experience_holder_2_ref = ExperienceMakerHolder.options(name="maker2", num_gpus=1, max_concurrency=2).remote(
detached_trainer_name_list=["trainer1"],
strategy=args.maker_strategy,
experience_batch_size=args.experience_batch_size,
kl_coef=0.1,
#kwargs:
max_length=128,
do_sample=True,
temperature=1.0,
top_k=50,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=tokenizer.eos_token_id,
debug=args.debug,
)
# trainer send its actor and critic to experience holders.
ray.get(trainer_ref.initialize_remote_makers.remote())
# configure sampler
dataset = pd.read_csv(args.prompt_path)['prompt']
def tokenize_fn(texts):
# MUST pad to max length to ensure inputs on all ranks have the same length
# Different lengths may hang gemini, since ranks would run different numbers of generation steps
batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True)
return {k: v.cuda() for k, v in batch.items()}
trainer_done_ref = trainer_ref.fit.remote(num_episodes=args.num_episodes, max_timesteps=args.max_timesteps, update_timesteps=args.update_timesteps)
num_exp_per_maker = args.num_episodes * args.max_timesteps // args.update_timesteps * args.max_epochs // 2 + 3 # +3 for fault tolerance
maker_1_done_ref = experience_holder_1_ref.workingloop.remote(dataset, tokenize_fn, times=num_exp_per_maker)
maker_2_done_ref = experience_holder_2_ref.workingloop.remote(dataset, tokenize_fn, times=num_exp_per_maker)
ray.get([trainer_done_ref, maker_1_done_ref, maker_2_done_ref])
# save model checkpoint after fitting
trainer_ref.strategy_save_actor.remote(args.save_path, only_rank0=True)
# save optimizer checkpoint on all ranks
if args.need_optim_ckpt:
trainer_ref.strategy_save_actor_optim.remote('actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()),
only_rank0=False)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('prompt_path')
parser.add_argument('--trainer_strategy',
choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
default='naive')
parser.add_argument('--maker_strategy',
choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
default='naive')
parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt'])
parser.add_argument('--pretrain', type=str, default=None)
parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts.pt')
parser.add_argument('--need_optim_ckpt', type=bool, default=False)
parser.add_argument('--num_episodes', type=int, default=10)
parser.add_argument('--max_timesteps', type=int, default=10)
parser.add_argument('--update_timesteps', type=int, default=10)
parser.add_argument('--max_epochs', type=int, default=5)
parser.add_argument('--train_batch_size', type=int, default=8)
parser.add_argument('--experience_batch_size', type=int, default=8)
parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
parser.add_argument('--debug', action='store_true')
args = parser.parse_args()
ray.init(namespace=os.environ["RAY_NAMESPACE"])
main(args)

View File

@ -1,23 +0,0 @@
set_n_least_used_CUDA_VISIBLE_DEVICES() {
local n=${1:-"9999"}
echo "GPU Memory Usage:"
local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \
| tail -n +2 \
| nl -v 0 \
| tee /dev/tty \
| sort -g -k 2 \
| awk '{print $1}' \
| head -n $n)
export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
echo "Now CUDA_VISIBLE_DEVICES is set to:"
echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
}
set_n_least_used_CUDA_VISIBLE_DEVICES 3
export RAY_NAMESPACE="admin"
python 2m1t.py "/path/to/prompts.csv" \
--trainer_strategy naive --maker_strategy naive --lora_rank 2 --pretrain "facebook/opt-350m" --model 'opt' \
--num_episodes 10 --max_timesteps 10 --update_timesteps 10 \
--max_epochs 10 # --debug

View File

@ -1,209 +0,0 @@
import argparse
from copy import deepcopy
import pandas as pd
import torch
from coati.trainer import PPOTrainer
from coati.ray.src.experience_maker_holder import ExperienceMakerHolder
from coati.ray.src.detached_trainer_ppo import DetachedPPOTrainer
from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
from coati.experience_maker import NaiveExperienceMaker
from torch.optim import Adam
from transformers import AutoTokenizer, BloomTokenizerFast
from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
from colossalai.nn.optimizer import HybridAdam
import ray
import os
import socket
def get_free_port():
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(('', 0))
return s.getsockname()[1]
def get_local_ip():
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
s.connect(('8.8.8.8', 80))
return s.getsockname()[0]
def main(args):
master_addr = str(get_local_ip())
# trainer_env_info
trainer_port = str(get_free_port())
env_info_trainer_1 = {'local_rank' : '0',
'rank' : '0',
'world_size' : '2',
'master_port' : trainer_port,
'master_addr' : master_addr}
env_info_trainer_2 = {'local_rank' : '0',
'rank' : '1',
'world_size' : '2',
'master_port' : trainer_port,
'master_addr' : master_addr}
# maker_env_info
maker_port = str(get_free_port())
env_info_maker_1 = {'local_rank' : '0',
'rank' : '0',
'world_size' : '2',
'master_port' : maker_port,
'master_addr' : master_addr}
env_info_maker_2 = {'local_rank' : '0',
'rank' : '1',
'world_size' : '2',
'master_port': maker_port,
'master_addr' : master_addr}
print([env_info_trainer_1,
env_info_trainer_2,
env_info_maker_1,
env_info_maker_2])
ray.init()
# configure tokenizer
if args.model == 'gpt2':
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
elif args.model == 'bloom':
tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain)
tokenizer.pad_token = tokenizer.eos_token
elif args.model == 'opt':
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
else:
raise ValueError(f'Unsupported model "{args.model}"')
# configure Trainer
trainer_1_ref = DetachedPPOTrainer.options(name="trainer1", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2).remote(
experience_maker_holder_name_list=["maker1", "maker2"],
strategy=args.trainer_strategy,
model=args.model,
env_info=env_info_trainer_1,
pretrained=args.pretrain,
lora_rank=args.lora_rank,
train_batch_size=args.train_batch_size,
buffer_limit=16,
experience_batch_size=args.experience_batch_size,
max_epochs=args.max_epochs,
#kwargs:
max_length=128,
do_sample=True,
temperature=1.0,
top_k=50,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=tokenizer.eos_token_id,
debug=args.debug,
)
trainer_2_ref = DetachedPPOTrainer.options(name="trainer2", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2).remote(
experience_maker_holder_name_list=["maker1", "maker2"],
strategy=args.trainer_strategy,
model=args.model,
env_info=env_info_trainer_2,
pretrained=args.pretrain,
lora_rank=args.lora_rank,
train_batch_size=args.train_batch_size,
buffer_limit=16,
experience_batch_size=args.experience_batch_size,
max_epochs=args.max_epochs,
#kwargs:
max_length=128,
do_sample=True,
temperature=1.0,
top_k=50,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=tokenizer.eos_token_id,
debug=args.debug,
)
# configure Experience Maker
experience_holder_1_ref = ExperienceMakerHolder.options(name="maker1", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2).remote(
detached_trainer_name_list=["trainer1", "trainer2"],
strategy=args.maker_strategy,
env_info=env_info_maker_1,
experience_batch_size=args.experience_batch_size,
kl_coef=0.1,
#kwargs:
max_length=128,
do_sample=True,
temperature=1.0,
top_k=50,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=tokenizer.eos_token_id,
debug=args.debug,
)
experience_holder_2_ref = ExperienceMakerHolder.options(name="maker2", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2).remote(
detached_trainer_name_list=["trainer1", "trainer2"],
strategy=args.maker_strategy,
env_info=env_info_maker_2,
experience_batch_size=args.experience_batch_size,
kl_coef=0.1,
#kwargs:
max_length=128,
do_sample=True,
temperature=1.0,
top_k=50,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=tokenizer.eos_token_id,
debug=args.debug,
)
# trainer send its actor and critic to experience holders.
# TODO: balance duty
ray.get(trainer_1_ref.initialize_remote_makers.remote())
# configure sampler
dataset = pd.read_csv(args.prompt_path)['prompt']
def tokenize_fn(texts):
# MUST pad to max length to ensure inputs on all ranks have the same length
# Different lengths may hang gemini, since ranks would run different numbers of generation steps
batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True)
return {k: v.cuda() for k, v in batch.items()}
trainer_1_done_ref = trainer_1_ref.fit.remote(num_episodes=args.num_episodes, max_timesteps=args.max_timesteps, update_timesteps=args.update_timesteps)
trainer_2_done_ref = trainer_2_ref.fit.remote(num_episodes=args.num_episodes, max_timesteps=args.max_timesteps, update_timesteps=args.update_timesteps)
num_exp_per_maker = args.num_episodes * args.max_timesteps // args.update_timesteps * args.max_epochs + 3 # +3 for fault tolerance
maker_1_done_ref = experience_holder_1_ref.workingloop.remote(dataset, tokenize_fn, times=num_exp_per_maker)
maker_2_done_ref = experience_holder_2_ref.workingloop.remote(dataset, tokenize_fn, times=num_exp_per_maker)
ray.get([trainer_1_done_ref, trainer_2_done_ref, maker_1_done_ref, maker_2_done_ref])
# save model checkpoint after fitting
trainer_1_ref.strategy_save_actor.remote(args.save_path, only_rank0=True)
trainer_2_ref.strategy_save_actor.remote(args.save_path, only_rank0=True)
# save optimizer checkpoint on all ranks
if args.need_optim_ckpt:
trainer_1_ref.strategy_save_actor_optim.remote('actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()),
only_rank0=False)
trainer_2_ref.strategy_save_actor_optim.remote('actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()),
only_rank0=False)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('prompt_path')
parser.add_argument('--trainer_strategy',
choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
default='naive')
parser.add_argument('--maker_strategy',
choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
default='naive')
parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt'])
parser.add_argument('--pretrain', type=str, default=None)
parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts.pt')
parser.add_argument('--need_optim_ckpt', type=bool, default=False)
parser.add_argument('--num_episodes', type=int, default=10)
parser.add_argument('--max_timesteps', type=int, default=10)
parser.add_argument('--update_timesteps', type=int, default=10)
parser.add_argument('--max_epochs', type=int, default=5)
parser.add_argument('--train_batch_size', type=int, default=8)
parser.add_argument('--experience_batch_size', type=int, default=8)
parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
parser.add_argument('--debug', action='store_true')
args = parser.parse_args()
main(args)

View File

@ -1,23 +0,0 @@
set_n_least_used_CUDA_VISIBLE_DEVICES() {
local n=${1:-"9999"}
echo "GPU Memory Usage:"
local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \
| tail -n +2 \
| nl -v 0 \
| tee /dev/tty \
| sort -g -k 2 \
| awk '{print $1}' \
| head -n $n)
export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
echo "Now CUDA_VISIBLE_DEVICES is set to:"
echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
}
set_n_least_used_CUDA_VISIBLE_DEVICES 2
export RAY_NAMESPACE="admin"
python 2m2t.py "path/to/prompts.csv" \
--maker_strategy naive --trainer_strategy colossalai_zero2 --lora_rank 2 \
--num_episodes 10 --max_timesteps 10 --update_timesteps 10 \
--max_epochs 10 --debug

View File

@ -0,0 +1,271 @@
import os
import time
import tracemalloc
from copy import deepcopy
from threading import Lock
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
import ray
import torch
import torch.nn as nn
from coati.experience_maker import Experience, ExperienceMaker, NaiveExperienceMaker
from coati.models.base import Actor, Critic, RewardModel
from coati.replay_buffer.utils import BufferItem, make_experience_batch, split_experience_batch
from coati.trainer.callbacks import Callback
from coati.trainer.strategies import Strategy
from coati.trainer.strategies.sampler import DistributedSampler
from ray.exceptions import GetTimeoutError
from torch import Tensor
from tqdm import tqdm
from .callbacks import ExperienceMakerPerformanceEvaluator, MakerCallback
from .utils import (get_model_numel,
get_rank,
get_world_size,
is_rank_0,
set_dist_env,
state_dict_to)
from .lora_constructor import LoRAConstructor
@ray.remote(concurrency_groups={"experience_io": 1, "model_io": 1, "compute": 1})
class ExperienceMakerHolder:
'''
Args:
detached_trainer_name_list: list of trainer names used to fetch ray actor handles
strategy_fn: a function that returns the Strategy used by this holder
kl_coef: the coefficient of the KL divergence loss
sync_models_from_trainers: whether to sync models from trainers. If True, trainers must call sync_models_to_remote_makers() to send their models.
'''
def __init__(
self,
detached_trainer_name_list: List[str],
strategy_fn: Callable[[], Strategy],
# a function that returns (actor, critic, reward_model, initial_model)
model_fn: Callable[[], Tuple[Actor, Critic, RewardModel, Actor]],
env_info: Dict[str, str] = None,
sync_models_from_trainers: bool = False,
buffer_cpu_offload: bool = True,
kl_coef: float = 0.1,
callbacks: List[MakerCallback] = [],
eval_performance: bool = False,
debug: bool = False,
update_lora_weights: bool = False,
**generate_kwargs):
# set environment variables
if env_info:
set_dist_env(env_info=env_info)
self.target_trainer_list = []
assert len(detached_trainer_name_list) > 0
self._detached_trainer_name_list = detached_trainer_name_list
self.strategy = strategy_fn()
self.buffer_cpu_offload = buffer_cpu_offload
self.kl_coef = kl_coef
# init models
with self.strategy.model_init_context():
actor, critic, reward_model, initial_model = model_fn()
self.generate_kwargs = _set_default_generate_kwargs(generate_kwargs, actor)
if eval_performance:
actor_numel = get_model_numel(actor)
critic_numel = get_model_numel(critic)
initial_model_numel = get_model_numel(initial_model)
reward_model_numel = get_model_numel(reward_model)
evaluator = ExperienceMakerPerformanceEvaluator(actor_numel, critic_numel, initial_model_numel,
reward_model_numel)
callbacks = callbacks + [evaluator]
actor, critic, reward_model, initial_model = self.strategy.prepare(actor, critic, reward_model, initial_model)
self.experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model, self.kl_coef)
self.callbacks = callbacks
self._model_visit_lock = Lock()
self._is_fully_initialized = not sync_models_from_trainers
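# when weights are expected from trainers, workingloop blocks in _get_ready()
# until the first fully_update sync flips this flag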
self._debug = debug
self._update_lora_weights = update_lora_weights
if self._update_lora_weights:
self.actor_lora_constructor = LoRAConstructor()
self.critic_lora_constructor = LoRAConstructor()
self.target_auto_balance = False
self._target_idx = 0
if self._debug:
print(f'[maker{get_rank()}] will send items to {self._detached_trainer_name_list}')
if not self._is_fully_initialized:
print(f'[maker{get_rank()}] Waiting for INIT')
def _get_ready(self):
while not self._fully_initialized():
time.sleep(1.0)
def _fully_initialized(self):
return self._is_fully_initialized
def _init_target_trainer_list(self):
if len(self.target_trainer_list) > 0:
return
for name in self._detached_trainer_name_list:
self.target_trainer_list.append(ray.get_actor(name, namespace=os.environ["RAY_NAMESPACE"]))
# copy from ../trainer/base.py
@ray.method(concurrency_group="compute")
def _make_experience(self, inputs: Union[Tensor, Dict[str, Tensor]]) -> Experience:
if isinstance(inputs, Tensor):
return self.experience_maker.make_experience(inputs, **self.generate_kwargs)
elif isinstance(inputs, dict):
return self.experience_maker.make_experience(**inputs, **self.generate_kwargs)
else:
raise ValueError(f'Unsupported input type "{type(inputs)}"')
@ray.method(concurrency_group="experience_io")
def _send_items(self, experience: Experience) -> None:
self._init_target_trainer_list()
items = split_experience_batch(experience)
items_per_trainer = [[] for _ in range(len(self.target_trainer_list))]
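# round-robin: deal items across trainers one at a time to keep their buffers balanced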
for item in items:
items_per_trainer[self._target_idx].append(item)
self._target_idx = (self._target_idx + 1) % len(self.target_trainer_list)
for i, target_trainer in enumerate(self.target_trainer_list):
if len(items_per_trainer[i]) > 0:
target_trainer.buffer_extend.remote(items_per_trainer[i])
def _inference_step(self, batch) -> None:
self._on_batch_start()
with self._model_visit_lock:
self._on_make_experience_start()
experience = self._make_experience(batch)
self._on_make_experience_end(experience)
self._on_send_start()
if self.buffer_cpu_offload:
experience.to_device('cpu')
self._send_items(experience)
self._on_send_end()
self._on_batch_end()
def workingloop(self, dataloader_fn: Callable[[], Iterable], num_epochs: int = 1, num_steps: int = 0):
"""Working loop of the experience maker.
Args:
dataloader_fn (Callable[[], Iterable]): A function that returns a dataloader.
num_epochs (int, optional): Iterate over the dataloader for this number of epochs. Defaults to 1.
num_steps (int, optional): Iterate over the dataloader for this number of steps. If > 0, num_epochs is ignored. Defaults to 0.
"""
self._get_ready()
self._on_loop_start()
dataloader = dataloader_fn()
if num_steps > 0:
# ignore num epochs
it = iter(dataloader)
for _ in tqdm(range(num_steps), desc='ExperienceMaker', disable=not is_rank_0()):
try:
batch = next(it)
except StopIteration:
it = iter(dataloader)
batch = next(it)
self._inference_step(batch)
else:
with tqdm(total=num_epochs * len(dataloader), desc='ExperienceMaker', disable=not is_rank_0()) as pbar:
for _ in range(num_epochs):
for batch in dataloader:
self._inference_step(batch)
pbar.update()
self._on_loop_end()
@ray.method(concurrency_group="model_io")
def update_experience_maker(self,
new_actor_state_dict: Dict[str, Any] = None,
new_actor_lora_config_dict: Dict[str, Any] = None,
new_critic_state_dict: Dict[str, Any] = None,
new_critic_lora_config_dict: Dict[str, Any] = None,
fully_update: bool = False,
chunk_start: bool = None,
chunk_end: bool = None):
'''
called by trainer
chunk_start: set True on the first call, before any state_dict chunks are sent
chunk_end: set True on the last call, after all state_dict chunks have been sent
fully_update: set True to sync the full models, e.g. when initializing
TODO: integrate load_state_dict with the model-sharding strategy
'''
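# expected call sequence from the trainer side (see DetachedPPOTrainer._update_remote_makers):
#   update_experience_maker(chunk_start=True, fully_update=...)   # acquires the model lock
#   update_experience_maker(new_actor_state_dict=shard, ...)      # one call per actor shard
#   update_experience_maker(new_critic_state_dict=shard, ...)     # one call per critic shard
#   update_experience_maker(chunk_end=True, fully_update=...)     # releases the lock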
_watch_memory = self._debug
if chunk_start:
if self._debug:
print("[maker] UPDATE ")
if _watch_memory:
tracemalloc.start()
self._model_visit_lock.acquire()
with torch.no_grad():
if new_actor_state_dict is not None:
if not self._update_lora_weights or fully_update:
self.experience_maker.actor.model.load_state_dict(new_actor_state_dict, strict=False)
else:
new_actor_state_dict = state_dict_to(new_actor_state_dict, device=torch.cuda.current_device())
state_dict_increase = self.actor_lora_constructor.reconstruct_increase(new_actor_state_dict, new_actor_lora_config_dict)
self.actor_lora_constructor.load_state_dict_increase(self.experience_maker.actor.model, state_dict_increase)
if new_critic_state_dict is not None:
if not self._update_lora_weights or fully_update:
self.experience_maker.critic.load_state_dict(new_critic_state_dict, strict=False)
else:
new_critic_state_dict = state_dict_to(new_critic_state_dict, device=torch.cuda.current_device())
state_dict_increase = self.critic_lora_constructor.reconstruct_increase(new_critic_state_dict, new_critic_lora_config_dict)
self.critic_lora_constructor.load_state_dict_increase(self.experience_maker.critic, state_dict_increase)
# the lock must be released only after both actor and critic have been updated
if chunk_end:
self._model_visit_lock.release()
if _watch_memory:
current, peak = tracemalloc.get_traced_memory()
print(f"Current memory usage is {current / 10**6}MB; Peak was {peak / 10**6}MB")
tracemalloc.stop()
if fully_update:
self._is_fully_initialized = True
def _on_make_experience_start(self) -> None:
for callback in self.callbacks:
callback.on_make_experience_start()
def _on_make_experience_end(self, experience: Experience) -> None:
for callback in self.callbacks:
callback.on_make_experience_end(experience)
def _on_loop_start(self) -> None:
for callback in self.callbacks:
callback.on_loop_start()
def _on_loop_end(self) -> None:
for callback in self.callbacks:
callback.on_loop_end()
def _on_send_start(self) -> None:
for callback in self.callbacks:
callback.on_send_start()
def _on_send_end(self) -> None:
for callback in self.callbacks:
callback.on_send_end()
def _on_batch_start(self) -> None:
for callback in self.callbacks:
callback.on_batch_start()
def _on_batch_end(self) -> None:
for callback in self.callbacks:
callback.on_batch_end()
def _set_default_generate_kwargs(generate_kwargs: dict, actor: Actor) -> dict:
origin_model = actor.model
new_kwargs = {**generate_kwargs}
# use huggingface models method directly
if 'prepare_inputs_fn' not in generate_kwargs and hasattr(origin_model, 'prepare_inputs_for_generation'):
new_kwargs['prepare_inputs_fn'] = origin_model.prepare_inputs_for_generation
if 'update_model_kwargs_fn' not in generate_kwargs and hasattr(origin_model, '_update_model_kwargs_for_generation'):
new_kwargs['update_model_kwargs_fn'] = origin_model._update_model_kwargs_for_generation
return new_kwargs

View File

@ -0,0 +1,122 @@
from typing import Any, Callable, Dict, List, Optional
from collections import OrderedDict
from dataclasses import dataclass
import torch
import torch.nn as nn
from loralib.layers import LoRALayer
from coati.models.lora import LoraLinear
@dataclass
class LoRAConfig:
r: int = 0
lora_alpha: int = 1
lora_dropout: float = 0
fan_in_fan_out: bool = False
class LoRAConstructor:
'''
Tools for reconstructing a model from remote LoRA weights.
(Transferring only the LoRA data costs much less!)
Usage:
Step 1 (Sender):
filter_state_dict_lora()
Step 2 (Sender, Optional):
extract_lora_config()
Step 3 (Sender):
send state_dict_lora and lora_config_dict
Step 4 (Receiver):
reconstruct_increase()
Step 5 (Receiver):
load_state_dict_increase()
'''
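# A minimal sketch of the five steps above (`model` and `remote_model` are
# hypothetical stand-ins; the transport between sender and receiver is omitted):
#
#   state_dict_lora, _ = LoRAConstructor.filter_state_dict_lora(model.state_dict())
#   lora_config_dict = LoRAConstructor.extract_lora_config(model)
#   # ... send both dicts to the receiver ...
#   constructor = LoRAConstructor()
#   increase = constructor.reconstruct_increase(state_dict_lora, lora_config_dict)
#   constructor.load_state_dict_increase(remote_model, increase)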
def __init__(self):
self.lora_config_dict = None
def register_lora_config(self, lora_config_dict: Dict[str, Any]):
self.lora_config_dict = lora_config_dict
def reconstruct_increase(self, state_dict_lora: Dict[str, Any], lora_config_dict: Dict[str, Any]):
'''
xxx.lora_A, xxx.lora_B -->> xxx.weight
Warning: the resulting xxx.weight is actually the weight increment, not the full weight.
'''
if lora_config_dict is not None:
self.register_lora_config(lora_config_dict)
state_dict_increase = OrderedDict()
config_iter = iter(self.lora_config_dict.items())
lora_A, lora_B, layer_prefix = None, None, None
for k, v in state_dict_lora.items():
if k.rpartition('.')[-1] == 'lora_A':
lora_A = v
layer_prefix = k.rpartition('.')[0]
elif k.rpartition('.')[-1] == 'lora_B':
assert layer_prefix == k.rpartition('.')[0], "unmatched (lora_A, lora_B) pair"
layer_prefix_2, config = next(config_iter)
assert layer_prefix_2 == layer_prefix, "unmatched (state_dict, config_dict) pair"
lora_B = v
weight_data_increase = self._compute(lora_A, lora_B, config)
state_dict_increase[layer_prefix + '.weight'] = weight_data_increase
lora_A, lora_B, layer_prefix = None, None, None
else:
raise ValueError('unexpected key')
return state_dict_increase
def _compute(self, lora_A, lora_B, config=LoRAConfig()):
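# LoRA low-rank update: delta_W = (lora_alpha / r) * (B @ A), transposed when
# fan_in_fan_out is set; r == 0 means LoRA is disabled and the increment is zero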
def T(w):
return w.T if config.fan_in_fan_out else w
if config.r > 0:
scaling = config.lora_alpha / config.r
weight_data_increase = T(lora_B @ lora_A) * scaling
return weight_data_increase
return 0
def load_state_dict_increase(self, model: nn.Module, state_dict_increase: Dict[str, Any]):
'''
The final reconstruction step
'''
# naive approach: add each increment to the current weight and load the result
model.load_state_dict({k: v + model.state_dict()[k] for k, v in state_dict_increase.items()}, strict=False)
@staticmethod
def filter_state_dict_lora(state_dict: Dict[str, Any], keep_non_lora=False):
'''
if keep_non_lora, also return non_lora state_dict
'''
state_dict_lora = OrderedDict()
state_dict_non_lora = OrderedDict()
for k, v in state_dict.items():
if 'lora_A' in k or 'lora_B' in k:
state_dict_lora[k] = v
elif keep_non_lora:
state_dict_non_lora[k] = v
if keep_non_lora:
return state_dict_lora, state_dict_non_lora
else:
return state_dict_lora, None
@staticmethod
def extract_lora_config(model: nn.Module) -> Dict[str, LoRAConfig]:
'''
Extract the LoRA config of every LoraLinear module.
Returns an OrderedDict mapping module name -> LoRAConfig.
'''
lora_config_dict = OrderedDict()
for name, child in model.named_modules():
if isinstance(child, LoraLinear):
lora_config_dict[name] = LoRAConfig(r=child.r,
lora_alpha=child.lora_alpha,
lora_dropout=child.lora_dropout,
fan_in_fan_out=child.fan_in_fan_out)
return lora_config_dict

View File

@ -1,121 +0,0 @@
from abc import ABC, abstractmethod
from typing import Any, Callable, Dict, List, Optional, Union
from tqdm import tqdm
from coati.trainer.callbacks import Callback
from coati.experience_maker import Experience
import ray
import os
from .detached_replay_buffer import DetachedReplayBuffer
from .utils import is_rank_0
class DetachedTrainer(ABC):
'''
Base class for detached rlhf trainers.
'detach' means that the experience maker is detached compared to a normal Trainer.
Please set name attribute during init:
>>> trainer = DetachedTrainer.options(..., name = "xxx", ...).remote()
So an ExperienceMakerHolder can reach the detached_replay_buffer by Actor's name.
Args:
detached_strategy (DetachedStrategy): the strategy to use for training
detached_replay_buffer_ref (ObjectRef[DetachedReplayBuffer]): the replay buffer to use for training
experience_batch_size (int, defaults to 8): the batch size to use for experience generation
max_epochs (int, defaults to 1): the number of epochs of training process
data_loader_pin_memory (bool, defaults to True): whether to pin memory for data loader
callbacks (List[Callback], defaults to []): the callbacks to call during training process
generate_kwargs (dict, optional): the kwargs to use while model generating
'''
def __init__(self,
experience_maker_holder_name_list: List[str],
train_batch_size: int = 8,
buffer_limit: int = 0,
buffer_cpu_offload: bool = True,
experience_batch_size: int = 8,
max_epochs: int = 1,
dataloader_pin_memory: bool = True,
callbacks: List[Callback] = [],
**generate_kwargs) -> None:
super().__init__()
self.detached_replay_buffer = DetachedReplayBuffer(train_batch_size, limit=buffer_limit, cpu_offload=buffer_cpu_offload)
self.experience_batch_size = experience_batch_size
self.max_epochs = max_epochs
self.dataloader_pin_memory = dataloader_pin_memory
self.callbacks = callbacks
self.generate_kwargs = generate_kwargs
self.target_holder_name_list = experience_maker_holder_name_list
self.target_holder_list = []
def update_target_holder_list(self, experience_maker_holder_name_list):
self.target_holder_name_list = experience_maker_holder_name_list
self.target_holder_list = []
for name in self.target_holder_name_list:
self.target_holder_list.append(ray.get_actor(name, namespace=os.environ["RAY_NAMESPACE"]))
@abstractmethod
def _update_remote_makers(self):
pass
@abstractmethod
def training_step(self, experience: Experience) -> Dict[str, Any]:
pass
def _learn(self):
pbar = tqdm(range(self.max_epochs), desc='Train epoch', disable=not is_rank_0())
for _ in pbar:
if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True:
print("[trainer] sampling exp")
experience = self._buffer_sample()
if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True:
print("[trainer] training step")
metrics = self.training_step(experience)
if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True:
print("[trainer] step over")
pbar.set_postfix(metrics)
def fit(self, num_episodes: int = 50000, max_timesteps: int = 500, update_timesteps: int = 5000) -> None:
self._on_fit_start()
for episode in range(num_episodes):
self._on_episode_start(episode)
for timestep in tqdm(range(max_timesteps // update_timesteps),
desc=f'Episode [{episode+1}/{num_episodes}]',
disable=not is_rank_0()):
self._learn()
self._update_remote_makers()
self._on_episode_end(episode)
self._on_fit_end()
@ray.method(concurrency_group="buffer_length")
def buffer_get_length(self):
# called by ExperienceMakerHolder
if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True:
print("[trainer] telling length")
return self.detached_replay_buffer.get_length()
@ray.method(concurrency_group="buffer_append")
def buffer_append(self, experience: Experience):
# called by ExperienceMakerHolder
if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True:
# print(f"[trainer] receiving exp. Current buffer length: {self.detached_replay_buffer.get_length()}")
print(f"[trainer] receiving exp.")
self.detached_replay_buffer.append(experience)
@ray.method(concurrency_group="buffer_sample")
def _buffer_sample(self):
return self.detached_replay_buffer.sample()
def _on_fit_start(self) -> None:
for callback in self.callbacks:
callback.on_fit_start()
def _on_fit_end(self) -> None:
for callback in self.callbacks:
callback.on_fit_end()
def _on_episode_start(self, episode: int) -> None:
for callback in self.callbacks:
callback.on_episode_start(episode)
def _on_episode_end(self, episode: int) -> None:
for callback in self.callbacks:
callback.on_episode_end(episode)

View File

@ -1,172 +0,0 @@
import torch
from typing import Any, Callable, Dict, List, Optional, Union
import ray
from ray.exceptions import GetTimeoutError
from torch import Tensor
import torch.nn as nn
from coati.models.base import Actor, Critic, RewardModel
from coati.trainer.strategies.sampler import DistributedSampler
from coati.trainer.strategies import Strategy
from coati.experience_maker import NaiveExperienceMaker, Experience, ExperienceMaker
from copy import deepcopy
from threading import Lock
import time
import os
from .utils import is_rank_0, get_strategy_from_args, set_dist_env
@ray.remote(concurrency_groups={"experience_io": 1, "model_io": 1, "compute": 1})
class ExperienceMakerHolder:
'''
Args:
detached_trainer_name_list: str list to get ray actor handles
strategy:
experience_batch_size: batch size of generated experience
kl_coef: the coefficient of kl divergence loss
'''
def __init__(self,
detached_trainer_name_list: List[str],
strategy: str,
env_info: Dict[str, str] = None,
experience_batch_size: int = 8,
kl_coef: float = 0.1,
**generate_kwargs):
# set environment variables
if env_info:
set_dist_env(env_info=env_info)
self.target_trainer_list = []
for name in detached_trainer_name_list:
self.target_trainer_list.append(ray.get_actor(name, namespace=os.environ["RAY_NAMESPACE"]))
self.strategy_str = strategy
self.strategy = get_strategy_from_args(strategy)
self.experience_batch_size = experience_batch_size
self.kl_coef = kl_coef
self.generate_kwargs = generate_kwargs
# Need a trainer to give an actor and a critic via initialize_experience_maker(...)
actor, critic, reward_model, initial_model = None, None, None, None
self.experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model, self.kl_coef)
self._model_visit_lock = Lock()
self.fully_initialized = False
if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True:
print('[maker] Waiting for INIT')
def _get_ready(self):
while not self.fully_initialized:
time.sleep(1.0)
def update_target_trainer_list(self, detached_trainer_name_list):
self.target_trainer_list = []
for name in detached_trainer_name_list:
self.target_trainer_list.append(ray.get_actor(name))
# copy from ../trainer/base.py
@ray.method(concurrency_group="compute")
def _make_experience(self, inputs: Union[Tensor, Dict[str, Tensor]]) -> Experience:
self._get_ready()
if isinstance(inputs, Tensor):
return self.experience_maker.make_experience(inputs, **self.generate_kwargs)
elif isinstance(inputs, dict):
return self.experience_maker.make_experience(**inputs, **self.generate_kwargs)
else:
raise ValueError(f'Unsupported input type "{type(inputs)}"')
@ray.method(concurrency_group="experience_io")
def _send_experience(self, experience):
'''
ignore it
# choose a trainer that has the least experience batch in its detached_replay_buffer
chosen_trainer = None
min_length = None
if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True:
print("[maker] choosing target trainer")
while chosen_trainer is None:
for target_trainer in self.target_trainer_list:
try:
temp_length = ray.get(target_trainer.buffer_get_length.remote(), timeout=0.1)
if min_length is None:
min_length = temp_length
chosen_trainer = target_trainer
else:
if temp_length < min_length:
min_length = temp_length
chosen_trainer = target_trainer
except GetTimeoutError:
pass
if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True:
print(f"[maker] sending exp to {chosen_trainer}")
chosen_trainer.buffer_append.remote(experience)
'''
#
if not hasattr(self, "_target_idx"):
self._target_idx = 0
chosen_trainer = self.target_trainer_list[self._target_idx]
if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True:
print(f"[maker] sending exp to {chosen_trainer}")
chosen_trainer.buffer_append.remote(experience)
self._target_idx = (self._target_idx + 1) % len(self.target_trainer_list)
def workingloop(self, dataset, tokenizer: Optional[Callable[[Any], dict]] = None, times=5000 * 50000):
self._get_ready()
sampler = self.strategy.setup_sampler(dataset)
for _ in range(times):
rand_prompts = sampler.sample(self.experience_batch_size)
if tokenizer is not None:
inputs = tokenizer(rand_prompts)
else:
inputs = rand_prompts
self._model_visit_lock.acquire()
experience = self._make_experience(inputs=inputs)
self._model_visit_lock.release()
self._send_experience(experience=experience)
@ray.method(concurrency_group="model_io")
def initialize_experience_maker(self, init_actor: Actor, init_critic: Critic):
'''
called by trainer. Only once.
'''
# TODO: reduce malloc
if self.fully_initialized:
return
if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True:
print('[maker] INIT')
with torch.no_grad():
with self.strategy.model_init_context():
actor = init_actor
critic = init_critic
initial_model = deepcopy(actor)
reward_model = RewardModel(deepcopy(critic.model),
deepcopy(critic.value_head)).to(torch.cuda.current_device())
if self.strategy_str != 'colossalai_gemini':
actor.to(torch.float16).to(torch.cuda.current_device())
critic.to(torch.float16).to(torch.cuda.current_device())
initial_model.to(torch.float16).to(torch.cuda.current_device())
reward_model.to(torch.float16).to(torch.cuda.current_device())
self.experience_maker.actor = self.strategy.prepare(actor)
self.experience_maker.critic = self.strategy.prepare(critic)
self.experience_maker.initial_model = self.strategy.prepare(initial_model)
self.experience_maker.reward_model = self.strategy.prepare(reward_model)
self.fully_initialized = True
@ray.method(concurrency_group="model_io")
def update_experience_maker(self, new_actor: Actor, new_critic: Critic):
'''
called by trainer
'''
# TODO: reduce malloc
self._model_visit_lock.acquire()
with torch.no_grad():
if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True:
print("[maker] UPDATE ")
if self.strategy_str != 'colossalai_gemini':
new_actor.to(torch.float16).to(torch.cuda.current_device())
new_critic.to(torch.float16).to(torch.cuda.current_device())
self.experience_maker.actor = self.strategy.prepare(new_actor)
self.experience_maker.critic = self.strategy.prepare(new_critic)
self._model_visit_lock.release()


@ -1,105 +0,0 @@
# WIP
from coati.trainer.strategies import Strategy
from coati.trainer.strategies import NaiveStrategy
from coati.models.base import Actor, RewardModel, Critic
import numpy as np
import torch
from torch._C._distributed_rpc import _is_current_rpc_agent_set
import colossalai
from colossalai.pipeline.pipeline_process_group import ppg
from colossalai.pipeline.rpc._pipeline_schedule import OneFOneBPipelineEngine
from colossalai.fx import ColoTracer
from colossalai.fx.passes.adding_split_node_pass import balanced_split_pass, split_with_split_nodes_pass
from colossalai.pipeline.middleware.adaptor import get_fx_topology
import os
from functools import partial
import random

rpc_is_initialized = _is_current_rpc_agent_set


class PipelineModel(torch.nn.Module):
    '''
    Actor has 2 kinds of jobs: forward and generate.
    It is better to just pipelinize the inner model.
    '''

    def __init__(self,
                 model: torch.nn.Module,
                 stage_num: int,
                 num_microbatches: int,
                 data_kwargs=None):
        super().__init__()

        # trace the model, split it into `stage_num` balanced partitions and
        # return the partition that belongs to the given pipeline rank
        def create_partition_module(pp_rank: int, stage_num: int, model, data_kwargs):
            model.eval()
            tracer = ColoTracer()
            meta_args = {k: v.to('meta') for k, v in data_kwargs.items()}
            graph = tracer.trace(root=model, meta_args=meta_args)
            gm = torch.fx.GraphModule(model, graph, model.__class__.__name__)
            annotated_model = balanced_split_pass(gm, stage_num)
            top_module, split_submodules = split_with_split_nodes_pass(annotated_model, merge_output=True)
            topo = get_fx_topology(top_module)
            for submodule in split_submodules:
                if isinstance(submodule, torch.fx.GraphModule):
                    setattr(submodule, '_topo', topo)
            return split_submodules[pp_rank + 1]

        def partition(model, data_kwargs: dict, pp_rank: int, chunk: int, stage_num: int):
            partition = create_partition_module(pp_rank, stage_num, model, data_kwargs)
            return partition

        self.inference_engine = OneFOneBPipelineEngine(
            partition_fn=partial(partition, model, data_kwargs),
            stage_num=stage_num,
            num_microbatches=num_microbatches,
            device='cuda',
        )

    def forward(self, **model_inputs):
        return self.inference_engine.forward_backward(**model_inputs, forward_only=True)


class PPStrategy(NaiveStrategy):
    """
    Strategy for pipeline inference (inference only!).
    Master node only.
    """

    def __init__(self, seed: int = 42):
        self.seed = seed
        super().__init__()

    def setup_distributed(self) -> None:
        colossalai.launch_from_torch({}, seed=self.seed)
        ppg.set_global_info(rank=int(os.environ['RANK']),
                            world_size=int(os.environ['WORLD_SIZE']),
                            dp_degree=1,
                            tp_degree=1,
                            num_worker_threads=128,
                            device="cuda")

    def model_init_context(self):
        return super().model_init_context()

    def setup_model(self, model: torch.nn.Module) -> torch.nn.Module:
        if isinstance(model, (Actor, RewardModel, Critic)):
            # WIP: PipelineModel also needs stage_num and num_microbatches,
            # which still have to be plumbed through here
            model.model = PipelineModel(model.model)
        return model

    def set_seed(self, seed: int) -> None:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)


@ -1,48 +0,0 @@
import torch.distributed as dist
from typing import Any, Callable, Dict, List, Optional
from coati.models.bloom import BLOOMActor, BLOOMCritic
from coati.models.gpt import GPTActor, GPTCritic
from coati.models.opt import OPTActor, OPTCritic
from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
import torch
import os


def is_rank_0() -> bool:
    return not dist.is_initialized() or dist.get_rank() == 0


def get_cuda_actor_critic_from_args(model: str, pretrained: str = None, lora_rank=0):
    if model == 'gpt2':
        actor = GPTActor(pretrained=pretrained, lora_rank=lora_rank).to(torch.cuda.current_device())
        critic = GPTCritic(pretrained=pretrained, lora_rank=lora_rank).to(torch.cuda.current_device())
    elif model == 'bloom':
        actor = BLOOMActor(pretrained=pretrained, lora_rank=lora_rank).to(torch.cuda.current_device())
        critic = BLOOMCritic(pretrained=pretrained, lora_rank=lora_rank).to(torch.cuda.current_device())
    elif model == 'opt':
        actor = OPTActor(pretrained=pretrained, lora_rank=lora_rank).to(torch.cuda.current_device())
        critic = OPTCritic(pretrained=pretrained, lora_rank=lora_rank).to(torch.cuda.current_device())
    else:
        raise ValueError(f'Unsupported model "{model}"')
    return actor, critic


def get_strategy_from_args(strategy: str):
    if strategy == 'naive':
        strategy_ = NaiveStrategy()
    elif strategy == 'ddp':
        strategy_ = DDPStrategy()
    elif strategy == 'colossalai_gemini':
        strategy_ = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5)
    elif strategy == 'colossalai_zero2':
        strategy_ = ColossalAIStrategy(stage=2, placement_policy='cuda')
    else:
        raise ValueError(f'Unsupported strategy "{strategy}"')
    return strategy_


def set_dist_env(env_info: Dict[str, str]):
    os.environ["RANK"] = env_info['rank']
    os.environ["LOCAL_RANK"] = env_info['local_rank']
    os.environ["WORLD_SIZE"] = env_info['world_size']
    os.environ['MASTER_PORT'] = env_info['master_port']
    os.environ['MASTER_ADDR'] = env_info['master_addr']


@ -0,0 +1,152 @@
import os
from typing import Any, Callable, Dict, List, Optional
from collections import OrderedDict

import torch
import torch.distributed as dist
import torch.nn as nn
from coati.models.bloom import BLOOMRM, BLOOMActor, BLOOMCritic
from coati.models.gpt import GPTRM, GPTActor, GPTCritic
from coati.models.llama import LlamaActor, LlamaCritic, LlamaRM
from coati.models.opt import OPTRM, OPTActor, OPTCritic
from coati.models.roberta import RoBERTaActor, RoBERTaCritic, RoBERTaRM
from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
from coati.utils import prepare_llama_tokenizer_and_embedding
from transformers import AutoTokenizer, BloomTokenizerFast, GPT2Tokenizer, LlamaTokenizer, RobertaTokenizer


def is_rank_0() -> bool:
    return not dist.is_initialized() or dist.get_rank() == 0


def get_rank() -> int:
    return dist.get_rank() if dist.is_initialized() else 0


def get_world_size() -> int:
    return dist.get_world_size() if dist.is_initialized() else 1


def get_actor_from_args(model: str, pretrained: str = None, config=None, lora_rank=0):
    if model == 'gpt2':
        actor = GPTActor(pretrained=pretrained, config=config, lora_rank=lora_rank)
    elif model == 'bloom':
        actor = BLOOMActor(pretrained=pretrained, config=config, lora_rank=lora_rank)
    elif model == 'opt':
        actor = OPTActor(pretrained=pretrained, config=config, lora_rank=lora_rank)
    elif model == 'llama':
        actor = LlamaActor(pretrained=pretrained, config=config, lora_rank=lora_rank)
    elif model == 'roberta':
        actor = RoBERTaActor(pretrained=pretrained, config=config, lora_rank=lora_rank)
    else:
        raise ValueError(f'Unsupported actor model "{model}"')
    return actor


def get_critic_from_args(model: str, pretrained: str = None, config=None, lora_rank=0):
    if model == 'gpt2':
        critic = GPTCritic(pretrained=pretrained, lora_rank=lora_rank, config=config, use_action_mask=True)
    elif model == 'bloom':
        critic = BLOOMCritic(pretrained=pretrained, lora_rank=lora_rank, config=config, use_action_mask=True)
    elif model == 'opt':
        critic = OPTCritic(pretrained=pretrained, lora_rank=lora_rank, config=config, use_action_mask=True)
    elif model == 'llama':
        critic = LlamaCritic(pretrained=pretrained, lora_rank=lora_rank, config=config, use_action_mask=True)
    elif model == 'roberta':
        critic = RoBERTaCritic(pretrained=pretrained, lora_rank=lora_rank, config=config, use_action_mask=True)
    else:
        raise ValueError(f'Unsupported critic model "{model}"')
    return critic


def get_reward_model_from_args(model: str, pretrained: str = None, config=None):
    if model == 'gpt2':
        reward_model = GPTRM(pretrained=pretrained, config=config)
    elif model == 'bloom':
        reward_model = BLOOMRM(pretrained=pretrained, config=config)
    elif model == 'opt':
        reward_model = OPTRM(pretrained=pretrained, config=config)
    elif model == 'llama':
        reward_model = LlamaRM(pretrained=pretrained, config=config)
    elif model == 'roberta':
        reward_model = RoBERTaRM(pretrained=pretrained, config=config)
    else:
        raise ValueError(f'Unsupported reward model "{model}"')
    return reward_model


def get_strategy_from_args(strategy: str):
    if strategy == 'naive':
        strategy_ = NaiveStrategy()
    elif strategy == 'ddp':
        strategy_ = DDPStrategy()
    elif strategy == 'colossalai_gemini':
        strategy_ = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5)
    elif strategy == 'colossalai_zero2':
        strategy_ = ColossalAIStrategy(stage=2, placement_policy='cuda')
    elif strategy == 'colossalai_gemini_cpu':
        strategy_ = ColossalAIStrategy(stage=3, placement_policy='cpu', initial_scale=2**5)
    elif strategy == 'colossalai_zero2_cpu':
        strategy_ = ColossalAIStrategy(stage=2, placement_policy='cpu')
    else:
        raise ValueError(f'Unsupported strategy "{strategy}"')
    return strategy_


def get_tokenizer_from_args(model: str, **kwargs):
    if model == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    elif model == 'bloom':
        tokenizer = BloomTokenizerFast.from_pretrained('bigscience/bloom-560m')
    elif model == 'opt':
        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
    elif model == 'llama':
        pretrain_path = kwargs["pretrain"]
        tokenizer = AutoTokenizer.from_pretrained(pretrain_path)
    elif model == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    else:
        raise ValueError(f'Unsupported model "{model}"')

    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer


def set_dist_env(env_info: Dict[str, str]):
    os.environ["RANK"] = env_info['rank']
    os.environ["LOCAL_RANK"] = env_info['local_rank']
    os.environ["WORLD_SIZE"] = env_info['world_size']
    os.environ['MASTER_PORT'] = env_info['master_port']
    os.environ['MASTER_ADDR'] = env_info['master_addr']


def get_model_numel(model: nn.Module) -> int:
    numel = sum(p.numel() for p in model.parameters())
    return numel


def get_receivers_per_sender(sender_idx: int, num_senders: int, num_receivers: int, allow_idle_sender: bool) -> list:
    target_receivers = []
    if num_senders <= num_receivers or allow_idle_sender:
        # a sender will send data to one or more receivers
        # each receiver has exactly one sender
        for i in range(num_receivers):
            if i % num_senders == sender_idx:
                target_receivers.append(i)
    else:
        # a sender will send data to exactly one receiver
        # a receiver may have more than one sender
        target_receivers.append(sender_idx % num_receivers)
    return target_receivers
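
# For example: with num_senders=2 and num_receivers=5, sender 0 feeds
# receivers [0, 2, 4] and sender 1 feeds [1, 3]; with num_senders=4,
# num_receivers=2 and allow_idle_sender=False, senders 0..3 are mapped to
# receivers 0, 1, 0, 1 respectively.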


def state_dict_to(state_dict: Dict[str, Any],
                  dtype: torch.dtype = torch.float16,
                  device: torch.device = torch.device('cpu')):
    '''
    Return a copy of state_dict cast to dtype and moved to device,
    keeping the input state_dict intact.
    '''
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        new_state_dict[k] = v.to(dtype=dtype, device=device)
    return new_state_dict


@ -130,3 +130,7 @@ class Strategy(ABC):
                        only_rank0: bool = True,
                        tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None:
        pass

    @abstractmethod
    def get_model_state_dict_shard(self, model: nn.Module, **config):
        pass


@ -186,3 +186,15 @@ class ColossalAIStrategy(DDPStrategy):
        if self.stage == 3:
            raise RuntimeError('ColossalAI strategy with stage-3 does not support save_pretrained() now')
        super().save_pretrained(model, path, only_rank0, tokenizer)

    def get_model_state_dict_shard(self, model: nn.Module, **config):
        if self.stage != 3:
            yield from super().get_model_state_dict_shard(model, **config)
        else:
            # unwrapped_model = self._unwrap_model(model)
            # for module in unwrapped_model.modules():
            #     if isinstance(module, LoraLinear):
            #         module.merge_weights = True
            #         module.eval()
            base_model: ZeroDDP = get_base_model(model)
            yield from base_model.state_dict_shard(max_shard_size=1024, only_rank_0=False)


@ -26,19 +26,8 @@ class DDPStrategy(NaiveStrategy):
        super().__init__()

    def setup_distributed(self) -> None:
        try:
            rank = int(os.environ['RANK'])
            local_rank = int(os.environ['LOCAL_RANK'])
            world_size = int(os.environ['WORLD_SIZE'])
            host = os.environ['MASTER_ADDR']
            port = int(os.environ['MASTER_PORT'])
        except KeyError as e:
            raise RuntimeError(
                f"Could not find {e} in the torch environment, visit https://www.colossalai.org/ for more information on launching with torch"
            )
        dist.init_process_group('nccl', init_method=f'tcp://[{host}]:{port}', world_size=world_size, rank=rank)
        self._try_init_dist(force=True)
        self.set_seed(self.seed)
        torch.cuda.set_device(local_rank)

    def set_seed(self, seed: int) -> None:
        random.seed(seed)


@ -1,10 +1,17 @@
from typing import Any, Optional
import os
import sys
from collections import OrderedDict
from typing import Any, Dict, Optional
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
from coati.models.base import get_base_model
from coati.replay_buffer import ReplayBuffer
from coati.models.base import RewardModel
from coati.models.lora import LoraLinear
from coati.replay_buffer import ReplayBuffer
from torch.optim import Optimizer
from torch.utils.data import DataLoader
from transformers.modeling_utils import PreTrainedModel
@ -13,6 +20,15 @@ from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from .base import Strategy


# TODO: move this to a util.py (moving it into ray.util introduces a circular import)
def get_grad_required_state_dict(model: nn.Module):
    state_dict = OrderedDict()
    for name, parameter in model.named_parameters():
        if parameter.requires_grad:
            state_dict[name] = parameter.detach()
    return state_dict


class NaiveStrategy(Strategy):
    """
    Strategy for single GPU. No parallelism is used.
@ -25,7 +41,7 @@ class NaiveStrategy(Strategy):
        optimizer.step()

    def setup_distributed(self) -> None:
        pass
        self._try_init_dist(force=False)

    def setup_model(self, model: nn.Module) -> nn.Module:
        return model
@ -68,3 +84,45 @@ class NaiveStrategy(Strategy):
        unwrapped_model.save_pretrained(path)
        if tokenizer is not None:
            tokenizer.save_pretrained(path)
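
    # get_model_state_dict_shard (below) yields the model state dict in chunks:
    # when a 'shard_size' (in bytes) is passed, parameters are accumulated until
    # the running byte count reaches it and each bucket is yielded in turn;
    # otherwise the full state dict is yielded at once.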
    def get_model_state_dict_shard(self, model: nn.Module, **config):
        # TODO: implement sharding on naive strategy
        model = self.unwrap_model(model)
        if config.get('requires_grad_only', False):
            state_dict = get_grad_required_state_dict(model)
        else:
            state_dict = model.state_dict()

        if 'shard_size' in config:
            shard_size = config['shard_size']
            accumulate_size = 0
            state_dict_shard = OrderedDict()
            for name, param in state_dict.items():
                state_dict_shard[name] = param
                accumulate_size += param.numel() * param.element_size()
                if accumulate_size >= shard_size:
                    accumulate_size = 0
                    yield state_dict_shard
                    state_dict_shard = OrderedDict()
            if accumulate_size > 0:
                yield state_dict_shard
        else:
            yield state_dict
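
    # _try_init_dist reads the torch-launcher environment variables (RANK,
    # LOCAL_RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT) and initializes the
    # NCCL process group; with force=False, missing variables are tolerated
    # (single-process fallback), while force=True re-raises any failure.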
    def _try_init_dist(self, force: bool = False) -> None:
        try:
            rank = int(os.environ['RANK'])
            local_rank = int(os.environ['LOCAL_RANK'])
            world_size = int(os.environ['WORLD_SIZE'])
            host = os.environ['MASTER_ADDR']
            port = int(os.environ['MASTER_PORT'])
            dist.init_process_group('nccl', init_method=f'tcp://[{host}]:{port}', world_size=world_size, rank=rank)
            torch.cuda.set_device(local_rank)
        except KeyError as e:
            if force:
                raise RuntimeError(
                    f"Could not find {e} in the torch environment, visit https://www.colossalai.org/ for more information on launching with torch"
                )
        except Exception as e:
            if force:
                raise e


@ -27,6 +27,7 @@ class DistributedSampler:
        assert len(indices) == self.num_samples
        self.indices = indices

    def sample(self, batch_size: int) -> list:
        sampled_indices = np.random.choice(self.indices, batch_size, replace=False)
        return [self.dataset[idx] for idx in sampled_indices]


@ -1,10 +1,11 @@
# Evaluation
In this directory, we introduce how you can evaluate your model with our pipeline. This pipeline is available for model
evaluation of Chinese capability and the one for English capability is under preparation.
In this directory, we introduce how you can evaluate your model with our pipeline. This pipeline is now available for evaluation of both Chinese and English capability.
## Installation
To start model evaluation, you need to install the required packages listed in `requirements.txt` under the `evaluate` folder.
```shell
pip install -r requirements.txt
```
@ -12,84 +13,94 @@ pip install -r requirements.txt
## Evaluation Pipeline
The whole evaluation pipeline consists of two methods:
1. `GPT Evaluation`: evaluates model predictions using the GPT-3.5.
1. `GPT Evaluation`: evaluates model predictions using GPT models.
* Compare the performance of two different models (battle).
* Rate model according to pre-defined metrics using prompting design.
* Rate the model according to pre-defined metrics using prompting design.
2. `Automatic Evaluation`: evaluates model predictions using automatic metrics.
### Evaluation Category
The model capability is separated into 10 evaluation categories, which refer to the use cases mentioned in InstructGPT.
The following table introduces each category:
| Evaluation Category | Description |
|:-------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------|
| Roleplay | Given certain characteristic, the capability of chatting as the character |
| Chat | Conduct multiple rounds of dialogue, the capability of understanding and memorization of previous rounds of dialogue |
| Open QA | Given an open question, the capability of answering questions in opened-ended way |
| Closed QA | Given a closed question, the capability of answering questions with limited scope (such as single/multiple choice question) |
| Brainstorming | Given a question requiring divergent answers, the capability of divergent answering and listing in points |
| Generation | Given generation task, the capability of generating in high quality and human-written way (such as writing an email) |
| Rewriting | Given rewriting task, the capability of rewriting sentences to meet task requirements (such as active and passive switches, translation) |
| Classification | Given classification task, the capability of accurate classification |
| Extraction | Given extraction task, the capability of extracting required information |
| Summarization | Given a paragraph or passage, the capability of summarization |
Our evaluation pipeline examines the model's capability using 10 categories of questions. The following table introduces each category:
To better understand each evaluation category, here are some prompt examples provided.
| Evaluation Category | Description |
| :-----------------: | :----------------------------------------------------------- |
| Brainstorming | Models are asked to generate a range of creative and diverse ideas according to the question. The capability of creativity is required. |
| Chat | Models are asked to continue a multi-round dialogue given the roles involved. The capability of understanding, memorizing previous rounds of the dialogue and answering according to the persona provided is required. |
| Classification | Models are asked to do classification tasks. The capability of accurate classification is required. |
| Closed QA | Models are asked to answer a closed QA question. The capability of answering questions with limited scope (such as single/multiple choice question) is required. |
| Extraction | Models are asked to extract information from a given material. The capability of extracting required information is required. |
| Generation | Models are asked to generate an email, letter, article, etc. The capability of generating texts in a high quality and human-written way is required. |
| Open QA | Models are asked to answer an open QA question (without context provided). The capability of answering questions with the models' own knowledge base is required. |
| Roleplay | Models are asked to play the role provided. The capability of engaging in the scenario and effectively interacting with the user is required. |
| Rewriting | Models are asked to do rewriting tasks such as translation and grammar correction. The capability of rewriting according to different instructions is required. |
| Summarization | Models are asked to summarize the given paragraph or passage. The capability of summarization is required. |
To better understand each evaluation category, here are some example questions.
| Evaluation Category | Chinese Example | English Example |
|:-------------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Roleplay | **Example 1**<br/>我想让你担任Android开发工程师面试官。我将成为候选人您将向我询问Android开发工程师职位的面试问题。我希望你只作为面试官回答。不要一次写出所有的问题。我希望你只对我进行采访。问我问题等待我的回答。不要写解释。像面试官一样一个一个问我等我回答。我的第一句话是“面试官你好”。 <br/><br/>**Example 2**<br/>我想让你扮演讲故事的角色。你会想出引人入胜、富有想象力和吸引观众的有趣故事。它可以是童话故事、教育故事或任何其他类型的有潜力的故事以吸引人们的注意力和想象力。根据目标受众,您可以为您的讲故事环节选择特定的主题或主题,例如,如果是儿童,那么您可以谈论动物;如果是成人,那么基于历史的故事可能会更好地吸引他们等。我的第一个请求是我需要一个关于毅力的有趣故事。 | **Example 1**<br/>I want you to act as a journalist. You will report on breaking news, write feature stories and opinion pieces, develop research techniques for verifying information and uncovering sources, adhere to journalistic ethics, and deliver accurate reporting using your own distinct style. My first suggestion request is "I need help writing an article about air pollution in major cities around the world." <br/><br/>**Example 2:**<br/> I want you to act as a debater. I will provide you with some topics related to current events and your task is to research both sides of the debates, present valid arguments for each side, refute opposing points of view, and draw persuasive conclusions based on evidence. Your goal is to help people come away from discussion with increased knowledge and insight into the topic at hand. My first request is "I want an opinion piece about Deno." |
| Chat | **Example 1**<br/>基于以下角色信息完成一段对话。小王20岁是一名大学生小张22岁是一名大学生。<br/> 小王:小张,我们一起去看电影吧? 小张:好啊,你想看什么电影呢? 小王:我听说最近有一部很好看的科幻电影。你觉得怎么样?<br/>小张:<br/><br/>**Example 2:**<br/>基于以下角色信息完成一段对话。小明是一名医生,一位老年病患者想要停药,但他对病情有所忽视并有担忧;王叔叔是老年病患者的儿子,希望能够听取医生的建议。<br/>小明:你好,王叔叔,我了解你想要让你父亲停药。<br/>王叔叔:是的,我父亲已经吃了那么久的药,我担心药物对他的身体会有副作用。<br/>小明: | **Example 1**<br/>Human: Who is the highest try scorer in the current Sydney Roosters team?<br/>AI: As an AI language model, I don't have access to current statistics. However, you can check the current Sydney Roosters team's official website or relevant sports news websites to find the answer.<br/>Human: Thanks for the suggestion, but can you recommend a reliable sports news website that I can check?<br/>AI:<br/><br/>**Example 2:**<br/>Complete a dialogue based on the following role information. <br/>A: Elementary student B: Teacher<br/>B: Good morning, Student A. Today we're going to learn about addition and subtraction.<br/>A: Teacher, I already know this very well. Why do I need to learn it again?<br/>B: |
| Open QA | **Example 1**<br/>请问万有引力定律由谁提出的?<br/><br/>**Example 2**<br/>哪些国家参与了第一次世界大战? | **Example 1**<br/>Who are the indigenous people of New Zealand?<br/><br/>**Example 2**<br/>How do you take the derivative of the sin function? |
| Closed QA | **Example 1**<br/>请从以下选项中选择正确答案。以下哪个是世界上最高山峰? <br/>A. 长城 <br/>B. 泰山 <br/>C. 珠穆朗玛峰 <br/>D. 黄山<br/><br/>**Example 2**<br/>请从以下选项中选择一个最佳答案回答下面的问题。问题:非洲最高的山是哪座山?<br/> 选项: <br/>A. 麦金利山 <br/>B. 喜马拉雅山 <br/>C. 乞力马扎罗山 | **Example 1**<br/>Answer the following question:<br/>What shape is the Earth?<br/>A) A circle<br/>B) A sphere<br/>C) An ellipse<br/>D) A plane<br/><br/>**Example 2**<br/>Choose the correct classification for the following question:<br/>"What type of restaurant is 'Burger King'"?<br/>fast food<br/>family style<br/>formal dining<br/>buffet<br/> |
| Brainstorming | **Example 1**<br/>请介绍一下人工智能的多个领域。<br/><br/>**Example 2**<br/>请给出管理家庭财务的3个小技巧。<br/> | **Example 1**<br/>What are 10 science fiction books I should read next?<br/><br/>**Example 2**<br/>List five ideas for how to regain enthusiasm for my career. |
| Generation | **Example 1**<br/>请撰写一篇文章,介绍如何通过改善生活习惯来预防疾病和延长寿命。<br/><br/>**Example 2**<br/>请根据以下情节撰写一篇短篇小说:一名年轻人被困在一个荒岛上,他必须想办法生存下去直到被救援。但他很快发现自己并不孤单。 | **Example 1**<br/>Can you help me write a formal email to a potential business partner proposing a joint venture?<br/><br/>**Example 2**<br/>Please use the appropriate format to write a formal letter of recommendation for a student applying to a prestigious computer science graduate program at a university. |
| Rewriting | **Example 1**<br/>将以下句子改为被动语态:<br/>"他们正在洗车"<br/><br/>**Example 2**<br/>将以下文本翻译成英语:<br/>“这个周末我要去海边玩” | **Example 1**<br/>Translate the following text into English: <br/>"我最喜欢的季节是春天,因为我可以看到美丽的花朵。"<br/><br/>**Example 2**<br/>Please correct the following sentences and give them the correct sentence.<br/>"Their going to the party there." |
| Classification | **Example 1**<br/>新闻标题:今日立夏,有一上联,立夏万物并秀,下联怎么对?<br/>请根据以上新闻标题判断新闻所属的分类,你需要从文化,娱乐,体育,财经,房产,教育,科技,旅游,游戏,军事这十类中选择一个答案。<br/><br/> **Example 2**<br/>新闻标题:赵丽颖很久没有登上微博热搜了,但你们别急,她只是在憋大招而已。<br/>请根据新闻标题判断新闻所属的分类,你需要从文化,娱乐,体育,财经,房产,教育,科技,旅游,游戏,军事这十类中选择一个答案。 | **Example 1**<br/>Classify the given email as spam or non-spam.<br/>"Hello, this is an email reminding you to pay your property fees"<br/><br/>**Example 2**<br/>Classify the following text as news, ads or forum posts<br/>"The latest iPhone 13 is now available, shop now!" |
| Extraction | **Example 1**<br/>根据以下新闻文本提取新闻报道时间例如回答时按照格式“新闻报道时间2007年8月10日”<br/>新闻文本如下2007-4-7中新网4月7日电据中国消防在线消息4月4日晚上7时30分左右湖南长潭高速公路上发生一起6车连环相撞失火事故。长株潭三地消防部门共出动消防车21台警力100余人。经过消防官兵近2个小时奋力扑救大火被成功扑灭。据初步调查有1人在此次事故中死亡。<br/><br/>**Example 2**<br/>根据以下新闻文本提取新闻报道时间例如回答时按照格式“新闻报道时间2007年8月10日”<br/>新闻文本如下2014年1月15日据外媒《俄罗斯报》报道称位于北半球的澳大利亚现在正处于炎热的夏季而近日也到了高温酷暑的时候当地时间1月14日晚澳大利亚南部一夜间发生至少250起火灾。受炎热天气及雷雨天气影响澳大利亚南部一夜间发生至少250起火灾灾情多集中在维多利亚州。火灾发生后救援人员立即展开救灾行动。目前大部分起火点火势已被控制。 | **Example 1**<br/>Extract all phenotypes of the following text:<br/>"The 55-year-old patient has fever and hypertension."<br/><br/>**Example 2**<br/>Extract the location mentioned in the following text:<br/>"The student graduated from Harvard university, which is located in Boston" |
| Summarization | **Example 1**<br/>请简要总结概括以下段落材料。<br/>新华社快讯斯里兰卡政府部门21日说首都科伦坡包括教堂、酒店等多个地点当天发生的爆炸已导致至少70人死亡另有260多人受伤。<br/><br/> **Example 2**<br/>请简要总结概括以下段落材料。<br/>近期参与京雄高铁站站房建设的中铁十二局因在施工过程中存在环境违法行为被雄安新区公开通报。通报发出后引起社会广泛关注。近日人民网记者从雄安新区相关部门及中铁十二局获悉新区有关部门已经集中约谈了中铁十二局等24个参与雄安建设的项目单位。对于约谈内容和结果中铁十二局有关宣传负责人回应“具体内容不清楚最好找雄安新区相关部门了解情况。”新区有关部门负责人表示此前涉及的环境违法行为中铁十二局已基本整改到位但约谈内容和结果暂不公开接下来将按部就班推进环境治理工作。原题为《雄安新区中铁十二局涉环境违法已基本整改到位》 | **Example 1**<br/>Please provide a summary based on the following news<br/>"China plans to launch its first space station core module in 2022, an important development in the country's space program. The space station, called Tianhe, will include three modules: a core module, an experiment module and an astronomy module. The first launch of the core module will be used to test and verify the basic functions of the station, as well as to conduct related scientific research and technology experiments. "<br/><br/>**Example 2**<br/>What information is provided in the table below? Summarize the core information in it<br/>"Ranking, Player Name, Team, Position, Salary (in millions of dollars)<br/>1, LeBron James, Los Angeles Lakers, SF, 45.0<br/>2, Stephen Curry, Golden State Warriors, PG, 43.5" |
| Evaluation Category | Chinese Example | English Example |
| :-----------------: | :----------------------------------------------------------- | :----------------------------------------------------------- |
| Brainstorming | **Example 1:**<br/>请介绍一下人工智能的多个领域。<br/><br/>**Example 2:**<br/>请给出管理家庭财务的3个小技巧。<br/> | **Example 1:**<br/>How can I improve my memory? Any useful techniques you can suggest?<br/><br/>**Example 2:**<br/>What are some ways to increase productivity while working from home? |
| Chat | **Example 1:**<br/>基于以下角色信息完成一段对话。小张是一名新手爱好者,对养鸡有浓厚的兴趣。老李是一名有丰富经验的养鸡大师。<br/>小张:您好,老李,我最近开始对养鸡感兴趣了,想请教您一些问题。 <br/>老李:你好,小张,我很乐意帮助你。你想问些什么? <br/>小张:我想知道如何确定鸡的品种和性别? <br/>老李:确切的品种可以通过鸡的外貌特征来确定,而性别一般是通过鸡卵的大小和形状来判断。还有什么问题吗?<br/> 小张:<br/><br/>**Example 2:**<br/>基于以下角色信息完成一段对话。小明是一名医生,一位老年病患者想要停药,但他对病情有所忽视并有担忧;王叔叔是老年病患者的儿子,希望能够听取医生的建议。<br/>小明:你好,王叔叔,我了解你想要让你父亲停药。<br/>王叔叔:是的,我父亲已经吃了那么久的药,我担心药物对他的身体会有副作用。<br/>小明: | **Example 1:**<br/>Complete a conversation based on the following character information. Amy is a 30-year-old chef who runs her own restaurant. Jack is a food blogger who specializes in reviewing local restaurants.<br/>Amy: Hi Jack, I heard that you're a food blogger. Nice to meet you. <br/>Jack: Hi Amy, yes I am. Your restaurant has been receiving a lot of good reviews lately. <br/>Amy: Yes, we use only fresh and quality ingredients, and every dish is carefully crafted. <br/>Jack: <br/><br/>**Example 2:**<br/>Complete a dialogue based on the following role information. A: Elementary student B: Teacher<br/>B: Good morning, Student A. Today we're going to learn about addition and subtraction.<br/>A: Teacher, I already know this very well. Why do I need to learn it again?<br/>B: |
| Classification | **Example 1:**<br/>新闻标题:今日立夏,有一上联,立夏万物并秀,下联怎么对?<br/>请根据以上新闻标题判断新闻所属的分类,你需要从文化,娱乐,体育,财经,房产,教育,科技,旅游,游戏,军事这十类中选择一个答案。<br/><br/> **Example 2:**<br/>新闻标题:赵丽颖很久没有登上微博热搜了,但你们别急,她只是在憋大招而已。<br/>请根据新闻标题判断新闻所属的分类,你需要从文化,娱乐,体育,财经,房产,教育,科技,旅游,游戏,军事这十类中选择一个答案。 | **Example 1:**<br/>Title: Fighting for Love (2020) <br/>Description: Jasmine got obsessed with a man and now he's obsessed with her. Steamy nights, kisses and rules being broken awaits them. She turned his whole world upside down and now he's doing it to hers. In this free fall, can they survive each others love?\"<br/>Based on the above information, determine which genre the work of art belongs to. You can only choose one from \"sport\", \"horror\", \"drama\", \"history\", \"romance\", \"biography\", \"science fiction\", \"comedy\", \"animation\", \"documentary\", \"music\" and \"news\".<br/><br/>**Example2:** <br/>Title: Summer Breeze: The Isley Brothers Greatest Hits Live (2005)<br/>Description: Filmed in the US in 2005 and captured in excellent form led by Ron Isley's vocals and Ernie Isley's hard edged guitar. Virtually every track is a hit including Shout, Who's That Lady, Twist And Shout, Summer Breeze and Harvest For The World.<br/>Based on the above information, determine which genre the work of art belongs to. You can only choose one from \"sport\", \"horror\", \"drama\", \"history\", \"romance\", \"biography\", \"science fiction\", \"comedy\", \"animation\", \"documentary\", \"music\" and \"news\"." |
| Closed QA | **Example 1:**<br/>请从以下选项中选择正确答案。以下哪个是世界上最高山峰? <br/>A. 长城 <br/>B. 泰山 <br/>C. 珠穆朗玛峰 <br/>D. 黄山<br/><br/>**Example 2:**<br/>请从以下选项中选择一个最佳答案回答下面的问题。问题:非洲最高的山是哪座山?<br/> 选项: <br/>A. 麦金利山 <br/>B. 喜马拉雅山 <br/>C. 乞力马扎罗山 | **Example 1:**<br/>Which of the following options is NOT a primary color?<br/>(a) yellow<br/>(b) blue<br/>(c) orange<br/>(d) red<br/><br/>**Example 2:**<br/>Choose the correct option to complete the following sentence: \"Harry Potter and the Chamber of Secrets\" is the ________ book in the Harry Potter series.<br/>(A) first<br/>(B) second<br/>(C) third<br/>(D) fourth |
| Extraction | **Example 1:**<br/>根据以下新闻文本提取新闻报道时间例如回答时按照格式“新闻报道时间2007年8月10日”<br/>新闻文本如下2007-4-7中新网4月7日电据中国消防在线消息4月4日晚上7时30分左右湖南长潭高速公路上发生一起6车连环相撞失火事故。长株潭三地消防部门共出动消防车21台警力100余人。经过消防官兵近2个小时奋力扑救大火被成功扑灭。据初步调查有1人在此次事故中死亡。<br/><br/>**Example 2:**<br/>根据以下新闻文本提取新闻报道时间例如回答时按照格式“新闻报道时间2007年8月10日”<br/>新闻文本如下2014年1月15日据外媒《俄罗斯报》报道称位于北半球的澳大利亚现在正处于炎热的夏季而近日也到了高温酷暑的时候当地时间1月14日晚澳大利亚南部一夜间发生至少250起火灾。受炎热天气及雷雨天气影响澳大利亚南部一夜间发生至少250起火灾灾情多集中在维多利亚州。火灾发生后救援人员立即展开救灾行动。目前大部分起火点火势已被控制。 | **Example 1:**<br/>Ernest Hemingway, an American literary giant known for his spare and direct writing style, has penned timeless works such as 'The Old Man and the Sea', 'For Whom the Bell Tolls', and 'A Farewell to Arms', which have made a profound impact on the literary world and continue to be widely read and admired today.<br/>Extract the name of the author mentioned above.<br/><br/>**Example 2:**<br/>In the epic fantasy series 'A Song of Ice and Fire', George R.R. Martin weaves a complex web of political intrigue, war, and magic across the fictional continents of Westeros and Essos. Martin's richly developed characters and intricate plotlines have captivated readers worldwide, much like his other acclaimed works such as 'A Clash of Kings' and 'A Storm of Swords'.<br/>Extract the name of the author in the above material. |
| Generation | **Example 1:**<br/>请撰写一篇文章,介绍如何通过改善生活习惯来预防疾病和延长寿命。<br/><br/>**Example 2:**<br/>请根据以下情节撰写一篇短篇小说:一名年轻人被困在一个荒岛上,他必须想办法生存下去直到被救援。但他很快发现自己并不孤单。 | **Example 1:**<br/>Write a descriptive paragraph about an island to relax and unwind, including details about the location and atmosphere.<br/><br/>**Example 2:**<br/>Can you help me write a persuasive email to my colleagues encouraging them to participate in a charitable fundraising event? |
| Open QA | **Example 1:**<br/>请问万有引力定律由谁提出的?<br/><br/>**Example 2:**<br/>哪些国家参与了第一次世界大战? | **Example 1:**<br/>What are the four basic tastes of the human palate?<br/><br/>**Example 2:**<br/>Who painted the The Scream? |
| Rewriting | **Example 1:**<br/>请将以下句子改为正确的语序。 <br/>生日快乐你祝他了吗?<br/><br/>**Example 2:**<br/>将以下文本翻译成英语:<br/>“这个周末我要去海边玩” | **Example 1:**<br/>Please translate the following sentences, which are a mixture of Chinese and English, into full English. <br/>我需要买一些healthy snacks比如nuts和dried fruits作为我的office的午餐.<br/><br/>**Example 2:**<br/>Please rewrite the sentence using an inverted sentence structure.<br/>We won't begin our journey until the sun sets. |
| Roleplay | **Example 1:**<br/>我想让你担任Android开发工程师面试官。我将成为候选人您将向我询问Android开发工程师职位的面试问题。我希望你只作为面试官回答。不要一次写出所有的问题。我希望你只对我进行采访。问我问题等待我的回答。不要写解释。像面试官一样一个一个问我等我回答。我的第一句话是“面试官你好”。 <br/><br/>**Example 2:**<br/>我想让你扮演讲故事的角色。你会想出引人入胜、富有想象力和吸引观众的有趣故事。它可以是童话故事、教育故事或任何其他类型的有潜力的故事以吸引人们的注意力和想象力。根据目标受众,您可以为您的讲故事环节选择特定的主题或主题,例如,如果是儿童,那么您可以谈论动物;如果是成人,那么基于历史的故事可能会更好地吸引他们等。我的第一个请求是我需要一个关于毅力的有趣故事。 | **Example 1:**<br/>Assume the role of a marriage counselor. Develop a series of communication exercises for a couple who are experiencing difficulties in their relationship. These exercises should promote active listening, empathy, and effective expression of emotions. Your first assignment is to provide a set of three exercises that focus on resolving conflicts and rebuilding trust. <br/><br/>**Example 2:**<br/>I want you to act as a travel agent. I will tell you my desired destination, travel dates, and budget, and it will be your job to suggest the best travel itinerary for me. Your recommendations should include the best transportation options, hotel accommodations, and any popular tourist attractions nearby. My first request is "I want to plan a trip to Tokyo for a week, with a budget of $2000. I want to explore the culture and food of the city." |
| Summarization | **Example 1:**<br/>请简要总结概括以下段落材料。<br/>当地时间29日泰国卫生部通报新增143名新冠肺炎确诊病例和1名死亡病例。截止到当地时间29日上午泰国累计确诊病例1388例其中泰国籍1172例非泰国籍216例。死亡病例累计7例。原题为《泰国新增143例新冠肺炎确诊病例累计确诊1388例》<br/><br/> **Example 2:**<br/>请简要总结概括以下段落材料。<br/>近期参与京雄高铁站站房建设的中铁十二局因在施工过程中存在环境违法行为被雄安新区公开通报。通报发出后引起社会广泛关注。近日人民网记者从雄安新区相关部门及中铁十二局获悉新区有关部门已经集中约谈了中铁十二局等24个参与雄安建设的项目单位。对于约谈内容和结果中铁十二局有关宣传负责人回应“具体内容不清楚最好找雄安新区相关部门了解情况。”新区有关部门负责人表示此前涉及的环境违法行为中铁十二局已基本整改到位但约谈内容和结果暂不公开接下来将按部就班推进环境治理工作。原题为《雄安新区中铁十二局涉环境违法已基本整改到位》 | **Example 1:**<br/>The 21 year-old-woman was treated by paramedics after the kitchen fire in Botfield Road in Shifnal, Shropshire. West Mercia Police said it is treating Wednesday morning's incident as arson and are appealing for any witnesses to contact them.The 50-year-old man has been arrested on suspicion of arson with intent to endanger life. For more on this and other stories from Shropshire.<br/>Please briefly summarize the above material within 20 words.<br/><br/>**Example 2:**<br/>South Wales Police were called to a property in Heolgerrig, Merthyr Tydfil, at about 13:40 BST on Sunday. The child was airlifted to Prince Charles Hospital but died shortly afterwards. Police are investigating the circumstances surrounding the incident and have appealed for witnesses. The girl's family are being supported by specially trained officers.<br/>Please briefly summarize the above material within 20 words. |
### Evaluation Metrics
#### GPT Evaluation
Use GPT-3.5 to evaluate the prediction of different models, and pre-define evaluation metrics for different categories. There are 10 pre-defined evaluation metrics and you can refer to the table below:
| Evaluation Metric | Prompt Words | CoT |
|:-----------------------:|:-------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Language organization | 语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。 | 1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。<br/> 2.检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说<br/> 3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。<br/> 4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。<br/> 5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。<br/> 6. 根据以上因素综合评估答案的语言组织并给出一个1到5的分数其中5表示语言组织非常好而1表示语言组织非常差。 |
| Relevance | 切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。 | 1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。<br/> 2. 阅读答案,确认答案是否直接回答了题目所问的问题。<br/> 3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。<br/> 4. 根据以上因素综合评估答案的切题程度并给出一个1到5的分数其中5表示答案非常切题而1表示答案完全没有切题。 |
| Creativity | 创意性(1-5):某些头脑风暴问题可能需要答案具有创意,提出新的思路。 | 1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。<br/> 2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则创意性评分可能会受到影响。<br/> 3. 考虑答案中是否包含新颖的想法或独特的思路。答案可能与已知的解决方案有所重叠,但仍然可以被认为是有创意的,只要它提供了新的角度或方法来解决问题。<br/> 4. 根据答案的创意性给出一个1到5的评分。如果答案缺乏创意则应给出一个较低的评分。如果答案具有创意并提供了新的思路应给出一个较高的评分。 |
| Practicality | 实用性(1-5):某些头脑风暴问题可能需要答案提出实用的建议或解决方法。 | 1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。<br/> 2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则实用性评分可能会受到影响。<br/> 3. 考虑答案中提出的建议或解决方法是否实用并可行。答案可能看起来很好,但如果无法实现或应用,则实用性评分可能会受到影响。<br/> 4. 根据答案的实用性给出一个1到5的评分。如果答案缺乏实用性则应给出一个较低的评分。如果答案提出了实用的建议或解决方法并且可以很好地解决问题则应给出一个较高的评分。 |
| Correctness | 正确性(1-5):答案应该符合常识、生活实际等等 | 1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。<br/> 2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则正确性评分可能会受到影响。<br/> 3. 考虑答案中所提供的信息是否正确、符合常识、生活实际等等。如果答案中存在明显的错误或不合理之处,则正确性评分可能会受到影响。<br/> 4. 根据答案的正确性给出一个1到5的评分。如果答案存在明显的错误或不合理之处则应给出一个较低的评分。如果答案正确、符合常识、生活实际等等则应给出一个较高的评分。 |
| Naturalness | 自然(1-5):答案是否自然,并且符合问题给定的身份。 | 1. 阅读题目,确定题目提供的身份信息。<br/> 2. 检查答案内容是否符合题目给定的身份。<br/> 3. 根据以上因素对该回答的自然性进行打分分数从1到5其中1表示不自然5表示非常自然并符合问题给定的身份。 |
| Engagingness | 参与感(1-5):答案是否对前面的对话内容做出了恰当的反应,是否理解对话的语境和背景。 | 1. 阅读题目,确定对话的语境和背景。<br/> 2. 检查答案是否充分理解对话的语境和背景,能否自然地融入到对话中而不显得突兀。<br/> 3. 根据以上因素对该回答的参与感进行打分分数从1到5其中1表示没有参与感5表示非常有参与感并且恰当地理解了对话的语境和背景。 |
| Reasonableness | 合理性(1-5):答案是否能够与前面的对话内容形成逻辑上的衔接,是否符合常理,能否在这个上下文中合理存在。 | 1. 阅读题目,确定对话的主题以及问题期望的回答方向。<br/> 2. 判断答案是否能够与前面的对话内容形成逻辑上的衔接,是否符合常理,能否在这个上下文中合理存在。<br/> 3. 根据以上因素对该回答的合理性进行打分分数从1到5其中1表示不合理5表示非常合理并且能够与前面的对话内容形成逻辑上的衔接并符合常理。 |
| Diversity | 多样性(1-5):答案使用语言是否优美,具有有一定的创造性和想象力。然而,回答也应该保持合理和适度,不要过于夸张或离题。 | 1. 仔细阅读整个回答,确保完全理解回答所表达的内容和主题。<br/> 2. 在阅读回答的同时,注意语言的质量,例如措辞是否正确,语言是否生动等。<br/> 3. 检查回答的创造性和想象力,看看回答是否能够吸引人阅读下去。<br/> 4. 检查回答的合理性和适度看看回答是否夸张或离题。5. 将多样性的评分打分在1到5之间5分表示回答的质量很好能够吸引人阅读1分表示回答的内容生硬或者有离题的问题。 |
| Fidelity | 保真度(1-5):答案是否能够严格遵守角色的设定回答给定的请求。 | 1. 仔细阅读问题,了解角色在问题中的设定和表现,包括职业、背景、观点、性格等方面。<br/> 阅读题目的请求,确认回答请求时需要注意的细节。<br/> 3. 对比提供的回答与该角色的设定,评估回答是否能够严格遵守角色的设定。<br/> 4. 结合以上评估结果给出保真度的评分范围从1到5分其中1分表示回答与角色设定完全不符5分表示回答完全符合角色设定且满足给定请求。 |
| Conciseness | 简明扼要(1-5):答案是否简明扼要,没有冗余内容。 | 1. 阅读题目,提取出材料的重点。<br/> 2. 阅读该总结,并注意其中的主要观点和信息。<br/> 3. 评估总结的长度。一个简明扼要的总结通常应该在几句话或几段文字内传达关键信息,而不是冗长的段落或文章。<br/> 4. 检查总结是否包含与主要观点无关的信息或冗余信息。<br/> 5. 确定总结涵盖了材料中的关键信息,并且没有忽略任何重要细节。<br/> 6. 给总结打出1-5的分数其中5表示总结简明扼要没有冗余内容而1表示总结冗长或包含不必要的信息难以理解或记忆。根据您的判断打出适当的得分。 |
GPT evaluation uses GPT models to evaluate the prediction of different models, and different pre-defined evaluation metrics are applied to different categories. The following table shows the 11 pre-defined evaluation metrics both in Chinese and English:
GPT-3.5 evaluates the quality of model predictions based on the given prompt words and gives a score between 1-5.
| Evaluation Metric | Prompt Words | CoT(Chain-of-Thought) |
| :-------------------: | :----------------------------------------------------------- | :----------------------------------------------------------- |
| 语言组织<br/>(Language organization) | 语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。</br></br>Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc. | 1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。<br/> 2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说<br/> 3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。<br/> 4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。<br/> 5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。<br/> 6. 根据以上因素综合评估答案的语言组织并给出一个1到5的分数其中5表示语言组织非常好而1表示语言组织非常差。</br></br>1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.<br>2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.<br>3. Determine if the answer is relevant to the question or topic and conveys a clear message.<br>4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.<br>5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.<br>6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization. |
| 切题<br/>(Relevance) | 切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。</br></br>Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic. | 1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。<br/> 2. 阅读答案,确认答案是否直接回答了题目所问的问题。<br/> 3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。<br/> 4. 根据以上因素综合评估答案的切题程度并给出一个1到5的分数其中5表示答案非常切题而1表示答案完全没有切题。</br></br>1. Read the question to determine what the question asks and what aspects of the question need to be answered.<br>2. Read the answers to make sure that they directly answer the question asked.<br>3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.<br>4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all. |
| 创意性<br/>(Creativity) | 创意性(1-5):某些头脑风暴问题可能需要答案具有创意,提出新的思路。</br></br>Creativity (1-5): Some brainstorming questions may require answers that are creative and suggest new ideas. | 1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。<br/> 2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则创意性评分可能会受到影响。<br/> 3. 考虑答案中是否包含新颖的想法或独特的思路。答案可能与已知的解决方案有所重叠,但仍然可以被认为是有创意的,只要它提供了新的角度或方法来解决问题。<br/> 4. 根据答案的创意性给出一个1到5的评分。如果答案缺乏创意则应给出一个较低的评分。如果答案具有创意并提供了新的思路应给出一个较高的评分。</br></br>1. Read the provided brainstorming questions carefully to make sure you understand the gist and context of the questions.<br>2. Based on your knowledge and experience, determine if the answers provided are feasible. If the answer is not feasible, the creativity score may be affected.<br>3. Consider whether the answer contains novel ideas or unique thoughts. An answer may overlap with a known solution and still be considered creative, as long as it offers a new perspective or approach to the problem.<br>4. Give a score of 1 to 5 depending on the creativity of the answer. If the answer lacks creativity, a lower score should be given. If the answer is creative and provides a new idea, a higher score should be given. |
| 实用性<br/>(Practicality) | 实用性(1-5):某些头脑风暴问题可能需要答案提出实用的建议或解决方法。</br></br>Practicality (1-5): Some brainstorming questions may require answers to suggest practical suggestions or solutions. | 1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。<br/> 2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则实用性评分可能会受到影响。<br/> 3. 考虑答案中提出的建议或解决方法是否实用并可行。答案可能看起来很好,但如果无法实现或应用,则实用性评分可能会受到影响。<br/> 4. 根据答案的实用性给出一个1到5的评分。如果答案缺乏实用性则应给出一个较低的评分。如果答案提出了实用的建议或解决方法并且可以很好地解决问题则应给出一个较高的评分。</br></br>1. Read the provided brainstorming questions carefully to make sure you understand the gist and context of the questions.<br>2. Based on your knowledge and experience, determine if the answers provided are feasible. If the answer is not feasible, the practicality score may be affected.<br>3. Consider whether the suggestions or solutions presented in the answer are practical and workable. The answer may look good, but if it cannot be implemented or applied, the practicality score may be affected.<br>4. Give a score of 1 to 5 depending on the practicality of the answer. If the answer lacks practicality, a lower score should be given. If the answer makes a practical suggestion or solution and solves the problem well, a higher score should be given. |
| 正确性<br/>(Correctness) | 正确性(1-5):答案应该符合常识、生活实际等等。 </br></br> Correctness (1-5): The answer should be in line with common sense, life experience, etc. | 1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。<br/> 2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则正确性评分可能会受到影响。<br/> 3. 考虑答案中所提供的信息是否正确、符合常识、生活实际等等。如果答案中存在明显的错误或不合理之处,则正确性评分可能会受到影响。<br/> 4. 根据答案的正确性给出一个1到5的评分。如果答案存在明显的错误或不合理之处则应给出一个较低的评分。如果答案正确、符合常识、生活实际等等则应给出一个较高的评分。</br></br>1. Read the provided brainstorming questions carefully to make sure you understand the gist and context of the questions.<br>2. Based on your knowledge and experience, determine if the answers provided are feasible. If the answer is not feasible, the correctness score may be affected.<br>3. Consider whether the information provided in the answer is correct, consistent with common sense, real life, etc. If there are obvious errors or implausibilities in the answer, the correctness score may be affected.<br>4. Give a score of 1 to 5 depending on the correctness of the answer. If the answer contains obvious errors or unreasonable points, a lower score should be given. A higher score should be given if the answer is correct, consistent with common sense, real life, etc. |
| 自然<br/>(Naturalness) | 自然(1-5):答案是否自然,并且符合问题给定的身份。</br></br>Naturalness (1-5): whether the answer is natural and fits the identity given by the question. | 1. 阅读题目,确定题目提供的身份信息。<br/> 2. 检查答案内容是否符合题目给定的身份。<br/> 3. 根据以上因素对该回答的自然性进行打分分数从1到5其中1表示不自然5表示非常自然并符合问题给定的身份。</br></br>1. Read the question and determine the identity information provided in the question.<br>2. Check whether the content of the answer matches the identity given in the question.<br>3. Based on the above factors, score the naturalness of the response on a scale from 1 to 5, where 1 means unnatural and 5 means very natural and in accordance with the identity given in the question. |
| 参与感<br/>(Engagingness) | 参与感(1-5):答案是否对前面的对话内容做出了恰当的反应,是否理解对话的语境和背景。</br></br>Engagingness (1-5): whether the answer responds appropriately to the content of the preceding conversation and whether it understands the context and background of the conversation. | 1. 阅读题目,确定对话的语境和背景。<br/> 2. 检查答案是否充分理解对话的语境和背景,能否自然地融入到对话中而不显得突兀。<br/> 3. 根据以上因素对该回答的参与感进行打分分数从1到5其中1表示没有参与感5表示非常有参与感并且恰当地理解了对话的语境和背景。</br></br>1. Read the questions to determine the context and background of the dialogue.<br>2. Check that the answer fully understands the context and background of the conversation and that it fits naturally into the conversation without seeming abrupt.<br>3. Based on the above factors, rate the response's engagement on a scale from 1 to 5, where 1 means not engaged and 5 means very engaged and appropriately understands the context and background of the conversation. |
| 合理性<br/>(Reasonableness) | 合理性(1-5):答案是否能够与前面的对话内容形成逻辑上的衔接,是否符合常理,能否在这个上下文中合理存在。</br></br>Reasonableness (1-5): Whether the answer can form a logical connection with the content of the previous dialogue, whether it is consistent with common sense, and whether it can reasonably exist in this context. | 1. 阅读题目,确定对话的主题以及问题期望的回答方向。<br/> 2. 判断答案是否能够与前面的对话内容形成逻辑上的衔接,是否符合常理,能否在这个上下文中合理存在。<br/> 3. 根据以上因素对该回答的合理性进行打分分数从1到5其中1表示不合理5表示非常合理并且能够与前面的对话内容形成逻辑上的衔接并符合常理。</br></br>1. Read the question and determine the topic of the conversation and the direction the question expects the answer to go.<br>2. Determine whether the answer can be logically connected to the preceding conversation, whether it makes common sense, and whether it can reasonably exist in this context.<br>3. Based on the above factors, rate the reasonableness of the answer on a scale from 1 to 5, where 1 means unreasonable and 5 means very reasonable and able to form a logical connection with the preceding dialogue content and consistent with common sense. |
| 多样性<br/>(Diversity) | 多样性(1-5):答案使用语言是否优美,具有有一定的创造性和想象力。然而,回答也应该保持合理和适度,不要过于夸张或离题。</br></br>Diversity (1-5): Whether the answers use beautiful language and have some creativity and imagination. However, answers should also be kept reasonable and moderate, not overly exaggerated or off-topic. | 1. 仔细阅读整个回答,确保完全理解回答所表达的内容和主题。<br/> 2. 在阅读回答的同时,注意语言的质量,例如措辞是否正确,语言是否生动等。<br/> 3. 检查回答的创造性和想象力,看看回答是否能够吸引人阅读下去。<br/> 4. 检查回答的合理性和适度看看回答是否夸张或离题。5. 将多样性的评分打分在1到5之间5分表示回答的质量很好能够吸引人阅读1分表示回答的内容生硬或者有离题的问题。</br></br>1. Read the entire response carefully to ensure that you fully understand the content and theme expressed in the response.<br>2. While reading the response, pay attention to the quality of the language, such as whether the wording is correct and the language is vivid.<br>3. Check the creativity and imagination of the response to see if the response is engaging to read on.<br>4. Check the reasonableness and appropriateness of the responses to see if the responses are exaggerated or off-topic.<br>5. Rate the diversity on a scale of 1 to 5, with a 5 indicating a good quality response that is engaging to read and a 1 indicating a raw response or a question that is off-topic. |
| 保真度<br/>(Fidelity) | 保真度(1-5):答案是否能够严格遵守角色的设定回答给定的请求。</br></br>Fidelity (1-5): whether the answer is able to answer the given request in strict compliance with the role setting. | 1. 仔细阅读问题,了解角色在问题中的设定和表现,包括职业、背景、观点、性格等方面。<br/> 阅读题目的请求,确认回答请求时需要注意的细节。<br/> 3. 对比提供的回答与该角色的设定,评估回答是否能够严格遵守角色的设定。<br/> 4. 结合以上评估结果给出保真度的评分范围从1到5分其中1分表示回答与角色设定完全不符5分表示回答完全符合角色设定且满足给定请求。</br></br>1. Read the question carefully to understand how the character is set up and represented in the question, including aspects such as occupation, background, point of view, and personality.<br>2. Read the question's request and confirm the details that need to be taken into account when answering the request.<br>3. Compare the provided answer with the setting of the role and assess whether the answer can strictly adhere to the setting of the role.<br>4. Combine the results of the above assessment to give a fidelity score ranging from 1 to 5, where a score of 1 means that the response does not match the persona at all, and a score of 5 means that the response fully complies with the persona and satisfies the given request. |
| 简明扼要<br/>(Conciseness) | 简明扼要(1-5):答案是否简明扼要,没有冗余内容。</br></br>Conciseness (1-5): answers should be concise and without redundant content. | 1. 阅读题目,提取出材料的重点。<br/> 2. 阅读该总结,并注意其中的主要观点和信息。<br/> 3. 评估总结的长度。一个简明扼要的总结通常应该在几句话或几段文字内传达关键信息,而不是冗长的段落或文章。<br/> 4. 检查总结是否包含与主要观点无关的信息或冗余信息。<br/> 5. 确定总结涵盖了材料中的关键信息,并且没有忽略任何重要细节。<br/> 6. 给总结打出1-5的分数其中5表示总结简明扼要没有冗余内容而1表示总结冗长或包含不必要的信息难以理解或记忆。根据您的判断打出适当的得分。</br></br>1. Read the title and extract the main points of the material.<br>2. Read the summary and note the main ideas and messages in it.<br>3. Assess the length of the summary. A concise summary should usually convey key information within a few sentences or paragraphs, rather than lengthy paragraphs or essays.<br>4. Check that the summary does not contain information that is not relevant to the main ideas or that is redundant.<br>5. Make sure that the summary covers the key information in the material and that no important details have been omitted.<br>6. Rate the summary on a scale of 1-5, where 5 means the summary is concise and free of redundancy, and 1 means the summary is lengthy or contains unnecessary information that is difficult to understand or remember. Based on your judgment, assign the appropriate score. |
GPT models evaluate the quality of model predictions based on the given prompt words and give a score between 1 and 5.
> **NOTE:** Even for the same metric, the details of its prompt words and CoT (Chain-of-Thought) can differ depending on which category you want to evaluate. For example, the prompt words for the metric `correctness` shown here are "The answer should be in line with common sense, life experience, etc." (for category `brainstorming`), but for category `extraction` the prompt words can be "Answers should extract the required information accurately and should not contain any incorrect or misleading information." You can find all the prompt words and CoT (Chain-of-Thought) in `prompt/evaluation_prompt`.
#### Automatic Evaluation
Automated metrics evaluate the capability of a model by comparing model predictions with reference answers.
There are two ways to obtain reference answers:
* For instructions coming from human-designed problems (e.g. roleplay, chat), the reference answers are generated by GPT-3.5.
* For instructions related to classic NLP problems (e.g. classification, extraction, summarization), the reference answers are collected from open-source datasets with target answers.
There are 5 types of automatic evaluation metrics listed in the table below:
| Automatic Evaluation Metric | Description |
| :---------------------------------: | :----------------------------------------------------------- |
| BLEU-n | Measures the accuracy between prediction and reference.<br/> BLEU-1 (unigram) evaluates accuracy at the word level.<br/> BLEU-n (n-gram) evaluates fluency at the sentence level. |
| ROUGE | ROUGE-N measures the number of matching n-grams between prediction and reference.<br/> ROUGE-L measures the longest common subsequence (LCS) between prediction and reference. |
| Distinct | Measures the diversity of generated text by counting the unique n-grams. |
| BERTScore | Measures the semantic similarity between tokens of predictions and references with BERT. |
| Precision<br/> Recall<br/> F1 Score | Measure the number of overlaps between prediction and reference (designed for the classification and extraction categories). |
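For intuition, here is a minimal, self-contained sketch of the Distinct idea for English text. It is an illustrative simplification, not the pipeline's exact implementation; the function name `distinct_n` and the averaging over n = 1, 2, 3 are assumptions made for this example.

```python
# Minimal sketch: average distinct-n (unique n-grams / total n-grams) over n = 1..3.
import statistics
from typing import List


def distinct_n(preds: List[str], max_n: int = 3) -> float:
    per_pred = []
    for pred in preds:
        tokens = pred.split()
        ratios = []
        for n in range(1, max_n + 1):
            ngrams = [" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
            # Guard against short answers that have no n-grams for larger n.
            ratios.append(len(set(ngrams)) / (len(ngrams) + 1e-6))
        per_pred.append(statistics.mean(ratios))
    return statistics.mean(per_pred)


print(distinct_n(["the cat sat on the mat", "a completely different answer"]))
```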
## Evaluation Process
### Data Format
#### Target Answers / Predictions
A JSON file contains one list. Each element in the list is a target answer / prediction record for one instruction / question.
An element should have the following fields:
@ -103,7 +114,8 @@ An element should have the following fields:
If the `input` has a target answer, the `output` can be empty. Otherwise, we generate answers from GPT-3.5 as the `output`, and the `target` field is empty.
Example:
```json
[
{
"category": "brainstorming",
@ -138,7 +150,8 @@ An element should have the following fields:
* `id` (int, compulsory): The ID of the instruction / question.
Example:
```json
[
{
"category": "brainstorming",
@ -159,34 +172,79 @@ Example:
]
```
### Prompt
#### Battle Prompt
The following is the Chinese battle prompt. In the battle prompt, the question and answers from two different models are fed into the prompt template. You can find example battle prompt files for Chinese and English in `prompt/battle_prompt`.
```json
{
"id": 1,
"system_prompt": "你是一个检查回答质量的好助手。",
"prompt_template": "[问题]\n{question}\n\n[1号AI助手的答案]\n{answer_1}\n\n[1号AI助手答案终止]\n\n[2号AI助手的答 案]\n{answer_2}\n\n[2号AI助手答案终止]\n\n[要求]\n{prompt}\n\n",
"prompt": "我们需要你评价这两个AI助手回答的性能。\n请对他们的回答的有用性、相关性、准确性、详细程度进行评分。每个AI助手都会得到一个1到10分的总分分数越高表示整体表现越好。\n请首先输出一行该行只包含两个数值分别表示1号和2号AI助手的分数。这两个分数之间要有一个空格。在随后的一行中请对你的评价作出全面的解释避免任何潜在的偏见并确保AI助手回答的顺序不会影响您的判断。"
}
```
#### Evaluation Prompt
The following is an example of a Chinese GPT evaluation prompt. In an evaluation prompt, you should define your metrics in `metrics` and provide CoT(Chain-of-Thought) in `CoT`. You can find example evaluation prompt files for Chinese and English in `prompt/evaluation_prompt`.
```json
{
"brainstorming": {
"id": 1,
"category": "brainstorming",
"metrics": {
"language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。"
},
"CoT": {
"language organization": "1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。\n2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说。\n3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。\n4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。\n5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。\n6. 根据以上因素综合评估答案的语言组织并给出一个1到5的分数其中5表示语言组织非常好而1表示语言组织非常差。\n\n语言组织"
},
"prompt": "你是一个好助手。请你为下面“头脑风暴”问题的答案打分。\n\n问题如下\n\n{question}\n\n答案如下\n\n{answer}\n\n评分的指标如下\n\n{metric}\n\n请你遵照以下的评分步骤\n\n{steps}"
}
}
```
`"metrics"`: the metrics that can be used in GPT evaluation. This field determines which metrics can be added to your config file.
`"CoT"`: evaluation steps you prompt to GPT models for each metric defined in `"metrics"`.
### Evaluation
#### Configuration
The following is an example of a Chinese config file. The configuration file controls how the pipeline evaluates the model: you specify the GPT evaluation metrics under the key `GPT` and the automatic metrics under the key `Metrics`. You can find an example Chinese config file in `config`.
```json
{
"language": "cn",
"category": {
"brainstorming": {
"GPT-3.5": ["relevance", "creativity", "practicality", "correctness"],
"GPT": ["relevance", "creativity", "practicality", "correctness"],
"Metrics": ["Distinct"]
},
"chat": {
"GPT-3.5": [ "relevance", "naturalness", "engagingness", "reasonableness"],
"GPT": [ "relevance", "naturalness", "engagingness", "reasonableness"],
"Metrics": ["Distinct"]
}
}
}
```
`"language"`: evaluate the model capability in which language, we only support Chinese `"cn"` for now.
`"category"`: evaluate the model capability in which category/categories.
`"GPT-3.5"`: config metrics for GPT-3.5 evaluation.
`"Metrics"`: config metrics for automatic metrics evaluation.
`"language"`: the language used to evaluate the model capability. We only support Chinese `"cn"` for now.
`"category"`: the category/categories needed to evaluate the model capability.
`"GPT"`: the metrics you want to use for GPT evaluation.
`"Metrics"`: the metrics you want to use for automatic metrics evaluation.
You can create your own config file based on the available settings listed in the following table.
| "category" | "GPT-3.5" | "Metrics" |
|:----------------:|:-----------------------:|:-----------:|
| "category" | "GPT" | "Metrics" |
| :--------------: | :---------------------: | :---------: |
| "brainstorming" | "language organization" | "BLEU" |
| "chat" | "relevance" | "ROUGE" |
| "classification" | "creativity" | "Distinct" |
@ -194,16 +252,19 @@ You can create your config file based on available settings listed in following
| "extraction" | "correctness" | "Precision" |
| "generation" | "naturalness" | "Recall" |
| "open_qa" | "engagingness" | "F1 score" |
| "rewriting" | "reasonableness" |
| "roleplay" | "diversity" |
| "summarization" | "fidelity" |
| | "conciseness" |
| "rewriting" | "reasonableness" | |
| "roleplay" | "diversity" | |
| "summarization" | "fidelity" | |
| | "conciseness" | |
> **NOTE:** For categories that don't have standard answers, such as `brainstorming`, you should avoid similarity-based automatic metrics such as `BLEU` and `ROUGE`, and use `Distinct` instead in your config file.
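To sanity-check a config file before running the pipeline, a small script like the following (patterned on the config handling in `eval.py`; the file path is illustrative) prints the metrics chosen for each category:

```python
# Minimal sketch: load a config file and list the metrics per category.
import json

with open("config/config_en.json", encoding="utf-8") as f:  # illustrative path
    config = json.load(f)

assert config["language"] in ["cn", "en"]
for category, settings in config["category"].items():
    print(f"{category}: GPT={settings['GPT']}, Metrics={settings['Metrics']}")
```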
#### Evaluate
After setting the configuration file, you can evaluate the model using `eval.py`. If you want to make comparisons between answers of two different models, you should specify two answer files in the argument `answer_file_list` and two model names in the argument `model_name_list`. If you want to evaluate one answer file, the length of both `answer_file_list` and `model_name_list` should be 1, and the program will perform evaluation using automatic metrics and GPT models.
An example script is provided as follows:
```shell
python eval.py \
--config_file "path to the config file" \
@ -212,14 +273,40 @@ python eval.py \
--target_file "path to the target answer file" \
--answer_file_list "path to the answer files of at most 2 models" \
--model_name_list "the names of at most 2 models" \
--gpt_model "which GPT model to use for evaluation" \
--save_path "path to save results" \
--openai_key "your openai key" \
```
## FAQ
<details><summary><b>How can I add a new GPT evaluation metric?</b></summary>
For example, if you want to add a new metric `persuasiveness` into category `brainstorming`, you should add the metric definition and its corresponding CoT (Chain-of-Thought) in the evaluation prompt file in `prompt/evaluation_prompt`. The CoT can be generated using ChatGPT: you can prompt ChatGPT to generate evaluation steps for the new metric, as in the sketch after the example below.
```json
{
"brainstorming": {
"id": 1,
"category": "brainstorming",
"metrics": {
"persuasiveness": "说服力(1-5)XXX"
},
"CoT": {
"persuasiveness": "XXX\n\n说服力"
},
"prompt": "你是一个好助手。请你为下面“头脑风暴”问题的答案打分。\n\n问题如下\n\n{question}\n\n答案如下\n\n{answer}\n\n评分的指标如下\n\n{metric}\n\n请你遵照以下的评分步骤\n\n{steps}"
}
}
```
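Below is a minimal sketch of such a request. The meta-prompt wording and the key handling are illustrative assumptions; only `openai.ChatCompletion.create` itself is the same call the evaluation code uses.

```python
# Minimal sketch: ask ChatGPT to draft CoT evaluation steps for a new metric.
import openai

openai.api_key = "your openai key"
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{
        "role": "user",
        "content": "请为评价指标“说服力(1-5)”生成详细的评分步骤,最后以“说服力:”结尾。",
    }],
    temperature=0,
)
print(response["choices"][0]["message"]["content"])
```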
</details>
## To Do
- [x] Add evaluation for English capability
- [ ] Support UniEval
- [x] Support GPT-4 evaluation
## Citations
@ -232,15 +319,6 @@ python eval.py \
year = {2023}
}
@misc{liu2023geval,
title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},

View File

@ -2,7 +2,7 @@
"language": "cn",
"category": {
"brainstorming": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"creativity",
@ -14,7 +14,7 @@
]
},
"chat": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"naturalness",
@ -26,7 +26,7 @@
]
},
"classification": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness"
@ -38,7 +38,7 @@
]
},
"closed_qa": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness"
@ -50,7 +50,7 @@
]
},
"extraction": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness"
@ -62,7 +62,7 @@
]
},
"generation": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"diversity"
@ -74,7 +74,7 @@
]
},
"open_qa": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness"
@ -84,7 +84,7 @@
]
},
"rewriting": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness"
@ -96,7 +96,7 @@
]
},
"roleplay": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"fidelity",
@ -107,7 +107,7 @@
]
},
"summarization": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness",

View File

@ -0,0 +1,123 @@
{
"language": "en",
"category": {
"brainstorming": {
"GPT": [
"language organization",
"relevance",
"creativity",
"practicality",
"correctness"
],
"Metrics": [
"Distinct"
]
},
"chat": {
"GPT": [
"language organization",
"relevance",
"naturalness",
"engagingness",
"reasonableness"
],
"Metrics": [
"Distinct"
]
},
"classification": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"Precision",
"Recall",
"F1 score"
]
},
"closed_qa": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"extraction": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"Precision",
"Recall",
"F1 score"
]
},
"generation": {
"GPT": [
"language organization",
"relevance",
"diversity"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"open_qa": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"Distinct"
]
},
"rewriting": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"roleplay": {
"GPT": [
"language organization",
"relevance",
"fidelity",
"creativity"
],
"Metrics": [
"Distinct"
]
},
"summarization": {
"GPT": [
"language organization",
"relevance",
"correctness",
"conciseness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
}
}
}

View File

@ -14,7 +14,7 @@ def main(args):
# load config
config = jload(args.config_file)
if config["language"] == "cn":
if config["language"] in ["cn", "en"]:
# get metric settings for all categories
metrics_per_category = {}
for category in config["category"].keys():
@ -39,7 +39,8 @@ def main(args):
"No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!")
# initialize evaluator
evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt)
evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt, args.gpt_model,
config["language"])
if len(args.model_name_list) == 2:
answers1 = jload(args.answer_file_list[0])
answers2 = jload(args.answer_file_list[1])
@ -87,6 +88,10 @@ if __name__ == '__main__':
default=[],
required=True,
help='the names of at most 2 models')
parser.add_argument('--gpt_model',
default="gpt-3.5-turbo",
choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-4"],
help='which GPT model to use for evaluation')
parser.add_argument('--save_path', type=str, default="results", help='path to save evaluation results')
parser.add_argument('--openai_key', type=str, default=None, required=True, help='Your openai key')
args = parser.parse_args()

View File

@ -4,7 +4,7 @@ from typing import Any, Dict, List
import gpt_evaluate
import metrics
import pandas as pd
from utils import get_data_per_category, jdump
from utils import analyze_automatic_results, get_data_per_category, save_automatic_results
class Evaluator(object):
@ -14,13 +14,15 @@ class Evaluator(object):
"""
def __init__(self, params: Dict[str, Any], battle_prompt: Dict[str, Any], gpt_evaluation_prompt: Dict[str,
Any]) -> None:
def __init__(self, params: Dict[str, Any], battle_prompt: Dict[str, Any], gpt_evaluation_prompt: Dict[str, Any],
gpt_model: str, language: str) -> None:
self.params = params
self.battle_prompt = battle_prompt
self.gpt_evaluation_prompt = gpt_evaluation_prompt
self.gpt_model = gpt_model
self.language = language
self.automatic_metric_stats = dict()
self.gpt35_evaluation_results = dict()
self.gpt_evaluation_results = dict()
self.battle_results = []
def battle(self, answers1: List[Dict], answers2: List[Dict]) -> None:
@ -40,21 +42,21 @@ class Evaluator(object):
"""
def switch(metric):
def switch(metric, language):
if metric == "BLEU":
return metrics.bleu_score(preds=predicts_list, targets=targets_list)
return metrics.bleu_score(preds=predicts_list, targets=targets_list, language=language)
elif metric == "ROUGE":
return metrics.rouge_cn_score(preds=predicts_list, targets=targets_list)
return metrics.rouge_score(preds=predicts_list, targets=targets_list, language=language)
elif (metric == "Distinct"):
return metrics.distinct_score(preds=predicts_list)
return metrics.distinct_score(preds=predicts_list, language=language)
elif (metric == "BERTScore"):
return metrics.bert_score(preds=predicts_list, targets=targets_list)
return metrics.bert_score(preds=predicts_list, targets=targets_list, language=language)
elif (metric == "Precision"):
return metrics.precision(preds=predicts_list, targets=targets_list)
return metrics.precision(preds=predicts_list, targets=targets_list, language=language)
elif (metric == "Recall"):
return metrics.recall(preds=predicts_list, targets=targets_list)
return metrics.recall(preds=predicts_list, targets=targets_list, language=language)
elif (metric == "F1 score"):
return metrics.F1_score(preds=predicts_list, targets=targets_list)
return metrics.F1_score(preds=predicts_list, targets=targets_list, language=language)
else:
raise ValueError(f"Unexpected metric")
@ -63,6 +65,10 @@ class Evaluator(object):
# automatic evaluation
for category in self.params:
if len(answers_per_category[category]) == 0:
print(f"Category {category} specified in your config doesn't have corresponding answers!")
continue
category_metrics = self.params[category]["Metrics"]
self.automatic_metric_stats[category] = {}
@ -72,19 +78,23 @@ class Evaluator(object):
predicts_list = [answer["output"] for answer in answers_per_category[category]]
for metric in category_metrics:
self.automatic_metric_stats[category].update(switch(metric=metric))
self.automatic_metric_stats[category].update(switch(metric=metric, language=self.language))
# gpt35 evaluation
# gpt evaluation
for category in self.params:
category_metrics = self.params[category]["GPT-3.5"]
if len(answers_per_category[category]) == 0:
print(f"Category {category} specified in your config doesn't have corresponding answers!")
continue
category_metrics = self.params[category]["GPT"]
prompt = self.gpt_evaluation_prompt.get(category, None)
if prompt is None:
print(f"No prompt for category {category}! Use prompt for category general now.")
prompt = self.gpt_evaluation_prompt["general"]
self.gpt35_evaluation_results[category] = gpt_evaluate.gpt35_evaluate(answers_per_category[category],
prompt, category_metrics, category)
self.gpt_evaluation_results[category] = gpt_evaluate.evaluate(answers_per_category[category], prompt,
category_metrics, category, self.gpt_model)
def save(self, path: str, model_name_list: List[str]) -> None:
"""
@ -96,35 +106,29 @@ class Evaluator(object):
save_path = os.path.join(path, "gpt_evaluate", "battle_results")
gpt_evaluate.save_battle_results(self.battle_results, model_name_list[0], model_name_list[1], save_path)
else:
# save evaluation results for automatic metrics
automatic_df = pd.DataFrame(self.automatic_metric_stats)
# Save evaluation results for automatic metrics
automatic_base_save_path = os.path.join(path, "automatic_results")
automatic_results_save_path = os.path.join(automatic_base_save_path, "evaluation_results")
automatic_results_save_path = os.path.join(path, "automatic_results")
if not os.path.exists(automatic_results_save_path):
os.makedirs(automatic_results_save_path)
automatic_df.to_csv(os.path.join(automatic_results_save_path, f"{model_name_list[0]}.csv"), index=True)
# Save evaluation results for GPT-3.5 evaluation metrics.
all_evaluations = []
base_save_path = os.path.join(path, "gpt_evaluate", "gpt35_evaluate_results")
evaluation_results_save_path = os.path.join(base_save_path, "evaluation_results")
for category, evaluations in self.gpt35_evaluation_results.items():
jdump(
evaluations,
os.path.join(evaluation_results_save_path, model_name_list[0],
f"{category}_evaluation_results.json"))
all_evaluations.extend(evaluations)
jdump(all_evaluations,
os.path.join(evaluation_results_save_path, f"{model_name_list[0]}_evaluation_results.json"))
# Start to calculate scores and save statictics.
evaluation_statistics_save_path = os.path.join(base_save_path, "evaluation_statistics")
gpt_evaluate.save_gpt35_evaluation_statistics(model_name_list[0], all_evaluations,
evaluation_statistics_save_path)
save_automatic_results(model_name_list[0], self.automatic_metric_stats, automatic_results_save_path)
# Save charts and csv.
evaluation_analyses_save_path = os.path.join(base_save_path, "evaluation_analyses")
gpt_evaluate.analyze_gpt35_evaluation_statistics(evaluation_statistics_save_path,
evaluation_analyses_save_path)
automatic_analyses_save_path = os.path.join(automatic_base_save_path, "evaluation_analyses")
analyze_automatic_results(automatic_results_save_path, automatic_analyses_save_path)
# Save evaluation results for GPT evaluation metrics.
gpt_base_save_path = os.path.join(path, "gpt_evaluate", "gpt_evaluate_results")
gpt_evaluation_results_save_path = os.path.join(gpt_base_save_path, "evaluation_results")
all_evaluations = gpt_evaluate.save_gpt_evaluation_results(model_name_list[0], self.gpt_evaluation_results,
gpt_evaluation_results_save_path)
# Start to calculate scores and save statistics.
gpt_evaluation_statistics_save_path = os.path.join(gpt_base_save_path, "evaluation_statistics")
gpt_evaluate.save_gpt_evaluation_statistics(model_name_list[0], all_evaluations,
gpt_evaluation_statistics_save_path)
# Save charts and csv.
gpt_evaluation_analyses_save_path = os.path.join(gpt_base_save_path, "evaluation_analyses")
gpt_evaluate.analyze_gpt_evaluation_statistics(gpt_evaluation_statistics_save_path,
gpt_evaluation_analyses_save_path)

View File

@ -16,7 +16,7 @@ from utils import jdump, jload
def get_battle_result(sys_prompt: str, user_prompt: str, id: int, max_tokens: int = 2048) -> Dict[str, Any]:
"""
Get evaluation from GPT-4.
Get battle evaluation from GPT-4.
Args:
sys_prompt: prompt for the system.
@ -51,7 +51,7 @@ def get_battle_result(sys_prompt: str, user_prompt: str, id: int, max_tokens: in
except Exception as e:
print(e)
time.sleep(1)
print(f" Evaluation {id} failed after {MAX_API_RETRY} retries.")
print(f"Evaluation {id} failed after {MAX_API_RETRY} retries.")
return {"evaluation": "", "id": id}
@ -233,12 +233,77 @@ def save_battle_results(evaluations: List[Dict], name1: str, name2: str, save_pa
print(f"Model {name2} average score: {ans2_score/(len(evaluations)-invalid_count):.2f}")
def get_gpt35_evaluation(prompt: Dict[str, Any],
inst: Dict[str, Any],
metrics: List[str],
max_tokens: int = 2048) -> Dict[str, Any]:
def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],
inst: Dict[str, Any],
metrics: List[str],
model: str = "gpt-3.5-turbo",
max_tokens: int = 2048) -> Dict[str, Any]:
"""
Use GPT-3.5 to evaluate one model answer.
Use chat models (gpt-3.5-turbo or gpt-4) to evaluate one model answer.
Args:
prompt: a dictionary including prompt template, CoT and metrics.
inst: the instruction that is needed to be evaluated.
metrics: the metrics for evaluation.
model: the model used to evaluate answers.
max_tokens: the maximum number of tokens to generate in the chat completion.
Returns:
An evaluation of one answer.
"""
MAX_API_RETRY = 3
question = (inst["instruction"] if inst["input"] == "" else inst["instruction"] + " " + inst["input"])
answer = inst["output"]
inst["evaluation"] = {}
for metric in metrics:
if prompt["metrics"].get(metric, None) is None:
raise Exception(
f"Unsupported metric {metric} for category {inst['category']}! You should add this metric in the prompt file!"
)
for i in range(MAX_API_RETRY):
try:
response = openai.ChatCompletion.create(
model=model,
messages=[
{
"role":
"user",
"content":
prompt["prompt"].format(
question=question,
answer=answer,
metric=prompt["metrics"][metric],
steps=prompt["CoT"][metric],
),
},
],
temperature=0,
max_tokens=max_tokens,
)
inst["evaluation"][metric] = {
"response": response["choices"][0]["message"]["content"],
"logprobs": None,
}
break
except Exception as e:
print(e)
time.sleep(1)
if metric not in inst["evaluation"]:
print(f"Evaluation {inst['id']} for metric {metric} failed after {MAX_API_RETRY} retries.")
inst["evaluation"][metric] = {}
return inst
def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any],
inst: Dict[str, Any],
metrics: List[str],
max_tokens: int = 2048) -> Dict[str, Any]:
"""
Use a completion model (text-davinci-003) to evaluate one model answer.
Only completion models can return log probabilities.
Args:
prompt: a dictionary including prompt template, CoT and metrics.
@ -283,23 +348,22 @@ def get_gpt35_evaluation(prompt: Dict[str, Any],
except Exception as e:
print(e)
time.sleep(1)
if metric not in inst["evaluation"]:
print(f"Evaluation {inst['id']} for metric {metric} failed after {MAX_API_RETRY} retries.")
inst["evaluation"][metric] = {}
return inst
def gpt35_evaluate(
answers: List[Dict],
prompt: Dict[str, Any],
metrics: List[str],
category: str,
) -> List[Dict]:
def evaluate(answers: List[Dict], prompt: Dict[str, Any], metrics: List[str], category: str, model: str) -> List[Dict]:
"""
Use GPT-3.5 to evaluate model answers and save evaluation results.
Use GPT models to evaluate model answers and save evaluation results.
Args:
answers: model answers.
prompt: prompt for GPT-3.5 evaluation.
metrics: metrics for GPT-3.5 evaluation.
prompt: prompt for GPT evaluation.
metrics: metrics for GPT evaluation.
category: the category of the model answers for evaluation.
model: the specific GPT model used to evaluate answers.
Returns:
Evaluations of the given answers.
@ -315,7 +379,12 @@ def gpt35_evaluate(
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
futures = []
for inst in answers:
future = executor.submit(get_gpt35_evaluation, prompt, inst, metrics, 1)
# Completion models can return log probabilities.
if model == "text-davinci-003":
future = executor.submit(get_gpt_evaluation_with_logprobs, prompt, inst, metrics, 1)
else:
future = executor.submit(get_gpt_evaluation_without_logprobs, prompt, inst, metrics, model, 1)
futures.append(future)
for future in tqdm.tqdm(
@ -334,20 +403,19 @@ def gpt35_evaluate(
def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
"""
Calculate score from log probabilities returned by text-davinci-003.
Only openai.Completion can return logprobs.
Calculate the score according to log probabilities returned by text-davinci-003.
Calculation formula:
score = sum(score_i * exp(value)) where score_i is the score which corresponds to the key(predicted token) and value is its log probability.
Ref: https://arxiv.org/abs/2303.16634
This paper proposes NLG evaluation methods using GPT-3.5(logprobs returned by openai api) and GPT-4(logprobs obtained by sampling).
This paper proposes NLG evaluation methods using text-davinci-003(log probabilities returned by completion models) and GPT-4(probabilities obtained by sampling).
Args:
logprobs: logprobs returned by openai.Completion.
Returns:
Score of one answer.
The score of one answer.
"""
# GPT-3.5 only returns score of 1 to 5.
@ -369,14 +437,59 @@ def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
return score
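# Worked example of the formula above, with illustrative (not real) log
# probabilities for the digit tokens "1" to "5":
#     logprobs = {"1": -5.1, "2": -3.0, "3": -1.9, "4": -0.6, "5": -1.4}
#     score = sum(int(tok) * math.exp(lp) for tok, lp in logprobs.items())
#           = 1*0.006 + 2*0.050 + 3*0.150 + 4*0.549 + 5*0.247 ≈ 3.98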
def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], save_path: str) -> None:
def calculate_scores_form_response(response: str, evaluation: Dict[str, Any]) -> int:
"""
Calculate the score from the response returned by gpt-3.5-turbo or gpt-4.
Different from text-davinci-003, this function directly calculates the score according to the plain response returned by gpt-3.5-turbo or gpt-4.
Although text-davinci-003 can return log probabilities, it costs ten times as much as gpt-3.5-turbo.
Args:
response: logprobs returned by openai.Completion.
evaluation: the evaluation corresponds to the question.
Returns:
The score of one answer.
"""
try:
results = re.findall(r"\d", response)
if len(results) == 1:
return int(results[0])
else:
raise Exception(f"Invalid score pair. Got {evaluation}.")
except Exception as e:
return 0
def save_gpt_evaluation_results(model_name: str, gpt_evaluation_results: Dict[str, Any],
save_path: str) -> Dict[str, Any]:
"""
Save evaluation results for different categories for one model.
Args:
model_name: name of the model for saving evaluation results.
gpt_evaluation_results: evaluations results for all of the model answers.
save_path: path to save GPT evaluation statistics.
"""
all_evaluations = []
for category, evaluations in gpt_evaluation_results.items():
jdump(evaluations, os.path.join(save_path, model_name, f"{category}_evaluation_results.json"))
all_evaluations.extend(evaluations)
jdump(all_evaluations, os.path.join(save_path, f"{model_name}_evaluation_results.json"))
return all_evaluations
def save_gpt_evaluation_statistics(model_name: str, evaluations: List[Dict], save_path: str) -> None:
"""
Generate statistics for one model.
Args:
model_name: name of the model for saving statistics.
evaluations: evaluations for all of the model answers.
save_path: path to save GPT-3.5 evaluation statistics.
save_path: path to save GPT evaluation statistics.
"""
if not os.path.exists(save_path):
@ -396,7 +509,15 @@ def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], s
scores = {metric: [] for metric in metrics}
for evaluation in data:
for metric in metrics:
scores[metric].append(calculate_scores_form_logprobs(evaluation["evaluation"][metric]["logprobs"][0]))
if evaluation["evaluation"][metric] == {}:
# This means after 3 retries, the server still returns an error and we set the score to 0.
scores[metric].append(0)
elif evaluation["evaluation"][metric]["logprobs"] is not None:
scores[metric].append(
calculate_scores_form_logprobs(evaluation["evaluation"][metric]["logprobs"][0]))
else:
scores[metric].append(
calculate_scores_form_response(evaluation["evaluation"][metric]["response"], evaluation))
statistics = {}
for metric in metrics:
@ -414,9 +535,9 @@ def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], s
)
def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) -> None:
def analyze_gpt_evaluation_statistics(statistics_path: str, save_path: str) -> None:
"""
Analyze and visualize all GPT-3.5 evaluation statistics in the given directory.
Analyze and visualize all GPT evaluation statistics in the given directory.
Args:
statistics_path: path to all the models' statistics.
@ -474,7 +595,7 @@ def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) ->
os.makedirs(save_path)
frame_all = pd.DataFrame(frame_all)
frame_all.to_csv(os.path.join(save_path, "gpt35_evaluation_statistics.csv"))
frame_all.to_csv(os.path.join(save_path, "gpt_evaluation_statistics.csv"))
for category in tqdm.tqdm(
frame_per_category.keys(),
@ -494,3 +615,5 @@ def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) ->
figure = fig.get_figure()
figure.savefig(os.path.join(save_path, f"{category}.png"), dpi=400)
plt.close()

View File

@ -1,13 +1,16 @@
import statistics
from typing import Dict, List
import jieba
from bert_score import score
from nltk.translate.bleu_score import sentence_bleu
from rouge_chinese import Rouge as Rouge_cn
from rouge_score import rouge_scorer as Rouge_en
from sklearn.metrics import f1_score, precision_score, recall_score
from utils import preprocessing_text, remove_redundant_space
def bleu_score(preds: list, targets: list) -> dict:
def bleu_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Calculate BLEU Score Metric
The calculation includes BLEU-1 for unigram, BLEU-2 for bigram,
@ -21,8 +24,12 @@ def bleu_score(preds: list, targets: list) -> dict:
(1. / 4., 1. / 4., 1. / 4., 1. / 4.)]
for pred, target in zip(preds, targets):
pred_list = (' '.join(jieba.cut(pred))).split()
target_list = [(' '.join(jieba.cut(target))).split()]
if language == "cn":
pred_list = ' '.join(jieba.cut(preprocessing_text(pred))).split()
target_list = [(' '.join(jieba.cut(preprocessing_text(target)))).split()]
elif language == "en":
pred_list = preprocessing_text(pred).split()
target_list = [preprocessing_text(target).split()]
bleu = sentence_bleu(target_list, pred_list, weights=weights)
cumulative_bleu = [a + b for a, b in zip(cumulative_bleu, bleu)]
@ -33,7 +40,7 @@ def bleu_score(preds: list, targets: list) -> dict:
return bleu_scores
def rouge_cn_score(preds: list, targets: list) -> dict:
def rouge_cn_score(preds: List[str], targets: List[str]) -> Dict[str, float]:
"""Calculate Chinese ROUGE Score Metric
The calculation includes ROUGE-1 for unigram, ROUGE-2 for bigram
@ -41,13 +48,13 @@ def rouge_cn_score(preds: list, targets: list) -> dict:
the preds and targets. ROUGE-L measures the number of matching
longest common subsequence (LCS) between preds and targets.
"""
rouge_scores = {"rouge1": {}, "rouge2": {}, "rougeL": {}}
rouge_scores = {"rouge1": 0, "rouge2": 0, "rougeL": 0}
all_preds = []
all_targets = []
for pred, target in zip(preds, targets):
pred_list = ' '.join(jieba.cut(pred))
target_list = ' '.join(jieba.cut(target))
pred_list = remove_redundant_space(' '.join(jieba.cut(preprocessing_text(pred))))
target_list = remove_redundant_space(' '.join(jieba.cut(preprocessing_text(target))))
all_preds.append(pred_list)
all_targets.append(target_list)
@ -61,7 +68,42 @@ def rouge_cn_score(preds: list, targets: list) -> dict:
return rouge_scores
def distinct_score(preds: list) -> dict:
def rouge_en_score(preds: List[str], targets: List[str]) -> Dict[str, float]:
"""Calculate English ROUGE Score Metric
The calculation includes ROUGE-1 for unigram, ROUGE-2 for bigram
and ROUGE-L. ROUGE-N evaluates the number of matching n-grams between
the preds and targets. ROUGE-L measures the number of matching
longest common subsequence (LCS) between preds and targets.
"""
rouge_scores = {"rouge1": 0, "rouge2": 0, "rougeL": 0}
all_preds = []
all_targets = []
rouge_en = Rouge_en.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=False)
for pred, target in zip(preds, targets):
score = rouge_en.score(preprocessing_text(pred), preprocessing_text(target))
rouge_scores["rouge1"] += score['rouge1'].fmeasure
rouge_scores["rouge2"] += score['rouge2'].fmeasure
rouge_scores["rougeL"] += score['rougeL'].fmeasure
rouge_scores["rouge1"] = rouge_scores["rouge1"] / len(preds)
rouge_scores["rouge2"] = rouge_scores["rouge2"] / len(preds)
rouge_scores["rougeL"] = rouge_scores["rougeL"] / len(preds)
return rouge_scores
def rouge_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Calculate ROUGE Score Metric"""
if language == "cn":
return rouge_cn_score(preds, targets)
elif language == "en":
return rouge_en_score(preds, targets)
def distinct_score(preds: List[str], language: str) -> Dict[str, float]:
"""Calculate Distinct Score Metric
This metric refers to https://arxiv.org/abs/1510.03055.
@ -72,19 +114,36 @@ def distinct_score(preds: list) -> dict:
cumulative_distinct = []
for pred in preds:
pred_seg_list = list(' '.join(jieba.cut(pred)))
count_segs = len(pred_seg_list)
unique_segs = set(pred_seg_list)
count_unique_chars = len(unique_segs)
if language == "cn":
pred_seg_list = ' '.join(jieba.cut(pred)).split()
count_segs = len(pred_seg_list)
unique_segs = set(pred_seg_list)
count_unique_chars = len(unique_segs)
cumulative_distinct.append(count_unique_chars / count_segs)
cumulative_distinct.append(count_unique_chars / count_segs)
elif language == "en":
# calculate distinct 1-gram, 2-gram, 3-gram
unique_ngram = [set() for _ in range(0, 3)]
all_ngram_count = [0 for _ in range(0, 3)]
split_pred = preprocessing_text(pred).split()
for n in range(0, 3):
for i in range(0, len(split_pred) - n):
ngram = ' '.join(split_pred[i:i + n + 1])
unique_ngram[n].add(ngram)
all_ngram_count[n] += 1
# Sometimes the answer may contain only one word. For 2-gram and 3-gram, the gram count(denominator) may be zero.
avg_distinct = [len(a) / (b + 1e-6) for a, b in zip(unique_ngram, all_ngram_count)]
cumulative_distinct.append(statistics.mean(avg_distinct))
distinct_score["distinct"] = statistics.mean(cumulative_distinct)
return distinct_score
def bert_score(preds: list, targets: list) -> dict:
def bert_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Calculate BERTScore Metric
The BERTScore evaluates the semantic similarity between
@ -95,23 +154,25 @@ def bert_score(preds: list, targets: list) -> dict:
target_list = []
for pred, target in zip(preds, targets):
pred_list.append(' '.join(jieba.cut(pred)))
target_list.append(' '.join(jieba.cut(target)))
pred_list.append(pred)
target_list.append(target)
_, _, F = score(pred_list, target_list, lang="zh", verbose=True)
if language == "cn":
_, _, F = score(pred_list, target_list, lang="zh", verbose=True)
elif language == "en":
_, _, F = score(pred_list, target_list, lang="en", verbose=True)
bert_score["bert_score"] = F.mean().item()
return bert_score
def calculate_precision_recall_f1(preds: list, targets: list) -> dict:
def calculate_precision_recall_f1(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Precision, Recall and F1-Score Calculation
The calculation of precision, recall and f1-score is realized by counting
the number of overlaps between the preds and target. The comparison length is
limited by the shorter one of preds and targets. This design is mainly
considered for classification and extraction categories.
limited by the shorter one of preds and targets.
"""
precision_recall_f1 = {"precision": 0, "recall": 0, "f1_score": 0}
precision_scores = []
@ -119,8 +180,12 @@ def calculate_precision_recall_f1(preds: list, targets: list) -> dict:
f1_scores = []
for pred, target in zip(preds, targets):
pred_list = [char for char in pred]
target_list = [char for char in target]
if language == "cn":
pred_list = [char for char in ' '.join(jieba.cut(preprocessing_text(pred))).split()]
target_list = [char for char in ' '.join(jieba.cut(preprocessing_text(target))).split()]
elif language == "en":
pred_list = [char for char in preprocessing_text(pred).split()]
target_list = [char for char in preprocessing_text(target).split()]
target_labels = [1] * min(len(target_list), len(pred_list))
pred_labels = [int(pred_list[i] == target_list[i]) for i in range(0, min(len(target_list), len(pred_list)))]
@ -136,34 +201,31 @@ def calculate_precision_recall_f1(preds: list, targets: list) -> dict:
return precision_recall_f1
def precision(preds: list, targets: list) -> dict:
def precision(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Calculate Precision Metric
(design for classification and extraction categories)
Calculating precision by counting the number of overlaps between the preds and target.
"""
precision = {"precision": 0}
precision["precision"] = calculate_precision_recall_f1(preds, targets)["precision"]
precision["precision"] = calculate_precision_recall_f1(preds, targets, language)["precision"]
return precision
def recall(preds: list, targets: list) -> dict:
def recall(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Calculate Recall Metric
(design for classification and extraction categories)
Calculating recall by counting the number of overlaps between the preds and target.
"""
recall = {"recall": 0}
recall["recall"] = calculate_precision_recall_f1(preds, targets)["recall"]
recall["recall"] = calculate_precision_recall_f1(preds, targets, language)["recall"]
return recall
def F1_score(preds: list, targets: list) -> dict:
def F1_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Calculate F1-score Metric
(design for classification and extraction categories)
Calculating f1-score by counting the number of overlaps between the preds and target.
"""
f1 = {"f1_score": 0}
f1["f1_score"] = calculate_precision_recall_f1(preds, targets)["f1_score"]
f1["f1_score"] = calculate_precision_recall_f1(preds, targets, language)["f1_score"]
return f1

View File

@ -0,0 +1,6 @@
{
"id": 1,
"system_prompt": "You are a helpful and precise assistant for checking the quality of the answer. You will be given two different answers to the same question",
"prompt_template": "[Question]\n{question}\n\n[The Start of AI Assistant 1's Answer]\n{answer_1}\n\n[The End of AI Assistant 1's Answer]\n\n[The Start of AI Assistant 2's Answer]\n{answer_2}\n\n[The End of AI Assistant 2's Answer]\n\n[Requirements]\n{prompt}\n\n",
"prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above.\nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space. In the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."
}

View File

@ -1,5 +1,5 @@
[
{
{
"brainstorming": {
"id": 1,
"category": "brainstorming",
"metrics": {
@ -18,7 +18,7 @@
},
"prompt": "你是一个好助手。请你为下面“头脑风暴”问题的答案打分。\n\n问题如下\n\n{question}\n\n答案如下\n\n{answer}\n\n评分的指标如下\n\n{metric}\n\n请你遵照以下的评分步骤\n\n{steps}"
},
{
"chat": {
"id": 2,
"category": "chat",
"metrics": {
@ -37,7 +37,7 @@
},
"prompt": "你是一个好助手。请你为下面的“补全对话”问题的答案打分。\n\n问题如下\n\n{question}\n\n答案如下\n\n{answer}\n\n评分的指标如下\n\n{metric}\n\n请你遵照以下的评分步骤\n\n{steps}"
},
{
"classification": {
"id": 3,
"category": "classification",
"metrics": {
@ -52,7 +52,7 @@
},
"prompt": "你是一个好助手。请你为下面的“分类“问题的答案打分。\n\n问题如下\n\n{question}\n\n答案如下\n\n{answer}\n\n评分的指标如下\n\n{metric}\n\n请你遵照以下的评分步骤\n\n{steps}"
},
{
"closed_qa": {
"id": 4,
"category": "closed_qa",
"metrics": {
@ -67,7 +67,7 @@
},
"prompt": "你是一个好助手。请你为下面问题的答案打分。\n\n问题如下\n\n{question}\n\n需要你评分的答案如下\n\n{answer}\n\n评分的指标如下\n\n{metric}\n\n请你遵照以下的评分步骤\n\n{steps}"
},
{
"extraction": {
"id": 5,
"category": "extraction",
"metrics": {
@ -82,7 +82,7 @@
},
"prompt": "你是一个好助手。请你为下面的“提取”问题的答案打分。\n\n问题如下\n\n{question}\n\n答案如下\n\n{answer}\n\n评分的指标如下\n\n{metric}\n\n请你遵照以下的评分步骤\n\n{steps}"
},
{
"generation": {
"id": 6,
"category": "generation",
"metrics": {
@ -97,7 +97,7 @@
},
"prompt": "你是一个好助手。请你为下面的“生成”问题的答案打分。\n\n问题如下\n\n{question}\n\n答案如下\n\n{answer}\n\n评分的指标如下\n\n{metric}\n\n请你遵照以下的评分步骤\n\n{steps}"
},
{
"open_qa": {
"id": 7,
"category": "open_qa",
"metrics": {
@ -112,7 +112,7 @@
},
"prompt": "你是一个好助手。请你为下面的问题的答案打分。\n\n问题如下\n\n{question}\n\n答案如下\n\n{answer}\n\n评分的指标如下\n\n{metric}\n\n请你遵照以下的评分步骤\n\n{steps}"
},
{
"rewriting": {
"id": 8,
"category": "rewriting",
"metrics": {
@ -127,7 +127,7 @@
},
"prompt": "你是一个好助手。请你为下面的问题的答案打分。\n\n问题如下\n\n{question}\n\n答案如下\n\n{answer}\n\n评分的指标如下\n\n{metric}\n\n请你遵照以下的评分步骤\n\n{steps}"
},
{
"roleplay": {
"id": 9,
"category": "roleplay",
"metrics": {
@ -144,7 +144,7 @@
},
"prompt": "你是一个好助手。请你为下面的“角色扮演”问题的答案打分。\n\n问题如下\n\n{question}\n\n答案如下\n\n{answer}\n\n评分的指标如下\n\n{metric}\n\n请你遵照以下的评分步骤\n\n{steps}"
},
{
"summarization": {
"id": 10,
"category": "summarization",
"metrics": {
@ -161,7 +161,7 @@
},
"prompt": "你是一个好助手。请你为下面的“总结”问题的答案打分。\n\n问题如下\n\n{question}\n\n答案如下\n\n{answer}\n\n评分的指标如下\n\n{metric}\n\n请你遵照以下的评分步骤\n\n{steps}"
},
{
"general": {
"id": 11,
"category": "general",
"metrics": {
@ -176,4 +176,4 @@
},
"prompt": "你是一个好助手。请你为下面问题的答案打分。\n\n问题如下\n\n{question}\n\n需要你评分的答案如下\n\n{answer}\n\n评分的指标如下\n\n{metric}\n\n请你遵照以下的评分步骤\n\n{steps}"
}
]
}

View File

@ -0,0 +1,179 @@
{
"brainstorming": {
"id": 1,
"category": "brainstorming",
"metrics": {
"language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.",
"relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.",
"creativity": "Creativity (1-5): Some brainstorming questions may require answers that are creative and suggest new ideas.",
"practicality": "Practicality (1-5): Some brainstorming questions may require answers to suggest practical suggestions or solutions.",
"correctness": "Correctness (1-5): The answer should be in line with common sense, life experience, etc."
},
"CoT": {
"language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:",
"relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
"creativity": "1. Read the provided brainstorming questions carefully to make sure you understand the gist and context of the questions.\n2. Based on your knowledge and experience, determine if the answers provided are feasible. If the answer is not feasible, the creativity score may be affected.\n3. Consider whether the answer contains novel ideas or unique thoughts. An answer may overlap with a known solution and still be considered creative, as long as it offers a new perspective or approach to the problem.\n4. Give a score of 1 to 5 depending on the creativity of the answer. If the answer lacks creativity, a lower score should be given. If the answer is creative and provides a new idea, a higher score should be given.\n\nCreativity:",
"practicality": "1. Read the provided brainstorming questions carefully to make sure you understand the gist and context of the questions.\n2. Based on your knowledge and experience, determine if the answers provided are feasible. If the answer is not feasible, the practicality score may be affected.\n3. Consider whether the suggestions or solutions presented in the answer are practical and workable. The answer may look good, but if it cannot be implemented or applied, the practicality score may be affected.\n4. Give a score of 1 to 5 depending on the practicality of the answer. If the answer lacks practicality, a lower score should be given. If the answer makes a practical suggestion or solution and solves the problem well, a higher score should be given.\n\nPracticality:",
"correctness": "1. Read the provided brainstorming questions carefully to make sure you understand the gist and context of the questions.\n2. Based on your knowledge and experience, determine if the answers provided are feasible. If the answer is not feasible, the correctness score may be affected.\n3. Consider whether the information provided in the answer is correct, consistent with common sense, real life, etc. If there are obvious errors or implausibilities in the answer, the correctness score may be affected.\n4. Give a score of 1 to 5 depending on the correctness of the answer. If the answer contains obvious errors or unreasonable points, a lower score should be given. A higher score should be given if the answer is correct, consistent with common sense, real life, etc.\n\nCorrectness:"
},
"prompt": "You are a good assistant. Please rate the given answer to the \"brainstorming\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
},
"chat": {
"id": 2,
"category": "chat",
"metrics": {
"language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.",
"relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.",
"naturalness": "Naturalness (1-5): whether the answer is natural and fits the identity given by the question.",
"engagingness": "Engagingness (1-5): whether the answer responds appropriately to the content of the preceding conversation and whether it understands the context and background of the conversation.",
"reasonableness": "Reasonableness (1-5): Whether the answer can form a logical connection with the content of the previous dialogue, whether it is consistent with common sense, and whether it can reasonably exist in this context."
},
"CoT": {
"language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:",
"relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
"naturalness": "1. Read the question and determine the identity information provided in the question.\n2. Check whether the content of the answer matches the identity given in the question.\n3. Based on the above factors, score the naturalness of the response on a scale from 1 to 5, where 1 means unnatural and 5 means very natural and in accordance with the identity given in the question.\n\nNaturalness:",
"engagingness": "1. Read the questions to determine the context and background of the dialogue.\n2. Check that the answer fully understands the context and background of the conversation and that it fits naturally into the conversation without seeming abrupt.\n3. Based on the above factors, rate the response's engagement on a scale from 1 to 5, where 1 means not engaged and 5 means very engaged and appropriately understands the context and background of the conversation.\n\nEngagingness:",
"reasonableness": "1. Read the question and determine the topic of the conversation and the direction the question expects the answer to go.\n2. Determine whether the answer can be logically connected to the preceding conversation, whether it makes common sense, and whether it can reasonably exist in this context.\n3. Based on the above factors, rate the reasonableness of the answer on a scale from 1 to 5, where 1 means unreasonable and 5 means very reasonable and able to form a logical connection with the preceding dialogue content and consistent with common sense.\n\nReasonableness:"
},
"prompt": "You are a good assistant. Please rate the given answer to the \"chat\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
},
"classification": {
"id": 3,
"category": "classification",
"metrics": {
"language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.",
"relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.",
"correctness": "Correctness (1-5): whether the answer is correct or not."
},
"CoT": {
"language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:",
"relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
"correctness": "1. Read the question carefully and try to answer the question yourself.\n2. Check the correctness of the answer. You can use known facts or research to verify that the answer is correct. If the answer is correct, you can give a score of 5 for correctness. If the answer is partially correct, an appropriate score, such as 2, 3, or 4, may be given. If the answer is completely incorrect, only 1 point is awarded.\n\nCorrectness:"
},
"prompt": "You are a good assistant. Please rate the given answer to the \"classification\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
},
"closed_qa": {
"id": 4,
"category": "closed_qa",
"metrics": {
"language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.",
"relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.",
"correctness": "Correctness (1-5): whether the answer is correct or not."
},
"CoT": {
"language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:",
"relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
"correctness": "1. Read the question carefully and try to answer the question by yourself.\n2. Check the correctness of the answer. You can use known facts or research to verify that the answer is correct. If the answer is correct, you can give a score of 5 for correctness. If the answer is partially correct, an appropriate score, such as 2, 3, or 4, may be assigned. If the answer is completely incorrect, only 1 point is awarded.\n\nCorrectness:"
},
"prompt": "You are a good assistant. Please rate the given answer to the \"closed qa\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
},
"extraction": {
"id": 5,
"category": "extraction",
"metrics": {
"language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.",
"relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.",
"correctness": "correctness (1-5): Answers should extract the required information accurately and should not contain any incorrect or misleading information."
},
"CoT": {
"language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:",
"relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
"correctness": "1. Read the questions carefully and identify the information that needs to be extracted from the material.\n2. Read the answer carefully and make sure it covers all the information that needs to be extracted.\n3. Use the material provided to verify the correctness of the response. If the response is inaccurate or contains incorrect or misleading information, a high score cannot be given.\n4. Check that the answer contains all the information required to be extracted and do not leave out any important details.\n5. Give a score between 1 and 5 based on the correctness and completeness of the response, with a score of 5 indicating a very accurate and complete response and a score of 1 indicating that the response barely extracts the required information.\n\nCorrectness:"
},
"prompt": "You are a good assistant. Please rate the given answer to the \"extraction\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
},
"generation": {
"id": 6,
"category": "generation",
"metrics": {
"language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.",
"relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.",
"diversity": "Diversity (1-5): Whether the answers use beautiful language and have some creativity and imagination. However, answers should also be kept reasonable and moderate, not overly exaggerated or off-topic."
},
"CoT": {
"language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:",
"relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
"diversity": "1. Read the entire response carefully to ensure that you fully understand the content and theme expressed in the response.\n2. While reading the response, pay attention to the quality of the language, such as whether the wording is correct and the language is vivid.\n3. Check the creativity and imagination of the response to see if the response is engaging to read on.\n4. Check the reasonableness and appropriateness of the responses to see if the responses are exaggerated or off-topic.\n5. Rate the diversity on a scale of 1 to 5, with a 5 indicating a good quality response that is engaging to read and a 1 indicating a raw response or a question that is off-topic.\n\nDiversity:"
},
"prompt": "You are a good assistant. Please rate the given answer to the \"generation\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
},
"open_qa": {
"id": 7,
"category": "open_qa",
"metrics": {
"language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.",
"relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.",
"correctness": "Correctness (1-5): whether the answer is correct or not."
},
"CoT": {
"language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:",
"relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
"correctness": "1. Read the question carefully and try to answer the question yourself.\n2. Check the correctness of the answer. You can use known facts or research to verify that the answer is correct. If the answer is correct, you can give a score of 5 for correctness. If the answer is partially correct, an appropriate score, such as 2, 3, or 4, may be given. If the answer is completely incorrect, only 1 point is awarded.\n\nCorrectness:"
},
"prompt": "You are a good assistant. Please rate the answers to the \"open qa\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
},
"rewriting": {
"id": 8,
"category": "rewriting",
"metrics": {
"language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.",
"relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.",
"correctness": "Correctness (1-5): whether the answer is correct or not."
},
"CoT": {
"language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:",
"relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
"correctness": "1. Read the question carefully and try to answer the question yourself.\n2. Check the correctness of the answer. You can use known facts or research to verify that the answer is correct. If the answer is correct, you can give a score of 5 for correctness. If the answer is partially correct, an appropriate score, such as 2, 3, or 4, may be assigned. If the answer is completely incorrect, only 1 point is awarded.\n\nCorrectness:"
},
"prompt": "You are a good assistant. Please rate the answers to the \"rewriting\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
},
"roleplay": {
"id": 9,
"category": "roleplay",
"metrics": {
"language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.",
"relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.",
"fidelity": "Fidelity (1-5): whether the answer is able to answer the given request in strict compliance with the role setting.",
"creativity": "Creativity (1-5): The answers to the role-play questions need to be somewhat creative, but at the same time they need to adhere to the setting of the role."
},
"CoT": {
"language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:",
"relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
"fidelity": "1. Read the question carefully to understand how the character is set up and represented in the question, including aspects such as occupation, background, point of view, and personality.\n2. Read the question's request and confirm the details that need to be taken into account when answering the request.\n3. Compare the provided answer with the setting of the role and assess whether the answer can strictly adhere to the setting of the role.\n4. Combine the results of the above assessment to give a fidelity score ranging from 1 to 5, where a score of 1 means that the response does not match the persona at all, and a score of 5 means that the response fully complies with the persona and satisfies the given request.\n\nFidelity:",
"creativity": "1. Read the question carefully to understand how the character is set up and represented in the question, including career, background, perspective, and personality.\n2. Evaluate whether the answer has unique ideas and suggestions that bring new ideas and insights to the questioner.\n3. Compare the creativity in the response to the setting of the persona and assess whether the response adheres to the setting and essential characteristics of the persona.\n4. Evaluate the quality of the responses in general and combine the results of the above assessment to give a creativity score ranging from 1 to 5, where a score of 1 indicates that the response lacks creativity and a score of 5 indicates that the response has unique ideas and suggestions and is able to adhere to the set-up of the persona.\n\nCreativity:"
},
"prompt": "You are a good assistant. Please rate the given answer to the \"role-play\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
},
"summarization": {
"id": 10,
"category": "summarization",
"metrics": {
"language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.",
"relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.",
"correctness": "Correctness (1-5): answers should summarize the main points of the material accurately and unambiguously.",
"conciseness": "Conciseness (1-5): answers should be concise and without redundant content."
},
"CoT": {
"language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:",
"relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
"correctness": "1. Read the material given in the question carefully to understand its content and main points.\n2. Assess whether the answer accurately summarizes the key points of the source material.\n3. assess whether the response contains all the key information in the source material.\n4. Based on the above steps, give a score of 1-5, where 1 means that the response does not accurately summarize the main points of the material and 5 means that the response completely accurately summarizes the main points of the material.\n\nCorrectness:",
"conciseness": "1. Read the title and extract the main points of the material.\n2. Read the summary and note the main ideas and messages in it.\n3. Assess the length of the summary. A concise summary should usually convey key information within a few sentences or paragraphs, rather than lengthy paragraphs or essays.\n4. Check that the summary does not contain information that is not relevant to the main ideas or that is redundant.\n5. Make sure that the summary covers the key information in the material and that no important details have been omitted.\n6. Rate the summary on a scale of 1-5, where 5 means the summary is concise and free of redundancy, and 1 means the summary is lengthy or contains unnecessary information that is difficult to understand or remember. Based on your judgment, assign the appropriate score.\n\nConciseness:"
},
"prompt": "You are a good assistant. Please rate the given answer to the \"summarization\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
},
"general": {
"id": 11,
"category": "general",
"metrics": {
"language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.",
"relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.",
"correctness": "Correctness (1-5): whether the answer is correct or not."
},
"CoT": {
"language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:",
"relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
"correctness": "1. Read the question carefully and try to answer the question yourself.\n2. Check the correctness of the answer. You can use known facts or research to verify that the answer is correct. If the answer is correct, you can give a score of 5 for correctness. If the answer is partially correct, an appropriate score, such as 2, 3, or 4, may be assigned. If the answer is completely incorrect, only 1 point is awarded.\n\nCorrectness:"
},
"prompt": "You are a good assistant. Please rate the given answer to the question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
}
}
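The four placeholders {question}, {answer}, {metric} and {steps} in each category's prompt template above are filled in at evaluation time. A minimal sketch of how one entry could be rendered into a final GPT-evaluation prompt (the file name config.json and the helper build_prompt are illustrative assumptions, not part of this PR):

import json

def build_prompt(config: dict, category: str, question: str, answer: str) -> str:
    # Join the per-metric descriptions and their chain-of-thought steps,
    # then fill the category's prompt template.
    entry = config[category]
    metric = "\n".join(entry["metrics"].values())
    steps = "\n".join(entry["CoT"].values())
    return entry["prompt"].format(question=question, answer=answer, metric=metric, steps=steps)

with open("config.json", encoding="utf-8") as f:
    config = json.load(f)

print(build_prompt(config, "open_qa", "What is the capital of France?", "Paris."))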

View File

@ -8,3 +8,5 @@ seaborn
pandas
matplotlib
numpy
zhon
rouge_score

View File

@ -1,6 +1,15 @@
import io
import json
import os
import re
import string
from typing import Dict
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import tqdm
from zhon import hanzi
def _make_w_io_base(f, mode: str):
@ -29,7 +38,7 @@ def jdump(obj, f, mode="w", indent=4, default=str):
"""
f = _make_w_io_base(f, mode)
if isinstance(obj, (dict, list)):
json.dump(obj, f, indent=indent, default=default)
json.dump(obj, f, indent=indent, default=default, ensure_ascii=False)
elif isinstance(obj, str):
f.write(obj)
else:
@ -57,6 +66,153 @@ def get_data_per_category(data, categories):
data_per_category = {category: [] for category in categories}
for item in data:
category = item["category"]
data_per_category[category].append(item)
if category in categories:
data_per_category[category].append(item)
return data_per_category
def remove_articles(text: str) -> str:
"""
Remove the articles "a", "an" and "the" from the given text.
It is used in evaluation of automatic metrics.
"""
pattern = re.compile(r"\b(a|an|the)\b", re.UNICODE)
return re.sub(pattern, " ", text)
def remove_punctuations(text: str) -> str:
"""
Remove punctuation from the given text.
It is used in evaluation of automatic metrics.
"""
punctuation = string.punctuation + hanzi.punctuation
punctuation = set([char for char in punctuation])
punctuation.difference_update(set("!@#$%&()<>?|,.\"'"))
out = []
for char in text:
if char in punctuation:
continue
else:
out.append(char)
return "".join(out)
def remove_redundant_space(text: str) -> str:
"""
Remove redundant spaces in the given text.
It is used in evaluation of automatic metrics.
"""
return " ".join(text.split())
def preprocessing_text(text: str) -> str:
"""
Preprocess the given text.
It is used in evaluation of automatic metrics.
"""
return remove_redundant_space(remove_articles(remove_punctuations(text.lower())))
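# Editorial example (not part of the diff) of the composed pipeline above:
#   preprocessing_text("The  cat, sat on   THE mat!") == "cat, sat on mat!"
# The comma and "!" survive because remove_punctuations deliberately keeps "!@#$%&()<>?|,.\"'".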
def save_automatic_results(model_name: str, automatic_metric_stats: Dict[str, Dict], save_path: str) -> None:
"""
Save automatic evaluation results of different categories for one model.
"""
if not os.path.exists(save_path):
os.makedirs(save_path)
automatic_df = pd.DataFrame(automatic_metric_stats)
automatic_df.to_csv(os.path.join(save_path, f"{model_name}_results.csv"), index=True)
def read_automatic_results(results_path: str, file_name: str) -> Dict[str, Dict]:
"""
Read a csv file and return a dictionary which stores scores per metric.
"""
results = pd.read_csv(os.path.join(results_path, file_name), index_col=0)
results_dict = {metric: {} for metric in list(results.index)}
for i, metric in enumerate(results_dict.keys()):
for j, category in enumerate(list(results.columns)):
if pd.isnull(results.iloc[i][j]):
continue
results_dict[metric][category] = results.iloc[i][j]
return results_dict
def analyze_automatic_results(results_path: str, save_path: str) -> None:
"""
Analyze and visualize all csv files in the given folder.
"""
if not os.path.exists(results_path):
raise Exception(f'The given directory "{results_path}" doesn\'t exist! No results found!')
all_statistics = {}
for file_name in os.listdir(results_path):
if file_name.endswith("_results.csv"):
model_name = file_name.split("_results.csv")[0]
all_statistics[model_name] = read_automatic_results(results_path, file_name)
if len(list(all_statistics.keys())) == 0:
raise Exception(f'There are no csv files in the given directory "{results_path}"!')
frame_all = {"model": [], "category": [], "metric": [], "score": []}
frame_per_metric = {}
for model_name, model_statistics in all_statistics.items():
for metric, metric_statistics in model_statistics.items():
if frame_per_metric.get(metric) is None:
frame_per_metric[metric] = {"model": [], "category": [], "score": []}
for category, category_score in metric_statistics.items():
frame_all["model"].append(model_name)
frame_all["category"].append(category)
frame_all["metric"].append(metric)
frame_all["score"].append(category_score)
frame_per_metric[metric]["model"].append(model_name)
frame_per_metric[metric]["category"].append(category)
frame_per_metric[metric]["score"].append(category_score)
if not os.path.exists(save_path):
os.makedirs(save_path)
frame_all = pd.DataFrame(frame_all)
frame_all.to_csv(os.path.join(save_path, "automatic_evaluation_statistics.csv"))
for metric in tqdm.tqdm(
frame_per_metric.keys(),
desc=f"metric: ",
total=len(frame_per_metric.keys()),
):
data = pd.DataFrame(frame_per_metric[metric])
sns.set()
fig = plt.figure(figsize=(16, 10))
fig = sns.barplot(x="category", y="score", hue="model", data=data, dodge=True)
fig.set_title(f"Comparison between Different Models for Metric {metric.title()}")
plt.xlabel("Evaluation Category")
plt.ylabel("Score")
figure = fig.get_figure()
figure.savefig(os.path.join(save_path, f"{metric}.png"), dpi=400)
plt.close()
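# Editorial usage sketch (the directory names are assumptions): calling
#   analyze_automatic_results("evaluation_results/", "plots/")
# reads every "<model>_results.csv" in the first folder and saves one grouped
# bar chart per metric into the second.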

View File

@ -0,0 +1,175 @@
import argparse
import os
import socket
from functools import partial
import pandas as pd
import ray
import torch
from coati.quant import llama_load_quant, low_resource_init
from coati.ray.detached_trainer_ppo import DetachedPPOTrainer
from coati.ray.experience_maker_holder import ExperienceMakerHolder
from coati.ray.utils import (
get_actor_from_args,
get_critic_from_args,
get_reward_model_from_args,
get_strategy_from_args,
get_tokenizer_from_args,
)
from torch.utils.data import DataLoader
from transformers import AutoConfig
from transformers.modeling_utils import no_init_weights
def get_free_port():
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(('', 0))
return s.getsockname()[1]
def get_local_ip():
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
s.connect(('8.8.8.8', 80))
return s.getsockname()[0]
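# Editorial note: connecting a UDP socket to a public address sends no packets;
# it only makes the OS pick the outbound interface, whose IP getsockname() returns.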
def main(args):
master_addr = str(get_local_ip())
# trainer_env_info
trainer_port = str(get_free_port())
env_info_trainers = [{
'local_rank': '0',
'rank': str(rank),
'world_size': str(args.num_trainers),
'master_port': trainer_port,
'master_addr': master_addr
} for rank in range(args.num_trainers)]
# maker_env_info
maker_port = str(get_free_port())
env_info_maker = {
'local_rank': '0',
'rank': '0',
'world_size': '1',
'master_port': maker_port,
'master_addr': master_addr
}
# configure tokenizer
tokenizer = get_tokenizer_from_args(args.model)
def trainer_model_fn():
actor = get_actor_from_args(args.model, args.pretrain).half().cuda()
critic = get_critic_from_args(args.model, args.critic_pretrain).half().cuda()
return actor, critic
# configure Trainer
trainer_refs = [
DetachedPPOTrainer.options(name=f"trainer{i}", num_gpus=1, max_concurrency=2).remote(
experience_maker_holder_name_list=["maker1"],
strategy_fn=partial(get_strategy_from_args, args.trainer_strategy),
model_fn=trainer_model_fn,
env_info=env_info_trainer,
train_batch_size=args.train_batch_size,
buffer_limit=16,
eval_performance=True,
debug=args.debug,
update_lora_weights=not (args.lora_rank == 0),
) for i, env_info_trainer in enumerate(env_info_trainers)
]
def model_fn():
actor = get_actor_from_args(args.model, args.pretrain).requires_grad_(False).half().cuda()
critic = get_critic_from_args(args.model, args.critic_pretrain).requires_grad_(False).half().cuda()
reward_model = get_reward_model_from_args(args.model, args.critic_pretrain).requires_grad_(False).half().cuda()
if args.initial_model_quant_ckpt is not None and args.model == 'llama':
# quantize initial model
actor_cfg = AutoConfig.from_pretrained(args.pretrain)
with low_resource_init(), no_init_weights():
initial_model = get_actor_from_args(args.model, config=actor_cfg)
initial_model.model = llama_load_quant(initial_model.model, args.initial_model_quant_ckpt, args.quant_bits,
args.quant_group_size).cuda().requires_grad_(False)
else:
initial_model = get_actor_from_args(args.model, args.pretrain).requires_grad_(False).half().cuda()
return actor, critic, reward_model, initial_model
# configure Experience Maker
experience_holder_ref = ExperienceMakerHolder.options(name="maker1", num_gpus=1, max_concurrency=2).remote(
detached_trainer_name_list=[f'trainer{i}' for i in range(args.num_trainers)],
strategy_fn=partial(get_strategy_from_args, args.maker_strategy),
model_fn=model_fn,
env_info=env_info_maker,
experience_batch_size=args.experience_batch_size,
kl_coef=0.1,
debug=args.debug,
update_lora_weights=not (args.lora_rank == 0),
# sync_models_from_trainers=True,
# generation kwargs:
max_length=512,
do_sample=True,
temperature=1.0,
top_k=50,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=tokenizer.eos_token_id,
eval_performance=True,
use_cache=True,
)
# uncomment this function if sync_models_from_trainers is True
# ray.get([
# trainer_ref.sync_models_to_remote_makers.remote()
# for trainer_ref in trainer_refs
# ])
wait_tasks = []
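# Editorial note: with the argparse defaults below, total_steps = 8 * 4 // (1 * 8) = 4.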
total_steps = args.experience_batch_size * args.experience_steps // (args.num_trainers * args.train_batch_size)
for trainer_ref in trainer_refs:
wait_tasks.append(trainer_ref.fit.remote(total_steps, args.update_steps, args.train_epochs))
dataset_size = args.experience_batch_size * 4
def build_dataloader():
def tokenize_fn(texts):
batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True)
return {k: v.cuda() for k, v in batch.items()}
dataset = pd.read_csv(args.prompt_path)['prompt']
dataloader = DataLoader(dataset=dataset, batch_size=dataset_size, shuffle=True, collate_fn=tokenize_fn)
return dataloader
wait_tasks.append(experience_holder_ref.workingloop.remote(build_dataloader, num_steps=args.experience_steps))
ray.get(wait_tasks)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--prompt_path', type=str, default=None)
parser.add_argument('--num_trainers', type=int, default=1)
parser.add_argument('--trainer_strategy',
choices=[
'naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu',
'colossalai_zero2_cpu'
],
default='naive')
parser.add_argument('--maker_strategy', choices=['naive'], default='naive')
parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
parser.add_argument('--pretrain', type=str, default=None)
parser.add_argument('--critic_pretrain', type=str, default=None)
parser.add_argument('--experience_steps', type=int, default=4)
parser.add_argument('--experience_batch_size', type=int, default=8)
parser.add_argument('--train_epochs', type=int, default=1)
parser.add_argument('--update_steps', type=int, default=2)
parser.add_argument('--train_batch_size', type=int, default=8)
parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
parser.add_argument('--initial_model_quant_ckpt', type=str, default=None)
parser.add_argument('--quant_bits', type=int, default=4)
parser.add_argument('--quant_group_size', type=int, default=128)
parser.add_argument('--debug', action='store_true')
args = parser.parse_args()
ray.init(namespace=os.environ["RAY_NAMESPACE"], runtime_env={"env_vars": dict(os.environ)})
main(args)

View File

@ -0,0 +1,189 @@
import argparse
import os
import socket
from functools import partial
import pandas as pd
import ray
import torch
from coati.quant import llama_load_quant, low_resource_init
from coati.ray.detached_trainer_ppo import DetachedPPOTrainer
from coati.ray.experience_maker_holder import ExperienceMakerHolder
from coati.ray.utils import (
get_actor_from_args,
get_critic_from_args,
get_receivers_per_sender,
get_reward_model_from_args,
get_strategy_from_args,
)
from torch.utils.data import DataLoader
from transformers import AutoConfig, AutoTokenizer
from transformers.modeling_utils import no_init_weights
def get_free_port():
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(('', 0))
return s.getsockname()[1]
def get_local_ip():
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
s.connect(('8.8.8.8', 80))
return s.getsockname()[0]
def main(args):
master_addr = str(get_local_ip())
# trainer_env_info
trainer_port = str(get_free_port())
env_info_trainers = [{
'local_rank': '0',
'rank': str(rank),
'world_size': str(args.num_trainers),
'master_port': trainer_port,
'master_addr': master_addr
} for rank in range(args.num_trainers)]
# maker_env_info
maker_port = str(get_free_port())
env_info_makers = [{
'local_rank': '0',
'rank': str(rank),
'world_size': str(args.num_makers),
'master_port': maker_port,
'master_addr': master_addr
} for rank in range(args.num_makers)]
# configure tokenizer
tokenizer = AutoTokenizer.from_pretrained(args.pretrain)
tokenizer.pad_token = tokenizer.eos_token
def model_fn():
actor = get_actor_from_args(args.model, args.pretrain).requires_grad_(False).half().cuda()
critic = get_critic_from_args(args.model, args.critic_pretrain).requires_grad_(False).half().cuda()
reward_model = get_reward_model_from_args(args.model, args.critic_pretrain).requires_grad_(False).half().cuda()
if args.initial_model_quant_ckpt is not None and args.model == 'llama':
# quantize initial model
actor_cfg = AutoConfig.from_pretrained(args.pretrain)
with low_resource_init(), no_init_weights():
initial_model = get_actor_from_args(args.model, config=actor_cfg)
initial_model.model = llama_load_quant(initial_model.model, args.initial_model_quant_ckpt, args.quant_bits,
args.quant_group_size).cuda().requires_grad_(False)
else:
initial_model = get_actor_from_args(args.model, args.pretrain).requires_grad_(False).half().cuda()
return actor, critic, reward_model, initial_model
# configure Experience Maker
experience_holder_refs = [
ExperienceMakerHolder.options(name=f"maker{i}", num_gpus=1, max_concurrency=2).remote(
detached_trainer_name_list=[
f'trainer{x}'
for x in get_receivers_per_sender(i, args.num_makers, args.num_trainers, allow_idle_sender=False)
],
strategy_fn=partial(get_strategy_from_args, args.maker_strategy),
model_fn=model_fn,
env_info=env_info_maker,
kl_coef=0.1,
debug=args.debug,
update_lora_weights=not (args.lora_rank == 0),
# sync_models_from_trainers=True,
# generation kwargs:
max_length=512,
do_sample=True,
temperature=1.0,
top_k=50,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=tokenizer.eos_token_id,
eval_performance=True,
use_cache=True,
)
for i, env_info_maker in enumerate(env_info_makers)
]
def trainer_model_fn():
actor = get_actor_from_args(args.model, args.pretrain, lora_rank=args.lora_rank).half().cuda()
critic = get_critic_from_args(args.model, args.critic_pretrain, lora_rank=args.lora_rank).half().cuda()
return actor, critic
# configure Trainer
trainer_refs = [
DetachedPPOTrainer.options(name=f"trainer{i}", num_gpus=1, max_concurrency=2).remote(
experience_maker_holder_name_list=[
f"maker{x}"
for x in get_receivers_per_sender(i, args.num_trainers, args.num_makers, allow_idle_sender=True)
],
strategy_fn=partial(get_strategy_from_args, args.trainer_strategy),
model_fn=trainer_model_fn,
env_info=env_info_trainer,
train_batch_size=args.train_batch_size,
buffer_limit=16,
eval_performance=True,
debug=args.debug,
update_lora_weights=not (args.lora_rank == 0),
)
for i, env_info_trainer in enumerate(env_info_trainers)
]
dataset_size = args.experience_batch_size * 4
def build_dataloader():
def tokenize_fn(texts):
batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True)
return {k: v.cuda() for k, v in batch.items()}
dataset = pd.read_csv(args.prompt_path)['prompt']
dataloader = DataLoader(dataset=dataset, batch_size=dataset_size, shuffle=True, collate_fn=tokenize_fn)
return dataloader
# uncomment this function if sync_models_from_trainers is True
# ray.get([
# trainer_ref.sync_models_to_remote_makers.remote()
# for trainer_ref in trainer_refs
# ])
wait_tasks = []
for experience_holder_ref in experience_holder_refs:
wait_tasks.append(experience_holder_ref.workingloop.remote(build_dataloader, num_steps=args.experience_steps))
total_steps = args.experience_batch_size * args.experience_steps * \
args.num_makers // (args.num_trainers * args.train_batch_size)
for trainer_ref in trainer_refs:
wait_tasks.append(trainer_ref.fit.remote(total_steps, args.update_steps, args.train_epochs))
ray.get(wait_tasks)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--prompt_path', type=str, default=None)
parser.add_argument('--num_makers', type=int, default=1)
parser.add_argument('--num_trainers', type=int, default=1)
parser.add_argument('--trainer_strategy',
choices=[
'naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu',
'colossalai_zero2_cpu'
],
default='naive')
parser.add_argument('--maker_strategy', choices=['naive'], default='naive')
parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
parser.add_argument('--pretrain', type=str, default=None)
parser.add_argument('--critic_pretrain', type=str, default=None)
parser.add_argument('--experience_steps', type=int, default=4)
parser.add_argument('--experience_batch_size', type=int, default=8)
parser.add_argument('--train_epochs', type=int, default=1)
parser.add_argument('--update_steps', type=int, default=2)
parser.add_argument('--train_batch_size', type=int, default=8)
parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
parser.add_argument('--initial_model_quant_ckpt', type=str, default=None)
parser.add_argument('--quant_bits', type=int, default=4)
parser.add_argument('--quant_group_size', type=int, default=128)
parser.add_argument('--debug', action='store_true')
args = parser.parse_args()
ray.init(namespace=os.environ["RAY_NAMESPACE"], runtime_env={"env_vars": dict(os.environ)})
main(args)

View File

@ -0,0 +1 @@
ray

View File

@ -0,0 +1,12 @@
#!/bin/bash
set -xe
BASE=$(realpath $(dirname $0))
export RAY_NAMESPACE=admin
export DATA=/data/scratch/chatgpt/prompts.csv
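# Editorial note: DATA must point to a csv with a "prompt" column; mmmt_prompt.py reads it via pd.read_csv(args.prompt_path)['prompt'].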
# install requirements
pip install -r ${BASE}/requirements.txt
python ${BASE}/mmmt_prompt.py --prompt_path $DATA --num_makers 2 --num_trainers 2 --trainer_strategy colossalai_gemini --model opt --critic_model opt --pretrain facebook/opt-350m --critic_pretrain facebook/opt-125m --experience_batch_size 4 --train_batch_size 2

View File

@ -124,3 +124,6 @@ torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py --prompt_datas
rm -rf ${BASE}/rm_ckpt_gpt.pt
rm -rf ${BASE}/actor_checkpoint_prompts.pt
# 3080 doesn't support P2P, skip this test
# cd ${BASE}/ray && bash test_ci.sh && cd ${BASE}

View File

@ -0,0 +1,9 @@
from .base import MixedPrecisionMixin
from .bf16 import BF16MixedPrecisionMixin
from .fp16 import FP16MixedPrecisionMixin
__all__ = [
'MixedPrecisionMixin',
'FP16MixedPrecisionMixin',
'BF16MixedPrecisionMixin',
]

View File

@ -0,0 +1,91 @@
from abc import ABC, abstractmethod
import torch
from torch import Tensor
class MixedPrecisionMixin(ABC):
"""A helper class for mixed precision training. This mixin is used in mixed precision optimizers.
Attributes:
dtype (torch.dtype): The expected dtype of the gradients.
Examples:
```python
class MyMixedPrecisionOptimizer(OptimizerWrapper):
def __init__(self, optim: Optimizer):
super().__init__(optim)
self.mixed_precision = FP16MixedPrecisionMixin()
def backward(self, loss):
loss = self.mixed_precision.pre_backward(loss)
loss.backward()
def backward_by_grad(self, tensor, grad):
grad = self.mixed_precision.pre_backward_by_grad(tensor, grad)
tensor.backward(grad)
def step(self):
if self.mixed_precision.should_skip_step():
self.zero_grad()
return
div_scale = self.mixed_precision.get_grad_div_scale()
# maybe clip grad here
# maybe scale grad here
self.optim.step()
def zero_grad(self):
self.mixed_precision.pre_zero_grad()
return self.optim.zero_grad()
```
"""
dtype: torch.dtype
@abstractmethod
def pre_backward(self, loss: Tensor) -> Tensor:
"""Called before backward.
Args:
loss (Tensor): Loss value.
Returns:
Tensor: Loss value (possibly scaled).
"""
pass
@abstractmethod
def pre_backward_by_grad(self, tensor: Tensor, grad: Tensor) -> Tensor:
"""Called before backward by grad. This is helpful for pipeline parallelism.
Args:
tensor (Tensor): Tensor to backward.
grad (Tensor): Gradient of the tensor.
Returns:
Tensor: Gradient of the tensor (possibly scaled).
"""
pass
@abstractmethod
def should_skip_step(self) -> bool:
"""Called before step.
Returns:
bool: Whether to skip the step.
"""
pass
@abstractmethod
def pre_zero_grad(self) -> None:
"""Called before zero_grad.
"""
pass
@abstractmethod
def get_grad_div_scale(self) -> float:
"""Called before step or clip_grad. To keep computation efficiency, this method does not (maybe) unscale grads.
Returns:
float: A divisor for gradient clipping or step.
"""
pass

View File

@ -0,0 +1,23 @@
import torch
from torch import Tensor
from .base import MixedPrecisionMixin
class BF16MixedPrecisionMixin(MixedPrecisionMixin):
dtype = torch.bfloat16
def pre_backward(self, loss: Tensor) -> Tensor:
return loss
def pre_backward_by_grad(self, tensor: Tensor, grad: Tensor) -> Tensor:
return grad
def should_skip_step(self) -> bool:
return False
def pre_zero_grad(self) -> None:
pass
def get_grad_div_scale(self) -> float:
return 1.0
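# Editorial note: bf16 shares fp32's 8-bit exponent range, so gradients effectively
# cannot overflow the way fp16 grads do; hence no loss scaling and no skipped steps.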

View File

@ -0,0 +1,84 @@
from abc import abstractmethod
from enum import Enum
import torch
import torch.distributed as dist
from torch import Tensor
from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler
from colossalai.utils import get_current_device
from .base import MixedPrecisionMixin
class OptimState(Enum):
SCALED = 0
UNSCALED = 1
class FP16MixedPrecisionMixin(MixedPrecisionMixin):
dtype = torch.float16
def __init__(self,
initial_scale: float = 2**16,
min_scale: float = 1,
growth_factor: float = 2,
backoff_factor: float = 0.5,
growth_interval: int = 1000,
hysteresis: int = 2,
max_scale: float = 2**32) -> None:
super().__init__()
self.grad_scaler = DynamicGradScaler(initial_scale=initial_scale,
min_scale=min_scale,
growth_factor=growth_factor,
backoff_factor=backoff_factor,
growth_interval=growth_interval,
hysteresis=hysteresis,
max_scale=max_scale)
self.optim_state = OptimState.UNSCALED
self.found_overflow = torch.zeros(1, dtype=torch.float, device=get_current_device())
@property
def loss_scale(self) -> float:
return self.grad_scaler.scale.item()
@abstractmethod
def check_local_overflow(self) -> bool:
"""Check whether there is overflow in the local process. This method should be implemented by subclasses.
Returns:
bool: Whether there is overflow in the local process.
"""
pass
def check_overflow(self) -> bool:
# clear previous overflow record
self.found_overflow.fill_(0.0)
if self.check_local_overflow():
self.found_overflow.fill_(1.0)
dist.all_reduce(self.found_overflow, op=dist.ReduceOp.MAX)
return self.found_overflow.item() > 0
def pre_backward(self, loss: Tensor) -> Tensor:
loss = self.loss_scale * loss
self.optim_state = OptimState.SCALED
return loss
def pre_backward_by_grad(self, tensor: Tensor, grad: Tensor) -> Tensor:
self.optim_state = OptimState.SCALED
return grad
def should_skip_step(self) -> bool:
found_inf = self.check_overflow()
self.grad_scaler.update(found_inf)
if found_inf:
self.optim_state = OptimState.UNSCALED
return found_inf
def pre_zero_grad(self) -> None:
pass
def get_grad_div_scale(self) -> float:
assert self.optim_state == OptimState.SCALED, 'grads should be scaled before clipping'
self.optim_state = OptimState.UNSCALED
return self.loss_scale
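# A minimal concrete subclass (editorial sketch, not part of this PR; the class
# name and the working_grads list are illustrative assumptions):
class NaiveFP16MixedPrecisionMixin(FP16MixedPrecisionMixin):

    def __init__(self, working_grads: list, **kwargs) -> None:
        super().__init__(**kwargs)
        self.working_grads = working_grads    # tensors to scan for inf/nan

    def check_local_overflow(self) -> bool:
        # any non-finite element in any working grad counts as a local overflow
        return any(not torch.isfinite(g).all() for g in self.working_grads)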

View File

@ -206,7 +206,7 @@ class Broadcaster(BmmTransform):
# e.g. [1, 2, 4] x [4, 4, 8] -> [4, 2, 8]
# the dim 0 of [1, 2, 4] is multiplied to 4
tensor_shape[dim_idx] = 1
elif broadcast_type == BroadcastType.PADDDING:
elif broadcast_type == BroadcastType.PADDING:
# if the dim is padded
# we remove its sharding
tensor_shape[dim_idx] = None

View File

@ -21,7 +21,7 @@ __all__ = [
class BroadcastType(Enum):
EQUAL = auto()
PADDDING = auto()
PADDING = auto()
MULTIPLE = auto()
@ -69,18 +69,18 @@ def get_broadcast_dim_info(logical_shape, physical_shape):
for i in range(logical_num_dims):
# get the trailing dim size
logical_dim_idx = logical_num_dims - i - 1
phyiscal_dim_idx = physical_num_dims - i - 1
physical_dim_idx = physical_num_dims - i - 1
logical_dim_size = logical_shape[logical_dim_idx]
if phyiscal_dim_idx >= 0:
physical_dim_size = physical_shape[phyiscal_dim_idx]
if physical_dim_idx >= 0:
physical_dim_size = physical_shape[physical_dim_idx]
if physical_dim_size == logical_dim_size:
logical_dim_broadcast_info[logical_dim_idx] = BroadcastType.EQUAL
elif physical_dim_size == 1 and physical_dim_size != logical_dim_size:
logical_dim_broadcast_info[logical_dim_idx] = BroadcastType.MULTIPLE
else:
logical_dim_broadcast_info[logical_dim_idx] = BroadcastType.PADDDING
logical_dim_broadcast_info[logical_dim_idx] = BroadcastType.PADDING
return logical_dim_broadcast_info
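# Editorial example: get_broadcast_dim_info([4, 2, 8], [1, 2, 8]) yields
# {2: EQUAL, 1: EQUAL, 0: MULTIPLE}; the size-1 physical dim is broadcast-multiplied.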
@ -117,7 +117,7 @@ def recover_sharding_spec_for_broadcast_shape(logical_sharding_spec: ShardingSpe
for shape_dim, mesh_dim in logical_dim_partition.items():
logical_broadcast_type = logical_dim_broadcast_info[shape_dim]
if logical_broadcast_type == BroadcastType.PADDDING or logical_broadcast_type == BroadcastType.MULTIPLE:
if logical_broadcast_type == BroadcastType.PADDING or logical_broadcast_type == BroadcastType.MULTIPLE:
removed_dims.extend(mesh_dim)
else:
# get the corresponding physical dim

View File

@ -25,11 +25,11 @@ class Booster:
Examples:
```python
colossalai.launch(...)
plugin = GeminiPlugin(stage=3, ...)
plugin = GeminiPlugin(...)
booster = Booster(precision='fp16', plugin=plugin)
model = GPT2()
optimizer = Adam(model.parameters())
optimizer = HybridAdam(model.parameters())
dataloader = Dataloader(Dataset)
lr_scheduler = LinearWarmupScheduler()
criterion = GPTLMLoss()

View File

@ -23,6 +23,9 @@ from .dp_plugin_base import DPPluginBase
__all__ = ['GeminiPlugin']
SUPPORTED_PRECISION = ['fp16', 'bf16']
PRECISION_STR_TO_DTYPE = {'fp16': torch.half, 'bf16': torch.bfloat16}
class GeminiCheckpointIO(GeneralCheckpointIO):
@ -171,6 +174,7 @@ class GeminiPlugin(DPPluginBase):
Args:
device (torch.device): device to place the model.
placement_policy (str, optional): "cpu", "cuda", "auto". Defaults to "cpu".
precision (str, optional): precision. Supports 'fp16' and 'bf16'. Defaults to 'fp16'.
pin_memory (bool, optional): use pin memory on CPU. Defaults to False.
force_outputs_fp32 (bool, optional): force outputs are fp32. Defaults to False.
strict_ddp_mode (bool, optional): use strict ddp mode (only use dp without other parallelism). Defaults to False.
@ -203,6 +207,7 @@ class GeminiPlugin(DPPluginBase):
self,
device: Optional[torch.device] = None,
placement_policy: str = "cpu",
precision: str = "fp16",
pin_memory: bool = False,
force_outputs_fp32: bool = False,
strict_ddp_mode: bool = False,
@ -223,6 +228,7 @@ class GeminiPlugin(DPPluginBase):
verbose: bool = False,
) -> None:
super().__init__()
assert precision in SUPPORTED_PRECISION, f'precision {precision} is not supported'
self.gemini_config = dict(
device=(device or get_current_device()),
placement_policy=placement_policy,
@ -233,6 +239,7 @@ class GeminiPlugin(DPPluginBase):
hidden_dim=hidden_dim,
min_chunk_size_mb=min_chunk_size_mb,
memstats=memstats,
mixed_precision=PRECISION_STR_TO_DTYPE[precision],
)
self.zero_optim_config = dict(gpu_margin_mem_ratio=gpu_margin_mem_ratio,)
self.optim_kwargs = dict(initial_scale=initial_scale,
@ -253,7 +260,7 @@ class GeminiPlugin(DPPluginBase):
return True
def supported_precisions(self) -> List[str]:
return ['fp16']
return SUPPORTED_PRECISION
def control_device(self) -> bool:
return True
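# Editorial usage sketch of the new option (mirrors the Booster example above):
#   plugin = GeminiPlugin(placement_policy="auto", precision="bf16")
#   booster = Booster(plugin=plugin)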

View File

@ -1,4 +1,5 @@
import warnings
from functools import partial
from typing import Callable, Iterator, List, Optional, Tuple, Union
import torch
@ -20,12 +21,15 @@ from .torch_ddp_plugin import TorchDDPCheckpointIO
__all__ = ['LowLevelZeroPlugin']
def _convert_to_fp16(x):
def _convert_floating_point(x, dtype: torch.dtype = torch.float16):
if isinstance(x, torch.Tensor) and torch.is_floating_point(x):
return x.half()
return x.to(dtype)
return x
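# Editorial note: tree_map(partial(_convert_floating_point, dtype=torch.bfloat16), args)
# casts every floating-point tensor nested in args to bf16 and leaves ints, bools
# and non-tensors untouched.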
SUPPORTED_PRECISION = ['fp16', 'bf16', 'fp32']
class LowLevelZeroCheckpointIO(TorchDDPCheckpointIO):
def save_unsharded_optimizer(self, optimizer: Optimizer, checkpoint: str, gather_dtensor: bool):
@ -49,17 +53,24 @@ class LowLevelZeroModel(ModelWrapper):
def __init__(self, module: nn.Module, stage: int, precision: str) -> None:
super().__init__(module)
self.convert_inputs = (precision == 'fp16')
module = zero_model_wrapper(module, zero_stage=stage)
self.dtype = None
if precision == 'fp16':
module = module.half()
self.dtype = torch.float16
elif precision == 'bf16':
self.dtype = torch.bfloat16
module = zero_model_wrapper(module, zero_stage=stage)
if self.dtype is not None:
module = module.to(self.dtype)
module = module.to(get_current_device())
self.module = module
self.convert_fn = None
if self.dtype is not None:
self.convert_fn = partial(_convert_floating_point, dtype=self.dtype)
def forward(self, *args, **kwargs):
if self.convert_inputs:
args = tree_map(_convert_to_fp16, args)
kwargs = tree_map(_convert_to_fp16, kwargs)
if self.convert_fn is not None:
args = tree_map(self.convert_fn, args)
kwargs = tree_map(self.convert_fn, kwargs)
return super().forward(*args, **kwargs)
@ -110,7 +121,7 @@ class LowLevelZeroPlugin(DPPluginBase):
Args:
stage (int, optional): ZeRO stage. Defaults to 1.
precision (str, optional): precision. Support 'fp16' and 'fp32'. Defaults to 'fp16'.
precision (str, optional): precision. Support 'fp16', 'bf16' and 'fp32'. Defaults to 'fp16'.
initial_scale (float, optional): Initial scale used by DynamicGradScaler. Defaults to 2**32.
min_scale (float, optional): Min scale used by DynamicGradScaler. Defaults to 1.
growth_factor (float, optional): growth_factor used by DynamicGradScaler. Defaults to 2.
@ -149,7 +160,7 @@ class LowLevelZeroPlugin(DPPluginBase):
) -> None:
super().__init__()
assert stage in (1, 2), f'LowLevelZeroPlugin only supports stage 1/2 training'
assert precision in ('fp16', 'fp32'), f'LowLevelZeroPlugin only supports fp16/fp32 training'
assert precision in SUPPORTED_PRECISION, f'LowLevelZeroPlugin only supports {SUPPORTED_PRECISION} training'
self.stage = stage
self.precision = precision
@ -175,7 +186,7 @@ class LowLevelZeroPlugin(DPPluginBase):
return True
def supported_precisions(self) -> List[str]:
return ['fp16', 'fp32']
return SUPPORTED_PRECISION
def control_device(self) -> bool:
return True

View File

@ -3,10 +3,10 @@ from typing import Callable, Iterable, Iterator, List, Optional, Tuple, Union
import torch
import torch.nn as nn
import warnings
from packaging import version
from torch.distributed import ProcessGroup
if version.parse(torch.__version__) >= version.parse('1.12.0'):
from torch.distributed.fsdp import FullStateDictConfig
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
@ -202,6 +202,11 @@ class TorchFSDPPlugin(DPPluginBase):
# wrap the model with PyTorch FSDP
fsdp_model = TorchFSDPModel(model, device_id=torch.cuda.current_device(), **self.fsdp_kwargs)
if len(optimizer.param_groups) > 1:
warnings.warn(
'TorchFSDPPlugin does not support optimizers that use multiple param groups. The results may not be as expected if used.'
)
optimizer.__init__(fsdp_model.parameters(), **optimizer.defaults)
if not isinstance(optimizer, FSDPOptimizerWrapper):

View File

@ -28,7 +28,7 @@ from .run import launch_multi_processes
type=str,
default=None,
help=
"Specify computing devices to NOT use during execution. Mutually exclusive with --include. Formatting is the same as --includ,"
"Specify computing devices to NOT use during execution. Mutually exclusive with --include. Formatting is the same as --include,"
" only effective when used with --hostfile.")
@click.option("--num_nodes",
type=int,

View File

@ -38,7 +38,7 @@ class HostInfo:
# socket.getfqdn("127.0.0.1") does not return localhost
# on some users' machines
# thus, we directly return True if hostname is locahost, 127.0.0.1 or 0.0.0.0
# thus, we directly return True if hostname is localhost, 127.0.0.1 or 0.0.0.0
if hostname in ("localhost", "127.0.0.1", "0.0.0.0"):
return True

View File

@ -114,7 +114,7 @@ class MultiNodeRunner:
Receive messages from all hosts
Returns:
msg_from_node (dict): a dictionry which contains messages from each node
msg_from_node (dict): a dictionary which contains messages from each node
"""
msg_from_node = dict()

View File

@ -154,7 +154,7 @@ def get_launch_command(
extra_launch_args = dict()
torch_version = version.parse(torch.__version__)
assert torch_version.major == 1
assert torch_version.major >= 1
if torch_version.minor < 9:
cmd = [
@ -298,7 +298,7 @@ def launch_multi_processes(args: Config) -> None:
# receive the stop status
msg_from_node = runner.recv_from_all()
# printe node status
# print node status
click.echo("\n====== Stopping All Nodes =====")
for hostname, msg in msg_from_node.items():
click.echo(f"{hostname}: {msg}")

View File

@ -197,7 +197,7 @@ class AlphaBetaProfiler:
dist.broadcast_object_list(broadcast_list, src=process_group[0])
alpha_beta_dict[process_group] = tuple(broadcast_list)
# add symmetry pair to the apha_beta_dict
# add symmetry pair to the alpha_beta_dict
symmetry_ab_dict = {}
for process_group, alpha_beta_pair in alpha_beta_dict.items():
symmetry_process_group = (process_group[1], process_group[0])

View File

@ -51,7 +51,7 @@ class BiasAdditionModule(ABC):
For example:
The kwargs for conv2d module is {} because the attributes like 'padding' or 'groups' are
considered during module initilizing. However, we need to consider those attributes as kwargs
considered during module initializing. However, we need to consider those attributes as kwargs
in F.conv2d.
"""
pass
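
A concrete instance of the module-vs-functional gap this docstring describes: attributes baked into `nn.Conv2d` at construction must be passed explicitly when the call is rewritten as `F.conv2d`.

```python
import torch
import torch.nn.functional as F

conv = torch.nn.Conv2d(3, 8, kernel_size=3, padding=1)
x = torch.randn(1, 3, 16, 16)
# The module call conv(x) hides these kwargs; the functional form needs them.
y = F.conv2d(x, conv.weight, conv.bias, stride=conv.stride,
             padding=conv.padding, dilation=conv.dilation, groups=conv.groups)
assert torch.allclose(y, conv(x))
```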

View File

@ -295,7 +295,7 @@ class ColoTracer(Tracer):
@staticmethod
def forward(ctx, run_function, preserve_rng_state, *args):
# signal that the current tracing occurs within activaton checkpoint part
# signal that the current tracing occurs within activation checkpoint part
self.inside_torch_checkpoint_func = True
out = run_function(*args)
self.inside_torch_checkpoint_func = False

View File

@ -92,7 +92,7 @@ class ColoTracer(Tracer):
return proxy
# if graph is traced for auto parallelism module, some extra node will be added during
# graph construction to deal with the compatability between bias addition and all reduce.
# graph construction to deal with the compatibility between bias addition and all reduce.
# if no extra manipulation is applied, we just pass the origin arguments to create_proxy function
# to create node on computation graph
@ -208,7 +208,7 @@ class ColoTracer(Tracer):
self.proxy_cls = ColoProxy
self.tracer_type = TracerType.META
else:
raise ValueError(f"Unrecognised tracer type {tracer_type}")
raise ValueError(f"Unrecognized tracer type {tracer_type}")
def _meta_data_computing(self, kind, target, args, kwargs):
@ -445,7 +445,7 @@ class ColoTracer(Tracer):
@staticmethod
def forward(ctx, run_function, preserve_rng_state, *args):
# signal that the current tracing occurs within activaton checkpoint part
# signal that the current tracing occurs within activation checkpoint part
self.inside_torch_checkpoint_func = True
out = run_function(*args)
self.inside_torch_checkpoint_func = False

View File

@ -171,6 +171,21 @@
using g_scalar_t_##LEVEL = at::Half; \
using p_scalar_t_##LEVEL = at::Half; \
__VA_ARGS__; \
} else if (GTYPE == at::ScalarType::Float && \
PTYPE == at::ScalarType::BFloat16) { \
using g_scalar_t_##LEVEL = float; \
using p_scalar_t_##LEVEL = at::BFloat16; \
__VA_ARGS__; \
} else if (GTYPE == at::ScalarType::BFloat16 && \
PTYPE == at::ScalarType::Float) { \
using g_scalar_t_##LEVEL = at::BFloat16; \
using p_scalar_t_##LEVEL = float; \
__VA_ARGS__; \
} else if (GTYPE == at::ScalarType::BFloat16 && \
PTYPE == at::ScalarType::BFloat16) { \
using g_scalar_t_##LEVEL = at::BFloat16; \
using p_scalar_t_##LEVEL = at::BFloat16; \
__VA_ARGS__; \
} else { \
AT_ERROR(#NAME, "not implemented for '", toString(GTYPE), toString(PTYPE), \
"'"); \

View File

@ -138,7 +138,7 @@ if HAS_MEM_EFF_ATTN:
elif attn_mask_type == AttnMaskType.causal: # gpt style
attn_bias = LowerTriangularMask()
if bias is not None: # alibi / relative position emebedding
if bias is not None: # alibi / relative position embedding
assert allow_alibi, "flash attention with bias is not supported in this system."
assert attn_mask_type == AttnMaskType.causal, \
"attention with bias is only supported for causal attention so far."

View File

@ -43,7 +43,7 @@ class Config:
attn_prob_dropout_ratio: float # attention score dropout ratio
hidden_dropout_ratio: float # dropout ratio before residual
norm_first: bool # norm_first
fp16: bool # fp16 presion
fp16: bool # fp16 precision
class MultiHeadAttention1DFunc(Function):

View File

@ -43,7 +43,7 @@ def warmup_jit_fusion(batch_size: int,
seq_length: int = 512,
vocab_size: int = 32768,
dtype: torch.dtype = torch.float32):
""" Compilie JIT functions before the main training steps """
""" Compile JIT functions before the main training steps """
embed = Embedding(vocab_size, hidden_size).to(get_current_device())
linear_1 = Linear(hidden_size, hidden_size * 4, skip_bias_add=True).to(get_current_device())
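
A hedged usage sketch of the warm-up helper: call it once before the training loop so the first real step does not pay JIT compilation latency. The parameter list is inferred from the hunk, and the import path is assumed rather than shown in this diff.

```python
import torch
from colossalai.kernel.jit import warmup_jit_fusion   # import path assumed

warmup_jit_fusion(batch_size=32,
                  hidden_size=1024,     # assumed to sit between batch_size and seq_length
                  seq_length=512,
                  vocab_size=32768,
                  dtype=torch.float32)
```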

View File

@ -0,0 +1,6 @@
from .lazy_init import LazyInitContext, LazyTensor
__all__ = [
'LazyInitContext',
'LazyTensor',
]
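
A hedged usage sketch for the newly exported pair: under the context, module construction records metadata instead of allocating storage. The package path follows the new `__init__.py` above; the materialization entry point is assumed, so it is only named in a comment.

```python
import torch.nn as nn
from colossalai.lazy import LazyInitContext   # package path assumed from this file

with LazyInitContext():
    model = nn.Linear(1024, 1024)   # parameters are LazyTensors: shape/dtype only
# Real storage is allocated later through the materialization API (e.g. a
# materialize() call on the tensors) -- assumed here, not shown in this diff.
```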

View File

@ -37,7 +37,7 @@ _EARLY_MATERIALIZED_OPS = ['__getitem__', 'split']
# If your intent is to change the metadata of a Tensor (such as sizes / strides / storage / storage_offset)
# without autograd tracking the change, remove the .data / .detach() call and wrap the change in a `with torch.no_grad():` block.
# These ops cannot be unwrapped using .data
_CHANGE_META_OPS = ['_cudnn_rnn_flatten_weight', 'requires_grad_', '__get__', '__set__']
_CHANGE_META_OPS = ['_cudnn_rnn_flatten_weight', 'requires_grad_', '__get__', '__set__', 'numel', 'size', 'dim']
_LEGACY_TENSOR_CONSTRUCTOR = {
'FloatTensor': torch.float,
@ -350,7 +350,14 @@ class LazyTensor(torch.Tensor):
copied.requires_grad_()
return copied
target = LazyTensor(factory_fn, meta_data=self._meta_data)
if self._materialized_data is not None:
# self is early materialized
copied = self._materialized_data.detach().clone()
if self.requires_grad:
copied.requires_grad_()
target = LazyTensor(lambda: None, concrete_data=copied)
else:
target = LazyTensor(factory_fn, meta_data=self._meta_data)
memo[id(self)] = target
return target
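
The new branch distinguishes early-materialized tensors in `__deepcopy__`. The reason: replaying `factory_fn` would reproduce the tensor as originally constructed, silently dropping any in-place mutation made after materialization. A plain-tensor illustration of the hazard:

```python
import torch

factory = lambda: torch.zeros(4)   # stands in for a LazyTensor's factory_fn
t = factory()                      # "early materialized"
t.add_(1.0)                        # in-place update after materialization
replayed = factory()               # re-running the factory loses the update
cloned = t.detach().clone()        # what the new branch does instead
assert not torch.equal(replayed, t) and torch.equal(cloned, t)
```

The `requires_grad_()` re-application in the diff exists because `detach()` drops the autograd flag.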

View File

@ -195,7 +195,7 @@ class _Linear(nn.Module):
keep_master_weight_for_test: This was added for testing and should be
set to False. It returns the master weights
used for initialization.
skip_bias_add: This was added to enable performance optimations where bias
skip_bias_add: This was added to enable performance optimizations where bias
can be fused with other elementwise operations. We skip
adding bias but instead return it.
"""

View File

@ -21,7 +21,7 @@ class _VocabParallelCrossEntropy1D(torch.autograd.Function):
# Subtract the maximum value.
vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1))
# Get the partition's vocab indecies
# Get the partition's vocab indices
partition_vocab_size = vocab_parallel_logits.size()[-1]
rank = dist.get_rank(process_group)
vocab_start_index = partition_vocab_size * rank
@ -61,10 +61,10 @@ class _VocabParallelCrossEntropy1D(torch.autograd.Function):
@custom_bwd
def backward(ctx, grad_output):
# Retreive tensors from the forward path.
# Retrieve tensors from the forward path.
softmax, target_mask, masked_target_1d = ctx.saved_tensors
# All the inputs have softmax as thier gradient.
# All the inputs have softmax as their gradient.
grad_input = softmax
# For simplicity, work with the 2D gradient.
partition_vocab_size = softmax.size()[-1]
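
This backward, and the 2D/2.5D/3D variants in the following files, uses the same identity: the gradient of cross entropy w.r.t. the logits is `softmax - one_hot(target)`, and in the vocab-parallel case each rank subtracts the 1 only where the target falls inside its vocab shard. A sketch of the 1D case, assuming the Megatron-style mask convention (mask is 1 where the target lies outside this rank's shard):

```python
import torch

def vocab_parallel_ce_backward(softmax, target_mask, masked_target_1d, grad_output):
    grad_2d = softmax.view(-1, softmax.size(-1))
    rows = torch.arange(grad_2d.size(0), device=grad_2d.device)
    # Subtract 1 at the target column, but only on the rank owning the target.
    grad_2d[rows, masked_target_1d] -= 1.0 - target_mask.view(-1).float()
    # Chain rule with the upstream gradient.
    grad_2d.mul_(grad_output.view(-1).unsqueeze(-1))
    return grad_2d.view_as(softmax)
```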

View File

@ -106,7 +106,7 @@ class _VocabParallelCrossEntropy2D(torch.autograd.Function):
@staticmethod
@custom_bwd
def backward(ctx, output_grad):
# Retreive tensors from the forward path.
# Retrieve tensors from the forward path.
softmax, target_mask, masked_target = ctx.saved_tensors
# All the inputs have softmax as their gradient.

View File

@ -100,7 +100,7 @@ class _VocabParallelCrossEntropy2p5D(torch.autograd.Function):
@staticmethod
@custom_bwd
def backward(ctx, output_grad):
# Retreive tensors from the forward path.
# Retrieve tensors from the forward path.
softmax, target_mask, masked_target = ctx.saved_tensors
# All the inputs have softmax as their gradient.

View File

@ -99,10 +99,10 @@ class _VocabParallelCrossEntropy3D(torch.autograd.Function):
@staticmethod
@custom_bwd
def backward(ctx, output_grad):
# Retreive tensors from the forward path.
# Retrieve tensors from the forward path.
softmax, target_mask, masked_target = ctx.saved_tensors
# All the inputs have softmax as thier gradient.
# All the inputs have softmax as their gradient.
input_grad = softmax
# For simplicity, work with the 2D gradient.
partition_vocab_size = softmax.size()[-1]

View File

@ -13,7 +13,7 @@ from .nvme_optimizer import NVMeOptimizer
class CPUAdam(NVMeOptimizer):
"""Implements Adam algorithm.
Supports parameters updating on both GPU and CPU, depanding on the device of parameters.
Supports parameters updating on both GPU and CPU, depending on the device of parameters.
But the parameters and gradients should be on the same device:
* Parameters on CPU and gradients on CPU is allowed.
* Parameters on GPU and gradients on GPU is allowed.
@ -21,7 +21,7 @@ class CPUAdam(NVMeOptimizer):
`CPUAdam` requires CUDA extensions which can be built during installation or runtime.
This version of CPU Adam accelates parameters updating on CPU with SIMD.
This version of CPU Adam accelerates parameters updating on CPU with SIMD.
Support of AVX2 or AVX512 is required.
The GPU part is implemented in a naive way.
@ -93,8 +93,7 @@ class CPUAdam(NVMeOptimizer):
bias_correction1,
bias_correction2,
use_adamw=False):
# FIXME(ver217): remove the below line when replace torch adam with fused adam
grad = grad.float()
grad = grad.to(data.dtype)
if weight_decay != 0:
if use_adamw:
@ -133,10 +132,12 @@ class CPUAdam(NVMeOptimizer):
if len(state) == 0:
state['step'] = 0
# FIXME(ver217): CPU adam kernel only supports fp32 states now
assert p.dtype is torch.float, "CPUAdam only support fp32 parameters"
# gradient momentums
state['exp_avg'] = torch.zeros_like(p, dtype=torch.float, device=target_device)
state['exp_avg'] = torch.zeros_like(p, device=target_device)
# gradient variances
state['exp_avg_sq'] = torch.zeros_like(p, dtype=torch.float, device=target_device)
state['exp_avg_sq'] = torch.zeros_like(p, device=target_device)
self._post_state_init(p)
state['step'] += 1
@ -147,9 +148,17 @@ class CPUAdam(NVMeOptimizer):
assert state['exp_avg'].device.type == 'cpu', "exp_avg should stay on cpu"
assert state['exp_avg_sq'].device.type == 'cpu', "exp_avg_sq should stay on cpu"
self._pre_update(p, 'exp_avg', 'exp_avg_sq')
self.cpu_adam_op.step(state['step'], group['lr'], beta1, beta2, group['eps'], group['weight_decay'],
group['bias_correction'], p.data, p.grad.data, state['exp_avg'],
state['exp_avg_sq'], div_scale)
if p.grad.dtype is torch.bfloat16:
# cpu adam kernel does not support bf16 now
bias_correction1 = 1 - beta1**state['step']
bias_correction2 = 1 - beta2**state['step']
self.torch_adam_update(p.data, p.grad.data, state['exp_avg'], state['exp_avg_sq'], group['lr'],
beta1, beta2, group['eps'], group['weight_decay'], bias_correction1,
bias_correction2, self.adamw_mode)
else:
self.cpu_adam_op.step(state['step'], group['lr'], beta1, beta2, group['eps'],
group['weight_decay'], group['bias_correction'], p.data, p.grad.data,
state['exp_avg'], state['exp_avg_sq'], div_scale)
self._post_update(p, 'exp_avg', 'exp_avg_sq')
elif target_device.type == 'cuda':
assert div_scale == -1, "div_scale should remain default"
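
The new bf16 branch sidesteps the SIMD kernel (fp32/fp16 only) by doing the update in plain PyTorch, with the bias corrections computed as `1 - beta**step`. A sketch of the math `torch_adam_update` is expected to perform, matching the argument order in the call above; this is standard Adam/AdamW, not a verbatim copy of the method:

```python
import torch

def torch_adam_update(param, grad, exp_avg, exp_avg_sq, lr, beta1, beta2,
                      eps, weight_decay, bias_correction1, bias_correction2,
                      use_adamw):
    grad = grad.to(param.dtype)   # mirrors the cast added earlier in this diff
    if weight_decay != 0:
        if use_adamw:
            param.mul_(1 - lr * weight_decay)            # decoupled decay (AdamW)
        else:
            grad = grad.add(param, alpha=weight_decay)   # classic L2 penalty
    exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
    exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
    denom = (exp_avg_sq / bias_correction2).sqrt_().add_(eps)
    param.addcdiv_(exp_avg, denom, value=-lr / bias_correction1)
```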

View File

@ -134,8 +134,8 @@ class FusedAdam(torch.optim.Optimizer):
# Exponential moving average of squared gradient values
state['exp_avg_sq'] = torch.zeros_like(p)
if p.dtype not in [torch.float16, torch.float32]:
raise RuntimeError('FusedAdam only support fp16 and fp32.')
if p.dtype not in [torch.float16, torch.float32, torch.bfloat16]:
raise RuntimeError('FusedAdam only support fp16, fp32 and bf16.')
g_l.append(p.grad.data)
p_l.append(p.data)

View File

@ -1,19 +1,20 @@
from typing import Any, Optional
import torch
from torch.optim import Adam
from colossalai.kernel.op_builder import CPUAdamBuilder, FusedOptimBuilder
from colossalai.kernel.op_builder import FusedOptimBuilder
from colossalai.registry import OPTIMIZERS
from colossalai.utils import multi_tensor_applier
from .nvme_optimizer import NVMeOptimizer
from .cpu_adam import CPUAdam
@OPTIMIZERS.register_module
class HybridAdam(NVMeOptimizer):
class HybridAdam(CPUAdam):
"""Implements Adam algorithm.
Supports parameters updating on both GPU and CPU, depanding on the device of parameters.
Supports parameters updating on both GPU and CPU, depending on the device of parameters.
But the parameters and gradients should be on the same device:
* Parameters on CPU and gradients on CPU is allowed.
* Parameters on GPU and gradients on GPU is allowed.
@ -74,15 +75,9 @@ class HybridAdam(NVMeOptimizer):
nvme_offload_dir: Optional[str] = None,
**defaults: Any):
default_args = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, bias_correction=bias_correction)
super(HybridAdam, self).__init__(model_params, default_args, nvme_offload_fraction, nvme_offload_dir)
self.adamw_mode = adamw_mode
# build during runtime if not found
cpu_optim = CPUAdamBuilder().load()
super().__init__(model_params, lr, bias_correction, betas, eps, weight_decay, adamw_mode, nvme_offload_fraction,
nvme_offload_dir)
fused_optim = FusedOptimBuilder().load()
self.cpu_adam_op = cpu_optim.CPUAdamOptimizer(lr, betas[0], betas[1], eps, weight_decay, adamw_mode)
self.gpu_adam_op = fused_optim.multi_tensor_adam
self._dummy_overflow_buf = torch.cuda.IntTensor([0])
@ -108,10 +103,12 @@ class HybridAdam(NVMeOptimizer):
if len(state) == 0:
state['step'] = 0
# FIXME(ver217): CPU adam kernel only supports fp32 states now
assert p.dtype is torch.float, "HybridAdam only support fp32 parameters"
# gradient momentums
state['exp_avg'] = torch.zeros_like(p, dtype=torch.float, device=target_device)
state['exp_avg'] = torch.zeros_like(p, device=target_device)
# gradient variances
state['exp_avg_sq'] = torch.zeros_like(p, dtype=torch.float, device=target_device)
state['exp_avg_sq'] = torch.zeros_like(p, device=target_device)
self._post_state_init(p)
state['step'] += 1
@ -122,9 +119,17 @@ class HybridAdam(NVMeOptimizer):
assert state['exp_avg'].device.type == 'cpu', "exp_avg should stay on cpu"
assert state['exp_avg_sq'].device.type == 'cpu', "exp_avg_sq should stay on cpu"
self._pre_update(p, 'exp_avg', 'exp_avg_sq')
self.cpu_adam_op.step(state['step'], group['lr'], beta1, beta2, group['eps'], group['weight_decay'],
group['bias_correction'], p.data, p.grad.data, state['exp_avg'],
state['exp_avg_sq'], div_scale)
if p.grad.dtype is torch.bfloat16:
# cpu adam kernel does not support bf16 now
bias_correction1 = 1 - beta1**state['step']
bias_correction2 = 1 - beta2**state['step']
self.torch_adam_update(p.data, p.grad.data, state['exp_avg'], state['exp_avg_sq'], group['lr'],
beta1, beta2, group['eps'], group['weight_decay'], bias_correction1,
bias_correction2, self.adamw_mode)
else:
self.cpu_adam_op.step(state['step'], group['lr'], beta1, beta2, group['eps'],
group['weight_decay'], group['bias_correction'], p.data, p.grad.data,
state['exp_avg'], state['exp_avg_sq'], div_scale)
self._post_update(p, 'exp_avg', 'exp_avg_sq')
elif target_device.type == 'cuda':
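
With the CPU path now inherited from `CPUAdam`, `HybridAdam` only contributes the fused multi-tensor GPU kernel. A hedged usage sketch (import path assumed from the package layout):

```python
import torch
from colossalai.nn.optimizer import HybridAdam   # assumed import path

model = torch.nn.Linear(1024, 1024).cuda()
optim = HybridAdam(model.parameters(), lr=1e-3)

loss = model(torch.randn(8, 1024, device='cuda')).sum()
loss.backward()
optim.step()   # GPU-resident params take the fused kernel path
```

Parameters offloaded to CPU instead take the inherited CPUAdam path, including the new bf16 fallback above.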

View File

@ -59,7 +59,7 @@ class Lamb(Optimizer):
continue
grad = p.grad.data
if grad.is_sparse:
raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instad.')
raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instead.')
state = self.state[p]

View File

@ -43,7 +43,7 @@ class NVMeOptimizer(torch.optim.Optimizer):
self.offloader = None
self.is_on_nvme: Dict[Parameter, bool] = {}
self.offloaded_numel: int = 0
# As param may be not materialized here, these attributes are initalized when the first step
# As params may not be materialized here, these attributes are initialized at the first step
self.total_numel: Optional[int] = None
self.can_offload_numel: Optional[int] = None

View File

@ -12,23 +12,23 @@ class CachedEmbeddingBag(BaseEmbeddingBag):
Cached Embedding. Apply a GPU-based software cache approach to dynamically manage the embedding table in the CPU and GPU memory space.
It can leverage the ids' frequency statistics of the target dataset by passing a frequency list to param `ids_freq_mapping`.
You can also apply a navie LFU cache eviction strategy by setting `evict_strategy` as EvictionStrategy.LFU.
You can also apply a naive LFU cache eviction strategy by setting `evict_strategy` as EvictionStrategy.LFU.
Args:
num_embeddings (int): size of the dictionary of embeddings
embedding_dim (int): the size of each embedding vector
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient; therefore, the embedding vector at padding_idx is not updated during training, i.e. it remains as a fixed pad. For a newly constructed EmbeddingBag, the embedding vector at padding_idx will default to all zeros, but can be updated to another value to be used as the padding vector. Note that the embedding vector at padding_idx is excluded from the reduction.
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is renormalized to have norm max_norm
norm_type (str, optional): The p of the p-norm to compute for the max_norm option. Defaults to 2..
norm_type (str, optional): The p of the p-norm to compute for the max_norm option. Defaults to 2.
scale_grad_by_freq (bool, optional): if given, this will scale gradients by the inverse of frequency of the words in the mini-batch. Default False. Note: this option is not supported when mode="max". Defaults to False.
sparse (bool, optional): if True, gradient w.r.t. weight matrix will be a sparse tensor. See Notes for more details regarding sparse gradients. Note: this option is not supported when mode="max". Defaults to False.
_weight (torch.Tensor, optional): an embedding weight tensor. Concate multiple tables in a embedding bag as a single one. Defaults to None.
_weight (torch.Tensor, optional): an embedding weight tensor. Concatenate multiple tables in an embedding bag as a single one. Defaults to None.
mode (str, optional): "sum", "mean" or "max". Specifies the way to reduce the bag. "sum" computes the weighted sum, taking per_sample_weights into consideration. "mean" computes the average of the values in the bag, "max" computes the max value over each bag. Default: "mean". Defaults to 'mean'.
include_last_offset (bool, optional): if True, offsets has one additional element, where the last element is equivalent to the size of indices. This matches the CSR format. Defaults to False.
dtype (torch.dtype, optional): data type of the cpu weight initialization. Defaults to None meaning float32.
device (torch.device, optional): device type to the cpu weight. Defaults to None meaning cpu.
cache_ratio (float, optional): cache ratio of the #cuda_weight_row / #cpu_weight_row
ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency of each embedding vector occures in dataset. Defaults to None.
ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency with which each embedding vector occurs in the dataset. Defaults to None.
warmup_ratio (float, optional): the ratio of the cuda cache that is warmed up. Defaults to 0.7.
buffer_size (int, optional): the max number of vectors in transmitter buffer. If set to 0, the buffer is not used. Defaults to 0.
pin_weight (bool, optional): pin the cpu weight. Defaults to False.
@ -145,7 +145,7 @@ class CachedEmbeddingBag(BaseEmbeddingBag):
def swap_in_bandwidth(self):
if self.cache_weight_mgr._cpu_to_cuda_numel > 0:
return self.cache_weight_mgr._cpu_to_cuda_numel * self.cache_weight_mgr.elem_size_in_byte / 1e6 / \
self.cache_weight_mgr._cpu_to_cuda_elpase
self.cache_weight_mgr._cpu_to_cuda_elapse
else:
return 0
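
The property reports transfer bandwidth in MB/s: bytes moved divided by elapsed seconds, scaled by 1e6. A worked example with illustrative numbers:

```python
numel = 2_000_000      # elements moved CPU -> CUDA
elem_size = 4          # bytes per fp32 element
elapsed = 0.05         # seconds spent on the transfers
bandwidth = numel * elem_size / 1e6 / elapsed
print(bandwidth)       # 160.0 MB/s
```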

View File

@ -17,7 +17,7 @@ class LimitBuffIndexCopyer(object):
def index_copy(self, dim: int, src_index: LongTensor, tgt_index: LongTensor, src: torch.Tensor, tgt: torch.Tensor):
"""copy
src tensor[src_index] -(index_select)-> tmp -(index_copy_)-> tgt tensor [tgt_index]
The valid rows in the src tensor are continous, while rows in tgt tensor is scattered.
The valid rows in the src tensor are continuous, while rows in the tgt tensor are scattered.
Args:
dim (int): dimension along which to index
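
A sketch of the two-hop pattern this class implements, with the buffer limit as an assumed parameter: chunking the indices caps the size of the staging tensor when the index set is large.

```python
import torch

def limited_index_copy(dim, src_index, tgt_index, src, tgt, buff_size=1024):
    # src[src_index] -(index_select)-> tmp -(index_copy_)-> tgt[tgt_index],
    # processed in chunks of at most buff_size rows.
    for start in range(0, src_index.numel(), buff_size):
        s_idx = src_index[start:start + buff_size]
        t_idx = tgt_index[start:start + buff_size]
        tmp = src.index_select(dim, s_idx)
        tgt.index_copy_(dim, t_idx, tmp)
```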

View File

@ -114,7 +114,7 @@ class ParallelCachedEmbeddingBagTablewiseSpiltCache(abc.ABC, nn.Module):
# get result of shape = (batch_size, (len(assigned_table_list)*embedding_dim))
local_output = torch.cat(local_output_list, 1)
# then concatenate those local_output on the second demension.
# then concatenate those local_output tensors along the second dimension.
# use all_to_all
remains = batch_size % self.world_size
scatter_strides = [batch_size // self.world_size + int(i < remains) for i in range(self.world_size)]
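
The stride computation hands the `remains` leftover rows to the first ranks, one each. A worked example:

```python
batch_size, world_size = 10, 4
remains = batch_size % world_size                    # 2
scatter_strides = [batch_size // world_size + int(i < remains)
                   for i in range(world_size)]
print(scatter_strides)   # [3, 3, 2, 2] -- sums to batch_size
```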

Some files were not shown because too many files have changed in this diff.