import time
import os
import psutil
import h5py
import socket
import argparse
import numpy as np
import multiprocessing

from tqdm import tqdm
from random import shuffle
from transformers import AutoTokenizer

from get_mask import PreTrainingDataset

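# Note on the expected input format (inferred from the parsing loops below, not a
# specification): each shard is a plain-text file with one sentence per line, and a
# line starting with "]]" marks the end of a document, e.g.:
#
#   First sentence of document 0.
#   Second sentence of document 0.
#   ]]
#   First sentence of document 1.
#   ]]
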
def get_raw_instance(document, max_sequence_length=512):
    """
    Build preliminary training instances: split a whole document into several parts
    according to max_sequence_length and return them as a list of processed instances.

    :param document: a whole document, given as a list of tokenized sentences
    :param max_sequence_length: maximum sequence length, including the two special tokens
    :return: a list; each element is one sequence of tokens
    """
    # document = self.documents[index]
    max_sequence_length_allowed = max_sequence_length - 2
    # document = [seq for seq in document if len(seq) < max_sequence_length_allowed]
    sizes = [len(seq) for seq in document]

    result_list = []
    curr_seq = []    # the sequence currently being built
    sz_idx = 0
    while sz_idx < len(sizes):
        # If the next sentence still fits within the limit, merge it into the current
        # sequence; otherwise the limit would be exceeded, so flush the current sequence
        # into the result list and start a new one.
        if len(curr_seq) + sizes[sz_idx] <= max_sequence_length_allowed:    # or len(curr_seq) == 0:
            curr_seq += document[sz_idx]
            sz_idx += 1
        elif sizes[sz_idx] >= max_sequence_length_allowed:
            # A single sentence longer than the limit is truncated and emitted on its own.
            if len(curr_seq) > 0:
                result_list.append(curr_seq)
                curr_seq = []
            result_list.append(document[sz_idx][:max_sequence_length_allowed])
            sz_idx += 1
        else:
            result_list.append(curr_seq)
            curr_seq = []
    # Handle the last sequence: drop it if it is shorter than half the allowed length.
    if len(curr_seq) > max_sequence_length_allowed / 2:
        result_list.append(curr_seq)

    # # compute how many instances can be produced in total
    # num_instance = int(len(big_list) / max_sequence_length_allowed) + 1
    # print("num_instance:", num_instance)
    # # split into multiple pieces and append them to the list
    # result_list = []
    # for j in range(num_instance):
    #     index = j * max_sequence_length_allowed
    #     end_index = index + max_sequence_length_allowed if j != num_instance - 1 else -1
    #     result_list.append(big_list[index:end_index])
    return result_list

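# Illustrative sketch (hypothetical token ids, not used by the pipeline) of how
# get_raw_instance packs tokenized sentences into fixed-length instances. With
# max_sequence_length=8 the usable budget is 6 tokens, so the first two sentences are
# merged, the third starts a new instance, and that last instance is kept because it
# exceeds half the budget:
#
#   >>> doc = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]
#   >>> get_raw_instance(doc, max_sequence_length=8)
#   [[1, 2, 3, 4, 5], [6, 7, 8, 9]]
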
def split_numpy_chunk(path, tokenizer, pretrain_data, host):
    """
    Single-process variant: read one text shard, tokenize it, pack it into instances,
    apply masking, and write the result to /output/{host}.h5. The tokenizer argument
    is currently unused; tokenization is done by pretrain_data.
    """
    documents = []
    instances = []

    s = time.time()
    with open(path, encoding='utf-8') as fd:
        document = []
        for i, line in enumerate(tqdm(fd)):
            line = line.strip()
            # document = line
            # if len(document.split("<sep>")) <= 3:
            #     continue
            if len(line) > 0 and line[:2] == "]]":    # this is the end of a document
                documents.append(document)
                document = []
            elif len(line) >= 2:
                document.append(line)
        if len(document) > 0:
            documents.append(document)
    print('read_file ', time.time() - s)

    # documents = [x for x in documents if x]
    # print(len(documents))
    # print(len(documents[0]))
    # print(documents[0][0:10])

    ans = []
    for docs in tqdm(documents):
        ans.append(pretrain_data.tokenize(docs))
    print(time.time() - s)
    del documents

    instances = []
    for a in tqdm(ans):
        raw_ins = get_raw_instance(a)
        instances.extend(raw_ins)
    del ans

    print('len instance', len(instances))

    sen_num = len(instances)
    seq_len = 512
    input_ids = np.zeros([sen_num, seq_len], dtype=np.int32)
    input_mask = np.zeros([sen_num, seq_len], dtype=np.int32)
    segment_ids = np.zeros([sen_num, seq_len], dtype=np.int32)
    masked_lm_output = np.zeros([sen_num, seq_len], dtype=np.int32)

    for index, ins in tqdm(enumerate(instances)):
        mask_dict = pretrain_data.create_training_instance(ins)
        input_ids[index] = mask_dict[0]
        input_mask[index] = mask_dict[1]
        segment_ids[index] = mask_dict[2]
        masked_lm_output[index] = mask_dict[3]

    with h5py.File(f'/output/{host}.h5', 'w') as hf:
        hf.create_dataset("input_ids", data=input_ids)
        hf.create_dataset("input_mask", data=input_mask)
        hf.create_dataset("segment_ids", data=segment_ids)
        hf.create_dataset("masked_lm_positions", data=masked_lm_output)

    del instances

def split_numpy_chunk_pool(input_path,
                           output_path,
                           pretrain_data,
                           worker,
                           dupe_factor,
                           seq_len,
                           file_name):
    """
    Multiprocess variant: read one text shard, tokenize and mask it with a pool of
    `worker` processes, and write the result to {output_path}/{file_name}.h5.
    """
    if os.path.exists(os.path.join(output_path, f'{file_name}.h5')):
        print(f'{file_name}.h5 exists')
        return

    documents = []
    instances = []

    s = time.time()
    with open(input_path, 'r', encoding='utf-8') as fd:
        document = []
        for i, line in enumerate(tqdm(fd)):
            line = line.strip()
            if len(line) > 0 and line[:2] == "]]":    # this is the end of a document
                documents.append(document)
                document = []
            elif len(line) >= 2:
                document.append(line)
        if len(document) > 0:
            documents.append(document)
    print(f'read_file cost {time.time() - s}, length is {len(documents)}')

    ans = []
    s = time.time()
    pool = multiprocessing.Pool(worker)
    encoded_doc = pool.imap_unordered(pretrain_data.tokenize, documents, 100)
    for index, res in tqdm(enumerate(encoded_doc, start=1), total=len(documents), colour='cyan'):
        ans.append(res)
    pool.close()
    print((time.time() - s) / 60)
    del documents

    instances = []
    for a in tqdm(ans, colour='MAGENTA'):
        raw_ins = get_raw_instance(a, max_sequence_length=seq_len)
        instances.extend(raw_ins)
    del ans

    print('len instance', len(instances))

    # Repeat every instance dupe_factor times so that different random masks can be
    # drawn for the same text, then shuffle.
    new_instances = []
    for _ in range(dupe_factor):
        for ins in instances:
            new_instances.append(ins)

    shuffle(new_instances)
    instances = new_instances
    print('after dupe_factor, len instance', len(instances))

    sentence_num = len(instances)
    input_ids = np.zeros([sentence_num, seq_len], dtype=np.int32)
    input_mask = np.zeros([sentence_num, seq_len], dtype=np.int32)
    segment_ids = np.zeros([sentence_num, seq_len], dtype=np.int32)
    masked_lm_output = np.zeros([sentence_num, seq_len], dtype=np.int32)

    s = time.time()
    pool = multiprocessing.Pool(worker)
    encoded_docs = pool.imap_unordered(pretrain_data.create_training_instance, instances, 32)
    for index, mask_dict in tqdm(enumerate(encoded_docs), total=len(instances), colour='blue'):
        input_ids[index] = mask_dict[0]
        input_mask[index] = mask_dict[1]
        segment_ids[index] = mask_dict[2]
        masked_lm_output[index] = mask_dict[3]
    pool.close()
    print((time.time() - s) / 60)

    with h5py.File(os.path.join(output_path, f'{file_name}.h5'), 'w') as hf:
        hf.create_dataset("input_ids", data=input_ids)
        hf.create_dataset("input_mask", data=input_mask)
        hf.create_dataset("segment_ids", data=segment_ids)
        hf.create_dataset("masked_lm_positions", data=masked_lm_output)

    del instances

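# A minimal sketch (not part of this script) of how a downstream loader might read one
# of the HDF5 shards written above. Dataset names mirror the create_dataset calls;
# "masked_lm_positions" stores the fourth field returned by create_training_instance
# (kept in the masked_lm_output array above).
#
#   with h5py.File(os.path.join(output_path, '0.h5'), 'r') as hf:
#       input_ids = hf['input_ids'][:]
#       input_mask = hf['input_mask'][:]
#       segment_ids = hf['segment_ids'][:]
#       masked_lm = hf['masked_lm_positions'][:]
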
if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--tokenizer_path', type=str, required=True, help='path of the tokenizer')
    parser.add_argument('--seq_len', type=int, default=512, help='sequence length')
    parser.add_argument('--max_predictions_per_seq', type=int, default=80,
                        help='maximum number of masked tokens per sequence')
    parser.add_argument('--input_path', type=str, required=True, help='input path of the shards with split sentences')
    parser.add_argument('--output_path', type=str, required=True, help='output path of the h5 files containing token ids')
    parser.add_argument('--backend', type=str, default='python', help='backend of the mask-token step: python, c++ or numpy')
    parser.add_argument('--dupe_factor', type=int, default=1,
                        help='how many times the preprocessor repeats the same article/document to create inputs')
    parser.add_argument('--worker', type=int, default=32, help='number of processes')
    parser.add_argument('--server_num', type=int, default=10, help='number of servers')
    args = parser.parse_args()

    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path)
    pretrain_data = PreTrainingDataset(tokenizer,
                                       args.seq_len,
                                       args.backend,
                                       max_predictions_per_seq=args.max_predictions_per_seq)

    data_len = len(os.listdir(args.input_path))

    for i in range(data_len):
        input_path = os.path.join(args.input_path, f'{i}.txt')
        if os.path.exists(input_path):
            start = time.time()
            print(f'process {input_path}')
            split_numpy_chunk_pool(input_path,
                                   args.output_path,
                                   pretrain_data,
                                   args.worker,
                                   args.dupe_factor,
                                   args.seq_len,
                                   i)
            end_ = time.time()
            print(u'memory:%.4f GB' % (psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024))
            print(f'has cost {(end_ - start) / 60}')
            print('-' * 100)
            print('')

    # If you have multiple servers, you can use the code below, or adapt it to OpenMPI.

    # host = int(socket.gethostname().split('GPU')[-1])
    # for i in range(data_len // args.server_num + 1):
    #     h = args.server_num * i + host - 1
    #     input_path = os.path.join(args.input_path, f'{h}.txt')
    #     if os.path.exists(input_path):
    #         start = time.time()
    #         print(f'I am server {host}, process {input_path}')
    #         split_numpy_chunk_pool(input_path,
    #                                args.output_path,
    #                                pretrain_data,
    #                                args.worker,
    #                                args.dupe_factor,
    #                                args.seq_len,
    #                                h)
    #         end_ = time.time()
    #         print(u'memory:%.4f GB' % (psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024))
    #         print(f'has cost {(end_ - start) / 60}')
    #         print('-' * 100)
    #         print('')
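
# Example invocation (the script filename and all paths are placeholders):
#
#   python prepare_pretrain_data.py \
#       --tokenizer_path /path/to/tokenizer \
#       --input_path /path/to/sentence_split_shards \
#       --output_path /path/to/h5_output \
#       --seq_len 512 --max_predictions_per_seq 80 \
#       --backend python --dupe_factor 1 --worker 32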