ColossalAI/examples/community/roberta/preprocessing/sentence_split.py

import argparse
import functools
import json
import multiprocessing
import os
import re
import time
from typing import List

from tqdm import tqdm


def split_sentence(document: str, flag: str = "all", limit: int = 510) -> List[str]:
    """Split ``document`` into sentences of at most ``limit`` characters.

    A line break is inserted after each sentence terminator (and after a
    closing quotation mark that directly follows a terminator); the text is
    then split on line breaks. Pieces of two characters or fewer after
    stripping are dropped, and pieces longer than ``limit`` are cut into
    consecutive ``limit``-sized chunks.

    Args:
        document: Text to split.
        flag: ``"zh"`` uses full-width terminators (``。？！…``), ``"en"`` uses
            ASCII terminators (``.?!``); any other value uses both sets.
        limit: Maximum number of characters per returned piece; must be >= 1.

    Returns:
        The list of sentence pieces. If the regex substitution fails (e.g.
        ``document`` is not a ``str``), a single-element list holding
        ``document`` unchanged is returned as a best-effort fallback.

    Raises:
        ValueError: If ``limit`` is smaller than 1 (chunking could never
            make progress).
    """
    if limit < 1:
        raise ValueError(f"limit must be a positive integer, got {limit!r}")

    try:
        if flag == "zh":
            document = re.sub('(?P<quotation_mark>([。？！…](?![”’"\'])))', r'\g<quotation_mark>\n', document)
            document = re.sub('(?P<quotation_mark>([。？！]|…{1,2})[”’"\'])', r'\g<quotation_mark>\n', document)
        elif flag == "en":
            document = re.sub('(?P<quotation_mark>([.?!](?![”’"\'])))', r'\g<quotation_mark>\n', document)
            document = re.sub('(?P<quotation_mark>([?!.]["\']))', r'\g<quotation_mark>\n',
                              document)    # Special quotation marks
        else:
            document = re.sub('(?P<quotation_mark>([。？！….?!](?![”’"\'])))', r'\g<quotation_mark>\n', document)

            document = re.sub('(?P<quotation_mark>(([。？！.!?]|…{1,2})[”’"\']))', r'\g<quotation_mark>\n',
                              document)    # Special quotation marks
    except Exception:
        # Best-effort fallback kept from the original behaviour, but without
        # swallowing KeyboardInterrupt / SystemExit like a bare ``except``.
        return [document]

    sent_list = []
    for sent in document.splitlines():
        sent = sent.strip()
        # Fragments of at most two characters are treated as noise.
        if len(sent) <= 2:
            continue
        # Cut over-long sentences into consecutive fixed-size chunks; the last
        # chunk keeps whatever remains, even if it is very short.
        for start in range(0, len(sent), limit):
            sent_list.append(sent[start:start + limit])
    return sent_list


def get_sent(output_path, input_path, fin_list=None, host=-1, seq_len=512, workers=32) -> None:
    """Split every document of a shard into sentences and write them to one file.

    For each entry of ``fin_list`` the JSON file ``input_path/<entry[0]>`` is
    loaded, the ``'content'`` field of every record is split with
    ``split_sentence`` (limit ``seq_len - 2``) in a process pool, and the
    resulting sentences are written one per line to
    ``output_path/<host>.txt``. A line containing ``]]`` is written after
    each document. Documents are written in completion order, not input
    order, because ``imap_unordered`` is used.

    Args:
        output_path: Directory in which ``<host>.txt`` is created.
        input_path: Directory containing the input JSON files.
        fin_list: Sequence of entries whose first element is a file name
            (as produced by ``getFileSize``). Entries whose file does not
            exist or whose name does not contain ``'.json'`` are skipped.
        host: Identifier used as the output file's base name.
        seq_len: Sequence length; sentences are limited to ``seq_len - 2``
            characters.
        workers: Number of worker processes used per input file.
    """
    if fin_list is None:
        fin_list = []

    # os.path.join copes with a trailing separator, so no manual stripping of
    # input_path is needed (the old strip failed on '' and mangled '/').
    cur_path = os.path.join(output_path, str(host) + '.txt')
    new_split_sentence = functools.partial(split_sentence, limit=seq_len - 2)
    with open(cur_path, 'w', encoding='utf-8') as f:
        for fi, fin_path in enumerate(fin_list):
            file_name = fin_path[0]
            full_path = os.path.join(input_path, file_name)
            if not os.path.exists(full_path):
                continue
            if '.json' not in file_name:
                continue

            print("Processing ", file_name, " ", fi)

            # Read with an explicit encoding so decoding does not depend on
            # the platform locale (the output file is UTF-8 as well).
            with open(full_path, 'r', encoding='utf-8') as fin:
                f_data = [record['content'] for record in json.load(fin)]

            # Leaving the ``with`` block terminates the pool, so all results
            # must be consumed inside it.
            with multiprocessing.Pool(workers) as pool:
                for sentences in tqdm(pool.imap_unordered(new_split_sentence, f_data, 32)):
                    f.writelines(sent.strip() + '\n' for sent in sentences)
                    # Document separator line.
                    f.write(']]' + '\n')
            print('finished..')


def getFileSize(filepath, shard):
    """Group the entries of a directory into size-balanced shards.

    Entries of ``filepath`` are sorted by size (largest first) and appended
    to the current group until the group's cumulative size strictly exceeds
    ``total_size / shard``; a new group is then started. Any remaining
    entries form a final group, so the number of groups returned can be
    smaller than ``shard``.

    Args:
        filepath: Directory to scan (absolute or relative).
        shard: Target number of shards used to derive the per-group size.

    Returns:
        A list of groups; each group is a list of ``[name, size_in_bytes]``
        pairs where ``name`` is the bare entry name inside ``filepath``.

    Raises:
        ZeroDivisionError: If ``shard`` is 0.
    """
    # Join the directory exactly once per entry: joining an already-joined
    # path again breaks for relative directories ("dir/dir/name").
    ans = [[name, os.path.getsize(os.path.join(filepath, name))] for name in os.listdir(filepath)]
    all_size = sum(size for _, size in ans)
    ans = sorted(ans, key=lambda x: x[1], reverse=True)
    per_size = all_size / shard

    real_shard = []
    temp = []
    accu_size = 0
    for entry in ans:
        accu_size += entry[1]
        temp.append(entry)
        if accu_size > per_size:
            real_shard.append(temp)
            accu_size = 0
            temp = []

    # Whatever did not fill a whole group still has to be processed.
    if temp:
        real_shard.append(temp)

    return real_shard


def get_start_end(real_shard, base=0, server_num=10, server_name='GPU'):
    """Pick the shard assigned to this machine, based on its hostname.

    The text after the last occurrence of ``server_name`` in the hostname is
    parsed as an integer ``host``; the shard at index
    ``server_num * base + host - 1`` of ``real_shard`` is selected.

    Returns:
        A ``(fin_list, host)`` tuple: the selected shard and the parsed
        host number.

    Raises:
        ValueError: If the hostname suffix is not an integer.
        IndexError: If the computed index is outside ``real_shard``.
    """
    import socket

    hostname_suffix = socket.gethostname().split(server_name)[-1]
    host = int(hostname_suffix)

    shard_index = server_num * base + host - 1
    fin_list = real_shard[shard_index]
    print(fin_list)
    print(f'I am server {host}, process {shard_index}, len {len(fin_list)}')
    return fin_list, host


if __name__ == '__main__':
    # Command-line entry point: shard the input corpus by file size, then
    # sentence-split every shard sequentially on this machine.
    parser = argparse.ArgumentParser()
    parser.add_argument('--server_num', type=int, default=10, help='number of servers')
    parser.add_argument('--seq_len', type=int, default=512, help='sequence length')
    parser.add_argument('--shard', type=int, default=100, help='number of shards, e.g., 10, 50, or 100')
    parser.add_argument('--input_path', type=str, required=True, help='input path of original corpus')
    parser.add_argument('--output_path', type=str, required=True, help='output path of shard which has split sentence')
    args = parser.parse_args()

    # server_num is only used by the multi-server variant sketched below.
    server_num = args.server_num
    seq_len = args.seq_len
    input_path = args.input_path
    output_path = args.output_path

    real_shard = getFileSize(input_path, args.shard)

    # Each shard is written to its own "<index>.txt" under output_path.
    start = time.time()
    for index, file_group in enumerate(real_shard):
        get_sent(output_path, input_path, fin_list=file_group, host=index, seq_len=seq_len)
    print(f'cost {str(time.time() - start)}')

    # If you have multiple servers, you can use the code below instead, or
    # adapt it to OpenMPI:

    # for i in range(len(real_shard) // server_num + 1):
    #     fin_list, host = get_start_end(real_shard, i)

    #     start = time.time()
    #     get_sent(output_path,
    #             input_path,
    #             fin_list=fin_list, host= 10 * i + host - 1)

    #     print(f'cost {str(time.time() - start)}')
[example] reorganize for community examples (#3557) 2023-04-14 08:27:48 +00:00			`import argparse`
			`import functools`
			`import json`
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`import multiprocessing`
			`import os`
			`import re`
			`import time`
[example] reorganize for community examples (#3557) 2023-04-14 08:27:48 +00:00			`from typing import List`

			`from tqdm import tqdm`

add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00
			`def split_sentence(document: str, flag: str = "all", limit: int = 510) -> List[str]:`
			`sent_list = []`
			`try:`
			`if flag == "zh":`
[example] reorganize for community examples (#3557) 2023-04-14 08:27:48 +00:00			`document = re.sub('(?P<quotation_mark>([。？！…](?![”’"\'])))', r'\g<quotation_mark>\n', document)`
[example] update roberta with newer ColossalAI (#3472) * update roberta example * update roberta example 2023-04-07 02:34:51 +00:00			`document = re.sub('(?P<quotation_mark>([。？！]\|…{1,2})[”’"\'])', r'\g<quotation_mark>\n', document)`
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`elif flag == "en":`
[example] reorganize for community examples (#3557) 2023-04-14 08:27:48 +00:00			`document = re.sub('(?P<quotation_mark>([.?!](?![”’"\'])))', r'\g<quotation_mark>\n', document)`
			`document = re.sub('(?P<quotation_mark>([?!.]["\']))', r'\g<quotation_mark>\n',`
			`document) # Special quotation marks`
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`else:`
[example] reorganize for community examples (#3557) 2023-04-14 08:27:48 +00:00			`document = re.sub('(?P<quotation_mark>([。？！….?!](?![”’"\'])))', r'\g<quotation_mark>\n', document)`

add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`document = re.sub('(?P<quotation_mark>(([。？！.!?]\|…{1,2})[”’"\']))', r'\g<quotation_mark>\n',`
[example] reorganize for community examples (#3557) 2023-04-14 08:27:48 +00:00			`document) # Special quotation marks`
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00
			`sent_list_ori = document.splitlines()`
			`for sent in sent_list_ori:`
			`sent = sent.strip()`
			`if not sent:`
			`continue`
			`elif len(sent) <= 2:`
			`continue`
			`else:`
			`while len(sent) > limit:`
			`temp = sent[0:limit]`
			`sent_list.append(temp)`
			`sent = sent[limit:]`
			`sent_list.append(sent)`
			`except:`
			`sent_list.clear()`
			`sent_list.append(document)`
			`return sent_list`


[example] reorganize for community examples (#3557) 2023-04-14 08:27:48 +00:00			`def get_sent(output_path, input_path, fin_list=[], host=-1, seq_len=512) -> None:`
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00
			`workers = 32`

			`if input_path[-1] == '/':`
			`input_path = input_path[:-1]`
[example] reorganize for community examples (#3557) 2023-04-14 08:27:48 +00:00
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`cur_path = os.path.join(output_path, str(host) + '.txt')`
[example] reorganize for community examples (#3557) 2023-04-14 08:27:48 +00:00			`new_split_sentence = functools.partial(split_sentence, limit=seq_len - 2)`
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`with open(cur_path, 'w', encoding='utf-8') as f:`
			`for fi, fin_path in enumerate(fin_list):`
			`if not os.path.exists(os.path.join(input_path, fin_path[0])):`
			`continue`
			`if '.json' not in fin_path[0]:`
			`continue`

			`print("Processing ", fin_path[0], " ", fi)`
[example] reorganize for community examples (#3557) 2023-04-14 08:27:48 +00:00
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`with open(os.path.join(input_path, fin_path[0]), 'r') as fin:`
			`f_data = [l['content'] for l in json.load(fin)]`

			`pool = multiprocessing.Pool(workers)`
			`all_sent = pool.imap_unordered(new_split_sentence, f_data, 32)`
			`pool.close()`
			`print('finished..')`

			`cnt = 0`
			`for d in tqdm(all_sent):`
			`for i in d:`
			`f.write(i.strip() + '\n')`
			`f.write(']]' + '\n')`
			`cnt += 1`
			`# if cnt >= 2:`
			`# exit()`


			`def getFileSize(filepath, shard):`
			`all_data = []`
			`for i in os.listdir(filepath):`
			`all_data.append(os.path.join(filepath, i))`
			`all_size = sum([os.path.getsize(os.path.join(filepath, f)) for f in all_data])`
			`ans = [[f.split('/')[-1], os.path.getsize(os.path.join(filepath, f))] for f in all_data]`
			`ans = sorted(ans, key=lambda x: x[1], reverse=True)`
			`per_size = all_size / shard`
			`real_shard = []`
			`temp = []`
			`accu_size = 0`
			`for i in ans:`
			`accu_size += i[1]`
			`temp.append(i)`
			`if accu_size > per_size:`
			`real_shard.append(temp)`
			`accu_size = 0`
			`temp = []`
[example] reorganize for community examples (#3557) 2023-04-14 08:27:48 +00:00
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`if len(temp) > 0:`
			`real_shard.append(temp)`
[example] reorganize for community examples (#3557) 2023-04-14 08:27:48 +00:00
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`return real_shard`


			`def get_start_end(real_shard, base=0, server_num=10, server_name='GPU'):`
			`import socket`
			`host = int(socket.gethostname().split(server_name)[-1])`
[example] reorganize for community examples (#3557) 2023-04-14 08:27:48 +00:00
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`fin_list = real_shard[server_num * base + host - 1]`
			`print(fin_list)`
			`print(f'I am server {host}, process {server_num * base + host - 1}, len {len(fin_list)}')`
			`return fin_list, host`


			`if __name__ == '__main__':`

			`parser = argparse.ArgumentParser()`
			`parser.add_argument('--server_num', type=int, default=10, help='number of servers')`
			`parser.add_argument('--seq_len', type=int, default=512, help='sequence length')`
			`parser.add_argument('--shard', type=int, default=100, help='number of shards, e.g., 10, 50, or 100')`
			`parser.add_argument('--input_path', type=str, required=True, help='input path of original corpus')`
			`parser.add_argument('--output_path', type=str, required=True, help='output path of shard which has split sentence')`
			`args = parser.parse_args()`

[example] reorganize for community examples (#3557) 2023-04-14 08:27:48 +00:00			`server_num = args.server_num`
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`seq_len = args.seq_len`
[example] reorganize for community examples (#3557) 2023-04-14 08:27:48 +00:00			`shard = args.shard`
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`input_path = args.input_path`
[example] reorganize for community examples (#3557) 2023-04-14 08:27:48 +00:00			`output_path = args.output_path`
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00
			`real_shard = getFileSize(input_path, shard)`

			`start = time.time()`
			`for index, shard in enumerate(real_shard):`
[example] reorganize for community examples (#3557) 2023-04-14 08:27:48 +00:00			`get_sent(output_path, input_path, fin_list=shard, host=index, seq_len=seq_len)`
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`print(f'cost {str(time.time() - start)}')`

			`# if you have multiple server, you can use code below or modify code to openmpi`
[example] reorganize for community examples (#3557) 2023-04-14 08:27:48 +00:00
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`# for i in range(len(real_shard) // server_num + 1):`
			`# fin_list, host = get_start_end(real_shard, i)`
[example] reorganize for community examples (#3557) 2023-04-14 08:27:48 +00:00
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`# start = time.time()`
			`# get_sent(output_path,`
			`# input_path,`
			`# fin_list=fin_list, host= 10 * i + host - 1)`

			`# print(f'cost {str(time.time() - start)}')`