Change quantization instruction

2023-04-02 00:35:40 +08:00 · 2023-04-02 00:35:40 +08:00 · ca43864f39
parent 4371f7a572
commit ca43864f39
2 changed files with 2 additions and 2 deletions
--- a/README.md
+++ b/README.md
@@ -136,7 +136,7 @@ curl -X POST "http://127.0.0.1:8000" \

 ```python
 # 按需修改，目前只支持 4/8 bit 量化
-model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().quantize(4).cuda()
+model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).quantize(4).half().cuda()
 ```

 进行 2 至 3 轮对话后，8-bit 量化下 GPU 显存占用约为 10GB，4-bit 量化下仅需 6GB 占用。随着对话轮数的增多，对应消耗显存也随之增长，由于采用了相对位置编码，理论上 ChatGLM-6B 支持无限长的 context-length，但总长度超过 2048（训练长度）后性能会逐渐下降。
--- a/ptuning/main.py
+++ b/ptuning/main.py
@@ -112,10 +112,10 @@ def main():

    model = AutoModel.from_pretrained(model_args.model_name_or_path, config=config, trust_remote_code=True)

-    model = model.half()
    if model_args.quantization_bit is not None:
        print(f"Quantized to {model_args.quantization_bit} bit")
        model = model.quantize(model_args.quantization_bit)
+    model = model.half()
    model.transformer.prefix_encoder.float()

    prefix = data_args.source_prefix if data_args.source_prefix is not None else ""