#!/bin/bash
# examples/mcore/llama32/data_convert_llama32_ppo.sh
#
# Runs ./preprocess_data.py on the "descriptiveness" parquet file using the
# tokenizer stored under ./model_from_hf/llama32-hf/, writing results with the
# output prefix ./dataset/llama32-hf/descriptiveness.
#
# All paths are relative, so invoke this script from the directory that
# contains preprocess_data.py, ./dataset/ and ./model_from_hf/.

# Adjust the set_env.sh path below to match your actual environment.
source /usr/local/Ascend/ascend-toolkit/set_env.sh

# -p: create missing parent directories and do not fail when the directory
# already exists, so the script can be re-run safely.
mkdir -p ./dataset/llama32-hf/

# NOTE(review): --map-keys maps both "prompt" and "response" to the dataset's
# "prompt" column; presumably intentional for a prompt-only dataset -- confirm.
python ./preprocess_data.py \
    --input ./dataset/descriptiveness-00000-of-00001.parquet \
    --tokenizer-name-or-path ./model_from_hf/llama32-hf/ \
    --output-prefix ./dataset/llama32-hf/descriptiveness \
    --workers 16 \
    --log-interval 1000 \
    --tokenizer-type PretrainedFromHF \
    --handler-name PPOAlpacaStyleInstructionHandler \
    --prompt-type llama3 \
    --map-keys '{"prompt":"prompt", "query":"", "response": "prompt", "system":""}'